{ "best_global_step": 10000, "best_metric": 0.24540361762046814, "best_model_checkpoint": "../data/cache/tvc_LLaVA-OneVision-1_5-4B-Instruct_v2/checkpoint-10000", "epoch": 2.0422772529997446, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "copy_logits_max": 0.09779547154903412, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.0, "epoch": 0.00020423793719683432, "gen_logits_max": 21.79285430908203, "gen_logits_mean": 2.00488543510437, "gen_logits_min": -10.557607650756836, "gen_logits_std": 2.4507598876953125, "gen_loss": 6.528437614440918, "grad_norm": 66.34748259859441, "learning_rate": 0.0, "loss": 6.0205, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.735689178109169, "mean_token_accuracy": 0.5482160300016403, "num_tokens": 227734.0, "sample_num_tokens": 9754.5, "step": 1, "total_num_tokens": 266752.0, "z_loss": 0.0076296464540064335 }, { "copy_logits_max": 0.10593772679567337, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.625, "epoch": 0.00040847587439366863, "gen_logits_max": 22.748485565185547, "gen_logits_mean": 2.6494312286376953, "gen_logits_min": -10.027822494506836, "gen_logits_std": 2.4453282356262207, "gen_loss": 5.181606769561768, "grad_norm": 49.64169556877416, "learning_rate": 4.081632653061224e-08, "loss": 4.7867, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7407035529613495, "mean_token_accuracy": 0.6024519801139832, "num_tokens": 479376.0, "sample_num_tokens": 7578.5, "step": 2, "total_num_tokens": 509690.0, "z_loss": 0.008161867037415504 }, { "copy_logits_max": 0.10155119001865387, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.5, "epoch": 0.0006127138115905029, "gen_logits_max": 22.76348114013672, "gen_logits_mean": 2.786215305328369, "gen_logits_min": -9.69410514831543, "gen_logits_std": 2.4163084030151367, "gen_loss": 4.954564094543457, "grad_norm": 58.91338704347023, "learning_rate": 8.163265306122448e-08, "loss": 5.427, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.74184550344944, "mean_token_accuracy": 0.582280844449997, "num_tokens": 747097.0, "sample_num_tokens": 8258.75, "step": 3, "total_num_tokens": 780132.0, "z_loss": 0.008677965961396694 }, { "copy_logits_max": 0.09201942384243011, "copy_logits_min": -687500032.0, "copy_num_tokens": 619.125, "epoch": 0.0008169517487873373, "gen_logits_max": 20.709577560424805, "gen_logits_mean": 2.017320156097412, "gen_logits_min": -10.309803009033203, "gen_logits_std": 2.388370990753174, "gen_loss": 6.448441505432129, "grad_norm": 68.044496004906, "learning_rate": 1.2244897959183676e-07, "loss": 6.1024, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7350551337003708, "mean_token_accuracy": 0.5494187325239182, "num_tokens": 1002096.0, "sample_num_tokens": 9990.5, "step": 4, "total_num_tokens": 1042058.0, "z_loss": 0.007640200201421976 }, { "copy_logits_max": 0.09588945657014847, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.75, "epoch": 0.0010211896859841716, "gen_logits_max": 22.60548973083496, "gen_logits_mean": 2.3431355953216553, "gen_logits_min": -10.909655570983887, "gen_logits_std": 2.489755153656006, "gen_loss": 5.003510475158691, "grad_norm": 73.31505140238214, "learning_rate": 1.6326530612244896e-07, "loss": 5.5261, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7356636822223663, "mean_token_accuracy": 0.5594876855611801, "num_tokens": 1285853.0, "sample_num_tokens": 8577.75, "step": 5, "total_num_tokens": 1320164.0, "z_loss": 0.008216841146349907 }, { "copy_logits_max": 0.10855862498283386, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.5625, "epoch": 0.0012254276231810058, "gen_logits_max": 22.440841674804688, "gen_logits_mean": 2.532465934753418, "gen_logits_min": -9.915996551513672, "gen_logits_std": 2.4212372303009033, "gen_loss": 4.919076442718506, "grad_norm": 58.70375081265489, "learning_rate": 2.0408163265306121e-07, "loss": 5.1518, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7372919023036957, "mean_token_accuracy": 0.582348108291626, "num_tokens": 1547959.0, "sample_num_tokens": 7264.25, "step": 6, "total_num_tokens": 1577016.0, "z_loss": 0.007893863134086132 }, { "copy_logits_max": 0.10790550708770752, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.25, "epoch": 0.0014296655603778402, "gen_logits_max": 22.717004776000977, "gen_logits_mean": 2.573240280151367, "gen_logits_min": -10.134801864624023, "gen_logits_std": 2.442633867263794, "gen_loss": 6.173992156982422, "grad_norm": 63.436171645629216, "learning_rate": 2.448979591836735e-07, "loss": 5.3229, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7284181863069534, "mean_token_accuracy": 0.5713682770729065, "num_tokens": 1800130.0, "sample_num_tokens": 8224.0, "step": 7, "total_num_tokens": 1833026.0, "z_loss": 0.007799461483955383 }, { "copy_logits_max": 0.09666276723146439, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.5625, "epoch": 0.0016339034975746745, "gen_logits_max": 21.749828338623047, "gen_logits_mean": 2.1207878589630127, "gen_logits_min": -10.823326110839844, "gen_logits_std": 2.4322762489318848, "gen_loss": 6.129856109619141, "grad_norm": 64.1988514695857, "learning_rate": 2.8571428571428575e-07, "loss": 5.8297, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7538372427225113, "mean_token_accuracy": 0.568273738026619, "num_tokens": 2074314.0, "sample_num_tokens": 9920.0, "step": 8, "total_num_tokens": 2113994.0, "z_loss": 0.007768387906253338 }, { "copy_logits_max": 0.08708208799362183, "copy_logits_min": -687500032.0, "copy_num_tokens": 577.25, "epoch": 0.001838141434771509, "gen_logits_max": 20.006107330322266, "gen_logits_mean": 1.430530071258545, "gen_logits_min": -11.239981651306152, "gen_logits_std": 2.4480690956115723, "gen_loss": 6.450387477874756, "grad_norm": 69.16396780593014, "learning_rate": 3.265306122448979e-07, "loss": 6.0851, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7404308170080185, "mean_token_accuracy": 0.5464346408843994, "num_tokens": 2359603.0, "sample_num_tokens": 8891.75, "step": 9, "total_num_tokens": 2395170.0, "z_loss": 0.007410875521600246 }, { "copy_logits_max": 0.1178402304649353, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.3125, "epoch": 0.0020423793719683433, "gen_logits_max": 23.671871185302734, "gen_logits_mean": 2.847024917602539, "gen_logits_min": -9.878509521484375, "gen_logits_std": 2.399857521057129, "gen_loss": 4.941654205322266, "grad_norm": 77.13222905298892, "learning_rate": 3.673469387755102e-07, "loss": 5.7208, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7297149151563644, "mean_token_accuracy": 0.5579126626253128, "num_tokens": 2628382.0, "sample_num_tokens": 7696.5, "step": 10, "total_num_tokens": 2659168.0, "z_loss": 0.008119698613882065 }, { "copy_logits_max": 0.09114391356706619, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.625, "epoch": 0.0022466173091651776, "gen_logits_max": 21.374664306640625, "gen_logits_mean": 2.042901039123535, "gen_logits_min": -10.55974292755127, "gen_logits_std": 2.4826364517211914, "gen_loss": 5.243967533111572, "grad_norm": 69.07648252377145, "learning_rate": 4.0816326530612243e-07, "loss": 5.7018, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7260888963937759, "mean_token_accuracy": 0.5525022447109222, "num_tokens": 2894677.0, "sample_num_tokens": 8386.75, "step": 11, "total_num_tokens": 2928224.0, "z_loss": 0.008197125047445297 }, { "copy_logits_max": 0.10676076263189316, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.625, "epoch": 0.0024508552463620116, "gen_logits_max": 23.025306701660156, "gen_logits_mean": 2.5036139488220215, "gen_logits_min": -10.590240478515625, "gen_logits_std": 2.4679999351501465, "gen_loss": 5.243176460266113, "grad_norm": 58.725822436852724, "learning_rate": 4.489795918367347e-07, "loss": 5.0661, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7437483370304108, "mean_token_accuracy": 0.577948585152626, "num_tokens": 3165651.0, "sample_num_tokens": 8394.75, "step": 12, "total_num_tokens": 3199230.0, "z_loss": 0.008212652057409286 }, { "copy_logits_max": 0.10728956013917923, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.1875, "epoch": 0.002655093183558846, "gen_logits_max": 24.104612350463867, "gen_logits_mean": 2.7759037017822266, "gen_logits_min": -9.80027961730957, "gen_logits_std": 2.457521915435791, "gen_loss": 5.220744609832764, "grad_norm": 64.92683667776177, "learning_rate": 4.89795918367347e-07, "loss": 5.8545, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7243976444005966, "mean_token_accuracy": 0.5379521548748016, "num_tokens": 3424681.0, "sample_num_tokens": 7764.75, "step": 13, "total_num_tokens": 3455740.0, "z_loss": 0.008220614865422249 }, { "copy_logits_max": 0.11228838562965393, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.5, "epoch": 0.0028593311207556803, "gen_logits_max": 22.425819396972656, "gen_logits_mean": 2.349261522293091, "gen_logits_min": -10.083433151245117, "gen_logits_std": 2.3702750205993652, "gen_loss": 5.905946254730225, "grad_norm": 68.69829694122855, "learning_rate": 5.306122448979592e-07, "loss": 6.1584, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7552435547113419, "mean_token_accuracy": 0.5505102425813675, "num_tokens": 3711391.0, "sample_num_tokens": 7822.75, "step": 14, "total_num_tokens": 3742682.0, "z_loss": 0.007828688248991966 }, { "copy_logits_max": 0.11191266775131226, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.75, "epoch": 0.0030635690579525147, "gen_logits_max": 22.147048950195312, "gen_logits_mean": 2.6463875770568848, "gen_logits_min": -10.407133102416992, "gen_logits_std": 2.4538798332214355, "gen_loss": 5.468090057373047, "grad_norm": 58.4738848517535, "learning_rate": 5.714285714285715e-07, "loss": 5.2412, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7413399368524551, "mean_token_accuracy": 0.5672057569026947, "num_tokens": 3973070.0, "sample_num_tokens": 7859.5, "step": 15, "total_num_tokens": 4004508.0, "z_loss": 0.007940905168652534 }, { "copy_logits_max": 0.10840710252523422, "copy_logits_min": -750000000.0, "copy_num_tokens": 725.125, "epoch": 0.003267806995149349, "gen_logits_max": 20.134122848510742, "gen_logits_mean": 1.6297985315322876, "gen_logits_min": -10.667466163635254, "gen_logits_std": 2.3918850421905518, "gen_loss": 6.589597702026367, "grad_norm": 78.60517972600303, "learning_rate": 6.122448979591837e-07, "loss": 6.5674, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7369340062141418, "mean_token_accuracy": 0.5122593119740486, "num_tokens": 4249820.0, "sample_num_tokens": 10148.5, "step": 16, "total_num_tokens": 4290414.0, "z_loss": 0.007212984841316938 }, { "copy_logits_max": 0.12900689244270325, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.9375, "epoch": 0.0034720449323461834, "gen_logits_max": 23.211711883544922, "gen_logits_mean": 2.5596604347229004, "gen_logits_min": -9.87232780456543, "gen_logits_std": 2.3687644004821777, "gen_loss": 5.711223602294922, "grad_norm": 73.09026078161634, "learning_rate": 6.530612244897958e-07, "loss": 5.8316, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7435218840837479, "mean_token_accuracy": 0.5503934174776077, "num_tokens": 4521447.0, "sample_num_tokens": 8967.75, "step": 17, "total_num_tokens": 4557318.0, "z_loss": 0.007628780789673328 }, { "copy_logits_max": 0.1297341287136078, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.5, "epoch": 0.003676282869543018, "gen_logits_max": 23.96619987487793, "gen_logits_mean": 3.0754449367523193, "gen_logits_min": -9.316421508789062, "gen_logits_std": 2.3801872730255127, "gen_loss": 4.981304168701172, "grad_norm": 60.769545355402386, "learning_rate": 6.938775510204081e-07, "loss": 5.4507, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7402247488498688, "mean_token_accuracy": 0.5602815598249435, "num_tokens": 4788697.0, "sample_num_tokens": 8325.25, "step": 18, "total_num_tokens": 4821998.0, "z_loss": 0.008055349811911583 }, { "copy_logits_max": 0.11688800156116486, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.4375, "epoch": 0.0038805208067398517, "gen_logits_max": 20.55951499938965, "gen_logits_mean": 1.4928474426269531, "gen_logits_min": -11.896738052368164, "gen_logits_std": 2.496037006378174, "gen_loss": 6.236581802368164, "grad_norm": 85.06481359626648, "learning_rate": 7.346938775510204e-07, "loss": 6.4756, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7468514740467072, "mean_token_accuracy": 0.5219495892524719, "num_tokens": 5085884.0, "sample_num_tokens": 7353.5, "step": 19, "total_num_tokens": 5115298.0, "z_loss": 0.007357113528996706 }, { "copy_logits_max": 0.11199302971363068, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.375, "epoch": 0.0040847587439366865, "gen_logits_max": 20.379913330078125, "gen_logits_mean": 1.0640289783477783, "gen_logits_min": -11.518363952636719, "gen_logits_std": 2.451239585876465, "gen_loss": 6.391770362854004, "grad_norm": 64.7671908873009, "learning_rate": 7.755102040816327e-07, "loss": 5.6374, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7514811605215073, "mean_token_accuracy": 0.5573947429656982, "num_tokens": 5358737.0, "sample_num_tokens": 8688.25, "step": 20, "total_num_tokens": 5393490.0, "z_loss": 0.007309796288609505 }, { "copy_logits_max": 0.1372780203819275, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.4375, "epoch": 0.004288996681133521, "gen_logits_max": 22.15497589111328, "gen_logits_mean": 2.2648887634277344, "gen_logits_min": -10.000024795532227, "gen_logits_std": 2.3322877883911133, "gen_loss": 4.757246017456055, "grad_norm": 67.77256173126092, "learning_rate": 8.163265306122449e-07, "loss": 5.6298, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7269752770662308, "mean_token_accuracy": 0.514225497841835, "num_tokens": 5626270.0, "sample_num_tokens": 8456.5, "step": 21, "total_num_tokens": 5660096.0, "z_loss": 0.007222291547805071 }, { "copy_logits_max": 0.14444811642169952, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.9375, "epoch": 0.004493234618330355, "gen_logits_max": 22.747188568115234, "gen_logits_mean": 2.6350936889648438, "gen_logits_min": -9.596160888671875, "gen_logits_std": 2.352829933166504, "gen_loss": 4.5786027908325195, "grad_norm": 48.80223720168462, "learning_rate": 8.571428571428571e-07, "loss": 4.4078, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7448815256357193, "mean_token_accuracy": 0.5693961530923843, "num_tokens": 5899833.0, "sample_num_tokens": 8361.75, "step": 22, "total_num_tokens": 5933280.0, "z_loss": 0.0073621077463030815 }, { "copy_logits_max": 0.11737050861120224, "copy_logits_min": -750000000.0, "copy_num_tokens": 655.125, "epoch": 0.004697472555527189, "gen_logits_max": 18.661909103393555, "gen_logits_mean": 1.0917857885360718, "gen_logits_min": -10.781993865966797, "gen_logits_std": 2.381573438644409, "gen_loss": 5.54071569442749, "grad_norm": 52.680848955812074, "learning_rate": 8.979591836734694e-07, "loss": 4.8167, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7382203340530396, "mean_token_accuracy": 0.5457352548837662, "num_tokens": 6167995.0, "sample_num_tokens": 9290.25, "step": 23, "total_num_tokens": 6205156.0, "z_loss": 0.006331139709800482 }, { "copy_logits_max": 0.13230249285697937, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.75, "epoch": 0.004901710492724023, "gen_logits_max": 19.75667953491211, "gen_logits_mean": 1.5387582778930664, "gen_logits_min": -10.032064437866211, "gen_logits_std": 2.320077896118164, "gen_loss": 4.543444633483887, "grad_norm": 51.12377305711582, "learning_rate": 9.387755102040817e-07, "loss": 4.4836, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7490358650684357, "mean_token_accuracy": 0.5633227378129959, "num_tokens": 6445015.0, "sample_num_tokens": 9605.25, "step": 24, "total_num_tokens": 6483436.0, "z_loss": 0.0063138664700090885 }, { "copy_logits_max": 0.13293671607971191, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.1875, "epoch": 0.0051059484299208575, "gen_logits_max": 19.786895751953125, "gen_logits_mean": 1.4847599267959595, "gen_logits_min": -10.700145721435547, "gen_logits_std": 2.406343936920166, "gen_loss": 4.719970703125, "grad_norm": 52.62751809772118, "learning_rate": 9.79591836734694e-07, "loss": 4.7466, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7356186956167221, "mean_token_accuracy": 0.5365262627601624, "num_tokens": 6719243.0, "sample_num_tokens": 8289.25, "step": 25, "total_num_tokens": 6752400.0, "z_loss": 0.00650453194975853 }, { "copy_logits_max": 0.14142678678035736, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.875, "epoch": 0.005310186367117692, "gen_logits_max": 20.631546020507812, "gen_logits_mean": 1.9539722204208374, "gen_logits_min": -10.821287155151367, "gen_logits_std": 2.4404807090759277, "gen_loss": 4.046385288238525, "grad_norm": 104.593870262727, "learning_rate": 1.0204081632653063e-06, "loss": 4.109, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7314056307077408, "mean_token_accuracy": 0.574714407324791, "num_tokens": 6984501.0, "sample_num_tokens": 7842.75, "step": 26, "total_num_tokens": 7015872.0, "z_loss": 0.007191919721662998 }, { "copy_logits_max": 0.1615469753742218, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.9375, "epoch": 0.005514424304314526, "gen_logits_max": 20.584657669067383, "gen_logits_mean": 1.4802104234695435, "gen_logits_min": -9.80564022064209, "gen_logits_std": 2.2849841117858887, "gen_loss": 4.394665241241455, "grad_norm": 45.37416217685235, "learning_rate": 1.0612244897959184e-06, "loss": 4.3197, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7467457056045532, "mean_token_accuracy": 0.5648910254240036, "num_tokens": 7254005.0, "sample_num_tokens": 8965.75, "step": 27, "total_num_tokens": 7289868.0, "z_loss": 0.006368150003254414 }, { "copy_logits_max": 0.17637954652309418, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.875, "epoch": 0.005718662241511361, "gen_logits_max": 20.84345245361328, "gen_logits_mean": 1.6991294622421265, "gen_logits_min": -9.911264419555664, "gen_logits_std": 2.2851223945617676, "gen_loss": 4.1195549964904785, "grad_norm": 43.770544487380484, "learning_rate": 1.1020408163265308e-06, "loss": 4.0863, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.751825213432312, "mean_token_accuracy": 0.5732047110795975, "num_tokens": 7560235.0, "sample_num_tokens": 10053.75, "step": 28, "total_num_tokens": 7600450.0, "z_loss": 0.006484922952950001 }, { "copy_logits_max": 0.1667424440383911, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.5, "epoch": 0.005922900178708195, "gen_logits_max": 19.591434478759766, "gen_logits_mean": 1.1058894395828247, "gen_logits_min": -10.532214164733887, "gen_logits_std": 2.31685471534729, "gen_loss": 3.202162742614746, "grad_norm": 38.686242496042055, "learning_rate": 1.142857142857143e-06, "loss": 3.5567, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7380028665065765, "mean_token_accuracy": 0.5493502616882324, "num_tokens": 7858339.0, "sample_num_tokens": 7801.75, "step": 29, "total_num_tokens": 7889546.0, "z_loss": 0.006109167821705341 }, { "copy_logits_max": 0.1608644723892212, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.0625, "epoch": 0.006127138115905029, "gen_logits_max": 16.700714111328125, "gen_logits_mean": -0.3430884778499603, "gen_logits_min": -11.412601470947266, "gen_logits_std": 2.3282811641693115, "gen_loss": 3.8941354751586914, "grad_norm": 32.84290331366769, "learning_rate": 1.183673469387755e-06, "loss": 3.3406, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7378443032503128, "mean_token_accuracy": 0.5411000773310661, "num_tokens": 8118421.0, "sample_num_tokens": 8755.25, "step": 30, "total_num_tokens": 8153442.0, "z_loss": 0.004420141689479351 }, { "copy_logits_max": 0.1760975569486618, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.5625, "epoch": 0.006331376053101864, "gen_logits_max": 17.595046997070312, "gen_logits_mean": -0.06064918637275696, "gen_logits_min": -10.736461639404297, "gen_logits_std": 2.273881196975708, "gen_loss": 2.980567693710327, "grad_norm": 29.265285367634196, "learning_rate": 1.2244897959183673e-06, "loss": 3.1937, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7407780587673187, "mean_token_accuracy": 0.5408824533224106, "num_tokens": 8397955.0, "sample_num_tokens": 7556.25, "step": 31, "total_num_tokens": 8428180.0, "z_loss": 0.004861264955252409 }, { "copy_logits_max": 0.1787678450345993, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.5, "epoch": 0.006535613990298698, "gen_logits_max": 17.75771713256836, "gen_logits_mean": 0.11022302508354187, "gen_logits_min": -10.4576416015625, "gen_logits_std": 2.2746939659118652, "gen_loss": 2.735884666442871, "grad_norm": 27.46800744489276, "learning_rate": 1.2653061224489795e-06, "loss": 2.9299, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7525875866413116, "mean_token_accuracy": 0.569966271519661, "num_tokens": 8675524.0, "sample_num_tokens": 8101.0, "step": 32, "total_num_tokens": 8707928.0, "z_loss": 0.00473781768232584 }, { "copy_logits_max": 0.20596285164356232, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.5625, "epoch": 0.0067398519274955325, "gen_logits_max": 16.92922592163086, "gen_logits_mean": -0.5839678645133972, "gen_logits_min": -11.345216751098633, "gen_logits_std": 2.215341091156006, "gen_loss": 2.853631019592285, "grad_norm": 26.931679258943383, "learning_rate": 1.3061224489795917e-06, "loss": 2.9582, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.746116116642952, "mean_token_accuracy": 0.5437665283679962, "num_tokens": 8965283.0, "sample_num_tokens": 8420.25, "step": 33, "total_num_tokens": 8998964.0, "z_loss": 0.0043963599018752575 }, { "copy_logits_max": 0.19037404656410217, "copy_logits_min": -750000000.0, "copy_num_tokens": 703.0625, "epoch": 0.006944089864692367, "gen_logits_max": 14.538131713867188, "gen_logits_mean": -1.7795518636703491, "gen_logits_min": -12.701723098754883, "gen_logits_std": 2.3196663856506348, "gen_loss": 3.0772030353546143, "grad_norm": 21.213004889489294, "learning_rate": 1.346938775510204e-06, "loss": 2.6, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7482524514198303, "mean_token_accuracy": 0.5836998969316483, "num_tokens": 9224625.0, "sample_num_tokens": 10354.75, "step": 34, "total_num_tokens": 9266044.0, "z_loss": 0.0038729794323444366 }, { "copy_logits_max": 0.23595723509788513, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.4375, "epoch": 0.007148327801889201, "gen_logits_max": 17.165388107299805, "gen_logits_mean": -0.2967764735221863, "gen_logits_min": -10.830656051635742, "gen_logits_std": 2.2292840480804443, "gen_loss": 2.472027063369751, "grad_norm": 25.730129421600974, "learning_rate": 1.3877551020408162e-06, "loss": 3.0425, "mean_copy_accuracy": 0.0, "mean_gen_accuracy": 0.7376810163259506, "mean_token_accuracy": 0.5291746407747269, "num_tokens": 9518798.0, "sample_num_tokens": 7830.5, "step": 35, "total_num_tokens": 9550120.0, "z_loss": 0.00468211155384779 }, { "copy_logits_max": 0.2551490366458893, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.75, "epoch": 0.007352565739086036, "gen_logits_max": 16.787681579589844, "gen_logits_mean": -0.5677087306976318, "gen_logits_min": -10.956445693969727, "gen_logits_std": 2.2176971435546875, "gen_loss": 2.458376884460449, "grad_norm": 21.087685374800557, "learning_rate": 1.4285714285714286e-06, "loss": 2.7557, "mean_copy_accuracy": 3.723562622326426e-05, "mean_gen_accuracy": 0.7457600831985474, "mean_token_accuracy": 0.5523572117090225, "num_tokens": 9795837.0, "sample_num_tokens": 8768.75, "step": 36, "total_num_tokens": 9830912.0, "z_loss": 0.004380979109555483 }, { "copy_logits_max": 0.22108887135982513, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.4375, "epoch": 0.00755680367628287, "gen_logits_max": 13.919954299926758, "gen_logits_mean": -2.036656618118286, "gen_logits_min": -13.591475486755371, "gen_logits_std": 2.384538173675537, "gen_loss": 3.2249388694763184, "grad_norm": 23.353265373200813, "learning_rate": 1.4693877551020408e-06, "loss": 2.8407, "mean_copy_accuracy": 2.6241208615829237e-05, "mean_gen_accuracy": 0.7453801929950714, "mean_token_accuracy": 0.5386418551206589, "num_tokens": 10064312.0, "sample_num_tokens": 8047.5, "step": 37, "total_num_tokens": 10096502.0, "z_loss": 0.003673675935715437 }, { "copy_logits_max": 0.28824740648269653, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.4375, "epoch": 0.0077610416134797034, "gen_logits_max": 16.33595848083496, "gen_logits_mean": -1.0495940446853638, "gen_logits_min": -11.276924133300781, "gen_logits_std": 2.2035820484161377, "gen_loss": 2.2786664962768555, "grad_norm": 13.423871118445838, "learning_rate": 1.510204081632653e-06, "loss": 2.36, "mean_copy_accuracy": 0.00024198061146307737, "mean_gen_accuracy": 0.7542173117399216, "mean_token_accuracy": 0.5706998854875565, "num_tokens": 10327695.0, "sample_num_tokens": 9230.25, "step": 38, "total_num_tokens": 10364616.0, "z_loss": 0.004193628206849098 }, { "copy_logits_max": 0.2671026587486267, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.9375, "epoch": 0.007965279550676538, "gen_logits_max": 13.9749174118042, "gen_logits_mean": -2.631692886352539, "gen_logits_min": -13.816847801208496, "gen_logits_std": 2.3710639476776123, "gen_loss": 2.3242228031158447, "grad_norm": 9.177605748946535, "learning_rate": 1.5510204081632654e-06, "loss": 2.2018, "mean_copy_accuracy": 0.0015189698315225542, "mean_gen_accuracy": 0.7459445595741272, "mean_token_accuracy": 0.5762438029050827, "num_tokens": 10593687.0, "sample_num_tokens": 8534.75, "step": 39, "total_num_tokens": 10627826.0, "z_loss": 0.0035586701706051826 }, { "copy_logits_max": 0.3151508867740631, "copy_logits_min": -687500032.0, "copy_num_tokens": 388.4375, "epoch": 0.008169517487873373, "gen_logits_max": 15.182219505310059, "gen_logits_mean": -2.2029128074645996, "gen_logits_min": -13.261783599853516, "gen_logits_std": 2.3398795127868652, "gen_loss": 2.312443971633911, "grad_norm": 5.427075701211521, "learning_rate": 1.5918367346938775e-06, "loss": 2.2814, "mean_copy_accuracy": 0.003191183292074129, "mean_gen_accuracy": 0.7451813966035843, "mean_token_accuracy": 0.5580424219369888, "num_tokens": 10861842.0, "sample_num_tokens": 7597.5, "step": 40, "total_num_tokens": 10892232.0, "z_loss": 0.00372996274381876 }, { "copy_logits_max": 0.29441410303115845, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.6875, "epoch": 0.008373755425070207, "gen_logits_max": 14.387956619262695, "gen_logits_mean": -2.456757068634033, "gen_logits_min": -13.268865585327148, "gen_logits_std": 2.34305477142334, "gen_loss": 2.22420597076416, "grad_norm": 4.194138717818094, "learning_rate": 1.6326530612244897e-06, "loss": 2.1918, "mean_copy_accuracy": 0.004125061794184148, "mean_gen_accuracy": 0.7541363388299942, "mean_token_accuracy": 0.5701615512371063, "num_tokens": 11131691.0, "sample_num_tokens": 8282.25, "step": 41, "total_num_tokens": 11164820.0, "z_loss": 0.0034006708301603794 }, { "copy_logits_max": 0.3382081389427185, "copy_logits_min": -750000000.0, "copy_num_tokens": 307.0, "epoch": 0.008577993362267042, "gen_logits_max": 13.697223663330078, "gen_logits_mean": -3.189354181289673, "gen_logits_min": -14.431692123413086, "gen_logits_std": 2.364396572113037, "gen_loss": 1.9173020124435425, "grad_norm": 5.227512802136602, "learning_rate": 1.673469387755102e-06, "loss": 2.2381, "mean_copy_accuracy": 0.004992320551536977, "mean_gen_accuracy": 0.762908473610878, "mean_token_accuracy": 0.5612527281045914, "num_tokens": 11422795.0, "sample_num_tokens": 7098.25, "step": 42, "total_num_tokens": 11451188.0, "z_loss": 0.0031799995340406895 }, { "copy_logits_max": 0.3730608820915222, "copy_logits_min": -750000000.0, "copy_num_tokens": 696.25, "epoch": 0.008782231299463875, "gen_logits_max": 10.632061004638672, "gen_logits_mean": -5.513230323791504, "gen_logits_min": -16.661724090576172, "gen_logits_std": 2.3867807388305664, "gen_loss": 2.623044013977051, "grad_norm": 6.8553485412457285, "learning_rate": 1.7142857142857143e-06, "loss": 2.3119, "mean_copy_accuracy": 0.004803931748028845, "mean_gen_accuracy": 0.7516964226961136, "mean_token_accuracy": 0.549820065498352, "num_tokens": 11685028.0, "sample_num_tokens": 9645.0, "step": 43, "total_num_tokens": 11723608.0, "z_loss": 0.0022637443616986275 }, { "copy_logits_max": 0.36048927903175354, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.5625, "epoch": 0.00898646923666071, "gen_logits_max": 11.919404983520508, "gen_logits_mean": -4.431253433227539, "gen_logits_min": -15.702001571655273, "gen_logits_std": 2.3917760848999023, "gen_loss": 2.3639321327209473, "grad_norm": 8.244601856337827, "learning_rate": 1.7551020408163264e-06, "loss": 2.1976, "mean_copy_accuracy": 0.006901151267811656, "mean_gen_accuracy": 0.7492636442184448, "mean_token_accuracy": 0.5621085315942764, "num_tokens": 11962265.0, "sample_num_tokens": 8807.75, "step": 44, "total_num_tokens": 11997496.0, "z_loss": 0.0025096999015659094 }, { "copy_logits_max": 0.453457772731781, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.5625, "epoch": 0.009190707173857544, "gen_logits_max": 10.335043907165527, "gen_logits_mean": -6.095094680786133, "gen_logits_min": -16.867080688476562, "gen_logits_std": 2.332141876220703, "gen_loss": 2.5279996395111084, "grad_norm": 8.431706030803635, "learning_rate": 1.7959183673469388e-06, "loss": 2.1523, "mean_copy_accuracy": 0.006318146479316056, "mean_gen_accuracy": 0.7546356469392776, "mean_token_accuracy": 0.5747770071029663, "num_tokens": 12219426.0, "sample_num_tokens": 9222.0, "step": 45, "total_num_tokens": 12256314.0, "z_loss": 0.0021000674460083246 }, { "copy_logits_max": 0.3758019208908081, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.875, "epoch": 0.009394945111054378, "gen_logits_max": 13.003425598144531, "gen_logits_mean": -3.828381061553955, "gen_logits_min": -14.651461601257324, "gen_logits_std": 2.351402521133423, "gen_loss": 1.7210594415664673, "grad_norm": 11.379117772947618, "learning_rate": 1.836734693877551e-06, "loss": 2.0995, "mean_copy_accuracy": 0.007504499168135226, "mean_gen_accuracy": 0.7638255655765533, "mean_token_accuracy": 0.5912750959396362, "num_tokens": 12492994.0, "sample_num_tokens": 8314.5, "step": 46, "total_num_tokens": 12526252.0, "z_loss": 0.0029808166436851025 }, { "copy_logits_max": 0.3967694044113159, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.5625, "epoch": 0.009599183048251213, "gen_logits_max": 11.255477905273438, "gen_logits_mean": -4.80055046081543, "gen_logits_min": -16.071094512939453, "gen_logits_std": 2.3839738368988037, "gen_loss": 2.1713883876800537, "grad_norm": 12.840889792071637, "learning_rate": 1.8775510204081634e-06, "loss": 2.1795, "mean_copy_accuracy": 0.005328264087438583, "mean_gen_accuracy": 0.7700029015541077, "mean_token_accuracy": 0.5821642279624939, "num_tokens": 12765773.0, "sample_num_tokens": 8962.25, "step": 47, "total_num_tokens": 12801622.0, "z_loss": 0.0026343734934926033 }, { "copy_logits_max": 0.3753702640533447, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5, "epoch": 0.009803420985448046, "gen_logits_max": 11.660208702087402, "gen_logits_mean": -4.389382362365723, "gen_logits_min": -15.558262825012207, "gen_logits_std": 2.380446434020996, "gen_loss": 2.0952773094177246, "grad_norm": 9.82449078266781, "learning_rate": 1.9183673469387756e-06, "loss": 2.1401, "mean_copy_accuracy": 0.006632244447246194, "mean_gen_accuracy": 0.7484649121761322, "mean_token_accuracy": 0.5714273899793625, "num_tokens": 13023324.0, "sample_num_tokens": 9086.0, "step": 48, "total_num_tokens": 13059668.0, "z_loss": 0.002742484211921692 }, { "copy_logits_max": 0.332322895526886, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.3125, "epoch": 0.010007658922644882, "gen_logits_max": 11.112168312072754, "gen_logits_mean": -4.3170084953308105, "gen_logits_min": -15.566532135009766, "gen_logits_std": 2.4335975646972656, "gen_loss": 2.4714479446411133, "grad_norm": 9.783881796628487, "learning_rate": 1.959183673469388e-06, "loss": 2.2329, "mean_copy_accuracy": 0.005755227175541222, "mean_gen_accuracy": 0.752150297164917, "mean_token_accuracy": 0.5631617307662964, "num_tokens": 13300191.0, "sample_num_tokens": 9668.75, "step": 49, "total_num_tokens": 13338866.0, "z_loss": 0.0024844235740602016 }, { "copy_logits_max": 0.37605515122413635, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.5, "epoch": 0.010211896859841715, "gen_logits_max": 12.543835639953613, "gen_logits_mean": -4.2913970947265625, "gen_logits_min": -15.450634002685547, "gen_logits_std": 2.3683724403381348, "gen_loss": 1.9862715005874634, "grad_norm": 6.791514907160671, "learning_rate": 2e-06, "loss": 2.0018, "mean_copy_accuracy": 0.006322556408122182, "mean_gen_accuracy": 0.7616151720285416, "mean_token_accuracy": 0.5986615419387817, "num_tokens": 13574297.0, "sample_num_tokens": 8700.75, "step": 50, "total_num_tokens": 13609100.0, "z_loss": 0.00304853986017406 }, { "copy_logits_max": 0.43755292892456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.5625, "epoch": 0.01041613479703855, "gen_logits_max": 12.014739990234375, "gen_logits_mean": -5.015246391296387, "gen_logits_min": -16.070415496826172, "gen_logits_std": 2.3205502033233643, "gen_loss": 2.0009002685546875, "grad_norm": 6.37506178479508, "learning_rate": 2.0408163265306125e-06, "loss": 2.1026, "mean_copy_accuracy": 0.006218694965355098, "mean_gen_accuracy": 0.7647188007831573, "mean_token_accuracy": 0.5767951309680939, "num_tokens": 13855925.0, "sample_num_tokens": 6858.75, "step": 51, "total_num_tokens": 13883360.0, "z_loss": 0.0029250015504658222 }, { "copy_logits_max": 0.3695090413093567, "copy_logits_min": -687499968.0, "copy_num_tokens": 398.75, "epoch": 0.010620372734235384, "gen_logits_max": 12.783805847167969, "gen_logits_mean": -4.112650394439697, "gen_logits_min": -14.98923397064209, "gen_logits_std": 2.3277554512023926, "gen_loss": 1.9788283109664917, "grad_norm": 5.090602023246404, "learning_rate": 2.0816326530612247e-06, "loss": 2.0028, "mean_copy_accuracy": 0.0060896220384165645, "mean_gen_accuracy": 0.7696325182914734, "mean_token_accuracy": 0.5896810293197632, "num_tokens": 14121092.0, "sample_num_tokens": 8186.5, "step": 52, "total_num_tokens": 14153838.0, "z_loss": 0.003294185735285282 }, { "copy_logits_max": 0.3243092894554138, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.0625, "epoch": 0.010824610671432219, "gen_logits_max": 12.217062950134277, "gen_logits_mean": -4.340140342712402, "gen_logits_min": -15.762327194213867, "gen_logits_std": 2.3983988761901855, "gen_loss": 2.165098190307617, "grad_norm": 4.181903091973121, "learning_rate": 2.122448979591837e-06, "loss": 2.1802, "mean_copy_accuracy": 0.004962274921126664, "mean_gen_accuracy": 0.7669226676225662, "mean_token_accuracy": 0.5653284937143326, "num_tokens": 14414740.0, "sample_num_tokens": 8463.5, "step": 53, "total_num_tokens": 14448594.0, "z_loss": 0.003069302998483181 }, { "copy_logits_max": 0.34537261724472046, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0, "epoch": 0.011028848608629052, "gen_logits_max": 14.05003833770752, "gen_logits_mean": -3.3236265182495117, "gen_logits_min": -14.216756820678711, "gen_logits_std": 2.329352855682373, "gen_loss": 1.724212408065796, "grad_norm": 3.2303511774861597, "learning_rate": 2.163265306122449e-06, "loss": 2.0004, "mean_copy_accuracy": 0.006948238820768893, "mean_gen_accuracy": 0.77427077293396, "mean_token_accuracy": 0.5886626094579697, "num_tokens": 14700513.0, "sample_num_tokens": 8997.75, "step": 54, "total_num_tokens": 14736504.0, "z_loss": 0.0038739496376365423 }, { "copy_logits_max": 0.3201480507850647, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.0625, "epoch": 0.011233086545825888, "gen_logits_max": 12.152703285217285, "gen_logits_mean": -4.158370018005371, "gen_logits_min": -15.365289688110352, "gen_logits_std": 2.3469786643981934, "gen_loss": 2.073719024658203, "grad_norm": 2.6467630136152653, "learning_rate": 2.2040816326530616e-06, "loss": 1.9884, "mean_copy_accuracy": 0.006293621729128063, "mean_gen_accuracy": 0.7743992358446121, "mean_token_accuracy": 0.5916468799114227, "num_tokens": 14966038.0, "sample_num_tokens": 9262.5, "step": 55, "total_num_tokens": 15003088.0, "z_loss": 0.0033395199570804834 }, { "copy_logits_max": 0.2910640835762024, "copy_logits_min": -625000000.0, "copy_num_tokens": 528.6875, "epoch": 0.011437324483022721, "gen_logits_max": 12.676331520080566, "gen_logits_mean": -3.677168369293213, "gen_logits_min": -15.206951141357422, "gen_logits_std": 2.423225164413452, "gen_loss": 2.172508478164673, "grad_norm": 2.350863375687612, "learning_rate": 2.2448979591836734e-06, "loss": 1.93, "mean_copy_accuracy": 0.0058472586097195745, "mean_gen_accuracy": 0.7717824578285217, "mean_token_accuracy": 0.5963986217975616, "num_tokens": 15243182.0, "sample_num_tokens": 8264.5, "step": 56, "total_num_tokens": 15276240.0, "z_loss": 0.003528048750013113 }, { "copy_logits_max": 0.27108633518218994, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.4375, "epoch": 0.011641562420219556, "gen_logits_max": 13.220484733581543, "gen_logits_mean": -3.520643711090088, "gen_logits_min": -14.610040664672852, "gen_logits_std": 2.351140022277832, "gen_loss": 2.117520570755005, "grad_norm": 2.2152913479824816, "learning_rate": 2.285714285714286e-06, "loss": 1.9906, "mean_copy_accuracy": 0.006134759518317878, "mean_gen_accuracy": 0.772791251540184, "mean_token_accuracy": 0.5831224620342255, "num_tokens": 15524688.0, "sample_num_tokens": 9367.5, "step": 57, "total_num_tokens": 15562158.0, "z_loss": 0.003915813751518726 }, { "copy_logits_max": 0.2807992100715637, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.1875, "epoch": 0.01184580035741639, "gen_logits_max": 14.566445350646973, "gen_logits_mean": -2.743739604949951, "gen_logits_min": -14.98134994506836, "gen_logits_std": 2.4215927124023438, "gen_loss": 1.8697479963302612, "grad_norm": 1.9774678125711844, "learning_rate": 2.326530612244898e-06, "loss": 1.9268, "mean_copy_accuracy": 0.007063909957651049, "mean_gen_accuracy": 0.7775748074054718, "mean_token_accuracy": 0.5984036922454834, "num_tokens": 15802123.0, "sample_num_tokens": 7451.25, "step": 58, "total_num_tokens": 15831928.0, "z_loss": 0.004374993033707142 }, { "copy_logits_max": 0.28019028902053833, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.625, "epoch": 0.012050038294613225, "gen_logits_max": 15.962348937988281, "gen_logits_mean": -1.9918627738952637, "gen_logits_min": -13.093669891357422, "gen_logits_std": 2.350250005722046, "gen_loss": 1.706833004951477, "grad_norm": 2.008819224675325, "learning_rate": 2.36734693877551e-06, "loss": 2.0165, "mean_copy_accuracy": 0.005575795832555741, "mean_gen_accuracy": 0.7695398777723312, "mean_token_accuracy": 0.5823814868927002, "num_tokens": 16066270.0, "sample_num_tokens": 6562.5, "step": 59, "total_num_tokens": 16092520.0, "z_loss": 0.0047048418782651424 }, { "copy_logits_max": 0.26902222633361816, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.0, "epoch": 0.012254276231810059, "gen_logits_max": 14.888544082641602, "gen_logits_mean": -2.8207712173461914, "gen_logits_min": -14.186553955078125, "gen_logits_std": 2.3672609329223633, "gen_loss": 1.8256477117538452, "grad_norm": 1.9368934670763227, "learning_rate": 2.4081632653061225e-06, "loss": 1.7366, "mean_copy_accuracy": 0.009926793281920254, "mean_gen_accuracy": 0.776409775018692, "mean_token_accuracy": 0.6254546046257019, "num_tokens": 16316975.0, "sample_num_tokens": 7294.75, "step": 60, "total_num_tokens": 16346154.0, "z_loss": 0.004536774009466171 }, { "copy_logits_max": 0.2962132394313812, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.375, "epoch": 0.012458514169006892, "gen_logits_max": 15.658403396606445, "gen_logits_mean": -2.8882815837860107, "gen_logits_min": -14.190391540527344, "gen_logits_std": 2.3237357139587402, "gen_loss": 1.7048072814941406, "grad_norm": 1.860224847549994, "learning_rate": 2.4489795918367347e-06, "loss": 1.7966, "mean_copy_accuracy": 0.00643019110430032, "mean_gen_accuracy": 0.7888190746307373, "mean_token_accuracy": 0.6184335798025131, "num_tokens": 16568494.0, "sample_num_tokens": 7198.5, "step": 61, "total_num_tokens": 16597288.0, "z_loss": 0.004500327631831169 }, { "copy_logits_max": 0.27594634890556335, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.9375, "epoch": 0.012662752106203727, "gen_logits_max": 13.803407669067383, "gen_logits_mean": -3.3271677494049072, "gen_logits_min": -15.00541877746582, "gen_logits_std": 2.370197296142578, "gen_loss": 2.2288312911987305, "grad_norm": 1.815639116285586, "learning_rate": 2.489795918367347e-06, "loss": 1.7635, "mean_copy_accuracy": 0.008881627931259573, "mean_gen_accuracy": 0.7804757654666901, "mean_token_accuracy": 0.618374228477478, "num_tokens": 16826250.0, "sample_num_tokens": 9471.0, "step": 62, "total_num_tokens": 16864134.0, "z_loss": 0.004156989045441151 }, { "copy_logits_max": 0.3029296398162842, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.0, "epoch": 0.012866990043400561, "gen_logits_max": 14.752171516418457, "gen_logits_mean": -2.9625580310821533, "gen_logits_min": -14.377264976501465, "gen_logits_std": 2.297773838043213, "gen_loss": 1.9262446165084839, "grad_norm": 1.736451409154038, "learning_rate": 2.530612244897959e-06, "loss": 2.0945, "mean_copy_accuracy": 0.006006812211126089, "mean_gen_accuracy": 0.7827368080615997, "mean_token_accuracy": 0.5698300153017044, "num_tokens": 17106822.0, "sample_num_tokens": 8175.0, "step": 63, "total_num_tokens": 17139522.0, "z_loss": 0.004415543749928474 }, { "copy_logits_max": 0.27154266834259033, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.5, "epoch": 0.013071227980597396, "gen_logits_max": 15.446842193603516, "gen_logits_mean": -2.953843116760254, "gen_logits_min": -14.121468544006348, "gen_logits_std": 2.3207321166992188, "gen_loss": 1.6537760496139526, "grad_norm": 1.8859051167472307, "learning_rate": 2.5714285714285716e-06, "loss": 1.7587, "mean_copy_accuracy": 0.006521657109260559, "mean_gen_accuracy": 0.784865140914917, "mean_token_accuracy": 0.6271564811468124, "num_tokens": 17357888.0, "sample_num_tokens": 6637.0, "step": 64, "total_num_tokens": 17384436.0, "z_loss": 0.004577910993248224 }, { "copy_logits_max": 0.2523893713951111, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.0, "epoch": 0.01327546591779423, "gen_logits_max": 15.02393913269043, "gen_logits_mean": -2.682387351989746, "gen_logits_min": -14.034008026123047, "gen_logits_std": 2.3380165100097656, "gen_loss": 1.8666143417358398, "grad_norm": 1.736394257923023, "learning_rate": 2.6122448979591834e-06, "loss": 1.8965, "mean_copy_accuracy": 0.0068092833971604705, "mean_gen_accuracy": 0.7832990884780884, "mean_token_accuracy": 0.6027748286724091, "num_tokens": 17614170.0, "sample_num_tokens": 8052.0, "step": 65, "total_num_tokens": 17646378.0, "z_loss": 0.004907175432890654 }, { "copy_logits_max": 0.23300352692604065, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.9375, "epoch": 0.013479703854991065, "gen_logits_max": 16.36262321472168, "gen_logits_mean": -1.4607629776000977, "gen_logits_min": -12.530577659606934, "gen_logits_std": 2.335437297821045, "gen_loss": 1.5722568035125732, "grad_norm": 1.6717348138471324, "learning_rate": 2.653061224489796e-06, "loss": 1.947, "mean_copy_accuracy": 0.008702389081008732, "mean_gen_accuracy": 0.7835212349891663, "mean_token_accuracy": 0.5971561819314957, "num_tokens": 17891720.0, "sample_num_tokens": 7453.5, "step": 66, "total_num_tokens": 17921534.0, "z_loss": 0.005303751677274704 }, { "copy_logits_max": 0.26952385902404785, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.4375, "epoch": 0.013683941792187898, "gen_logits_max": 17.025279998779297, "gen_logits_mean": -1.7841260433197021, "gen_logits_min": -13.157672882080078, "gen_logits_std": 2.3525280952453613, "gen_loss": 1.6178419589996338, "grad_norm": 1.592336989789187, "learning_rate": 2.693877551020408e-06, "loss": 1.7532, "mean_copy_accuracy": 0.007569761713966727, "mean_gen_accuracy": 0.7833515405654907, "mean_token_accuracy": 0.6107648313045502, "num_tokens": 18175652.0, "sample_num_tokens": 7955.5, "step": 67, "total_num_tokens": 18207474.0, "z_loss": 0.005232574883848429 }, { "copy_logits_max": 0.20879702270030975, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.6875, "epoch": 0.013888179729384734, "gen_logits_max": 15.201114654541016, "gen_logits_mean": -2.279402256011963, "gen_logits_min": -14.216625213623047, "gen_logits_std": 2.387026309967041, "gen_loss": 1.7236237525939941, "grad_norm": 1.5357376659180104, "learning_rate": 2.7346938775510203e-06, "loss": 1.9326, "mean_copy_accuracy": 0.006873883539810777, "mean_gen_accuracy": 0.7910032421350479, "mean_token_accuracy": 0.5956061035394669, "num_tokens": 18460861.0, "sample_num_tokens": 10037.25, "step": 68, "total_num_tokens": 18501010.0, "z_loss": 0.004793708212673664 }, { "copy_logits_max": 0.23679384589195251, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.6875, "epoch": 0.014092417666581567, "gen_logits_max": 16.780065536499023, "gen_logits_mean": -2.079575300216675, "gen_logits_min": -13.582870483398438, "gen_logits_std": 2.3458449840545654, "gen_loss": 1.5540673732757568, "grad_norm": 1.5401489567634845, "learning_rate": 2.7755102040816325e-06, "loss": 1.9697, "mean_copy_accuracy": 0.008154670475050807, "mean_gen_accuracy": 0.7888696938753128, "mean_token_accuracy": 0.5827923715114594, "num_tokens": 18769213.0, "sample_num_tokens": 8392.75, "step": 69, "total_num_tokens": 18802784.0, "z_loss": 0.005312290508300066 }, { "copy_logits_max": 0.23521818220615387, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.75, "epoch": 0.014296655603778402, "gen_logits_max": 15.048111915588379, "gen_logits_mean": -2.996866226196289, "gen_logits_min": -14.429082870483398, "gen_logits_std": 2.3381247520446777, "gen_loss": 1.8624169826507568, "grad_norm": 1.5743020428981969, "learning_rate": 2.816326530612245e-06, "loss": 1.8411, "mean_copy_accuracy": 0.006671121111139655, "mean_gen_accuracy": 0.7957730442285538, "mean_token_accuracy": 0.6103444397449493, "num_tokens": 19026728.0, "sample_num_tokens": 9489.0, "step": 70, "total_num_tokens": 19064684.0, "z_loss": 0.004644765518605709 }, { "copy_logits_max": 0.21133042871952057, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.4375, "epoch": 0.014500893540975236, "gen_logits_max": 14.791900634765625, "gen_logits_mean": -2.3944804668426514, "gen_logits_min": -13.924467086791992, "gen_logits_std": 2.3874640464782715, "gen_loss": 1.98216712474823, "grad_norm": 1.8261120312053036, "learning_rate": 2.8571428571428573e-06, "loss": 1.8974, "mean_copy_accuracy": 0.0063593905651941895, "mean_gen_accuracy": 0.7896262407302856, "mean_token_accuracy": 0.6023283153772354, "num_tokens": 19289712.0, "sample_num_tokens": 8508.0, "step": 71, "total_num_tokens": 19323744.0, "z_loss": 0.0048521896824240685 }, { "copy_logits_max": 0.21494920551776886, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.375, "epoch": 0.014705131478172071, "gen_logits_max": 16.955753326416016, "gen_logits_mean": -1.662434697151184, "gen_logits_min": -13.33212661743164, "gen_logits_std": 2.3829598426818848, "gen_loss": 1.5068825483322144, "grad_norm": 1.5033821877866111, "learning_rate": 2.8979591836734694e-06, "loss": 1.6974, "mean_copy_accuracy": 0.0088197240838781, "mean_gen_accuracy": 0.7903143912553787, "mean_token_accuracy": 0.6252225786447525, "num_tokens": 19553561.0, "sample_num_tokens": 7696.75, "step": 72, "total_num_tokens": 19584348.0, "z_loss": 0.005274729337543249 }, { "copy_logits_max": 0.23926328122615814, "copy_logits_min": -687500032.0, "copy_num_tokens": 663.6875, "epoch": 0.014909369415368905, "gen_logits_max": 14.296074867248535, "gen_logits_mean": -3.440171957015991, "gen_logits_min": -14.877103805541992, "gen_logits_std": 2.3523902893066406, "gen_loss": 2.1859145164489746, "grad_norm": 1.4379256882460425, "learning_rate": 2.9387755102040816e-06, "loss": 1.8404, "mean_copy_accuracy": 0.005582665791735053, "mean_gen_accuracy": 0.7950621545314789, "mean_token_accuracy": 0.6063280403614044, "num_tokens": 19798831.0, "sample_num_tokens": 9755.75, "step": 73, "total_num_tokens": 19837854.0, "z_loss": 0.004276187624782324 }, { "copy_logits_max": 0.24309085309505463, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.0625, "epoch": 0.01511360735256574, "gen_logits_max": 16.67106819152832, "gen_logits_mean": -2.3599917888641357, "gen_logits_min": -13.758578300476074, "gen_logits_std": 2.3359203338623047, "gen_loss": 1.6676545143127441, "grad_norm": 1.3017108739941174, "learning_rate": 2.979591836734694e-06, "loss": 1.8504, "mean_copy_accuracy": 0.006877436651848257, "mean_gen_accuracy": 0.7976953089237213, "mean_token_accuracy": 0.6111768782138824, "num_tokens": 20080802.0, "sample_num_tokens": 9194.0, "step": 74, "total_num_tokens": 20117578.0, "z_loss": 0.005161330569535494 }, { "copy_logits_max": 0.22890035808086395, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.4375, "epoch": 0.015317845289762573, "gen_logits_max": 14.992313385009766, "gen_logits_mean": -2.822190523147583, "gen_logits_min": -14.14471435546875, "gen_logits_std": 2.3517873287200928, "gen_loss": 1.9339063167572021, "grad_norm": 1.3318801056125202, "learning_rate": 3.020408163265306e-06, "loss": 1.8892, "mean_copy_accuracy": 0.006323486915789545, "mean_gen_accuracy": 0.7902374863624573, "mean_token_accuracy": 0.5980061292648315, "num_tokens": 20345308.0, "sample_num_tokens": 9253.5, "step": 75, "total_num_tokens": 20382322.0, "z_loss": 0.004842181224375963 }, { "copy_logits_max": 0.2513517439365387, "copy_logits_min": -687499968.0, "copy_num_tokens": 485.25, "epoch": 0.015522083226959407, "gen_logits_max": 16.423107147216797, "gen_logits_mean": -2.640580654144287, "gen_logits_min": -14.695709228515625, "gen_logits_std": 2.3215954303741455, "gen_loss": 1.9450006484985352, "grad_norm": 2.3912576073966663, "learning_rate": 3.0612244897959185e-06, "loss": 1.9874, "mean_copy_accuracy": 0.0052425338653847575, "mean_gen_accuracy": 0.786719560623169, "mean_token_accuracy": 0.5818337947130203, "num_tokens": 20615557.0, "sample_num_tokens": 8817.25, "step": 76, "total_num_tokens": 20650826.0, "z_loss": 0.0052013457752764225 }, { "copy_logits_max": 0.23241540789604187, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.0625, "epoch": 0.015726321164156242, "gen_logits_max": 16.368749618530273, "gen_logits_mean": -2.213658332824707, "gen_logits_min": -14.492197036743164, "gen_logits_std": 2.3666160106658936, "gen_loss": 1.6857633590698242, "grad_norm": 1.2496847759895664, "learning_rate": 3.1020408163265307e-06, "loss": 1.8428, "mean_copy_accuracy": 0.007098816800862551, "mean_gen_accuracy": 0.8095733523368835, "mean_token_accuracy": 0.6201061755418777, "num_tokens": 20906994.0, "sample_num_tokens": 8726.0, "step": 77, "total_num_tokens": 20941898.0, "z_loss": 0.005252907983958721 }, { "copy_logits_max": 0.21852856874465942, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.8125, "epoch": 0.015930559101353076, "gen_logits_max": 14.610885620117188, "gen_logits_mean": -2.784465789794922, "gen_logits_min": -14.378002166748047, "gen_logits_std": 2.374756336212158, "gen_loss": 2.0880157947540283, "grad_norm": 1.6835643362490225, "learning_rate": 3.142857142857143e-06, "loss": 2.0434, "mean_copy_accuracy": 0.004962934355717152, "mean_gen_accuracy": 0.7938743531703949, "mean_token_accuracy": 0.579725980758667, "num_tokens": 21182690.0, "sample_num_tokens": 8578.5, "step": 78, "total_num_tokens": 21217004.0, "z_loss": 0.005041802302002907 }, { "copy_logits_max": 0.2327195107936859, "copy_logits_min": -625000000.0, "copy_num_tokens": 525.875, "epoch": 0.01613479703854991, "gen_logits_max": 15.58886432647705, "gen_logits_mean": -2.7925782203674316, "gen_logits_min": -14.465396881103516, "gen_logits_std": 2.359266757965088, "gen_loss": 1.8991237878799438, "grad_norm": 1.1954928025811318, "learning_rate": 3.183673469387755e-06, "loss": 1.8092, "mean_copy_accuracy": 0.007169452728703618, "mean_gen_accuracy": 0.8027073591947556, "mean_token_accuracy": 0.6151049435138702, "num_tokens": 21453165.0, "sample_num_tokens": 8832.75, "step": 79, "total_num_tokens": 21488496.0, "z_loss": 0.0050385016947984695 }, { "copy_logits_max": 0.2249172031879425, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.3125, "epoch": 0.016339034975746746, "gen_logits_max": 15.668265342712402, "gen_logits_mean": -2.5821871757507324, "gen_logits_min": -14.067635536193848, "gen_logits_std": 2.3805222511291504, "gen_loss": 1.7962872982025146, "grad_norm": 1.399188344910279, "learning_rate": 3.2244897959183677e-06, "loss": 1.7565, "mean_copy_accuracy": 0.006412941147573292, "mean_gen_accuracy": 0.7986547499895096, "mean_token_accuracy": 0.6134431660175323, "num_tokens": 21719456.0, "sample_num_tokens": 9181.0, "step": 80, "total_num_tokens": 21756180.0, "z_loss": 0.005009463056921959 }, { "copy_logits_max": 0.20695438981056213, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.25, "epoch": 0.01654327291294358, "gen_logits_max": 15.796799659729004, "gen_logits_mean": -2.0267181396484375, "gen_logits_min": -13.952568054199219, "gen_logits_std": 2.411449909210205, "gen_loss": 1.8893955945968628, "grad_norm": 1.077731773704304, "learning_rate": 3.2653061224489794e-06, "loss": 1.8155, "mean_copy_accuracy": 0.006386213703081012, "mean_gen_accuracy": 0.7999549806118011, "mean_token_accuracy": 0.6049928218126297, "num_tokens": 21999430.0, "sample_num_tokens": 7269.0, "step": 81, "total_num_tokens": 22028506.0, "z_loss": 0.0051287692040205 }, { "copy_logits_max": 0.2500241994857788, "copy_logits_min": -750000064.0, "copy_num_tokens": 384.25, "epoch": 0.016747510850140413, "gen_logits_max": 16.098960876464844, "gen_logits_mean": -2.7433462142944336, "gen_logits_min": -14.387664794921875, "gen_logits_std": 2.316431760787964, "gen_loss": 1.6426857709884644, "grad_norm": 1.1570137772531388, "learning_rate": 3.306122448979592e-06, "loss": 1.8665, "mean_copy_accuracy": 0.006496565416455269, "mean_gen_accuracy": 0.7962946891784668, "mean_token_accuracy": 0.6044894754886627, "num_tokens": 22269717.0, "sample_num_tokens": 8178.25, "step": 82, "total_num_tokens": 22302430.0, "z_loss": 0.005024137906730175 }, { "copy_logits_max": 0.2715167701244354, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.3125, "epoch": 0.016951748787337247, "gen_logits_max": 16.257699966430664, "gen_logits_mean": -2.9621024131774902, "gen_logits_min": -14.67983627319336, "gen_logits_std": 2.297214984893799, "gen_loss": 1.7410249710083008, "grad_norm": 1.053683097423648, "learning_rate": 3.346938775510204e-06, "loss": 1.9242, "mean_copy_accuracy": 0.006165514816530049, "mean_gen_accuracy": 0.8068076372146606, "mean_token_accuracy": 0.5973591804504395, "num_tokens": 22563571.0, "sample_num_tokens": 9121.75, "step": 83, "total_num_tokens": 22600058.0, "z_loss": 0.005096848588436842 }, { "copy_logits_max": 0.21765199303627014, "copy_logits_min": -687500032.0, "copy_num_tokens": 665.0, "epoch": 0.017155986724534084, "gen_logits_max": 15.043432235717773, "gen_logits_mean": -3.1628646850585938, "gen_logits_min": -15.0199613571167, "gen_logits_std": 2.4089877605438232, "gen_loss": 2.2071468830108643, "grad_norm": 1.013980744510412, "learning_rate": 3.3877551020408164e-06, "loss": 1.8924, "mean_copy_accuracy": 0.005259880679659545, "mean_gen_accuracy": 0.7948393076658249, "mean_token_accuracy": 0.5973489880561829, "num_tokens": 22836642.0, "sample_num_tokens": 9436.5, "step": 84, "total_num_tokens": 22874388.0, "z_loss": 0.004778986796736717 }, { "copy_logits_max": 0.2737056016921997, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.1875, "epoch": 0.017360224661730917, "gen_logits_max": 15.166851043701172, "gen_logits_mean": -3.436777114868164, "gen_logits_min": -15.155643463134766, "gen_logits_std": 2.372462034225464, "gen_loss": 2.090043067932129, "grad_norm": 1.2096903648326485, "learning_rate": 3.4285714285714285e-06, "loss": 2.0203, "mean_copy_accuracy": 0.005532322102226317, "mean_gen_accuracy": 0.7980922907590866, "mean_token_accuracy": 0.5814859122037888, "num_tokens": 23128876.0, "sample_num_tokens": 7398.5, "step": 85, "total_num_tokens": 23158470.0, "z_loss": 0.0046770088374614716 }, { "copy_logits_max": 0.22343382239341736, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.6875, "epoch": 0.01756446259892775, "gen_logits_max": 16.872936248779297, "gen_logits_mean": -1.8513312339782715, "gen_logits_min": -13.286677360534668, "gen_logits_std": 2.3366799354553223, "gen_loss": 1.5007855892181396, "grad_norm": 1.109283371382659, "learning_rate": 3.469387755102041e-06, "loss": 1.6989, "mean_copy_accuracy": 0.0064699233043938875, "mean_gen_accuracy": 0.8065247237682343, "mean_token_accuracy": 0.6363362222909927, "num_tokens": 23375350.0, "sample_num_tokens": 7885.5, "step": 86, "total_num_tokens": 23406892.0, "z_loss": 0.005369802936911583 }, { "copy_logits_max": 0.25587767362594604, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.6875, "epoch": 0.017768700536124584, "gen_logits_max": 15.274674415588379, "gen_logits_mean": -3.1954221725463867, "gen_logits_min": -15.46343994140625, "gen_logits_std": 2.379456043243408, "gen_loss": 1.9064775705337524, "grad_norm": 1.0283657415607532, "learning_rate": 3.510204081632653e-06, "loss": 1.8915, "mean_copy_accuracy": 0.005455719889141619, "mean_gen_accuracy": 0.799133688211441, "mean_token_accuracy": 0.5987005829811096, "num_tokens": 23636569.0, "sample_num_tokens": 8615.25, "step": 87, "total_num_tokens": 23671030.0, "z_loss": 0.005060550756752491 }, { "copy_logits_max": 0.2287471741437912, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.9375, "epoch": 0.01797293847332142, "gen_logits_max": 15.256936073303223, "gen_logits_mean": -2.6315999031066895, "gen_logits_min": -14.390693664550781, "gen_logits_std": 2.3697381019592285, "gen_loss": 1.9393482208251953, "grad_norm": 0.8833982905604051, "learning_rate": 3.5510204081632655e-06, "loss": 1.8873, "mean_copy_accuracy": 0.006928324582986534, "mean_gen_accuracy": 0.8088514655828476, "mean_token_accuracy": 0.6068081706762314, "num_tokens": 23939656.0, "sample_num_tokens": 11204.0, "step": 88, "total_num_tokens": 23984472.0, "z_loss": 0.0049246521666646 }, { "copy_logits_max": 0.24247056245803833, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 0.018177176410518255, "gen_logits_max": 15.47488784790039, "gen_logits_mean": -3.0399129390716553, "gen_logits_min": -15.23694896697998, "gen_logits_std": 2.404982805252075, "gen_loss": 1.8881949186325073, "grad_norm": 0.9760809355204123, "learning_rate": 3.5918367346938777e-06, "loss": 1.9274, "mean_copy_accuracy": 0.005550091154873371, "mean_gen_accuracy": 0.8044985383749008, "mean_token_accuracy": 0.6015517264604568, "num_tokens": 24232472.0, "sample_num_tokens": 7070.5, "step": 89, "total_num_tokens": 24260754.0, "z_loss": 0.004979471210390329 }, { "copy_logits_max": 0.2513766288757324, "copy_logits_min": -687499968.0, "copy_num_tokens": 732.875, "epoch": 0.018381414347715088, "gen_logits_max": 14.530845642089844, "gen_logits_mean": -3.4539008140563965, "gen_logits_min": -15.086888313293457, "gen_logits_std": 2.364557981491089, "gen_loss": 2.1273579597473145, "grad_norm": 0.9794075235136688, "learning_rate": 3.63265306122449e-06, "loss": 1.5934, "mean_copy_accuracy": 0.00594909826759249, "mean_gen_accuracy": 0.8170244544744492, "mean_token_accuracy": 0.6526192277669907, "num_tokens": 24493058.0, "sample_num_tokens": 11029.0, "step": 90, "total_num_tokens": 24537174.0, "z_loss": 0.004499673843383789 }, { "copy_logits_max": 0.26213765144348145, "copy_logits_min": -750000000.0, "copy_num_tokens": 234.5625, "epoch": 0.01858565228491192, "gen_logits_max": 17.926265716552734, "gen_logits_mean": -1.492379069328308, "gen_logits_min": -13.259575843811035, "gen_logits_std": 2.3457343578338623, "gen_loss": 1.2662510871887207, "grad_norm": 1.0239776730418348, "learning_rate": 3.673469387755102e-06, "loss": 1.7751, "mean_copy_accuracy": 0.0056491048308089375, "mean_gen_accuracy": 0.8118876665830612, "mean_token_accuracy": 0.6216405183076859, "num_tokens": 24781226.0, "sample_num_tokens": 6505.0, "step": 91, "total_num_tokens": 24807246.0, "z_loss": 0.005726979114115238 }, { "copy_logits_max": 0.2620519995689392, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.75, "epoch": 0.018789890222108755, "gen_logits_max": 15.77545166015625, "gen_logits_mean": -2.5531697273254395, "gen_logits_min": -14.00265121459961, "gen_logits_std": 2.3462462425231934, "gen_loss": 1.7059438228607178, "grad_norm": 1.0102324550060362, "learning_rate": 3.7142857142857146e-06, "loss": 1.8234, "mean_copy_accuracy": 0.006046626600436866, "mean_gen_accuracy": 0.8037054687738419, "mean_token_accuracy": 0.6109449416399002, "num_tokens": 25038401.0, "sample_num_tokens": 8114.25, "step": 92, "total_num_tokens": 25070858.0, "z_loss": 0.005080725532025099 }, { "copy_logits_max": 0.241506889462471, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.5625, "epoch": 0.018994128159305592, "gen_logits_max": 16.117507934570312, "gen_logits_mean": -1.9874147176742554, "gen_logits_min": -13.40537166595459, "gen_logits_std": 2.337139129638672, "gen_loss": 1.7019214630126953, "grad_norm": 1.0849265665114283, "learning_rate": 3.7551020408163268e-06, "loss": 1.8239, "mean_copy_accuracy": 0.006705488543957472, "mean_gen_accuracy": 0.8074507117271423, "mean_token_accuracy": 0.6104612648487091, "num_tokens": 25324179.0, "sample_num_tokens": 8519.25, "step": 93, "total_num_tokens": 25358256.0, "z_loss": 0.005172316916286945 }, { "copy_logits_max": 0.28681957721710205, "copy_logits_min": -625000000.0, "copy_num_tokens": 942.375, "epoch": 0.019198366096502426, "gen_logits_max": 13.196821212768555, "gen_logits_mean": -4.481657981872559, "gen_logits_min": -16.106521606445312, "gen_logits_std": 2.375662326812744, "gen_loss": 2.5529656410217285, "grad_norm": 1.0468415419709896, "learning_rate": 3.7959183673469385e-06, "loss": 2.0394, "mean_copy_accuracy": 0.005021630029659718, "mean_gen_accuracy": 0.8042506873607635, "mean_token_accuracy": 0.5796473324298859, "num_tokens": 25598639.0, "sample_num_tokens": 11009.75, "step": 94, "total_num_tokens": 25642678.0, "z_loss": 0.004126415587961674 }, { "copy_logits_max": 0.27908089756965637, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.75, "epoch": 0.01940260403369926, "gen_logits_max": 15.968863487243652, "gen_logits_mean": -2.364475965499878, "gen_logits_min": -13.813739776611328, "gen_logits_std": 2.3498077392578125, "gen_loss": 1.7721202373504639, "grad_norm": 1.7034783477813846, "learning_rate": 3.836734693877551e-06, "loss": 1.7987, "mean_copy_accuracy": 0.00569134543184191, "mean_gen_accuracy": 0.8068333566188812, "mean_token_accuracy": 0.6117137968540192, "num_tokens": 25858037.0, "sample_num_tokens": 7567.75, "step": 95, "total_num_tokens": 25888308.0, "z_loss": 0.005350516177713871 }, { "copy_logits_max": 0.31039220094680786, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.75, "epoch": 0.019606841970896093, "gen_logits_max": 14.804826736450195, "gen_logits_mean": -3.6259231567382812, "gen_logits_min": -15.753438949584961, "gen_logits_std": 2.3909430503845215, "gen_loss": 2.0817461013793945, "grad_norm": 1.0247167779707422, "learning_rate": 3.877551020408164e-06, "loss": 1.8976, "mean_copy_accuracy": 0.006233674474060535, "mean_gen_accuracy": 0.8092396259307861, "mean_token_accuracy": 0.6027213633060455, "num_tokens": 26126154.0, "sample_num_tokens": 8208.0, "step": 96, "total_num_tokens": 26158986.0, "z_loss": 0.004707840271294117 }, { "copy_logits_max": 0.3095499575138092, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.4375, "epoch": 0.01981107990809293, "gen_logits_max": 15.99781322479248, "gen_logits_mean": -3.11993408203125, "gen_logits_min": -14.75420093536377, "gen_logits_std": 2.3497893810272217, "gen_loss": 1.7615922689437866, "grad_norm": 1.079385676529868, "learning_rate": 3.918367346938776e-06, "loss": 1.8922, "mean_copy_accuracy": 0.007754598860628903, "mean_gen_accuracy": 0.8014644235372543, "mean_token_accuracy": 0.6005775779485703, "num_tokens": 26396556.0, "sample_num_tokens": 8929.0, "step": 97, "total_num_tokens": 26432272.0, "z_loss": 0.004999426193535328 }, { "copy_logits_max": 0.2950288653373718, "copy_logits_min": -687499904.0, "copy_num_tokens": 503.125, "epoch": 0.020015317845289763, "gen_logits_max": 15.784894943237305, "gen_logits_mean": -3.0141515731811523, "gen_logits_min": -14.916322708129883, "gen_logits_std": 2.385923385620117, "gen_loss": 1.828289270401001, "grad_norm": 1.2133384383105708, "learning_rate": 3.959183673469387e-06, "loss": 1.9353, "mean_copy_accuracy": 0.004599862557370216, "mean_gen_accuracy": 0.7929174304008484, "mean_token_accuracy": 0.5950381904840469, "num_tokens": 26670619.0, "sample_num_tokens": 9791.75, "step": 98, "total_num_tokens": 26709786.0, "z_loss": 0.005102065857499838 }, { "copy_logits_max": 0.28519779443740845, "copy_logits_min": -624999936.0, "copy_num_tokens": 538.0625, "epoch": 0.020219555782486597, "gen_logits_max": 16.622356414794922, "gen_logits_mean": -1.9886929988861084, "gen_logits_min": -13.615019798278809, "gen_logits_std": 2.368690013885498, "gen_loss": 1.7964061498641968, "grad_norm": 1.0981650109604053, "learning_rate": 4e-06, "loss": 1.8402, "mean_copy_accuracy": 0.005338207120075822, "mean_gen_accuracy": 0.8077354729175568, "mean_token_accuracy": 0.6078235507011414, "num_tokens": 26944900.0, "sample_num_tokens": 9614.5, "step": 99, "total_num_tokens": 26983358.0, "z_loss": 0.005203167907893658 }, { "copy_logits_max": 0.2626984715461731, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.375, "epoch": 0.02042379371968343, "gen_logits_max": 15.111788749694824, "gen_logits_mean": -2.6818132400512695, "gen_logits_min": -14.274892807006836, "gen_logits_std": 2.3777809143066406, "gen_loss": 1.9355825185775757, "grad_norm": 1.1223768087504544, "learning_rate": 4.040816326530612e-06, "loss": 1.752, "mean_copy_accuracy": 0.006476277252659202, "mean_gen_accuracy": 0.808905303478241, "mean_token_accuracy": 0.6250621676445007, "num_tokens": 27220477.0, "sample_num_tokens": 8902.75, "step": 100, "total_num_tokens": 27256088.0, "z_loss": 0.004849182907491922 }, { "copy_logits_max": 0.33011746406555176, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.875, "epoch": 0.020628031656880267, "gen_logits_max": 16.848222732543945, "gen_logits_mean": -2.418602466583252, "gen_logits_min": -14.082497596740723, "gen_logits_std": 2.3436813354492188, "gen_loss": 1.652030348777771, "grad_norm": 0.8502701804514802, "learning_rate": 4.081632653061225e-06, "loss": 1.8581, "mean_copy_accuracy": 0.005384935240726918, "mean_gen_accuracy": 0.8109974414110184, "mean_token_accuracy": 0.6098534911870956, "num_tokens": 27506813.0, "sample_num_tokens": 8328.75, "step": 101, "total_num_tokens": 27540128.0, "z_loss": 0.005418155342340469 }, { "copy_logits_max": 0.32117611169815063, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.3125, "epoch": 0.0208322695940771, "gen_logits_max": 16.72113037109375, "gen_logits_mean": -2.6774942874908447, "gen_logits_min": -14.386058807373047, "gen_logits_std": 2.3648018836975098, "gen_loss": 1.6967741250991821, "grad_norm": 0.8690751411041476, "learning_rate": 4.122448979591837e-06, "loss": 1.9253, "mean_copy_accuracy": 0.007091630715876818, "mean_gen_accuracy": 0.8029487878084183, "mean_token_accuracy": 0.596234992146492, "num_tokens": 27780998.0, "sample_num_tokens": 8532.0, "step": 102, "total_num_tokens": 27815126.0, "z_loss": 0.0052591124549508095 }, { "copy_logits_max": 0.31170228123664856, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.5625, "epoch": 0.021036507531273934, "gen_logits_max": 15.029852867126465, "gen_logits_mean": -3.068275213241577, "gen_logits_min": -14.685498237609863, "gen_logits_std": 2.3425421714782715, "gen_loss": 1.7988775968551636, "grad_norm": 0.8106363938763662, "learning_rate": 4.163265306122449e-06, "loss": 1.6851, "mean_copy_accuracy": 0.008268411504104733, "mean_gen_accuracy": 0.8136473894119263, "mean_token_accuracy": 0.6348376721143723, "num_tokens": 28060879.0, "sample_num_tokens": 8305.75, "step": 103, "total_num_tokens": 28094102.0, "z_loss": 0.0048975227400660515 }, { "copy_logits_max": 0.2986694276332855, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.625, "epoch": 0.021240745468470767, "gen_logits_max": 15.768790245056152, "gen_logits_mean": -2.4798035621643066, "gen_logits_min": -13.99229621887207, "gen_logits_std": 2.3595669269561768, "gen_loss": 1.8269612789154053, "grad_norm": 0.9995822350830129, "learning_rate": 4.204081632653061e-06, "loss": 1.7301, "mean_copy_accuracy": 0.005994886625558138, "mean_gen_accuracy": 0.8132375031709671, "mean_token_accuracy": 0.616353914141655, "num_tokens": 28323888.0, "sample_num_tokens": 7433.0, "step": 104, "total_num_tokens": 28353620.0, "z_loss": 0.00529879005625844 }, { "copy_logits_max": 0.3443242013454437, "copy_logits_min": -687500032.0, "copy_num_tokens": 330.4375, "epoch": 0.021444983405667604, "gen_logits_max": 17.58780860900879, "gen_logits_mean": -1.9546217918395996, "gen_logits_min": -13.792943000793457, "gen_logits_std": 2.366382122039795, "gen_loss": 1.3464783430099487, "grad_norm": 0.7954815241283436, "learning_rate": 4.244897959183674e-06, "loss": 1.6706, "mean_copy_accuracy": 0.00940381980035454, "mean_gen_accuracy": 0.8129797130823135, "mean_token_accuracy": 0.6395962834358215, "num_tokens": 28575960.0, "sample_num_tokens": 8911.5, "step": 105, "total_num_tokens": 28611606.0, "z_loss": 0.0058172899298369884 }, { "copy_logits_max": 0.3772084712982178, "copy_logits_min": -687500032.0, "copy_num_tokens": 770.1875, "epoch": 0.021649221342864438, "gen_logits_max": 14.856880187988281, "gen_logits_mean": -4.007568359375, "gen_logits_min": -15.746488571166992, "gen_logits_std": 2.3634326457977295, "gen_loss": 2.052290439605713, "grad_norm": 0.8435958773874317, "learning_rate": 4.2857142857142855e-06, "loss": 1.8338, "mean_copy_accuracy": 0.007495202822610736, "mean_gen_accuracy": 0.813751608133316, "mean_token_accuracy": 0.6107572913169861, "num_tokens": 28842002.0, "sample_num_tokens": 10465.0, "step": 106, "total_num_tokens": 28883862.0, "z_loss": 0.004592191893607378 }, { "copy_logits_max": 0.27208301424980164, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.875, "epoch": 0.02185345928006127, "gen_logits_max": 16.20379638671875, "gen_logits_mean": -1.8095743656158447, "gen_logits_min": -14.149658203125, "gen_logits_std": 2.4164438247680664, "gen_loss": 1.5631294250488281, "grad_norm": 0.8654450892127116, "learning_rate": 4.326530612244898e-06, "loss": 1.7682, "mean_copy_accuracy": 0.008276042179204524, "mean_gen_accuracy": 0.8103573024272919, "mean_token_accuracy": 0.6214407086372375, "num_tokens": 29132948.0, "sample_num_tokens": 9447.0, "step": 107, "total_num_tokens": 29170736.0, "z_loss": 0.005262289196252823 }, { "copy_logits_max": 0.36516982316970825, "copy_logits_min": -624999936.0, "copy_num_tokens": 566.375, "epoch": 0.022057697217258105, "gen_logits_max": 13.883743286132812, "gen_logits_mean": -4.398077011108398, "gen_logits_min": -16.016780853271484, "gen_logits_std": 2.3787002563476562, "gen_loss": 2.240591287612915, "grad_norm": 0.9248137614984719, "learning_rate": 4.367346938775511e-06, "loss": 1.9541, "mean_copy_accuracy": 0.0051296616438776255, "mean_gen_accuracy": 0.8085129708051682, "mean_token_accuracy": 0.5924995839595795, "num_tokens": 29398892.0, "sample_num_tokens": 8466.5, "step": 108, "total_num_tokens": 29432758.0, "z_loss": 0.0045992908999323845 }, { "copy_logits_max": 0.36763447523117065, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.8125, "epoch": 0.02226193515445494, "gen_logits_max": 15.339609146118164, "gen_logits_mean": -3.4129998683929443, "gen_logits_min": -15.101006507873535, "gen_logits_std": 2.4061222076416016, "gen_loss": 2.0008373260498047, "grad_norm": 0.7605672965314675, "learning_rate": 4.408163265306123e-06, "loss": 1.8389, "mean_copy_accuracy": 0.006548550445586443, "mean_gen_accuracy": 0.8129734396934509, "mean_token_accuracy": 0.6152309030294418, "num_tokens": 29670837.0, "sample_num_tokens": 8534.75, "step": 109, "total_num_tokens": 29704976.0, "z_loss": 0.004948443733155727 }, { "copy_logits_max": 0.3327622413635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.375, "epoch": 0.022466173091651775, "gen_logits_max": 14.993467330932617, "gen_logits_mean": -3.0170483589172363, "gen_logits_min": -14.741717338562012, "gen_logits_std": 2.3938727378845215, "gen_loss": 2.0200557708740234, "grad_norm": 1.4154716176703217, "learning_rate": 4.448979591836735e-06, "loss": 1.7423, "mean_copy_accuracy": 0.007265674299560487, "mean_gen_accuracy": 0.8158100545406342, "mean_token_accuracy": 0.6326255947351456, "num_tokens": 29935554.0, "sample_num_tokens": 8689.5, "step": 110, "total_num_tokens": 29970312.0, "z_loss": 0.004903031047433615 }, { "copy_logits_max": 0.3253996670246124, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.1875, "epoch": 0.02267041102884861, "gen_logits_max": 15.649758338928223, "gen_logits_mean": -3.197464942932129, "gen_logits_min": -14.875080108642578, "gen_logits_std": 2.389925956726074, "gen_loss": 1.736518144607544, "grad_norm": 0.7348410393281447, "learning_rate": 4.489795918367347e-06, "loss": 1.8804, "mean_copy_accuracy": 0.006812068051658571, "mean_gen_accuracy": 0.817378580570221, "mean_token_accuracy": 0.605778694152832, "num_tokens": 30217644.0, "sample_num_tokens": 8622.0, "step": 111, "total_num_tokens": 30252132.0, "z_loss": 0.005083741620182991 }, { "copy_logits_max": 0.3717939257621765, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.6875, "epoch": 0.022874648966045442, "gen_logits_max": 15.218963623046875, "gen_logits_mean": -3.409083366394043, "gen_logits_min": -15.20867919921875, "gen_logits_std": 2.410067558288574, "gen_loss": 1.9282578229904175, "grad_norm": 0.8637748742288223, "learning_rate": 4.530612244897959e-06, "loss": 1.8871, "mean_copy_accuracy": 0.005954586900770664, "mean_gen_accuracy": 0.8159853219985962, "mean_token_accuracy": 0.6033642739057541, "num_tokens": 30510772.0, "sample_num_tokens": 9838.5, "step": 112, "total_num_tokens": 30550126.0, "z_loss": 0.004777314607053995 }, { "copy_logits_max": 0.3863529860973358, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.875, "epoch": 0.023078886903242276, "gen_logits_max": 18.038551330566406, "gen_logits_mean": -2.366307258605957, "gen_logits_min": -13.930316925048828, "gen_logits_std": 2.3340697288513184, "gen_loss": 1.484809160232544, "grad_norm": 0.7017194846161393, "learning_rate": 4.571428571428572e-06, "loss": 1.5809, "mean_copy_accuracy": 0.009394244523718953, "mean_gen_accuracy": 0.8206883817911148, "mean_token_accuracy": 0.6473643332719803, "num_tokens": 30779336.0, "sample_num_tokens": 8490.5, "step": 113, "total_num_tokens": 30813298.0, "z_loss": 0.0057066017761826515 }, { "copy_logits_max": 0.36824649572372437, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.25, "epoch": 0.023283124840439113, "gen_logits_max": 17.158611297607422, "gen_logits_mean": -2.4160594940185547, "gen_logits_min": -14.117355346679688, "gen_logits_std": 2.362607955932617, "gen_loss": 1.5275273323059082, "grad_norm": 0.923869955798217, "learning_rate": 4.612244897959184e-06, "loss": 1.8173, "mean_copy_accuracy": 0.0068750971695408225, "mean_gen_accuracy": 0.8189477324485779, "mean_token_accuracy": 0.6146019995212555, "num_tokens": 31066179.0, "sample_num_tokens": 7370.25, "step": 114, "total_num_tokens": 31095660.0, "z_loss": 0.005370710976421833 }, { "copy_logits_max": 0.34842565655708313, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.3125, "epoch": 0.023487362777635946, "gen_logits_max": 16.619064331054688, "gen_logits_mean": -2.604813575744629, "gen_logits_min": -14.640246391296387, "gen_logits_std": 2.392782688140869, "gen_loss": 1.6670870780944824, "grad_norm": 1.1151968478388088, "learning_rate": 4.653061224489796e-06, "loss": 1.9127, "mean_copy_accuracy": 0.00632823898922652, "mean_gen_accuracy": 0.8153856992721558, "mean_token_accuracy": 0.6016408503055573, "num_tokens": 31348369.0, "sample_num_tokens": 8698.75, "step": 115, "total_num_tokens": 31383164.0, "z_loss": 0.005266408436000347 }, { "copy_logits_max": 0.39084088802337646, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.1875, "epoch": 0.02369160071483278, "gen_logits_max": 17.013568878173828, "gen_logits_mean": -3.119877338409424, "gen_logits_min": -15.06803035736084, "gen_logits_std": 2.36704683303833, "gen_loss": 1.65847909450531, "grad_norm": 0.7816522056686527, "learning_rate": 4.693877551020409e-06, "loss": 1.9825, "mean_copy_accuracy": 0.006531369173899293, "mean_gen_accuracy": 0.8116834759712219, "mean_token_accuracy": 0.5881875604391098, "num_tokens": 31632299.0, "sample_num_tokens": 7867.75, "step": 116, "total_num_tokens": 31663770.0, "z_loss": 0.005414796061813831 }, { "copy_logits_max": 0.38006889820098877, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.8125, "epoch": 0.023895838652029613, "gen_logits_max": 17.000505447387695, "gen_logits_mean": -2.5059192180633545, "gen_logits_min": -14.234354972839355, "gen_logits_std": 2.385425090789795, "gen_loss": 1.5533783435821533, "grad_norm": 0.7618503671292521, "learning_rate": 4.73469387755102e-06, "loss": 1.7165, "mean_copy_accuracy": 0.008532611420378089, "mean_gen_accuracy": 0.8081308752298355, "mean_token_accuracy": 0.6266613751649857, "num_tokens": 31888591.0, "sample_num_tokens": 7380.25, "step": 117, "total_num_tokens": 31918112.0, "z_loss": 0.005597083829343319 }, { "copy_logits_max": 0.36539584398269653, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.3125, "epoch": 0.02410007658922645, "gen_logits_max": 17.764915466308594, "gen_logits_mean": -2.082509994506836, "gen_logits_min": -13.73103141784668, "gen_logits_std": 2.3709919452667236, "gen_loss": 1.371803641319275, "grad_norm": 0.9291379470727597, "learning_rate": 4.775510204081632e-06, "loss": 1.7759, "mean_copy_accuracy": 0.00831518485210836, "mean_gen_accuracy": 0.8129763007164001, "mean_token_accuracy": 0.6249655485153198, "num_tokens": 32155437.0, "sample_num_tokens": 7357.75, "step": 118, "total_num_tokens": 32184868.0, "z_loss": 0.00571560999378562 }, { "copy_logits_max": 0.3388817012310028, "copy_logits_min": -625000000.0, "copy_num_tokens": 580.6875, "epoch": 0.024304314526423284, "gen_logits_max": 14.969828605651855, "gen_logits_mean": -3.5937907695770264, "gen_logits_min": -15.386434555053711, "gen_logits_std": 2.4133033752441406, "gen_loss": 1.876824140548706, "grad_norm": 0.748889564189635, "learning_rate": 4.816326530612245e-06, "loss": 1.8555, "mean_copy_accuracy": 0.009268363239243627, "mean_gen_accuracy": 0.8244404047727585, "mean_token_accuracy": 0.6132529601454735, "num_tokens": 32428404.0, "sample_num_tokens": 9802.0, "step": 119, "total_num_tokens": 32467612.0, "z_loss": 0.004764518700540066 }, { "copy_logits_max": 0.41798990964889526, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.0, "epoch": 0.024508552463620117, "gen_logits_max": 16.292409896850586, "gen_logits_mean": -3.7537875175476074, "gen_logits_min": -15.209672927856445, "gen_logits_std": 2.336569309234619, "gen_loss": 1.685308814048767, "grad_norm": 0.7830478460506508, "learning_rate": 4.857142857142858e-06, "loss": 1.6292, "mean_copy_accuracy": 0.009343163343146443, "mean_gen_accuracy": 0.8137447088956833, "mean_token_accuracy": 0.6408524513244629, "num_tokens": 32694671.0, "sample_num_tokens": 9620.75, "step": 120, "total_num_tokens": 32733154.0, "z_loss": 0.005006009247153997 }, { "copy_logits_max": 0.3691571056842804, "copy_logits_min": -625000064.0, "copy_num_tokens": 531.375, "epoch": 0.02471279040081695, "gen_logits_max": 15.103744506835938, "gen_logits_mean": -3.155717134475708, "gen_logits_min": -15.095008850097656, "gen_logits_std": 2.437190294265747, "gen_loss": 1.8382949829101562, "grad_norm": 0.7163492636212921, "learning_rate": 4.897959183673469e-06, "loss": 1.8808, "mean_copy_accuracy": 0.006766251171939075, "mean_gen_accuracy": 0.8159428238868713, "mean_token_accuracy": 0.6039989292621613, "num_tokens": 32979008.0, "sample_num_tokens": 9266.5, "step": 121, "total_num_tokens": 33016074.0, "z_loss": 0.005060330964624882 }, { "copy_logits_max": 0.32683494687080383, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.0, "epoch": 0.024917028338013784, "gen_logits_max": 16.411266326904297, "gen_logits_mean": -2.0275139808654785, "gen_logits_min": -13.888360023498535, "gen_logits_std": 2.4056496620178223, "gen_loss": 1.711780071258545, "grad_norm": 0.7624301557313877, "learning_rate": 4.938775510204082e-06, "loss": 1.6146, "mean_copy_accuracy": 0.009168137330561876, "mean_gen_accuracy": 0.8033343404531479, "mean_token_accuracy": 0.6392836719751358, "num_tokens": 33236054.0, "sample_num_tokens": 7889.0, "step": 122, "total_num_tokens": 33267610.0, "z_loss": 0.005512566305696964 }, { "copy_logits_max": 0.3975295126438141, "copy_logits_min": -687500032.0, "copy_num_tokens": 611.75, "epoch": 0.02512126627521062, "gen_logits_max": 15.065792083740234, "gen_logits_mean": -3.3374080657958984, "gen_logits_min": -15.307952880859375, "gen_logits_std": 2.4024715423583984, "gen_loss": 2.012613296508789, "grad_norm": 0.8356660353636461, "learning_rate": 4.979591836734694e-06, "loss": 1.6943, "mean_copy_accuracy": 0.0075409801211208105, "mean_gen_accuracy": 0.815427303314209, "mean_token_accuracy": 0.6308886110782623, "num_tokens": 33496786.0, "sample_num_tokens": 9456.5, "step": 123, "total_num_tokens": 33534612.0, "z_loss": 0.004806654993444681 }, { "copy_logits_max": 0.3913469910621643, "copy_logits_min": -687500032.0, "copy_num_tokens": 528.25, "epoch": 0.025325504212407455, "gen_logits_max": 14.450545310974121, "gen_logits_mean": -3.8881614208221436, "gen_logits_min": -15.41922378540039, "gen_logits_std": 2.4278931617736816, "gen_loss": 2.0332932472229004, "grad_norm": 2.1819728667055904, "learning_rate": 5.020408163265306e-06, "loss": 1.9246, "mean_copy_accuracy": 0.006796907167881727, "mean_gen_accuracy": 0.817225843667984, "mean_token_accuracy": 0.6018355190753937, "num_tokens": 33783002.0, "sample_num_tokens": 8258.5, "step": 124, "total_num_tokens": 33816036.0, "z_loss": 0.004854405764490366 }, { "copy_logits_max": 0.39180445671081543, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.25, "epoch": 0.02552974214960429, "gen_logits_max": 15.116968154907227, "gen_logits_mean": -3.398125648498535, "gen_logits_min": -15.431843757629395, "gen_logits_std": 2.4359359741210938, "gen_loss": 2.020327091217041, "grad_norm": 1.2212114710268451, "learning_rate": 5.061224489795918e-06, "loss": 1.8833, "mean_copy_accuracy": 0.006955962395295501, "mean_gen_accuracy": 0.8118726015090942, "mean_token_accuracy": 0.5879622101783752, "num_tokens": 34047960.0, "sample_num_tokens": 8452.5, "step": 125, "total_num_tokens": 34081770.0, "z_loss": 0.005023709498345852 }, { "copy_logits_max": 0.40191158652305603, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.5, "epoch": 0.025733980086801122, "gen_logits_max": 15.38507080078125, "gen_logits_mean": -3.574467658996582, "gen_logits_min": -15.705259323120117, "gen_logits_std": 2.4703824520111084, "gen_loss": 1.8916609287261963, "grad_norm": 0.7427310121366284, "learning_rate": 5.102040816326531e-06, "loss": 1.8285, "mean_copy_accuracy": 0.008753987145610154, "mean_gen_accuracy": 0.8156073689460754, "mean_token_accuracy": 0.6101388335227966, "num_tokens": 34331191.0, "sample_num_tokens": 9495.25, "step": 126, "total_num_tokens": 34369172.0, "z_loss": 0.0051438165828585625 }, { "copy_logits_max": 0.4317501485347748, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.4375, "epoch": 0.02593821802399796, "gen_logits_max": 15.538984298706055, "gen_logits_mean": -3.5992672443389893, "gen_logits_min": -15.843780517578125, "gen_logits_std": 2.4139347076416016, "gen_loss": 1.843682050704956, "grad_norm": 0.9105311335797386, "learning_rate": 5.142857142857143e-06, "loss": 1.7756, "mean_copy_accuracy": 0.007844764622859657, "mean_gen_accuracy": 0.811530813574791, "mean_token_accuracy": 0.6144099682569504, "num_tokens": 34589302.0, "sample_num_tokens": 7893.5, "step": 127, "total_num_tokens": 34620876.0, "z_loss": 0.005144147202372551 }, { "copy_logits_max": 0.4027734398841858, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.0625, "epoch": 0.026142455961194792, "gen_logits_max": 15.297794342041016, "gen_logits_mean": -3.2598400115966797, "gen_logits_min": -15.137125015258789, "gen_logits_std": 2.4408135414123535, "gen_loss": 1.8700391054153442, "grad_norm": 0.7857212790205799, "learning_rate": 5.183673469387756e-06, "loss": 1.7462, "mean_copy_accuracy": 0.010134953656233847, "mean_gen_accuracy": 0.8191550374031067, "mean_token_accuracy": 0.6246538907289505, "num_tokens": 34856546.0, "sample_num_tokens": 9175.0, "step": 128, "total_num_tokens": 34893246.0, "z_loss": 0.005254632793366909 }, { "copy_logits_max": 0.38655322790145874, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.6875, "epoch": 0.026346693898391626, "gen_logits_max": 15.173701286315918, "gen_logits_mean": -3.290494680404663, "gen_logits_min": -14.985370635986328, "gen_logits_std": 2.3996877670288086, "gen_loss": 1.770315170288086, "grad_norm": 1.2487800652592131, "learning_rate": 5.224489795918367e-06, "loss": 1.9137, "mean_copy_accuracy": 0.008185238228179514, "mean_gen_accuracy": 0.823258101940155, "mean_token_accuracy": 0.6089904010295868, "num_tokens": 35134075.0, "sample_num_tokens": 10478.75, "step": 129, "total_num_tokens": 35175990.0, "z_loss": 0.0049111065454781055 }, { "copy_logits_max": 0.3905785381793976, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.4375, "epoch": 0.02655093183558846, "gen_logits_max": 15.181297302246094, "gen_logits_mean": -3.182528495788574, "gen_logits_min": -15.167875289916992, "gen_logits_std": 2.4557671546936035, "gen_loss": 1.924932599067688, "grad_norm": 0.7666227778366429, "learning_rate": 5.265306122448979e-06, "loss": 1.6728, "mean_copy_accuracy": 0.011111899744719267, "mean_gen_accuracy": 0.8150617927312851, "mean_token_accuracy": 0.6380162388086319, "num_tokens": 35407027.0, "sample_num_tokens": 9334.25, "step": 130, "total_num_tokens": 35444364.0, "z_loss": 0.0049422308802604675 }, { "copy_logits_max": 0.34587204456329346, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.5, "epoch": 0.026755169772785296, "gen_logits_max": 17.233884811401367, "gen_logits_mean": -2.013291358947754, "gen_logits_min": -13.996427536010742, "gen_logits_std": 2.4356510639190674, "gen_loss": 1.3139878511428833, "grad_norm": 0.82146410642728, "learning_rate": 5.306122448979592e-06, "loss": 1.6885, "mean_copy_accuracy": 0.008158816723152995, "mean_gen_accuracy": 0.8204198628664017, "mean_token_accuracy": 0.6367156058549881, "num_tokens": 35702449.0, "sample_num_tokens": 8140.25, "step": 131, "total_num_tokens": 35735010.0, "z_loss": 0.00563031155616045 }, { "copy_logits_max": 0.38572171330451965, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.4375, "epoch": 0.02695940770998213, "gen_logits_max": 15.73817253112793, "gen_logits_mean": -2.816476345062256, "gen_logits_min": -14.486745834350586, "gen_logits_std": 2.368276596069336, "gen_loss": 1.5787410736083984, "grad_norm": 0.8585588030903334, "learning_rate": 5.3469387755102045e-06, "loss": 1.9068, "mean_copy_accuracy": 0.006223707576282322, "mean_gen_accuracy": 0.8165148049592972, "mean_token_accuracy": 0.6025437861680984, "num_tokens": 35977667.0, "sample_num_tokens": 8547.75, "step": 132, "total_num_tokens": 36011858.0, "z_loss": 0.005262074060738087 }, { "copy_logits_max": 0.4288927912712097, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.0625, "epoch": 0.027163645647178963, "gen_logits_max": 16.11811065673828, "gen_logits_mean": -3.142336845397949, "gen_logits_min": -15.070537567138672, "gen_logits_std": 2.4146039485931396, "gen_loss": 1.763510823249817, "grad_norm": 0.9721790028923933, "learning_rate": 5.387755102040816e-06, "loss": 1.7482, "mean_copy_accuracy": 0.007447061478160322, "mean_gen_accuracy": 0.8202284723520279, "mean_token_accuracy": 0.6280601471662521, "num_tokens": 36228864.0, "sample_num_tokens": 7779.0, "step": 133, "total_num_tokens": 36259980.0, "z_loss": 0.005410299636423588 }, { "copy_logits_max": 0.35773682594299316, "copy_logits_min": -750000000.0, "copy_num_tokens": 234.5625, "epoch": 0.027367883584375797, "gen_logits_max": 17.74761199951172, "gen_logits_mean": -1.8110665082931519, "gen_logits_min": -13.497949600219727, "gen_logits_std": 2.4111502170562744, "gen_loss": 1.3240219354629517, "grad_norm": 0.8188321327080538, "learning_rate": 5.428571428571429e-06, "loss": 1.548, "mean_copy_accuracy": 0.011864274623803794, "mean_gen_accuracy": 0.811237171292305, "mean_token_accuracy": 0.6540581285953522, "num_tokens": 36473531.0, "sample_num_tokens": 6630.25, "step": 134, "total_num_tokens": 36500052.0, "z_loss": 0.006038378924131393 }, { "copy_logits_max": 0.4186250567436218, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.0625, "epoch": 0.027572121521572634, "gen_logits_max": 15.609956741333008, "gen_logits_mean": -2.8182101249694824, "gen_logits_min": -14.738057136535645, "gen_logits_std": 2.431122303009033, "gen_loss": 1.908292293548584, "grad_norm": 0.9384750923932382, "learning_rate": 5.469387755102041e-06, "loss": 1.7535, "mean_copy_accuracy": 0.007498654187656939, "mean_gen_accuracy": 0.8148326575756073, "mean_token_accuracy": 0.6242430657148361, "num_tokens": 36724049.0, "sample_num_tokens": 8180.75, "step": 135, "total_num_tokens": 36756772.0, "z_loss": 0.005116046406328678 }, { "copy_logits_max": 0.4075840413570404, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.5, "epoch": 0.027776359458769467, "gen_logits_max": 16.92349624633789, "gen_logits_mean": -2.3482604026794434, "gen_logits_min": -14.187660217285156, "gen_logits_std": 2.4022862911224365, "gen_loss": 1.5110405683517456, "grad_norm": 0.9458489777560154, "learning_rate": 5.510204081632653e-06, "loss": 1.7142, "mean_copy_accuracy": 0.009520614170469344, "mean_gen_accuracy": 0.8181194067001343, "mean_token_accuracy": 0.628944918513298, "num_tokens": 36986329.0, "sample_num_tokens": 6795.75, "step": 136, "total_num_tokens": 37013512.0, "z_loss": 0.00562078133225441 }, { "copy_logits_max": 0.4344250559806824, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.4375, "epoch": 0.0279805973959663, "gen_logits_max": 15.530709266662598, "gen_logits_mean": -3.4460554122924805, "gen_logits_min": -15.184711456298828, "gen_logits_std": 2.3765294551849365, "gen_loss": 1.7714085578918457, "grad_norm": 0.8030951827599735, "learning_rate": 5.551020408163265e-06, "loss": 1.77, "mean_copy_accuracy": 0.00979360705241561, "mean_gen_accuracy": 0.8200900107622147, "mean_token_accuracy": 0.6094701439142227, "num_tokens": 37260526.0, "sample_num_tokens": 8993.0, "step": 137, "total_num_tokens": 37296498.0, "z_loss": 0.005115999840199947 }, { "copy_logits_max": 0.444309800863266, "copy_logits_min": -562499968.0, "copy_num_tokens": 489.375, "epoch": 0.028184835333163134, "gen_logits_max": 14.645302772521973, "gen_logits_mean": -3.8456485271453857, "gen_logits_min": -15.962122917175293, "gen_logits_std": 2.4184212684631348, "gen_loss": 1.997194528579712, "grad_norm": 0.8730192118813542, "learning_rate": 5.5918367346938776e-06, "loss": 1.6721, "mean_copy_accuracy": 0.012467152206227183, "mean_gen_accuracy": 0.8176655918359756, "mean_token_accuracy": 0.6333799064159393, "num_tokens": 37539047.0, "sample_num_tokens": 8404.75, "step": 138, "total_num_tokens": 37572666.0, "z_loss": 0.004832136444747448 }, { "copy_logits_max": 0.41620922088623047, "copy_logits_min": -687499968.0, "copy_num_tokens": 384.6875, "epoch": 0.028389073270359968, "gen_logits_max": 15.331548690795898, "gen_logits_mean": -3.0513525009155273, "gen_logits_min": -15.069830894470215, "gen_logits_std": 2.445966958999634, "gen_loss": 1.7336207628250122, "grad_norm": 0.9650549741825107, "learning_rate": 5.63265306122449e-06, "loss": 1.8073, "mean_copy_accuracy": 0.009587822249159217, "mean_gen_accuracy": 0.8132760971784592, "mean_token_accuracy": 0.6056623160839081, "num_tokens": 37813444.0, "sample_num_tokens": 7873.5, "step": 139, "total_num_tokens": 37844938.0, "z_loss": 0.005385005380958319 }, { "copy_logits_max": 0.46556586027145386, "copy_logits_min": -687499968.0, "copy_num_tokens": 573.25, "epoch": 0.028593311207556805, "gen_logits_max": 14.568370819091797, "gen_logits_mean": -3.9522690773010254, "gen_logits_min": -16.16497802734375, "gen_logits_std": 2.448564052581787, "gen_loss": 2.0675151348114014, "grad_norm": 2.113508891927413, "learning_rate": 5.673469387755103e-06, "loss": 1.8365, "mean_copy_accuracy": 0.009574544266797602, "mean_gen_accuracy": 0.8129200786352158, "mean_token_accuracy": 0.6101237088441849, "num_tokens": 38085178.0, "sample_num_tokens": 9230.0, "step": 140, "total_num_tokens": 38122098.0, "z_loss": 0.004980975762009621 }, { "copy_logits_max": 0.47290217876434326, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.875, "epoch": 0.02879754914475364, "gen_logits_max": 14.556257247924805, "gen_logits_mean": -3.810757637023926, "gen_logits_min": -15.952705383300781, "gen_logits_std": 2.425363063812256, "gen_loss": 1.8288015127182007, "grad_norm": 1.0086852990330746, "learning_rate": 5.7142857142857145e-06, "loss": 1.7889, "mean_copy_accuracy": 0.007849538349546492, "mean_gen_accuracy": 0.8223817646503448, "mean_token_accuracy": 0.6162081658840179, "num_tokens": 38337517.0, "sample_num_tokens": 8504.25, "step": 141, "total_num_tokens": 38371534.0, "z_loss": 0.0049438984133303165 }, { "copy_logits_max": 0.4279232621192932, "copy_logits_min": -687500032.0, "copy_num_tokens": 456.375, "epoch": 0.029001787081950472, "gen_logits_max": 15.791919708251953, "gen_logits_mean": -2.5722055435180664, "gen_logits_min": -14.398528099060059, "gen_logits_std": 2.4078450202941895, "gen_loss": 1.6559290885925293, "grad_norm": 1.6061687930674995, "learning_rate": 5.755102040816326e-06, "loss": 1.6502, "mean_copy_accuracy": 0.011058779200538993, "mean_gen_accuracy": 0.8142252117395401, "mean_token_accuracy": 0.6353695541620255, "num_tokens": 38596068.0, "sample_num_tokens": 8878.0, "step": 142, "total_num_tokens": 38631580.0, "z_loss": 0.0054604592733085155 }, { "copy_logits_max": 0.48683202266693115, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.0625, "epoch": 0.029206025019147305, "gen_logits_max": 15.640838623046875, "gen_logits_mean": -3.4222283363342285, "gen_logits_min": -15.322596549987793, "gen_logits_std": 2.4154272079467773, "gen_loss": 1.9573453664779663, "grad_norm": 1.111682061431762, "learning_rate": 5.795918367346939e-06, "loss": 1.8578, "mean_copy_accuracy": 0.010189679800532758, "mean_gen_accuracy": 0.8151086270809174, "mean_token_accuracy": 0.6021738350391388, "num_tokens": 38868516.0, "sample_num_tokens": 8207.0, "step": 143, "total_num_tokens": 38901344.0, "z_loss": 0.005104248411953449 }, { "copy_logits_max": 0.511967658996582, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.4375, "epoch": 0.029410262956344142, "gen_logits_max": 14.039267539978027, "gen_logits_mean": -3.965210437774658, "gen_logits_min": -15.924103736877441, "gen_logits_std": 2.4283063411712646, "gen_loss": 1.9535706043243408, "grad_norm": 1.0503664651291433, "learning_rate": 5.8367346938775515e-06, "loss": 1.9351, "mean_copy_accuracy": 0.008642725297249854, "mean_gen_accuracy": 0.8197916895151138, "mean_token_accuracy": 0.602021649479866, "num_tokens": 39137017.0, "sample_num_tokens": 9765.25, "step": 144, "total_num_tokens": 39176078.0, "z_loss": 0.004583224654197693 }, { "copy_logits_max": 0.484887033700943, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.5625, "epoch": 0.029614500893540976, "gen_logits_max": 15.635761260986328, "gen_logits_mean": -3.424913167953491, "gen_logits_min": -15.45236587524414, "gen_logits_std": 2.445758819580078, "gen_loss": 1.7260518074035645, "grad_norm": 1.1069432771480272, "learning_rate": 5.877551020408163e-06, "loss": 1.7523, "mean_copy_accuracy": 0.012526590144261718, "mean_gen_accuracy": 0.8210150003433228, "mean_token_accuracy": 0.6178490221500397, "num_tokens": 39413580.0, "sample_num_tokens": 7220.5, "step": 145, "total_num_tokens": 39442462.0, "z_loss": 0.005271631292998791 }, { "copy_logits_max": 0.537408709526062, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.3125, "epoch": 0.02981873883073781, "gen_logits_max": 15.612749099731445, "gen_logits_mean": -3.6701912879943848, "gen_logits_min": -15.382369995117188, "gen_logits_std": 2.394308090209961, "gen_loss": 1.7697635889053345, "grad_norm": 0.9313584084651029, "learning_rate": 5.918367346938776e-06, "loss": 1.7162, "mean_copy_accuracy": 0.009992871899157763, "mean_gen_accuracy": 0.8235260993242264, "mean_token_accuracy": 0.6334382146596909, "num_tokens": 39659857.0, "sample_num_tokens": 8240.75, "step": 146, "total_num_tokens": 39692820.0, "z_loss": 0.005138816777616739 }, { "copy_logits_max": 0.5363541841506958, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.9375, "epoch": 0.030022976767934643, "gen_logits_max": 14.587538719177246, "gen_logits_mean": -3.890058755874634, "gen_logits_min": -15.899435043334961, "gen_logits_std": 2.4461350440979004, "gen_loss": 1.9211604595184326, "grad_norm": 1.7107382171370509, "learning_rate": 5.959183673469388e-06, "loss": 1.6653, "mean_copy_accuracy": 0.01321604778058827, "mean_gen_accuracy": 0.8221376091241837, "mean_token_accuracy": 0.6365844905376434, "num_tokens": 39931569.0, "sample_num_tokens": 9163.25, "step": 147, "total_num_tokens": 39968222.0, "z_loss": 0.00487527996301651 }, { "copy_logits_max": 0.5488533973693848, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.6875, "epoch": 0.03022721470513148, "gen_logits_max": 15.912069320678711, "gen_logits_mean": -2.6978235244750977, "gen_logits_min": -14.4906005859375, "gen_logits_std": 2.4118704795837402, "gen_loss": 1.6557717323303223, "grad_norm": 1.4202551620671324, "learning_rate": 6e-06, "loss": 1.6583, "mean_copy_accuracy": 0.013040502090007067, "mean_gen_accuracy": 0.8209642469882965, "mean_token_accuracy": 0.6365784257650375, "num_tokens": 40191733.0, "sample_num_tokens": 8477.75, "step": 148, "total_num_tokens": 40225644.0, "z_loss": 0.005301406607031822 }, { "copy_logits_max": 0.5459011793136597, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.625, "epoch": 0.030431452642328313, "gen_logits_max": 14.206854820251465, "gen_logits_mean": -3.622994899749756, "gen_logits_min": -15.172473907470703, "gen_logits_std": 2.4232821464538574, "gen_loss": 1.9304113388061523, "grad_norm": 1.2760385142435253, "learning_rate": 6.040816326530612e-06, "loss": 1.8125, "mean_copy_accuracy": 0.01181779894977808, "mean_gen_accuracy": 0.8206182420253754, "mean_token_accuracy": 0.617417648434639, "num_tokens": 40450294.0, "sample_num_tokens": 8566.0, "step": 149, "total_num_tokens": 40484558.0, "z_loss": 0.004938791505992413 }, { "copy_logits_max": 0.6421394348144531, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.9375, "epoch": 0.030635690579525147, "gen_logits_max": 15.847124099731445, "gen_logits_mean": -3.280658006668091, "gen_logits_min": -15.357563018798828, "gen_logits_std": 2.408278465270996, "gen_loss": 1.6572238206863403, "grad_norm": 1.295027285002894, "learning_rate": 6.0816326530612245e-06, "loss": 1.5843, "mean_copy_accuracy": 0.015945886494591832, "mean_gen_accuracy": 0.8283590972423553, "mean_token_accuracy": 0.6479132920503616, "num_tokens": 40705038.0, "sample_num_tokens": 7654.5, "step": 150, "total_num_tokens": 40735656.0, "z_loss": 0.005284770391881466 }, { "copy_logits_max": 0.6468299031257629, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.25, "epoch": 0.03083992851672198, "gen_logits_max": 15.213061332702637, "gen_logits_mean": -2.9379303455352783, "gen_logits_min": -15.353582382202148, "gen_logits_std": 2.3890647888183594, "gen_loss": 1.5941181182861328, "grad_norm": 1.3843199339087735, "learning_rate": 6.122448979591837e-06, "loss": 1.6577, "mean_copy_accuracy": 0.016351777827367187, "mean_gen_accuracy": 0.8252602964639664, "mean_token_accuracy": 0.6377371996641159, "num_tokens": 40979560.0, "sample_num_tokens": 8258.5, "step": 151, "total_num_tokens": 41012594.0, "z_loss": 0.005214243661612272 }, { "copy_logits_max": 0.6005268096923828, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.125, "epoch": 0.031044166453918814, "gen_logits_max": 15.171477317810059, "gen_logits_mean": -2.837195873260498, "gen_logits_min": -14.928417205810547, "gen_logits_std": 2.4706737995147705, "gen_loss": 1.7774945497512817, "grad_norm": 2.014316639199494, "learning_rate": 6.16326530612245e-06, "loss": 1.7682, "mean_copy_accuracy": 0.014411451760679483, "mean_gen_accuracy": 0.8153952807188034, "mean_token_accuracy": 0.6182972490787506, "num_tokens": 41260990.0, "sample_num_tokens": 9163.0, "step": 152, "total_num_tokens": 41297642.0, "z_loss": 0.005210201721638441 }, { "copy_logits_max": 0.6528217792510986, "copy_logits_min": -687500032.0, "copy_num_tokens": 192.8125, "epoch": 0.03124840439111565, "gen_logits_max": 18.795406341552734, "gen_logits_mean": -1.4085054397583008, "gen_logits_min": -13.519855499267578, "gen_logits_std": 2.4241323471069336, "gen_loss": 1.1388550996780396, "grad_norm": 1.5429582969151072, "learning_rate": 6.2040816326530614e-06, "loss": 1.6162, "mean_copy_accuracy": 0.017964153084903955, "mean_gen_accuracy": 0.8232436180114746, "mean_token_accuracy": 0.6357969492673874, "num_tokens": 41534842.0, "sample_num_tokens": 6419.0, "step": 153, "total_num_tokens": 41560518.0, "z_loss": 0.006257589906454086 }, { "copy_logits_max": 0.7510867118835449, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.4375, "epoch": 0.031452642328312484, "gen_logits_max": 14.971303939819336, "gen_logits_mean": -3.067669630050659, "gen_logits_min": -15.399520874023438, "gen_logits_std": 2.4648258686065674, "gen_loss": 1.7852208614349365, "grad_norm": 2.71322911694978, "learning_rate": 6.244897959183673e-06, "loss": 1.5261, "mean_copy_accuracy": 0.02150239492766559, "mean_gen_accuracy": 0.8231005072593689, "mean_token_accuracy": 0.6458623856306076, "num_tokens": 41781880.0, "sample_num_tokens": 8891.0, "step": 154, "total_num_tokens": 41817444.0, "z_loss": 0.005054047331213951 }, { "copy_logits_max": 0.8349194526672363, "copy_logits_min": -687500032.0, "copy_num_tokens": 667.9375, "epoch": 0.03165688026550932, "gen_logits_max": 14.55735969543457, "gen_logits_mean": -3.5387120246887207, "gen_logits_min": -15.663320541381836, "gen_logits_std": 2.4269368648529053, "gen_loss": 1.9806358814239502, "grad_norm": 3.655617245364351, "learning_rate": 6.285714285714286e-06, "loss": 1.7944, "mean_copy_accuracy": 0.014802045654505491, "mean_gen_accuracy": 0.8178888708353043, "mean_token_accuracy": 0.6061045825481415, "num_tokens": 42053140.0, "sample_num_tokens": 9712.5, "step": 155, "total_num_tokens": 42091990.0, "z_loss": 0.004856688901782036 }, { "copy_logits_max": 1.1822670698165894, "copy_logits_min": -687500032.0, "copy_num_tokens": 754.0, "epoch": 0.03186111820270615, "gen_logits_max": 14.527410507202148, "gen_logits_mean": -4.5390520095825195, "gen_logits_min": -16.904354095458984, "gen_logits_std": 2.4645557403564453, "gen_loss": 2.039830207824707, "grad_norm": 3.393090356436817, "learning_rate": 6.326530612244898e-06, "loss": 1.6902, "mean_copy_accuracy": 0.018807226791977882, "mean_gen_accuracy": 0.8223070502281189, "mean_token_accuracy": 0.6229227483272552, "num_tokens": 42335731.0, "sample_num_tokens": 10623.25, "step": 156, "total_num_tokens": 42378224.0, "z_loss": 0.004800046794116497 }, { "copy_logits_max": 0.7944447994232178, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.1875, "epoch": 0.03206535613990299, "gen_logits_max": 14.435151100158691, "gen_logits_mean": -2.702327251434326, "gen_logits_min": -14.627052307128906, "gen_logits_std": 2.4632060527801514, "gen_loss": 1.7529082298278809, "grad_norm": 3.3975726564104485, "learning_rate": 6.36734693877551e-06, "loss": 1.7872, "mean_copy_accuracy": 0.01614256273023784, "mean_gen_accuracy": 0.8188546001911163, "mean_token_accuracy": 0.6045759469270706, "num_tokens": 42609260.0, "sample_num_tokens": 8866.5, "step": 157, "total_num_tokens": 42644726.0, "z_loss": 0.005109229125082493 }, { "copy_logits_max": 1.0552146434783936, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.8125, "epoch": 0.03226959407709982, "gen_logits_max": 16.26897430419922, "gen_logits_mean": -2.0267210006713867, "gen_logits_min": -14.177093505859375, "gen_logits_std": 2.4050135612487793, "gen_loss": 1.5902302265167236, "grad_norm": 5.682018952531134, "learning_rate": 6.408163265306123e-06, "loss": 1.6732, "mean_copy_accuracy": 0.014584550866857171, "mean_gen_accuracy": 0.8271148055791855, "mean_token_accuracy": 0.6329462975263596, "num_tokens": 42884577.0, "sample_num_tokens": 7302.25, "step": 158, "total_num_tokens": 42913786.0, "z_loss": 0.005583674646914005 }, { "copy_logits_max": 0.9574659466743469, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.125, "epoch": 0.032473832014296655, "gen_logits_max": 16.923904418945312, "gen_logits_mean": -2.1534523963928223, "gen_logits_min": -14.062832832336426, "gen_logits_std": 2.401902437210083, "gen_loss": 1.3325061798095703, "grad_norm": 2.465529490348478, "learning_rate": 6.448979591836735e-06, "loss": 1.6744, "mean_copy_accuracy": 0.019364646868780255, "mean_gen_accuracy": 0.8201893419027328, "mean_token_accuracy": 0.606128454208374, "num_tokens": 43154642.0, "sample_num_tokens": 7040.5, "step": 159, "total_num_tokens": 43182804.0, "z_loss": 0.0059595173224806786 }, { "copy_logits_max": 1.1286747455596924, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.1875, "epoch": 0.03267806995149349, "gen_logits_max": 16.29709815979004, "gen_logits_mean": -2.9134490489959717, "gen_logits_min": -15.104169845581055, "gen_logits_std": 2.4260411262512207, "gen_loss": 1.4282360076904297, "grad_norm": 3.959214323054176, "learning_rate": 6.489795918367347e-06, "loss": 1.5226, "mean_copy_accuracy": 0.02015062514692545, "mean_gen_accuracy": 0.8213674575090408, "mean_token_accuracy": 0.642204225063324, "num_tokens": 43398202.0, "sample_num_tokens": 7232.0, "step": 160, "total_num_tokens": 43427130.0, "z_loss": 0.005544677376747131 }, { "copy_logits_max": 1.305729866027832, "copy_logits_min": -687500032.0, "copy_num_tokens": 642.4375, "epoch": 0.03288230788869032, "gen_logits_max": 14.676899909973145, "gen_logits_mean": -3.0294482707977295, "gen_logits_min": -16.022933959960938, "gen_logits_std": 2.4994072914123535, "gen_loss": 1.8656049966812134, "grad_norm": 4.372117612698719, "learning_rate": 6.530612244897959e-06, "loss": 1.8126, "mean_copy_accuracy": 0.014821541495621204, "mean_gen_accuracy": 0.8148042410612106, "mean_token_accuracy": 0.5922726988792419, "num_tokens": 43673682.0, "sample_num_tokens": 9475.5, "step": 161, "total_num_tokens": 43711584.0, "z_loss": 0.004902997054159641 }, { "copy_logits_max": 1.1664385795593262, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.3125, "epoch": 0.03308654582588716, "gen_logits_max": 16.4538516998291, "gen_logits_mean": -2.609647274017334, "gen_logits_min": -14.91530990600586, "gen_logits_std": 2.4605188369750977, "gen_loss": 1.345777988433838, "grad_norm": 2.2337515316759746, "learning_rate": 6.5714285714285714e-06, "loss": 1.6655, "mean_copy_accuracy": 0.020766612142324448, "mean_gen_accuracy": 0.8239531219005585, "mean_token_accuracy": 0.6265722662210464, "num_tokens": 43956293.0, "sample_num_tokens": 7929.25, "step": 162, "total_num_tokens": 43988010.0, "z_loss": 0.005603511352092028 }, { "copy_logits_max": 1.3400859832763672, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.5, "epoch": 0.033290783763083996, "gen_logits_max": 15.734847068786621, "gen_logits_mean": -2.704705238342285, "gen_logits_min": -15.405709266662598, "gen_logits_std": 2.4587674140930176, "gen_loss": 1.658475399017334, "grad_norm": 4.458124609208876, "learning_rate": 6.612244897959184e-06, "loss": 1.6466, "mean_copy_accuracy": 0.02089492417871952, "mean_gen_accuracy": 0.8204730898141861, "mean_token_accuracy": 0.6186976879835129, "num_tokens": 44223490.0, "sample_num_tokens": 7448.0, "step": 163, "total_num_tokens": 44253282.0, "z_loss": 0.005614594556391239 }, { "copy_logits_max": 1.4323352575302124, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.875, "epoch": 0.033495021700280826, "gen_logits_max": 15.081459999084473, "gen_logits_mean": -3.1909847259521484, "gen_logits_min": -15.814695358276367, "gen_logits_std": 2.5106914043426514, "gen_loss": 1.6491141319274902, "grad_norm": 1.8584100528198042, "learning_rate": 6.653061224489797e-06, "loss": 1.6341, "mean_copy_accuracy": 0.026382834184914827, "mean_gen_accuracy": 0.8321038633584976, "mean_token_accuracy": 0.6214771419763565, "num_tokens": 44496467.0, "sample_num_tokens": 7883.75, "step": 164, "total_num_tokens": 44528002.0, "z_loss": 0.005269450135529041 }, { "copy_logits_max": 1.3955507278442383, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.75, "epoch": 0.03369925963747766, "gen_logits_max": 15.325044631958008, "gen_logits_mean": -2.559323787689209, "gen_logits_min": -14.738171577453613, "gen_logits_std": 2.4489593505859375, "gen_loss": 1.7814648151397705, "grad_norm": 3.8629217043300397, "learning_rate": 6.693877551020408e-06, "loss": 1.6396, "mean_copy_accuracy": 0.027683197520673275, "mean_gen_accuracy": 0.8134190291166306, "mean_token_accuracy": 0.6175630390644073, "num_tokens": 44764178.0, "sample_num_tokens": 8692.5, "step": 165, "total_num_tokens": 44798948.0, "z_loss": 0.005236773286014795 }, { "copy_logits_max": 1.546446681022644, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.6875, "epoch": 0.03390349757467449, "gen_logits_max": 15.005916595458984, "gen_logits_mean": -3.083531379699707, "gen_logits_min": -15.672900199890137, "gen_logits_std": 2.5115647315979004, "gen_loss": 1.530595064163208, "grad_norm": 2.1167314350297954, "learning_rate": 6.734693877551021e-06, "loss": 1.6589, "mean_copy_accuracy": 0.028570000547915697, "mean_gen_accuracy": 0.8219717144966125, "mean_token_accuracy": 0.6220346689224243, "num_tokens": 45030624.0, "sample_num_tokens": 9368.5, "step": 166, "total_num_tokens": 45068098.0, "z_loss": 0.005348206963390112 }, { "copy_logits_max": 1.5111982822418213, "copy_logits_min": -749999936.0, "copy_num_tokens": 473.0, "epoch": 0.03410773551187133, "gen_logits_max": 15.259551048278809, "gen_logits_mean": -2.7394304275512695, "gen_logits_min": -15.232861518859863, "gen_logits_std": 2.4593629837036133, "gen_loss": 1.590254783630371, "grad_norm": 2.677614175931616, "learning_rate": 6.775510204081633e-06, "loss": 1.5049, "mean_copy_accuracy": 0.03095625340938568, "mean_gen_accuracy": 0.8259060978889465, "mean_token_accuracy": 0.6410855650901794, "num_tokens": 45306498.0, "sample_num_tokens": 8308.0, "step": 167, "total_num_tokens": 45339730.0, "z_loss": 0.005389784462749958 }, { "copy_logits_max": 1.95760178565979, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.25, "epoch": 0.03431197344906817, "gen_logits_max": 14.813941955566406, "gen_logits_mean": -3.7897510528564453, "gen_logits_min": -16.36227035522461, "gen_logits_std": 2.4795660972595215, "gen_loss": 1.6756633520126343, "grad_norm": 2.8972754789557817, "learning_rate": 6.816326530612245e-06, "loss": 1.6437, "mean_copy_accuracy": 0.028147021308541298, "mean_gen_accuracy": 0.8330717086791992, "mean_token_accuracy": 0.6223598420619965, "num_tokens": 45585393.0, "sample_num_tokens": 9323.25, "step": 168, "total_num_tokens": 45622686.0, "z_loss": 0.004978995770215988 }, { "copy_logits_max": 1.848151445388794, "copy_logits_min": -750000000.0, "copy_num_tokens": 682.375, "epoch": 0.034516211386265, "gen_logits_max": 13.7852783203125, "gen_logits_mean": -3.8711607456207275, "gen_logits_min": -16.272647857666016, "gen_logits_std": 2.4717278480529785, "gen_loss": 1.726351022720337, "grad_norm": 1.9792636719989785, "learning_rate": 6.857142857142857e-06, "loss": 1.6956, "mean_copy_accuracy": 0.026340255979448557, "mean_gen_accuracy": 0.8277959227561951, "mean_token_accuracy": 0.6063498556613922, "num_tokens": 45884987.0, "sample_num_tokens": 10416.25, "step": 169, "total_num_tokens": 45926652.0, "z_loss": 0.0047938087955117226 }, { "copy_logits_max": 2.0554847717285156, "copy_logits_min": -687500032.0, "copy_num_tokens": 635.875, "epoch": 0.034720449323461834, "gen_logits_max": 14.799083709716797, "gen_logits_mean": -3.5101661682128906, "gen_logits_min": -15.906919479370117, "gen_logits_std": 2.4886622428894043, "gen_loss": 1.8530659675598145, "grad_norm": 2.6259343701237676, "learning_rate": 6.89795918367347e-06, "loss": 1.5656, "mean_copy_accuracy": 0.029574512504041195, "mean_gen_accuracy": 0.8249736428260803, "mean_token_accuracy": 0.6277126967906952, "num_tokens": 46151681.0, "sample_num_tokens": 9466.75, "step": 170, "total_num_tokens": 46189548.0, "z_loss": 0.005059043411165476 }, { "copy_logits_max": 2.4795069694519043, "copy_logits_min": -750000064.0, "copy_num_tokens": 692.4375, "epoch": 0.034924687260658664, "gen_logits_max": 13.834188461303711, "gen_logits_mean": -4.313276767730713, "gen_logits_min": -17.099397659301758, "gen_logits_std": 2.5105671882629395, "gen_loss": 1.854468584060669, "grad_norm": 2.2773104449724038, "learning_rate": 6.938775510204082e-06, "loss": 1.6454, "mean_copy_accuracy": 0.031962763983756304, "mean_gen_accuracy": 0.8211050033569336, "mean_token_accuracy": 0.610436737537384, "num_tokens": 46420251.0, "sample_num_tokens": 9542.25, "step": 171, "total_num_tokens": 46458420.0, "z_loss": 0.004715287126600742 }, { "copy_logits_max": 2.3787412643432617, "copy_logits_min": -687500032.0, "copy_num_tokens": 398.8125, "epoch": 0.0351289251978555, "gen_logits_max": 16.543006896972656, "gen_logits_mean": -2.9541938304901123, "gen_logits_min": -15.39212417602539, "gen_logits_std": 2.4578418731689453, "gen_loss": 1.3541028499603271, "grad_norm": 2.5352781873623047, "learning_rate": 6.979591836734694e-06, "loss": 1.4904, "mean_copy_accuracy": 0.03555974364280701, "mean_gen_accuracy": 0.8254533857107162, "mean_token_accuracy": 0.6324843317270279, "num_tokens": 46694892.0, "sample_num_tokens": 8574.5, "step": 172, "total_num_tokens": 46729190.0, "z_loss": 0.005662835668772459 }, { "copy_logits_max": 2.3014965057373047, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.875, "epoch": 0.03533316313505234, "gen_logits_max": 16.25592041015625, "gen_logits_mean": -2.6334009170532227, "gen_logits_min": -15.054811477661133, "gen_logits_std": 2.462979793548584, "gen_loss": 1.438162088394165, "grad_norm": 2.9892864598510007, "learning_rate": 7.020408163265306e-06, "loss": 1.6043, "mean_copy_accuracy": 0.0345284603536129, "mean_gen_accuracy": 0.8150897920131683, "mean_token_accuracy": 0.6109757125377655, "num_tokens": 46981460.0, "sample_num_tokens": 8049.0, "step": 173, "total_num_tokens": 47013656.0, "z_loss": 0.005471327807754278 }, { "copy_logits_max": 3.094301223754883, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.8125, "epoch": 0.03553740107224917, "gen_logits_max": 15.726346969604492, "gen_logits_mean": -3.8969478607177734, "gen_logits_min": -16.535625457763672, "gen_logits_std": 2.5051190853118896, "gen_loss": 1.7054026126861572, "grad_norm": 3.0237422430111707, "learning_rate": 7.061224489795918e-06, "loss": 1.4721, "mean_copy_accuracy": 0.04415766196325421, "mean_gen_accuracy": 0.8196456581354141, "mean_token_accuracy": 0.6357328146696091, "num_tokens": 47246052.0, "sample_num_tokens": 8641.5, "step": 174, "total_num_tokens": 47280618.0, "z_loss": 0.005111840087920427 }, { "copy_logits_max": 3.3773155212402344, "copy_logits_min": -687500032.0, "copy_num_tokens": 700.75, "epoch": 0.035741639009446005, "gen_logits_max": 13.483785629272461, "gen_logits_mean": -4.192911148071289, "gen_logits_min": -16.76261329650879, "gen_logits_std": 2.510896682739258, "gen_loss": 1.9262466430664062, "grad_norm": 3.0348391635875487, "learning_rate": 7.102040816326531e-06, "loss": 1.5722, "mean_copy_accuracy": 0.038767216727137566, "mean_gen_accuracy": 0.8255505859851837, "mean_token_accuracy": 0.6205999702215195, "num_tokens": 47536002.0, "sample_num_tokens": 9585.0, "step": 175, "total_num_tokens": 47574342.0, "z_loss": 0.004784976597875357 }, { "copy_logits_max": 2.9892630577087402, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.1875, "epoch": 0.03594587694664284, "gen_logits_max": 15.466449737548828, "gen_logits_mean": -2.7933642864227295, "gen_logits_min": -15.52298641204834, "gen_logits_std": 2.476165771484375, "gen_loss": 1.5457179546356201, "grad_norm": 2.0855069511590734, "learning_rate": 7.142857142857143e-06, "loss": 1.5502, "mean_copy_accuracy": 0.03627229109406471, "mean_gen_accuracy": 0.8184649348258972, "mean_token_accuracy": 0.6173628717660904, "num_tokens": 47809725.0, "sample_num_tokens": 10396.25, "step": 176, "total_num_tokens": 47851310.0, "z_loss": 0.005383916664868593 }, { "copy_logits_max": 3.5942602157592773, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.5, "epoch": 0.03615011488383967, "gen_logits_max": 16.421993255615234, "gen_logits_mean": -2.806680917739868, "gen_logits_min": -15.19815444946289, "gen_logits_std": 2.4662787914276123, "gen_loss": 1.473526954650879, "grad_norm": 1.692080101954113, "learning_rate": 7.183673469387755e-06, "loss": 1.414, "mean_copy_accuracy": 0.04738669563084841, "mean_gen_accuracy": 0.8303102850914001, "mean_token_accuracy": 0.6486553400754929, "num_tokens": 48088496.0, "sample_num_tokens": 8833.0, "step": 177, "total_num_tokens": 48123828.0, "z_loss": 0.005576561205089092 }, { "copy_logits_max": 3.5251359939575195, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.4375, "epoch": 0.03635435282103651, "gen_logits_max": 15.93404483795166, "gen_logits_mean": -2.27492094039917, "gen_logits_min": -14.841827392578125, "gen_logits_std": 2.516040325164795, "gen_loss": 1.6218652725219727, "grad_norm": 3.4335322374654895, "learning_rate": 7.224489795918368e-06, "loss": 1.5288, "mean_copy_accuracy": 0.04010504111647606, "mean_gen_accuracy": 0.8142769634723663, "mean_token_accuracy": 0.6197704374790192, "num_tokens": 48349940.0, "sample_num_tokens": 8231.5, "step": 178, "total_num_tokens": 48382866.0, "z_loss": 0.0055260322988033295 }, { "copy_logits_max": 4.0446014404296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.4375, "epoch": 0.03655859075823334, "gen_logits_max": 15.272046089172363, "gen_logits_mean": -3.3421061038970947, "gen_logits_min": -15.979751586914062, "gen_logits_std": 2.515641212463379, "gen_loss": 1.7756516933441162, "grad_norm": 3.232762776514208, "learning_rate": 7.26530612244898e-06, "loss": 1.3661, "mean_copy_accuracy": 0.04686634708195925, "mean_gen_accuracy": 0.8235195279121399, "mean_token_accuracy": 0.650551438331604, "num_tokens": 48602007.0, "sample_num_tokens": 8869.75, "step": 179, "total_num_tokens": 48637486.0, "z_loss": 0.005174950696527958 }, { "copy_logits_max": 3.3156886100769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.875, "epoch": 0.036762828695430176, "gen_logits_max": 16.70644760131836, "gen_logits_mean": -2.020192861557007, "gen_logits_min": -14.206207275390625, "gen_logits_std": 2.4870972633361816, "gen_loss": 1.2322659492492676, "grad_norm": 2.8190125495153495, "learning_rate": 7.306122448979591e-06, "loss": 1.4739, "mean_copy_accuracy": 0.04488868126645684, "mean_gen_accuracy": 0.8193408399820328, "mean_token_accuracy": 0.6218288987874985, "num_tokens": 48889649.0, "sample_num_tokens": 9060.75, "step": 180, "total_num_tokens": 48925892.0, "z_loss": 0.005996513180434704 }, { "copy_logits_max": 3.5598769187927246, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.5, "epoch": 0.03696706663262701, "gen_logits_max": 15.420354843139648, "gen_logits_mean": -2.514174461364746, "gen_logits_min": -15.05392837524414, "gen_logits_std": 2.529041290283203, "gen_loss": 1.526696801185608, "grad_norm": 1.828337071440447, "learning_rate": 7.346938775510204e-06, "loss": 1.4236, "mean_copy_accuracy": 0.048396884463727474, "mean_gen_accuracy": 0.818640947341919, "mean_token_accuracy": 0.6358779221773148, "num_tokens": 49164954.0, "sample_num_tokens": 8863.0, "step": 181, "total_num_tokens": 49200406.0, "z_loss": 0.005615205504000187 }, { "copy_logits_max": 4.351193904876709, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.8125, "epoch": 0.03717130456982384, "gen_logits_max": 15.760797500610352, "gen_logits_mean": -2.737454414367676, "gen_logits_min": -15.685338973999023, "gen_logits_std": 2.49727725982666, "gen_loss": 1.6954455375671387, "grad_norm": 3.3768916389961037, "learning_rate": 7.387755102040817e-06, "loss": 1.5488, "mean_copy_accuracy": 0.043107565492391586, "mean_gen_accuracy": 0.8230414092540741, "mean_token_accuracy": 0.6094843745231628, "num_tokens": 49449481.0, "sample_num_tokens": 8539.75, "step": 182, "total_num_tokens": 49483640.0, "z_loss": 0.005438497290015221 }, { "copy_logits_max": 4.676092624664307, "copy_logits_min": -687499904.0, "copy_num_tokens": 562.5625, "epoch": 0.03737554250702068, "gen_logits_max": 15.67310905456543, "gen_logits_mean": -2.4767043590545654, "gen_logits_min": -15.254802703857422, "gen_logits_std": 2.4791665077209473, "gen_loss": 1.6214864253997803, "grad_norm": 3.655947898807197, "learning_rate": 7.428571428571429e-06, "loss": 1.4445, "mean_copy_accuracy": 0.04635883681476116, "mean_gen_accuracy": 0.8162380903959274, "mean_token_accuracy": 0.6290844678878784, "num_tokens": 49737115.0, "sample_num_tokens": 8994.75, "step": 183, "total_num_tokens": 49773094.0, "z_loss": 0.0056136008352041245 }, { "copy_logits_max": 4.6480712890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.0, "epoch": 0.03757978044421751, "gen_logits_max": 16.235393524169922, "gen_logits_mean": -2.16386342048645, "gen_logits_min": -14.893739700317383, "gen_logits_std": 2.485039710998535, "gen_loss": 1.41769540309906, "grad_norm": 2.1004844026980134, "learning_rate": 7.469387755102041e-06, "loss": 1.2699, "mean_copy_accuracy": 0.06304344814270735, "mean_gen_accuracy": 0.8288749307394028, "mean_token_accuracy": 0.6577936708927155, "num_tokens": 49988354.0, "sample_num_tokens": 9186.5, "step": 184, "total_num_tokens": 50025100.0, "z_loss": 0.005840410478413105 }, { "copy_logits_max": 4.858803749084473, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.8125, "epoch": 0.03778401838141435, "gen_logits_max": 18.094844818115234, "gen_logits_mean": -1.3860807418823242, "gen_logits_min": -13.676172256469727, "gen_logits_std": 2.4485220909118652, "gen_loss": 1.1818976402282715, "grad_norm": 2.1011433278648606, "learning_rate": 7.5102040816326536e-06, "loss": 1.2446, "mean_copy_accuracy": 0.06668279506266117, "mean_gen_accuracy": 0.8274768441915512, "mean_token_accuracy": 0.664390817284584, "num_tokens": 50242595.0, "sample_num_tokens": 8387.25, "step": 185, "total_num_tokens": 50276144.0, "z_loss": 0.006124170962721109 }, { "copy_logits_max": 4.620163917541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 255.875, "epoch": 0.037988256318611184, "gen_logits_max": 19.350358963012695, "gen_logits_mean": -1.1642022132873535, "gen_logits_min": -13.796688079833984, "gen_logits_std": 2.453838348388672, "gen_loss": 1.0547103881835938, "grad_norm": 3.5988937244036774, "learning_rate": 7.551020408163265e-06, "loss": 1.2834, "mean_copy_accuracy": 0.06021048128604889, "mean_gen_accuracy": 0.833522692322731, "mean_token_accuracy": 0.6624544709920883, "num_tokens": 50511953.0, "sample_num_tokens": 6730.75, "step": 186, "total_num_tokens": 50538876.0, "z_loss": 0.006460933014750481 }, { "copy_logits_max": 6.222530364990234, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.4375, "epoch": 0.038192494255808014, "gen_logits_max": 17.505050659179688, "gen_logits_mean": -2.1629738807678223, "gen_logits_min": -14.80245304107666, "gen_logits_std": 2.501399517059326, "gen_loss": 1.531411051750183, "grad_norm": 4.181768063256812, "learning_rate": 7.591836734693877e-06, "loss": 1.4471, "mean_copy_accuracy": 0.06508148740977049, "mean_gen_accuracy": 0.8163024485111237, "mean_token_accuracy": 0.6111390143632889, "num_tokens": 50762041.0, "sample_num_tokens": 7973.25, "step": 187, "total_num_tokens": 50793934.0, "z_loss": 0.005982615984976292 }, { "copy_logits_max": 5.413874626159668, "copy_logits_min": -687500032.0, "copy_num_tokens": 655.875, "epoch": 0.03839673219300485, "gen_logits_max": 15.452495574951172, "gen_logits_mean": -2.246565580368042, "gen_logits_min": -14.928654670715332, "gen_logits_std": 2.490100860595703, "gen_loss": 1.5537731647491455, "grad_norm": 3.474263700747834, "learning_rate": 7.63265306122449e-06, "loss": 1.3781, "mean_copy_accuracy": 0.06761266104876995, "mean_gen_accuracy": 0.8239929229021072, "mean_token_accuracy": 0.6331105679273605, "num_tokens": 51023519.0, "sample_num_tokens": 9186.75, "step": 188, "total_num_tokens": 51060266.0, "z_loss": 0.005721116438508034 }, { "copy_logits_max": 6.307609558105469, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5, "epoch": 0.03860097013020169, "gen_logits_max": 17.970691680908203, "gen_logits_mean": -1.9131255149841309, "gen_logits_min": -14.82356071472168, "gen_logits_std": 2.4989404678344727, "gen_loss": 1.2172189950942993, "grad_norm": 2.214305067246611, "learning_rate": 7.673469387755102e-06, "loss": 1.3026, "mean_copy_accuracy": 0.07956800982356071, "mean_gen_accuracy": 0.8127826899290085, "mean_token_accuracy": 0.6478521525859833, "num_tokens": 51269771.0, "sample_num_tokens": 7121.75, "step": 189, "total_num_tokens": 51298258.0, "z_loss": 0.0061336238868534565 }, { "copy_logits_max": 5.868645668029785, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.8125, "epoch": 0.03880520806739852, "gen_logits_max": 17.179786682128906, "gen_logits_mean": -1.8639922142028809, "gen_logits_min": -15.271303176879883, "gen_logits_std": 2.5044658184051514, "gen_loss": 1.230647325515747, "grad_norm": 2.6284166961714277, "learning_rate": 7.714285714285714e-06, "loss": 1.3669, "mean_copy_accuracy": 0.08037722948938608, "mean_gen_accuracy": 0.827299565076828, "mean_token_accuracy": 0.6342322379350662, "num_tokens": 51552910.0, "sample_num_tokens": 7867.5, "step": 190, "total_num_tokens": 51584380.0, "z_loss": 0.0062379916198551655 }, { "copy_logits_max": 6.395384788513184, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.0625, "epoch": 0.039009446004595355, "gen_logits_max": 17.733518600463867, "gen_logits_mean": -1.6649057865142822, "gen_logits_min": -14.804603576660156, "gen_logits_std": 2.516559600830078, "gen_loss": 1.263792634010315, "grad_norm": 4.484100437575168, "learning_rate": 7.755102040816327e-06, "loss": 1.3947, "mean_copy_accuracy": 0.07343006692826748, "mean_gen_accuracy": 0.8231886476278305, "mean_token_accuracy": 0.6215732097625732, "num_tokens": 51825843.0, "sample_num_tokens": 8073.25, "step": 191, "total_num_tokens": 51858136.0, "z_loss": 0.006123990286141634 }, { "copy_logits_max": 5.147528648376465, "copy_logits_min": -687499968.0, "copy_num_tokens": 417.5, "epoch": 0.039213683941792185, "gen_logits_max": 17.524620056152344, "gen_logits_mean": -0.8816179633140564, "gen_logits_min": -13.392288208007812, "gen_logits_std": 2.478119373321533, "gen_loss": 1.208829641342163, "grad_norm": 4.85967319256569, "learning_rate": 7.79591836734694e-06, "loss": 1.1701, "mean_copy_accuracy": 0.09685856848955154, "mean_gen_accuracy": 0.8234604746103287, "mean_token_accuracy": 0.6710903644561768, "num_tokens": 52079959.0, "sample_num_tokens": 7475.25, "step": 192, "total_num_tokens": 52109860.0, "z_loss": 0.0063698310405015945 }, { "copy_logits_max": 6.581585884094238, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5625, "epoch": 0.03941792187898902, "gen_logits_max": 18.570003509521484, "gen_logits_mean": -1.2685215473175049, "gen_logits_min": -14.087724685668945, "gen_logits_std": 2.480684518814087, "gen_loss": 1.1738636493682861, "grad_norm": 3.3140870368468702, "learning_rate": 7.836734693877553e-06, "loss": 1.2763, "mean_copy_accuracy": 0.08812104538083076, "mean_gen_accuracy": 0.8312075436115265, "mean_token_accuracy": 0.6541620641946793, "num_tokens": 52362317.0, "sample_num_tokens": 8647.25, "step": 193, "total_num_tokens": 52396906.0, "z_loss": 0.006324429530650377 }, { "copy_logits_max": 6.284794807434082, "copy_logits_min": -687500032.0, "copy_num_tokens": 312.8125, "epoch": 0.03962215981618586, "gen_logits_max": 18.014278411865234, "gen_logits_mean": -0.8744681477546692, "gen_logits_min": -13.355145454406738, "gen_logits_std": 2.458125352859497, "gen_loss": 1.110029697418213, "grad_norm": 2.6222011817103104, "learning_rate": 7.877551020408164e-06, "loss": 1.2208, "mean_copy_accuracy": 0.09444012492895126, "mean_gen_accuracy": 0.8156919628381729, "mean_token_accuracy": 0.66291144490242, "num_tokens": 52624516.0, "sample_num_tokens": 6637.5, "step": 194, "total_num_tokens": 52651066.0, "z_loss": 0.006597050931304693 }, { "copy_logits_max": 6.959019184112549, "copy_logits_min": -750000000.0, "copy_num_tokens": 785.8125, "epoch": 0.03982639775338269, "gen_logits_max": 16.088045120239258, "gen_logits_mean": -1.7025258541107178, "gen_logits_min": -14.18149471282959, "gen_logits_std": 2.4777708053588867, "gen_loss": 1.5308680534362793, "grad_norm": 5.237512293528287, "learning_rate": 7.918367346938774e-06, "loss": 1.3916, "mean_copy_accuracy": 0.09353894181549549, "mean_gen_accuracy": 0.8211493641138077, "mean_token_accuracy": 0.6128822416067123, "num_tokens": 52907268.0, "sample_num_tokens": 9504.0, "step": 195, "total_num_tokens": 52945284.0, "z_loss": 0.00633987644687295 }, { "copy_logits_max": 7.8909101486206055, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.1875, "epoch": 0.040030635690579526, "gen_logits_max": 18.431652069091797, "gen_logits_mean": -0.8455494046211243, "gen_logits_min": -13.911916732788086, "gen_logits_std": 2.468217372894287, "gen_loss": 1.3548147678375244, "grad_norm": 5.429583703364336, "learning_rate": 7.959183673469388e-06, "loss": 1.1345, "mean_copy_accuracy": 0.10480443947017193, "mean_gen_accuracy": 0.8235245645046234, "mean_token_accuracy": 0.6829423904418945, "num_tokens": 53173588.0, "sample_num_tokens": 8642.0, "step": 196, "total_num_tokens": 53208156.0, "z_loss": 0.0067257629707455635 }, { "copy_logits_max": 7.762280464172363, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.4375, "epoch": 0.040234873627776356, "gen_logits_max": 16.615657806396484, "gen_logits_mean": -1.8474527597427368, "gen_logits_min": -14.653738021850586, "gen_logits_std": 2.5259077548980713, "gen_loss": 1.5083097219467163, "grad_norm": 3.659324190042751, "learning_rate": 8e-06, "loss": 1.368, "mean_copy_accuracy": 0.1100818682461977, "mean_gen_accuracy": 0.8211876153945923, "mean_token_accuracy": 0.6193948090076447, "num_tokens": 53453502.0, "sample_num_tokens": 9887.5, "step": 197, "total_num_tokens": 53493052.0, "z_loss": 0.0063684238120913506 }, { "copy_logits_max": 7.709421634674072, "copy_logits_min": -687500032.0, "copy_num_tokens": 738.0, "epoch": 0.04043911156497319, "gen_logits_max": 16.087770462036133, "gen_logits_mean": -1.9843542575836182, "gen_logits_min": -14.699874877929688, "gen_logits_std": 2.4743332862854004, "gen_loss": 1.492227554321289, "grad_norm": 2.737558678259505, "learning_rate": 8.040816326530611e-06, "loss": 1.3147, "mean_copy_accuracy": 0.11567025259137154, "mean_gen_accuracy": 0.8168688863515854, "mean_token_accuracy": 0.634861558675766, "num_tokens": 53713905.0, "sample_num_tokens": 10454.25, "step": 198, "total_num_tokens": 53755722.0, "z_loss": 0.006296142004430294 }, { "copy_logits_max": 7.890571117401123, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.0, "epoch": 0.04064334950217003, "gen_logits_max": 19.415224075317383, "gen_logits_mean": -0.8162095546722412, "gen_logits_min": -13.428437232971191, "gen_logits_std": 2.4438436031341553, "gen_loss": 1.0153216123580933, "grad_norm": 3.4212268914594266, "learning_rate": 8.081632653061225e-06, "loss": 1.061, "mean_copy_accuracy": 0.15889528207480907, "mean_gen_accuracy": 0.8268612772226334, "mean_token_accuracy": 0.688885435461998, "num_tokens": 53965926.0, "sample_num_tokens": 6957.5, "step": 199, "total_num_tokens": 53993756.0, "z_loss": 0.006844506599009037 }, { "copy_logits_max": 7.233445167541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.3125, "epoch": 0.04084758743936686, "gen_logits_max": 19.64327621459961, "gen_logits_mean": -0.39995017647743225, "gen_logits_min": -13.324122428894043, "gen_logits_std": 2.4508862495422363, "gen_loss": 0.9687333703041077, "grad_norm": 2.2353865736876006, "learning_rate": 8.122448979591837e-06, "loss": 1.1757, "mean_copy_accuracy": 0.15982110053300858, "mean_gen_accuracy": 0.8164225816726685, "mean_token_accuracy": 0.6620576530694962, "num_tokens": 54216794.0, "sample_num_tokens": 8591.0, "step": 200, "total_num_tokens": 54251158.0, "z_loss": 0.006840687710791826 }, { "copy_logits_max": 7.569568157196045, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.6875, "epoch": 0.0410518253765637, "gen_logits_max": 18.570068359375, "gen_logits_mean": -0.8485678434371948, "gen_logits_min": -14.073381423950195, "gen_logits_std": 2.515064239501953, "gen_loss": 1.2457430362701416, "grad_norm": 2.771259332939073, "learning_rate": 8.16326530612245e-06, "loss": 1.1699, "mean_copy_accuracy": 0.141862029209733, "mean_gen_accuracy": 0.8224043697118759, "mean_token_accuracy": 0.6680694371461868, "num_tokens": 54496638.0, "sample_num_tokens": 9297.0, "step": 201, "total_num_tokens": 54533826.0, "z_loss": 0.006632478442043066 }, { "copy_logits_max": 8.309343338012695, "copy_logits_min": -687500032.0, "copy_num_tokens": 470.0625, "epoch": 0.041256063313760534, "gen_logits_max": 17.840789794921875, "gen_logits_mean": -0.9759266376495361, "gen_logits_min": -14.26498031616211, "gen_logits_std": 2.469733953475952, "gen_loss": 1.2282838821411133, "grad_norm": 2.701432210751704, "learning_rate": 8.204081632653062e-06, "loss": 1.1827, "mean_copy_accuracy": 0.14848944172263145, "mean_gen_accuracy": 0.8279850333929062, "mean_token_accuracy": 0.6616006642580032, "num_tokens": 54776081.0, "sample_num_tokens": 8436.25, "step": 202, "total_num_tokens": 54809826.0, "z_loss": 0.0066234152764081955 }, { "copy_logits_max": 7.901510238647461, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.625, "epoch": 0.041460301250957364, "gen_logits_max": 18.254236221313477, "gen_logits_mean": -0.7656098008155823, "gen_logits_min": -13.548554420471191, "gen_logits_std": 2.498544931411743, "gen_loss": 1.1036508083343506, "grad_norm": 2.064093521668526, "learning_rate": 8.244897959183674e-06, "loss": 0.9955, "mean_copy_accuracy": 0.21018094941973686, "mean_gen_accuracy": 0.8262828886508942, "mean_token_accuracy": 0.6995692253112793, "num_tokens": 55042676.0, "sample_num_tokens": 8320.0, "step": 203, "total_num_tokens": 55075956.0, "z_loss": 0.006815421860665083 }, { "copy_logits_max": 8.718034744262695, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.6875, "epoch": 0.0416645391881542, "gen_logits_max": 19.290096282958984, "gen_logits_mean": -0.6147240400314331, "gen_logits_min": -13.450429916381836, "gen_logits_std": 2.467068672180176, "gen_loss": 1.0772171020507812, "grad_norm": 3.9740116471739073, "learning_rate": 8.285714285714287e-06, "loss": 1.1411, "mean_copy_accuracy": 0.16743046045303345, "mean_gen_accuracy": 0.8234656453132629, "mean_token_accuracy": 0.6712160408496857, "num_tokens": 55293972.0, "sample_num_tokens": 7176.5, "step": 204, "total_num_tokens": 55322678.0, "z_loss": 0.007084951736032963 }, { "copy_logits_max": 9.36131477355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.9375, "epoch": 0.04186877712535103, "gen_logits_max": 18.285289764404297, "gen_logits_mean": -0.7955367565155029, "gen_logits_min": -14.173450469970703, "gen_logits_std": 2.481675386428833, "gen_loss": 1.1942123174667358, "grad_norm": 2.965007047016161, "learning_rate": 8.326530612244899e-06, "loss": 1.1449, "mean_copy_accuracy": 0.1743188425898552, "mean_gen_accuracy": 0.8270836025476456, "mean_token_accuracy": 0.6587987244129181, "num_tokens": 55602075.0, "sample_num_tokens": 9256.75, "step": 205, "total_num_tokens": 55639102.0, "z_loss": 0.007168139796704054 }, { "copy_logits_max": 8.774736404418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.3125, "epoch": 0.04207301506254787, "gen_logits_max": 18.022560119628906, "gen_logits_mean": -0.46758395433425903, "gen_logits_min": -13.759920120239258, "gen_logits_std": 2.4866342544555664, "gen_loss": 1.232496976852417, "grad_norm": 2.471276524978534, "learning_rate": 8.36734693877551e-06, "loss": 1.1892, "mean_copy_accuracy": 0.1873854361474514, "mean_gen_accuracy": 0.8217073976993561, "mean_token_accuracy": 0.6565968841314316, "num_tokens": 55880958.0, "sample_num_tokens": 7547.0, "step": 206, "total_num_tokens": 55911146.0, "z_loss": 0.007390889339148998 }, { "copy_logits_max": 9.394164085388184, "copy_logits_min": -750000000.0, "copy_num_tokens": 735.4375, "epoch": 0.042277252999744705, "gen_logits_max": 18.28595733642578, "gen_logits_mean": -0.3798733949661255, "gen_logits_min": -13.048201560974121, "gen_logits_std": 2.4099323749542236, "gen_loss": 1.1695315837860107, "grad_norm": 2.816998212448906, "learning_rate": 8.408163265306122e-06, "loss": 1.1556, "mean_copy_accuracy": 0.205304604023695, "mean_gen_accuracy": 0.8241191059350967, "mean_token_accuracy": 0.6585402488708496, "num_tokens": 56173421.0, "sample_num_tokens": 11064.25, "step": 207, "total_num_tokens": 56217678.0, "z_loss": 0.007340020965784788 }, { "copy_logits_max": 8.996871948242188, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.5, "epoch": 0.042481490936941535, "gen_logits_max": 18.188709259033203, "gen_logits_mean": -0.2485104203224182, "gen_logits_min": -13.24935531616211, "gen_logits_std": 2.4457077980041504, "gen_loss": 1.2583673000335693, "grad_norm": 2.8114078029450726, "learning_rate": 8.448979591836734e-06, "loss": 1.2099, "mean_copy_accuracy": 0.2123684138059616, "mean_gen_accuracy": 0.8191992193460464, "mean_token_accuracy": 0.6487045735120773, "num_tokens": 56449135.0, "sample_num_tokens": 8684.75, "step": 208, "total_num_tokens": 56483874.0, "z_loss": 0.00778938690200448 }, { "copy_logits_max": 10.240531921386719, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.875, "epoch": 0.04268572887413837, "gen_logits_max": 20.428466796875, "gen_logits_mean": -0.015172306448221207, "gen_logits_min": -13.422272682189941, "gen_logits_std": 2.441107749938965, "gen_loss": 0.9449967741966248, "grad_norm": 4.0524579939184076, "learning_rate": 8.489795918367347e-06, "loss": 1.0235, "mean_copy_accuracy": 0.23572328686714172, "mean_gen_accuracy": 0.8257973939180374, "mean_token_accuracy": 0.7012067586183548, "num_tokens": 56714910.0, "sample_num_tokens": 7833.0, "step": 209, "total_num_tokens": 56746242.0, "z_loss": 0.007535854354500771 }, { "copy_logits_max": 10.319262504577637, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.8125, "epoch": 0.04288996681133521, "gen_logits_max": 18.45291519165039, "gen_logits_mean": -0.4233894646167755, "gen_logits_min": -15.504935264587402, "gen_logits_std": 2.539619207382202, "gen_loss": 1.2223622798919678, "grad_norm": 3.588174216935488, "learning_rate": 8.53061224489796e-06, "loss": 1.1682, "mean_copy_accuracy": 0.21497662737965584, "mean_gen_accuracy": 0.8247158080339432, "mean_token_accuracy": 0.6622321903705597, "num_tokens": 56984345.0, "sample_num_tokens": 8155.25, "step": 210, "total_num_tokens": 57016966.0, "z_loss": 0.007601022254675627 }, { "copy_logits_max": 10.397897720336914, "copy_logits_min": -624999936.0, "copy_num_tokens": 399.3125, "epoch": 0.04309420474853204, "gen_logits_max": 18.765867233276367, "gen_logits_mean": -0.07708373665809631, "gen_logits_min": -13.8007173538208, "gen_logits_std": 2.477530002593994, "gen_loss": 1.129248023033142, "grad_norm": 2.6304781633025462, "learning_rate": 8.571428571428571e-06, "loss": 1.1306, "mean_copy_accuracy": 0.22107388824224472, "mean_gen_accuracy": 0.8255243301391602, "mean_token_accuracy": 0.6750030964612961, "num_tokens": 57267478.0, "sample_num_tokens": 7743.5, "step": 211, "total_num_tokens": 57298452.0, "z_loss": 0.007868686690926552 }, { "copy_logits_max": 9.433669090270996, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.1875, "epoch": 0.043298442685728876, "gen_logits_max": 18.351821899414062, "gen_logits_mean": -0.3335738182067871, "gen_logits_min": -13.1907377243042, "gen_logits_std": 2.44150447845459, "gen_loss": 0.9884703159332275, "grad_norm": 3.1515880432406203, "learning_rate": 8.612244897959184e-06, "loss": 1.0473, "mean_copy_accuracy": 0.2540598288178444, "mean_gen_accuracy": 0.8180442452430725, "mean_token_accuracy": 0.6892678439617157, "num_tokens": 57525729.0, "sample_num_tokens": 9149.25, "step": 212, "total_num_tokens": 57562326.0, "z_loss": 0.007721829693764448 }, { "copy_logits_max": 12.28240966796875, "copy_logits_min": -687500032.0, "copy_num_tokens": 619.3125, "epoch": 0.043502680622925706, "gen_logits_max": 19.10744857788086, "gen_logits_mean": -0.3519388437271118, "gen_logits_min": -14.229814529418945, "gen_logits_std": 2.4879391193389893, "gen_loss": 1.1845216751098633, "grad_norm": 2.6623747563714413, "learning_rate": 8.653061224489796e-06, "loss": 1.2058, "mean_copy_accuracy": 0.24258000031113625, "mean_gen_accuracy": 0.816080778837204, "mean_token_accuracy": 0.6489794105291367, "num_tokens": 57817794.0, "sample_num_tokens": 9758.5, "step": 213, "total_num_tokens": 57856828.0, "z_loss": 0.008076674304902554 }, { "copy_logits_max": 10.261638641357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.8125, "epoch": 0.04370691856012254, "gen_logits_max": 19.490478515625, "gen_logits_mean": 0.3557378053665161, "gen_logits_min": -13.010141372680664, "gen_logits_std": 2.45896577835083, "gen_loss": 0.9739574193954468, "grad_norm": 2.6502825675692874, "learning_rate": 8.693877551020408e-06, "loss": 1.0395, "mean_copy_accuracy": 0.2639022469520569, "mean_gen_accuracy": 0.8160045444965363, "mean_token_accuracy": 0.6999426782131195, "num_tokens": 58097177.0, "sample_num_tokens": 8685.25, "step": 214, "total_num_tokens": 58131918.0, "z_loss": 0.008011983707547188 }, { "copy_logits_max": 11.405035018920898, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.25, "epoch": 0.04391115649731938, "gen_logits_max": 19.612735748291016, "gen_logits_mean": 0.3052937388420105, "gen_logits_min": -13.720640182495117, "gen_logits_std": 2.4520554542541504, "gen_loss": 0.968535304069519, "grad_norm": 3.1729738233822804, "learning_rate": 8.734693877551021e-06, "loss": 1.0381, "mean_copy_accuracy": 0.2803967744112015, "mean_gen_accuracy": 0.8224339783191681, "mean_token_accuracy": 0.6952160447835922, "num_tokens": 58345477.0, "sample_num_tokens": 8195.75, "step": 215, "total_num_tokens": 58378260.0, "z_loss": 0.00833409745246172 }, { "copy_logits_max": 11.564542770385742, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.875, "epoch": 0.04411539443451621, "gen_logits_max": 19.638900756835938, "gen_logits_mean": 0.2593845725059509, "gen_logits_min": -13.381373405456543, "gen_logits_std": 2.423738479614258, "gen_loss": 1.0641164779663086, "grad_norm": 2.7301604470644243, "learning_rate": 8.775510204081633e-06, "loss": 1.0732, "mean_copy_accuracy": 0.2985445261001587, "mean_gen_accuracy": 0.8156170547008514, "mean_token_accuracy": 0.6789926290512085, "num_tokens": 58619496.0, "sample_num_tokens": 9134.0, "step": 216, "total_num_tokens": 58656032.0, "z_loss": 0.008377794176340103 }, { "copy_logits_max": 10.94249153137207, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.4375, "epoch": 0.04431963237171305, "gen_logits_max": 20.30185890197754, "gen_logits_mean": 0.6929407119750977, "gen_logits_min": -12.280216217041016, "gen_logits_std": 2.4473164081573486, "gen_loss": 0.8997340202331543, "grad_norm": 3.9513127521967717, "learning_rate": 8.816326530612247e-06, "loss": 0.9906, "mean_copy_accuracy": 0.2984873726963997, "mean_gen_accuracy": 0.8191156089305878, "mean_token_accuracy": 0.7016932815313339, "num_tokens": 58881791.0, "sample_num_tokens": 7446.75, "step": 217, "total_num_tokens": 58911578.0, "z_loss": 0.008439131081104279 }, { "copy_logits_max": 11.011430740356445, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.5625, "epoch": 0.04452387030890988, "gen_logits_max": 19.10193634033203, "gen_logits_mean": 0.2943873703479767, "gen_logits_min": -12.752486228942871, "gen_logits_std": 2.466684579849243, "gen_loss": 0.9618228673934937, "grad_norm": 1.6645015210112866, "learning_rate": 8.857142857142858e-06, "loss": 0.9256, "mean_copy_accuracy": 0.34042228758335114, "mean_gen_accuracy": 0.8272972702980042, "mean_token_accuracy": 0.7182482331991196, "num_tokens": 59131325.0, "sample_num_tokens": 7491.25, "step": 218, "total_num_tokens": 59161290.0, "z_loss": 0.008723508566617966 }, { "copy_logits_max": 13.291006088256836, "copy_logits_min": -562499968.0, "copy_num_tokens": 330.375, "epoch": 0.044728108246106714, "gen_logits_max": 21.639406204223633, "gen_logits_mean": 0.6595839858055115, "gen_logits_min": -12.773493766784668, "gen_logits_std": 2.4350507259368896, "gen_loss": 0.9242995381355286, "grad_norm": 2.265544728014234, "learning_rate": 8.89795918367347e-06, "loss": 0.9782, "mean_copy_accuracy": 0.3590652570128441, "mean_gen_accuracy": 0.8146993666887283, "mean_token_accuracy": 0.7144950926303864, "num_tokens": 59373110.0, "sample_num_tokens": 7458.0, "step": 219, "total_num_tokens": 59402942.0, "z_loss": 0.008614521473646164 }, { "copy_logits_max": 11.984346389770508, "copy_logits_min": -687499904.0, "copy_num_tokens": 652.5, "epoch": 0.04493234618330355, "gen_logits_max": 17.505964279174805, "gen_logits_mean": -0.16963806748390198, "gen_logits_min": -14.534191131591797, "gen_logits_std": 2.4908430576324463, "gen_loss": 1.158292531967163, "grad_norm": 2.3724264019488954, "learning_rate": 8.938775510204082e-06, "loss": 1.022, "mean_copy_accuracy": 0.3492289409041405, "mean_gen_accuracy": 0.8163708299398422, "mean_token_accuracy": 0.7056114226579666, "num_tokens": 59630334.0, "sample_num_tokens": 9314.5, "step": 220, "total_num_tokens": 59667592.0, "z_loss": 0.009037336334586143 }, { "copy_logits_max": 13.143251419067383, "copy_logits_min": -687500032.0, "copy_num_tokens": 668.1875, "epoch": 0.04513658412050038, "gen_logits_max": 18.807605743408203, "gen_logits_mean": 0.2075028121471405, "gen_logits_min": -14.00195598602295, "gen_logits_std": 2.479769229888916, "gen_loss": 1.0705690383911133, "grad_norm": 2.567569595958816, "learning_rate": 8.979591836734694e-06, "loss": 1.0538, "mean_copy_accuracy": 0.34453459084033966, "mean_gen_accuracy": 0.8175877332687378, "mean_token_accuracy": 0.6921034008264542, "num_tokens": 59899174.0, "sample_num_tokens": 9914.0, "step": 221, "total_num_tokens": 59938830.0, "z_loss": 0.009153975173830986 }, { "copy_logits_max": 13.158060073852539, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.1875, "epoch": 0.04534082205769722, "gen_logits_max": 19.91530990600586, "gen_logits_mean": 0.5064417123794556, "gen_logits_min": -13.63334846496582, "gen_logits_std": 2.4822847843170166, "gen_loss": 0.930882453918457, "grad_norm": 5.453368660024857, "learning_rate": 9.020408163265305e-06, "loss": 0.9912, "mean_copy_accuracy": 0.38409287482500076, "mean_gen_accuracy": 0.8147111982107162, "mean_token_accuracy": 0.7052944153547287, "num_tokens": 60165102.0, "sample_num_tokens": 7380.5, "step": 222, "total_num_tokens": 60194624.0, "z_loss": 0.009136587381362915 }, { "copy_logits_max": 14.404080390930176, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.4375, "epoch": 0.045545059994894055, "gen_logits_max": 19.39330291748047, "gen_logits_mean": 0.36953842639923096, "gen_logits_min": -14.820420265197754, "gen_logits_std": 2.4424004554748535, "gen_loss": 1.043508768081665, "grad_norm": 3.133607988617104, "learning_rate": 9.061224489795919e-06, "loss": 1.0054, "mean_copy_accuracy": 0.39074431359767914, "mean_gen_accuracy": 0.8206281214952469, "mean_token_accuracy": 0.712597668170929, "num_tokens": 60413973.0, "sample_num_tokens": 8394.25, "step": 223, "total_num_tokens": 60447550.0, "z_loss": 0.009300626814365387 }, { "copy_logits_max": 14.455404281616211, "copy_logits_min": -687499904.0, "copy_num_tokens": 373.8125, "epoch": 0.045749297932090885, "gen_logits_max": 20.46621322631836, "gen_logits_mean": 0.6019715070724487, "gen_logits_min": -13.391522407531738, "gen_logits_std": 2.4634368419647217, "gen_loss": 0.957994818687439, "grad_norm": 2.686461935374302, "learning_rate": 9.10204081632653e-06, "loss": 0.935, "mean_copy_accuracy": 0.36811020970344543, "mean_gen_accuracy": 0.8239792585372925, "mean_token_accuracy": 0.7203884869813919, "num_tokens": 60682032.0, "sample_num_tokens": 8299.0, "step": 224, "total_num_tokens": 60715228.0, "z_loss": 0.009166544303297997 }, { "copy_logits_max": 12.99760913848877, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.9375, "epoch": 0.04595353586928772, "gen_logits_max": 19.664087295532227, "gen_logits_mean": 0.7921144962310791, "gen_logits_min": -12.919170379638672, "gen_logits_std": 2.416106700897217, "gen_loss": 0.9586464762687683, "grad_norm": 2.5542487044829794, "learning_rate": 9.142857142857144e-06, "loss": 0.9549, "mean_copy_accuracy": 0.3917250484228134, "mean_gen_accuracy": 0.820776954293251, "mean_token_accuracy": 0.717478334903717, "num_tokens": 60948545.0, "sample_num_tokens": 7361.75, "step": 225, "total_num_tokens": 60977992.0, "z_loss": 0.009517242200672626 }, { "copy_logits_max": 13.240978240966797, "copy_logits_min": -750000064.0, "copy_num_tokens": 432.3125, "epoch": 0.04615777380648455, "gen_logits_max": 19.407833099365234, "gen_logits_mean": 0.8548132181167603, "gen_logits_min": -12.459532737731934, "gen_logits_std": 2.409493923187256, "gen_loss": 0.8817785382270813, "grad_norm": 2.821335302518767, "learning_rate": 9.183673469387756e-06, "loss": 0.9179, "mean_copy_accuracy": 0.37055841088294983, "mean_gen_accuracy": 0.8281940668821335, "mean_token_accuracy": 0.7293533384799957, "num_tokens": 61209733.0, "sample_num_tokens": 8633.75, "step": 226, "total_num_tokens": 61244268.0, "z_loss": 0.009614239446818829 }, { "copy_logits_max": 15.403507232666016, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.125, "epoch": 0.04636201174368139, "gen_logits_max": 19.93764305114746, "gen_logits_mean": 1.0406134128570557, "gen_logits_min": -13.316944122314453, "gen_logits_std": 2.4640402793884277, "gen_loss": 1.0843777656555176, "grad_norm": 2.1944906830118422, "learning_rate": 9.224489795918367e-06, "loss": 0.9675, "mean_copy_accuracy": 0.41477520763874054, "mean_gen_accuracy": 0.818361833691597, "mean_token_accuracy": 0.7178725749254227, "num_tokens": 61469859.0, "sample_num_tokens": 9175.25, "step": 227, "total_num_tokens": 61506560.0, "z_loss": 0.010155457071959972 }, { "copy_logits_max": 11.70405101776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.9375, "epoch": 0.046566249680878226, "gen_logits_max": 20.119007110595703, "gen_logits_mean": 1.4371931552886963, "gen_logits_min": -12.520384788513184, "gen_logits_std": 2.40511417388916, "gen_loss": 0.8204251527786255, "grad_norm": 2.7165898934699366, "learning_rate": 9.26530612244898e-06, "loss": 0.995, "mean_copy_accuracy": 0.4049629494547844, "mean_gen_accuracy": 0.808481827378273, "mean_token_accuracy": 0.7106903642416, "num_tokens": 61735070.0, "sample_num_tokens": 8041.0, "step": 228, "total_num_tokens": 61767234.0, "z_loss": 0.009728880599141121 }, { "copy_logits_max": 12.92612075805664, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.9375, "epoch": 0.046770487618075056, "gen_logits_max": 19.918292999267578, "gen_logits_mean": 0.9265654683113098, "gen_logits_min": -12.244714736938477, "gen_logits_std": 2.455270767211914, "gen_loss": 0.8001508712768555, "grad_norm": 1.8982157750800526, "learning_rate": 9.306122448979593e-06, "loss": 0.927, "mean_copy_accuracy": 0.43376659601926804, "mean_gen_accuracy": 0.8294589221477509, "mean_token_accuracy": 0.7324887961149216, "num_tokens": 62010141.0, "sample_num_tokens": 8376.75, "step": 229, "total_num_tokens": 62043648.0, "z_loss": 0.01010814867913723 }, { "copy_logits_max": 16.1214542388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.1875, "epoch": 0.04697472555527189, "gen_logits_max": 20.0047607421875, "gen_logits_mean": 0.8497263193130493, "gen_logits_min": -14.142420768737793, "gen_logits_std": 2.476130962371826, "gen_loss": 1.0612788200378418, "grad_norm": 3.0504088045866506, "learning_rate": 9.346938775510204e-06, "loss": 0.9186, "mean_copy_accuracy": 0.4739571288228035, "mean_gen_accuracy": 0.816222608089447, "mean_token_accuracy": 0.7347744256258011, "num_tokens": 62273740.0, "sample_num_tokens": 9490.5, "step": 230, "total_num_tokens": 62311702.0, "z_loss": 0.011358540505170822 }, { "copy_logits_max": 14.728808403015137, "copy_logits_min": -687500032.0, "copy_num_tokens": 820.5625, "epoch": 0.04717896349246872, "gen_logits_max": 18.826438903808594, "gen_logits_mean": 0.4498274326324463, "gen_logits_min": -14.166170120239258, "gen_logits_std": 2.484534978866577, "gen_loss": 1.15060293674469, "grad_norm": 2.5795540158663344, "learning_rate": 9.387755102040818e-06, "loss": 1.0054, "mean_copy_accuracy": 0.4502963125705719, "mean_gen_accuracy": 0.8182011544704437, "mean_token_accuracy": 0.7138559818267822, "num_tokens": 62556135.0, "sample_num_tokens": 10423.75, "step": 231, "total_num_tokens": 62597830.0, "z_loss": 0.011666661128401756 }, { "copy_logits_max": 13.739233016967773, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.25, "epoch": 0.04738320142966556, "gen_logits_max": 20.086044311523438, "gen_logits_mean": 1.3084170818328857, "gen_logits_min": -12.571630477905273, "gen_logits_std": 2.4039313793182373, "gen_loss": 0.7859437465667725, "grad_norm": 2.5457103895040474, "learning_rate": 9.428571428571428e-06, "loss": 0.8588, "mean_copy_accuracy": 0.4805699735879898, "mean_gen_accuracy": 0.8293699473142624, "mean_token_accuracy": 0.7478636503219604, "num_tokens": 62840451.0, "sample_num_tokens": 8062.75, "step": 232, "total_num_tokens": 62872702.0, "z_loss": 0.010090749710798264 }, { "copy_logits_max": 14.59565258026123, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.6875, "epoch": 0.0475874393668624, "gen_logits_max": 19.669713973999023, "gen_logits_mean": 0.9092301726341248, "gen_logits_min": -12.04434585571289, "gen_logits_std": 2.3691089153289795, "gen_loss": 0.8683677911758423, "grad_norm": 2.6180138173165277, "learning_rate": 9.46938775510204e-06, "loss": 0.8728, "mean_copy_accuracy": 0.4964970275759697, "mean_gen_accuracy": 0.828391820192337, "mean_token_accuracy": 0.7528853714466095, "num_tokens": 63102675.0, "sample_num_tokens": 9063.75, "step": 233, "total_num_tokens": 63138930.0, "z_loss": 0.01089521124958992 }, { "copy_logits_max": 14.389992713928223, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.5, "epoch": 0.04779167730405923, "gen_logits_max": 17.952117919921875, "gen_logits_mean": 0.7126785516738892, "gen_logits_min": -13.457103729248047, "gen_logits_std": 2.367380380630493, "gen_loss": 0.9996317625045776, "grad_norm": 2.990305350706595, "learning_rate": 9.510204081632653e-06, "loss": 0.9533, "mean_copy_accuracy": 0.49258531630039215, "mean_gen_accuracy": 0.8161908835172653, "mean_token_accuracy": 0.7291106879711151, "num_tokens": 63393983.0, "sample_num_tokens": 8625.75, "step": 234, "total_num_tokens": 63428486.0, "z_loss": 0.01167040690779686 }, { "copy_logits_max": 19.527189254760742, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.75, "epoch": 0.047995915241256064, "gen_logits_max": 21.17768669128418, "gen_logits_mean": 0.9781937599182129, "gen_logits_min": -12.887962341308594, "gen_logits_std": 2.4742424488067627, "gen_loss": 0.9560816287994385, "grad_norm": 3.2127201337827493, "learning_rate": 9.551020408163265e-06, "loss": 0.8883, "mean_copy_accuracy": 0.4902190491557121, "mean_gen_accuracy": 0.8178011327981949, "mean_token_accuracy": 0.7346081286668777, "num_tokens": 63672181.0, "sample_num_tokens": 9395.25, "step": 235, "total_num_tokens": 63709762.0, "z_loss": 0.01147463172674179 }, { "copy_logits_max": 17.056171417236328, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.25, "epoch": 0.0482001531784529, "gen_logits_max": 21.02391815185547, "gen_logits_mean": 1.434338927268982, "gen_logits_min": -13.519644737243652, "gen_logits_std": 2.4139981269836426, "gen_loss": 0.8072724938392639, "grad_norm": 2.4396678592870518, "learning_rate": 9.591836734693878e-06, "loss": 0.8403, "mean_copy_accuracy": 0.5278849601745605, "mean_gen_accuracy": 0.821034386754036, "mean_token_accuracy": 0.754957988858223, "num_tokens": 63931252.0, "sample_num_tokens": 6602.0, "step": 236, "total_num_tokens": 63957660.0, "z_loss": 0.011015243828296661 }, { "copy_logits_max": 15.63701057434082, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.375, "epoch": 0.04840439111564973, "gen_logits_max": 21.9086971282959, "gen_logits_mean": 1.7228713035583496, "gen_logits_min": -11.716968536376953, "gen_logits_std": 2.4053969383239746, "gen_loss": 0.8274197578430176, "grad_norm": 2.8046606509634624, "learning_rate": 9.63265306122449e-06, "loss": 0.8149, "mean_copy_accuracy": 0.5504017472267151, "mean_gen_accuracy": 0.82364721596241, "mean_token_accuracy": 0.7638494819402695, "num_tokens": 64183367.0, "sample_num_tokens": 7497.75, "step": 237, "total_num_tokens": 64213358.0, "z_loss": 0.011251984164118767 }, { "copy_logits_max": 15.302522659301758, "copy_logits_min": -687500032.0, "copy_num_tokens": 327.875, "epoch": 0.04860862905284657, "gen_logits_max": 20.448871612548828, "gen_logits_mean": 1.3209667205810547, "gen_logits_min": -11.381547927856445, "gen_logits_std": 2.402101755142212, "gen_loss": 0.8062506914138794, "grad_norm": 3.466280338164196, "learning_rate": 9.673469387755102e-06, "loss": 0.9625, "mean_copy_accuracy": 0.4995785281062126, "mean_gen_accuracy": 0.8177999556064606, "mean_token_accuracy": 0.725414365530014, "num_tokens": 64467867.0, "sample_num_tokens": 7628.25, "step": 238, "total_num_tokens": 64498380.0, "z_loss": 0.010868029668927193 }, { "copy_logits_max": 15.011358261108398, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.3125, "epoch": 0.0488128669900434, "gen_logits_max": 18.883174896240234, "gen_logits_mean": 0.6383655071258545, "gen_logits_min": -12.148515701293945, "gen_logits_std": 2.4464123249053955, "gen_loss": 0.8378558158874512, "grad_norm": 2.878570135814634, "learning_rate": 9.714285714285715e-06, "loss": 0.8728, "mean_copy_accuracy": 0.5046984851360321, "mean_gen_accuracy": 0.8189926296472549, "mean_token_accuracy": 0.742345541715622, "num_tokens": 64736198.0, "sample_num_tokens": 7706.0, "step": 239, "total_num_tokens": 64767022.0, "z_loss": 0.011967284604907036 }, { "copy_logits_max": 16.41595458984375, "copy_logits_min": -749999872.0, "copy_num_tokens": 444.125, "epoch": 0.049017104927240235, "gen_logits_max": 19.864898681640625, "gen_logits_mean": 0.9083412885665894, "gen_logits_min": -12.444904327392578, "gen_logits_std": 2.4684503078460693, "gen_loss": 0.9377923011779785, "grad_norm": 6.9644077953315335, "learning_rate": 9.755102040816327e-06, "loss": 0.8623, "mean_copy_accuracy": 0.5804030746221542, "mean_gen_accuracy": 0.8203320056200027, "mean_token_accuracy": 0.7639439553022385, "num_tokens": 64986077.0, "sample_num_tokens": 7377.25, "step": 240, "total_num_tokens": 65015586.0, "z_loss": 0.013062568381428719 }, { "copy_logits_max": 16.63766860961914, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.4375, "epoch": 0.04922134286443707, "gen_logits_max": 20.734691619873047, "gen_logits_mean": 1.2560852766036987, "gen_logits_min": -11.635360717773438, "gen_logits_std": 2.376924991607666, "gen_loss": 0.785043478012085, "grad_norm": 2.1756089839647177, "learning_rate": 9.795918367346939e-06, "loss": 0.8497, "mean_copy_accuracy": 0.5550093501806259, "mean_gen_accuracy": 0.8302558213472366, "mean_token_accuracy": 0.7629297822713852, "num_tokens": 65231180.0, "sample_num_tokens": 7273.5, "step": 241, "total_num_tokens": 65260274.0, "z_loss": 0.011980395764112473 }, { "copy_logits_max": 18.97376823425293, "copy_logits_min": -687500032.0, "copy_num_tokens": 735.4375, "epoch": 0.0494255808016339, "gen_logits_max": 20.540569305419922, "gen_logits_mean": 1.004536509513855, "gen_logits_min": -12.310890197753906, "gen_logits_std": 2.409116268157959, "gen_loss": 0.9481694102287292, "grad_norm": 4.131459515443562, "learning_rate": 9.836734693877552e-06, "loss": 0.8986, "mean_copy_accuracy": 0.5161498636007309, "mean_gen_accuracy": 0.8222175389528275, "mean_token_accuracy": 0.7380973100662231, "num_tokens": 65499443.0, "sample_num_tokens": 10817.75, "step": 242, "total_num_tokens": 65542714.0, "z_loss": 0.013072647154331207 }, { "copy_logits_max": 22.131147384643555, "copy_logits_min": -625000000.0, "copy_num_tokens": 691.3125, "epoch": 0.04962981873883074, "gen_logits_max": 20.390304565429688, "gen_logits_mean": 1.0659360885620117, "gen_logits_min": -12.499553680419922, "gen_logits_std": 2.42732310295105, "gen_loss": 0.9801145792007446, "grad_norm": 2.4519374615146363, "learning_rate": 9.877551020408164e-06, "loss": 0.8839, "mean_copy_accuracy": 0.5647898763418198, "mean_gen_accuracy": 0.8215673714876175, "mean_token_accuracy": 0.7468262612819672, "num_tokens": 65796450.0, "sample_num_tokens": 9719.0, "step": 243, "total_num_tokens": 65835326.0, "z_loss": 0.01477324403822422 }, { "copy_logits_max": 16.416282653808594, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.625, "epoch": 0.04983405667602757, "gen_logits_max": 19.593488693237305, "gen_logits_mean": 0.8087419271469116, "gen_logits_min": -12.013084411621094, "gen_logits_std": 2.4181039333343506, "gen_loss": 0.8136142492294312, "grad_norm": 2.70658727687651, "learning_rate": 9.918367346938776e-06, "loss": 0.8136, "mean_copy_accuracy": 0.6094354391098022, "mean_gen_accuracy": 0.8241146206855774, "mean_token_accuracy": 0.7718391567468643, "num_tokens": 66067488.0, "sample_num_tokens": 8966.0, "step": 244, "total_num_tokens": 66103352.0, "z_loss": 0.013535790145397186 }, { "copy_logits_max": 15.67197322845459, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.1875, "epoch": 0.050038294613224406, "gen_logits_max": 20.067447662353516, "gen_logits_mean": 1.3467519283294678, "gen_logits_min": -12.60100269317627, "gen_logits_std": 2.3519957065582275, "gen_loss": 0.7338964939117432, "grad_norm": 2.544235382690079, "learning_rate": 9.959183673469387e-06, "loss": 0.7312, "mean_copy_accuracy": 0.6108577251434326, "mean_gen_accuracy": 0.8251013904809952, "mean_token_accuracy": 0.783539742231369, "num_tokens": 66344949.0, "sample_num_tokens": 8550.25, "step": 245, "total_num_tokens": 66379150.0, "z_loss": 0.012469001114368439 }, { "copy_logits_max": 16.061485290527344, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.875, "epoch": 0.05024253255042124, "gen_logits_max": 20.983863830566406, "gen_logits_mean": 1.729079246520996, "gen_logits_min": -11.401927947998047, "gen_logits_std": 2.3202805519104004, "gen_loss": 0.7355089783668518, "grad_norm": 2.2170233967676927, "learning_rate": 9.999999999999999e-06, "loss": 0.7839, "mean_copy_accuracy": 0.6229086965322495, "mean_gen_accuracy": 0.8197129666805267, "mean_token_accuracy": 0.7756600677967072, "num_tokens": 66597534.0, "sample_num_tokens": 9119.0, "step": 246, "total_num_tokens": 66634010.0, "z_loss": 0.012929629534482956 }, { "copy_logits_max": 17.474700927734375, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.125, "epoch": 0.05044677048761807, "gen_logits_max": 21.345643997192383, "gen_logits_mean": 1.669575572013855, "gen_logits_min": -10.993992805480957, "gen_logits_std": 2.392068862915039, "gen_loss": 0.7158191800117493, "grad_norm": 3.9694393499850436, "learning_rate": 1.0040816326530613e-05, "loss": 0.7818, "mean_copy_accuracy": 0.6468042880296707, "mean_gen_accuracy": 0.8314082473516464, "mean_token_accuracy": 0.7838725000619888, "num_tokens": 66867960.0, "sample_num_tokens": 8474.5, "step": 247, "total_num_tokens": 66901858.0, "z_loss": 0.013317208737134933 }, { "copy_logits_max": 21.302885055541992, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.1875, "epoch": 0.05065100842481491, "gen_logits_max": 20.286460876464844, "gen_logits_mean": 1.185070276260376, "gen_logits_min": -11.829534530639648, "gen_logits_std": 2.4018430709838867, "gen_loss": 0.778323769569397, "grad_norm": 4.162366908780564, "learning_rate": 1.0081632653061224e-05, "loss": 0.8042, "mean_copy_accuracy": 0.5919601321220398, "mean_gen_accuracy": 0.8255287408828735, "mean_token_accuracy": 0.7712641358375549, "num_tokens": 67132205.0, "sample_num_tokens": 6865.75, "step": 248, "total_num_tokens": 67159668.0, "z_loss": 0.013806572183966637 }, { "copy_logits_max": 20.937847137451172, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.1875, "epoch": 0.05085524636201175, "gen_logits_max": 19.942346572875977, "gen_logits_mean": 0.734269917011261, "gen_logits_min": -12.11971664428711, "gen_logits_std": 2.4022367000579834, "gen_loss": 0.7973306179046631, "grad_norm": 3.05055257743803, "learning_rate": 1.0122448979591836e-05, "loss": 0.8381, "mean_copy_accuracy": 0.6289480924606323, "mean_gen_accuracy": 0.8213125914335251, "mean_token_accuracy": 0.7676423639059067, "num_tokens": 67407964.0, "sample_num_tokens": 8750.0, "step": 249, "total_num_tokens": 67442964.0, "z_loss": 0.016357623040676117 }, { "copy_logits_max": 19.87725067138672, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.875, "epoch": 0.05105948429920858, "gen_logits_max": 19.79574203491211, "gen_logits_mean": 1.0667271614074707, "gen_logits_min": -11.636470794677734, "gen_logits_std": 2.3477048873901367, "gen_loss": 0.8137480020523071, "grad_norm": 4.160362801048779, "learning_rate": 1.016326530612245e-05, "loss": 0.8683, "mean_copy_accuracy": 0.599190428853035, "mean_gen_accuracy": 0.820781946182251, "mean_token_accuracy": 0.7584179639816284, "num_tokens": 67677163.0, "sample_num_tokens": 7561.75, "step": 250, "total_num_tokens": 67707410.0, "z_loss": 0.015555715188384056 }, { "copy_logits_max": 19.753366470336914, "copy_logits_min": -687500032.0, "copy_num_tokens": 592.75, "epoch": 0.051263722236405414, "gen_logits_max": 17.63125228881836, "gen_logits_mean": 0.4740431010723114, "gen_logits_min": -12.081117630004883, "gen_logits_std": 2.3427815437316895, "gen_loss": 0.8800212144851685, "grad_norm": 2.01640207815758, "learning_rate": 1.0204081632653061e-05, "loss": 0.7852, "mean_copy_accuracy": 0.655714601278305, "mean_gen_accuracy": 0.825475811958313, "mean_token_accuracy": 0.7840881645679474, "num_tokens": 67947544.0, "sample_num_tokens": 8620.5, "step": 251, "total_num_tokens": 67982026.0, "z_loss": 0.017220497131347656 }, { "copy_logits_max": 22.330463409423828, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.5625, "epoch": 0.051467960173602244, "gen_logits_max": 18.832597732543945, "gen_logits_mean": 0.7445457577705383, "gen_logits_min": -12.289752960205078, "gen_logits_std": 2.361340045928955, "gen_loss": 0.9081143140792847, "grad_norm": 3.7775945234236317, "learning_rate": 1.0244897959183675e-05, "loss": 0.811, "mean_copy_accuracy": 0.6301526576280594, "mean_gen_accuracy": 0.8271070569753647, "mean_token_accuracy": 0.7737657129764557, "num_tokens": 68217602.0, "sample_num_tokens": 8639.0, "step": 252, "total_num_tokens": 68252158.0, "z_loss": 0.016840171068906784 }, { "copy_logits_max": 22.05569839477539, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.875, "epoch": 0.05167219811079908, "gen_logits_max": 20.09473419189453, "gen_logits_mean": 1.2722233533859253, "gen_logits_min": -11.684003829956055, "gen_logits_std": 2.3386287689208984, "gen_loss": 0.7794272303581238, "grad_norm": 2.0057606405614967, "learning_rate": 1.0285714285714286e-05, "loss": 0.7704, "mean_copy_accuracy": 0.6916080713272095, "mean_gen_accuracy": 0.8191066831350327, "mean_token_accuracy": 0.7899493724107742, "num_tokens": 68482957.0, "sample_num_tokens": 9182.75, "step": 253, "total_num_tokens": 68519688.0, "z_loss": 0.01621488854289055 }, { "copy_logits_max": 18.806610107421875, "copy_logits_min": -750000000.0, "copy_num_tokens": 239.0, "epoch": 0.05187643604799592, "gen_logits_max": 21.222043991088867, "gen_logits_mean": 1.5019111633300781, "gen_logits_min": -10.951804161071777, "gen_logits_std": 2.2940850257873535, "gen_loss": 0.6117040514945984, "grad_norm": 2.376944484754408, "learning_rate": 1.0326530612244898e-05, "loss": 0.7174, "mean_copy_accuracy": 0.659827321767807, "mean_gen_accuracy": 0.831630066037178, "mean_token_accuracy": 0.7922707349061966, "num_tokens": 68723657.0, "sample_num_tokens": 6580.75, "step": 254, "total_num_tokens": 68749980.0, "z_loss": 0.01358532253652811 }, { "copy_logits_max": 19.779998779296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.9375, "epoch": 0.05208067398519275, "gen_logits_max": 17.843730926513672, "gen_logits_mean": 0.8347726464271545, "gen_logits_min": -11.635835647583008, "gen_logits_std": 2.324838638305664, "gen_loss": 0.7765686511993408, "grad_norm": 7.969047185928089, "learning_rate": 1.0367346938775512e-05, "loss": 0.7917, "mean_copy_accuracy": 0.6502136439085007, "mean_gen_accuracy": 0.8285649418830872, "mean_token_accuracy": 0.7813954651355743, "num_tokens": 68993628.0, "sample_num_tokens": 9217.0, "step": 255, "total_num_tokens": 69030496.0, "z_loss": 0.01925131306052208 }, { "copy_logits_max": 25.4896183013916, "copy_logits_min": -687499968.0, "copy_num_tokens": 491.3125, "epoch": 0.052284911922389585, "gen_logits_max": 19.255054473876953, "gen_logits_mean": 0.6798661947250366, "gen_logits_min": -13.361431121826172, "gen_logits_std": 2.4486396312713623, "gen_loss": 0.7907527685165405, "grad_norm": 2.723766634441986, "learning_rate": 1.0408163265306123e-05, "loss": 0.7932, "mean_copy_accuracy": 0.6389333456754684, "mean_gen_accuracy": 0.8280960917472839, "mean_token_accuracy": 0.7738615423440933, "num_tokens": 69267215.0, "sample_num_tokens": 7556.25, "step": 256, "total_num_tokens": 69297440.0, "z_loss": 0.019290991127490997 }, { "copy_logits_max": 23.27057647705078, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.375, "epoch": 0.052489149859586415, "gen_logits_max": 21.092010498046875, "gen_logits_mean": 1.3756989240646362, "gen_logits_min": -11.510250091552734, "gen_logits_std": 2.3779456615448, "gen_loss": 0.7421891689300537, "grad_norm": 3.0013036215111684, "learning_rate": 1.0448979591836733e-05, "loss": 0.7251, "mean_copy_accuracy": 0.7074778378009796, "mean_gen_accuracy": 0.828076958656311, "mean_token_accuracy": 0.7988956421613693, "num_tokens": 69524312.0, "sample_num_tokens": 8765.5, "step": 257, "total_num_tokens": 69559374.0, "z_loss": 0.01788087747991085 }, { "copy_logits_max": 23.79057502746582, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.0625, "epoch": 0.05269338779678325, "gen_logits_max": 20.418075561523438, "gen_logits_mean": 1.3945863246917725, "gen_logits_min": -11.55817985534668, "gen_logits_std": 2.3561925888061523, "gen_loss": 0.7149221897125244, "grad_norm": 2.190821254790608, "learning_rate": 1.0489795918367347e-05, "loss": 0.734, "mean_copy_accuracy": 0.7114300429821014, "mean_gen_accuracy": 0.8220135867595673, "mean_token_accuracy": 0.7946249544620514, "num_tokens": 69793825.0, "sample_num_tokens": 8640.25, "step": 258, "total_num_tokens": 69828386.0, "z_loss": 0.01912097819149494 }, { "copy_logits_max": 25.37484359741211, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.625, "epoch": 0.05289762573398009, "gen_logits_max": 20.380205154418945, "gen_logits_mean": 1.3690252304077148, "gen_logits_min": -12.055829048156738, "gen_logits_std": 2.3978328704833984, "gen_loss": 0.7342296838760376, "grad_norm": 3.0240075109060642, "learning_rate": 1.0530612244897959e-05, "loss": 0.8082, "mean_copy_accuracy": 0.6583452224731445, "mean_gen_accuracy": 0.8147303462028503, "mean_token_accuracy": 0.7725066095590591, "num_tokens": 70059989.0, "sample_num_tokens": 8136.25, "step": 259, "total_num_tokens": 70092534.0, "z_loss": 0.020475832745432854 }, { "copy_logits_max": 21.30191421508789, "copy_logits_min": -687499968.0, "copy_num_tokens": 250.875, "epoch": 0.05310186367117692, "gen_logits_max": 22.073421478271484, "gen_logits_mean": 1.7238749265670776, "gen_logits_min": -11.210039138793945, "gen_logits_std": 2.3919737339019775, "gen_loss": 0.6940690875053406, "grad_norm": 3.295655259564266, "learning_rate": 1.0571428571428572e-05, "loss": 0.761, "mean_copy_accuracy": 0.7214289903640747, "mean_gen_accuracy": 0.8168700486421585, "mean_token_accuracy": 0.7941341400146484, "num_tokens": 70338660.0, "sample_num_tokens": 7632.5, "step": 260, "total_num_tokens": 70369190.0, "z_loss": 0.015481078997254372 }, { "copy_logits_max": 22.733976364135742, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.75, "epoch": 0.053306101608373756, "gen_logits_max": 19.84450912475586, "gen_logits_mean": 1.4543788433074951, "gen_logits_min": -11.47646713256836, "gen_logits_std": 2.3568809032440186, "gen_loss": 0.714242160320282, "grad_norm": 6.114667831288252, "learning_rate": 1.0612244897959184e-05, "loss": 0.7335, "mean_copy_accuracy": 0.6138810366392136, "mean_gen_accuracy": 0.8270563036203384, "mean_token_accuracy": 0.7724748700857162, "num_tokens": 70611087.0, "sample_num_tokens": 8988.25, "step": 261, "total_num_tokens": 70647040.0, "z_loss": 0.02027115598320961 }, { "copy_logits_max": 20.712947845458984, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.3125, "epoch": 0.05351033954557059, "gen_logits_max": 21.22939682006836, "gen_logits_mean": 2.054882049560547, "gen_logits_min": -10.82197380065918, "gen_logits_std": 2.341325044631958, "gen_loss": 0.6501064896583557, "grad_norm": 5.164384895147264, "learning_rate": 1.0653061224489796e-05, "loss": 0.6908, "mean_copy_accuracy": 0.7151466012001038, "mean_gen_accuracy": 0.832583948969841, "mean_token_accuracy": 0.8071481883525848, "num_tokens": 70860264.0, "sample_num_tokens": 6956.0, "step": 262, "total_num_tokens": 70888088.0, "z_loss": 0.0182422436773777 }, { "copy_logits_max": 22.760122299194336, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.1875, "epoch": 0.05371457748276742, "gen_logits_max": 20.093788146972656, "gen_logits_mean": 1.4014110565185547, "gen_logits_min": -11.319934844970703, "gen_logits_std": 2.298438549041748, "gen_loss": 0.7365206480026245, "grad_norm": 2.1457918830642457, "learning_rate": 1.0693877551020409e-05, "loss": 0.7375, "mean_copy_accuracy": 0.7068738490343094, "mean_gen_accuracy": 0.8252816796302795, "mean_token_accuracy": 0.7951444387435913, "num_tokens": 71117504.0, "sample_num_tokens": 8433.0, "step": 263, "total_num_tokens": 71151236.0, "z_loss": 0.01973690092563629 }, { "copy_logits_max": 21.69708251953125, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.5625, "epoch": 0.05391881541996426, "gen_logits_max": 18.83429718017578, "gen_logits_mean": 0.7533174753189087, "gen_logits_min": -11.980175971984863, "gen_logits_std": 2.4276366233825684, "gen_loss": 0.6668022871017456, "grad_norm": 3.8732081897720514, "learning_rate": 1.073469387755102e-05, "loss": 0.6623, "mean_copy_accuracy": 0.6409342885017395, "mean_gen_accuracy": 0.8316268175840378, "mean_token_accuracy": 0.795085072517395, "num_tokens": 71371338.0, "sample_num_tokens": 9451.0, "step": 264, "total_num_tokens": 71409142.0, "z_loss": 0.021032176911830902 }, { "copy_logits_max": 24.400114059448242, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.5625, "epoch": 0.05412305335716109, "gen_logits_max": 18.800559997558594, "gen_logits_mean": 0.5382847785949707, "gen_logits_min": -12.485950469970703, "gen_logits_std": 2.425834894180298, "gen_loss": 0.7154592275619507, "grad_norm": 5.0899672828534435, "learning_rate": 1.0775510204081633e-05, "loss": 0.6972, "mean_copy_accuracy": 0.7501198202371597, "mean_gen_accuracy": 0.8373272269964218, "mean_token_accuracy": 0.8152828365564346, "num_tokens": 71652870.0, "sample_num_tokens": 9321.5, "step": 265, "total_num_tokens": 71690156.0, "z_loss": 0.02341199293732643 }, { "copy_logits_max": 24.520280838012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.0625, "epoch": 0.05432729129435793, "gen_logits_max": 19.409486770629883, "gen_logits_mean": 0.9564986228942871, "gen_logits_min": -11.526283264160156, "gen_logits_std": 2.3178253173828125, "gen_loss": 0.7140775322914124, "grad_norm": 5.1552281002316835, "learning_rate": 1.0816326530612246e-05, "loss": 0.7476, "mean_copy_accuracy": 0.6630995124578476, "mean_gen_accuracy": 0.8264676332473755, "mean_token_accuracy": 0.7807312905788422, "num_tokens": 71939629.0, "sample_num_tokens": 8287.75, "step": 266, "total_num_tokens": 71972780.0, "z_loss": 0.023266147822141647 }, { "copy_logits_max": 21.946369171142578, "copy_logits_min": -750000000.0, "copy_num_tokens": 781.375, "epoch": 0.054531529231554764, "gen_logits_max": 16.78117561340332, "gen_logits_mean": 0.3100811541080475, "gen_logits_min": -11.885469436645508, "gen_logits_std": 2.3040881156921387, "gen_loss": 0.6847052574157715, "grad_norm": 3.0157776941198624, "learning_rate": 1.0857142857142858e-05, "loss": 0.7329, "mean_copy_accuracy": 0.7299139350652695, "mean_gen_accuracy": 0.8381142318248749, "mean_token_accuracy": 0.8063161373138428, "num_tokens": 72228243.0, "sample_num_tokens": 10391.75, "step": 267, "total_num_tokens": 72269810.0, "z_loss": 0.02761586382985115 }, { "copy_logits_max": 24.408248901367188, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.75, "epoch": 0.054735767168751594, "gen_logits_max": 18.87759017944336, "gen_logits_mean": 0.7246658802032471, "gen_logits_min": -12.259422302246094, "gen_logits_std": 2.3650221824645996, "gen_loss": 0.7050978541374207, "grad_norm": 2.8453129663733625, "learning_rate": 1.0897959183673471e-05, "loss": 0.7195, "mean_copy_accuracy": 0.7511490285396576, "mean_gen_accuracy": 0.8323038071393967, "mean_token_accuracy": 0.8114633411169052, "num_tokens": 72543187.0, "sample_num_tokens": 9948.75, "step": 268, "total_num_tokens": 72582982.0, "z_loss": 0.024528807029128075 }, { "copy_logits_max": 19.595691680908203, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.4375, "epoch": 0.05494000510594843, "gen_logits_max": 19.034984588623047, "gen_logits_mean": 0.7594853639602661, "gen_logits_min": -11.820253372192383, "gen_logits_std": 2.370833396911621, "gen_loss": 0.5827770233154297, "grad_norm": 6.654417532189855, "learning_rate": 1.0938775510204081e-05, "loss": 0.6818, "mean_copy_accuracy": 0.7655339241027832, "mean_gen_accuracy": 0.8264623731374741, "mean_token_accuracy": 0.8095615357160568, "num_tokens": 72829917.0, "sample_num_tokens": 9784.25, "step": 269, "total_num_tokens": 72869054.0, "z_loss": 0.023297488689422607 }, { "copy_logits_max": 23.5628662109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.9375, "epoch": 0.05514424304314527, "gen_logits_max": 20.606563568115234, "gen_logits_mean": 1.2347348928451538, "gen_logits_min": -11.668081283569336, "gen_logits_std": 2.360764265060425, "gen_loss": 0.5803620219230652, "grad_norm": 2.228034535703626, "learning_rate": 1.0979591836734693e-05, "loss": 0.6408, "mean_copy_accuracy": 0.7903157025575638, "mean_gen_accuracy": 0.8311562240123749, "mean_token_accuracy": 0.8232649266719818, "num_tokens": 73099877.0, "sample_num_tokens": 8893.25, "step": 270, "total_num_tokens": 73135450.0, "z_loss": 0.02158219739794731 }, { "copy_logits_max": 31.366750717163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.3125, "epoch": 0.0553484809803421, "gen_logits_max": 19.143720626831055, "gen_logits_mean": 0.9476490020751953, "gen_logits_min": -12.488191604614258, "gen_logits_std": 2.4327878952026367, "gen_loss": 0.795853316783905, "grad_norm": 5.015797871871739, "learning_rate": 1.1020408163265306e-05, "loss": 0.7154, "mean_copy_accuracy": 0.7326385080814362, "mean_gen_accuracy": 0.83293317258358, "mean_token_accuracy": 0.8044640719890594, "num_tokens": 73387780.0, "sample_num_tokens": 8443.5, "step": 271, "total_num_tokens": 73421554.0, "z_loss": 0.03134811669588089 }, { "copy_logits_max": 26.09117317199707, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.5625, "epoch": 0.055552718917538935, "gen_logits_max": 19.160463333129883, "gen_logits_mean": 0.8531904220581055, "gen_logits_min": -11.685752868652344, "gen_logits_std": 2.3801674842834473, "gen_loss": 0.7281582951545715, "grad_norm": 2.8539310184581668, "learning_rate": 1.1061224489795918e-05, "loss": 0.6971, "mean_copy_accuracy": 0.7831302583217621, "mean_gen_accuracy": 0.8182907849550247, "mean_token_accuracy": 0.8094447255134583, "num_tokens": 73637983.0, "sample_num_tokens": 6643.75, "step": 272, "total_num_tokens": 73664558.0, "z_loss": 0.024490270763635635 }, { "copy_logits_max": 24.469932556152344, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.4375, "epoch": 0.055756956854735765, "gen_logits_max": 22.051815032958984, "gen_logits_mean": 1.4836935997009277, "gen_logits_min": -10.982970237731934, "gen_logits_std": 2.3227810859680176, "gen_loss": 0.6086477041244507, "grad_norm": 3.6543878329950883, "learning_rate": 1.110204081632653e-05, "loss": 0.678, "mean_copy_accuracy": 0.7803031206130981, "mean_gen_accuracy": 0.8252749294042587, "mean_token_accuracy": 0.8150555491447449, "num_tokens": 73900235.0, "sample_num_tokens": 8749.25, "step": 273, "total_num_tokens": 73935232.0, "z_loss": 0.021527310833334923 }, { "copy_logits_max": 29.545528411865234, "copy_logits_min": -625000064.0, "copy_num_tokens": 570.6875, "epoch": 0.0559611947919326, "gen_logits_max": 19.928512573242188, "gen_logits_mean": 1.064643144607544, "gen_logits_min": -11.80007553100586, "gen_logits_std": 2.3540396690368652, "gen_loss": 0.6422465443611145, "grad_norm": 3.2678177064192564, "learning_rate": 1.1142857142857143e-05, "loss": 0.6476, "mean_copy_accuracy": 0.7844220995903015, "mean_gen_accuracy": 0.8382882326841354, "mean_token_accuracy": 0.8251858502626419, "num_tokens": 74161082.0, "sample_num_tokens": 9704.0, "step": 274, "total_num_tokens": 74199898.0, "z_loss": 0.028245151042938232 }, { "copy_logits_max": 24.065784454345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.4375, "epoch": 0.05616543272912944, "gen_logits_max": 20.60040855407715, "gen_logits_mean": 1.503995656967163, "gen_logits_min": -10.888593673706055, "gen_logits_std": 2.3373870849609375, "gen_loss": 0.5809478759765625, "grad_norm": 2.167870286412551, "learning_rate": 1.1183673469387755e-05, "loss": 0.5931, "mean_copy_accuracy": 0.8186214417219162, "mean_gen_accuracy": 0.8374069631099701, "mean_token_accuracy": 0.832679346203804, "num_tokens": 74416714.0, "sample_num_tokens": 7343.0, "step": 275, "total_num_tokens": 74446086.0, "z_loss": 0.02385469153523445 }, { "copy_logits_max": 25.872831344604492, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.6875, "epoch": 0.05636967066632627, "gen_logits_max": 20.28070640563965, "gen_logits_mean": 1.4572097063064575, "gen_logits_min": -11.045039176940918, "gen_logits_std": 2.3329153060913086, "gen_loss": 0.6590529680252075, "grad_norm": 2.784513052726542, "learning_rate": 1.1224489795918369e-05, "loss": 0.662, "mean_copy_accuracy": 0.7811218500137329, "mean_gen_accuracy": 0.8232487142086029, "mean_token_accuracy": 0.8138023763895035, "num_tokens": 74678002.0, "sample_num_tokens": 7245.0, "step": 276, "total_num_tokens": 74706982.0, "z_loss": 0.024738866835832596 }, { "copy_logits_max": 21.389007568359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 242.0, "epoch": 0.056573908603523106, "gen_logits_max": 20.417461395263672, "gen_logits_mean": 1.0665309429168701, "gen_logits_min": -11.487674713134766, "gen_logits_std": 2.303647041320801, "gen_loss": 0.5580828785896301, "grad_norm": 2.8756057088729254, "learning_rate": 1.126530612244898e-05, "loss": 0.6448, "mean_copy_accuracy": 0.8093405067920685, "mean_gen_accuracy": 0.8285670429468155, "mean_token_accuracy": 0.8239090442657471, "num_tokens": 74956662.0, "sample_num_tokens": 7327.5, "step": 277, "total_num_tokens": 74985972.0, "z_loss": 0.019894346594810486 }, { "copy_logits_max": 28.403493881225586, "copy_logits_min": -750000064.0, "copy_num_tokens": 332.8125, "epoch": 0.056778146540719936, "gen_logits_max": 20.826274871826172, "gen_logits_mean": 1.2356109619140625, "gen_logits_min": -11.707534790039062, "gen_logits_std": 2.3143458366394043, "gen_loss": 0.5837714672088623, "grad_norm": 1.9921687517757756, "learning_rate": 1.1306122448979592e-05, "loss": 0.6359, "mean_copy_accuracy": 0.8073210567235947, "mean_gen_accuracy": 0.8324306607246399, "mean_token_accuracy": 0.8260656893253326, "num_tokens": 75227512.0, "sample_num_tokens": 7368.5, "step": 278, "total_num_tokens": 75256986.0, "z_loss": 0.025111302733421326 }, { "copy_logits_max": 24.363506317138672, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.8125, "epoch": 0.05698238447791677, "gen_logits_max": 20.410337448120117, "gen_logits_mean": 1.3010773658752441, "gen_logits_min": -11.068425178527832, "gen_logits_std": 2.2714390754699707, "gen_loss": 0.5644785761833191, "grad_norm": 2.611020556796586, "learning_rate": 1.1346938775510206e-05, "loss": 0.6031, "mean_copy_accuracy": 0.8042745143175125, "mean_gen_accuracy": 0.8422632962465286, "mean_token_accuracy": 0.8339017331600189, "num_tokens": 75480902.0, "sample_num_tokens": 9905.0, "step": 279, "total_num_tokens": 75520522.0, "z_loss": 0.02717069908976555 }, { "copy_logits_max": 26.205799102783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.1875, "epoch": 0.05718662241511361, "gen_logits_max": 21.365543365478516, "gen_logits_mean": 1.415513038635254, "gen_logits_min": -11.023170471191406, "gen_logits_std": 2.321833610534668, "gen_loss": 0.5868639945983887, "grad_norm": 3.208076232892963, "learning_rate": 1.1387755102040817e-05, "loss": 0.6612, "mean_copy_accuracy": 0.7999467998743057, "mean_gen_accuracy": 0.8228898793458939, "mean_token_accuracy": 0.8176373988389969, "num_tokens": 75754971.0, "sample_num_tokens": 7357.75, "step": 280, "total_num_tokens": 75784402.0, "z_loss": 0.024380043148994446 }, { "copy_logits_max": 28.83745002746582, "copy_logits_min": -687500032.0, "copy_num_tokens": 324.125, "epoch": 0.05739086035231044, "gen_logits_max": 20.196025848388672, "gen_logits_mean": 0.800650417804718, "gen_logits_min": -12.120824813842773, "gen_logits_std": 2.387413740158081, "gen_loss": 0.684500515460968, "grad_norm": 2.068476906165197, "learning_rate": 1.1428571428571429e-05, "loss": 0.6389, "mean_copy_accuracy": 0.7942062616348267, "mean_gen_accuracy": 0.8274705708026886, "mean_token_accuracy": 0.8210462182760239, "num_tokens": 76007546.0, "sample_num_tokens": 7896.0, "step": 281, "total_num_tokens": 76039130.0, "z_loss": 0.025451814755797386 }, { "copy_logits_max": 29.769012451171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.625, "epoch": 0.05759509828950728, "gen_logits_max": 19.785133361816406, "gen_logits_mean": 1.161590576171875, "gen_logits_min": -11.457977294921875, "gen_logits_std": 2.346597671508789, "gen_loss": 0.6417392492294312, "grad_norm": 3.2249861921809395, "learning_rate": 1.146938775510204e-05, "loss": 0.6849, "mean_copy_accuracy": 0.7644912451505661, "mean_gen_accuracy": 0.8208607733249664, "mean_token_accuracy": 0.8079733401536942, "num_tokens": 76257459.0, "sample_num_tokens": 8123.25, "step": 282, "total_num_tokens": 76289952.0, "z_loss": 0.031871430575847626 }, { "copy_logits_max": 28.06396484375, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.4375, "epoch": 0.057799336226704114, "gen_logits_max": 17.46919059753418, "gen_logits_mean": 0.5366836190223694, "gen_logits_min": -12.352351188659668, "gen_logits_std": 2.41542387008667, "gen_loss": 0.6839067935943604, "grad_norm": 3.4780980770967247, "learning_rate": 1.1510204081632653e-05, "loss": 0.6647, "mean_copy_accuracy": 0.8086415827274323, "mean_gen_accuracy": 0.8307128548622131, "mean_token_accuracy": 0.8254571408033371, "num_tokens": 76532848.0, "sample_num_tokens": 8159.0, "step": 283, "total_num_tokens": 76565484.0, "z_loss": 0.03746400773525238 }, { "copy_logits_max": 29.64068603515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.3125, "epoch": 0.058003574163900944, "gen_logits_max": 19.58780288696289, "gen_logits_mean": 0.8290995955467224, "gen_logits_min": -12.043367385864258, "gen_logits_std": 2.3860161304473877, "gen_loss": 0.6232255697250366, "grad_norm": 1.846414663421741, "learning_rate": 1.1551020408163264e-05, "loss": 0.6687, "mean_copy_accuracy": 0.8359451144933701, "mean_gen_accuracy": 0.8236626535654068, "mean_token_accuracy": 0.8270805031061172, "num_tokens": 76798240.0, "sample_num_tokens": 7343.0, "step": 284, "total_num_tokens": 76827612.0, "z_loss": 0.032279230654239655 }, { "copy_logits_max": 24.145864486694336, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.4375, "epoch": 0.05820781210109778, "gen_logits_max": 19.428817749023438, "gen_logits_mean": 0.7086533904075623, "gen_logits_min": -11.333023071289062, "gen_logits_std": 2.2857580184936523, "gen_loss": 0.5188809633255005, "grad_norm": 3.1836535665588057, "learning_rate": 1.1591836734693878e-05, "loss": 0.6314, "mean_copy_accuracy": 0.8040560334920883, "mean_gen_accuracy": 0.8313572406768799, "mean_token_accuracy": 0.8244669288396835, "num_tokens": 77075026.0, "sample_num_tokens": 9714.5, "step": 285, "total_num_tokens": 77113884.0, "z_loss": 0.02651454135775566 }, { "copy_logits_max": 28.59036636352539, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.25, "epoch": 0.05841205003829461, "gen_logits_max": 18.571044921875, "gen_logits_mean": 0.484935998916626, "gen_logits_min": -11.903104782104492, "gen_logits_std": 2.3581278324127197, "gen_loss": 0.6942604780197144, "grad_norm": 2.8100340746276196, "learning_rate": 1.163265306122449e-05, "loss": 0.656, "mean_copy_accuracy": 0.8128997832536697, "mean_gen_accuracy": 0.8304143100976944, "mean_token_accuracy": 0.8255188912153244, "num_tokens": 77341030.0, "sample_num_tokens": 8282.0, "step": 286, "total_num_tokens": 77374158.0, "z_loss": 0.03675302863121033 }, { "copy_logits_max": 29.44737434387207, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.6875, "epoch": 0.05861628797549145, "gen_logits_max": 18.535240173339844, "gen_logits_mean": 0.29313117265701294, "gen_logits_min": -12.108497619628906, "gen_logits_std": 2.3287317752838135, "gen_loss": 0.6463546752929688, "grad_norm": 2.542591812194566, "learning_rate": 1.1673469387755103e-05, "loss": 0.6391, "mean_copy_accuracy": 0.8127334117889404, "mean_gen_accuracy": 0.8342240750789642, "mean_token_accuracy": 0.8289180994033813, "num_tokens": 77604439.0, "sample_num_tokens": 8968.25, "step": 287, "total_num_tokens": 77640312.0, "z_loss": 0.03868965059518814 }, { "copy_logits_max": 35.17841339111328, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.3125, "epoch": 0.058820525912688285, "gen_logits_max": 18.893451690673828, "gen_logits_mean": 0.5980625152587891, "gen_logits_min": -12.536474227905273, "gen_logits_std": 2.3845105171203613, "gen_loss": 0.6619110107421875, "grad_norm": 3.180422123728821, "learning_rate": 1.1714285714285715e-05, "loss": 0.6701, "mean_copy_accuracy": 0.8373452574014664, "mean_gen_accuracy": 0.823316901922226, "mean_token_accuracy": 0.827134758234024, "num_tokens": 77907293.0, "sample_num_tokens": 8350.25, "step": 288, "total_num_tokens": 77940694.0, "z_loss": 0.03880362957715988 }, { "copy_logits_max": 34.95293426513672, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.0625, "epoch": 0.059024763849885115, "gen_logits_max": 19.737152099609375, "gen_logits_mean": 0.7811904549598694, "gen_logits_min": -12.066730499267578, "gen_logits_std": 2.385225772857666, "gen_loss": 0.6572142839431763, "grad_norm": 2.9537363139456234, "learning_rate": 1.1755102040816326e-05, "loss": 0.6393, "mean_copy_accuracy": 0.8493283241987228, "mean_gen_accuracy": 0.8324966877698898, "mean_token_accuracy": 0.8369825631380081, "num_tokens": 78206854.0, "sample_num_tokens": 8750.5, "step": 289, "total_num_tokens": 78241856.0, "z_loss": 0.04109104722738266 }, { "copy_logits_max": 29.267127990722656, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.6875, "epoch": 0.05922900178708195, "gen_logits_max": 18.83236312866211, "gen_logits_mean": 0.2858102321624756, "gen_logits_min": -12.462000846862793, "gen_logits_std": 2.4092893600463867, "gen_loss": 0.610322892665863, "grad_norm": 2.7805804747941916, "learning_rate": 1.179591836734694e-05, "loss": 0.6523, "mean_copy_accuracy": 0.8054473847150803, "mean_gen_accuracy": 0.8302927017211914, "mean_token_accuracy": 0.8244513422250748, "num_tokens": 78486791.0, "sample_num_tokens": 8285.75, "step": 290, "total_num_tokens": 78519934.0, "z_loss": 0.035143911838531494 }, { "copy_logits_max": 32.10761642456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.25, "epoch": 0.05943323972427878, "gen_logits_max": 19.76791000366211, "gen_logits_mean": 0.6526317596435547, "gen_logits_min": -12.029611587524414, "gen_logits_std": 2.385721206665039, "gen_loss": 0.6148079633712769, "grad_norm": 2.339095393723772, "learning_rate": 1.1836734693877552e-05, "loss": 0.6252, "mean_copy_accuracy": 0.8680202513933182, "mean_gen_accuracy": 0.8279585838317871, "mean_token_accuracy": 0.8374550640583038, "num_tokens": 78753671.0, "sample_num_tokens": 7799.75, "step": 291, "total_num_tokens": 78784870.0, "z_loss": 0.035508379340171814 }, { "copy_logits_max": 26.567045211791992, "copy_logits_min": -750000000.0, "copy_num_tokens": 285.625, "epoch": 0.05963747766147562, "gen_logits_max": 20.533241271972656, "gen_logits_mean": 1.2103939056396484, "gen_logits_min": -11.293665885925293, "gen_logits_std": 2.3686623573303223, "gen_loss": 0.5701991319656372, "grad_norm": 2.219505717164035, "learning_rate": 1.1877551020408163e-05, "loss": 0.6342, "mean_copy_accuracy": 0.842870831489563, "mean_gen_accuracy": 0.8244584500789642, "mean_token_accuracy": 0.8288000971078873, "num_tokens": 79023347.0, "sample_num_tokens": 7056.25, "step": 292, "total_num_tokens": 79051572.0, "z_loss": 0.027651293203234673 }, { "copy_logits_max": 33.0145263671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.5, "epoch": 0.059841715598672456, "gen_logits_max": 19.641321182250977, "gen_logits_mean": 0.7783047556877136, "gen_logits_min": -12.563254356384277, "gen_logits_std": 2.41274094581604, "gen_loss": 0.6238414645195007, "grad_norm": 2.1404295883111253, "learning_rate": 1.1918367346938777e-05, "loss": 0.6351, "mean_copy_accuracy": 0.83913154900074, "mean_gen_accuracy": 0.8271310925483704, "mean_token_accuracy": 0.8301411122083664, "num_tokens": 79299006.0, "sample_num_tokens": 9361.5, "step": 293, "total_num_tokens": 79336452.0, "z_loss": 0.03680099546909332 }, { "copy_logits_max": 30.047542572021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.75, "epoch": 0.060045953535869286, "gen_logits_max": 18.746814727783203, "gen_logits_mean": 0.5360540151596069, "gen_logits_min": -12.09023666381836, "gen_logits_std": 2.2782111167907715, "gen_loss": 0.5433059930801392, "grad_norm": 2.2936711698163674, "learning_rate": 1.1959183673469387e-05, "loss": 0.6057, "mean_copy_accuracy": 0.8348738998174667, "mean_gen_accuracy": 0.8363572508096695, "mean_token_accuracy": 0.8361614495515823, "num_tokens": 79558955.0, "sample_num_tokens": 8715.25, "step": 294, "total_num_tokens": 79593816.0, "z_loss": 0.037497371435165405 }, { "copy_logits_max": 29.27387809753418, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.125, "epoch": 0.06025019147306612, "gen_logits_max": 17.458755493164062, "gen_logits_mean": 0.07359687238931656, "gen_logits_min": -12.100808143615723, "gen_logits_std": 2.2954459190368652, "gen_loss": 0.5867046117782593, "grad_norm": 2.2227716834337774, "learning_rate": 1.2e-05, "loss": 0.5877, "mean_copy_accuracy": 0.8603205233812332, "mean_gen_accuracy": 0.8406050503253937, "mean_token_accuracy": 0.8445432484149933, "num_tokens": 79819030.0, "sample_num_tokens": 8463.5, "step": 295, "total_num_tokens": 79852884.0, "z_loss": 0.04382564127445221 }, { "copy_logits_max": 29.786239624023438, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.6875, "epoch": 0.06045442941026296, "gen_logits_max": 19.73297119140625, "gen_logits_mean": 0.7420817613601685, "gen_logits_min": -11.690420150756836, "gen_logits_std": 2.3202600479125977, "gen_loss": 0.5687384009361267, "grad_norm": 2.9988774141750234, "learning_rate": 1.2040816326530612e-05, "loss": 0.6123, "mean_copy_accuracy": 0.8589498698711395, "mean_gen_accuracy": 0.8307023793458939, "mean_token_accuracy": 0.8364351838827133, "num_tokens": 80076238.0, "sample_num_tokens": 8427.5, "step": 296, "total_num_tokens": 80109948.0, "z_loss": 0.03296500816941261 }, { "copy_logits_max": 27.15337371826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.875, "epoch": 0.06065866734745979, "gen_logits_max": 17.866046905517578, "gen_logits_mean": 0.2463168203830719, "gen_logits_min": -11.624996185302734, "gen_logits_std": 2.3017237186431885, "gen_loss": 0.6098927855491638, "grad_norm": 2.205594551525199, "learning_rate": 1.2081632653061224e-05, "loss": 0.6148, "mean_copy_accuracy": 0.8555984050035477, "mean_gen_accuracy": 0.8351164013147354, "mean_token_accuracy": 0.8398543894290924, "num_tokens": 80338098.0, "sample_num_tokens": 8758.5, "step": 297, "total_num_tokens": 80373132.0, "z_loss": 0.04137212783098221 }, { "copy_logits_max": 35.189697265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.875, "epoch": 0.060862905284656627, "gen_logits_max": 19.681697845458984, "gen_logits_mean": 0.6385667324066162, "gen_logits_min": -11.744010925292969, "gen_logits_std": 2.3580076694488525, "gen_loss": 0.5785985589027405, "grad_norm": 2.0627824168806144, "learning_rate": 1.2122448979591837e-05, "loss": 0.6415, "mean_copy_accuracy": 0.8547157794237137, "mean_gen_accuracy": 0.8235305845737457, "mean_token_accuracy": 0.8316297978162766, "num_tokens": 80598121.0, "sample_num_tokens": 8758.25, "step": 298, "total_num_tokens": 80633154.0, "z_loss": 0.04397309944033623 }, { "copy_logits_max": 31.21039390563965, "copy_logits_min": -625000000.0, "copy_num_tokens": 688.8125, "epoch": 0.06106714322185346, "gen_logits_max": 17.527530670166016, "gen_logits_mean": -0.033630020916461945, "gen_logits_min": -13.049851417541504, "gen_logits_std": 2.400444984436035, "gen_loss": 0.5822136402130127, "grad_norm": 2.7772442675352877, "learning_rate": 1.2163265306122449e-05, "loss": 0.6153, "mean_copy_accuracy": 0.8638289421796799, "mean_gen_accuracy": 0.8316937386989594, "mean_token_accuracy": 0.8388273268938065, "num_tokens": 80857742.0, "sample_num_tokens": 10307.0, "step": 299, "total_num_tokens": 80898970.0, "z_loss": 0.04902370274066925 }, { "copy_logits_max": 36.360626220703125, "copy_logits_min": -687500032.0, "copy_num_tokens": 603.375, "epoch": 0.061271381159050294, "gen_logits_max": 18.778108596801758, "gen_logits_mean": 0.5039204359054565, "gen_logits_min": -12.22874641418457, "gen_logits_std": 2.3948915004730225, "gen_loss": 0.6012382507324219, "grad_norm": 2.065559276437149, "learning_rate": 1.220408163265306e-05, "loss": 0.5678, "mean_copy_accuracy": 0.8852356970310211, "mean_gen_accuracy": 0.8406361043453217, "mean_token_accuracy": 0.8512444496154785, "num_tokens": 81117309.0, "sample_num_tokens": 8870.75, "step": 300, "total_num_tokens": 81152792.0, "z_loss": 0.04966273158788681 }, { "copy_logits_max": 29.779216766357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.375, "epoch": 0.06147561909624713, "gen_logits_max": 19.499832153320312, "gen_logits_mean": 0.4839397072792053, "gen_logits_min": -11.720643997192383, "gen_logits_std": 2.3832454681396484, "gen_loss": 0.5140382051467896, "grad_norm": 2.5819513554659643, "learning_rate": 1.2244897959183674e-05, "loss": 0.6066, "mean_copy_accuracy": 0.875515028834343, "mean_gen_accuracy": 0.8354846984148026, "mean_token_accuracy": 0.8436983525753021, "num_tokens": 81393050.0, "sample_num_tokens": 8606.0, "step": 301, "total_num_tokens": 81427474.0, "z_loss": 0.03912189230322838 }, { "copy_logits_max": 27.19171905517578, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.375, "epoch": 0.06167985703344396, "gen_logits_max": 20.06126594543457, "gen_logits_mean": 0.6642391085624695, "gen_logits_min": -11.572998046875, "gen_logits_std": 2.2533960342407227, "gen_loss": 0.5411894917488098, "grad_norm": 2.2356298461106587, "learning_rate": 1.2285714285714286e-05, "loss": 0.5808, "mean_copy_accuracy": 0.8707084655761719, "mean_gen_accuracy": 0.8366517722606659, "mean_token_accuracy": 0.8444003909826279, "num_tokens": 81663522.0, "sample_num_tokens": 9030.0, "step": 302, "total_num_tokens": 81699642.0, "z_loss": 0.028933752328157425 }, { "copy_logits_max": 28.766544342041016, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.5625, "epoch": 0.0618840949706408, "gen_logits_max": 19.100160598754883, "gen_logits_mean": 0.5837496519088745, "gen_logits_min": -11.958925247192383, "gen_logits_std": 2.3283135890960693, "gen_loss": 0.6023284792900085, "grad_norm": 2.8032092725891378, "learning_rate": 1.23265306122449e-05, "loss": 0.6414, "mean_copy_accuracy": 0.8660258650779724, "mean_gen_accuracy": 0.8250443637371063, "mean_token_accuracy": 0.8351379781961441, "num_tokens": 81944875.0, "sample_num_tokens": 8208.25, "step": 303, "total_num_tokens": 81977708.0, "z_loss": 0.03503067418932915 }, { "copy_logits_max": 36.73702621459961, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.0625, "epoch": 0.06208833290783763, "gen_logits_max": 19.646778106689453, "gen_logits_mean": 0.33485788106918335, "gen_logits_min": -12.785041809082031, "gen_logits_std": 2.389495372772217, "gen_loss": 0.6020550727844238, "grad_norm": 3.7091800256511855, "learning_rate": 1.2367346938775511e-05, "loss": 0.6327, "mean_copy_accuracy": 0.8245518952608109, "mean_gen_accuracy": 0.8262102156877518, "mean_token_accuracy": 0.8256911933422089, "num_tokens": 82215515.0, "sample_num_tokens": 8455.25, "step": 304, "total_num_tokens": 82249336.0, "z_loss": 0.040953125804662704 }, { "copy_logits_max": 34.404823303222656, "copy_logits_min": -750000000.0, "copy_num_tokens": 708.5625, "epoch": 0.062292570845034465, "gen_logits_max": 18.247621536254883, "gen_logits_mean": 0.11095050722360611, "gen_logits_min": -12.54043197631836, "gen_logits_std": 2.3868184089660645, "gen_loss": 0.6904184818267822, "grad_norm": 2.6609241784711344, "learning_rate": 1.2408163265306123e-05, "loss": 0.6676, "mean_copy_accuracy": 0.8621853888034821, "mean_gen_accuracy": 0.8204783499240875, "mean_token_accuracy": 0.8321450799703598, "num_tokens": 82513593.0, "sample_num_tokens": 10824.25, "step": 305, "total_num_tokens": 82556890.0, "z_loss": 0.049110569059848785 }, { "copy_logits_max": 40.02666473388672, "copy_logits_min": -562500032.0, "copy_num_tokens": 591.4375, "epoch": 0.0624968087822313, "gen_logits_max": 18.74073600769043, "gen_logits_mean": 0.24078655242919922, "gen_logits_min": -12.313352584838867, "gen_logits_std": 2.3125178813934326, "gen_loss": 0.5734477043151855, "grad_norm": 2.9275840158084865, "learning_rate": 1.2448979591836736e-05, "loss": 0.6196, "mean_copy_accuracy": 0.8697117120027542, "mean_gen_accuracy": 0.8304175585508347, "mean_token_accuracy": 0.8402353078126907, "num_tokens": 82796095.0, "sample_num_tokens": 9559.25, "step": 306, "total_num_tokens": 82834332.0, "z_loss": 0.050450824201107025 }, { "copy_logits_max": 38.094844818115234, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.3125, "epoch": 0.06270104671942814, "gen_logits_max": 18.22092056274414, "gen_logits_mean": 0.03772847354412079, "gen_logits_min": -12.77901840209961, "gen_logits_std": 2.3977699279785156, "gen_loss": 0.6033939719200134, "grad_norm": 1.9338630571897097, "learning_rate": 1.2489795918367346e-05, "loss": 0.5931, "mean_copy_accuracy": 0.9010059088468552, "mean_gen_accuracy": 0.8268922120332718, "mean_token_accuracy": 0.8439023196697235, "num_tokens": 83069542.0, "sample_num_tokens": 8231.0, "step": 307, "total_num_tokens": 83102466.0, "z_loss": 0.05238692834973335 }, { "copy_logits_max": 32.53559875488281, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.625, "epoch": 0.06290528465662497, "gen_logits_max": 19.590118408203125, "gen_logits_mean": 0.4584014117717743, "gen_logits_min": -11.642770767211914, "gen_logits_std": 2.301360607147217, "gen_loss": 0.5525191426277161, "grad_norm": 2.180928790047755, "learning_rate": 1.2530612244897958e-05, "loss": 0.585, "mean_copy_accuracy": 0.8678147494792938, "mean_gen_accuracy": 0.8379785716533661, "mean_token_accuracy": 0.8446778208017349, "num_tokens": 83337049.0, "sample_num_tokens": 7214.25, "step": 308, "total_num_tokens": 83365906.0, "z_loss": 0.03840520232915878 }, { "copy_logits_max": 38.20581817626953, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.5625, "epoch": 0.0631095225938218, "gen_logits_max": 18.958568572998047, "gen_logits_mean": 0.15980543196201324, "gen_logits_min": -12.660340309143066, "gen_logits_std": 2.376101493835449, "gen_loss": 0.5991458296775818, "grad_norm": 2.4640806113879576, "learning_rate": 1.2571428571428572e-05, "loss": 0.6454, "mean_copy_accuracy": 0.8751478642225266, "mean_gen_accuracy": 0.8135766386985779, "mean_token_accuracy": 0.8301740288734436, "num_tokens": 83582634.0, "sample_num_tokens": 8026.0, "step": 309, "total_num_tokens": 83614738.0, "z_loss": 0.04923747852444649 }, { "copy_logits_max": 28.789003372192383, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.4375, "epoch": 0.06331376053101864, "gen_logits_max": 18.317119598388672, "gen_logits_mean": 0.32265716791152954, "gen_logits_min": -11.99441146850586, "gen_logits_std": 2.3350443840026855, "gen_loss": 0.5531904697418213, "grad_norm": 2.630111164990522, "learning_rate": 1.2612244897959183e-05, "loss": 0.5932, "mean_copy_accuracy": 0.8902673572301865, "mean_gen_accuracy": 0.8294292241334915, "mean_token_accuracy": 0.8434469699859619, "num_tokens": 83834458.0, "sample_num_tokens": 7485.5, "step": 310, "total_num_tokens": 83864400.0, "z_loss": 0.03934915363788605 }, { "copy_logits_max": 31.816383361816406, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.375, "epoch": 0.06351799846821547, "gen_logits_max": 18.773910522460938, "gen_logits_mean": 0.401533305644989, "gen_logits_min": -11.792841911315918, "gen_logits_std": 2.2703957557678223, "gen_loss": 0.5093444585800171, "grad_norm": 2.791643550972229, "learning_rate": 1.2653061224489797e-05, "loss": 0.6055, "mean_copy_accuracy": 0.8660593628883362, "mean_gen_accuracy": 0.8326889872550964, "mean_token_accuracy": 0.8415597379207611, "num_tokens": 84132447.0, "sample_num_tokens": 9005.75, "step": 311, "total_num_tokens": 84168470.0, "z_loss": 0.04193251579999924 }, { "copy_logits_max": 32.42195510864258, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.5, "epoch": 0.0637222364054123, "gen_logits_max": 20.68928337097168, "gen_logits_mean": 0.7203028202056885, "gen_logits_min": -11.391983032226562, "gen_logits_std": 2.2757205963134766, "gen_loss": 0.546445369720459, "grad_norm": 6.623262388190093, "learning_rate": 1.2693877551020409e-05, "loss": 0.5872, "mean_copy_accuracy": 0.8934047222137451, "mean_gen_accuracy": 0.832646906375885, "mean_token_accuracy": 0.8468173295259476, "num_tokens": 84399914.0, "sample_num_tokens": 8596.5, "step": 312, "total_num_tokens": 84434300.0, "z_loss": 0.03859437257051468 }, { "copy_logits_max": 33.653953552246094, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.3125, "epoch": 0.06392647434260915, "gen_logits_max": 18.654287338256836, "gen_logits_mean": 0.13582926988601685, "gen_logits_min": -12.613146781921387, "gen_logits_std": 2.3731837272644043, "gen_loss": 0.5284528732299805, "grad_norm": 2.003029537597901, "learning_rate": 1.273469387755102e-05, "loss": 0.6262, "mean_copy_accuracy": 0.8761695325374603, "mean_gen_accuracy": 0.8296587914228439, "mean_token_accuracy": 0.8430715352296829, "num_tokens": 84689733.0, "sample_num_tokens": 9249.25, "step": 313, "total_num_tokens": 84726730.0, "z_loss": 0.04747755825519562 }, { "copy_logits_max": 29.17340660095215, "copy_logits_min": -625000000.0, "copy_num_tokens": 202.0, "epoch": 0.06413071227980598, "gen_logits_max": 20.149080276489258, "gen_logits_mean": 0.5310180187225342, "gen_logits_min": -11.436954498291016, "gen_logits_std": 2.261836528778076, "gen_loss": 0.5218088626861572, "grad_norm": 3.1925544721769548, "learning_rate": 1.2775510204081634e-05, "loss": 0.6255, "mean_copy_accuracy": 0.8649478256702423, "mean_gen_accuracy": 0.8264436721801758, "mean_token_accuracy": 0.8351768553256989, "num_tokens": 84961814.0, "sample_num_tokens": 6346.0, "step": 314, "total_num_tokens": 84987198.0, "z_loss": 0.028581485152244568 }, { "copy_logits_max": 32.943241119384766, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.625, "epoch": 0.0643349502170028, "gen_logits_max": 18.02375602722168, "gen_logits_mean": -0.003159802407026291, "gen_logits_min": -11.878631591796875, "gen_logits_std": 2.3094565868377686, "gen_loss": 0.5942530632019043, "grad_norm": 4.34646203251572, "learning_rate": 1.2816326530612245e-05, "loss": 0.6311, "mean_copy_accuracy": 0.9002175778150558, "mean_gen_accuracy": 0.8293023556470871, "mean_token_accuracy": 0.8474496752023697, "num_tokens": 85233042.0, "sample_num_tokens": 7443.5, "step": 315, "total_num_tokens": 85262816.0, "z_loss": 0.048260241746902466 }, { "copy_logits_max": 30.91387176513672, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.25, "epoch": 0.06453918815419964, "gen_logits_max": 18.496936798095703, "gen_logits_mean": 0.15832021832466125, "gen_logits_min": -12.122274398803711, "gen_logits_std": 2.3195157051086426, "gen_loss": 0.5165412425994873, "grad_norm": 2.950454263282256, "learning_rate": 1.2857142857142857e-05, "loss": 0.5952, "mean_copy_accuracy": 0.8578984290361404, "mean_gen_accuracy": 0.8328672647476196, "mean_token_accuracy": 0.8387956023216248, "num_tokens": 85492416.0, "sample_num_tokens": 8032.0, "step": 316, "total_num_tokens": 85524544.0, "z_loss": 0.04426761716604233 }, { "copy_logits_max": 40.313621520996094, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.875, "epoch": 0.06474342609139648, "gen_logits_max": 19.981769561767578, "gen_logits_mean": 0.3027011454105377, "gen_logits_min": -12.545351028442383, "gen_logits_std": 2.3556365966796875, "gen_loss": 0.5512483716011047, "grad_norm": 2.205266849578681, "learning_rate": 1.289795918367347e-05, "loss": 0.5969, "mean_copy_accuracy": 0.8857145607471466, "mean_gen_accuracy": 0.8305371701717377, "mean_token_accuracy": 0.8446386456489563, "num_tokens": 85738177.0, "sample_num_tokens": 8415.25, "step": 317, "total_num_tokens": 85771838.0, "z_loss": 0.048711083829402924 }, { "copy_logits_max": 37.30657196044922, "copy_logits_min": -625000000.0, "copy_num_tokens": 577.25, "epoch": 0.06494766402859331, "gen_logits_max": 17.728761672973633, "gen_logits_mean": -0.16111454367637634, "gen_logits_min": -12.185986518859863, "gen_logits_std": 2.279982089996338, "gen_loss": 0.5778636932373047, "grad_norm": 3.3611783187153637, "learning_rate": 1.2938775510204082e-05, "loss": 0.602, "mean_copy_accuracy": 0.8921942263841629, "mean_gen_accuracy": 0.8311754167079926, "mean_token_accuracy": 0.84556944668293, "num_tokens": 86004339.0, "sample_num_tokens": 9702.25, "step": 318, "total_num_tokens": 86043148.0, "z_loss": 0.052720122039318085 }, { "copy_logits_max": 37.0968017578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 695.25, "epoch": 0.06515190196579014, "gen_logits_max": 17.59444808959961, "gen_logits_mean": -0.26341763138771057, "gen_logits_min": -12.833352088928223, "gen_logits_std": 2.3466796875, "gen_loss": 0.5720725059509277, "grad_norm": 3.658458359039841, "learning_rate": 1.2979591836734694e-05, "loss": 0.601, "mean_copy_accuracy": 0.8901582360267639, "mean_gen_accuracy": 0.8333929777145386, "mean_token_accuracy": 0.850210890173912, "num_tokens": 86307553.0, "sample_num_tokens": 9869.25, "step": 319, "total_num_tokens": 86347030.0, "z_loss": 0.05888870358467102 }, { "copy_logits_max": 44.50075912475586, "copy_logits_min": -687500032.0, "copy_num_tokens": 645.9375, "epoch": 0.06535613990298698, "gen_logits_max": 18.21834373474121, "gen_logits_mean": -0.2753693163394928, "gen_logits_min": -12.700408935546875, "gen_logits_std": 2.3372654914855957, "gen_loss": 0.6295223832130432, "grad_norm": 3.086822234539929, "learning_rate": 1.3020408163265306e-05, "loss": 0.6076, "mean_copy_accuracy": 0.8772944957017899, "mean_gen_accuracy": 0.8297412842512131, "mean_token_accuracy": 0.8417798429727554, "num_tokens": 86566213.0, "sample_num_tokens": 9299.75, "step": 320, "total_num_tokens": 86603412.0, "z_loss": 0.05848976969718933 }, { "copy_logits_max": 38.006107330322266, "copy_logits_min": -687500032.0, "copy_num_tokens": 577.9375, "epoch": 0.06556037784018381, "gen_logits_max": 19.190166473388672, "gen_logits_mean": -0.04539524018764496, "gen_logits_min": -12.558839797973633, "gen_logits_std": 2.351814031600952, "gen_loss": 0.6162668466567993, "grad_norm": 2.5920018259447595, "learning_rate": 1.3061224489795918e-05, "loss": 0.6183, "mean_copy_accuracy": 0.8737147748470306, "mean_gen_accuracy": 0.8271790593862534, "mean_token_accuracy": 0.8392373025417328, "num_tokens": 86857403.0, "sample_num_tokens": 9577.25, "step": 321, "total_num_tokens": 86895712.0, "z_loss": 0.04923664405941963 }, { "copy_logits_max": 42.72813415527344, "copy_logits_min": -687500032.0, "copy_num_tokens": 668.375, "epoch": 0.06576461577738064, "gen_logits_max": 18.943557739257812, "gen_logits_mean": 0.01943102478981018, "gen_logits_min": -12.928276062011719, "gen_logits_std": 2.396151542663574, "gen_loss": 0.5508971810340881, "grad_norm": 2.6410829607489172, "learning_rate": 1.3102040816326531e-05, "loss": 0.6127, "mean_copy_accuracy": 0.8935198485851288, "mean_gen_accuracy": 0.8307943046092987, "mean_token_accuracy": 0.8473130166530609, "num_tokens": 87159918.0, "sample_num_tokens": 9390.5, "step": 322, "total_num_tokens": 87197480.0, "z_loss": 0.057454124093055725 }, { "copy_logits_max": 36.9849853515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.8125, "epoch": 0.06596885371457749, "gen_logits_max": 17.69489288330078, "gen_logits_mean": -0.25401613116264343, "gen_logits_min": -12.68602466583252, "gen_logits_std": 2.3073482513427734, "gen_loss": 0.5395482778549194, "grad_norm": 2.6473298801337823, "learning_rate": 1.3142857142857143e-05, "loss": 0.5888, "mean_copy_accuracy": 0.8822260499000549, "mean_gen_accuracy": 0.8377607315778732, "mean_token_accuracy": 0.8490242511034012, "num_tokens": 87458669.0, "sample_num_tokens": 9820.25, "step": 323, "total_num_tokens": 87497950.0, "z_loss": 0.054587192833423615 }, { "copy_logits_max": 32.59043884277344, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.75, "epoch": 0.06617309165177432, "gen_logits_max": 17.73939323425293, "gen_logits_mean": -0.519134521484375, "gen_logits_min": -13.709016799926758, "gen_logits_std": 2.424814224243164, "gen_loss": 0.5528184175491333, "grad_norm": 1.645462922134338, "learning_rate": 1.3183673469387755e-05, "loss": 0.567, "mean_copy_accuracy": 0.904300257563591, "mean_gen_accuracy": 0.8355744183063507, "mean_token_accuracy": 0.8496961146593094, "num_tokens": 87719029.0, "sample_num_tokens": 8884.75, "step": 324, "total_num_tokens": 87754568.0, "z_loss": 0.05283088609576225 }, { "copy_logits_max": 33.43061065673828, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.0, "epoch": 0.06637732958897115, "gen_logits_max": 18.130596160888672, "gen_logits_mean": -0.07920649647712708, "gen_logits_min": -12.269876480102539, "gen_logits_std": 2.3481123447418213, "gen_loss": 0.5354891419410706, "grad_norm": 2.3124458034558986, "learning_rate": 1.3224489795918368e-05, "loss": 0.6133, "mean_copy_accuracy": 0.8833384960889816, "mean_gen_accuracy": 0.8266817480325699, "mean_token_accuracy": 0.840738981962204, "num_tokens": 87999403.0, "sample_num_tokens": 9144.75, "step": 325, "total_num_tokens": 88035982.0, "z_loss": 0.04919569939374924 }, { "copy_logits_max": 31.279457092285156, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.0625, "epoch": 0.06658156752616799, "gen_logits_max": 17.02922821044922, "gen_logits_mean": -0.2667812705039978, "gen_logits_min": -12.474441528320312, "gen_logits_std": 2.328763961791992, "gen_loss": 0.5072534084320068, "grad_norm": 3.08984520785493, "learning_rate": 1.326530612244898e-05, "loss": 0.6154, "mean_copy_accuracy": 0.9115516990423203, "mean_gen_accuracy": 0.8288794308900833, "mean_token_accuracy": 0.8521890193223953, "num_tokens": 88292281.0, "sample_num_tokens": 9955.25, "step": 326, "total_num_tokens": 88332102.0, "z_loss": 0.06080601364374161 }, { "copy_logits_max": 25.221757888793945, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.125, "epoch": 0.06678580546336482, "gen_logits_max": 19.34355926513672, "gen_logits_mean": 0.16492800414562225, "gen_logits_min": -11.658458709716797, "gen_logits_std": 2.260866403579712, "gen_loss": 0.5124898552894592, "grad_norm": 2.798605118009267, "learning_rate": 1.3306122448979593e-05, "loss": 0.5859, "mean_copy_accuracy": 0.8738647550344467, "mean_gen_accuracy": 0.8356567323207855, "mean_token_accuracy": 0.8450314551591873, "num_tokens": 88580531.0, "sample_num_tokens": 8034.25, "step": 327, "total_num_tokens": 88612668.0, "z_loss": 0.03357068449258804 }, { "copy_logits_max": 30.965023040771484, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.875, "epoch": 0.06699004340056165, "gen_logits_max": 18.94266128540039, "gen_logits_mean": -0.2335498034954071, "gen_logits_min": -12.579229354858398, "gen_logits_std": 2.3386240005493164, "gen_loss": 0.5271021127700806, "grad_norm": 1.6789844353701642, "learning_rate": 1.3346938775510205e-05, "loss": 0.5732, "mean_copy_accuracy": 0.9071089029312134, "mean_gen_accuracy": 0.8347006291151047, "mean_token_accuracy": 0.8504109978675842, "num_tokens": 88833452.0, "sample_num_tokens": 7703.5, "step": 328, "total_num_tokens": 88864266.0, "z_loss": 0.040065914392471313 }, { "copy_logits_max": 44.211090087890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.0625, "epoch": 0.06719428133775848, "gen_logits_max": 18.383811950683594, "gen_logits_mean": -0.519189715385437, "gen_logits_min": -13.150943756103516, "gen_logits_std": 2.36690616607666, "gen_loss": 0.5728081464767456, "grad_norm": 2.6529152996690435, "learning_rate": 1.3387755102040817e-05, "loss": 0.5771, "mean_copy_accuracy": 0.900732085108757, "mean_gen_accuracy": 0.8347468674182892, "mean_token_accuracy": 0.8491240441799164, "num_tokens": 89079241.0, "sample_num_tokens": 8292.75, "step": 329, "total_num_tokens": 89112412.0, "z_loss": 0.05359817296266556 }, { "copy_logits_max": 35.439605712890625, "copy_logits_min": -625000000.0, "copy_num_tokens": 536.75, "epoch": 0.06739851927495533, "gen_logits_max": 17.280780792236328, "gen_logits_mean": -0.4645249843597412, "gen_logits_min": -12.28950023651123, "gen_logits_std": 2.304353713989258, "gen_loss": 0.5740358829498291, "grad_norm": 1.3928623375277154, "learning_rate": 1.342857142857143e-05, "loss": 0.5868, "mean_copy_accuracy": 0.9174630343914032, "mean_gen_accuracy": 0.8260972201824188, "mean_token_accuracy": 0.8484161645174026, "num_tokens": 89341561.0, "sample_num_tokens": 9135.75, "step": 330, "total_num_tokens": 89378104.0, "z_loss": 0.05130026862025261 }, { "copy_logits_max": 43.204856872558594, "copy_logits_min": -625000064.0, "copy_num_tokens": 653.25, "epoch": 0.06760275721215216, "gen_logits_max": 18.141536712646484, "gen_logits_mean": -0.5153432488441467, "gen_logits_min": -13.326568603515625, "gen_logits_std": 2.3798749446868896, "gen_loss": 0.5213122367858887, "grad_norm": 2.4858253930292427, "learning_rate": 1.3469387755102042e-05, "loss": 0.6034, "mean_copy_accuracy": 0.9037714749574661, "mean_gen_accuracy": 0.8265238106250763, "mean_token_accuracy": 0.8475059419870377, "num_tokens": 89625773.0, "sample_num_tokens": 9463.25, "step": 331, "total_num_tokens": 89663626.0, "z_loss": 0.05939318239688873 }, { "copy_logits_max": 28.775978088378906, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.9375, "epoch": 0.06780699514934899, "gen_logits_max": 18.570785522460938, "gen_logits_mean": -0.05224873125553131, "gen_logits_min": -12.178796768188477, "gen_logits_std": 2.286792755126953, "gen_loss": 0.5074328184127808, "grad_norm": 1.4667050386900549, "learning_rate": 1.3510204081632652e-05, "loss": 0.6005, "mean_copy_accuracy": 0.9212416857481003, "mean_gen_accuracy": 0.8202331513166428, "mean_token_accuracy": 0.8431187868118286, "num_tokens": 89898752.0, "sample_num_tokens": 7347.0, "step": 332, "total_num_tokens": 89928140.0, "z_loss": 0.03458035737276077 }, { "copy_logits_max": 29.759998321533203, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.875, "epoch": 0.06801123308654583, "gen_logits_max": 18.657499313354492, "gen_logits_mean": -0.15053439140319824, "gen_logits_min": -12.492569923400879, "gen_logits_std": 2.322709560394287, "gen_loss": 0.5652737617492676, "grad_norm": 2.198865470131207, "learning_rate": 1.3551020408163265e-05, "loss": 0.5995, "mean_copy_accuracy": 0.887282595038414, "mean_gen_accuracy": 0.8258926719427109, "mean_token_accuracy": 0.8415292501449585, "num_tokens": 90176713.0, "sample_num_tokens": 8912.75, "step": 333, "total_num_tokens": 90212364.0, "z_loss": 0.03888294845819473 }, { "copy_logits_max": 38.70874786376953, "copy_logits_min": -625000000.0, "copy_num_tokens": 510.1875, "epoch": 0.06821547102374266, "gen_logits_max": 17.88853645324707, "gen_logits_mean": -0.5272647142410278, "gen_logits_min": -13.69857406616211, "gen_logits_std": 2.4131925106048584, "gen_loss": 0.5119147896766663, "grad_norm": 1.5261150945188298, "learning_rate": 1.3591836734693877e-05, "loss": 0.5625, "mean_copy_accuracy": 0.9243954420089722, "mean_gen_accuracy": 0.8305817693471909, "mean_token_accuracy": 0.8562184423208237, "num_tokens": 90442198.0, "sample_num_tokens": 8248.0, "step": 334, "total_num_tokens": 90475190.0, "z_loss": 0.05307517573237419 }, { "copy_logits_max": 34.65094757080078, "copy_logits_min": -687500032.0, "copy_num_tokens": 739.1875, "epoch": 0.06841970896093949, "gen_logits_max": 17.1651668548584, "gen_logits_mean": -0.49105629324913025, "gen_logits_min": -13.004739761352539, "gen_logits_std": 2.3579635620117188, "gen_loss": 0.5178332328796387, "grad_norm": 2.1285645831500135, "learning_rate": 1.363265306122449e-05, "loss": 0.5725, "mean_copy_accuracy": 0.9177276045084, "mean_gen_accuracy": 0.8341137021780014, "mean_token_accuracy": 0.8572405725717545, "num_tokens": 90726409.0, "sample_num_tokens": 10100.75, "step": 335, "total_num_tokens": 90766812.0, "z_loss": 0.060606565326452255 }, { "copy_logits_max": 33.77545166015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.8125, "epoch": 0.06862394689813633, "gen_logits_max": 19.007610321044922, "gen_logits_mean": -0.10315308719873428, "gen_logits_min": -13.255131721496582, "gen_logits_std": 2.4051594734191895, "gen_loss": 0.6017353534698486, "grad_norm": 1.599370062290051, "learning_rate": 1.3673469387755102e-05, "loss": 0.5858, "mean_copy_accuracy": 0.9221390336751938, "mean_gen_accuracy": 0.8284984827041626, "mean_token_accuracy": 0.848643109202385, "num_tokens": 90982296.0, "sample_num_tokens": 6559.0, "step": 336, "total_num_tokens": 91008532.0, "z_loss": 0.039225418120622635 }, { "copy_logits_max": 30.907848358154297, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.6875, "epoch": 0.06882818483533316, "gen_logits_max": 18.120628356933594, "gen_logits_mean": -0.39968252182006836, "gen_logits_min": -12.504626274108887, "gen_logits_std": 2.3213438987731934, "gen_loss": 0.5351040959358215, "grad_norm": 3.061069568322827, "learning_rate": 1.3714285714285714e-05, "loss": 0.5608, "mean_copy_accuracy": 0.9091272354125977, "mean_gen_accuracy": 0.8344139456748962, "mean_token_accuracy": 0.853818267583847, "num_tokens": 91249216.0, "sample_num_tokens": 9436.5, "step": 337, "total_num_tokens": 91286962.0, "z_loss": 0.03928626328706741 }, { "copy_logits_max": 30.125389099121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.75, "epoch": 0.06903242277253, "gen_logits_max": 17.72006607055664, "gen_logits_mean": -0.7411514520645142, "gen_logits_min": -13.24615478515625, "gen_logits_std": 2.376042604446411, "gen_loss": 0.5242135524749756, "grad_norm": 1.6762668397328928, "learning_rate": 1.3755102040816328e-05, "loss": 0.5657, "mean_copy_accuracy": 0.9153283685445786, "mean_gen_accuracy": 0.8316309750080109, "mean_token_accuracy": 0.8522692769765854, "num_tokens": 91516379.0, "sample_num_tokens": 9254.25, "step": 338, "total_num_tokens": 91553396.0, "z_loss": 0.04728547856211662 }, { "copy_logits_max": 29.277347564697266, "copy_logits_min": -687500032.0, "copy_num_tokens": 290.75, "epoch": 0.06923666070972684, "gen_logits_max": 18.942655563354492, "gen_logits_mean": -0.1425430029630661, "gen_logits_min": -13.159940719604492, "gen_logits_std": 2.3382725715637207, "gen_loss": 0.4868498742580414, "grad_norm": 1.7443292865591968, "learning_rate": 1.379591836734694e-05, "loss": 0.5434, "mean_copy_accuracy": 0.9262265563011169, "mean_gen_accuracy": 0.8349148631095886, "mean_token_accuracy": 0.8589967489242554, "num_tokens": 91802306.0, "sample_num_tokens": 7116.5, "step": 339, "total_num_tokens": 91830772.0, "z_loss": 0.03300857171416283 }, { "copy_logits_max": 22.287036895751953, "copy_logits_min": -750000000.0, "copy_num_tokens": 230.4375, "epoch": 0.06944089864692367, "gen_logits_max": 18.937515258789062, "gen_logits_mean": -0.42304685711860657, "gen_logits_min": -12.154208183288574, "gen_logits_std": 2.293544292449951, "gen_loss": 0.5040287971496582, "grad_norm": 1.7677938330672258, "learning_rate": 1.3836734693877551e-05, "loss": 0.556, "mean_copy_accuracy": 0.9472697377204895, "mean_gen_accuracy": 0.8276032358407974, "mean_token_accuracy": 0.8552822172641754, "num_tokens": 92062714.0, "sample_num_tokens": 6919.5, "step": 340, "total_num_tokens": 92090392.0, "z_loss": 0.025723442435264587 }, { "copy_logits_max": 34.5650520324707, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.625, "epoch": 0.0696451365841205, "gen_logits_max": 18.62374496459961, "gen_logits_mean": -0.8905784487724304, "gen_logits_min": -13.25757122039795, "gen_logits_std": 2.3883917331695557, "gen_loss": 0.4931347966194153, "grad_norm": 1.8268688080114681, "learning_rate": 1.3877551020408165e-05, "loss": 0.5448, "mean_copy_accuracy": 0.9398755729198456, "mean_gen_accuracy": 0.8314569890499115, "mean_token_accuracy": 0.8580370396375656, "num_tokens": 92316405.0, "sample_num_tokens": 9208.75, "step": 341, "total_num_tokens": 92353240.0, "z_loss": 0.04945003613829613 }, { "copy_logits_max": 34.48834228515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.625, "epoch": 0.06984937452131733, "gen_logits_max": 18.2324161529541, "gen_logits_mean": -0.7346630096435547, "gen_logits_min": -13.491421699523926, "gen_logits_std": 2.3632776737213135, "gen_loss": 0.49680691957473755, "grad_norm": 1.7489638898956335, "learning_rate": 1.3918367346938776e-05, "loss": 0.5455, "mean_copy_accuracy": 0.930080384016037, "mean_gen_accuracy": 0.8273743093013763, "mean_token_accuracy": 0.8534992635250092, "num_tokens": 92581754.0, "sample_num_tokens": 6787.0, "step": 342, "total_num_tokens": 92608902.0, "z_loss": 0.04280093312263489 }, { "copy_logits_max": 29.01451873779297, "copy_logits_min": -750000000.0, "copy_num_tokens": 254.3125, "epoch": 0.07005361245851417, "gen_logits_max": 18.834854125976562, "gen_logits_mean": -0.37949997186660767, "gen_logits_min": -12.221385955810547, "gen_logits_std": 2.284334659576416, "gen_loss": 0.5406605005264282, "grad_norm": 1.7401114053079818, "learning_rate": 1.3959183673469388e-05, "loss": 0.5546, "mean_copy_accuracy": 0.9133306741714478, "mean_gen_accuracy": 0.8334287405014038, "mean_token_accuracy": 0.8503381311893463, "num_tokens": 92842390.0, "sample_num_tokens": 7524.5, "step": 343, "total_num_tokens": 92872488.0, "z_loss": 0.028357353061437607 }, { "copy_logits_max": 38.613948822021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 638.5625, "epoch": 0.070257850395711, "gen_logits_max": 18.742107391357422, "gen_logits_mean": -0.7801882028579712, "gen_logits_min": -13.662321090698242, "gen_logits_std": 2.4291505813598633, "gen_loss": 0.47791850566864014, "grad_norm": 1.4954396357318303, "learning_rate": 1.4e-05, "loss": 0.5257, "mean_copy_accuracy": 0.9314101934432983, "mean_gen_accuracy": 0.8405490070581436, "mean_token_accuracy": 0.8640744835138321, "num_tokens": 93099874.0, "sample_num_tokens": 9188.5, "step": 344, "total_num_tokens": 93136628.0, "z_loss": 0.05132681876420975 }, { "copy_logits_max": 40.88059616088867, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.5625, "epoch": 0.07046208833290783, "gen_logits_max": 18.690990447998047, "gen_logits_mean": -0.7285199761390686, "gen_logits_min": -13.804500579833984, "gen_logits_std": 2.4580366611480713, "gen_loss": 0.54869544506073, "grad_norm": 1.7006484208902657, "learning_rate": 1.4040816326530612e-05, "loss": 0.5388, "mean_copy_accuracy": 0.9403885155916214, "mean_gen_accuracy": 0.8328353017568588, "mean_token_accuracy": 0.8627269268035889, "num_tokens": 93372387.0, "sample_num_tokens": 8438.75, "step": 345, "total_num_tokens": 93406142.0, "z_loss": 0.047926414757966995 }, { "copy_logits_max": 30.38640022277832, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.125, "epoch": 0.07066632627010468, "gen_logits_max": 18.24611473083496, "gen_logits_mean": -0.7158311605453491, "gen_logits_min": -13.431615829467773, "gen_logits_std": 2.4613828659057617, "gen_loss": 0.5244665145874023, "grad_norm": 1.9706144442117983, "learning_rate": 1.4081632653061225e-05, "loss": 0.573, "mean_copy_accuracy": 0.9368574321269989, "mean_gen_accuracy": 0.8228151798248291, "mean_token_accuracy": 0.8522408604621887, "num_tokens": 93633718.0, "sample_num_tokens": 7962.5, "step": 346, "total_num_tokens": 93665568.0, "z_loss": 0.04120289161801338 }, { "copy_logits_max": 41.36994171142578, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.5, "epoch": 0.0708705642073015, "gen_logits_max": 18.806493759155273, "gen_logits_mean": -0.6237517595291138, "gen_logits_min": -13.407502174377441, "gen_logits_std": 2.3817481994628906, "gen_loss": 0.47108474373817444, "grad_norm": 1.278955378343959, "learning_rate": 1.4122448979591837e-05, "loss": 0.5126, "mean_copy_accuracy": 0.9381110519170761, "mean_gen_accuracy": 0.8401515930891037, "mean_token_accuracy": 0.8654405027627945, "num_tokens": 93921346.0, "sample_num_tokens": 9157.0, "step": 347, "total_num_tokens": 93957974.0, "z_loss": 0.05175510048866272 }, { "copy_logits_max": 24.398893356323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.8125, "epoch": 0.07107480214449834, "gen_logits_max": 18.508310317993164, "gen_logits_mean": -0.5212219953536987, "gen_logits_min": -12.461772918701172, "gen_logits_std": 2.3474621772766113, "gen_loss": 0.5172063112258911, "grad_norm": 1.7121024570561623, "learning_rate": 1.4163265306122448e-05, "loss": 0.5241, "mean_copy_accuracy": 0.9440559893846512, "mean_gen_accuracy": 0.8363209813833237, "mean_token_accuracy": 0.8603020459413528, "num_tokens": 94193790.0, "sample_num_tokens": 7649.0, "step": 348, "total_num_tokens": 94224386.0, "z_loss": 0.02929959073662758 }, { "copy_logits_max": 31.286033630371094, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.8125, "epoch": 0.07127904008169518, "gen_logits_max": 18.392213821411133, "gen_logits_mean": -0.8162657022476196, "gen_logits_min": -13.263893127441406, "gen_logits_std": 2.312972068786621, "gen_loss": 0.4514090418815613, "grad_norm": 1.7702604557188217, "learning_rate": 1.4204081632653062e-05, "loss": 0.5313, "mean_copy_accuracy": 0.9369210302829742, "mean_gen_accuracy": 0.8342976719141006, "mean_token_accuracy": 0.8575134873390198, "num_tokens": 94458202.0, "sample_num_tokens": 7206.5, "step": 349, "total_num_tokens": 94487028.0, "z_loss": 0.03513529151678085 }, { "copy_logits_max": 32.34526062011719, "copy_logits_min": -750000000.0, "copy_num_tokens": 750.6875, "epoch": 0.07148327801889201, "gen_logits_max": 17.240550994873047, "gen_logits_mean": -1.2179675102233887, "gen_logits_min": -13.534063339233398, "gen_logits_std": 2.480699062347412, "gen_loss": 0.46647578477859497, "grad_norm": 1.8692886986050457, "learning_rate": 1.4244897959183674e-05, "loss": 0.5471, "mean_copy_accuracy": 0.9351643770933151, "mean_gen_accuracy": 0.8345410227775574, "mean_token_accuracy": 0.8635535091161728, "num_tokens": 94744317.0, "sample_num_tokens": 10341.25, "step": 350, "total_num_tokens": 94785682.0, "z_loss": 0.05032676085829735 }, { "copy_logits_max": 36.2480583190918, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.625, "epoch": 0.07168751595608884, "gen_logits_max": 18.11670684814453, "gen_logits_mean": -1.0236213207244873, "gen_logits_min": -13.734613418579102, "gen_logits_std": 2.420823335647583, "gen_loss": 0.5772849321365356, "grad_norm": 2.400966865241384, "learning_rate": 1.4285714285714285e-05, "loss": 0.5452, "mean_copy_accuracy": 0.9386453926563263, "mean_gen_accuracy": 0.8311323076486588, "mean_token_accuracy": 0.856636643409729, "num_tokens": 95046891.0, "sample_num_tokens": 8677.25, "step": 351, "total_num_tokens": 95081600.0, "z_loss": 0.04285716265439987 }, { "copy_logits_max": 28.671876907348633, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.8125, "epoch": 0.07189175389328568, "gen_logits_max": 17.976713180541992, "gen_logits_mean": -1.1825518608093262, "gen_logits_min": -13.220907211303711, "gen_logits_std": 2.356008529663086, "gen_loss": 0.46079277992248535, "grad_norm": 2.3362277739538353, "learning_rate": 1.4326530612244899e-05, "loss": 0.535, "mean_copy_accuracy": 0.9446152150630951, "mean_gen_accuracy": 0.8320479393005371, "mean_token_accuracy": 0.8618285059928894, "num_tokens": 95315266.0, "sample_num_tokens": 8705.5, "step": 352, "total_num_tokens": 95350088.0, "z_loss": 0.03799409046769142 }, { "copy_logits_max": 29.353553771972656, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.75, "epoch": 0.07209599183048251, "gen_logits_max": 18.380516052246094, "gen_logits_mean": -1.0236668586730957, "gen_logits_min": -13.409445762634277, "gen_logits_std": 2.3852767944335938, "gen_loss": 0.5009652972221375, "grad_norm": 1.634302934841755, "learning_rate": 1.436734693877551e-05, "loss": 0.5328, "mean_copy_accuracy": 0.9434096813201904, "mean_gen_accuracy": 0.828826442360878, "mean_token_accuracy": 0.860329806804657, "num_tokens": 95599790.0, "sample_num_tokens": 8471.5, "step": 353, "total_num_tokens": 95633676.0, "z_loss": 0.03662640228867531 }, { "copy_logits_max": 27.02741050720215, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.1875, "epoch": 0.07230022976767934, "gen_logits_max": 18.04049301147461, "gen_logits_mean": -0.8555101752281189, "gen_logits_min": -13.16069507598877, "gen_logits_std": 2.4045729637145996, "gen_loss": 0.4847865104675293, "grad_norm": 1.3633047963596676, "learning_rate": 1.4408163265306124e-05, "loss": 0.5105, "mean_copy_accuracy": 0.9481916278600693, "mean_gen_accuracy": 0.8366783261299133, "mean_token_accuracy": 0.8619885742664337, "num_tokens": 95882116.0, "sample_num_tokens": 9398.5, "step": 354, "total_num_tokens": 95919710.0, "z_loss": 0.03103424608707428 }, { "copy_logits_max": 35.049251556396484, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.8125, "epoch": 0.07250446770487617, "gen_logits_max": 17.97800636291504, "gen_logits_mean": -1.0159831047058105, "gen_logits_min": -14.027156829833984, "gen_logits_std": 2.461893081665039, "gen_loss": 0.5129969120025635, "grad_norm": 1.954126495705763, "learning_rate": 1.4448979591836736e-05, "loss": 0.5583, "mean_copy_accuracy": 0.9438060969114304, "mean_gen_accuracy": 0.8240324258804321, "mean_token_accuracy": 0.8568177372217178, "num_tokens": 96138863.0, "sample_num_tokens": 8389.25, "step": 355, "total_num_tokens": 96172420.0, "z_loss": 0.04516954720020294 }, { "copy_logits_max": 27.53400230407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.875, "epoch": 0.07270870564207302, "gen_logits_max": 18.561859130859375, "gen_logits_mean": -0.8957760334014893, "gen_logits_min": -13.45164680480957, "gen_logits_std": 2.4527785778045654, "gen_loss": 0.4847967028617859, "grad_norm": 1.7480306871769398, "learning_rate": 1.4489795918367348e-05, "loss": 0.5003, "mean_copy_accuracy": 0.9500319361686707, "mean_gen_accuracy": 0.8371116071939468, "mean_token_accuracy": 0.8609180003404617, "num_tokens": 96393630.0, "sample_num_tokens": 8254.0, "step": 356, "total_num_tokens": 96426646.0, "z_loss": 0.03316150978207588 }, { "copy_logits_max": 31.51415252685547, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.1875, "epoch": 0.07291294357926985, "gen_logits_max": 18.312397003173828, "gen_logits_mean": -1.0412054061889648, "gen_logits_min": -13.663227081298828, "gen_logits_std": 2.423935651779175, "gen_loss": 0.4269660413265228, "grad_norm": 2.0126180046302964, "learning_rate": 1.453061224489796e-05, "loss": 0.4983, "mean_copy_accuracy": 0.945331484079361, "mean_gen_accuracy": 0.8400270044803619, "mean_token_accuracy": 0.864183321595192, "num_tokens": 96655976.0, "sample_num_tokens": 9614.5, "step": 357, "total_num_tokens": 96694434.0, "z_loss": 0.04056451469659805 }, { "copy_logits_max": 34.040077209472656, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.4375, "epoch": 0.07311718151646668, "gen_logits_max": 18.642183303833008, "gen_logits_mean": -1.162698745727539, "gen_logits_min": -14.179899215698242, "gen_logits_std": 2.4265384674072266, "gen_loss": 0.5105653405189514, "grad_norm": 1.5361800327951147, "learning_rate": 1.4571428571428571e-05, "loss": 0.5364, "mean_copy_accuracy": 0.9447969794273376, "mean_gen_accuracy": 0.827147588133812, "mean_token_accuracy": 0.8586606532335281, "num_tokens": 96939909.0, "sample_num_tokens": 7251.25, "step": 358, "total_num_tokens": 96968914.0, "z_loss": 0.03743981570005417 }, { "copy_logits_max": 25.443593978881836, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.4375, "epoch": 0.07332141945366352, "gen_logits_max": 17.36043930053711, "gen_logits_mean": -1.18435537815094, "gen_logits_min": -12.746384620666504, "gen_logits_std": 2.322183847427368, "gen_loss": 0.44626736640930176, "grad_norm": 1.5852323630619019, "learning_rate": 1.4612244897959183e-05, "loss": 0.5233, "mean_copy_accuracy": 0.95584437251091, "mean_gen_accuracy": 0.8299110978841782, "mean_token_accuracy": 0.8611793965101242, "num_tokens": 97196230.0, "sample_num_tokens": 8615.0, "step": 359, "total_num_tokens": 97230690.0, "z_loss": 0.03332672640681267 }, { "copy_logits_max": 30.46540069580078, "copy_logits_min": -687500032.0, "copy_num_tokens": 366.625, "epoch": 0.07352565739086035, "gen_logits_max": 17.249889373779297, "gen_logits_mean": -1.530083179473877, "gen_logits_min": -14.39085578918457, "gen_logits_std": 2.4814836978912354, "gen_loss": 0.5220090746879578, "grad_norm": 1.5168632373098254, "learning_rate": 1.4653061224489796e-05, "loss": 0.5288, "mean_copy_accuracy": 0.9538969248533249, "mean_gen_accuracy": 0.8281940519809723, "mean_token_accuracy": 0.8594168871641159, "num_tokens": 97451720.0, "sample_num_tokens": 6567.0, "step": 360, "total_num_tokens": 97477988.0, "z_loss": 0.03884586691856384 }, { "copy_logits_max": 31.197187423706055, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.375, "epoch": 0.07372989532805718, "gen_logits_max": 17.525554656982422, "gen_logits_mean": -1.523855447769165, "gen_logits_min": -14.200197219848633, "gen_logits_std": 2.4332079887390137, "gen_loss": 0.47045284509658813, "grad_norm": 1.2101418856913422, "learning_rate": 1.4693877551020408e-05, "loss": 0.5035, "mean_copy_accuracy": 0.9589776545763016, "mean_gen_accuracy": 0.8321117907762527, "mean_token_accuracy": 0.8644476979970932, "num_tokens": 97724054.0, "sample_num_tokens": 8836.0, "step": 361, "total_num_tokens": 97759398.0, "z_loss": 0.04058065265417099 }, { "copy_logits_max": 26.589908599853516, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.3125, "epoch": 0.07393413326525403, "gen_logits_max": 18.280366897583008, "gen_logits_mean": -0.7036445140838623, "gen_logits_min": -11.974251747131348, "gen_logits_std": 2.2704832553863525, "gen_loss": 0.47396963834762573, "grad_norm": 1.5761265648803704, "learning_rate": 1.4734693877551021e-05, "loss": 0.4995, "mean_copy_accuracy": 0.948861762881279, "mean_gen_accuracy": 0.8361625224351883, "mean_token_accuracy": 0.8619395792484283, "num_tokens": 97996352.0, "sample_num_tokens": 7812.5, "step": 362, "total_num_tokens": 98027602.0, "z_loss": 0.028268901631236076 }, { "copy_logits_max": 25.88674545288086, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.4375, "epoch": 0.07413837120245086, "gen_logits_max": 19.09607696533203, "gen_logits_mean": -0.97243332862854, "gen_logits_min": -12.924686431884766, "gen_logits_std": 2.3339200019836426, "gen_loss": 0.43591806292533875, "grad_norm": 1.3100541728097712, "learning_rate": 1.4775510204081633e-05, "loss": 0.5021, "mean_copy_accuracy": 0.9464532881975174, "mean_gen_accuracy": 0.840465784072876, "mean_token_accuracy": 0.8695184737443924, "num_tokens": 98274078.0, "sample_num_tokens": 7569.5, "step": 363, "total_num_tokens": 98304356.0, "z_loss": 0.02800118550658226 }, { "copy_logits_max": 23.610347747802734, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.6875, "epoch": 0.07434260913964769, "gen_logits_max": 18.775590896606445, "gen_logits_mean": -0.8774434924125671, "gen_logits_min": -12.52698802947998, "gen_logits_std": 2.338654041290283, "gen_loss": 0.5007705688476562, "grad_norm": 2.0581126125944618, "learning_rate": 1.4816326530612245e-05, "loss": 0.5188, "mean_copy_accuracy": 0.9647328108549118, "mean_gen_accuracy": 0.8284392803907394, "mean_token_accuracy": 0.8607558012008667, "num_tokens": 98548230.0, "sample_num_tokens": 7305.5, "step": 364, "total_num_tokens": 98577452.0, "z_loss": 0.02281835488975048 }, { "copy_logits_max": 32.75679016113281, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.75, "epoch": 0.07454684707684453, "gen_logits_max": 18.281023025512695, "gen_logits_mean": -0.8061459064483643, "gen_logits_min": -13.539467811584473, "gen_logits_std": 2.4772543907165527, "gen_loss": 0.4832347333431244, "grad_norm": 1.434136602794119, "learning_rate": 1.4857142857142858e-05, "loss": 0.5121, "mean_copy_accuracy": 0.9592760056257248, "mean_gen_accuracy": 0.8328798860311508, "mean_token_accuracy": 0.8636214584112167, "num_tokens": 98809975.0, "sample_num_tokens": 8491.25, "step": 365, "total_num_tokens": 98843940.0, "z_loss": 0.03435567021369934 }, { "copy_logits_max": 37.169654846191406, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.5625, "epoch": 0.07475108501404136, "gen_logits_max": 18.10281753540039, "gen_logits_mean": -0.82179856300354, "gen_logits_min": -14.34430980682373, "gen_logits_std": 2.537947177886963, "gen_loss": 0.5530098676681519, "grad_norm": 2.112081353413886, "learning_rate": 1.489795918367347e-05, "loss": 0.5407, "mean_copy_accuracy": 0.943637415766716, "mean_gen_accuracy": 0.8287407755851746, "mean_token_accuracy": 0.8601368069648743, "num_tokens": 99071929.0, "sample_num_tokens": 8496.25, "step": 366, "total_num_tokens": 99105914.0, "z_loss": 0.044268056750297546 }, { "copy_logits_max": 26.992584228515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.0625, "epoch": 0.07495532295123819, "gen_logits_max": 16.648347854614258, "gen_logits_mean": -1.1322038173675537, "gen_logits_min": -13.654926300048828, "gen_logits_std": 2.404247522354126, "gen_loss": 0.46097952127456665, "grad_norm": 1.4838445348457125, "learning_rate": 1.4938775510204082e-05, "loss": 0.516, "mean_copy_accuracy": 0.9466481059789658, "mean_gen_accuracy": 0.8340047895908356, "mean_token_accuracy": 0.8628692924976349, "num_tokens": 99331091.0, "sample_num_tokens": 7711.25, "step": 367, "total_num_tokens": 99361936.0, "z_loss": 0.03824726492166519 }, { "copy_logits_max": 23.728824615478516, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.0, "epoch": 0.07515956088843502, "gen_logits_max": 18.07117462158203, "gen_logits_mean": -0.8175789713859558, "gen_logits_min": -12.776116371154785, "gen_logits_std": 2.371941089630127, "gen_loss": 0.5151456594467163, "grad_norm": 1.3518953544322467, "learning_rate": 1.4979591836734695e-05, "loss": 0.4989, "mean_copy_accuracy": 0.9582943469285965, "mean_gen_accuracy": 0.8360810875892639, "mean_token_accuracy": 0.8652033805847168, "num_tokens": 99617899.0, "sample_num_tokens": 7665.75, "step": 368, "total_num_tokens": 99648562.0, "z_loss": 0.02488897368311882 }, { "copy_logits_max": 30.436466217041016, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.0, "epoch": 0.07536379882563186, "gen_logits_max": 17.717395782470703, "gen_logits_mean": -1.0989643335342407, "gen_logits_min": -12.73073959350586, "gen_logits_std": 2.2955942153930664, "gen_loss": 0.5219484567642212, "grad_norm": 1.4606584420514976, "learning_rate": 1.5020408163265307e-05, "loss": 0.5276, "mean_copy_accuracy": 0.9524818360805511, "mean_gen_accuracy": 0.8299997597932816, "mean_token_accuracy": 0.8586420118808746, "num_tokens": 99884918.0, "sample_num_tokens": 8560.0, "step": 369, "total_num_tokens": 99919158.0, "z_loss": 0.030605241656303406 }, { "copy_logits_max": 33.543601989746094, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.8125, "epoch": 0.0755680367628287, "gen_logits_max": 17.357563018798828, "gen_logits_mean": -1.1555802822113037, "gen_logits_min": -13.167437553405762, "gen_logits_std": 2.334876537322998, "gen_loss": 0.518945038318634, "grad_norm": 1.4122185998406505, "learning_rate": 1.5061224489795919e-05, "loss": 0.5097, "mean_copy_accuracy": 0.9548991471529007, "mean_gen_accuracy": 0.8363852351903915, "mean_token_accuracy": 0.8676597028970718, "num_tokens": 100158998.0, "sample_num_tokens": 7736.0, "step": 370, "total_num_tokens": 100189942.0, "z_loss": 0.04086780175566673 }, { "copy_logits_max": 28.531570434570312, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.4375, "epoch": 0.07577227470002552, "gen_logits_max": 18.28939437866211, "gen_logits_mean": -1.4002037048339844, "gen_logits_min": -13.295001029968262, "gen_logits_std": 2.373904228210449, "gen_loss": 0.47343575954437256, "grad_norm": 2.8564603040706147, "learning_rate": 1.510204081632653e-05, "loss": 0.5107, "mean_copy_accuracy": 0.9476717710494995, "mean_gen_accuracy": 0.8389938771724701, "mean_token_accuracy": 0.86603082716465, "num_tokens": 100415577.0, "sample_num_tokens": 8542.75, "step": 371, "total_num_tokens": 100449748.0, "z_loss": 0.027842827141284943 }, { "copy_logits_max": 32.3583869934082, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.3125, "epoch": 0.07597651263722237, "gen_logits_max": 17.40448760986328, "gen_logits_mean": -1.6655256748199463, "gen_logits_min": -13.822139739990234, "gen_logits_std": 2.43274188041687, "gen_loss": 0.42672890424728394, "grad_norm": 1.3446197237512092, "learning_rate": 1.5142857142857142e-05, "loss": 0.5286, "mean_copy_accuracy": 0.9404939264059067, "mean_gen_accuracy": 0.8315533697605133, "mean_token_accuracy": 0.8570265471935272, "num_tokens": 100677326.0, "sample_num_tokens": 9691.0, "step": 372, "total_num_tokens": 100716090.0, "z_loss": 0.040351808071136475 }, { "copy_logits_max": 32.60215759277344, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.625, "epoch": 0.0761807505744192, "gen_logits_max": 17.60942840576172, "gen_logits_mean": -1.394897699356079, "gen_logits_min": -14.067754745483398, "gen_logits_std": 2.4599077701568604, "gen_loss": 0.44858449697494507, "grad_norm": 1.705523041146064, "learning_rate": 1.5183673469387754e-05, "loss": 0.4976, "mean_copy_accuracy": 0.9423348009586334, "mean_gen_accuracy": 0.8360876739025116, "mean_token_accuracy": 0.8629383146762848, "num_tokens": 100936834.0, "sample_num_tokens": 8146.0, "step": 373, "total_num_tokens": 100969418.0, "z_loss": 0.0376613549888134 }, { "copy_logits_max": 29.381052017211914, "copy_logits_min": -750000000.0, "copy_num_tokens": 851.25, "epoch": 0.07638498851161603, "gen_logits_max": 16.027631759643555, "gen_logits_mean": -1.688796877861023, "gen_logits_min": -13.696468353271484, "gen_logits_std": 2.404224395751953, "gen_loss": 0.3856319785118103, "grad_norm": 1.5327827136375856, "learning_rate": 1.522448979591837e-05, "loss": 0.4675, "mean_copy_accuracy": 0.9691639542579651, "mean_gen_accuracy": 0.8408750295639038, "mean_token_accuracy": 0.8723006695508957, "num_tokens": 101200369.0, "sample_num_tokens": 10914.75, "step": 374, "total_num_tokens": 101244028.0, "z_loss": 0.047828949987888336 }, { "copy_logits_max": 28.702537536621094, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.5625, "epoch": 0.07658922644881287, "gen_logits_max": 18.51315689086914, "gen_logits_mean": -1.433142900466919, "gen_logits_min": -13.453951835632324, "gen_logits_std": 2.399707794189453, "gen_loss": 0.4575325548648834, "grad_norm": 1.321810193631337, "learning_rate": 1.526530612244898e-05, "loss": 0.5185, "mean_copy_accuracy": 0.9636059552431107, "mean_gen_accuracy": 0.8259295672178268, "mean_token_accuracy": 0.8602089285850525, "num_tokens": 101482912.0, "sample_num_tokens": 8755.0, "step": 375, "total_num_tokens": 101517932.0, "z_loss": 0.03126025199890137 }, { "copy_logits_max": 25.053743362426758, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.75, "epoch": 0.0767934643860097, "gen_logits_max": 18.726587295532227, "gen_logits_mean": -1.0851620435714722, "gen_logits_min": -12.412801742553711, "gen_logits_std": 2.2876672744750977, "gen_loss": 0.47864285111427307, "grad_norm": 1.1136214116714152, "learning_rate": 1.530612244897959e-05, "loss": 0.5114, "mean_copy_accuracy": 0.9561600536108017, "mean_gen_accuracy": 0.8358374685049057, "mean_token_accuracy": 0.8637511879205704, "num_tokens": 101744836.0, "sample_num_tokens": 7848.0, "step": 376, "total_num_tokens": 101776228.0, "z_loss": 0.02456285059452057 }, { "copy_logits_max": 28.707578659057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.5625, "epoch": 0.07699770232320653, "gen_logits_max": 17.071014404296875, "gen_logits_mean": -1.1710443496704102, "gen_logits_min": -13.961337089538574, "gen_logits_std": 2.4384102821350098, "gen_loss": 0.5400048494338989, "grad_norm": 1.6481185260462348, "learning_rate": 1.5346938775510204e-05, "loss": 0.5395, "mean_copy_accuracy": 0.9630790054798126, "mean_gen_accuracy": 0.8240078538656235, "mean_token_accuracy": 0.8601448982954025, "num_tokens": 102027809.0, "sample_num_tokens": 7985.75, "step": 377, "total_num_tokens": 102059752.0, "z_loss": 0.03350399062037468 }, { "copy_logits_max": 37.34132385253906, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.75, "epoch": 0.07720194026040338, "gen_logits_max": 17.847349166870117, "gen_logits_mean": -1.232207179069519, "gen_logits_min": -13.541938781738281, "gen_logits_std": 2.395798921585083, "gen_loss": 0.527332603931427, "grad_norm": 1.069790411514329, "learning_rate": 1.5387755102040815e-05, "loss": 0.5306, "mean_copy_accuracy": 0.9648267179727554, "mean_gen_accuracy": 0.8241399526596069, "mean_token_accuracy": 0.8593979477882385, "num_tokens": 102285946.0, "sample_num_tokens": 9003.0, "step": 378, "total_num_tokens": 102321958.0, "z_loss": 0.043566979467868805 }, { "copy_logits_max": 35.96065139770508, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.5, "epoch": 0.0774061781976002, "gen_logits_max": 18.15903091430664, "gen_logits_mean": -1.3925668001174927, "gen_logits_min": -13.65696907043457, "gen_logits_std": 2.39597225189209, "gen_loss": 0.4218900799751282, "grad_norm": 2.078117334451829, "learning_rate": 1.5428571428571428e-05, "loss": 0.4869, "mean_copy_accuracy": 0.9702093154191971, "mean_gen_accuracy": 0.8383038640022278, "mean_token_accuracy": 0.8776270598173141, "num_tokens": 102573260.0, "sample_num_tokens": 8726.5, "step": 379, "total_num_tokens": 102608166.0, "z_loss": 0.04203847050666809 }, { "copy_logits_max": 24.361570358276367, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.9375, "epoch": 0.07761041613479704, "gen_logits_max": 16.22623062133789, "gen_logits_mean": -1.2496048212051392, "gen_logits_min": -13.269868850708008, "gen_logits_std": 2.3663249015808105, "gen_loss": 0.46289151906967163, "grad_norm": 1.0229449379584543, "learning_rate": 1.546938775510204e-05, "loss": 0.4776, "mean_copy_accuracy": 0.968424454331398, "mean_gen_accuracy": 0.8380237519741058, "mean_token_accuracy": 0.8694149553775787, "num_tokens": 102849359.0, "sample_num_tokens": 8525.25, "step": 380, "total_num_tokens": 102883460.0, "z_loss": 0.03663979470729828 }, { "copy_logits_max": 28.255985260009766, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.625, "epoch": 0.07781465407199387, "gen_logits_max": 18.056949615478516, "gen_logits_mean": -1.653017282485962, "gen_logits_min": -14.256153106689453, "gen_logits_std": 2.439209461212158, "gen_loss": 0.4302193224430084, "grad_norm": 1.314578040267428, "learning_rate": 1.5510204081632655e-05, "loss": 0.4721, "mean_copy_accuracy": 0.959740474820137, "mean_gen_accuracy": 0.8461096882820129, "mean_token_accuracy": 0.8724866956472397, "num_tokens": 103117699.0, "sample_num_tokens": 7706.75, "step": 381, "total_num_tokens": 103148526.0, "z_loss": 0.03266041725873947 }, { "copy_logits_max": 22.176250457763672, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.5625, "epoch": 0.07801889200919071, "gen_logits_max": 17.637706756591797, "gen_logits_mean": -1.457085132598877, "gen_logits_min": -13.3357572555542, "gen_logits_std": 2.349083423614502, "gen_loss": 0.46182742714881897, "grad_norm": 1.182440146165488, "learning_rate": 1.5551020408163265e-05, "loss": 0.5146, "mean_copy_accuracy": 0.9615028351545334, "mean_gen_accuracy": 0.8294503092765808, "mean_token_accuracy": 0.8557114899158478, "num_tokens": 103374224.0, "sample_num_tokens": 7026.0, "step": 382, "total_num_tokens": 103402328.0, "z_loss": 0.023729341104626656 }, { "copy_logits_max": 27.03084373474121, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.875, "epoch": 0.07822312994638754, "gen_logits_max": 17.084197998046875, "gen_logits_mean": -1.537745714187622, "gen_logits_min": -13.872577667236328, "gen_logits_std": 2.363105535507202, "gen_loss": 0.43682271242141724, "grad_norm": 1.7191197179665985, "learning_rate": 1.559183673469388e-05, "loss": 0.4783, "mean_copy_accuracy": 0.9693605005741119, "mean_gen_accuracy": 0.8427485227584839, "mean_token_accuracy": 0.8779879510402679, "num_tokens": 103654315.0, "sample_num_tokens": 7443.75, "step": 383, "total_num_tokens": 103684090.0, "z_loss": 0.03171689063310623 }, { "copy_logits_max": 23.494327545166016, "copy_logits_min": -625000000.0, "copy_num_tokens": 265.5625, "epoch": 0.07842736788358437, "gen_logits_max": 17.780920028686523, "gen_logits_mean": -1.6412489414215088, "gen_logits_min": -13.540387153625488, "gen_logits_std": 2.3656320571899414, "gen_loss": 0.4589945673942566, "grad_norm": 1.5535048602025756, "learning_rate": 1.563265306122449e-05, "loss": 0.5179, "mean_copy_accuracy": 0.9558780044317245, "mean_gen_accuracy": 0.8287320882081985, "mean_token_accuracy": 0.8571226000785828, "num_tokens": 103896902.0, "sample_num_tokens": 6833.0, "step": 384, "total_num_tokens": 103924234.0, "z_loss": 0.02335241064429283 }, { "copy_logits_max": 28.009563446044922, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.375, "epoch": 0.07863160582078121, "gen_logits_max": 17.42578887939453, "gen_logits_mean": -1.7068328857421875, "gen_logits_min": -13.680727005004883, "gen_logits_std": 2.3697001934051514, "gen_loss": 0.4314725697040558, "grad_norm": 1.5021014313356416, "learning_rate": 1.5673469387755105e-05, "loss": 0.4984, "mean_copy_accuracy": 0.9590014219284058, "mean_gen_accuracy": 0.8324903398752213, "mean_token_accuracy": 0.8634393513202667, "num_tokens": 104166421.0, "sample_num_tokens": 7607.25, "step": 385, "total_num_tokens": 104196850.0, "z_loss": 0.03157925233244896 }, { "copy_logits_max": 33.63984680175781, "copy_logits_min": -687500032.0, "copy_num_tokens": 548.125, "epoch": 0.07883584375797804, "gen_logits_max": 18.51137351989746, "gen_logits_mean": -1.5460526943206787, "gen_logits_min": -13.696572303771973, "gen_logits_std": 2.392241954803467, "gen_loss": 0.44345176219940186, "grad_norm": 1.2492730795321179, "learning_rate": 1.5714285714285715e-05, "loss": 0.5059, "mean_copy_accuracy": 0.9592366367578506, "mean_gen_accuracy": 0.8344840407371521, "mean_token_accuracy": 0.8607617169618607, "num_tokens": 104426355.0, "sample_num_tokens": 8999.25, "step": 386, "total_num_tokens": 104462352.0, "z_loss": 0.03629366308450699 }, { "copy_logits_max": 30.572858810424805, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.125, "epoch": 0.07904008169517487, "gen_logits_max": 17.910022735595703, "gen_logits_mean": -1.2247929573059082, "gen_logits_min": -13.941948890686035, "gen_logits_std": 2.401383876800537, "gen_loss": 0.45911717414855957, "grad_norm": 1.6363700619588148, "learning_rate": 1.575510204081633e-05, "loss": 0.4845, "mean_copy_accuracy": 0.9650317877531052, "mean_gen_accuracy": 0.8392253667116165, "mean_token_accuracy": 0.8671279847621918, "num_tokens": 104695513.0, "sample_num_tokens": 7451.25, "step": 387, "total_num_tokens": 104725318.0, "z_loss": 0.03464002534747124 }, { "copy_logits_max": 28.234033584594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.125, "epoch": 0.07924431963237172, "gen_logits_max": 16.830089569091797, "gen_logits_mean": -1.515099048614502, "gen_logits_min": -13.590397834777832, "gen_logits_std": 2.321190595626831, "gen_loss": 0.4655715823173523, "grad_norm": 1.1464889510749823, "learning_rate": 1.579591836734694e-05, "loss": 0.5053, "mean_copy_accuracy": 0.9601652324199677, "mean_gen_accuracy": 0.8334679156541824, "mean_token_accuracy": 0.8657326996326447, "num_tokens": 104957047.0, "sample_num_tokens": 7767.75, "step": 388, "total_num_tokens": 104988118.0, "z_loss": 0.03161899745464325 }, { "copy_logits_max": 23.376739501953125, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.6875, "epoch": 0.07944855756956855, "gen_logits_max": 18.26070785522461, "gen_logits_mean": -1.487301230430603, "gen_logits_min": -13.097034454345703, "gen_logits_std": 2.2923614978790283, "gen_loss": 0.4490227997303009, "grad_norm": 1.5057368309585064, "learning_rate": 1.583673469387755e-05, "loss": 0.4688, "mean_copy_accuracy": 0.9565805345773697, "mean_gen_accuracy": 0.8411307781934738, "mean_token_accuracy": 0.8689318299293518, "num_tokens": 105234358.0, "sample_num_tokens": 8623.0, "step": 389, "total_num_tokens": 105268850.0, "z_loss": 0.020277876406908035 }, { "copy_logits_max": 26.148971557617188, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.75, "epoch": 0.07965279550676538, "gen_logits_max": 18.069129943847656, "gen_logits_mean": -1.832897424697876, "gen_logits_min": -13.966110229492188, "gen_logits_std": 2.351837635040283, "gen_loss": 0.45249322056770325, "grad_norm": 1.2493065759182842, "learning_rate": 1.5877551020408166e-05, "loss": 0.4775, "mean_copy_accuracy": 0.9598400145769119, "mean_gen_accuracy": 0.8436801880598068, "mean_token_accuracy": 0.8717231005430222, "num_tokens": 105522313.0, "sample_num_tokens": 7680.25, "step": 390, "total_num_tokens": 105553034.0, "z_loss": 0.02588721364736557 }, { "copy_logits_max": 36.119239807128906, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.0, "epoch": 0.07985703344396222, "gen_logits_max": 18.101287841796875, "gen_logits_mean": -1.900743007659912, "gen_logits_min": -14.256211280822754, "gen_logits_std": 2.400369882583618, "gen_loss": 0.45937272906303406, "grad_norm": 1.2276204789206224, "learning_rate": 1.5918367346938776e-05, "loss": 0.5015, "mean_copy_accuracy": 0.9656083285808563, "mean_gen_accuracy": 0.8299498856067657, "mean_token_accuracy": 0.8624192476272583, "num_tokens": 105783323.0, "sample_num_tokens": 8530.25, "step": 391, "total_num_tokens": 105817444.0, "z_loss": 0.03584121912717819 }, { "copy_logits_max": 27.131103515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.3125, "epoch": 0.08006127138115905, "gen_logits_max": 16.84307289123535, "gen_logits_mean": -1.4402313232421875, "gen_logits_min": -13.47269344329834, "gen_logits_std": 2.309626579284668, "gen_loss": 0.46764034032821655, "grad_norm": 1.3171575070646442, "learning_rate": 1.595918367346939e-05, "loss": 0.4722, "mean_copy_accuracy": 0.9656367301940918, "mean_gen_accuracy": 0.8398925960063934, "mean_token_accuracy": 0.8680254220962524, "num_tokens": 106049429.0, "sample_num_tokens": 8023.25, "step": 392, "total_num_tokens": 106081522.0, "z_loss": 0.029453620314598083 }, { "copy_logits_max": 27.72250747680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.875, "epoch": 0.08026550931835588, "gen_logits_max": 17.275978088378906, "gen_logits_mean": -1.7710723876953125, "gen_logits_min": -13.728660583496094, "gen_logits_std": 2.3309237957000732, "gen_loss": 0.42380425333976746, "grad_norm": 1.659073001272799, "learning_rate": 1.6e-05, "loss": 0.4777, "mean_copy_accuracy": 0.966189444065094, "mean_gen_accuracy": 0.842526376247406, "mean_token_accuracy": 0.874671682715416, "num_tokens": 106303801.0, "sample_num_tokens": 6687.75, "step": 393, "total_num_tokens": 106330552.0, "z_loss": 0.02941695600748062 }, { "copy_logits_max": 29.15022087097168, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.5625, "epoch": 0.08046974725555271, "gen_logits_max": 16.15798568725586, "gen_logits_mean": -2.224578380584717, "gen_logits_min": -14.575957298278809, "gen_logits_std": 2.4255788326263428, "gen_loss": 0.41027796268463135, "grad_norm": 2.045389811666503, "learning_rate": 1.6040816326530613e-05, "loss": 0.4732, "mean_copy_accuracy": 0.9714044332504272, "mean_gen_accuracy": 0.8413107395172119, "mean_token_accuracy": 0.8729066997766495, "num_tokens": 106564855.0, "sample_num_tokens": 9434.25, "step": 394, "total_num_tokens": 106602592.0, "z_loss": 0.039565540850162506 }, { "copy_logits_max": 29.3590087890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.125, "epoch": 0.08067398519274956, "gen_logits_max": 18.25577163696289, "gen_logits_mean": -1.936926245689392, "gen_logits_min": -14.15719223022461, "gen_logits_std": 2.3618998527526855, "gen_loss": 0.46403610706329346, "grad_norm": 1.2729924606703522, "learning_rate": 1.6081632653061223e-05, "loss": 0.477, "mean_copy_accuracy": 0.962579995393753, "mean_gen_accuracy": 0.839508131146431, "mean_token_accuracy": 0.8705831915140152, "num_tokens": 106850617.0, "sample_num_tokens": 7700.25, "step": 395, "total_num_tokens": 106881418.0, "z_loss": 0.02577276900410652 }, { "copy_logits_max": 28.32219696044922, "copy_logits_min": -687500032.0, "copy_num_tokens": 574.875, "epoch": 0.08087822312994639, "gen_logits_max": 17.51051902770996, "gen_logits_mean": -2.0413095951080322, "gen_logits_min": -14.210365295410156, "gen_logits_std": 2.393829345703125, "gen_loss": 0.44358354806900024, "grad_norm": 1.023340780979028, "learning_rate": 1.612244897959184e-05, "loss": 0.4791, "mean_copy_accuracy": 0.9690827429294586, "mean_gen_accuracy": 0.8363011926412582, "mean_token_accuracy": 0.8717071264982224, "num_tokens": 107140959.0, "sample_num_tokens": 9553.25, "step": 396, "total_num_tokens": 107179172.0, "z_loss": 0.03474119305610657 }, { "copy_logits_max": 40.293701171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.75, "epoch": 0.08108246106714322, "gen_logits_max": 18.600711822509766, "gen_logits_mean": -1.8886473178863525, "gen_logits_min": -14.565343856811523, "gen_logits_std": 2.4450368881225586, "gen_loss": 0.5253919959068298, "grad_norm": 1.1576674269807083, "learning_rate": 1.616326530612245e-05, "loss": 0.5174, "mean_copy_accuracy": 0.9604508876800537, "mean_gen_accuracy": 0.8274797797203064, "mean_token_accuracy": 0.8578994870185852, "num_tokens": 107413216.0, "sample_num_tokens": 8272.5, "step": 397, "total_num_tokens": 107446306.0, "z_loss": 0.03640151023864746 }, { "copy_logits_max": 32.05182647705078, "copy_logits_min": -687500032.0, "copy_num_tokens": 316.3125, "epoch": 0.08128669900434006, "gen_logits_max": 18.2722225189209, "gen_logits_mean": -1.7646665573120117, "gen_logits_min": -13.957744598388672, "gen_logits_std": 2.3262665271759033, "gen_loss": 0.4487859010696411, "grad_norm": 1.0142092889947123, "learning_rate": 1.6204081632653063e-05, "loss": 0.4888, "mean_copy_accuracy": 0.968597874045372, "mean_gen_accuracy": 0.835230827331543, "mean_token_accuracy": 0.8655582964420319, "num_tokens": 107685255.0, "sample_num_tokens": 7548.25, "step": 398, "total_num_tokens": 107715448.0, "z_loss": 0.027058115229010582 }, { "copy_logits_max": 27.758602142333984, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.4375, "epoch": 0.08149093694153689, "gen_logits_max": 17.508140563964844, "gen_logits_mean": -1.7030713558197021, "gen_logits_min": -14.082721710205078, "gen_logits_std": 2.4120492935180664, "gen_loss": 0.43159836530685425, "grad_norm": 1.3397182839250004, "learning_rate": 1.6244897959183673e-05, "loss": 0.4794, "mean_copy_accuracy": 0.970435693860054, "mean_gen_accuracy": 0.8364680707454681, "mean_token_accuracy": 0.871817871928215, "num_tokens": 107939579.0, "sample_num_tokens": 9335.75, "step": 399, "total_num_tokens": 107976922.0, "z_loss": 0.03128959983587265 }, { "copy_logits_max": 28.478565216064453, "copy_logits_min": -687500032.0, "copy_num_tokens": 329.0625, "epoch": 0.08169517487873372, "gen_logits_max": 18.149227142333984, "gen_logits_mean": -1.3954863548278809, "gen_logits_min": -14.185091018676758, "gen_logits_std": 2.393838882446289, "gen_loss": 0.45595669746398926, "grad_norm": 0.8550170162841667, "learning_rate": 1.6285714285714283e-05, "loss": 0.4624, "mean_copy_accuracy": 0.9657389670610428, "mean_gen_accuracy": 0.8476430475711823, "mean_token_accuracy": 0.8718103915452957, "num_tokens": 108201945.0, "sample_num_tokens": 7375.25, "step": 400, "total_num_tokens": 108231446.0, "z_loss": 0.028078611940145493 }, { "copy_logits_max": 31.087976455688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.3125, "epoch": 0.08189941281593056, "gen_logits_max": 17.85666847229004, "gen_logits_mean": -1.7533273696899414, "gen_logits_min": -14.241223335266113, "gen_logits_std": 2.4058055877685547, "gen_loss": 0.45279479026794434, "grad_norm": 1.0998533947672524, "learning_rate": 1.63265306122449e-05, "loss": 0.5117, "mean_copy_accuracy": 0.9670457988977432, "mean_gen_accuracy": 0.8274107128381729, "mean_token_accuracy": 0.8624721616506577, "num_tokens": 108479586.0, "sample_num_tokens": 8040.0, "step": 401, "total_num_tokens": 108511746.0, "z_loss": 0.03303268179297447 }, { "copy_logits_max": 28.691164016723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.9375, "epoch": 0.0821036507531274, "gen_logits_max": 16.974184036254883, "gen_logits_mean": -1.9213266372680664, "gen_logits_min": -14.63737964630127, "gen_logits_std": 2.4911649227142334, "gen_loss": 0.4380738139152527, "grad_norm": 0.8195628284266095, "learning_rate": 1.636734693877551e-05, "loss": 0.5128, "mean_copy_accuracy": 0.95704784989357, "mean_gen_accuracy": 0.8304564654827118, "mean_token_accuracy": 0.8589421659708023, "num_tokens": 108727445.0, "sample_num_tokens": 8757.25, "step": 402, "total_num_tokens": 108762474.0, "z_loss": 0.03861679881811142 }, { "copy_logits_max": 33.320579528808594, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.8125, "epoch": 0.08230788869032422, "gen_logits_max": 17.79737663269043, "gen_logits_mean": -2.0594420433044434, "gen_logits_min": -14.24974250793457, "gen_logits_std": 2.411252737045288, "gen_loss": 0.4550454020500183, "grad_norm": 1.1698742468763321, "learning_rate": 1.6408163265306124e-05, "loss": 0.4715, "mean_copy_accuracy": 0.9736785590648651, "mean_gen_accuracy": 0.8384304046630859, "mean_token_accuracy": 0.8747716993093491, "num_tokens": 108994469.0, "sample_num_tokens": 7908.25, "step": 403, "total_num_tokens": 109026102.0, "z_loss": 0.03445146232843399 }, { "copy_logits_max": 32.5698356628418, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.6875, "epoch": 0.08251212662752107, "gen_logits_max": 16.225162506103516, "gen_logits_mean": -2.420673370361328, "gen_logits_min": -15.50211238861084, "gen_logits_std": 2.459336519241333, "gen_loss": 0.44089198112487793, "grad_norm": 1.009544375440171, "learning_rate": 1.6448979591836734e-05, "loss": 0.4914, "mean_copy_accuracy": 0.9655201882123947, "mean_gen_accuracy": 0.836483046412468, "mean_token_accuracy": 0.869204506278038, "num_tokens": 109261455.0, "sample_num_tokens": 8952.25, "step": 404, "total_num_tokens": 109297264.0, "z_loss": 0.04130391776561737 }, { "copy_logits_max": 23.072349548339844, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.5625, "epoch": 0.0827163645647179, "gen_logits_max": 17.74648666381836, "gen_logits_mean": -2.0073556900024414, "gen_logits_min": -13.972772598266602, "gen_logits_std": 2.4133596420288086, "gen_loss": 0.43985462188720703, "grad_norm": 1.31439194205893, "learning_rate": 1.6489795918367347e-05, "loss": 0.4782, "mean_copy_accuracy": 0.967379093170166, "mean_gen_accuracy": 0.8375105559825897, "mean_token_accuracy": 0.8708217293024063, "num_tokens": 109555253.0, "sample_num_tokens": 7662.25, "step": 405, "total_num_tokens": 109585902.0, "z_loss": 0.025125056505203247 }, { "copy_logits_max": 24.724567413330078, "copy_logits_min": -687500032.0, "copy_num_tokens": 298.25, "epoch": 0.08292060250191473, "gen_logits_max": 17.7938232421875, "gen_logits_mean": -2.284749746322632, "gen_logits_min": -13.85569953918457, "gen_logits_std": 2.3413000106811523, "gen_loss": 0.43598031997680664, "grad_norm": 1.3081244823074474, "learning_rate": 1.6530612244897957e-05, "loss": 0.4814, "mean_copy_accuracy": 0.9700285494327545, "mean_gen_accuracy": 0.8356738090515137, "mean_token_accuracy": 0.8636056184768677, "num_tokens": 109796014.0, "sample_num_tokens": 6986.0, "step": 406, "total_num_tokens": 109823958.0, "z_loss": 0.02151089906692505 }, { "copy_logits_max": 25.168296813964844, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.9375, "epoch": 0.08312484043911156, "gen_logits_max": 18.01386260986328, "gen_logits_mean": -2.3146209716796875, "gen_logits_min": -13.692401885986328, "gen_logits_std": 2.2792422771453857, "gen_loss": 0.38770055770874023, "grad_norm": 1.420489942958341, "learning_rate": 1.6571428571428574e-05, "loss": 0.467, "mean_copy_accuracy": 0.9654429703950882, "mean_gen_accuracy": 0.8431326299905777, "mean_token_accuracy": 0.8769821673631668, "num_tokens": 110084288.0, "sample_num_tokens": 7820.0, "step": 407, "total_num_tokens": 110115568.0, "z_loss": 0.02160032093524933 }, { "copy_logits_max": 23.410242080688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.4375, "epoch": 0.0833290783763084, "gen_logits_max": 17.80852508544922, "gen_logits_mean": -2.0782346725463867, "gen_logits_min": -13.748922348022461, "gen_logits_std": 2.3528852462768555, "gen_loss": 0.4202944040298462, "grad_norm": 1.0065065722965079, "learning_rate": 1.6612244897959184e-05, "loss": 0.468, "mean_copy_accuracy": 0.9726002961397171, "mean_gen_accuracy": 0.8353326618671417, "mean_token_accuracy": 0.8700650036334991, "num_tokens": 110338949.0, "sample_num_tokens": 7549.25, "step": 408, "total_num_tokens": 110369146.0, "z_loss": 0.023138675838708878 }, { "copy_logits_max": 27.430049896240234, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.3125, "epoch": 0.08353331631350523, "gen_logits_max": 17.86359214782715, "gen_logits_mean": -2.0031471252441406, "gen_logits_min": -13.974737167358398, "gen_logits_std": 2.420503616333008, "gen_loss": 0.4367367625236511, "grad_norm": 0.9469990748216499, "learning_rate": 1.6653061224489797e-05, "loss": 0.4655, "mean_copy_accuracy": 0.9694423526525497, "mean_gen_accuracy": 0.8392625004053116, "mean_token_accuracy": 0.866413339972496, "num_tokens": 110580247.0, "sample_num_tokens": 7475.25, "step": 409, "total_num_tokens": 110610148.0, "z_loss": 0.029263878241181374 }, { "copy_logits_max": 22.792125701904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.75, "epoch": 0.08373755425070206, "gen_logits_max": 16.227354049682617, "gen_logits_mean": -2.0698275566101074, "gen_logits_min": -14.01008129119873, "gen_logits_std": 2.42897629737854, "gen_loss": 0.5068613290786743, "grad_norm": 1.4160656755949104, "learning_rate": 1.6693877551020408e-05, "loss": 0.4822, "mean_copy_accuracy": 0.9718377441167831, "mean_gen_accuracy": 0.8372542262077332, "mean_token_accuracy": 0.8735138028860092, "num_tokens": 110858928.0, "sample_num_tokens": 8657.0, "step": 410, "total_num_tokens": 110893556.0, "z_loss": 0.026738815009593964 }, { "copy_logits_max": 33.573448181152344, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.75, "epoch": 0.0839417921878989, "gen_logits_max": 16.579788208007812, "gen_logits_mean": -2.248291492462158, "gen_logits_min": -14.703348159790039, "gen_logits_std": 2.4389917850494385, "gen_loss": 0.39886829257011414, "grad_norm": 0.9775349151318247, "learning_rate": 1.673469387755102e-05, "loss": 0.4558, "mean_copy_accuracy": 0.972150593996048, "mean_gen_accuracy": 0.8408233374357224, "mean_token_accuracy": 0.8737664222717285, "num_tokens": 111113441.0, "sample_num_tokens": 8497.75, "step": 411, "total_num_tokens": 111147432.0, "z_loss": 0.04499926418066025 }, { "copy_logits_max": 30.424375534057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.5, "epoch": 0.08414603012509574, "gen_logits_max": 17.84691619873047, "gen_logits_mean": -2.0541300773620605, "gen_logits_min": -14.101518630981445, "gen_logits_std": 2.423804521560669, "gen_loss": 0.3902585506439209, "grad_norm": 0.961086144505892, "learning_rate": 1.6775510204081634e-05, "loss": 0.4468, "mean_copy_accuracy": 0.9733954817056656, "mean_gen_accuracy": 0.8480938374996185, "mean_token_accuracy": 0.879055842757225, "num_tokens": 111401970.0, "sample_num_tokens": 8688.0, "step": 412, "total_num_tokens": 111436722.0, "z_loss": 0.034548670053482056 }, { "copy_logits_max": 35.407772064208984, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.1875, "epoch": 0.08435026806229257, "gen_logits_max": 17.392166137695312, "gen_logits_mean": -2.0976529121398926, "gen_logits_min": -14.076019287109375, "gen_logits_std": 2.3426513671875, "gen_loss": 0.4394506812095642, "grad_norm": 1.11648800805525, "learning_rate": 1.6816326530612244e-05, "loss": 0.4619, "mean_copy_accuracy": 0.9723895341157913, "mean_gen_accuracy": 0.837885707616806, "mean_token_accuracy": 0.8693519681692123, "num_tokens": 111666980.0, "sample_num_tokens": 8603.0, "step": 413, "total_num_tokens": 111701392.0, "z_loss": 0.03755837678909302 }, { "copy_logits_max": 27.428659439086914, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.125, "epoch": 0.08455450599948941, "gen_logits_max": 16.532554626464844, "gen_logits_mean": -2.1249914169311523, "gen_logits_min": -13.681079864501953, "gen_logits_std": 2.3144116401672363, "gen_loss": 0.43230581283569336, "grad_norm": 1.1577778366675235, "learning_rate": 1.6857142857142858e-05, "loss": 0.4798, "mean_copy_accuracy": 0.9695506691932678, "mean_gen_accuracy": 0.8369967490434647, "mean_token_accuracy": 0.8658134341239929, "num_tokens": 111920990.0, "sample_num_tokens": 8646.0, "step": 414, "total_num_tokens": 111955574.0, "z_loss": 0.03212796896696091 }, { "copy_logits_max": 27.6281681060791, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.5625, "epoch": 0.08475874393668624, "gen_logits_max": 16.89739418029785, "gen_logits_mean": -1.853478193283081, "gen_logits_min": -13.766633987426758, "gen_logits_std": 2.398296356201172, "gen_loss": 0.4591572880744934, "grad_norm": 1.2597945282284468, "learning_rate": 1.6897959183673468e-05, "loss": 0.4689, "mean_copy_accuracy": 0.9748123735189438, "mean_gen_accuracy": 0.8352861404418945, "mean_token_accuracy": 0.8753262162208557, "num_tokens": 112219398.0, "sample_num_tokens": 7138.5, "step": 415, "total_num_tokens": 112247952.0, "z_loss": 0.02437650039792061 }, { "copy_logits_max": 28.431528091430664, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.75, "epoch": 0.08496298187388307, "gen_logits_max": 17.308202743530273, "gen_logits_mean": -2.344209671020508, "gen_logits_min": -14.978437423706055, "gen_logits_std": 2.4708139896392822, "gen_loss": 0.4304956793785095, "grad_norm": 1.7252045365916313, "learning_rate": 1.693877551020408e-05, "loss": 0.4963, "mean_copy_accuracy": 0.9622477144002914, "mean_gen_accuracy": 0.8313641995191574, "mean_token_accuracy": 0.8678047358989716, "num_tokens": 112481578.0, "sample_num_tokens": 8157.5, "step": 416, "total_num_tokens": 112514208.0, "z_loss": 0.028172429651021957 }, { "copy_logits_max": 21.50908851623535, "copy_logits_min": -687500032.0, "copy_num_tokens": 321.5, "epoch": 0.08516721981107991, "gen_logits_max": 17.63326644897461, "gen_logits_mean": -2.3910670280456543, "gen_logits_min": -13.58784294128418, "gen_logits_std": 2.2633533477783203, "gen_loss": 0.4244316518306732, "grad_norm": 1.324311834337125, "learning_rate": 1.6979591836734695e-05, "loss": 0.4729, "mean_copy_accuracy": 0.9693114012479782, "mean_gen_accuracy": 0.8354125916957855, "mean_token_accuracy": 0.865926206111908, "num_tokens": 112744354.0, "sample_num_tokens": 8480.0, "step": 417, "total_num_tokens": 112778274.0, "z_loss": 0.01574096456170082 }, { "copy_logits_max": 34.2000846862793, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.125, "epoch": 0.08537145774827674, "gen_logits_max": 17.220497131347656, "gen_logits_mean": -2.2552402019500732, "gen_logits_min": -14.719803810119629, "gen_logits_std": 2.424563407897949, "gen_loss": 0.4411134719848633, "grad_norm": 2.0387272426544545, "learning_rate": 1.7020408163265308e-05, "loss": 0.4739, "mean_copy_accuracy": 0.972289964556694, "mean_gen_accuracy": 0.842357411980629, "mean_token_accuracy": 0.8765861690044403, "num_tokens": 112987129.0, "sample_num_tokens": 7426.75, "step": 418, "total_num_tokens": 113016836.0, "z_loss": 0.0344112366437912 }, { "copy_logits_max": 20.398168563842773, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.0, "epoch": 0.08557569568547357, "gen_logits_max": 17.23784637451172, "gen_logits_mean": -2.084907054901123, "gen_logits_min": -14.052215576171875, "gen_logits_std": 2.393045425415039, "gen_loss": 0.45340949296951294, "grad_norm": 0.9765225968232752, "learning_rate": 1.706122448979592e-05, "loss": 0.4726, "mean_copy_accuracy": 0.9726810902357101, "mean_gen_accuracy": 0.836123526096344, "mean_token_accuracy": 0.865614578127861, "num_tokens": 113239297.0, "sample_num_tokens": 6958.25, "step": 419, "total_num_tokens": 113267130.0, "z_loss": 0.018113113939762115 }, { "copy_logits_max": 31.18179702758789, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.0625, "epoch": 0.08577993362267042, "gen_logits_max": 16.389894485473633, "gen_logits_mean": -2.4122560024261475, "gen_logits_min": -14.660057067871094, "gen_logits_std": 2.4225234985351562, "gen_loss": 0.3901887536048889, "grad_norm": 1.519307083373335, "learning_rate": 1.7102040816326532e-05, "loss": 0.4676, "mean_copy_accuracy": 0.9731481224298477, "mean_gen_accuracy": 0.8346264064311981, "mean_token_accuracy": 0.8674812912940979, "num_tokens": 113490826.0, "sample_num_tokens": 8436.5, "step": 420, "total_num_tokens": 113524572.0, "z_loss": 0.034866251051425934 }, { "copy_logits_max": 31.57763671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 700.625, "epoch": 0.08598417155986725, "gen_logits_max": 16.3736572265625, "gen_logits_mean": -2.4724645614624023, "gen_logits_min": -14.801107406616211, "gen_logits_std": 2.4558260440826416, "gen_loss": 0.41351011395454407, "grad_norm": 1.32949054848628, "learning_rate": 1.7142857142857142e-05, "loss": 0.4501, "mean_copy_accuracy": 0.9716979712247849, "mean_gen_accuracy": 0.8432474732398987, "mean_token_accuracy": 0.8816432654857635, "num_tokens": 113771517.0, "sample_num_tokens": 9842.75, "step": 421, "total_num_tokens": 113810888.0, "z_loss": 0.039137110114097595 }, { "copy_logits_max": 28.690370559692383, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.25, "epoch": 0.08618840949706408, "gen_logits_max": 17.75934600830078, "gen_logits_mean": -2.1902823448181152, "gen_logits_min": -14.283820152282715, "gen_logits_std": 2.4214460849761963, "gen_loss": 0.4731716513633728, "grad_norm": 1.744980874730069, "learning_rate": 1.7183673469387755e-05, "loss": 0.4933, "mean_copy_accuracy": 0.9720460772514343, "mean_gen_accuracy": 0.8318391293287277, "mean_token_accuracy": 0.8622421771287918, "num_tokens": 114058753.0, "sample_num_tokens": 8702.75, "step": 422, "total_num_tokens": 114093564.0, "z_loss": 0.029623089358210564 }, { "copy_logits_max": 24.312307357788086, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.5625, "epoch": 0.08639264743426091, "gen_logits_max": 15.917790412902832, "gen_logits_mean": -2.594316244125366, "gen_logits_min": -14.377318382263184, "gen_logits_std": 2.4239161014556885, "gen_loss": 0.39160704612731934, "grad_norm": 1.4080772277440081, "learning_rate": 1.722448979591837e-05, "loss": 0.4785, "mean_copy_accuracy": 0.9647075980901718, "mean_gen_accuracy": 0.8403185158967972, "mean_token_accuracy": 0.8664681166410446, "num_tokens": 114325065.0, "sample_num_tokens": 9812.75, "step": 423, "total_num_tokens": 114364316.0, "z_loss": 0.030717946588993073 }, { "copy_logits_max": 29.313499450683594, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.4375, "epoch": 0.08659688537145775, "gen_logits_max": 18.15304183959961, "gen_logits_mean": -2.5543570518493652, "gen_logits_min": -14.974132537841797, "gen_logits_std": 2.370650291442871, "gen_loss": 0.41855645179748535, "grad_norm": 0.9357408046798291, "learning_rate": 1.7265306122448982e-05, "loss": 0.461, "mean_copy_accuracy": 0.9709260016679764, "mean_gen_accuracy": 0.8352274149656296, "mean_token_accuracy": 0.8694619536399841, "num_tokens": 114609658.0, "sample_num_tokens": 7685.5, "step": 424, "total_num_tokens": 114640400.0, "z_loss": 0.025957629084587097 }, { "copy_logits_max": 23.692001342773438, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.625, "epoch": 0.08680112330865458, "gen_logits_max": 16.224018096923828, "gen_logits_mean": -2.488308906555176, "gen_logits_min": -14.48897933959961, "gen_logits_std": 2.460275888442993, "gen_loss": 0.46121907234191895, "grad_norm": 1.6215500173373825, "learning_rate": 1.7306122448979592e-05, "loss": 0.4805, "mean_copy_accuracy": 0.961776614189148, "mean_gen_accuracy": 0.8347562551498413, "mean_token_accuracy": 0.8637291640043259, "num_tokens": 114865608.0, "sample_num_tokens": 8748.5, "step": 425, "total_num_tokens": 114900602.0, "z_loss": 0.025303617119789124 }, { "copy_logits_max": 31.87470054626465, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.5625, "epoch": 0.08700536124585141, "gen_logits_max": 16.58075714111328, "gen_logits_mean": -2.545780658721924, "gen_logits_min": -14.946224212646484, "gen_logits_std": 2.5148019790649414, "gen_loss": 0.4470773935317993, "grad_norm": 2.114131875183525, "learning_rate": 1.7346938775510202e-05, "loss": 0.465, "mean_copy_accuracy": 0.9742705076932907, "mean_gen_accuracy": 0.8370053321123123, "mean_token_accuracy": 0.8702960163354874, "num_tokens": 115140411.0, "sample_num_tokens": 8485.75, "step": 426, "total_num_tokens": 115174354.0, "z_loss": 0.035405393689870834 }, { "copy_logits_max": 25.504213333129883, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.125, "epoch": 0.08720959918304826, "gen_logits_max": 16.553787231445312, "gen_logits_mean": -2.4755239486694336, "gen_logits_min": -14.231367111206055, "gen_logits_std": 2.3715546131134033, "gen_loss": 0.4420582056045532, "grad_norm": 1.6846583733635572, "learning_rate": 1.7387755102040816e-05, "loss": 0.5036, "mean_copy_accuracy": 0.972468376159668, "mean_gen_accuracy": 0.8300021439790726, "mean_token_accuracy": 0.8643272370100021, "num_tokens": 115413366.0, "sample_num_tokens": 8892.5, "step": 427, "total_num_tokens": 115448936.0, "z_loss": 0.026203341782093048 }, { "copy_logits_max": 40.10485076904297, "copy_logits_min": -562500032.0, "copy_num_tokens": 497.6875, "epoch": 0.08741383712024509, "gen_logits_max": 17.779586791992188, "gen_logits_mean": -2.172027111053467, "gen_logits_min": -15.035181045532227, "gen_logits_std": 2.492189884185791, "gen_loss": 0.47427111864089966, "grad_norm": 1.3119379088687841, "learning_rate": 1.742857142857143e-05, "loss": 0.5088, "mean_copy_accuracy": 0.9657148271799088, "mean_gen_accuracy": 0.8299264460802078, "mean_token_accuracy": 0.8622172027826309, "num_tokens": 115682534.0, "sample_num_tokens": 8497.5, "step": 428, "total_num_tokens": 115716524.0, "z_loss": 0.03810994327068329 }, { "copy_logits_max": 23.989500045776367, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.0625, "epoch": 0.08761807505744192, "gen_logits_max": 16.694576263427734, "gen_logits_mean": -2.407365322113037, "gen_logits_min": -14.26773738861084, "gen_logits_std": 2.43904709815979, "gen_loss": 0.43835559487342834, "grad_norm": 1.5329012319910584, "learning_rate": 1.7469387755102043e-05, "loss": 0.4873, "mean_copy_accuracy": 0.9749352484941483, "mean_gen_accuracy": 0.8231659382581711, "mean_token_accuracy": 0.8680167496204376, "num_tokens": 115965143.0, "sample_num_tokens": 10149.25, "step": 429, "total_num_tokens": 116005740.0, "z_loss": 0.029124949127435684 }, { "copy_logits_max": 20.849340438842773, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.6875, "epoch": 0.08782231299463876, "gen_logits_max": 17.377805709838867, "gen_logits_mean": -2.4755234718322754, "gen_logits_min": -14.115835189819336, "gen_logits_std": 2.3661444187164307, "gen_loss": 0.4262625575065613, "grad_norm": 1.173647119611346, "learning_rate": 1.7510204081632653e-05, "loss": 0.477, "mean_copy_accuracy": 0.9614219069480896, "mean_gen_accuracy": 0.8363404422998428, "mean_token_accuracy": 0.8641598522663116, "num_tokens": 116234933.0, "sample_num_tokens": 7961.75, "step": 430, "total_num_tokens": 116266780.0, "z_loss": 0.017269153147935867 }, { "copy_logits_max": 19.887493133544922, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.4375, "epoch": 0.08802655093183559, "gen_logits_max": 16.402477264404297, "gen_logits_mean": -2.4776923656463623, "gen_logits_min": -15.117813110351562, "gen_logits_std": 2.454211473464966, "gen_loss": 0.38403522968292236, "grad_norm": 1.8871270376019382, "learning_rate": 1.7551020408163266e-05, "loss": 0.4733, "mean_copy_accuracy": 0.9699251651763916, "mean_gen_accuracy": 0.836976021528244, "mean_token_accuracy": 0.8683875948190689, "num_tokens": 116499439.0, "sample_num_tokens": 8258.75, "step": 431, "total_num_tokens": 116532474.0, "z_loss": 0.020601453259587288 }, { "copy_logits_max": 22.071136474609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.9375, "epoch": 0.08823078886903242, "gen_logits_max": 16.3350830078125, "gen_logits_mean": -2.483153820037842, "gen_logits_min": -14.536985397338867, "gen_logits_std": 2.4276809692382812, "gen_loss": 0.38543128967285156, "grad_norm": 1.2976602456289696, "learning_rate": 1.7591836734693876e-05, "loss": 0.4591, "mean_copy_accuracy": 0.9709027260541916, "mean_gen_accuracy": 0.8405702263116837, "mean_token_accuracy": 0.8704852908849716, "num_tokens": 116760055.0, "sample_num_tokens": 7786.25, "step": 432, "total_num_tokens": 116791200.0, "z_loss": 0.026924947276711464 }, { "copy_logits_max": 19.797454833984375, "copy_logits_min": -750000064.0, "copy_num_tokens": 374.6875, "epoch": 0.08843502680622926, "gen_logits_max": 16.19771957397461, "gen_logits_mean": -2.66271710395813, "gen_logits_min": -15.324161529541016, "gen_logits_std": 2.51763653755188, "gen_loss": 0.46609166264533997, "grad_norm": 1.1068556763788917, "learning_rate": 1.7632653061224493e-05, "loss": 0.4765, "mean_copy_accuracy": 0.9725754112005234, "mean_gen_accuracy": 0.8344262987375259, "mean_token_accuracy": 0.8655552268028259, "num_tokens": 117011347.0, "sample_num_tokens": 7130.75, "step": 433, "total_num_tokens": 117039870.0, "z_loss": 0.02324332483112812 }, { "copy_logits_max": 21.439937591552734, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.5, "epoch": 0.0886392647434261, "gen_logits_max": 16.853912353515625, "gen_logits_mean": -2.3461496829986572, "gen_logits_min": -13.951146125793457, "gen_logits_std": 2.3619933128356934, "gen_loss": 0.48821723461151123, "grad_norm": 0.9928792706378012, "learning_rate": 1.7673469387755103e-05, "loss": 0.4673, "mean_copy_accuracy": 0.9756705462932587, "mean_gen_accuracy": 0.8364439606666565, "mean_token_accuracy": 0.8681095242500305, "num_tokens": 117294632.0, "sample_num_tokens": 8627.5, "step": 434, "total_num_tokens": 117329142.0, "z_loss": 0.017885930836200714 }, { "copy_logits_max": 27.567031860351562, "copy_logits_min": -625000000.0, "copy_num_tokens": 611.125, "epoch": 0.08884350268062292, "gen_logits_max": 17.891315460205078, "gen_logits_mean": -2.3802995681762695, "gen_logits_min": -14.372502326965332, "gen_logits_std": 2.414053201675415, "gen_loss": 0.4064110219478607, "grad_norm": 1.368780805231093, "learning_rate": 1.7714285714285717e-05, "loss": 0.4716, "mean_copy_accuracy": 0.9695572257041931, "mean_gen_accuracy": 0.8330848664045334, "mean_token_accuracy": 0.8704529404640198, "num_tokens": 117571776.0, "sample_num_tokens": 10008.0, "step": 435, "total_num_tokens": 117611808.0, "z_loss": 0.031563661992549896 }, { "copy_logits_max": 23.10198974609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.5, "epoch": 0.08904774061781975, "gen_logits_max": 15.205694198608398, "gen_logits_mean": -2.6590893268585205, "gen_logits_min": -15.174158096313477, "gen_logits_std": 2.4365744590759277, "gen_loss": 0.3782804608345032, "grad_norm": 1.548801096005188, "learning_rate": 1.7755102040816327e-05, "loss": 0.4376, "mean_copy_accuracy": 0.9726402014493942, "mean_gen_accuracy": 0.8478448241949081, "mean_token_accuracy": 0.8796041309833527, "num_tokens": 117870545.0, "sample_num_tokens": 9281.25, "step": 436, "total_num_tokens": 117907670.0, "z_loss": 0.03133070468902588 }, { "copy_logits_max": 31.82611656188965, "copy_logits_min": -750000000.0, "copy_num_tokens": 884.25, "epoch": 0.0892519785550166, "gen_logits_max": 16.372024536132812, "gen_logits_mean": -2.6850342750549316, "gen_logits_min": -15.048859596252441, "gen_logits_std": 2.4511866569519043, "gen_loss": 0.4139898121356964, "grad_norm": 1.7593998430743127, "learning_rate": 1.779591836734694e-05, "loss": 0.44, "mean_copy_accuracy": 0.9783005863428116, "mean_gen_accuracy": 0.8435099869966507, "mean_token_accuracy": 0.8803535848855972, "num_tokens": 118172470.0, "sample_num_tokens": 11726.5, "step": 437, "total_num_tokens": 118219376.0, "z_loss": 0.04106534272432327 }, { "copy_logits_max": 32.12921142578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 708.875, "epoch": 0.08945621649221343, "gen_logits_max": 17.082992553710938, "gen_logits_mean": -2.423574924468994, "gen_logits_min": -15.258054733276367, "gen_logits_std": 2.5217108726501465, "gen_loss": 0.43790072202682495, "grad_norm": 1.2600147058022113, "learning_rate": 1.783673469387755e-05, "loss": 0.4649, "mean_copy_accuracy": 0.9724022597074509, "mean_gen_accuracy": 0.8358670026063919, "mean_token_accuracy": 0.8716536909341812, "num_tokens": 118459777.0, "sample_num_tokens": 10420.75, "step": 438, "total_num_tokens": 118501460.0, "z_loss": 0.0389120951294899 }, { "copy_logits_max": 20.93647003173828, "copy_logits_min": -687500032.0, "copy_num_tokens": 281.6875, "epoch": 0.08966045442941026, "gen_logits_max": 17.28180503845215, "gen_logits_mean": -2.3754072189331055, "gen_logits_min": -14.557987213134766, "gen_logits_std": 2.422239303588867, "gen_loss": 0.42733234167099, "grad_norm": 1.7739283181745868, "learning_rate": 1.7877551020408164e-05, "loss": 0.4438, "mean_copy_accuracy": 0.9612938314676285, "mean_gen_accuracy": 0.8443219214677811, "mean_token_accuracy": 0.8733575493097305, "num_tokens": 118733671.0, "sample_num_tokens": 8124.75, "step": 439, "total_num_tokens": 118766170.0, "z_loss": 0.01809566840529442 }, { "copy_logits_max": 24.286502838134766, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.5625, "epoch": 0.0898646923666071, "gen_logits_max": 17.22287368774414, "gen_logits_mean": -2.2569990158081055, "gen_logits_min": -13.879520416259766, "gen_logits_std": 2.3782849311828613, "gen_loss": 0.46666979789733887, "grad_norm": 1.3269198698544127, "learning_rate": 1.7918367346938777e-05, "loss": 0.4683, "mean_copy_accuracy": 0.9755112826824188, "mean_gen_accuracy": 0.8374382853507996, "mean_token_accuracy": 0.8706063777208328, "num_tokens": 118988449.0, "sample_num_tokens": 7091.25, "step": 440, "total_num_tokens": 119016814.0, "z_loss": 0.02268693782389164 }, { "copy_logits_max": 27.20220184326172, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.0, "epoch": 0.09006893030380393, "gen_logits_max": 17.525779724121094, "gen_logits_mean": -2.338599681854248, "gen_logits_min": -14.445765495300293, "gen_logits_std": 2.3999111652374268, "gen_loss": 0.4150829017162323, "grad_norm": 1.2185920950941858, "learning_rate": 1.7959183673469387e-05, "loss": 0.4568, "mean_copy_accuracy": 0.9765522330999374, "mean_gen_accuracy": 0.839027464389801, "mean_token_accuracy": 0.8748768419027328, "num_tokens": 119257972.0, "sample_num_tokens": 8863.5, "step": 441, "total_num_tokens": 119293426.0, "z_loss": 0.024912748485803604 }, { "copy_logits_max": 22.74969482421875, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.5625, "epoch": 0.09027316824100076, "gen_logits_max": 17.06233024597168, "gen_logits_mean": -2.277625560760498, "gen_logits_min": -13.410202026367188, "gen_logits_std": 2.342686176300049, "gen_loss": 0.456538587808609, "grad_norm": 2.0400524258873123, "learning_rate": 1.8e-05, "loss": 0.4713, "mean_copy_accuracy": 0.9649974554777145, "mean_gen_accuracy": 0.8383847028017044, "mean_token_accuracy": 0.8692376613616943, "num_tokens": 119541724.0, "sample_num_tokens": 7552.5, "step": 442, "total_num_tokens": 119571934.0, "z_loss": 0.017637282609939575 }, { "copy_logits_max": 25.012195587158203, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.4375, "epoch": 0.0904774061781976, "gen_logits_max": 16.906158447265625, "gen_logits_mean": -2.5154247283935547, "gen_logits_min": -14.231199264526367, "gen_logits_std": 2.4064323902130127, "gen_loss": 0.49569690227508545, "grad_norm": 1.2728794252983764, "learning_rate": 1.804081632653061e-05, "loss": 0.4729, "mean_copy_accuracy": 0.9731698334217072, "mean_gen_accuracy": 0.8379473835229874, "mean_token_accuracy": 0.8666692078113556, "num_tokens": 119797526.0, "sample_num_tokens": 8102.5, "step": 443, "total_num_tokens": 119829936.0, "z_loss": 0.024453885853290558 }, { "copy_logits_max": 22.421642303466797, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.25, "epoch": 0.09068164411539444, "gen_logits_max": 15.458585739135742, "gen_logits_mean": -2.834627151489258, "gen_logits_min": -14.578327178955078, "gen_logits_std": 2.3330583572387695, "gen_loss": 0.40920737385749817, "grad_norm": 1.4420135163816699, "learning_rate": 1.8081632653061227e-05, "loss": 0.4581, "mean_copy_accuracy": 0.9680230766534805, "mean_gen_accuracy": 0.8424858450889587, "mean_token_accuracy": 0.8711085915565491, "num_tokens": 120067616.0, "sample_num_tokens": 9136.5, "step": 444, "total_num_tokens": 120104162.0, "z_loss": 0.025594055652618408 }, { "copy_logits_max": 24.45907211303711, "copy_logits_min": -687499968.0, "copy_num_tokens": 376.5, "epoch": 0.09088588205259127, "gen_logits_max": 16.013988494873047, "gen_logits_mean": -2.9592232704162598, "gen_logits_min": -14.879371643066406, "gen_logits_std": 2.369619607925415, "gen_loss": 0.4500053822994232, "grad_norm": 1.014221757680859, "learning_rate": 1.8122448979591837e-05, "loss": 0.468, "mean_copy_accuracy": 0.9705567508935928, "mean_gen_accuracy": 0.8336756080389023, "mean_token_accuracy": 0.8660701364278793, "num_tokens": 120341230.0, "sample_num_tokens": 8045.0, "step": 445, "total_num_tokens": 120373410.0, "z_loss": 0.022641774266958237 }, { "copy_logits_max": 28.222728729248047, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.4375, "epoch": 0.09109011998978811, "gen_logits_max": 16.12929916381836, "gen_logits_mean": -2.4414401054382324, "gen_logits_min": -14.904534339904785, "gen_logits_std": 2.485826253890991, "gen_loss": 0.42044955492019653, "grad_norm": 2.01675698398005, "learning_rate": 1.816326530612245e-05, "loss": 0.4812, "mean_copy_accuracy": 0.9597426205873489, "mean_gen_accuracy": 0.8365852385759354, "mean_token_accuracy": 0.8687165975570679, "num_tokens": 120616817.0, "sample_num_tokens": 8248.25, "step": 446, "total_num_tokens": 120649810.0, "z_loss": 0.03500623255968094 }, { "copy_logits_max": 23.663496017456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 237.875, "epoch": 0.09129435792698494, "gen_logits_max": 17.573345184326172, "gen_logits_mean": -2.4616570472717285, "gen_logits_min": -14.228410720825195, "gen_logits_std": 2.3830008506774902, "gen_loss": 0.47640204429626465, "grad_norm": 1.4693768223912598, "learning_rate": 1.820408163265306e-05, "loss": 0.4805, "mean_copy_accuracy": 0.9794581383466721, "mean_gen_accuracy": 0.8296467363834381, "mean_token_accuracy": 0.8652927726507187, "num_tokens": 120877124.0, "sample_num_tokens": 6861.5, "step": 447, "total_num_tokens": 120904570.0, "z_loss": 0.017971329391002655 }, { "copy_logits_max": 31.910242080688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.0, "epoch": 0.09149859586418177, "gen_logits_max": 17.1883544921875, "gen_logits_mean": -2.341754913330078, "gen_logits_min": -14.638744354248047, "gen_logits_std": 2.495448112487793, "gen_loss": 0.47180265188217163, "grad_norm": 1.583114952514943, "learning_rate": 1.8244897959183674e-05, "loss": 0.4585, "mean_copy_accuracy": 0.9796205163002014, "mean_gen_accuracy": 0.8374619781970978, "mean_token_accuracy": 0.8780240416526794, "num_tokens": 121154321.0, "sample_num_tokens": 7431.25, "step": 448, "total_num_tokens": 121184046.0, "z_loss": 0.02959918975830078 }, { "copy_logits_max": 19.272493362426758, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.875, "epoch": 0.0917028338013786, "gen_logits_max": 16.134960174560547, "gen_logits_mean": -2.756324052810669, "gen_logits_min": -14.127008438110352, "gen_logits_std": 2.3833484649658203, "gen_loss": 0.40374428033828735, "grad_norm": 1.0428078690011096, "learning_rate": 1.8285714285714288e-05, "loss": 0.4366, "mean_copy_accuracy": 0.9763848930597305, "mean_gen_accuracy": 0.8425161838531494, "mean_token_accuracy": 0.874278113245964, "num_tokens": 121420471.0, "sample_num_tokens": 7608.25, "step": 449, "total_num_tokens": 121450904.0, "z_loss": 0.01745220273733139 }, { "copy_logits_max": 26.092065811157227, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.6875, "epoch": 0.09190707173857544, "gen_logits_max": 17.546165466308594, "gen_logits_mean": -2.6782138347625732, "gen_logits_min": -14.528305053710938, "gen_logits_std": 2.4030447006225586, "gen_loss": 0.4563823938369751, "grad_norm": 2.3790447542194113, "learning_rate": 1.8326530612244898e-05, "loss": 0.4946, "mean_copy_accuracy": 0.9617932736873627, "mean_gen_accuracy": 0.829541802406311, "mean_token_accuracy": 0.8598868101835251, "num_tokens": 121685213.0, "sample_num_tokens": 7882.25, "step": 450, "total_num_tokens": 121716742.0, "z_loss": 0.022422516718506813 }, { "copy_logits_max": 21.79571533203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.25, "epoch": 0.09211130967577227, "gen_logits_max": 16.436161041259766, "gen_logits_mean": -2.515270233154297, "gen_logits_min": -14.693175315856934, "gen_logits_std": 2.4411933422088623, "gen_loss": 0.40374958515167236, "grad_norm": 0.9542627890870389, "learning_rate": 1.836734693877551e-05, "loss": 0.4628, "mean_copy_accuracy": 0.9731565564870834, "mean_gen_accuracy": 0.8397723734378815, "mean_token_accuracy": 0.871759220957756, "num_tokens": 121976522.0, "sample_num_tokens": 8852.5, "step": 451, "total_num_tokens": 122011932.0, "z_loss": 0.02333381026983261 }, { "copy_logits_max": 25.286266326904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.75, "epoch": 0.0923155476129691, "gen_logits_max": 15.818033218383789, "gen_logits_mean": -2.494406223297119, "gen_logits_min": -13.98983097076416, "gen_logits_std": 2.412193775177002, "gen_loss": 0.443460077047348, "grad_norm": 1.3372615108491452, "learning_rate": 1.840816326530612e-05, "loss": 0.4441, "mean_copy_accuracy": 0.9799377769231796, "mean_gen_accuracy": 0.836589589715004, "mean_token_accuracy": 0.8737965673208237, "num_tokens": 122235251.0, "sample_num_tokens": 8842.25, "step": 452, "total_num_tokens": 122270620.0, "z_loss": 0.02600226178765297 }, { "copy_logits_max": 26.681333541870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 0.09251978555016595, "gen_logits_max": 16.385578155517578, "gen_logits_mean": -2.6931302547454834, "gen_logits_min": -14.243338584899902, "gen_logits_std": 2.362839698791504, "gen_loss": 0.4438748359680176, "grad_norm": 2.8504450592817396, "learning_rate": 1.8448979591836735e-05, "loss": 0.4767, "mean_copy_accuracy": 0.9745378494262695, "mean_gen_accuracy": 0.8328759372234344, "mean_token_accuracy": 0.8675462901592255, "num_tokens": 122509980.0, "sample_num_tokens": 8234.0, "step": 453, "total_num_tokens": 122542916.0, "z_loss": 0.025002963840961456 }, { "copy_logits_max": 23.65941619873047, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.0, "epoch": 0.09272402348736278, "gen_logits_max": 16.710941314697266, "gen_logits_mean": -2.8512864112854004, "gen_logits_min": -14.47028923034668, "gen_logits_std": 2.414597988128662, "gen_loss": 0.3948633372783661, "grad_norm": 1.0084979439409265, "learning_rate": 1.8489795918367345e-05, "loss": 0.4551, "mean_copy_accuracy": 0.9657833278179169, "mean_gen_accuracy": 0.8397549390792847, "mean_token_accuracy": 0.8672679960727692, "num_tokens": 122765534.0, "sample_num_tokens": 7161.5, "step": 454, "total_num_tokens": 122794180.0, "z_loss": 0.020167533308267593 }, { "copy_logits_max": 17.620168685913086, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.75, "epoch": 0.09292826142455961, "gen_logits_max": 14.714204788208008, "gen_logits_mean": -2.889676094055176, "gen_logits_min": -14.059455871582031, "gen_logits_std": 2.318572759628296, "gen_loss": 0.42483657598495483, "grad_norm": 1.3512792494688983, "learning_rate": 1.853061224489796e-05, "loss": 0.4509, "mean_copy_accuracy": 0.9646528661251068, "mean_gen_accuracy": 0.8444180488586426, "mean_token_accuracy": 0.8681790977716446, "num_tokens": 123018731.0, "sample_num_tokens": 8349.25, "step": 455, "total_num_tokens": 123052128.0, "z_loss": 0.019898025318980217 }, { "copy_logits_max": 23.49483871459961, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.3125, "epoch": 0.09313249936175645, "gen_logits_max": 17.238433837890625, "gen_logits_mean": -2.6993954181671143, "gen_logits_min": -14.313091278076172, "gen_logits_std": 2.3968429565429688, "gen_loss": 0.449497789144516, "grad_norm": 0.905958733263827, "learning_rate": 1.8571428571428572e-05, "loss": 0.4433, "mean_copy_accuracy": 0.9739989638328552, "mean_gen_accuracy": 0.8400451391935349, "mean_token_accuracy": 0.8742068707942963, "num_tokens": 123299426.0, "sample_num_tokens": 7356.0, "step": 456, "total_num_tokens": 123328850.0, "z_loss": 0.02172953635454178 }, { "copy_logits_max": 22.24918556213379, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.375, "epoch": 0.09333673729895328, "gen_logits_max": 15.791999816894531, "gen_logits_mean": -3.0096206665039062, "gen_logits_min": -13.952532768249512, "gen_logits_std": 2.322829246520996, "gen_loss": 0.4422004818916321, "grad_norm": 0.9747734905912876, "learning_rate": 1.8612244897959185e-05, "loss": 0.4565, "mean_copy_accuracy": 0.973685622215271, "mean_gen_accuracy": 0.8380131870508194, "mean_token_accuracy": 0.8701597601175308, "num_tokens": 123569834.0, "sample_num_tokens": 8232.5, "step": 457, "total_num_tokens": 123602764.0, "z_loss": 0.0200221985578537 }, { "copy_logits_max": 24.226926803588867, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.5, "epoch": 0.09354097523615011, "gen_logits_max": 16.442554473876953, "gen_logits_mean": -2.9341073036193848, "gen_logits_min": -14.398418426513672, "gen_logits_std": 2.3740596771240234, "gen_loss": 0.47664281725883484, "grad_norm": 1.1084222563464097, "learning_rate": 1.8653061224489795e-05, "loss": 0.4538, "mean_copy_accuracy": 0.9775170087814331, "mean_gen_accuracy": 0.8386202901601791, "mean_token_accuracy": 0.8703617304563522, "num_tokens": 123828262.0, "sample_num_tokens": 8605.0, "step": 458, "total_num_tokens": 123862682.0, "z_loss": 0.021737948060035706 }, { "copy_logits_max": 28.083744049072266, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.6875, "epoch": 0.09374521317334696, "gen_logits_max": 16.425373077392578, "gen_logits_mean": -3.1000421047210693, "gen_logits_min": -14.553014755249023, "gen_logits_std": 2.3684868812561035, "gen_loss": 0.37590962648391724, "grad_norm": 0.977801987504544, "learning_rate": 1.869387755102041e-05, "loss": 0.4373, "mean_copy_accuracy": 0.9737190008163452, "mean_gen_accuracy": 0.8414496779441833, "mean_token_accuracy": 0.875616580247879, "num_tokens": 124100240.0, "sample_num_tokens": 8981.0, "step": 459, "total_num_tokens": 124136164.0, "z_loss": 0.029341809451580048 }, { "copy_logits_max": 23.828384399414062, "copy_logits_min": -687500032.0, "copy_num_tokens": 370.8125, "epoch": 0.09394945111054379, "gen_logits_max": 15.881095886230469, "gen_logits_mean": -2.854196786880493, "gen_logits_min": -14.063812255859375, "gen_logits_std": 2.3891170024871826, "gen_loss": 0.4284460246562958, "grad_norm": 1.0573010126993367, "learning_rate": 1.8734693877551022e-05, "loss": 0.4564, "mean_copy_accuracy": 0.9728866219520569, "mean_gen_accuracy": 0.8321432918310165, "mean_token_accuracy": 0.8702602684497833, "num_tokens": 124385975.0, "sample_num_tokens": 7760.75, "step": 460, "total_num_tokens": 124417018.0, "z_loss": 0.021858662366867065 }, { "copy_logits_max": 19.977602005004883, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.625, "epoch": 0.09415368904774062, "gen_logits_max": 15.84823989868164, "gen_logits_mean": -2.965961456298828, "gen_logits_min": -14.485445022583008, "gen_logits_std": 2.3708317279815674, "gen_loss": 0.37783384323120117, "grad_norm": 1.1483978399237476, "learning_rate": 1.8775510204081636e-05, "loss": 0.4373, "mean_copy_accuracy": 0.9733492434024811, "mean_gen_accuracy": 0.8454923927783966, "mean_token_accuracy": 0.8715373426675797, "num_tokens": 124644414.0, "sample_num_tokens": 8298.5, "step": 461, "total_num_tokens": 124677608.0, "z_loss": 0.019166231155395508 }, { "copy_logits_max": 25.064350128173828, "copy_logits_min": -687499968.0, "copy_num_tokens": 499.3125, "epoch": 0.09435792698493745, "gen_logits_max": 16.845243453979492, "gen_logits_mean": -2.765937328338623, "gen_logits_min": -14.009687423706055, "gen_logits_std": 2.356055498123169, "gen_loss": 0.4154556393623352, "grad_norm": 1.0193239810107302, "learning_rate": 1.8816326530612246e-05, "loss": 0.4235, "mean_copy_accuracy": 0.9816378355026245, "mean_gen_accuracy": 0.8454578518867493, "mean_token_accuracy": 0.878307655453682, "num_tokens": 124918633.0, "sample_num_tokens": 8663.25, "step": 462, "total_num_tokens": 124953286.0, "z_loss": 0.022770583629608154 }, { "copy_logits_max": 25.134353637695312, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.0, "epoch": 0.09456216492213429, "gen_logits_max": 16.425355911254883, "gen_logits_mean": -2.9577584266662598, "gen_logits_min": -15.080674171447754, "gen_logits_std": 2.514923572540283, "gen_loss": 0.4182950258255005, "grad_norm": 0.9663287108772105, "learning_rate": 1.8857142857142856e-05, "loss": 0.4425, "mean_copy_accuracy": 0.9773155897855759, "mean_gen_accuracy": 0.8403806388378143, "mean_token_accuracy": 0.8752817809581757, "num_tokens": 125196053.0, "sample_num_tokens": 9352.25, "step": 463, "total_num_tokens": 125233462.0, "z_loss": 0.026226744055747986 }, { "copy_logits_max": 25.02967071533203, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.75, "epoch": 0.09476640285933112, "gen_logits_max": 16.92376708984375, "gen_logits_mean": -3.0581045150756836, "gen_logits_min": -14.497051239013672, "gen_logits_std": 2.3805503845214844, "gen_loss": 0.40703654289245605, "grad_norm": 1.1264795645117287, "learning_rate": 1.889795918367347e-05, "loss": 0.438, "mean_copy_accuracy": 0.9731246531009674, "mean_gen_accuracy": 0.840595543384552, "mean_token_accuracy": 0.8737525641918182, "num_tokens": 125470315.0, "sample_num_tokens": 8167.25, "step": 464, "total_num_tokens": 125502984.0, "z_loss": 0.02299388125538826 }, { "copy_logits_max": 25.256515502929688, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.875, "epoch": 0.09497064079652795, "gen_logits_max": 17.246784210205078, "gen_logits_mean": -2.830265998840332, "gen_logits_min": -14.641397476196289, "gen_logits_std": 2.431004524230957, "gen_loss": 0.4167865514755249, "grad_norm": 0.8297936457272533, "learning_rate": 1.893877551020408e-05, "loss": 0.4439, "mean_copy_accuracy": 0.9805885255336761, "mean_gen_accuracy": 0.8392439782619476, "mean_token_accuracy": 0.8718565106391907, "num_tokens": 125733906.0, "sample_num_tokens": 7634.5, "step": 465, "total_num_tokens": 125764444.0, "z_loss": 0.021170707419514656 }, { "copy_logits_max": 29.288267135620117, "copy_logits_min": -624999936.0, "copy_num_tokens": 623.5, "epoch": 0.0951748787337248, "gen_logits_max": 15.14846420288086, "gen_logits_mean": -2.867537021636963, "gen_logits_min": -14.85136604309082, "gen_logits_std": 2.427306652069092, "gen_loss": 0.39002013206481934, "grad_norm": 1.2709755022417104, "learning_rate": 1.8979591836734696e-05, "loss": 0.4725, "mean_copy_accuracy": 0.9719602465629578, "mean_gen_accuracy": 0.8330240100622177, "mean_token_accuracy": 0.8677691370248795, "num_tokens": 126007155.0, "sample_num_tokens": 9129.75, "step": 466, "total_num_tokens": 126043674.0, "z_loss": 0.034532468765974045 }, { "copy_logits_max": 29.953636169433594, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.9375, "epoch": 0.09537911667092162, "gen_logits_max": 17.27463150024414, "gen_logits_mean": -2.8513965606689453, "gen_logits_min": -14.762438774108887, "gen_logits_std": 2.4195451736450195, "gen_loss": 0.4441160559654236, "grad_norm": 0.8396254946962879, "learning_rate": 1.9020408163265306e-05, "loss": 0.4316, "mean_copy_accuracy": 0.9766061007976532, "mean_gen_accuracy": 0.8474516570568085, "mean_token_accuracy": 0.8747313916683197, "num_tokens": 126273266.0, "sample_num_tokens": 7305.0, "step": 467, "total_num_tokens": 126302486.0, "z_loss": 0.023346632719039917 }, { "copy_logits_max": 20.568641662597656, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.1875, "epoch": 0.09558335460811845, "gen_logits_max": 16.751161575317383, "gen_logits_mean": -2.5552244186401367, "gen_logits_min": -14.783218383789062, "gen_logits_std": 2.45560884475708, "gen_loss": 0.4445452094078064, "grad_norm": 2.1527173654027876, "learning_rate": 1.906122448979592e-05, "loss": 0.467, "mean_copy_accuracy": 0.9696314632892609, "mean_gen_accuracy": 0.8352914899587631, "mean_token_accuracy": 0.8665078729391098, "num_tokens": 126515888.0, "sample_num_tokens": 8131.0, "step": 468, "total_num_tokens": 126548412.0, "z_loss": 0.019979087635874748 }, { "copy_logits_max": 26.670848846435547, "copy_logits_min": -687500032.0, "copy_num_tokens": 350.9375, "epoch": 0.0957875925453153, "gen_logits_max": 18.1639404296875, "gen_logits_mean": -2.7973947525024414, "gen_logits_min": -14.59453296661377, "gen_logits_std": 2.3874049186706543, "gen_loss": 0.42820900678634644, "grad_norm": 1.2123676016566916, "learning_rate": 1.910204081632653e-05, "loss": 0.4446, "mean_copy_accuracy": 0.9783801138401031, "mean_gen_accuracy": 0.8429576456546783, "mean_token_accuracy": 0.8723654896020889, "num_tokens": 126803889.0, "sample_num_tokens": 7811.75, "step": 469, "total_num_tokens": 126835136.0, "z_loss": 0.021015873178839684 }, { "copy_logits_max": 20.84986686706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.125, "epoch": 0.09599183048251213, "gen_logits_max": 16.461715698242188, "gen_logits_mean": -2.8462445735931396, "gen_logits_min": -14.575117111206055, "gen_logits_std": 2.445591926574707, "gen_loss": 0.41814637184143066, "grad_norm": 1.495716610840698, "learning_rate": 1.9142857142857143e-05, "loss": 0.4563, "mean_copy_accuracy": 0.9731672704219818, "mean_gen_accuracy": 0.8403545469045639, "mean_token_accuracy": 0.8712051957845688, "num_tokens": 127060803.0, "sample_num_tokens": 7271.25, "step": 470, "total_num_tokens": 127089888.0, "z_loss": 0.015889234840869904 }, { "copy_logits_max": 22.485851287841797, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.8125, "epoch": 0.09619606841970896, "gen_logits_max": 15.125984191894531, "gen_logits_mean": -2.8852972984313965, "gen_logits_min": -14.417983055114746, "gen_logits_std": 2.3919758796691895, "gen_loss": 0.42625707387924194, "grad_norm": 1.0544252152489948, "learning_rate": 1.9183673469387756e-05, "loss": 0.445, "mean_copy_accuracy": 0.979244664311409, "mean_gen_accuracy": 0.8366016298532486, "mean_token_accuracy": 0.8747269958257675, "num_tokens": 127337556.0, "sample_num_tokens": 8083.0, "step": 471, "total_num_tokens": 127369888.0, "z_loss": 0.023982996121048927 }, { "copy_logits_max": 23.942344665527344, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.75, "epoch": 0.0964003063569058, "gen_logits_max": 16.858081817626953, "gen_logits_mean": -2.7664554119110107, "gen_logits_min": -14.844001770019531, "gen_logits_std": 2.4408915042877197, "gen_loss": 0.4677277207374573, "grad_norm": 1.215217986643478, "learning_rate": 1.922448979591837e-05, "loss": 0.4717, "mean_copy_accuracy": 0.964243546128273, "mean_gen_accuracy": 0.8374115824699402, "mean_token_accuracy": 0.8684060722589493, "num_tokens": 127613359.0, "sample_num_tokens": 8633.25, "step": 472, "total_num_tokens": 127647892.0, "z_loss": 0.020876236259937286 }, { "copy_logits_max": 24.835887908935547, "copy_logits_min": -750000064.0, "copy_num_tokens": 365.875, "epoch": 0.09660454429410263, "gen_logits_max": 17.472217559814453, "gen_logits_mean": -3.093200922012329, "gen_logits_min": -15.369247436523438, "gen_logits_std": 2.4246881008148193, "gen_loss": 0.3875812888145447, "grad_norm": 0.9617985518751618, "learning_rate": 1.926530612244898e-05, "loss": 0.4184, "mean_copy_accuracy": 0.9758700579404831, "mean_gen_accuracy": 0.8461246490478516, "mean_token_accuracy": 0.8801891207695007, "num_tokens": 127870170.0, "sample_num_tokens": 7508.0, "step": 473, "total_num_tokens": 127900202.0, "z_loss": 0.020488349720835686 }, { "copy_logits_max": 31.286338806152344, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.6875, "epoch": 0.09680878223129946, "gen_logits_max": 17.30536651611328, "gen_logits_mean": -3.0401570796966553, "gen_logits_min": -15.567930221557617, "gen_logits_std": 2.520613431930542, "gen_loss": 0.4710639417171478, "grad_norm": 1.1917743160788898, "learning_rate": 1.9306122448979593e-05, "loss": 0.4494, "mean_copy_accuracy": 0.9751693904399872, "mean_gen_accuracy": 0.8366428762674332, "mean_token_accuracy": 0.8708574771881104, "num_tokens": 128122377.0, "sample_num_tokens": 7484.25, "step": 474, "total_num_tokens": 128152314.0, "z_loss": 0.028628095984458923 }, { "copy_logits_max": 24.75466537475586, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.25, "epoch": 0.09701302016849629, "gen_logits_max": 17.31157684326172, "gen_logits_mean": -2.829719305038452, "gen_logits_min": -14.634201049804688, "gen_logits_std": 2.413262128829956, "gen_loss": 0.4905998110771179, "grad_norm": 1.3158662669236847, "learning_rate": 1.9346938775510203e-05, "loss": 0.4707, "mean_copy_accuracy": 0.9687509387731552, "mean_gen_accuracy": 0.8315645754337311, "mean_token_accuracy": 0.8630746752023697, "num_tokens": 128378757.0, "sample_num_tokens": 7732.25, "step": 475, "total_num_tokens": 128409686.0, "z_loss": 0.017889127135276794 }, { "copy_logits_max": 21.883102416992188, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.625, "epoch": 0.09721725810569314, "gen_logits_max": 16.21471405029297, "gen_logits_mean": -3.116638660430908, "gen_logits_min": -14.813347816467285, "gen_logits_std": 2.3643102645874023, "gen_loss": 0.41864410042762756, "grad_norm": 0.8895937933676492, "learning_rate": 1.9387755102040817e-05, "loss": 0.4365, "mean_copy_accuracy": 0.9718007892370224, "mean_gen_accuracy": 0.8477279394865036, "mean_token_accuracy": 0.8728609979152679, "num_tokens": 128626039.0, "sample_num_tokens": 7530.25, "step": 476, "total_num_tokens": 128656160.0, "z_loss": 0.019001897424459457 }, { "copy_logits_max": 26.762598037719727, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.8125, "epoch": 0.09742149604288997, "gen_logits_max": 17.15796661376953, "gen_logits_mean": -2.859232187271118, "gen_logits_min": -15.445920944213867, "gen_logits_std": 2.4364047050476074, "gen_loss": 0.46127307415008545, "grad_norm": 0.9877656990772335, "learning_rate": 1.942857142857143e-05, "loss": 0.4507, "mean_copy_accuracy": 0.9750033915042877, "mean_gen_accuracy": 0.8366204053163528, "mean_token_accuracy": 0.8704274147748947, "num_tokens": 128890706.0, "sample_num_tokens": 7782.5, "step": 477, "total_num_tokens": 128921836.0, "z_loss": 0.024586107581853867 }, { "copy_logits_max": 21.2952938079834, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.9375, "epoch": 0.0976257339800868, "gen_logits_max": 16.269771575927734, "gen_logits_mean": -2.9414453506469727, "gen_logits_min": -14.590185165405273, "gen_logits_std": 2.407431125640869, "gen_loss": 0.43526631593704224, "grad_norm": 0.8508672760965622, "learning_rate": 1.946938775510204e-05, "loss": 0.4304, "mean_copy_accuracy": 0.9753465801477432, "mean_gen_accuracy": 0.8438848555088043, "mean_token_accuracy": 0.8735353797674179, "num_tokens": 129145293.0, "sample_num_tokens": 8548.25, "step": 478, "total_num_tokens": 129179486.0, "z_loss": 0.015345558524131775 }, { "copy_logits_max": 22.662355422973633, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.75, "epoch": 0.09782997191728364, "gen_logits_max": 15.75122356414795, "gen_logits_mean": -2.714510440826416, "gen_logits_min": -14.632558822631836, "gen_logits_std": 2.4648497104644775, "gen_loss": 0.4513986110687256, "grad_norm": 1.089965011283686, "learning_rate": 1.9510204081632654e-05, "loss": 0.4327, "mean_copy_accuracy": 0.976903647184372, "mean_gen_accuracy": 0.84248086810112, "mean_token_accuracy": 0.8794680535793304, "num_tokens": 129429371.0, "sample_num_tokens": 7735.75, "step": 479, "total_num_tokens": 129460314.0, "z_loss": 0.021231452003121376 }, { "copy_logits_max": 26.363262176513672, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.0, "epoch": 0.09803420985448047, "gen_logits_max": 17.114791870117188, "gen_logits_mean": -3.119079351425171, "gen_logits_min": -15.38664436340332, "gen_logits_std": 2.433218002319336, "gen_loss": 0.43321093916893005, "grad_norm": 0.8488666371152749, "learning_rate": 1.9551020408163264e-05, "loss": 0.4496, "mean_copy_accuracy": 0.9762459099292755, "mean_gen_accuracy": 0.83719901740551, "mean_token_accuracy": 0.8693191409111023, "num_tokens": 129681709.0, "sample_num_tokens": 8807.75, "step": 480, "total_num_tokens": 129716940.0, "z_loss": 0.01904066652059555 }, { "copy_logits_max": 27.114992141723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.625, "epoch": 0.0982384477916773, "gen_logits_max": 17.613840103149414, "gen_logits_mean": -3.006503105163574, "gen_logits_min": -14.684762001037598, "gen_logits_std": 2.4065680503845215, "gen_loss": 0.40513086318969727, "grad_norm": 1.1361690826035962, "learning_rate": 1.9591836734693877e-05, "loss": 0.4582, "mean_copy_accuracy": 0.976023718714714, "mean_gen_accuracy": 0.8341865539550781, "mean_token_accuracy": 0.8685290962457657, "num_tokens": 129952978.0, "sample_num_tokens": 9517.0, "step": 481, "total_num_tokens": 129991046.0, "z_loss": 0.022916633635759354 }, { "copy_logits_max": 28.156497955322266, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.5625, "epoch": 0.09844268572887414, "gen_logits_max": 17.086517333984375, "gen_logits_mean": -3.2678515911102295, "gen_logits_min": -14.926509857177734, "gen_logits_std": 2.3872382640838623, "gen_loss": 0.39236772060394287, "grad_norm": 0.9206968307468975, "learning_rate": 1.963265306122449e-05, "loss": 0.4377, "mean_copy_accuracy": 0.9800224900245667, "mean_gen_accuracy": 0.8412295281887054, "mean_token_accuracy": 0.8756716847419739, "num_tokens": 130217162.0, "sample_num_tokens": 7963.0, "step": 482, "total_num_tokens": 130249014.0, "z_loss": 0.020079368725419044 }, { "copy_logits_max": 26.172134399414062, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.75, "epoch": 0.09864692366607097, "gen_logits_max": 16.532028198242188, "gen_logits_mean": -3.126394748687744, "gen_logits_min": -14.907726287841797, "gen_logits_std": 2.44779634475708, "gen_loss": 0.4206666946411133, "grad_norm": 1.1157472362483676, "learning_rate": 1.9673469387755104e-05, "loss": 0.4276, "mean_copy_accuracy": 0.9759396612644196, "mean_gen_accuracy": 0.8441070020198822, "mean_token_accuracy": 0.8767605423927307, "num_tokens": 130493755.0, "sample_num_tokens": 9037.25, "step": 483, "total_num_tokens": 130529904.0, "z_loss": 0.02433663234114647 }, { "copy_logits_max": 23.68947982788086, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.5625, "epoch": 0.0988511616032678, "gen_logits_max": 16.358076095581055, "gen_logits_mean": -3.426077127456665, "gen_logits_min": -15.767790794372559, "gen_logits_std": 2.472179889678955, "gen_loss": 0.428628146648407, "grad_norm": 1.0904030903066224, "learning_rate": 1.9714285714285714e-05, "loss": 0.4447, "mean_copy_accuracy": 0.9765586107969284, "mean_gen_accuracy": 0.8361777365207672, "mean_token_accuracy": 0.8755202442407608, "num_tokens": 130788038.0, "sample_num_tokens": 7522.5, "step": 484, "total_num_tokens": 130818128.0, "z_loss": 0.020577406510710716 }, { "copy_logits_max": 26.33547592163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.5625, "epoch": 0.09905539954046465, "gen_logits_max": 16.864517211914062, "gen_logits_mean": -3.2536792755126953, "gen_logits_min": -15.42702579498291, "gen_logits_std": 2.447812557220459, "gen_loss": 0.45236828923225403, "grad_norm": 1.0229953715628135, "learning_rate": 1.9755102040816328e-05, "loss": 0.4481, "mean_copy_accuracy": 0.9827834218740463, "mean_gen_accuracy": 0.8392585664987564, "mean_token_accuracy": 0.8740202188491821, "num_tokens": 131060877.0, "sample_num_tokens": 7985.75, "step": 485, "total_num_tokens": 131092820.0, "z_loss": 0.024073516950011253 }, { "copy_logits_max": 23.84789276123047, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.375, "epoch": 0.09925963747766148, "gen_logits_max": 16.156848907470703, "gen_logits_mean": -3.260021924972534, "gen_logits_min": -15.127318382263184, "gen_logits_std": 2.3793227672576904, "gen_loss": 0.39977020025253296, "grad_norm": 0.9654501415380736, "learning_rate": 1.9795918367346938e-05, "loss": 0.4346, "mean_copy_accuracy": 0.9795296341180801, "mean_gen_accuracy": 0.8417983502149582, "mean_token_accuracy": 0.8743846565485001, "num_tokens": 131352215.0, "sample_num_tokens": 8213.75, "step": 486, "total_num_tokens": 131385070.0, "z_loss": 0.019514068961143494 }, { "copy_logits_max": 30.636215209960938, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.75, "epoch": 0.09946387541485831, "gen_logits_max": 17.173322677612305, "gen_logits_mean": -3.0475645065307617, "gen_logits_min": -14.878989219665527, "gen_logits_std": 2.408013343811035, "gen_loss": 0.3854620158672333, "grad_norm": 1.4784679029419245, "learning_rate": 1.983673469387755e-05, "loss": 0.4204, "mean_copy_accuracy": 0.9789044559001923, "mean_gen_accuracy": 0.8442526161670685, "mean_token_accuracy": 0.8819211572408676, "num_tokens": 131634984.0, "sample_num_tokens": 8527.5, "step": 487, "total_num_tokens": 131669094.0, "z_loss": 0.024442091584205627 }, { "copy_logits_max": 20.708707809448242, "copy_logits_min": -750000000.0, "copy_num_tokens": 285.125, "epoch": 0.09966811335205514, "gen_logits_max": 16.937999725341797, "gen_logits_mean": -3.2624664306640625, "gen_logits_min": -15.114880561828613, "gen_logits_std": 2.3788297176361084, "gen_loss": 0.4414452910423279, "grad_norm": 0.8645697850029984, "learning_rate": 1.9877551020408165e-05, "loss": 0.4308, "mean_copy_accuracy": 0.9761975854635239, "mean_gen_accuracy": 0.8462177067995071, "mean_token_accuracy": 0.8703841269016266, "num_tokens": 131886837.0, "sample_num_tokens": 7185.25, "step": 488, "total_num_tokens": 131915578.0, "z_loss": 0.013608853332698345 }, { "copy_logits_max": 17.810935974121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.0, "epoch": 0.09987235128925198, "gen_logits_max": 17.30438232421875, "gen_logits_mean": -3.0558271408081055, "gen_logits_min": -14.181005477905273, "gen_logits_std": 2.305333137512207, "gen_loss": 0.42289450764656067, "grad_norm": 0.962130922161313, "learning_rate": 1.9918367346938775e-05, "loss": 0.4299, "mean_copy_accuracy": 0.9809904992580414, "mean_gen_accuracy": 0.8371168375015259, "mean_token_accuracy": 0.876195102930069, "num_tokens": 132181827.0, "sample_num_tokens": 10868.25, "step": 489, "total_num_tokens": 132225300.0, "z_loss": 0.013513848185539246 }, { "copy_logits_max": 24.002422332763672, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.8125, "epoch": 0.10007658922644881, "gen_logits_max": 17.13996696472168, "gen_logits_mean": -3.3829243183135986, "gen_logits_min": -14.938093185424805, "gen_logits_std": 2.3938913345336914, "gen_loss": 0.4152851700782776, "grad_norm": 0.7257061839448667, "learning_rate": 1.9959183673469388e-05, "loss": 0.4545, "mean_copy_accuracy": 0.9769619405269623, "mean_gen_accuracy": 0.8363576382398605, "mean_token_accuracy": 0.8674820065498352, "num_tokens": 132454633.0, "sample_num_tokens": 8668.25, "step": 490, "total_num_tokens": 132489306.0, "z_loss": 0.02057472988963127 }, { "copy_logits_max": 25.099319458007812, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.0625, "epoch": 0.10028082716364564, "gen_logits_max": 17.39853858947754, "gen_logits_mean": -2.869994640350342, "gen_logits_min": -14.764383316040039, "gen_logits_std": 2.400132179260254, "gen_loss": 0.3968225121498108, "grad_norm": 0.9363667916655515, "learning_rate": 1.9999999999999998e-05, "loss": 0.4455, "mean_copy_accuracy": 0.9756323248147964, "mean_gen_accuracy": 0.8370469212532043, "mean_token_accuracy": 0.8693067729473114, "num_tokens": 132713119.0, "sample_num_tokens": 9124.75, "step": 491, "total_num_tokens": 132749618.0, "z_loss": 0.01979433000087738 }, { "copy_logits_max": 18.883953094482422, "copy_logits_min": -687500032.0, "copy_num_tokens": 319.3125, "epoch": 0.10048506510084249, "gen_logits_max": 16.892013549804688, "gen_logits_mean": -3.136322259902954, "gen_logits_min": -14.873543739318848, "gen_logits_std": 2.372684955596924, "gen_loss": 0.4301885664463043, "grad_norm": 0.7565567893714359, "learning_rate": 2.0040816326530615e-05, "loss": 0.4303, "mean_copy_accuracy": 0.9807142466306686, "mean_gen_accuracy": 0.8421747386455536, "mean_token_accuracy": 0.8746339082717896, "num_tokens": 132976955.0, "sample_num_tokens": 7314.75, "step": 492, "total_num_tokens": 133006214.0, "z_loss": 0.013711897656321526 }, { "copy_logits_max": 24.937665939331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.6875, "epoch": 0.10068930303803932, "gen_logits_max": 16.943920135498047, "gen_logits_mean": -3.030362129211426, "gen_logits_min": -15.56597900390625, "gen_logits_std": 2.433605432510376, "gen_loss": 0.37119734287261963, "grad_norm": 0.894802964166952, "learning_rate": 2.0081632653061225e-05, "loss": 0.4312, "mean_copy_accuracy": 0.980818897485733, "mean_gen_accuracy": 0.8420039117336273, "mean_token_accuracy": 0.8717012405395508, "num_tokens": 133248356.0, "sample_num_tokens": 8112.5, "step": 493, "total_num_tokens": 133280806.0, "z_loss": 0.020301029086112976 }, { "copy_logits_max": 28.705482482910156, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.5625, "epoch": 0.10089354097523615, "gen_logits_max": 17.10161590576172, "gen_logits_mean": -2.920842170715332, "gen_logits_min": -15.037137985229492, "gen_logits_std": 2.4617676734924316, "gen_loss": 0.43018683791160583, "grad_norm": 1.1734813039061156, "learning_rate": 2.012244897959184e-05, "loss": 0.4292, "mean_copy_accuracy": 0.975728839635849, "mean_gen_accuracy": 0.8426623046398163, "mean_token_accuracy": 0.8780307024717331, "num_tokens": 133517247.0, "sample_num_tokens": 8132.75, "step": 494, "total_num_tokens": 133549778.0, "z_loss": 0.023718353360891342 }, { "copy_logits_max": 35.654319763183594, "copy_logits_min": -625000064.0, "copy_num_tokens": 623.625, "epoch": 0.10109777891243299, "gen_logits_max": 17.05075454711914, "gen_logits_mean": -2.757216453552246, "gen_logits_min": -15.023866653442383, "gen_logits_std": 2.50869083404541, "gen_loss": 0.4527103900909424, "grad_norm": 0.8363858068359953, "learning_rate": 2.016326530612245e-05, "loss": 0.4483, "mean_copy_accuracy": 0.9784415513277054, "mean_gen_accuracy": 0.8366783112287521, "mean_token_accuracy": 0.87373848259449, "num_tokens": 133781330.0, "sample_num_tokens": 8741.5, "step": 495, "total_num_tokens": 133816296.0, "z_loss": 0.030848784372210503 }, { "copy_logits_max": 22.534292221069336, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.4375, "epoch": 0.10130201684962982, "gen_logits_max": 16.937274932861328, "gen_logits_mean": -3.0397391319274902, "gen_logits_min": -14.697940826416016, "gen_logits_std": 2.4059581756591797, "gen_loss": 0.4864901006221771, "grad_norm": 0.7754782065014286, "learning_rate": 2.0204081632653062e-05, "loss": 0.437, "mean_copy_accuracy": 0.9790873527526855, "mean_gen_accuracy": 0.8399921506643295, "mean_token_accuracy": 0.8750240355730057, "num_tokens": 134061401.0, "sample_num_tokens": 8576.75, "step": 496, "total_num_tokens": 134095708.0, "z_loss": 0.015549305826425552 }, { "copy_logits_max": 24.52834701538086, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.8125, "epoch": 0.10150625478682665, "gen_logits_max": 16.024192810058594, "gen_logits_mean": -3.353482246398926, "gen_logits_min": -15.39011001586914, "gen_logits_std": 2.4544153213500977, "gen_loss": 0.42700085043907166, "grad_norm": 1.4451714371572995, "learning_rate": 2.0244897959183672e-05, "loss": 0.4461, "mean_copy_accuracy": 0.978845089673996, "mean_gen_accuracy": 0.8377330601215363, "mean_token_accuracy": 0.8692941963672638, "num_tokens": 134316648.0, "sample_num_tokens": 8393.5, "step": 497, "total_num_tokens": 134350222.0, "z_loss": 0.020952114835381508 }, { "copy_logits_max": 16.643917083740234, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.9375, "epoch": 0.1017104927240235, "gen_logits_max": 15.608198165893555, "gen_logits_mean": -3.6234614849090576, "gen_logits_min": -15.236566543579102, "gen_logits_std": 2.3736538887023926, "gen_loss": 0.38557472825050354, "grad_norm": 0.7100750279994625, "learning_rate": 2.028571428571429e-05, "loss": 0.4035, "mean_copy_accuracy": 0.9797282665967941, "mean_gen_accuracy": 0.8491508066654205, "mean_token_accuracy": 0.8803444057703018, "num_tokens": 134592199.0, "sample_num_tokens": 7653.25, "step": 498, "total_num_tokens": 134622812.0, "z_loss": 0.012270918115973473 }, { "copy_logits_max": 21.66104507446289, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.875, "epoch": 0.10191473066122032, "gen_logits_max": 16.755430221557617, "gen_logits_mean": -3.408745288848877, "gen_logits_min": -15.316323280334473, "gen_logits_std": 2.3929879665374756, "gen_loss": 0.4121951162815094, "grad_norm": 0.7179402047378242, "learning_rate": 2.03265306122449e-05, "loss": 0.4362, "mean_copy_accuracy": 0.9821439385414124, "mean_gen_accuracy": 0.8402703106403351, "mean_token_accuracy": 0.8723578155040741, "num_tokens": 134854583.0, "sample_num_tokens": 7226.75, "step": 499, "total_num_tokens": 134883490.0, "z_loss": 0.015823040157556534 }, { "epoch": 0.10211896859841715, "grad_norm": 0.7487899786202826, "learning_rate": 2.036734693877551e-05, "loss": 0.419, "step": 500 }, { "epoch": 0.10211896859841715, "eval_copy_logits_max": 21.41415023803711, "eval_copy_logits_min": -21.542522430419922, "eval_gen_logits_max": 18.596118927001953, "eval_gen_logits_mean": -4.0564422607421875, "eval_gen_logits_min": -14.900343894958496, "eval_gen_logits_std": 2.244217872619629, "eval_gen_loss": 0.4639369249343872, "eval_loss": 0.4652155637741089, "eval_mean_copy_accuracy": 0.9851415753364563, "eval_mean_gen_accuracy": 0.8317129909992218, "eval_mean_token_accuracy": 0.851209819316864, "eval_num_tokens": 135167706.0, "eval_runtime": 0.7944, "eval_samples_per_second": 10.071, "eval_steps_per_second": 2.518, "eval_total_num_tokens": 135167706.0, "eval_z_loss": 0.008888136595487595, "step": 500 }, { "copy_logits_max": 23.926565170288086, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.1875, "epoch": 0.10232320653561398, "gen_logits_max": 16.449031829833984, "gen_logits_mean": -3.029616117477417, "gen_logits_min": -15.08946418762207, "gen_logits_std": 2.4540348052978516, "gen_loss": 0.4431697428226471, "grad_norm": 0.6987486433849718, "learning_rate": 2.0408163265306123e-05, "loss": 0.418, "mean_copy_accuracy": 0.9817142114043236, "mean_gen_accuracy": 0.8425673916935921, "mean_token_accuracy": 0.8768282905220985, "num_tokens": 135392088.0, "sample_num_tokens": 8141.0, "step": 501, "total_num_tokens": 135424652.0, "z_loss": 0.02136317268013954 }, { "copy_logits_max": 22.72034454345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.5625, "epoch": 0.10252744447281083, "gen_logits_max": 16.50791358947754, "gen_logits_mean": -3.1766819953918457, "gen_logits_min": -15.45854377746582, "gen_logits_std": 2.4175539016723633, "gen_loss": 0.4012428820133209, "grad_norm": 0.8654860769718351, "learning_rate": 2.0448979591836733e-05, "loss": 0.4483, "mean_copy_accuracy": 0.9817380607128143, "mean_gen_accuracy": 0.8367080837488174, "mean_token_accuracy": 0.8712315261363983, "num_tokens": 135660598.0, "sample_num_tokens": 7798.0, "step": 502, "total_num_tokens": 135691790.0, "z_loss": 0.01511091273277998 }, { "copy_logits_max": 23.545082092285156, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.8125, "epoch": 0.10273168241000766, "gen_logits_max": 16.666717529296875, "gen_logits_mean": -3.27571439743042, "gen_logits_min": -15.23166561126709, "gen_logits_std": 2.4534244537353516, "gen_loss": 0.39503413438796997, "grad_norm": 1.1863114956178482, "learning_rate": 2.048979591836735e-05, "loss": 0.4335, "mean_copy_accuracy": 0.9761261940002441, "mean_gen_accuracy": 0.8379080891609192, "mean_token_accuracy": 0.8756406754255295, "num_tokens": 135933002.0, "sample_num_tokens": 8294.5, "step": 503, "total_num_tokens": 135966180.0, "z_loss": 0.018818378448486328 }, { "copy_logits_max": 28.301057815551758, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.4375, "epoch": 0.10293592034720449, "gen_logits_max": 17.081796646118164, "gen_logits_mean": -3.3037848472595215, "gen_logits_min": -15.540096282958984, "gen_logits_std": 2.453444719314575, "gen_loss": 0.4354245066642761, "grad_norm": 0.8047989957954554, "learning_rate": 2.053061224489796e-05, "loss": 0.4392, "mean_copy_accuracy": 0.9838823974132538, "mean_gen_accuracy": 0.8346990197896957, "mean_token_accuracy": 0.8698538094758987, "num_tokens": 136192281.0, "sample_num_tokens": 8853.75, "step": 504, "total_num_tokens": 136227696.0, "z_loss": 0.021305138245224953 }, { "copy_logits_max": 20.150028228759766, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.3125, "epoch": 0.10314015828440133, "gen_logits_max": 16.248844146728516, "gen_logits_mean": -3.303290843963623, "gen_logits_min": -15.554738998413086, "gen_logits_std": 2.4207773208618164, "gen_loss": 0.42056170105934143, "grad_norm": 1.149679667771478, "learning_rate": 2.0571428571428573e-05, "loss": 0.4361, "mean_copy_accuracy": 0.9803516417741776, "mean_gen_accuracy": 0.8403427451848984, "mean_token_accuracy": 0.8713844865560532, "num_tokens": 136460832.0, "sample_num_tokens": 8698.5, "step": 505, "total_num_tokens": 136495626.0, "z_loss": 0.017680060118436813 }, { "copy_logits_max": 22.892620086669922, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.75, "epoch": 0.10334439622159816, "gen_logits_max": 16.37039566040039, "gen_logits_mean": -3.3004536628723145, "gen_logits_min": -15.407103538513184, "gen_logits_std": 2.402092933654785, "gen_loss": 0.43560948967933655, "grad_norm": 1.0121864567174401, "learning_rate": 2.0612244897959183e-05, "loss": 0.4381, "mean_copy_accuracy": 0.9795763790607452, "mean_gen_accuracy": 0.8338232487440109, "mean_token_accuracy": 0.8731933981180191, "num_tokens": 136738051.0, "sample_num_tokens": 9013.75, "step": 506, "total_num_tokens": 136774106.0, "z_loss": 0.019937671720981598 }, { "copy_logits_max": 23.088184356689453, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.625, "epoch": 0.10354863415879499, "gen_logits_max": 15.849882125854492, "gen_logits_mean": -3.580071210861206, "gen_logits_min": -15.832698822021484, "gen_logits_std": 2.4896106719970703, "gen_loss": 0.4468947649002075, "grad_norm": 1.1176108028100706, "learning_rate": 2.0653061224489796e-05, "loss": 0.4408, "mean_copy_accuracy": 0.9816597402095795, "mean_gen_accuracy": 0.8353509902954102, "mean_token_accuracy": 0.8750645518302917, "num_tokens": 137022028.0, "sample_num_tokens": 8289.0, "step": 507, "total_num_tokens": 137055184.0, "z_loss": 0.02118520252406597 }, { "copy_logits_max": 19.17026138305664, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.125, "epoch": 0.10375287209599184, "gen_logits_max": 15.342130661010742, "gen_logits_mean": -3.280813694000244, "gen_logits_min": -15.710935592651367, "gen_logits_std": 2.473942995071411, "gen_loss": 0.38419926166534424, "grad_norm": 0.752712960749448, "learning_rate": 2.069387755102041e-05, "loss": 0.4271, "mean_copy_accuracy": 0.9824277311563492, "mean_gen_accuracy": 0.8437605053186417, "mean_token_accuracy": 0.875744104385376, "num_tokens": 137289806.0, "sample_num_tokens": 8216.0, "step": 508, "total_num_tokens": 137322670.0, "z_loss": 0.018690429627895355 }, { "copy_logits_max": 32.2230224609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.9375, "epoch": 0.10395711003318867, "gen_logits_max": 18.153121948242188, "gen_logits_mean": -3.1674065589904785, "gen_logits_min": -15.22914981842041, "gen_logits_std": 2.409574508666992, "gen_loss": 0.4263927936553955, "grad_norm": 0.9952915740593888, "learning_rate": 2.0734693877551023e-05, "loss": 0.4208, "mean_copy_accuracy": 0.9775222837924957, "mean_gen_accuracy": 0.8446630388498306, "mean_token_accuracy": 0.8749034851789474, "num_tokens": 137542508.0, "sample_num_tokens": 9302.5, "step": 509, "total_num_tokens": 137579718.0, "z_loss": 0.022865546867251396 }, { "copy_logits_max": 25.113197326660156, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.3125, "epoch": 0.1041613479703855, "gen_logits_max": 17.645549774169922, "gen_logits_mean": -3.0333902835845947, "gen_logits_min": -14.849178314208984, "gen_logits_std": 2.4423234462738037, "gen_loss": 0.4622567892074585, "grad_norm": 0.8533013662413452, "learning_rate": 2.0775510204081633e-05, "loss": 0.4351, "mean_copy_accuracy": 0.9785510003566742, "mean_gen_accuracy": 0.842089906334877, "mean_token_accuracy": 0.8741330355405807, "num_tokens": 137834252.0, "sample_num_tokens": 9397.5, "step": 510, "total_num_tokens": 137871842.0, "z_loss": 0.02020561508834362 }, { "copy_logits_max": 22.679424285888672, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0, "epoch": 0.10436558590758234, "gen_logits_max": 16.347257614135742, "gen_logits_mean": -3.3507440090179443, "gen_logits_min": -15.198780059814453, "gen_logits_std": 2.4780333042144775, "gen_loss": 0.3991682827472687, "grad_norm": 0.7731401466248813, "learning_rate": 2.0816326530612247e-05, "loss": 0.4814, "mean_copy_accuracy": 0.979042187333107, "mean_gen_accuracy": 0.8251669406890869, "mean_token_accuracy": 0.8571885377168655, "num_tokens": 138086589.0, "sample_num_tokens": 7649.25, "step": 511, "total_num_tokens": 138117186.0, "z_loss": 0.019680071622133255 }, { "copy_logits_max": 24.72515869140625, "copy_logits_min": -687500032.0, "copy_num_tokens": 603.3125, "epoch": 0.10456982384477917, "gen_logits_max": 16.566211700439453, "gen_logits_mean": -3.420724630355835, "gen_logits_min": -15.481595039367676, "gen_logits_std": 2.4839534759521484, "gen_loss": 0.37592220306396484, "grad_norm": 1.113273839121249, "learning_rate": 2.0857142857142857e-05, "loss": 0.4294, "mean_copy_accuracy": 0.9808582663536072, "mean_gen_accuracy": 0.8388697803020477, "mean_token_accuracy": 0.877098798751831, "num_tokens": 138363803.0, "sample_num_tokens": 9893.75, "step": 512, "total_num_tokens": 138403378.0, "z_loss": 0.02227957174181938 }, { "copy_logits_max": 23.3841609954834, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.75, "epoch": 0.104774061781976, "gen_logits_max": 16.501657485961914, "gen_logits_mean": -3.1617064476013184, "gen_logits_min": -16.020795822143555, "gen_logits_std": 2.4938831329345703, "gen_loss": 0.4256552457809448, "grad_norm": 1.2578470523841254, "learning_rate": 2.0897959183673467e-05, "loss": 0.451, "mean_copy_accuracy": 0.9760538190603256, "mean_gen_accuracy": 0.8294438272714615, "mean_token_accuracy": 0.8695808053016663, "num_tokens": 138647599.0, "sample_num_tokens": 8992.25, "step": 513, "total_num_tokens": 138683568.0, "z_loss": 0.02093818411231041 }, { "copy_logits_max": 22.22808837890625, "copy_logits_min": -687500032.0, "copy_num_tokens": 275.9375, "epoch": 0.10497829971917283, "gen_logits_max": 17.203433990478516, "gen_logits_mean": -3.7377090454101562, "gen_logits_min": -15.82944107055664, "gen_logits_std": 2.3820462226867676, "gen_loss": 0.36952346563339233, "grad_norm": 0.9793902225647554, "learning_rate": 2.0938775510204084e-05, "loss": 0.4014, "mean_copy_accuracy": 0.983138844370842, "mean_gen_accuracy": 0.8493269979953766, "mean_token_accuracy": 0.8808395564556122, "num_tokens": 138942762.0, "sample_num_tokens": 7439.0, "step": 514, "total_num_tokens": 138972518.0, "z_loss": 0.01471974141895771 }, { "copy_logits_max": 26.88024139404297, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.8125, "epoch": 0.10518253765636967, "gen_logits_max": 16.965633392333984, "gen_logits_mean": -3.398674964904785, "gen_logits_min": -15.32174301147461, "gen_logits_std": 2.417769432067871, "gen_loss": 0.3497076630592346, "grad_norm": 0.8810959309067756, "learning_rate": 2.0979591836734694e-05, "loss": 0.4304, "mean_copy_accuracy": 0.9825234711170197, "mean_gen_accuracy": 0.8399619460105896, "mean_token_accuracy": 0.8747730404138565, "num_tokens": 139202557.0, "sample_num_tokens": 8951.25, "step": 515, "total_num_tokens": 139238362.0, "z_loss": 0.022200096398591995 }, { "copy_logits_max": 25.648983001708984, "copy_logits_min": -687500032.0, "copy_num_tokens": 372.1875, "epoch": 0.1053867755935665, "gen_logits_max": 16.833621978759766, "gen_logits_mean": -3.6104536056518555, "gen_logits_min": -15.564325332641602, "gen_logits_std": 2.450148105621338, "gen_loss": 0.4342065155506134, "grad_norm": 0.8023516210787031, "learning_rate": 2.1020408163265307e-05, "loss": 0.4349, "mean_copy_accuracy": 0.9805752784013748, "mean_gen_accuracy": 0.8368856608867645, "mean_token_accuracy": 0.8736489713191986, "num_tokens": 139468210.0, "sample_num_tokens": 7445.0, "step": 516, "total_num_tokens": 139497990.0, "z_loss": 0.018773270770907402 }, { "copy_logits_max": 31.519166946411133, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.5, "epoch": 0.10559101353076333, "gen_logits_max": 17.74477767944336, "gen_logits_mean": -3.2753705978393555, "gen_logits_min": -15.433873176574707, "gen_logits_std": 2.42535400390625, "gen_loss": 0.38198933005332947, "grad_norm": 0.9567601543494181, "learning_rate": 2.1061224489795917e-05, "loss": 0.399, "mean_copy_accuracy": 0.9776050597429276, "mean_gen_accuracy": 0.8496767729520798, "mean_token_accuracy": 0.882216215133667, "num_tokens": 139737803.0, "sample_num_tokens": 10251.75, "step": 517, "total_num_tokens": 139778810.0, "z_loss": 0.02348054200410843 }, { "copy_logits_max": 23.9807071685791, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.8125, "epoch": 0.10579525146796018, "gen_logits_max": 16.72959327697754, "gen_logits_mean": -3.6246261596679688, "gen_logits_min": -15.319657325744629, "gen_logits_std": 2.4103705883026123, "gen_loss": 0.37735503911972046, "grad_norm": 0.8930275167909045, "learning_rate": 2.110204081632653e-05, "loss": 0.399, "mean_copy_accuracy": 0.9802064299583435, "mean_gen_accuracy": 0.8513117134571075, "mean_token_accuracy": 0.883233442902565, "num_tokens": 140005477.0, "sample_num_tokens": 9412.75, "step": 518, "total_num_tokens": 140043128.0, "z_loss": 0.02070767432451248 }, { "copy_logits_max": 16.497407913208008, "copy_logits_min": -750000000.0, "copy_num_tokens": 222.4375, "epoch": 0.10599948940515701, "gen_logits_max": 16.251934051513672, "gen_logits_mean": -3.908369302749634, "gen_logits_min": -15.616979598999023, "gen_logits_std": 2.4184834957122803, "gen_loss": 0.37876179814338684, "grad_norm": 0.8494204958004227, "learning_rate": 2.1142857142857144e-05, "loss": 0.4409, "mean_copy_accuracy": 0.9796518683433533, "mean_gen_accuracy": 0.8341909945011139, "mean_token_accuracy": 0.8697456866502762, "num_tokens": 140276628.0, "sample_num_tokens": 6892.5, "step": 519, "total_num_tokens": 140304198.0, "z_loss": 0.01085065770894289 }, { "copy_logits_max": 27.316633224487305, "copy_logits_min": -750000000.0, "copy_num_tokens": 745.9375, "epoch": 0.10620372734235384, "gen_logits_max": 15.951229095458984, "gen_logits_mean": -3.3353753089904785, "gen_logits_min": -15.518217086791992, "gen_logits_std": 2.5265932083129883, "gen_loss": 0.3815028667449951, "grad_norm": 0.8477720703465369, "learning_rate": 2.1183673469387758e-05, "loss": 0.4276, "mean_copy_accuracy": 0.9801302999258041, "mean_gen_accuracy": 0.8399046808481216, "mean_token_accuracy": 0.8752328902482986, "num_tokens": 140542698.0, "sample_num_tokens": 9618.5, "step": 520, "total_num_tokens": 140581172.0, "z_loss": 0.028843341395258904 }, { "copy_logits_max": 23.3820858001709, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.0, "epoch": 0.10640796527955068, "gen_logits_max": 16.76519012451172, "gen_logits_mean": -3.451228141784668, "gen_logits_min": -15.110047340393066, "gen_logits_std": 2.425079345703125, "gen_loss": 0.409316748380661, "grad_norm": 0.8883264378894743, "learning_rate": 2.1224489795918368e-05, "loss": 0.3974, "mean_copy_accuracy": 0.9791112691164017, "mean_gen_accuracy": 0.8493572920560837, "mean_token_accuracy": 0.8833377510309219, "num_tokens": 140841535.0, "sample_num_tokens": 8973.75, "step": 521, "total_num_tokens": 140877430.0, "z_loss": 0.018765706568956375 }, { "copy_logits_max": 22.98749351501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.1875, "epoch": 0.10661220321674751, "gen_logits_max": 16.3599910736084, "gen_logits_mean": -3.534513473510742, "gen_logits_min": -15.519098281860352, "gen_logits_std": 2.49080753326416, "gen_loss": 0.3905045986175537, "grad_norm": 1.2262451169901487, "learning_rate": 2.126530612244898e-05, "loss": 0.3938, "mean_copy_accuracy": 0.9798448532819748, "mean_gen_accuracy": 0.8523607105016708, "mean_token_accuracy": 0.8865721374750137, "num_tokens": 141123593.0, "sample_num_tokens": 8458.25, "step": 522, "total_num_tokens": 141157426.0, "z_loss": 0.020479638129472733 }, { "copy_logits_max": 22.117815017700195, "copy_logits_min": -687500032.0, "copy_num_tokens": 336.25, "epoch": 0.10681644115394434, "gen_logits_max": 16.67926788330078, "gen_logits_mean": -3.5574707984924316, "gen_logits_min": -15.416069030761719, "gen_logits_std": 2.44197940826416, "gen_loss": 0.40280449390411377, "grad_norm": 1.050344990667537, "learning_rate": 2.130612244897959e-05, "loss": 0.4249, "mean_copy_accuracy": 0.9796439856290817, "mean_gen_accuracy": 0.8427259922027588, "mean_token_accuracy": 0.8747980147600174, "num_tokens": 141382696.0, "sample_num_tokens": 6828.0, "step": 523, "total_num_tokens": 141410008.0, "z_loss": 0.017436161637306213 }, { "copy_logits_max": 26.583984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.25, "epoch": 0.10702067909114119, "gen_logits_max": 16.521644592285156, "gen_logits_mean": -3.5433201789855957, "gen_logits_min": -15.125236511230469, "gen_logits_std": 2.3866066932678223, "gen_loss": 0.4263259172439575, "grad_norm": 1.1393785065699569, "learning_rate": 2.1346938775510205e-05, "loss": 0.4068, "mean_copy_accuracy": 0.9780340790748596, "mean_gen_accuracy": 0.8516372442245483, "mean_token_accuracy": 0.8842340260744095, "num_tokens": 141652909.0, "sample_num_tokens": 8956.75, "step": 524, "total_num_tokens": 141688736.0, "z_loss": 0.01816697046160698 }, { "copy_logits_max": 23.051002502441406, "copy_logits_min": -624999936.0, "copy_num_tokens": 420.8125, "epoch": 0.10722491702833802, "gen_logits_max": 16.61756706237793, "gen_logits_mean": -3.415506601333618, "gen_logits_min": -15.226982116699219, "gen_logits_std": 2.4625296592712402, "gen_loss": 0.38830000162124634, "grad_norm": 0.8251237378303248, "learning_rate": 2.1387755102040818e-05, "loss": 0.4193, "mean_copy_accuracy": 0.9815405160188675, "mean_gen_accuracy": 0.8443044573068619, "mean_token_accuracy": 0.8765449970960617, "num_tokens": 141924651.0, "sample_num_tokens": 8276.25, "step": 525, "total_num_tokens": 141957756.0, "z_loss": 0.015719501301646233 }, { "copy_logits_max": 20.576129913330078, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.1875, "epoch": 0.10742915496553485, "gen_logits_max": 16.45733642578125, "gen_logits_mean": -3.3076255321502686, "gen_logits_min": -15.19163703918457, "gen_logits_std": 2.417506217956543, "gen_loss": 0.4069259762763977, "grad_norm": 1.2528820716091333, "learning_rate": 2.1428571428571428e-05, "loss": 0.4241, "mean_copy_accuracy": 0.980309471487999, "mean_gen_accuracy": 0.8450740873813629, "mean_token_accuracy": 0.8807267248630524, "num_tokens": 142207309.0, "sample_num_tokens": 7059.25, "step": 526, "total_num_tokens": 142235546.0, "z_loss": 0.0166497640311718 }, { "copy_logits_max": 23.080245971679688, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.625, "epoch": 0.10763339290273168, "gen_logits_max": 15.64708137512207, "gen_logits_mean": -3.0635504722595215, "gen_logits_min": -15.168501853942871, "gen_logits_std": 2.491553783416748, "gen_loss": 0.38977086544036865, "grad_norm": 0.8280184818301614, "learning_rate": 2.146938775510204e-05, "loss": 0.4117, "mean_copy_accuracy": 0.9815048575401306, "mean_gen_accuracy": 0.846077486872673, "mean_token_accuracy": 0.8793938308954239, "num_tokens": 142457203.0, "sample_num_tokens": 7716.25, "step": 527, "total_num_tokens": 142488068.0, "z_loss": 0.022490959614515305 }, { "copy_logits_max": 28.006942749023438, "copy_logits_min": -687500032.0, "copy_num_tokens": 847.75, "epoch": 0.10783763083992852, "gen_logits_max": 16.845048904418945, "gen_logits_mean": -3.0008137226104736, "gen_logits_min": -15.33244800567627, "gen_logits_std": 2.5223708152770996, "gen_loss": 0.35902541875839233, "grad_norm": 2.0168577055642336, "learning_rate": 2.151020408163265e-05, "loss": 0.4392, "mean_copy_accuracy": 0.9702320396900177, "mean_gen_accuracy": 0.8408007621765137, "mean_token_accuracy": 0.873991996049881, "num_tokens": 142740884.0, "sample_num_tokens": 11536.0, "step": 528, "total_num_tokens": 142787028.0, "z_loss": 0.028881991282105446 }, { "copy_logits_max": 24.548110961914062, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.25, "epoch": 0.10804186877712535, "gen_logits_max": 16.448745727539062, "gen_logits_mean": -3.410520076751709, "gen_logits_min": -15.398771286010742, "gen_logits_std": 2.42348575592041, "gen_loss": 0.4091288447380066, "grad_norm": 1.0681867649475423, "learning_rate": 2.1551020408163265e-05, "loss": 0.444, "mean_copy_accuracy": 0.9830502420663834, "mean_gen_accuracy": 0.8364974558353424, "mean_token_accuracy": 0.8709289580583572, "num_tokens": 143025798.0, "sample_num_tokens": 7854.0, "step": 529, "total_num_tokens": 143057214.0, "z_loss": 0.018114782869815826 }, { "copy_logits_max": 23.097612380981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.9375, "epoch": 0.10824610671432218, "gen_logits_max": 16.375713348388672, "gen_logits_mean": -3.537332534790039, "gen_logits_min": -15.137831687927246, "gen_logits_std": 2.3473381996154785, "gen_loss": 0.4081419110298157, "grad_norm": 1.5512231779674865, "learning_rate": 2.159183673469388e-05, "loss": 0.4519, "mean_copy_accuracy": 0.981478750705719, "mean_gen_accuracy": 0.834267869591713, "mean_token_accuracy": 0.8708048909902573, "num_tokens": 143278073.0, "sample_num_tokens": 7920.75, "step": 530, "total_num_tokens": 143309756.0, "z_loss": 0.019288834184408188 }, { "copy_logits_max": 27.652849197387695, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.0, "epoch": 0.10845034465151902, "gen_logits_max": 16.10177993774414, "gen_logits_mean": -3.1619668006896973, "gen_logits_min": -15.26921272277832, "gen_logits_std": 2.4233479499816895, "gen_loss": 0.4553031921386719, "grad_norm": 1.1028380375580311, "learning_rate": 2.1632653061224492e-05, "loss": 0.446, "mean_copy_accuracy": 0.9788483828306198, "mean_gen_accuracy": 0.8338636159896851, "mean_token_accuracy": 0.8709557503461838, "num_tokens": 143534869.0, "sample_num_tokens": 8661.25, "step": 531, "total_num_tokens": 143569514.0, "z_loss": 0.020580841228365898 }, { "copy_logits_max": 26.676300048828125, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.6875, "epoch": 0.10865458258871585, "gen_logits_max": 16.304025650024414, "gen_logits_mean": -3.133467197418213, "gen_logits_min": -15.3504638671875, "gen_logits_std": 2.4383511543273926, "gen_loss": 0.3915192484855652, "grad_norm": 1.1493790768864818, "learning_rate": 2.1673469387755102e-05, "loss": 0.4373, "mean_copy_accuracy": 0.9781440645456314, "mean_gen_accuracy": 0.8416153192520142, "mean_token_accuracy": 0.870569258928299, "num_tokens": 143791493.0, "sample_num_tokens": 7089.75, "step": 532, "total_num_tokens": 143819852.0, "z_loss": 0.022376950830221176 }, { "copy_logits_max": 26.837209701538086, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.0625, "epoch": 0.10885882052591268, "gen_logits_max": 15.77700424194336, "gen_logits_mean": -3.420466661453247, "gen_logits_min": -15.551444053649902, "gen_logits_std": 2.504662275314331, "gen_loss": 0.3874800503253937, "grad_norm": 1.1557083107991504, "learning_rate": 2.1714285714285715e-05, "loss": 0.426, "mean_copy_accuracy": 0.9751425087451935, "mean_gen_accuracy": 0.8417644649744034, "mean_token_accuracy": 0.8758853077888489, "num_tokens": 144038466.0, "sample_num_tokens": 7561.5, "step": 533, "total_num_tokens": 144068712.0, "z_loss": 0.025482527911663055 }, { "copy_logits_max": 22.280126571655273, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.125, "epoch": 0.10906305846310953, "gen_logits_max": 15.348209381103516, "gen_logits_mean": -3.404865264892578, "gen_logits_min": -15.267444610595703, "gen_logits_std": 2.408503532409668, "gen_loss": 0.42530813813209534, "grad_norm": 1.2923931150363845, "learning_rate": 2.1755102040816326e-05, "loss": 0.4417, "mean_copy_accuracy": 0.9792134463787079, "mean_gen_accuracy": 0.835857704281807, "mean_token_accuracy": 0.873928889632225, "num_tokens": 144307808.0, "sample_num_tokens": 8007.0, "step": 534, "total_num_tokens": 144339836.0, "z_loss": 0.017594192177057266 }, { "copy_logits_max": 22.68326187133789, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.125, "epoch": 0.10926729640030636, "gen_logits_max": 16.013607025146484, "gen_logits_mean": -3.531289577484131, "gen_logits_min": -14.88382339477539, "gen_logits_std": 2.3995132446289062, "gen_loss": 0.3708063066005707, "grad_norm": 0.9105398965593109, "learning_rate": 2.1795918367346942e-05, "loss": 0.4274, "mean_copy_accuracy": 0.9819118082523346, "mean_gen_accuracy": 0.8435229063034058, "mean_token_accuracy": 0.8762421160936356, "num_tokens": 144594529.0, "sample_num_tokens": 9755.75, "step": 535, "total_num_tokens": 144633552.0, "z_loss": 0.01885722577571869 }, { "copy_logits_max": 25.418771743774414, "copy_logits_min": -625000000.0, "copy_num_tokens": 571.625, "epoch": 0.10947153433750319, "gen_logits_max": 16.30272102355957, "gen_logits_mean": -3.106433868408203, "gen_logits_min": -14.836494445800781, "gen_logits_std": 2.4137749671936035, "gen_loss": 0.44037070870399475, "grad_norm": 0.9718903199296138, "learning_rate": 2.1836734693877552e-05, "loss": 0.4517, "mean_copy_accuracy": 0.9790997803211212, "mean_gen_accuracy": 0.8342425227165222, "mean_token_accuracy": 0.8699020892381668, "num_tokens": 144855025.0, "sample_num_tokens": 8839.25, "step": 536, "total_num_tokens": 144890382.0, "z_loss": 0.020771749317646027 }, { "copy_logits_max": 24.732145309448242, "copy_logits_min": -687499968.0, "copy_num_tokens": 645.6875, "epoch": 0.10967577227470003, "gen_logits_max": 15.327028274536133, "gen_logits_mean": -3.209988594055176, "gen_logits_min": -14.789124488830566, "gen_logits_std": 2.489523410797119, "gen_loss": 0.4001104533672333, "grad_norm": 1.2265983277105734, "learning_rate": 2.1877551020408162e-05, "loss": 0.4149, "mean_copy_accuracy": 0.9755775183439255, "mean_gen_accuracy": 0.8439032882452011, "mean_token_accuracy": 0.8809266537427902, "num_tokens": 145117284.0, "sample_num_tokens": 9288.0, "step": 537, "total_num_tokens": 145154436.0, "z_loss": 0.0211520716547966 }, { "copy_logits_max": 20.543376922607422, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.3125, "epoch": 0.10988001021189686, "gen_logits_max": 16.679332733154297, "gen_logits_mean": -3.696892738342285, "gen_logits_min": -14.901172637939453, "gen_logits_std": 2.354820728302002, "gen_loss": 0.38647642731666565, "grad_norm": 0.856906046954026, "learning_rate": 2.1918367346938776e-05, "loss": 0.4174, "mean_copy_accuracy": 0.9803852885961533, "mean_gen_accuracy": 0.847447395324707, "mean_token_accuracy": 0.8779085725545883, "num_tokens": 145385288.0, "sample_num_tokens": 7224.5, "step": 538, "total_num_tokens": 145414186.0, "z_loss": 0.015080583281815052 }, { "copy_logits_max": 23.820293426513672, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.625, "epoch": 0.11008424814909369, "gen_logits_max": 16.365150451660156, "gen_logits_mean": -3.2832963466644287, "gen_logits_min": -15.14042854309082, "gen_logits_std": 2.457265615463257, "gen_loss": 0.40900343656539917, "grad_norm": 1.1710963996568968, "learning_rate": 2.1959183673469386e-05, "loss": 0.4367, "mean_copy_accuracy": 0.9783390313386917, "mean_gen_accuracy": 0.8362058848142624, "mean_token_accuracy": 0.8727929145097733, "num_tokens": 145637297.0, "sample_num_tokens": 8539.75, "step": 539, "total_num_tokens": 145671456.0, "z_loss": 0.016796041280031204 }, { "copy_logits_max": 24.871158599853516, "copy_logits_min": -687499968.0, "copy_num_tokens": 541.6875, "epoch": 0.11028848608629054, "gen_logits_max": 16.878036499023438, "gen_logits_mean": -3.4498350620269775, "gen_logits_min": -14.938125610351562, "gen_logits_std": 2.393155097961426, "gen_loss": 0.37685680389404297, "grad_norm": 1.2688955959799055, "learning_rate": 2.2e-05, "loss": 0.4359, "mean_copy_accuracy": 0.9766391217708588, "mean_gen_accuracy": 0.8377851843833923, "mean_token_accuracy": 0.871869221329689, "num_tokens": 145912168.0, "sample_num_tokens": 9394.0, "step": 540, "total_num_tokens": 145949744.0, "z_loss": 0.01942150853574276 }, { "copy_logits_max": 23.457691192626953, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.9375, "epoch": 0.11049272402348737, "gen_logits_max": 16.343170166015625, "gen_logits_mean": -3.4554603099823, "gen_logits_min": -15.02962875366211, "gen_logits_std": 2.4462480545043945, "gen_loss": 0.4112316370010376, "grad_norm": 0.9602659335956787, "learning_rate": 2.2040816326530613e-05, "loss": 0.4372, "mean_copy_accuracy": 0.9806087017059326, "mean_gen_accuracy": 0.8371556401252747, "mean_token_accuracy": 0.8716170638799667, "num_tokens": 146179513.0, "sample_num_tokens": 9171.75, "step": 541, "total_num_tokens": 146216200.0, "z_loss": 0.019337106496095657 }, { "copy_logits_max": 19.363500595092773, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.0, "epoch": 0.1106969619606842, "gen_logits_max": 16.884740829467773, "gen_logits_mean": -3.4363973140716553, "gen_logits_min": -14.883426666259766, "gen_logits_std": 2.354149341583252, "gen_loss": 0.3928395211696625, "grad_norm": 0.7870397671893341, "learning_rate": 2.2081632653061226e-05, "loss": 0.416, "mean_copy_accuracy": 0.9826307445764542, "mean_gen_accuracy": 0.8440179526805878, "mean_token_accuracy": 0.8723994642496109, "num_tokens": 146426525.0, "sample_num_tokens": 7399.75, "step": 542, "total_num_tokens": 146456124.0, "z_loss": 0.01299147680401802 }, { "copy_logits_max": 24.46082878112793, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.0, "epoch": 0.11090119989788103, "gen_logits_max": 17.368824005126953, "gen_logits_mean": -3.564485549926758, "gen_logits_min": -15.349711418151855, "gen_logits_std": 2.3914172649383545, "gen_loss": 0.40298402309417725, "grad_norm": 1.036084214041356, "learning_rate": 2.2122448979591836e-05, "loss": 0.4342, "mean_copy_accuracy": 0.9784225672483444, "mean_gen_accuracy": 0.8439318090677261, "mean_token_accuracy": 0.8702322840690613, "num_tokens": 146673782.0, "sample_num_tokens": 7737.0, "step": 543, "total_num_tokens": 146704730.0, "z_loss": 0.01677589677274227 }, { "copy_logits_max": 21.916961669921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.375, "epoch": 0.11110543783507787, "gen_logits_max": 16.478408813476562, "gen_logits_mean": -3.402935743331909, "gen_logits_min": -15.286633491516113, "gen_logits_std": 2.4490556716918945, "gen_loss": 0.40544331073760986, "grad_norm": 0.8422335645002779, "learning_rate": 2.216326530612245e-05, "loss": 0.4175, "mean_copy_accuracy": 0.9776456654071808, "mean_gen_accuracy": 0.8470505625009537, "mean_token_accuracy": 0.8742129355669022, "num_tokens": 146925031.0, "sample_num_tokens": 7954.75, "step": 544, "total_num_tokens": 146956850.0, "z_loss": 0.015580345876514912 }, { "copy_logits_max": 20.23307991027832, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.0625, "epoch": 0.1113096757722747, "gen_logits_max": 15.708728790283203, "gen_logits_mean": -3.6246023178100586, "gen_logits_min": -14.914285659790039, "gen_logits_std": 2.364135980606079, "gen_loss": 0.36489713191986084, "grad_norm": 0.925428714590854, "learning_rate": 2.220408163265306e-05, "loss": 0.4139, "mean_copy_accuracy": 0.977356418967247, "mean_gen_accuracy": 0.8454450070858002, "mean_token_accuracy": 0.876788780093193, "num_tokens": 147192762.0, "sample_num_tokens": 10265.5, "step": 545, "total_num_tokens": 147233824.0, "z_loss": 0.01769641786813736 }, { "copy_logits_max": 17.69580841064453, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.0, "epoch": 0.11151391370947153, "gen_logits_max": 15.71820068359375, "gen_logits_mean": -3.892446994781494, "gen_logits_min": -15.672002792358398, "gen_logits_std": 2.386296272277832, "gen_loss": 0.36398768424987793, "grad_norm": 0.7099701618682357, "learning_rate": 2.2244897959183677e-05, "loss": 0.3916, "mean_copy_accuracy": 0.9825897812843323, "mean_gen_accuracy": 0.8529105931520462, "mean_token_accuracy": 0.8847239464521408, "num_tokens": 147465100.0, "sample_num_tokens": 8037.5, "step": 546, "total_num_tokens": 147497250.0, "z_loss": 0.016345784068107605 }, { "copy_logits_max": 23.29751205444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.875, "epoch": 0.11171815164666837, "gen_logits_max": 16.672557830810547, "gen_logits_mean": -3.3859405517578125, "gen_logits_min": -15.219884872436523, "gen_logits_std": 2.4275472164154053, "gen_loss": 0.4414063096046448, "grad_norm": 0.8713632486606385, "learning_rate": 2.2285714285714287e-05, "loss": 0.4199, "mean_copy_accuracy": 0.9828770458698273, "mean_gen_accuracy": 0.8426235318183899, "mean_token_accuracy": 0.8767975717782974, "num_tokens": 147736505.0, "sample_num_tokens": 8031.25, "step": 547, "total_num_tokens": 147768630.0, "z_loss": 0.015351835638284683 }, { "copy_logits_max": 25.081148147583008, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.375, "epoch": 0.1119223895838652, "gen_logits_max": 16.525978088378906, "gen_logits_mean": -3.5182223320007324, "gen_logits_min": -15.124322891235352, "gen_logits_std": 2.4150309562683105, "gen_loss": 0.3889829218387604, "grad_norm": 0.8464404188310738, "learning_rate": 2.23265306122449e-05, "loss": 0.4158, "mean_copy_accuracy": 0.9832577109336853, "mean_gen_accuracy": 0.8467742204666138, "mean_token_accuracy": 0.8791461139917374, "num_tokens": 148002289.0, "sample_num_tokens": 7744.75, "step": 548, "total_num_tokens": 148033268.0, "z_loss": 0.018346130847930908 }, { "copy_logits_max": 25.19440269470215, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.3125, "epoch": 0.11212662752106203, "gen_logits_max": 16.56165313720703, "gen_logits_mean": -3.6243040561676025, "gen_logits_min": -15.469093322753906, "gen_logits_std": 2.4307796955108643, "gen_loss": 0.41405731439590454, "grad_norm": 0.7020177603574658, "learning_rate": 2.236734693877551e-05, "loss": 0.4088, "mean_copy_accuracy": 0.9831323176622391, "mean_gen_accuracy": 0.8469338715076447, "mean_token_accuracy": 0.8789934813976288, "num_tokens": 148277266.0, "sample_num_tokens": 8962.0, "step": 549, "total_num_tokens": 148313114.0, "z_loss": 0.02343488484621048 }, { "copy_logits_max": 22.25727081298828, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.625, "epoch": 0.11233086545825888, "gen_logits_max": 16.33710479736328, "gen_logits_mean": -3.699131488800049, "gen_logits_min": -15.679859161376953, "gen_logits_std": 2.4421586990356445, "gen_loss": 0.3798534870147705, "grad_norm": 1.1382590733889388, "learning_rate": 2.240816326530612e-05, "loss": 0.4215, "mean_copy_accuracy": 0.9759833216667175, "mean_gen_accuracy": 0.8407337218523026, "mean_token_accuracy": 0.8756020218133926, "num_tokens": 148560047.0, "sample_num_tokens": 8022.75, "step": 550, "total_num_tokens": 148592138.0, "z_loss": 0.0183436069637537 }, { "copy_logits_max": 21.581636428833008, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.75, "epoch": 0.11253510339545571, "gen_logits_max": 17.280925750732422, "gen_logits_mean": -3.5480856895446777, "gen_logits_min": -15.303102493286133, "gen_logits_std": 2.4434568881988525, "gen_loss": 0.42259591817855835, "grad_norm": 0.8505548930004256, "learning_rate": 2.2448979591836737e-05, "loss": 0.3945, "mean_copy_accuracy": 0.9814036339521408, "mean_gen_accuracy": 0.8509708046913147, "mean_token_accuracy": 0.8833158016204834, "num_tokens": 148817902.0, "sample_num_tokens": 7307.0, "step": 551, "total_num_tokens": 148847130.0, "z_loss": 0.014585788361728191 }, { "copy_logits_max": 22.45974349975586, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.1875, "epoch": 0.11273934133265254, "gen_logits_max": 15.777533531188965, "gen_logits_mean": -3.4733314514160156, "gen_logits_min": -15.498116493225098, "gen_logits_std": 2.4493606090545654, "gen_loss": 0.4324656128883362, "grad_norm": 1.0633082770570044, "learning_rate": 2.2489795918367347e-05, "loss": 0.4228, "mean_copy_accuracy": 0.9815872013568878, "mean_gen_accuracy": 0.8420247584581375, "mean_token_accuracy": 0.8778810501098633, "num_tokens": 149097203.0, "sample_num_tokens": 9009.75, "step": 552, "total_num_tokens": 149133242.0, "z_loss": 0.01759040355682373 }, { "copy_logits_max": 20.972946166992188, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.0, "epoch": 0.11294357926984938, "gen_logits_max": 16.8740291595459, "gen_logits_mean": -3.549830913543701, "gen_logits_min": -14.786407470703125, "gen_logits_std": 2.3465938568115234, "gen_loss": 0.41565972566604614, "grad_norm": 0.783097967310208, "learning_rate": 2.253061224489796e-05, "loss": 0.4395, "mean_copy_accuracy": 0.9808205217123032, "mean_gen_accuracy": 0.835720106959343, "mean_token_accuracy": 0.8691418915987015, "num_tokens": 149352738.0, "sample_num_tokens": 9184.5, "step": 553, "total_num_tokens": 149389476.0, "z_loss": 0.014226622879505157 }, { "copy_logits_max": 19.412551879882812, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.125, "epoch": 0.11314781720704621, "gen_logits_max": 16.314430236816406, "gen_logits_mean": -3.7664148807525635, "gen_logits_min": -15.429466247558594, "gen_logits_std": 2.410022735595703, "gen_loss": 0.4128619432449341, "grad_norm": 0.800140217461718, "learning_rate": 2.257142857142857e-05, "loss": 0.4202, "mean_copy_accuracy": 0.9795549660921097, "mean_gen_accuracy": 0.844861164689064, "mean_token_accuracy": 0.8755754679441452, "num_tokens": 149636398.0, "sample_num_tokens": 8373.0, "step": 554, "total_num_tokens": 149669890.0, "z_loss": 0.014022015035152435 }, { "copy_logits_max": 22.232444763183594, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.9375, "epoch": 0.11335205514424304, "gen_logits_max": 16.234455108642578, "gen_logits_mean": -3.702272891998291, "gen_logits_min": -15.18189525604248, "gen_logits_std": 2.406813144683838, "gen_loss": 0.42052793502807617, "grad_norm": 0.9188528255198947, "learning_rate": 2.2612244897959184e-05, "loss": 0.4168, "mean_copy_accuracy": 0.9837286919355392, "mean_gen_accuracy": 0.837025061249733, "mean_token_accuracy": 0.8776508420705795, "num_tokens": 149911242.0, "sample_num_tokens": 7794.5, "step": 555, "total_num_tokens": 149942420.0, "z_loss": 0.01729843020439148 }, { "copy_logits_max": 20.602298736572266, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5625, "epoch": 0.11355629308143987, "gen_logits_max": 16.144941329956055, "gen_logits_mean": -3.990746021270752, "gen_logits_min": -15.622444152832031, "gen_logits_std": 2.3679451942443848, "gen_loss": 0.41557127237319946, "grad_norm": 0.848292808849848, "learning_rate": 2.2653061224489794e-05, "loss": 0.4238, "mean_copy_accuracy": 0.9793882220983505, "mean_gen_accuracy": 0.8429383933544159, "mean_token_accuracy": 0.8749275505542755, "num_tokens": 150171322.0, "sample_num_tokens": 8354.5, "step": 556, "total_num_tokens": 150204740.0, "z_loss": 0.014357779175043106 }, { "copy_logits_max": 18.602680206298828, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.1875, "epoch": 0.11376053101863672, "gen_logits_max": 16.339187622070312, "gen_logits_mean": -4.2995758056640625, "gen_logits_min": -15.723093032836914, "gen_logits_std": 2.3508152961730957, "gen_loss": 0.35082268714904785, "grad_norm": 0.7589313108427592, "learning_rate": 2.269387755102041e-05, "loss": 0.3948, "mean_copy_accuracy": 0.9804009050130844, "mean_gen_accuracy": 0.8491618037223816, "mean_token_accuracy": 0.8804734498262405, "num_tokens": 150431411.0, "sample_num_tokens": 8102.25, "step": 557, "total_num_tokens": 150463820.0, "z_loss": 0.013326780870556831 }, { "copy_logits_max": 20.431537628173828, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.75, "epoch": 0.11396476895583355, "gen_logits_max": 15.833816528320312, "gen_logits_mean": -3.4697439670562744, "gen_logits_min": -15.210923194885254, "gen_logits_std": 2.450418472290039, "gen_loss": 0.39057356119155884, "grad_norm": 0.8388171162511213, "learning_rate": 2.273469387755102e-05, "loss": 0.4301, "mean_copy_accuracy": 0.9810831248760223, "mean_gen_accuracy": 0.8419867902994156, "mean_token_accuracy": 0.8719001710414886, "num_tokens": 150691082.0, "sample_num_tokens": 10012.0, "step": 558, "total_num_tokens": 150731130.0, "z_loss": 0.017955269664525986 }, { "copy_logits_max": 20.072097778320312, "copy_logits_min": -687500032.0, "copy_num_tokens": 300.125, "epoch": 0.11416900689303038, "gen_logits_max": 16.52881622314453, "gen_logits_mean": -3.689490795135498, "gen_logits_min": -14.969161033630371, "gen_logits_std": 2.372154474258423, "gen_loss": 0.40886977314949036, "grad_norm": 0.8313144410983581, "learning_rate": 2.2775510204081635e-05, "loss": 0.4299, "mean_copy_accuracy": 0.9817910194396973, "mean_gen_accuracy": 0.8423751592636108, "mean_token_accuracy": 0.8719235211610794, "num_tokens": 150951868.0, "sample_num_tokens": 6839.5, "step": 559, "total_num_tokens": 150979226.0, "z_loss": 0.012425392866134644 }, { "copy_logits_max": 21.07465934753418, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.6875, "epoch": 0.11437324483022722, "gen_logits_max": 16.7409610748291, "gen_logits_mean": -3.5927443504333496, "gen_logits_min": -15.333683013916016, "gen_logits_std": 2.39512038230896, "gen_loss": 0.44081228971481323, "grad_norm": 0.6961963595657878, "learning_rate": 2.2816326530612245e-05, "loss": 0.4215, "mean_copy_accuracy": 0.9839956313371658, "mean_gen_accuracy": 0.8400674760341644, "mean_token_accuracy": 0.8751235604286194, "num_tokens": 151230270.0, "sample_num_tokens": 8357.5, "step": 560, "total_num_tokens": 151263700.0, "z_loss": 0.013952123932540417 }, { "copy_logits_max": 15.309636116027832, "copy_logits_min": -750000000.0, "copy_num_tokens": 244.4375, "epoch": 0.11457748276742405, "gen_logits_max": 15.686079025268555, "gen_logits_mean": -3.7366890907287598, "gen_logits_min": -15.489524841308594, "gen_logits_std": 2.369091033935547, "gen_loss": 0.42579004168510437, "grad_norm": 0.7505240290538092, "learning_rate": 2.2857142857142858e-05, "loss": 0.4211, "mean_copy_accuracy": 0.982133224606514, "mean_gen_accuracy": 0.8436685651540756, "mean_token_accuracy": 0.8738172799348831, "num_tokens": 151497704.0, "sample_num_tokens": 6417.5, "step": 561, "total_num_tokens": 151523374.0, "z_loss": 0.00999686773866415 }, { "copy_logits_max": 23.83755874633789, "copy_logits_min": -687499968.0, "copy_num_tokens": 498.9375, "epoch": 0.11478172070462088, "gen_logits_max": 15.980612754821777, "gen_logits_mean": -3.118251323699951, "gen_logits_min": -15.187576293945312, "gen_logits_std": 2.4983162879943848, "gen_loss": 0.44097089767456055, "grad_norm": 0.7301529741635367, "learning_rate": 2.289795918367347e-05, "loss": 0.437, "mean_copy_accuracy": 0.9837188720703125, "mean_gen_accuracy": 0.8347335159778595, "mean_token_accuracy": 0.869807243347168, "num_tokens": 151752376.0, "sample_num_tokens": 8653.0, "step": 562, "total_num_tokens": 151786988.0, "z_loss": 0.019979331642389297 }, { "copy_logits_max": 19.872846603393555, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.25, "epoch": 0.11498595864181772, "gen_logits_max": 15.110223770141602, "gen_logits_mean": -3.6592373847961426, "gen_logits_min": -15.550088882446289, "gen_logits_std": 2.4736149311065674, "gen_loss": 0.375559538602829, "grad_norm": 0.765453330416788, "learning_rate": 2.293877551020408e-05, "loss": 0.4137, "mean_copy_accuracy": 0.9790391772985458, "mean_gen_accuracy": 0.8459605425596237, "mean_token_accuracy": 0.8757823705673218, "num_tokens": 152029493.0, "sample_num_tokens": 8838.25, "step": 563, "total_num_tokens": 152064846.0, "z_loss": 0.016554320231080055 }, { "copy_logits_max": 28.569902420043945, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.25, "epoch": 0.11519019657901455, "gen_logits_max": 16.811561584472656, "gen_logits_mean": -3.6983187198638916, "gen_logits_min": -15.74222183227539, "gen_logits_std": 2.46573543548584, "gen_loss": 0.37912243604660034, "grad_norm": 0.9410526179537958, "learning_rate": 2.2979591836734695e-05, "loss": 0.4253, "mean_copy_accuracy": 0.9801327586174011, "mean_gen_accuracy": 0.8465546667575836, "mean_token_accuracy": 0.8793554306030273, "num_tokens": 152302861.0, "sample_num_tokens": 9361.75, "step": 564, "total_num_tokens": 152340308.0, "z_loss": 0.02276090905070305 }, { "copy_logits_max": 23.881183624267578, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.625, "epoch": 0.11539443451621138, "gen_logits_max": 16.819305419921875, "gen_logits_mean": -3.9416935443878174, "gen_logits_min": -15.794649124145508, "gen_logits_std": 2.3822696208953857, "gen_loss": 0.4154777228832245, "grad_norm": 0.7213963103646325, "learning_rate": 2.3020408163265305e-05, "loss": 0.4267, "mean_copy_accuracy": 0.9854441732168198, "mean_gen_accuracy": 0.8368396759033203, "mean_token_accuracy": 0.8708219081163406, "num_tokens": 152570052.0, "sample_num_tokens": 7907.5, "step": 565, "total_num_tokens": 152601682.0, "z_loss": 0.014026501215994358 }, { "copy_logits_max": 22.962196350097656, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.3125, "epoch": 0.11559867245340823, "gen_logits_max": 15.551032066345215, "gen_logits_mean": -3.4491629600524902, "gen_logits_min": -14.958789825439453, "gen_logits_std": 2.3977437019348145, "gen_loss": 0.3785390555858612, "grad_norm": 0.9064276426669009, "learning_rate": 2.306122448979592e-05, "loss": 0.3871, "mean_copy_accuracy": 0.9823789000511169, "mean_gen_accuracy": 0.849197268486023, "mean_token_accuracy": 0.8844646215438843, "num_tokens": 152829912.0, "sample_num_tokens": 9177.5, "step": 566, "total_num_tokens": 152866622.0, "z_loss": 0.017153268679976463 }, { "copy_logits_max": 24.305187225341797, "copy_logits_min": -750000000.0, "copy_num_tokens": 651.8125, "epoch": 0.11580291039060506, "gen_logits_max": 16.13344955444336, "gen_logits_mean": -4.0593581199646, "gen_logits_min": -15.516641616821289, "gen_logits_std": 2.4010376930236816, "gen_loss": 0.36070406436920166, "grad_norm": 0.640028832870458, "learning_rate": 2.310204081632653e-05, "loss": 0.3915, "mean_copy_accuracy": 0.9819381833076477, "mean_gen_accuracy": 0.8508963286876678, "mean_token_accuracy": 0.8804267048835754, "num_tokens": 153096328.0, "sample_num_tokens": 10464.0, "step": 567, "total_num_tokens": 153138184.0, "z_loss": 0.01804630272090435 }, { "copy_logits_max": 24.069076538085938, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.5625, "epoch": 0.11600714832780189, "gen_logits_max": 15.53037166595459, "gen_logits_mean": -3.7115249633789062, "gen_logits_min": -15.811761856079102, "gen_logits_std": 2.474949359893799, "gen_loss": 0.3562522530555725, "grad_norm": 0.8200262611073112, "learning_rate": 2.3142857142857145e-05, "loss": 0.4079, "mean_copy_accuracy": 0.9822230190038681, "mean_gen_accuracy": 0.8466642946004868, "mean_token_accuracy": 0.8777433782815933, "num_tokens": 153369839.0, "sample_num_tokens": 8733.25, "step": 568, "total_num_tokens": 153404772.0, "z_loss": 0.018973184749484062 }, { "copy_logits_max": 22.47022247314453, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.4375, "epoch": 0.11621138626499872, "gen_logits_max": 16.042316436767578, "gen_logits_mean": -4.083896636962891, "gen_logits_min": -15.739688873291016, "gen_logits_std": 2.4014222621917725, "gen_loss": 0.39509639143943787, "grad_norm": 0.7986847981235191, "learning_rate": 2.3183673469387755e-05, "loss": 0.4174, "mean_copy_accuracy": 0.9786222875118256, "mean_gen_accuracy": 0.8420184701681137, "mean_token_accuracy": 0.8751915246248245, "num_tokens": 153648329.0, "sample_num_tokens": 7395.25, "step": 569, "total_num_tokens": 153677910.0, "z_loss": 0.015360575169324875 }, { "copy_logits_max": 21.924760818481445, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.4375, "epoch": 0.11641562420219556, "gen_logits_max": 16.303661346435547, "gen_logits_mean": -3.5662388801574707, "gen_logits_min": -15.657185554504395, "gen_logits_std": 2.448002576828003, "gen_loss": 0.4686124920845032, "grad_norm": 0.9237050874519069, "learning_rate": 2.322448979591837e-05, "loss": 0.4353, "mean_copy_accuracy": 0.9777110517024994, "mean_gen_accuracy": 0.8367698788642883, "mean_token_accuracy": 0.8731954842805862, "num_tokens": 153934903.0, "sample_num_tokens": 7532.75, "step": 570, "total_num_tokens": 153965034.0, "z_loss": 0.015603349544107914 }, { "copy_logits_max": 22.86342430114746, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5, "epoch": 0.11661986213939239, "gen_logits_max": 17.134998321533203, "gen_logits_mean": -4.271304130554199, "gen_logits_min": -15.652978897094727, "gen_logits_std": 2.3602964878082275, "gen_loss": 0.39185208082199097, "grad_norm": 0.7501127094165898, "learning_rate": 2.326530612244898e-05, "loss": 0.411, "mean_copy_accuracy": 0.9862653762102127, "mean_gen_accuracy": 0.8444839268922806, "mean_token_accuracy": 0.8781719505786896, "num_tokens": 154201183.0, "sample_num_tokens": 8688.25, "step": 571, "total_num_tokens": 154235936.0, "z_loss": 0.014176832512021065 }, { "copy_logits_max": 16.020732879638672, "copy_logits_min": -750000064.0, "copy_num_tokens": 279.6875, "epoch": 0.11682410007658922, "gen_logits_max": 16.36867904663086, "gen_logits_mean": -3.4609878063201904, "gen_logits_min": -14.763273239135742, "gen_logits_std": 2.414353847503662, "gen_loss": 0.49017560482025146, "grad_norm": 0.7435621805036781, "learning_rate": 2.3306122448979592e-05, "loss": 0.432, "mean_copy_accuracy": 0.9857650548219681, "mean_gen_accuracy": 0.8427972495555878, "mean_token_accuracy": 0.870405375957489, "num_tokens": 154472324.0, "sample_num_tokens": 7720.5, "step": 572, "total_num_tokens": 154503206.0, "z_loss": 0.011133762076497078 }, { "copy_logits_max": 17.655284881591797, "copy_logits_min": -750000064.0, "copy_num_tokens": 375.75, "epoch": 0.11702833801378607, "gen_logits_max": 16.181232452392578, "gen_logits_mean": -3.9205570220947266, "gen_logits_min": -15.328887939453125, "gen_logits_std": 2.3631997108459473, "gen_loss": 0.3535454571247101, "grad_norm": 1.000295718928739, "learning_rate": 2.3346938775510206e-05, "loss": 0.414, "mean_copy_accuracy": 0.9841956198215485, "mean_gen_accuracy": 0.8447747826576233, "mean_token_accuracy": 0.8759528547525406, "num_tokens": 154718811.0, "sample_num_tokens": 8572.75, "step": 573, "total_num_tokens": 154753102.0, "z_loss": 0.01212025061249733 }, { "copy_logits_max": 15.397042274475098, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.0625, "epoch": 0.1172325759509829, "gen_logits_max": 14.660120010375977, "gen_logits_mean": -3.824838161468506, "gen_logits_min": -15.17071533203125, "gen_logits_std": 2.3614964485168457, "gen_loss": 0.38739943504333496, "grad_norm": 0.8720960381314504, "learning_rate": 2.3387755102040816e-05, "loss": 0.4195, "mean_copy_accuracy": 0.9776470959186554, "mean_gen_accuracy": 0.8452058285474777, "mean_token_accuracy": 0.872926265001297, "num_tokens": 154982089.0, "sample_num_tokens": 9809.75, "step": 574, "total_num_tokens": 155021328.0, "z_loss": 0.011657100170850754 }, { "copy_logits_max": 17.70675277709961, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.5625, "epoch": 0.11743681388817973, "gen_logits_max": 16.006019592285156, "gen_logits_mean": -3.38895320892334, "gen_logits_min": -14.888984680175781, "gen_logits_std": 2.4088850021362305, "gen_loss": 0.3825107514858246, "grad_norm": 0.7275642216154986, "learning_rate": 2.342857142857143e-05, "loss": 0.4329, "mean_copy_accuracy": 0.9823731631040573, "mean_gen_accuracy": 0.8374971598386765, "mean_token_accuracy": 0.8720107525587082, "num_tokens": 155253332.0, "sample_num_tokens": 8738.0, "step": 575, "total_num_tokens": 155288284.0, "z_loss": 0.012947378680109978 }, { "copy_logits_max": 23.504032135009766, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.125, "epoch": 0.11764105182537657, "gen_logits_max": 15.991331100463867, "gen_logits_mean": -3.8766775131225586, "gen_logits_min": -15.952728271484375, "gen_logits_std": 2.463916778564453, "gen_loss": 0.399441123008728, "grad_norm": 0.7360640457406773, "learning_rate": 2.346938775510204e-05, "loss": 0.4173, "mean_copy_accuracy": 0.9868880361318588, "mean_gen_accuracy": 0.840316578745842, "mean_token_accuracy": 0.8755131214857101, "num_tokens": 155524973.0, "sample_num_tokens": 9163.25, "step": 576, "total_num_tokens": 155561626.0, "z_loss": 0.018927929922938347 }, { "copy_logits_max": 24.09992027282715, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.5625, "epoch": 0.1178452897625734, "gen_logits_max": 16.7420654296875, "gen_logits_mean": -3.6983280181884766, "gen_logits_min": -15.515557289123535, "gen_logits_std": 2.43247389793396, "gen_loss": 0.35077327489852905, "grad_norm": 0.7390780611012088, "learning_rate": 2.3510204081632653e-05, "loss": 0.4268, "mean_copy_accuracy": 0.9810930043458939, "mean_gen_accuracy": 0.8373855352401733, "mean_token_accuracy": 0.871841549873352, "num_tokens": 155799309.0, "sample_num_tokens": 9244.25, "step": 577, "total_num_tokens": 155836286.0, "z_loss": 0.020668409764766693 }, { "copy_logits_max": 21.816253662109375, "copy_logits_min": -625000000.0, "copy_num_tokens": 521.875, "epoch": 0.11804952769977023, "gen_logits_max": 16.293590545654297, "gen_logits_mean": -3.810126304626465, "gen_logits_min": -15.610432624816895, "gen_logits_std": 2.4119338989257812, "gen_loss": 0.3671308159828186, "grad_norm": 0.7939595683535323, "learning_rate": 2.3551020408163266e-05, "loss": 0.4074, "mean_copy_accuracy": 0.9805794656276703, "mean_gen_accuracy": 0.8488845378160477, "mean_token_accuracy": 0.8782354444265366, "num_tokens": 156078354.0, "sample_num_tokens": 8923.0, "step": 578, "total_num_tokens": 156114046.0, "z_loss": 0.015360710211098194 }, { "copy_logits_max": 19.379276275634766, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.0, "epoch": 0.11825376563696707, "gen_logits_max": 16.40804672241211, "gen_logits_mean": -4.425138473510742, "gen_logits_min": -15.945356369018555, "gen_logits_std": 2.3168768882751465, "gen_loss": 0.377808153629303, "grad_norm": 0.9665770931579806, "learning_rate": 2.359183673469388e-05, "loss": 0.4244, "mean_copy_accuracy": 0.9794402718544006, "mean_gen_accuracy": 0.8403087854385376, "mean_token_accuracy": 0.8727080821990967, "num_tokens": 156347073.0, "sample_num_tokens": 8303.25, "step": 579, "total_num_tokens": 156380286.0, "z_loss": 0.010696806013584137 }, { "copy_logits_max": 22.28845977783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.125, "epoch": 0.1184580035741639, "gen_logits_max": 16.46841049194336, "gen_logits_mean": -4.13477087020874, "gen_logits_min": -15.6580228805542, "gen_logits_std": 2.350064992904663, "gen_loss": 0.41635996103286743, "grad_norm": 0.6660408281026597, "learning_rate": 2.363265306122449e-05, "loss": 0.3939, "mean_copy_accuracy": 0.989549845457077, "mean_gen_accuracy": 0.8430824726819992, "mean_token_accuracy": 0.8825070112943649, "num_tokens": 156645409.0, "sample_num_tokens": 8434.25, "step": 580, "total_num_tokens": 156679146.0, "z_loss": 0.014477740973234177 }, { "copy_logits_max": 25.269878387451172, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.0625, "epoch": 0.11866224151136073, "gen_logits_max": 16.637226104736328, "gen_logits_mean": -3.558302402496338, "gen_logits_min": -16.1758975982666, "gen_logits_std": 2.4975216388702393, "gen_loss": 0.42743536829948425, "grad_norm": 1.172629692583121, "learning_rate": 2.3673469387755103e-05, "loss": 0.4214, "mean_copy_accuracy": 0.9854474663734436, "mean_gen_accuracy": 0.8362246006727219, "mean_token_accuracy": 0.8766364455223083, "num_tokens": 156905916.0, "sample_num_tokens": 7904.5, "step": 581, "total_num_tokens": 156937534.0, "z_loss": 0.019295584410429 }, { "copy_logits_max": 24.132028579711914, "copy_logits_min": -687500032.0, "copy_num_tokens": 651.3125, "epoch": 0.11886647944855756, "gen_logits_max": 16.64199447631836, "gen_logits_mean": -3.5952558517456055, "gen_logits_min": -15.87643814086914, "gen_logits_std": 2.4307467937469482, "gen_loss": 0.3872939348220825, "grad_norm": 0.8841919271615336, "learning_rate": 2.3714285714285713e-05, "loss": 0.4328, "mean_copy_accuracy": 0.9815647602081299, "mean_gen_accuracy": 0.8330061286687851, "mean_token_accuracy": 0.8719310760498047, "num_tokens": 157183966.0, "sample_num_tokens": 10259.0, "step": 582, "total_num_tokens": 157225002.0, "z_loss": 0.018219217658042908 }, { "copy_logits_max": 18.595458984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.1875, "epoch": 0.11907071738575441, "gen_logits_max": 15.760690689086914, "gen_logits_mean": -3.803246021270752, "gen_logits_min": -15.72018051147461, "gen_logits_std": 2.4014463424682617, "gen_loss": 0.4027823805809021, "grad_norm": 0.7430278750981777, "learning_rate": 2.3755102040816327e-05, "loss": 0.4143, "mean_copy_accuracy": 0.9807487279176712, "mean_gen_accuracy": 0.845165565609932, "mean_token_accuracy": 0.8754399865865707, "num_tokens": 157448537.0, "sample_num_tokens": 6452.75, "step": 583, "total_num_tokens": 157474348.0, "z_loss": 0.011645553633570671 }, { "copy_logits_max": 19.55514907836914, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.125, "epoch": 0.11927495532295124, "gen_logits_max": 15.35274887084961, "gen_logits_mean": -3.8229455947875977, "gen_logits_min": -15.560149192810059, "gen_logits_std": 2.4118618965148926, "gen_loss": 0.3295612335205078, "grad_norm": 0.7512969796674092, "learning_rate": 2.379591836734694e-05, "loss": 0.3931, "mean_copy_accuracy": 0.9835855215787888, "mean_gen_accuracy": 0.8552014976739883, "mean_token_accuracy": 0.8845135867595673, "num_tokens": 157706528.0, "sample_num_tokens": 9370.5, "step": 584, "total_num_tokens": 157744010.0, "z_loss": 0.014259140007197857 }, { "copy_logits_max": 18.300674438476562, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.4375, "epoch": 0.11947919326014807, "gen_logits_max": 16.836406707763672, "gen_logits_mean": -3.819032669067383, "gen_logits_min": -15.3958101272583, "gen_logits_std": 2.367903232574463, "gen_loss": 0.42075759172439575, "grad_norm": 0.6306244749745628, "learning_rate": 2.3836734693877554e-05, "loss": 0.4105, "mean_copy_accuracy": 0.9863844513893127, "mean_gen_accuracy": 0.844030573964119, "mean_token_accuracy": 0.87533900141716, "num_tokens": 157956763.0, "sample_num_tokens": 7962.25, "step": 585, "total_num_tokens": 157988612.0, "z_loss": 0.011332107707858086 }, { "copy_logits_max": 24.09203338623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.9375, "epoch": 0.11968343119734491, "gen_logits_max": 16.555055618286133, "gen_logits_mean": -3.4995110034942627, "gen_logits_min": -15.3174409866333, "gen_logits_std": 2.404346466064453, "gen_loss": 0.421649694442749, "grad_norm": 0.85609997784291, "learning_rate": 2.3877551020408164e-05, "loss": 0.4143, "mean_copy_accuracy": 0.9788857847452164, "mean_gen_accuracy": 0.8420260697603226, "mean_token_accuracy": 0.8767002373933792, "num_tokens": 158223410.0, "sample_num_tokens": 8735.0, "step": 586, "total_num_tokens": 158258350.0, "z_loss": 0.018000245094299316 }, { "copy_logits_max": 20.425355911254883, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.4375, "epoch": 0.11988766913454174, "gen_logits_max": 15.276958465576172, "gen_logits_mean": -3.719918727874756, "gen_logits_min": -15.458086013793945, "gen_logits_std": 2.3979592323303223, "gen_loss": 0.39078274369239807, "grad_norm": 0.7674710063002906, "learning_rate": 2.3918367346938774e-05, "loss": 0.4183, "mean_copy_accuracy": 0.9821373671293259, "mean_gen_accuracy": 0.8398455530405045, "mean_token_accuracy": 0.8762233108282089, "num_tokens": 158507680.0, "sample_num_tokens": 9079.5, "step": 587, "total_num_tokens": 158543998.0, "z_loss": 0.015143143944442272 }, { "copy_logits_max": 18.201644897460938, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.6875, "epoch": 0.12009190707173857, "gen_logits_max": 16.799877166748047, "gen_logits_mean": -3.6792006492614746, "gen_logits_min": -15.542162895202637, "gen_logits_std": 2.4064297676086426, "gen_loss": 0.4169325530529022, "grad_norm": 0.82445669648871, "learning_rate": 2.3959183673469387e-05, "loss": 0.4054, "mean_copy_accuracy": 0.9845269322395325, "mean_gen_accuracy": 0.8448337912559509, "mean_token_accuracy": 0.8811476528644562, "num_tokens": 158777790.0, "sample_num_tokens": 8279.0, "step": 588, "total_num_tokens": 158810906.0, "z_loss": 0.01289954874664545 }, { "copy_logits_max": 21.971139907836914, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.5625, "epoch": 0.12029614500893542, "gen_logits_max": 17.07872772216797, "gen_logits_mean": -4.047698974609375, "gen_logits_min": -15.729076385498047, "gen_logits_std": 2.390728712081909, "gen_loss": 0.39479610323905945, "grad_norm": 0.6917876253295894, "learning_rate": 2.4e-05, "loss": 0.4003, "mean_copy_accuracy": 0.9826971590518951, "mean_gen_accuracy": 0.842799961566925, "mean_token_accuracy": 0.879732608795166, "num_tokens": 159058844.0, "sample_num_tokens": 7875.5, "step": 589, "total_num_tokens": 159090346.0, "z_loss": 0.012828975915908813 }, { "copy_logits_max": 15.857585906982422, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.8125, "epoch": 0.12050038294613225, "gen_logits_max": 15.381211280822754, "gen_logits_mean": -3.478698968887329, "gen_logits_min": -15.75404167175293, "gen_logits_std": 2.438685417175293, "gen_loss": 0.39241838455200195, "grad_norm": 0.7476493379214147, "learning_rate": 2.4040816326530614e-05, "loss": 0.4224, "mean_copy_accuracy": 0.9829018712043762, "mean_gen_accuracy": 0.8383728414773941, "mean_token_accuracy": 0.873394638299942, "num_tokens": 159309968.0, "sample_num_tokens": 7136.5, "step": 590, "total_num_tokens": 159338514.0, "z_loss": 0.011441866867244244 }, { "copy_logits_max": 24.954160690307617, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.1875, "epoch": 0.12070462088332908, "gen_logits_max": 16.51932144165039, "gen_logits_mean": -3.8490095138549805, "gen_logits_min": -16.184720993041992, "gen_logits_std": 2.4858269691467285, "gen_loss": 0.4358382225036621, "grad_norm": 0.8884107008281488, "learning_rate": 2.4081632653061224e-05, "loss": 0.4185, "mean_copy_accuracy": 0.9847720414400101, "mean_gen_accuracy": 0.8358228802680969, "mean_token_accuracy": 0.8771766424179077, "num_tokens": 159607522.0, "sample_num_tokens": 7702.0, "step": 591, "total_num_tokens": 159638330.0, "z_loss": 0.017824240028858185 }, { "copy_logits_max": 25.188365936279297, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.375, "epoch": 0.12090885882052592, "gen_logits_max": 16.84929656982422, "gen_logits_mean": -3.783423900604248, "gen_logits_min": -16.155967712402344, "gen_logits_std": 2.480365514755249, "gen_loss": 0.4527285695075989, "grad_norm": 0.7674162920465348, "learning_rate": 2.4122448979591838e-05, "loss": 0.4339, "mean_copy_accuracy": 0.9821786880493164, "mean_gen_accuracy": 0.8343686163425446, "mean_token_accuracy": 0.872405081987381, "num_tokens": 159881948.0, "sample_num_tokens": 9560.0, "step": 592, "total_num_tokens": 159920188.0, "z_loss": 0.018157243728637695 }, { "copy_logits_max": 18.26299285888672, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.3125, "epoch": 0.12111309675772275, "gen_logits_max": 15.911531448364258, "gen_logits_mean": -3.9668142795562744, "gen_logits_min": -15.81667709350586, "gen_logits_std": 2.4120655059814453, "gen_loss": 0.37562817335128784, "grad_norm": 0.7721940476790153, "learning_rate": 2.4163265306122448e-05, "loss": 0.4338, "mean_copy_accuracy": 0.9824856966733932, "mean_gen_accuracy": 0.8402134925127029, "mean_token_accuracy": 0.8735315054655075, "num_tokens": 160152993.0, "sample_num_tokens": 8529.25, "step": 593, "total_num_tokens": 160187110.0, "z_loss": 0.014652137644588947 }, { "copy_logits_max": 20.877296447753906, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.375, "epoch": 0.12131733469491958, "gen_logits_max": 16.138227462768555, "gen_logits_mean": -3.9313015937805176, "gen_logits_min": -15.955315589904785, "gen_logits_std": 2.451781749725342, "gen_loss": 0.3339524567127228, "grad_norm": 0.6551733966521602, "learning_rate": 2.4204081632653064e-05, "loss": 0.4008, "mean_copy_accuracy": 0.9819497317075729, "mean_gen_accuracy": 0.8505878150463104, "mean_token_accuracy": 0.8808494359254837, "num_tokens": 160428826.0, "sample_num_tokens": 9057.5, "step": 594, "total_num_tokens": 160465056.0, "z_loss": 0.014738328754901886 }, { "copy_logits_max": 21.99628257751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.0625, "epoch": 0.12152157263211641, "gen_logits_max": 16.52779197692871, "gen_logits_mean": -4.057616710662842, "gen_logits_min": -16.37362289428711, "gen_logits_std": 2.4382224082946777, "gen_loss": 0.40529996156692505, "grad_norm": 0.7378105173222348, "learning_rate": 2.4244897959183674e-05, "loss": 0.4011, "mean_copy_accuracy": 0.9837104976177216, "mean_gen_accuracy": 0.8417423814535141, "mean_token_accuracy": 0.8789559453725815, "num_tokens": 160708793.0, "sample_num_tokens": 7690.75, "step": 595, "total_num_tokens": 160739556.0, "z_loss": 0.013638322241604328 }, { "copy_logits_max": 17.393009185791016, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.1875, "epoch": 0.12172581056931325, "gen_logits_max": 16.11243438720703, "gen_logits_mean": -3.609072685241699, "gen_logits_min": -15.39572525024414, "gen_logits_std": 2.4417977333068848, "gen_loss": 0.4232596158981323, "grad_norm": 0.7564152673960249, "learning_rate": 2.4285714285714288e-05, "loss": 0.4181, "mean_copy_accuracy": 0.9813035279512405, "mean_gen_accuracy": 0.8423225283622742, "mean_token_accuracy": 0.8710808306932449, "num_tokens": 160963156.0, "sample_num_tokens": 8113.0, "step": 596, "total_num_tokens": 160995608.0, "z_loss": 0.012014830484986305 }, { "copy_logits_max": 26.01969337463379, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.0625, "epoch": 0.12193004850651008, "gen_logits_max": 16.953073501586914, "gen_logits_mean": -3.791959285736084, "gen_logits_min": -16.191505432128906, "gen_logits_std": 2.4780967235565186, "gen_loss": 0.40696442127227783, "grad_norm": 0.8223326479964584, "learning_rate": 2.4326530612244898e-05, "loss": 0.4082, "mean_copy_accuracy": 0.9842406362295151, "mean_gen_accuracy": 0.8404812663793564, "mean_token_accuracy": 0.8781452775001526, "num_tokens": 161229480.0, "sample_num_tokens": 8619.0, "step": 597, "total_num_tokens": 161263956.0, "z_loss": 0.016641991212964058 }, { "copy_logits_max": 17.39373016357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 0.12213428644370691, "gen_logits_max": 15.450782775878906, "gen_logits_mean": -3.8462073802948, "gen_logits_min": -15.790682792663574, "gen_logits_std": 2.4620585441589355, "gen_loss": 0.4007430076599121, "grad_norm": 0.7465745781302515, "learning_rate": 2.436734693877551e-05, "loss": 0.4579, "mean_copy_accuracy": 0.9791216552257538, "mean_gen_accuracy": 0.8327009379863739, "mean_token_accuracy": 0.8658115863800049, "num_tokens": 161473056.0, "sample_num_tokens": 7448.0, "step": 598, "total_num_tokens": 161502848.0, "z_loss": 0.015082175843417645 }, { "copy_logits_max": 18.4166316986084, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.0625, "epoch": 0.12233852438090376, "gen_logits_max": 15.556854248046875, "gen_logits_mean": -3.6283562183380127, "gen_logits_min": -15.442572593688965, "gen_logits_std": 2.487511157989502, "gen_loss": 0.3790813386440277, "grad_norm": 0.8470585001018559, "learning_rate": 2.440816326530612e-05, "loss": 0.4223, "mean_copy_accuracy": 0.9838218986988068, "mean_gen_accuracy": 0.8436463177204132, "mean_token_accuracy": 0.8727057427167892, "num_tokens": 161705897.0, "sample_num_tokens": 8085.75, "step": 599, "total_num_tokens": 161738240.0, "z_loss": 0.016100723296403885 }, { "copy_logits_max": 22.322011947631836, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.3125, "epoch": 0.12254276231810059, "gen_logits_max": 16.870901107788086, "gen_logits_mean": -4.055599689483643, "gen_logits_min": -15.72024154663086, "gen_logits_std": 2.4140567779541016, "gen_loss": 0.4240845739841461, "grad_norm": 0.8270142085836054, "learning_rate": 2.4448979591836735e-05, "loss": 0.4252, "mean_copy_accuracy": 0.97887222468853, "mean_gen_accuracy": 0.8473839163780212, "mean_token_accuracy": 0.8761058151721954, "num_tokens": 161956530.0, "sample_num_tokens": 7304.5, "step": 600, "total_num_tokens": 161985748.0, "z_loss": 0.013442069292068481 }, { "copy_logits_max": 21.648757934570312, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.3125, "epoch": 0.12274700025529742, "gen_logits_max": 16.300058364868164, "gen_logits_mean": -3.9758572578430176, "gen_logits_min": -16.08922576904297, "gen_logits_std": 2.4039740562438965, "gen_loss": 0.37237414717674255, "grad_norm": 0.7291167574093119, "learning_rate": 2.448979591836735e-05, "loss": 0.3826, "mean_copy_accuracy": 0.9858985990285873, "mean_gen_accuracy": 0.8517426997423172, "mean_token_accuracy": 0.8879782259464264, "num_tokens": 162230003.0, "sample_num_tokens": 9270.75, "step": 601, "total_num_tokens": 162267086.0, "z_loss": 0.015003175474703312 }, { "copy_logits_max": 24.856277465820312, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.6875, "epoch": 0.12295123819249426, "gen_logits_max": 16.331924438476562, "gen_logits_mean": -4.105233192443848, "gen_logits_min": -16.05693817138672, "gen_logits_std": 2.448826789855957, "gen_loss": 0.40328842401504517, "grad_norm": 0.8236137334884523, "learning_rate": 2.453061224489796e-05, "loss": 0.4069, "mean_copy_accuracy": 0.9850954860448837, "mean_gen_accuracy": 0.8412413001060486, "mean_token_accuracy": 0.8826896548271179, "num_tokens": 162517147.0, "sample_num_tokens": 7573.25, "step": 602, "total_num_tokens": 162547440.0, "z_loss": 0.01640939526259899 }, { "copy_logits_max": 22.45375633239746, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.5625, "epoch": 0.12315547612969109, "gen_logits_max": 15.848920822143555, "gen_logits_mean": -4.072406768798828, "gen_logits_min": -15.87576675415039, "gen_logits_std": 2.470634937286377, "gen_loss": 0.36491721868515015, "grad_norm": 0.9704699807818252, "learning_rate": 2.4571428571428572e-05, "loss": 0.3967, "mean_copy_accuracy": 0.9844846874475479, "mean_gen_accuracy": 0.8443542271852493, "mean_token_accuracy": 0.8829703480005264, "num_tokens": 162797071.0, "sample_num_tokens": 8968.75, "step": 603, "total_num_tokens": 162832946.0, "z_loss": 0.019395187497138977 }, { "copy_logits_max": 22.341947555541992, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.75, "epoch": 0.12335971406688792, "gen_logits_max": 15.924373626708984, "gen_logits_mean": -3.9563090801239014, "gen_logits_min": -16.359996795654297, "gen_logits_std": 2.5152950286865234, "gen_loss": 0.40159177780151367, "grad_norm": 0.8594575729675505, "learning_rate": 2.4612244897959182e-05, "loss": 0.379, "mean_copy_accuracy": 0.9830719530582428, "mean_gen_accuracy": 0.8501250445842743, "mean_token_accuracy": 0.885945662856102, "num_tokens": 163078911.0, "sample_num_tokens": 9048.25, "step": 604, "total_num_tokens": 163115104.0, "z_loss": 0.016151707619428635 }, { "copy_logits_max": 21.42209243774414, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.3125, "epoch": 0.12356395200408476, "gen_logits_max": 15.45926284790039, "gen_logits_mean": -4.10654354095459, "gen_logits_min": -16.411930084228516, "gen_logits_std": 2.528815746307373, "gen_loss": 0.37048566341400146, "grad_norm": 0.7880963279331252, "learning_rate": 2.46530612244898e-05, "loss": 0.4013, "mean_copy_accuracy": 0.9880902171134949, "mean_gen_accuracy": 0.8408937752246857, "mean_token_accuracy": 0.881359875202179, "num_tokens": 163356773.0, "sample_num_tokens": 7065.75, "step": 605, "total_num_tokens": 163385036.0, "z_loss": 0.015726782381534576 }, { "copy_logits_max": 19.363178253173828, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.125, "epoch": 0.1237681899412816, "gen_logits_max": 16.276565551757812, "gen_logits_mean": -4.279423713684082, "gen_logits_min": -16.21439552307129, "gen_logits_std": 2.384925365447998, "gen_loss": 0.3623220920562744, "grad_norm": 0.7064454917688413, "learning_rate": 2.469387755102041e-05, "loss": 0.3812, "mean_copy_accuracy": 0.9846292585134506, "mean_gen_accuracy": 0.8524429649114609, "mean_token_accuracy": 0.8878751993179321, "num_tokens": 163631978.0, "sample_num_tokens": 6961.0, "step": 606, "total_num_tokens": 163659822.0, "z_loss": 0.010736941359937191 }, { "copy_logits_max": 21.723167419433594, "copy_logits_min": -625000000.0, "copy_num_tokens": 613.625, "epoch": 0.12397242787847843, "gen_logits_max": 16.012237548828125, "gen_logits_mean": -3.8927135467529297, "gen_logits_min": -16.340288162231445, "gen_logits_std": 2.4893741607666016, "gen_loss": 0.40341493487358093, "grad_norm": 0.780782557005503, "learning_rate": 2.4734693877551022e-05, "loss": 0.3986, "mean_copy_accuracy": 0.9857029318809509, "mean_gen_accuracy": 0.8404448181390762, "mean_token_accuracy": 0.8802139163017273, "num_tokens": 163902408.0, "sample_num_tokens": 9808.5, "step": 607, "total_num_tokens": 163941642.0, "z_loss": 0.017033567652106285 }, { "copy_logits_max": 24.596120834350586, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.6875, "epoch": 0.12417666581567526, "gen_logits_max": 16.877765655517578, "gen_logits_mean": -3.8523011207580566, "gen_logits_min": -16.163236618041992, "gen_logits_std": 2.4719555377960205, "gen_loss": 0.3791419565677643, "grad_norm": 0.7089331975223244, "learning_rate": 2.4775510204081632e-05, "loss": 0.4096, "mean_copy_accuracy": 0.9842214733362198, "mean_gen_accuracy": 0.8412154763936996, "mean_token_accuracy": 0.8778082132339478, "num_tokens": 164179665.0, "sample_num_tokens": 7749.25, "step": 608, "total_num_tokens": 164210662.0, "z_loss": 0.0148696880787611 }, { "copy_logits_max": 19.067646026611328, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.125, "epoch": 0.1243809037528721, "gen_logits_max": 15.677993774414062, "gen_logits_mean": -3.549017906188965, "gen_logits_min": -16.16201400756836, "gen_logits_std": 2.5300188064575195, "gen_loss": 0.39044177532196045, "grad_norm": 0.7526086992885108, "learning_rate": 2.4816326530612246e-05, "loss": 0.4133, "mean_copy_accuracy": 0.9853514581918716, "mean_gen_accuracy": 0.8401025086641312, "mean_token_accuracy": 0.8768764436244965, "num_tokens": 164452878.0, "sample_num_tokens": 9507.0, "step": 609, "total_num_tokens": 164490906.0, "z_loss": 0.01845194771885872 }, { "copy_logits_max": 22.020641326904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.1875, "epoch": 0.12458514169006893, "gen_logits_max": 16.500804901123047, "gen_logits_mean": -4.126116752624512, "gen_logits_min": -16.211353302001953, "gen_logits_std": 2.4501166343688965, "gen_loss": 0.3757721185684204, "grad_norm": 0.7172158852025551, "learning_rate": 2.485714285714286e-05, "loss": 0.419, "mean_copy_accuracy": 0.9828884452581406, "mean_gen_accuracy": 0.8395403623580933, "mean_token_accuracy": 0.8715221136808395, "num_tokens": 164706441.0, "sample_num_tokens": 9833.25, "step": 610, "total_num_tokens": 164745774.0, "z_loss": 0.014623327180743217 }, { "copy_logits_max": 23.207799911499023, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.125, "epoch": 0.12478937962726576, "gen_logits_max": 16.46819305419922, "gen_logits_mean": -4.1291022300720215, "gen_logits_min": -16.440088272094727, "gen_logits_std": 2.4967124462127686, "gen_loss": 0.41494640707969666, "grad_norm": 1.0330504758827423, "learning_rate": 2.4897959183673473e-05, "loss": 0.3982, "mean_copy_accuracy": 0.9829449355602264, "mean_gen_accuracy": 0.8443008661270142, "mean_token_accuracy": 0.8812405318021774, "num_tokens": 164976806.0, "sample_num_tokens": 7584.5, "step": 611, "total_num_tokens": 165007144.0, "z_loss": 0.015834376215934753 }, { "copy_logits_max": 27.49776268005371, "copy_logits_min": -750000000.0, "copy_num_tokens": 745.3125, "epoch": 0.1249936175644626, "gen_logits_max": 16.20992660522461, "gen_logits_mean": -3.842076063156128, "gen_logits_min": -16.2525634765625, "gen_logits_std": 2.530259132385254, "gen_loss": 0.3780760169029236, "grad_norm": 0.6846473751772724, "learning_rate": 2.4938775510204083e-05, "loss": 0.3938, "mean_copy_accuracy": 0.9864737540483475, "mean_gen_accuracy": 0.844871386885643, "mean_token_accuracy": 0.8834524899721146, "num_tokens": 165251482.0, "sample_num_tokens": 10038.0, "step": 612, "total_num_tokens": 165291634.0, "z_loss": 0.02338505908846855 }, { "copy_logits_max": 20.530391693115234, "copy_logits_min": -687499968.0, "copy_num_tokens": 457.875, "epoch": 0.12519785550165943, "gen_logits_max": 16.207090377807617, "gen_logits_mean": -4.198713302612305, "gen_logits_min": -16.159679412841797, "gen_logits_std": 2.4440536499023438, "gen_loss": 0.413263738155365, "grad_norm": 0.7607622018742529, "learning_rate": 2.4979591836734693e-05, "loss": 0.4239, "mean_copy_accuracy": 0.9853893369436264, "mean_gen_accuracy": 0.8357185572385788, "mean_token_accuracy": 0.8732243627309799, "num_tokens": 165544182.0, "sample_num_tokens": 8981.0, "step": 613, "total_num_tokens": 165580106.0, "z_loss": 0.013654878363013268 }, { "copy_logits_max": 18.33160400390625, "copy_logits_min": -687500032.0, "copy_num_tokens": 273.8125, "epoch": 0.12540209343885628, "gen_logits_max": 16.009870529174805, "gen_logits_mean": -4.309875011444092, "gen_logits_min": -16.305404663085938, "gen_logits_std": 2.4487743377685547, "gen_loss": 0.45138537883758545, "grad_norm": 0.8234337911023796, "learning_rate": 2.5020408163265306e-05, "loss": 0.4246, "mean_copy_accuracy": 0.9795272946357727, "mean_gen_accuracy": 0.8403946608304977, "mean_token_accuracy": 0.8743274956941605, "num_tokens": 165804292.0, "sample_num_tokens": 6604.5, "step": 614, "total_num_tokens": 165830710.0, "z_loss": 0.010587451048195362 }, { "copy_logits_max": 19.15792465209961, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.375, "epoch": 0.1256063313760531, "gen_logits_max": 15.729856491088867, "gen_logits_mean": -4.408532619476318, "gen_logits_min": -16.562294006347656, "gen_logits_std": 2.4116127490997314, "gen_loss": 0.38704580068588257, "grad_norm": 0.8703817041755845, "learning_rate": 2.5061224489795916e-05, "loss": 0.3954, "mean_copy_accuracy": 0.9825059026479721, "mean_gen_accuracy": 0.8486599326133728, "mean_token_accuracy": 0.8799670189619064, "num_tokens": 166066870.0, "sample_num_tokens": 7850.0, "step": 615, "total_num_tokens": 166098270.0, "z_loss": 0.012233320623636246 }, { "copy_logits_max": 24.41310691833496, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.4375, "epoch": 0.12581056931324994, "gen_logits_max": 16.084869384765625, "gen_logits_mean": -4.372544288635254, "gen_logits_min": -16.9160213470459, "gen_logits_std": 2.454951763153076, "gen_loss": 0.39382559061050415, "grad_norm": 0.8606070906255209, "learning_rate": 2.5102040816326533e-05, "loss": 0.4027, "mean_copy_accuracy": 0.9831665307283401, "mean_gen_accuracy": 0.839385986328125, "mean_token_accuracy": 0.878168597817421, "num_tokens": 166329507.0, "sample_num_tokens": 8789.75, "step": 616, "total_num_tokens": 166364666.0, "z_loss": 0.015635419636964798 }, { "copy_logits_max": 23.3486385345459, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.375, "epoch": 0.12601480725044678, "gen_logits_max": 16.29035186767578, "gen_logits_mean": -4.32011604309082, "gen_logits_min": -16.759536743164062, "gen_logits_std": 2.4616856575012207, "gen_loss": 0.4299481511116028, "grad_norm": 0.7263822486194684, "learning_rate": 2.5142857142857143e-05, "loss": 0.4111, "mean_copy_accuracy": 0.9844736009836197, "mean_gen_accuracy": 0.8412158787250519, "mean_token_accuracy": 0.8763762265443802, "num_tokens": 166606952.0, "sample_num_tokens": 7337.5, "step": 617, "total_num_tokens": 166636302.0, "z_loss": 0.013492057099938393 }, { "copy_logits_max": 17.48221206665039, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.8125, "epoch": 0.1262190451876436, "gen_logits_max": 15.688383102416992, "gen_logits_mean": -4.287051200866699, "gen_logits_min": -16.3447322845459, "gen_logits_std": 2.4330098628997803, "gen_loss": 0.4626231789588928, "grad_norm": 0.8590840771776371, "learning_rate": 2.5183673469387757e-05, "loss": 0.4171, "mean_copy_accuracy": 0.9858473688364029, "mean_gen_accuracy": 0.8334130495786667, "mean_token_accuracy": 0.8726866543292999, "num_tokens": 166886152.0, "sample_num_tokens": 8160.5, "step": 618, "total_num_tokens": 166918794.0, "z_loss": 0.010902881622314453 }, { "copy_logits_max": 23.890419006347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.75, "epoch": 0.12642328312484044, "gen_logits_max": 16.650466918945312, "gen_logits_mean": -4.314288139343262, "gen_logits_min": -16.45880889892578, "gen_logits_std": 2.4705653190612793, "gen_loss": 0.38698869943618774, "grad_norm": 0.8029038270177176, "learning_rate": 2.5224489795918367e-05, "loss": 0.4221, "mean_copy_accuracy": 0.9840106815099716, "mean_gen_accuracy": 0.83753502368927, "mean_token_accuracy": 0.8731789439916611, "num_tokens": 167155188.0, "sample_num_tokens": 7986.5, "step": 619, "total_num_tokens": 167187134.0, "z_loss": 0.015057485550642014 }, { "copy_logits_max": 25.03317642211914, "copy_logits_min": -750000000.0, "copy_num_tokens": 715.375, "epoch": 0.12662752106203728, "gen_logits_max": 16.34696388244629, "gen_logits_mean": -3.8080058097839355, "gen_logits_min": -16.398353576660156, "gen_logits_std": 2.5313143730163574, "gen_loss": 0.3639489412307739, "grad_norm": 0.8226954994613962, "learning_rate": 2.526530612244898e-05, "loss": 0.4029, "mean_copy_accuracy": 0.9869987368583679, "mean_gen_accuracy": 0.8393623530864716, "mean_token_accuracy": 0.8819113522768021, "num_tokens": 167433097.0, "sample_num_tokens": 10087.75, "step": 620, "total_num_tokens": 167473448.0, "z_loss": 0.019437795504927635 }, { "copy_logits_max": 15.899553298950195, "copy_logits_min": -687500032.0, "copy_num_tokens": 463.5, "epoch": 0.1268317589992341, "gen_logits_max": 15.621856689453125, "gen_logits_mean": -3.833574056625366, "gen_logits_min": -16.042821884155273, "gen_logits_std": 2.527416467666626, "gen_loss": 0.43682822585105896, "grad_norm": 0.6427694219812757, "learning_rate": 2.5306122448979594e-05, "loss": 0.4027, "mean_copy_accuracy": 0.987740159034729, "mean_gen_accuracy": 0.8427183479070663, "mean_token_accuracy": 0.8798660486936569, "num_tokens": 167721466.0, "sample_num_tokens": 8885.5, "step": 621, "total_num_tokens": 167757008.0, "z_loss": 0.01103658601641655 }, { "copy_logits_max": 20.25649070739746, "copy_logits_min": -750000000.0, "copy_num_tokens": 683.5, "epoch": 0.12703599693643094, "gen_logits_max": 14.285483360290527, "gen_logits_mean": -4.231002330780029, "gen_logits_min": -16.709192276000977, "gen_logits_std": 2.570925712585449, "gen_loss": 0.3756802976131439, "grad_norm": 0.8292281821380014, "learning_rate": 2.5346938775510207e-05, "loss": 0.3909, "mean_copy_accuracy": 0.9833171218633652, "mean_gen_accuracy": 0.8472313284873962, "mean_token_accuracy": 0.8846603184938431, "num_tokens": 168003850.0, "sample_num_tokens": 9311.5, "step": 622, "total_num_tokens": 168041096.0, "z_loss": 0.016547800973057747 }, { "copy_logits_max": 20.760629653930664, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.8125, "epoch": 0.1272402348736278, "gen_logits_max": 16.325286865234375, "gen_logits_mean": -4.535768508911133, "gen_logits_min": -16.49124526977539, "gen_logits_std": 2.391369342803955, "gen_loss": 0.3452514708042145, "grad_norm": 0.6543745669386414, "learning_rate": 2.5387755102040817e-05, "loss": 0.3727, "mean_copy_accuracy": 0.9851153194904327, "mean_gen_accuracy": 0.8565613180398941, "mean_token_accuracy": 0.8870330601930618, "num_tokens": 168279679.0, "sample_num_tokens": 9425.25, "step": 623, "total_num_tokens": 168317380.0, "z_loss": 0.013098537921905518 }, { "copy_logits_max": 22.434322357177734, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.3125, "epoch": 0.1274444728108246, "gen_logits_max": 16.43758773803711, "gen_logits_mean": -4.255626678466797, "gen_logits_min": -16.44396209716797, "gen_logits_std": 2.4474194049835205, "gen_loss": 0.38388216495513916, "grad_norm": 0.6740853528085969, "learning_rate": 2.5428571428571427e-05, "loss": 0.4079, "mean_copy_accuracy": 0.984272301197052, "mean_gen_accuracy": 0.8405549377202988, "mean_token_accuracy": 0.8741308748722076, "num_tokens": 168542851.0, "sample_num_tokens": 7279.25, "step": 624, "total_num_tokens": 168571968.0, "z_loss": 0.01411854475736618 }, { "copy_logits_max": 22.977413177490234, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.875, "epoch": 0.12764871074802145, "gen_logits_max": 15.907766342163086, "gen_logits_mean": -4.343620777130127, "gen_logits_min": -16.469606399536133, "gen_logits_std": 2.496109962463379, "gen_loss": 0.3849906027317047, "grad_norm": 0.6780442699039705, "learning_rate": 2.546938775510204e-05, "loss": 0.3773, "mean_copy_accuracy": 0.9821985960006714, "mean_gen_accuracy": 0.85230453312397, "mean_token_accuracy": 0.8848642110824585, "num_tokens": 168810153.0, "sample_num_tokens": 9158.25, "step": 625, "total_num_tokens": 168846786.0, "z_loss": 0.017034973949193954 }, { "copy_logits_max": 19.351640701293945, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.75, "epoch": 0.1278529486852183, "gen_logits_max": 16.267770767211914, "gen_logits_mean": -4.852230072021484, "gen_logits_min": -16.860437393188477, "gen_logits_std": 2.4006073474884033, "gen_loss": 0.34522539377212524, "grad_norm": 0.6610251392550518, "learning_rate": 2.551020408163265e-05, "loss": 0.4017, "mean_copy_accuracy": 0.9857634603977203, "mean_gen_accuracy": 0.8450496345758438, "mean_token_accuracy": 0.8811911344528198, "num_tokens": 169088903.0, "sample_num_tokens": 8317.75, "step": 626, "total_num_tokens": 169122174.0, "z_loss": 0.01286650076508522 }, { "copy_logits_max": 21.484298706054688, "copy_logits_min": -687500032.0, "copy_num_tokens": 614.6875, "epoch": 0.1280571866224151, "gen_logits_max": 15.848639488220215, "gen_logits_mean": -4.713392734527588, "gen_logits_min": -16.639135360717773, "gen_logits_std": 2.4484572410583496, "gen_loss": 0.35930269956588745, "grad_norm": 0.907780102393675, "learning_rate": 2.5551020408163267e-05, "loss": 0.3863, "mean_copy_accuracy": 0.9878661632537842, "mean_gen_accuracy": 0.8497983664274216, "mean_token_accuracy": 0.8845741450786591, "num_tokens": 169355003.0, "sample_num_tokens": 9135.75, "step": 627, "total_num_tokens": 169391546.0, "z_loss": 0.014530817978084087 }, { "copy_logits_max": 17.66903305053711, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.12826142455961195, "gen_logits_max": 15.096187591552734, "gen_logits_mean": -4.384032249450684, "gen_logits_min": -16.80392074584961, "gen_logits_std": 2.4667117595672607, "gen_loss": 0.41930076479911804, "grad_norm": 0.7396747887606232, "learning_rate": 2.5591836734693878e-05, "loss": 0.4089, "mean_copy_accuracy": 0.9864999800920486, "mean_gen_accuracy": 0.8424056619405746, "mean_token_accuracy": 0.8769381940364838, "num_tokens": 169609437.0, "sample_num_tokens": 7860.25, "step": 628, "total_num_tokens": 169640878.0, "z_loss": 0.010534258559346199 }, { "copy_logits_max": 18.869117736816406, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.625, "epoch": 0.12846566249680877, "gen_logits_max": 15.321754455566406, "gen_logits_mean": -4.1595659255981445, "gen_logits_min": -16.511112213134766, "gen_logits_std": 2.4885969161987305, "gen_loss": 0.4002126455307007, "grad_norm": 0.7040427601369548, "learning_rate": 2.563265306122449e-05, "loss": 0.3946, "mean_copy_accuracy": 0.9877879172563553, "mean_gen_accuracy": 0.8418766558170319, "mean_token_accuracy": 0.8827900886535645, "num_tokens": 169880286.0, "sample_num_tokens": 8523.0, "step": 629, "total_num_tokens": 169914378.0, "z_loss": 0.011923497542738914 }, { "copy_logits_max": 21.617141723632812, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.125, "epoch": 0.1286699004340056, "gen_logits_max": 15.92631721496582, "gen_logits_mean": -4.660948276519775, "gen_logits_min": -16.756452560424805, "gen_logits_std": 2.429111957550049, "gen_loss": 0.3910145163536072, "grad_norm": 0.6724725151903339, "learning_rate": 2.56734693877551e-05, "loss": 0.4066, "mean_copy_accuracy": 0.9846721142530441, "mean_gen_accuracy": 0.8415653854608536, "mean_token_accuracy": 0.8758291900157928, "num_tokens": 170158456.0, "sample_num_tokens": 8674.0, "step": 630, "total_num_tokens": 170193152.0, "z_loss": 0.013772054575383663 }, { "copy_logits_max": 18.499813079833984, "copy_logits_min": -687500032.0, "copy_num_tokens": 552.25, "epoch": 0.12887413837120246, "gen_logits_max": 14.886638641357422, "gen_logits_mean": -4.278568267822266, "gen_logits_min": -16.586124420166016, "gen_logits_std": 2.487476110458374, "gen_loss": 0.36845964193344116, "grad_norm": 0.6639804776564152, "learning_rate": 2.5714285714285714e-05, "loss": 0.4013, "mean_copy_accuracy": 0.9827159345149994, "mean_gen_accuracy": 0.8450691103935242, "mean_token_accuracy": 0.8800307661294937, "num_tokens": 170449921.0, "sample_num_tokens": 10153.25, "step": 631, "total_num_tokens": 170490534.0, "z_loss": 0.014226384460926056 }, { "copy_logits_max": 16.954212188720703, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.9375, "epoch": 0.12907837630839927, "gen_logits_max": 15.756549835205078, "gen_logits_mean": -4.39249849319458, "gen_logits_min": -16.78312873840332, "gen_logits_std": 2.4310109615325928, "gen_loss": 0.3935614824295044, "grad_norm": 0.6986148213118266, "learning_rate": 2.5755102040816328e-05, "loss": 0.4074, "mean_copy_accuracy": 0.9856373071670532, "mean_gen_accuracy": 0.8435609191656113, "mean_token_accuracy": 0.8776198178529739, "num_tokens": 170727940.0, "sample_num_tokens": 7514.5, "step": 632, "total_num_tokens": 170757998.0, "z_loss": 0.011945280246436596 }, { "copy_logits_max": 18.467632293701172, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.9375, "epoch": 0.12928261424559612, "gen_logits_max": 16.179765701293945, "gen_logits_mean": -4.90981912612915, "gen_logits_min": -17.185598373413086, "gen_logits_std": 2.402517318725586, "gen_loss": 0.37720000743865967, "grad_norm": 0.663272920558955, "learning_rate": 2.579591836734694e-05, "loss": 0.4005, "mean_copy_accuracy": 0.9842279106378555, "mean_gen_accuracy": 0.8451108634471893, "mean_token_accuracy": 0.8764860332012177, "num_tokens": 170989077.0, "sample_num_tokens": 8089.75, "step": 633, "total_num_tokens": 171021436.0, "z_loss": 0.011225639842450619 }, { "copy_logits_max": 21.6884822845459, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.1875, "epoch": 0.12948685218279296, "gen_logits_max": 16.553691864013672, "gen_logits_mean": -4.406216621398926, "gen_logits_min": -16.690200805664062, "gen_logits_std": 2.4765758514404297, "gen_loss": 0.410068541765213, "grad_norm": 1.237729216638515, "learning_rate": 2.583673469387755e-05, "loss": 0.4064, "mean_copy_accuracy": 0.9870074093341827, "mean_gen_accuracy": 0.8393701314926147, "mean_token_accuracy": 0.8765228390693665, "num_tokens": 171259037.0, "sample_num_tokens": 8766.75, "step": 634, "total_num_tokens": 171294104.0, "z_loss": 0.014049977995455265 }, { "copy_logits_max": 20.570802688598633, "copy_logits_min": -750000000.0, "copy_num_tokens": 672.0, "epoch": 0.12969109011998978, "gen_logits_max": 15.622507095336914, "gen_logits_mean": -4.323765277862549, "gen_logits_min": -16.65507698059082, "gen_logits_std": 2.507436752319336, "gen_loss": 0.3737620413303375, "grad_norm": 0.8186334029378689, "learning_rate": 2.5877551020408165e-05, "loss": 0.3862, "mean_copy_accuracy": 0.9838839322328568, "mean_gen_accuracy": 0.8458986878395081, "mean_token_accuracy": 0.8830387741327286, "num_tokens": 171553990.0, "sample_num_tokens": 10389.5, "step": 635, "total_num_tokens": 171595548.0, "z_loss": 0.014567151665687561 }, { "copy_logits_max": 17.900476455688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.125, "epoch": 0.12989532805718662, "gen_logits_max": 16.433670043945312, "gen_logits_mean": -4.688279151916504, "gen_logits_min": -16.701648712158203, "gen_logits_std": 2.3875105381011963, "gen_loss": 0.3537582457065582, "grad_norm": 0.7363334413503532, "learning_rate": 2.5918367346938775e-05, "loss": 0.3786, "mean_copy_accuracy": 0.9859070777893066, "mean_gen_accuracy": 0.8566538691520691, "mean_token_accuracy": 0.8825356662273407, "num_tokens": 171827342.0, "sample_num_tokens": 8749.0, "step": 636, "total_num_tokens": 171862338.0, "z_loss": 0.01112170610576868 }, { "copy_logits_max": 24.195270538330078, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.0, "epoch": 0.13009956599438346, "gen_logits_max": 17.134883880615234, "gen_logits_mean": -4.442525863647461, "gen_logits_min": -16.932079315185547, "gen_logits_std": 2.4486279487609863, "gen_loss": 0.418281614780426, "grad_norm": 0.690861704182987, "learning_rate": 2.595918367346939e-05, "loss": 0.3999, "mean_copy_accuracy": 0.9868155270814896, "mean_gen_accuracy": 0.8448396027088165, "mean_token_accuracy": 0.8829080313444138, "num_tokens": 172122265.0, "sample_num_tokens": 8700.25, "step": 637, "total_num_tokens": 172157066.0, "z_loss": 0.013274548575282097 }, { "copy_logits_max": 19.226974487304688, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.0, "epoch": 0.13030380393158028, "gen_logits_max": 16.105487823486328, "gen_logits_mean": -4.5627760887146, "gen_logits_min": -16.89273452758789, "gen_logits_std": 2.4995601177215576, "gen_loss": 0.3454887270927429, "grad_norm": 0.9091980456342974, "learning_rate": 2.6000000000000002e-05, "loss": 0.3815, "mean_copy_accuracy": 0.9826484620571136, "mean_gen_accuracy": 0.8492050170898438, "mean_token_accuracy": 0.883181169629097, "num_tokens": 172396305.0, "sample_num_tokens": 8739.75, "step": 638, "total_num_tokens": 172431264.0, "z_loss": 0.013890683650970459 }, { "copy_logits_max": 17.54560089111328, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.375, "epoch": 0.13050804186877712, "gen_logits_max": 15.972894668579102, "gen_logits_mean": -4.8610124588012695, "gen_logits_min": -16.680255889892578, "gen_logits_std": 2.4335074424743652, "gen_loss": 0.39699265360832214, "grad_norm": 0.6271471436482816, "learning_rate": 2.6040816326530612e-05, "loss": 0.3893, "mean_copy_accuracy": 0.9874187111854553, "mean_gen_accuracy": 0.8475791811943054, "mean_token_accuracy": 0.881417766213417, "num_tokens": 172653116.0, "sample_num_tokens": 8740.5, "step": 639, "total_num_tokens": 172688078.0, "z_loss": 0.011621179059147835 }, { "copy_logits_max": 20.131690979003906, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.125, "epoch": 0.13071227980597397, "gen_logits_max": 16.49937629699707, "gen_logits_mean": -4.649075031280518, "gen_logits_min": -16.53966522216797, "gen_logits_std": 2.434718608856201, "gen_loss": 0.36902785301208496, "grad_norm": 0.7718808521841315, "learning_rate": 2.6081632653061225e-05, "loss": 0.3817, "mean_copy_accuracy": 0.9877093881368637, "mean_gen_accuracy": 0.8491043001413345, "mean_token_accuracy": 0.8828238844871521, "num_tokens": 172932556.0, "sample_num_tokens": 9273.5, "step": 640, "total_num_tokens": 172969650.0, "z_loss": 0.011915584094822407 }, { "copy_logits_max": 15.844303131103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.625, "epoch": 0.13091651774317078, "gen_logits_max": 16.089256286621094, "gen_logits_mean": -4.639400959014893, "gen_logits_min": -16.28765106201172, "gen_logits_std": 2.3956470489501953, "gen_loss": 0.4355570673942566, "grad_norm": 0.7005824104033691, "learning_rate": 2.6122448979591835e-05, "loss": 0.4086, "mean_copy_accuracy": 0.9831672161817551, "mean_gen_accuracy": 0.8399892151355743, "mean_token_accuracy": 0.8742343783378601, "num_tokens": 173222911.0, "sample_num_tokens": 8279.75, "step": 641, "total_num_tokens": 173256030.0, "z_loss": 0.009999722242355347 }, { "copy_logits_max": 21.744861602783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.125, "epoch": 0.13112075568036763, "gen_logits_max": 17.256507873535156, "gen_logits_mean": -4.643897533416748, "gen_logits_min": -16.632457733154297, "gen_logits_std": 2.4229371547698975, "gen_loss": 0.35779887437820435, "grad_norm": 0.8233670570068609, "learning_rate": 2.616326530612245e-05, "loss": 0.4141, "mean_copy_accuracy": 0.9827613532543182, "mean_gen_accuracy": 0.8380681723356247, "mean_token_accuracy": 0.8727903366088867, "num_tokens": 173480946.0, "sample_num_tokens": 8910.5, "step": 642, "total_num_tokens": 173516588.0, "z_loss": 0.01283470168709755 }, { "copy_logits_max": 19.976825714111328, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.4375, "epoch": 0.13132499361756447, "gen_logits_max": 15.325638771057129, "gen_logits_mean": -4.276610374450684, "gen_logits_min": -16.916404724121094, "gen_logits_std": 2.5503299236297607, "gen_loss": 0.3843104839324951, "grad_norm": 0.750166905114611, "learning_rate": 2.6204081632653062e-05, "loss": 0.3831, "mean_copy_accuracy": 0.9853668957948685, "mean_gen_accuracy": 0.85151506960392, "mean_token_accuracy": 0.8852737694978714, "num_tokens": 173734382.0, "sample_num_tokens": 7458.5, "step": 643, "total_num_tokens": 173764216.0, "z_loss": 0.01572532393038273 }, { "copy_logits_max": 21.428287506103516, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.1875, "epoch": 0.1315292315547613, "gen_logits_max": 15.834832191467285, "gen_logits_mean": -4.290036201477051, "gen_logits_min": -16.866043090820312, "gen_logits_std": 2.552293300628662, "gen_loss": 0.43567538261413574, "grad_norm": 0.9968409563210402, "learning_rate": 2.6244897959183676e-05, "loss": 0.4151, "mean_copy_accuracy": 0.9868814051151276, "mean_gen_accuracy": 0.8378228545188904, "mean_token_accuracy": 0.875887468457222, "num_tokens": 174015354.0, "sample_num_tokens": 8283.5, "step": 644, "total_num_tokens": 174048488.0, "z_loss": 0.01326387282460928 }, { "copy_logits_max": 13.693353652954102, "copy_logits_min": -687500032.0, "copy_num_tokens": 267.0625, "epoch": 0.13173346949195813, "gen_logits_max": 16.199180603027344, "gen_logits_mean": -4.925447463989258, "gen_logits_min": -16.81946563720703, "gen_logits_std": 2.4169836044311523, "gen_loss": 0.42981693148612976, "grad_norm": 0.6954642566526719, "learning_rate": 2.6285714285714286e-05, "loss": 0.4001, "mean_copy_accuracy": 0.9842877984046936, "mean_gen_accuracy": 0.8454145938158035, "mean_token_accuracy": 0.8778249323368073, "num_tokens": 174290914.0, "sample_num_tokens": 7434.5, "step": 645, "total_num_tokens": 174320652.0, "z_loss": 0.007009267341345549 }, { "copy_logits_max": 16.789596557617188, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.3125, "epoch": 0.13193770742915498, "gen_logits_max": 15.616219520568848, "gen_logits_mean": -4.530606746673584, "gen_logits_min": -17.104244232177734, "gen_logits_std": 2.545252799987793, "gen_loss": 0.38125383853912354, "grad_norm": 0.5807113693978725, "learning_rate": 2.63265306122449e-05, "loss": 0.4063, "mean_copy_accuracy": 0.9888439178466797, "mean_gen_accuracy": 0.8408254384994507, "mean_token_accuracy": 0.8759552836418152, "num_tokens": 174542881.0, "sample_num_tokens": 7329.75, "step": 646, "total_num_tokens": 174572200.0, "z_loss": 0.012241048738360405 }, { "copy_logits_max": 15.612356185913086, "copy_logits_min": -750000064.0, "copy_num_tokens": 359.375, "epoch": 0.1321419453663518, "gen_logits_max": 16.226665496826172, "gen_logits_mean": -5.412278652191162, "gen_logits_min": -17.258087158203125, "gen_logits_std": 2.3971405029296875, "gen_loss": 0.35583385825157166, "grad_norm": 0.5791537564959284, "learning_rate": 2.636734693877551e-05, "loss": 0.4076, "mean_copy_accuracy": 0.9869221448898315, "mean_gen_accuracy": 0.8438138216733932, "mean_token_accuracy": 0.8737915307283401, "num_tokens": 174797246.0, "sample_num_tokens": 7979.0, "step": 647, "total_num_tokens": 174829162.0, "z_loss": 0.008551817387342453 }, { "copy_logits_max": 17.788333892822266, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.9375, "epoch": 0.13234618330354864, "gen_logits_max": 16.443403244018555, "gen_logits_mean": -4.946476459503174, "gen_logits_min": -17.449512481689453, "gen_logits_std": 2.5297863483428955, "gen_loss": 0.3612382411956787, "grad_norm": 0.7682087269421073, "learning_rate": 2.6408163265306126e-05, "loss": 0.409, "mean_copy_accuracy": 0.9820966571569443, "mean_gen_accuracy": 0.8451332151889801, "mean_token_accuracy": 0.8764683604240417, "num_tokens": 175036019.0, "sample_num_tokens": 9238.25, "step": 648, "total_num_tokens": 175072972.0, "z_loss": 0.012366567738354206 }, { "copy_logits_max": 12.875129699707031, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.25, "epoch": 0.13255042124074548, "gen_logits_max": 15.394845962524414, "gen_logits_mean": -5.068974494934082, "gen_logits_min": -17.207902908325195, "gen_logits_std": 2.4627599716186523, "gen_loss": 0.381034791469574, "grad_norm": 0.5535103233027054, "learning_rate": 2.6448979591836736e-05, "loss": 0.3613, "mean_copy_accuracy": 0.9874717146158218, "mean_gen_accuracy": 0.8556795865297318, "mean_token_accuracy": 0.8886151611804962, "num_tokens": 175310324.0, "sample_num_tokens": 9292.0, "step": 649, "total_num_tokens": 175347492.0, "z_loss": 0.008000190369784832 }, { "copy_logits_max": 16.379907608032227, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.9375, "epoch": 0.1327546591779423, "gen_logits_max": 16.349071502685547, "gen_logits_mean": -4.902191162109375, "gen_logits_min": -16.993724822998047, "gen_logits_std": 2.4860808849334717, "gen_loss": 0.3879592716693878, "grad_norm": 0.8672112829354156, "learning_rate": 2.6489795918367346e-05, "loss": 0.4066, "mean_copy_accuracy": 0.9840118139982224, "mean_gen_accuracy": 0.8453007340431213, "mean_token_accuracy": 0.8790387213230133, "num_tokens": 175574561.0, "sample_num_tokens": 7189.75, "step": 650, "total_num_tokens": 175603320.0, "z_loss": 0.010171818546950817 }, { "copy_logits_max": 18.563182830810547, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.0, "epoch": 0.13295889711513914, "gen_logits_max": 16.329572677612305, "gen_logits_mean": -4.6819353103637695, "gen_logits_min": -16.839622497558594, "gen_logits_std": 2.479012966156006, "gen_loss": 0.38811978697776794, "grad_norm": 0.66261595289973, "learning_rate": 2.653061224489796e-05, "loss": 0.3915, "mean_copy_accuracy": 0.9864528924226761, "mean_gen_accuracy": 0.8485254794359207, "mean_token_accuracy": 0.8841638267040253, "num_tokens": 175847374.0, "sample_num_tokens": 9991.0, "step": 651, "total_num_tokens": 175887338.0, "z_loss": 0.012706137262284756 }, { "copy_logits_max": 18.041519165039062, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.875, "epoch": 0.13316313505233598, "gen_logits_max": 16.783939361572266, "gen_logits_mean": -4.805502891540527, "gen_logits_min": -17.19322967529297, "gen_logits_std": 2.4710781574249268, "gen_loss": 0.34069743752479553, "grad_norm": 0.7308737917647556, "learning_rate": 2.657142857142857e-05, "loss": 0.3933, "mean_copy_accuracy": 0.9870223850011826, "mean_gen_accuracy": 0.8432271927595139, "mean_token_accuracy": 0.8792139440774918, "num_tokens": 176140648.0, "sample_num_tokens": 9404.0, "step": 652, "total_num_tokens": 176178264.0, "z_loss": 0.01201932318508625 }, { "copy_logits_max": 20.190290451049805, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.375, "epoch": 0.1333673729895328, "gen_logits_max": 16.271800994873047, "gen_logits_mean": -4.450051307678223, "gen_logits_min": -16.718441009521484, "gen_logits_std": 2.511348009109497, "gen_loss": 0.3946356475353241, "grad_norm": 0.7624231115966675, "learning_rate": 2.6612244897959187e-05, "loss": 0.412, "mean_copy_accuracy": 0.9861015230417252, "mean_gen_accuracy": 0.8373170495033264, "mean_token_accuracy": 0.8765222281217575, "num_tokens": 176412730.0, "sample_num_tokens": 7653.5, "step": 653, "total_num_tokens": 176443344.0, "z_loss": 0.012684437446296215 }, { "copy_logits_max": 17.76388168334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.375, "epoch": 0.13357161092672964, "gen_logits_max": 15.980035781860352, "gen_logits_mean": -4.549898147583008, "gen_logits_min": -16.938257217407227, "gen_logits_std": 2.4462714195251465, "gen_loss": 0.38371527194976807, "grad_norm": 0.7604611142063984, "learning_rate": 2.6653061224489797e-05, "loss": 0.3704, "mean_copy_accuracy": 0.983222097158432, "mean_gen_accuracy": 0.8587882071733475, "mean_token_accuracy": 0.886791929602623, "num_tokens": 176671389.0, "sample_num_tokens": 8547.25, "step": 654, "total_num_tokens": 176705578.0, "z_loss": 0.010530160740017891 }, { "copy_logits_max": 17.12710189819336, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.75, "epoch": 0.13377584886392646, "gen_logits_max": 15.653542518615723, "gen_logits_mean": -5.216041564941406, "gen_logits_min": -16.91744613647461, "gen_logits_std": 2.435192108154297, "gen_loss": 0.3831408619880676, "grad_norm": 0.6717037329076602, "learning_rate": 2.669387755102041e-05, "loss": 0.408, "mean_copy_accuracy": 0.9809501320123672, "mean_gen_accuracy": 0.8487902134656906, "mean_token_accuracy": 0.875378891825676, "num_tokens": 176916370.0, "sample_num_tokens": 9304.5, "step": 655, "total_num_tokens": 176953588.0, "z_loss": 0.010083301924169064 }, { "copy_logits_max": 17.33758544921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.375, "epoch": 0.1339800868011233, "gen_logits_max": 15.908819198608398, "gen_logits_mean": -5.039874076843262, "gen_logits_min": -17.729665756225586, "gen_logits_std": 2.498850107192993, "gen_loss": 0.3463481664657593, "grad_norm": 0.7082186537650185, "learning_rate": 2.673469387755102e-05, "loss": 0.4148, "mean_copy_accuracy": 0.9874549955129623, "mean_gen_accuracy": 0.8404889404773712, "mean_token_accuracy": 0.8779814094305038, "num_tokens": 177194362.0, "sample_num_tokens": 7828.0, "step": 656, "total_num_tokens": 177225674.0, "z_loss": 0.012402425520122051 }, { "copy_logits_max": 19.6497745513916, "copy_logits_min": -687499968.0, "copy_num_tokens": 546.4375, "epoch": 0.13418432473832015, "gen_logits_max": 15.413349151611328, "gen_logits_mean": -4.700464725494385, "gen_logits_min": -16.492984771728516, "gen_logits_std": 2.4442405700683594, "gen_loss": 0.3647080957889557, "grad_norm": 0.6740573468241365, "learning_rate": 2.6775510204081634e-05, "loss": 0.3811, "mean_copy_accuracy": 0.9837018400430679, "mean_gen_accuracy": 0.850310817360878, "mean_token_accuracy": 0.8849144279956818, "num_tokens": 177476888.0, "sample_num_tokens": 8348.5, "step": 657, "total_num_tokens": 177510282.0, "z_loss": 0.012620982713997364 }, { "copy_logits_max": 16.10885238647461, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.6875, "epoch": 0.13438856267551696, "gen_logits_max": 15.754714965820312, "gen_logits_mean": -5.383548736572266, "gen_logits_min": -16.882705688476562, "gen_logits_std": 2.39265513420105, "gen_loss": 0.3611234426498413, "grad_norm": 0.7494372436659712, "learning_rate": 2.6816326530612244e-05, "loss": 0.3892, "mean_copy_accuracy": 0.9828973114490509, "mean_gen_accuracy": 0.8511276245117188, "mean_token_accuracy": 0.8798287510871887, "num_tokens": 177738881.0, "sample_num_tokens": 7542.25, "step": 658, "total_num_tokens": 177769050.0, "z_loss": 0.008313504979014397 }, { "copy_logits_max": 12.379753112792969, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.75, "epoch": 0.1345928006127138, "gen_logits_max": 15.211761474609375, "gen_logits_mean": -5.153092384338379, "gen_logits_min": -16.93073844909668, "gen_logits_std": 2.413508892059326, "gen_loss": 0.36307209730148315, "grad_norm": 0.7488448017530033, "learning_rate": 2.685714285714286e-05, "loss": 0.4073, "mean_copy_accuracy": 0.9838044345378876, "mean_gen_accuracy": 0.8479786813259125, "mean_token_accuracy": 0.8749988824129105, "num_tokens": 178007310.0, "sample_num_tokens": 7197.0, "step": 659, "total_num_tokens": 178036098.0, "z_loss": 0.007115609478205442 }, { "copy_logits_max": 19.134685516357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.1875, "epoch": 0.13479703854991065, "gen_logits_max": 15.779478073120117, "gen_logits_mean": -4.4625163078308105, "gen_logits_min": -16.91585922241211, "gen_logits_std": 2.5380053520202637, "gen_loss": 0.38474616408348083, "grad_norm": 0.6926600313384529, "learning_rate": 2.689795918367347e-05, "loss": 0.3947, "mean_copy_accuracy": 0.9888382256031036, "mean_gen_accuracy": 0.8411625325679779, "mean_token_accuracy": 0.8813851177692413, "num_tokens": 178296202.0, "sample_num_tokens": 9614.5, "step": 660, "total_num_tokens": 178334660.0, "z_loss": 0.01290521863847971 }, { "copy_logits_max": 20.678817749023438, "copy_logits_min": -750000000.0, "copy_num_tokens": 675.3125, "epoch": 0.13500127648710747, "gen_logits_max": 15.846402168273926, "gen_logits_mean": -4.060186386108398, "gen_logits_min": -16.675800323486328, "gen_logits_std": 2.596184253692627, "gen_loss": 0.40486541390419006, "grad_norm": 0.6950089567881217, "learning_rate": 2.6938775510204084e-05, "loss": 0.3907, "mean_copy_accuracy": 0.9827812016010284, "mean_gen_accuracy": 0.8424310684204102, "mean_token_accuracy": 0.8801878839731216, "num_tokens": 178563456.0, "sample_num_tokens": 9092.5, "step": 661, "total_num_tokens": 178599826.0, "z_loss": 0.017011208459734917 }, { "copy_logits_max": 20.81832504272461, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.375, "epoch": 0.1352055144243043, "gen_logits_max": 16.48387908935547, "gen_logits_mean": -4.9629716873168945, "gen_logits_min": -17.275440216064453, "gen_logits_std": 2.4686131477355957, "gen_loss": 0.36959323287010193, "grad_norm": 0.7307801778657049, "learning_rate": 2.6979591836734694e-05, "loss": 0.3907, "mean_copy_accuracy": 0.9832115322351456, "mean_gen_accuracy": 0.8474203497171402, "mean_token_accuracy": 0.8811144679784775, "num_tokens": 178849223.0, "sample_num_tokens": 8264.75, "step": 662, "total_num_tokens": 178882282.0, "z_loss": 0.01147700659930706 }, { "copy_logits_max": 19.743690490722656, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.1875, "epoch": 0.13540975236150116, "gen_logits_max": 16.537979125976562, "gen_logits_mean": -5.027658939361572, "gen_logits_min": -17.349773406982422, "gen_logits_std": 2.460925579071045, "gen_loss": 0.3931628465652466, "grad_norm": 0.6962087427833421, "learning_rate": 2.7020408163265304e-05, "loss": 0.3931, "mean_copy_accuracy": 0.9859281480312347, "mean_gen_accuracy": 0.8471900671720505, "mean_token_accuracy": 0.8807152062654495, "num_tokens": 179120313.0, "sample_num_tokens": 7311.75, "step": 663, "total_num_tokens": 179149560.0, "z_loss": 0.010764699429273605 }, { "copy_logits_max": 18.93813705444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 723.0625, "epoch": 0.13561399029869797, "gen_logits_max": 15.564993858337402, "gen_logits_mean": -4.350648880004883, "gen_logits_min": -17.320804595947266, "gen_logits_std": 2.551089286804199, "gen_loss": 0.36535757780075073, "grad_norm": 0.8203336625119889, "learning_rate": 2.706122448979592e-05, "loss": 0.388, "mean_copy_accuracy": 0.9845933318138123, "mean_gen_accuracy": 0.8455344438552856, "mean_token_accuracy": 0.8828648030757904, "num_tokens": 179394548.0, "sample_num_tokens": 9947.0, "step": 664, "total_num_tokens": 179434336.0, "z_loss": 0.013878149911761284 }, { "copy_logits_max": 16.914413452148438, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0625, "epoch": 0.13581822823589482, "gen_logits_max": 15.437234878540039, "gen_logits_mean": -4.798542499542236, "gen_logits_min": -17.002145767211914, "gen_logits_std": 2.4566173553466797, "gen_loss": 0.39009249210357666, "grad_norm": 0.6544995233238987, "learning_rate": 2.710204081632653e-05, "loss": 0.3802, "mean_copy_accuracy": 0.9856847524642944, "mean_gen_accuracy": 0.853822723031044, "mean_token_accuracy": 0.8833861947059631, "num_tokens": 179670696.0, "sample_num_tokens": 8784.0, "step": 665, "total_num_tokens": 179705832.0, "z_loss": 0.010427611880004406 }, { "copy_logits_max": 14.820374488830566, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.6875, "epoch": 0.13602246617309166, "gen_logits_max": 15.504173278808594, "gen_logits_mean": -5.115266799926758, "gen_logits_min": -17.383480072021484, "gen_logits_std": 2.4972500801086426, "gen_loss": 0.4060891270637512, "grad_norm": 0.6885939441595772, "learning_rate": 2.7142857142857144e-05, "loss": 0.3899, "mean_copy_accuracy": 0.9833433479070663, "mean_gen_accuracy": 0.8475931882858276, "mean_token_accuracy": 0.8793547004461288, "num_tokens": 179936266.0, "sample_num_tokens": 8827.5, "step": 666, "total_num_tokens": 179971576.0, "z_loss": 0.00877715926617384 }, { "copy_logits_max": 16.7578125, "copy_logits_min": -687499904.0, "copy_num_tokens": 381.625, "epoch": 0.13622670411028848, "gen_logits_max": 15.387954711914062, "gen_logits_mean": -4.786412715911865, "gen_logits_min": -17.026939392089844, "gen_logits_std": 2.5066657066345215, "gen_loss": 0.38009676337242126, "grad_norm": 0.66748607785745, "learning_rate": 2.7183673469387754e-05, "loss": 0.3992, "mean_copy_accuracy": 0.9822088181972504, "mean_gen_accuracy": 0.8433169424533844, "mean_token_accuracy": 0.8793205171823502, "num_tokens": 180229221.0, "sample_num_tokens": 7912.25, "step": 667, "total_num_tokens": 180260870.0, "z_loss": 0.009195608086884022 }, { "copy_logits_max": 17.007966995239258, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.875, "epoch": 0.13643094204748532, "gen_logits_max": 15.864477157592773, "gen_logits_mean": -4.4208526611328125, "gen_logits_min": -16.525941848754883, "gen_logits_std": 2.5447421073913574, "gen_loss": 0.3455590605735779, "grad_norm": 0.6666959983347479, "learning_rate": 2.7224489795918368e-05, "loss": 0.3837, "mean_copy_accuracy": 0.9848169088363647, "mean_gen_accuracy": 0.8493400365114212, "mean_token_accuracy": 0.8828539252281189, "num_tokens": 180495274.0, "sample_num_tokens": 8280.0, "step": 668, "total_num_tokens": 180528394.0, "z_loss": 0.012539350427687168 }, { "copy_logits_max": 21.155269622802734, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.4375, "epoch": 0.13663517998468216, "gen_logits_max": 16.71491241455078, "gen_logits_mean": -4.624629974365234, "gen_logits_min": -17.549854278564453, "gen_logits_std": 2.5759339332580566, "gen_loss": 0.3624105155467987, "grad_norm": 0.6428355543335232, "learning_rate": 2.726530612244898e-05, "loss": 0.3932, "mean_copy_accuracy": 0.9870201200246811, "mean_gen_accuracy": 0.8410477042198181, "mean_token_accuracy": 0.8816345483064651, "num_tokens": 180783957.0, "sample_num_tokens": 8392.75, "step": 669, "total_num_tokens": 180817528.0, "z_loss": 0.015597295947372913 }, { "copy_logits_max": 16.116043090820312, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.1875, "epoch": 0.13683941792187898, "gen_logits_max": 15.998050689697266, "gen_logits_mean": -4.854069709777832, "gen_logits_min": -17.337059020996094, "gen_logits_std": 2.493701934814453, "gen_loss": 0.3766886591911316, "grad_norm": 0.6515677096831296, "learning_rate": 2.7306122448979595e-05, "loss": 0.4039, "mean_copy_accuracy": 0.9855637699365616, "mean_gen_accuracy": 0.8473347425460815, "mean_token_accuracy": 0.8806547522544861, "num_tokens": 181047572.0, "sample_num_tokens": 9599.0, "step": 670, "total_num_tokens": 181085968.0, "z_loss": 0.010900705121457577 }, { "copy_logits_max": 20.522151947021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.75, "epoch": 0.13704365585907582, "gen_logits_max": 16.246536254882812, "gen_logits_mean": -4.5896525382995605, "gen_logits_min": -16.79972267150879, "gen_logits_std": 2.4402008056640625, "gen_loss": 0.3660353124141693, "grad_norm": 0.5819142647796, "learning_rate": 2.7346938775510205e-05, "loss": 0.4047, "mean_copy_accuracy": 0.985800176858902, "mean_gen_accuracy": 0.8410737961530685, "mean_token_accuracy": 0.8745775371789932, "num_tokens": 181311485.0, "sample_num_tokens": 9535.75, "step": 671, "total_num_tokens": 181349628.0, "z_loss": 0.012905146926641464 }, { "copy_logits_max": 14.43852424621582, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.625, "epoch": 0.13724789379627267, "gen_logits_max": 15.81624698638916, "gen_logits_mean": -5.181545257568359, "gen_logits_min": -17.379159927368164, "gen_logits_std": 2.4475412368774414, "gen_loss": 0.3355512022972107, "grad_norm": 0.7623084303699815, "learning_rate": 2.7387755102040818e-05, "loss": 0.3925, "mean_copy_accuracy": 0.9829261153936386, "mean_gen_accuracy": 0.8469642400741577, "mean_token_accuracy": 0.8781454116106033, "num_tokens": 181585208.0, "sample_num_tokens": 8258.0, "step": 672, "total_num_tokens": 181618240.0, "z_loss": 0.008272793143987656 }, { "copy_logits_max": 13.66450309753418, "copy_logits_min": -687499968.0, "copy_num_tokens": 448.8125, "epoch": 0.13745213173346948, "gen_logits_max": 15.09178352355957, "gen_logits_mean": -5.165999889373779, "gen_logits_min": -17.221302032470703, "gen_logits_std": 2.456139087677002, "gen_loss": 0.3328888416290283, "grad_norm": 0.6623521393942048, "learning_rate": 2.7428571428571428e-05, "loss": 0.3697, "mean_copy_accuracy": 0.988263726234436, "mean_gen_accuracy": 0.8532299250364304, "mean_token_accuracy": 0.8895063996315002, "num_tokens": 181866181.0, "sample_num_tokens": 8298.25, "step": 673, "total_num_tokens": 181899374.0, "z_loss": 0.00900975614786148 }, { "copy_logits_max": 19.17270278930664, "copy_logits_min": -687500032.0, "copy_num_tokens": 655.625, "epoch": 0.13765636967066633, "gen_logits_max": 15.529443740844727, "gen_logits_mean": -5.19095516204834, "gen_logits_min": -17.694583892822266, "gen_logits_std": 2.4807615280151367, "gen_loss": 0.3261430859565735, "grad_norm": 0.7858652229837589, "learning_rate": 2.746938775510204e-05, "loss": 0.3751, "mean_copy_accuracy": 0.9887284338474274, "mean_gen_accuracy": 0.8496029078960419, "mean_token_accuracy": 0.8849504142999649, "num_tokens": 182136912.0, "sample_num_tokens": 9852.5, "step": 674, "total_num_tokens": 182176322.0, "z_loss": 0.012173322960734367 }, { "copy_logits_max": 17.546672821044922, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.9375, "epoch": 0.13786060760786317, "gen_logits_max": 15.824127197265625, "gen_logits_mean": -5.094002723693848, "gen_logits_min": -17.32088851928711, "gen_logits_std": 2.4522271156311035, "gen_loss": 0.4118703603744507, "grad_norm": 0.8108458160182137, "learning_rate": 2.7510204081632655e-05, "loss": 0.4154, "mean_copy_accuracy": 0.9806054830551147, "mean_gen_accuracy": 0.8429392576217651, "mean_token_accuracy": 0.8726202845573425, "num_tokens": 182393384.0, "sample_num_tokens": 7295.0, "step": 675, "total_num_tokens": 182422564.0, "z_loss": 0.009301779791712761 }, { "copy_logits_max": 19.608074188232422, "copy_logits_min": -624999936.0, "copy_num_tokens": 637.6875, "epoch": 0.13806484554506, "gen_logits_max": 16.261089324951172, "gen_logits_mean": -4.930426597595215, "gen_logits_min": -17.02802848815918, "gen_logits_std": 2.488874912261963, "gen_loss": 0.34829574823379517, "grad_norm": 0.7750501117774987, "learning_rate": 2.7551020408163265e-05, "loss": 0.3856, "mean_copy_accuracy": 0.9879430532455444, "mean_gen_accuracy": 0.8475752621889114, "mean_token_accuracy": 0.8815668374300003, "num_tokens": 182653110.0, "sample_num_tokens": 10006.5, "step": 676, "total_num_tokens": 182693136.0, "z_loss": 0.012654132209718227 }, { "copy_logits_max": 18.43364715576172, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.1875, "epoch": 0.13826908348225683, "gen_logits_max": 16.198951721191406, "gen_logits_mean": -5.115902900695801, "gen_logits_min": -16.795948028564453, "gen_logits_std": 2.430668592453003, "gen_loss": 0.4157106876373291, "grad_norm": 0.6881381075725996, "learning_rate": 2.759183673469388e-05, "loss": 0.3941, "mean_copy_accuracy": 0.9871301352977753, "mean_gen_accuracy": 0.8441689759492874, "mean_token_accuracy": 0.8796756118535995, "num_tokens": 182919338.0, "sample_num_tokens": 8521.0, "step": 677, "total_num_tokens": 182953422.0, "z_loss": 0.009665168821811676 }, { "copy_logits_max": 15.808991432189941, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.75, "epoch": 0.13847332141945368, "gen_logits_max": 15.407998085021973, "gen_logits_mean": -5.008121967315674, "gen_logits_min": -17.224063873291016, "gen_logits_std": 2.463172197341919, "gen_loss": 0.38550424575805664, "grad_norm": 0.6785189416740399, "learning_rate": 2.763265306122449e-05, "loss": 0.3833, "mean_copy_accuracy": 0.9871000051498413, "mean_gen_accuracy": 0.8494274914264679, "mean_token_accuracy": 0.8813721090555191, "num_tokens": 183172470.0, "sample_num_tokens": 7657.5, "step": 678, "total_num_tokens": 183203100.0, "z_loss": 0.010175167582929134 }, { "copy_logits_max": 14.810175895690918, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.625, "epoch": 0.1386775593566505, "gen_logits_max": 15.340017318725586, "gen_logits_mean": -5.12026834487915, "gen_logits_min": -17.6748046875, "gen_logits_std": 2.4909214973449707, "gen_loss": 0.3806920051574707, "grad_norm": 0.6863385400979237, "learning_rate": 2.7673469387755102e-05, "loss": 0.414, "mean_copy_accuracy": 0.9834612309932709, "mean_gen_accuracy": 0.8449953645467758, "mean_token_accuracy": 0.8730301558971405, "num_tokens": 183415606.0, "sample_num_tokens": 8852.5, "step": 679, "total_num_tokens": 183451016.0, "z_loss": 0.009978409856557846 }, { "copy_logits_max": 16.477413177490234, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.9375, "epoch": 0.13888179729384734, "gen_logits_max": 15.63621711730957, "gen_logits_mean": -5.179643154144287, "gen_logits_min": -17.20537567138672, "gen_logits_std": 2.5104451179504395, "gen_loss": 0.3495471775531769, "grad_norm": 0.7762410787172329, "learning_rate": 2.7714285714285716e-05, "loss": 0.388, "mean_copy_accuracy": 0.9853344857692719, "mean_gen_accuracy": 0.8437167406082153, "mean_token_accuracy": 0.8832719177007675, "num_tokens": 183712934.0, "sample_num_tokens": 9217.5, "step": 680, "total_num_tokens": 183749804.0, "z_loss": 0.012341894209384918 }, { "copy_logits_max": 17.103797912597656, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.75, "epoch": 0.13908603523104415, "gen_logits_max": 16.210939407348633, "gen_logits_mean": -4.812830924987793, "gen_logits_min": -17.551177978515625, "gen_logits_std": 2.543034553527832, "gen_loss": 0.42368853092193604, "grad_norm": 0.7856196277943296, "learning_rate": 2.775510204081633e-05, "loss": 0.3888, "mean_copy_accuracy": 0.9864403605461121, "mean_gen_accuracy": 0.8416000157594681, "mean_token_accuracy": 0.8786504417657852, "num_tokens": 183983263.0, "sample_num_tokens": 7977.75, "step": 681, "total_num_tokens": 184015174.0, "z_loss": 0.010530669242143631 }, { "copy_logits_max": 16.645793914794922, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.625, "epoch": 0.139290273168241, "gen_logits_max": 15.795769691467285, "gen_logits_mean": -5.178915977478027, "gen_logits_min": -17.249853134155273, "gen_logits_std": 2.4610419273376465, "gen_loss": 0.3688347339630127, "grad_norm": 0.5948645022173128, "learning_rate": 2.779591836734694e-05, "loss": 0.3717, "mean_copy_accuracy": 0.988019585609436, "mean_gen_accuracy": 0.8504345417022705, "mean_token_accuracy": 0.8864084780216217, "num_tokens": 184242759.0, "sample_num_tokens": 7968.75, "step": 682, "total_num_tokens": 184274634.0, "z_loss": 0.010127482935786247 }, { "copy_logits_max": 16.98849868774414, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.875, "epoch": 0.13949451110543784, "gen_logits_max": 15.151185989379883, "gen_logits_mean": -5.184455871582031, "gen_logits_min": -18.53091812133789, "gen_logits_std": 2.509524345397949, "gen_loss": 0.3797690272331238, "grad_norm": 0.6668243851689835, "learning_rate": 2.7836734693877553e-05, "loss": 0.3837, "mean_copy_accuracy": 0.9868702739477158, "mean_gen_accuracy": 0.8461553901433945, "mean_token_accuracy": 0.8823357820510864, "num_tokens": 184518022.0, "sample_num_tokens": 6931.0, "step": 683, "total_num_tokens": 184545746.0, "z_loss": 0.009235687553882599 }, { "copy_logits_max": 15.466241836547852, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.0625, "epoch": 0.13969874904263466, "gen_logits_max": 15.79810905456543, "gen_logits_mean": -5.572818279266357, "gen_logits_min": -17.36869239807129, "gen_logits_std": 2.4324283599853516, "gen_loss": 0.36789506673812866, "grad_norm": 0.6009352493723976, "learning_rate": 2.7877551020408163e-05, "loss": 0.3618, "mean_copy_accuracy": 0.987128272652626, "mean_gen_accuracy": 0.8550041764974594, "mean_token_accuracy": 0.8863987028598785, "num_tokens": 184795707.0, "sample_num_tokens": 6962.25, "step": 684, "total_num_tokens": 184823556.0, "z_loss": 0.007791951298713684 }, { "copy_logits_max": 21.41615867614746, "copy_logits_min": -687500032.0, "copy_num_tokens": 555.75, "epoch": 0.1399029869798315, "gen_logits_max": 16.000547409057617, "gen_logits_mean": -5.269501686096191, "gen_logits_min": -17.75234603881836, "gen_logits_std": 2.525299072265625, "gen_loss": 0.3658902645111084, "grad_norm": 0.7856153855228382, "learning_rate": 2.7918367346938776e-05, "loss": 0.3669, "mean_copy_accuracy": 0.9887222647666931, "mean_gen_accuracy": 0.8494429439306259, "mean_token_accuracy": 0.8870916813611984, "num_tokens": 185048385.0, "sample_num_tokens": 8411.75, "step": 685, "total_num_tokens": 185082032.0, "z_loss": 0.012160719372332096 }, { "copy_logits_max": 13.60773754119873, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.0, "epoch": 0.14010722491702834, "gen_logits_max": 15.157299995422363, "gen_logits_mean": -5.105253219604492, "gen_logits_min": -17.057809829711914, "gen_logits_std": 2.4565510749816895, "gen_loss": 0.39901208877563477, "grad_norm": 0.738746744159698, "learning_rate": 2.795918367346939e-05, "loss": 0.3896, "mean_copy_accuracy": 0.9820175468921661, "mean_gen_accuracy": 0.8481120020151138, "mean_token_accuracy": 0.877314567565918, "num_tokens": 185310823.0, "sample_num_tokens": 9143.75, "step": 686, "total_num_tokens": 185347398.0, "z_loss": 0.007621029391884804 }, { "copy_logits_max": 19.259410858154297, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.9375, "epoch": 0.14031146285422516, "gen_logits_max": 16.020000457763672, "gen_logits_mean": -5.147309303283691, "gen_logits_min": -17.57257843017578, "gen_logits_std": 2.4919989109039307, "gen_loss": 0.4004926085472107, "grad_norm": 0.736291629141578, "learning_rate": 2.8e-05, "loss": 0.3932, "mean_copy_accuracy": 0.9851460456848145, "mean_gen_accuracy": 0.842598095536232, "mean_token_accuracy": 0.8773498386144638, "num_tokens": 185584338.0, "sample_num_tokens": 8708.5, "step": 687, "total_num_tokens": 185619172.0, "z_loss": 0.008967721834778786 }, { "copy_logits_max": 17.96731185913086, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5, "epoch": 0.140515700791422, "gen_logits_max": 16.036712646484375, "gen_logits_mean": -5.488151550292969, "gen_logits_min": -17.82833480834961, "gen_logits_std": 2.4746532440185547, "gen_loss": 0.4014870524406433, "grad_norm": 0.7012702860674683, "learning_rate": 2.8040816326530613e-05, "loss": 0.3786, "mean_copy_accuracy": 0.9877855032682419, "mean_gen_accuracy": 0.8457788228988647, "mean_token_accuracy": 0.8844927996397018, "num_tokens": 185874949.0, "sample_num_tokens": 8328.25, "step": 688, "total_num_tokens": 185908262.0, "z_loss": 0.009068729355931282 }, { "copy_logits_max": 19.415172576904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.5, "epoch": 0.14071993872861885, "gen_logits_max": 15.706530570983887, "gen_logits_mean": -4.900241851806641, "gen_logits_min": -17.315282821655273, "gen_logits_std": 2.5343894958496094, "gen_loss": 0.39430493116378784, "grad_norm": 0.7688611344846081, "learning_rate": 2.8081632653061223e-05, "loss": 0.4002, "mean_copy_accuracy": 0.9855367094278336, "mean_gen_accuracy": 0.8430269658565521, "mean_token_accuracy": 0.8777913153171539, "num_tokens": 186137803.0, "sample_num_tokens": 9929.25, "step": 689, "total_num_tokens": 186177520.0, "z_loss": 0.011811108328402042 }, { "copy_logits_max": 15.416372299194336, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.4375, "epoch": 0.14092417666581566, "gen_logits_max": 16.0505313873291, "gen_logits_mean": -5.507070541381836, "gen_logits_min": -17.3906192779541, "gen_logits_std": 2.469237804412842, "gen_loss": 0.3882790207862854, "grad_norm": 0.6707683542061612, "learning_rate": 2.8122448979591837e-05, "loss": 0.3906, "mean_copy_accuracy": 0.9871674329042435, "mean_gen_accuracy": 0.8434496074914932, "mean_token_accuracy": 0.8796336203813553, "num_tokens": 186428259.0, "sample_num_tokens": 8244.25, "step": 690, "total_num_tokens": 186461236.0, "z_loss": 0.008161228150129318 }, { "copy_logits_max": 16.643905639648438, "copy_logits_min": -687499968.0, "copy_num_tokens": 504.875, "epoch": 0.1411284146030125, "gen_logits_max": 15.917179107666016, "gen_logits_mean": -5.503913402557373, "gen_logits_min": -17.820098876953125, "gen_logits_std": 2.47585391998291, "gen_loss": 0.370656281709671, "grad_norm": 0.6574580171477149, "learning_rate": 2.816326530612245e-05, "loss": 0.3845, "mean_copy_accuracy": 0.9893069714307785, "mean_gen_accuracy": 0.8447289019823074, "mean_token_accuracy": 0.8820696622133255, "num_tokens": 186695175.0, "sample_num_tokens": 7923.75, "step": 691, "total_num_tokens": 186726870.0, "z_loss": 0.008647468872368336 }, { "copy_logits_max": 17.974048614501953, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.0, "epoch": 0.14133265254020935, "gen_logits_max": 15.498048782348633, "gen_logits_mean": -4.813558578491211, "gen_logits_min": -17.576702117919922, "gen_logits_std": 2.5691986083984375, "gen_loss": 0.35547614097595215, "grad_norm": 0.6597802479621947, "learning_rate": 2.8204081632653063e-05, "loss": 0.3651, "mean_copy_accuracy": 0.9832300990819931, "mean_gen_accuracy": 0.8588321805000305, "mean_token_accuracy": 0.8877544552087784, "num_tokens": 186970318.0, "sample_num_tokens": 9863.5, "step": 692, "total_num_tokens": 187009772.0, "z_loss": 0.011400770395994186 }, { "copy_logits_max": 18.73613929748535, "copy_logits_min": -687499968.0, "copy_num_tokens": 450.3125, "epoch": 0.14153689047740617, "gen_logits_max": 16.905574798583984, "gen_logits_mean": -5.216085433959961, "gen_logits_min": -17.297775268554688, "gen_logits_std": 2.462191104888916, "gen_loss": 0.4318985044956207, "grad_norm": 0.49430529734252937, "learning_rate": 2.8244897959183673e-05, "loss": 0.3825, "mean_copy_accuracy": 0.9886067807674408, "mean_gen_accuracy": 0.8481471687555313, "mean_token_accuracy": 0.8817516267299652, "num_tokens": 187280214.0, "sample_num_tokens": 8869.5, "step": 693, "total_num_tokens": 187315692.0, "z_loss": 0.009492478333413601 }, { "copy_logits_max": 18.189739227294922, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.1875, "epoch": 0.141741128414603, "gen_logits_max": 16.065462112426758, "gen_logits_mean": -5.018923759460449, "gen_logits_min": -17.589374542236328, "gen_logits_std": 2.5575411319732666, "gen_loss": 0.40966081619262695, "grad_norm": 0.8707947070911994, "learning_rate": 2.8285714285714287e-05, "loss": 0.3909, "mean_copy_accuracy": 0.9829818606376648, "mean_gen_accuracy": 0.8463198989629745, "mean_token_accuracy": 0.8787380307912827, "num_tokens": 187554542.0, "sample_num_tokens": 8226.5, "step": 694, "total_num_tokens": 187587448.0, "z_loss": 0.011344842612743378 }, { "copy_logits_max": 13.363862991333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.5625, "epoch": 0.14194536635179986, "gen_logits_max": 15.77534294128418, "gen_logits_mean": -5.736852645874023, "gen_logits_min": -17.92429542541504, "gen_logits_std": 2.4577324390411377, "gen_loss": 0.3862249255180359, "grad_norm": 0.7391588170703303, "learning_rate": 2.8326530612244897e-05, "loss": 0.3943, "mean_copy_accuracy": 0.9842478632926941, "mean_gen_accuracy": 0.8477488458156586, "mean_token_accuracy": 0.8787999004125595, "num_tokens": 187815388.0, "sample_num_tokens": 7227.5, "step": 695, "total_num_tokens": 187844298.0, "z_loss": 0.007734227925539017 }, { "copy_logits_max": 15.356585502624512, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.3125, "epoch": 0.14214960428899667, "gen_logits_max": 15.585183143615723, "gen_logits_mean": -5.727774143218994, "gen_logits_min": -17.780414581298828, "gen_logits_std": 2.4848835468292236, "gen_loss": 0.3448103368282318, "grad_norm": 0.6523695587649124, "learning_rate": 2.8367346938775514e-05, "loss": 0.3646, "mean_copy_accuracy": 0.9868722707033157, "mean_gen_accuracy": 0.8547375500202179, "mean_token_accuracy": 0.8865072131156921, "num_tokens": 188068643.0, "sample_num_tokens": 9315.75, "step": 696, "total_num_tokens": 188105906.0, "z_loss": 0.007848238572478294 }, { "copy_logits_max": 15.334000587463379, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.8125, "epoch": 0.14235384222619352, "gen_logits_max": 14.76919174194336, "gen_logits_mean": -5.6247758865356445, "gen_logits_min": -17.93048667907715, "gen_logits_std": 2.539663314819336, "gen_loss": 0.3335752487182617, "grad_norm": 0.7066593141839446, "learning_rate": 2.8408163265306124e-05, "loss": 0.3811, "mean_copy_accuracy": 0.9849580824375153, "mean_gen_accuracy": 0.8497328907251358, "mean_token_accuracy": 0.8847085386514664, "num_tokens": 188324155.0, "sample_num_tokens": 7462.75, "step": 697, "total_num_tokens": 188354006.0, "z_loss": 0.008453055284917355 }, { "copy_logits_max": 15.264296531677246, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.1875, "epoch": 0.14255808016339036, "gen_logits_max": 15.565275192260742, "gen_logits_mean": -5.709140777587891, "gen_logits_min": -17.763362884521484, "gen_logits_std": 2.4746475219726562, "gen_loss": 0.4417448043823242, "grad_norm": 0.6953365944539633, "learning_rate": 2.8448979591836737e-05, "loss": 0.4046, "mean_copy_accuracy": 0.9844974279403687, "mean_gen_accuracy": 0.8427072912454605, "mean_token_accuracy": 0.8736769258975983, "num_tokens": 188574383.0, "sample_num_tokens": 7915.75, "step": 698, "total_num_tokens": 188606046.0, "z_loss": 0.007766270078718662 }, { "copy_logits_max": 18.000885009765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.1875, "epoch": 0.14276231810058718, "gen_logits_max": 15.965974807739258, "gen_logits_mean": -5.707681179046631, "gen_logits_min": -18.07785415649414, "gen_logits_std": 2.535702705383301, "gen_loss": 0.39780908823013306, "grad_norm": 0.5641236471688774, "learning_rate": 2.8489795918367347e-05, "loss": 0.3938, "mean_copy_accuracy": 0.9905771613121033, "mean_gen_accuracy": 0.8405992239713669, "mean_token_accuracy": 0.877159059047699, "num_tokens": 188848800.0, "sample_num_tokens": 9209.5, "step": 699, "total_num_tokens": 188885638.0, "z_loss": 0.010633053258061409 }, { "copy_logits_max": 14.121492385864258, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.5625, "epoch": 0.14296655603778402, "gen_logits_max": 14.654867172241211, "gen_logits_mean": -5.183169841766357, "gen_logits_min": -17.589763641357422, "gen_logits_std": 2.5702614784240723, "gen_loss": 0.39555609226226807, "grad_norm": 0.853486279939884, "learning_rate": 2.8530612244897957e-05, "loss": 0.378, "mean_copy_accuracy": 0.9847688674926758, "mean_gen_accuracy": 0.8466815799474716, "mean_token_accuracy": 0.884423166513443, "num_tokens": 189126650.0, "sample_num_tokens": 8514.0, "step": 700, "total_num_tokens": 189160706.0, "z_loss": 0.009573299437761307 }, { "copy_logits_max": 17.526458740234375, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.875, "epoch": 0.14317079397498086, "gen_logits_max": 15.778398513793945, "gen_logits_mean": -5.667961120605469, "gen_logits_min": -17.9039363861084, "gen_logits_std": 2.498361825942993, "gen_loss": 0.38636574149131775, "grad_norm": 1.0457710465234396, "learning_rate": 2.857142857142857e-05, "loss": 0.3727, "mean_copy_accuracy": 0.9868091493844986, "mean_gen_accuracy": 0.8519369065761566, "mean_token_accuracy": 0.8833914548158646, "num_tokens": 189399494.0, "sample_num_tokens": 7821.0, "step": 701, "total_num_tokens": 189430778.0, "z_loss": 0.0092413779348135 }, { "copy_logits_max": 16.10474395751953, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.625, "epoch": 0.14337503191217768, "gen_logits_max": 16.01314926147461, "gen_logits_mean": -5.803606033325195, "gen_logits_min": -17.910472869873047, "gen_logits_std": 2.4760303497314453, "gen_loss": 0.3496094048023224, "grad_norm": 0.7961717152922999, "learning_rate": 2.8612244897959184e-05, "loss": 0.3833, "mean_copy_accuracy": 0.9807591438293457, "mean_gen_accuracy": 0.8540282547473907, "mean_token_accuracy": 0.8818541318178177, "num_tokens": 189657109.0, "sample_num_tokens": 9409.75, "step": 702, "total_num_tokens": 189694748.0, "z_loss": 0.008934523910284042 }, { "copy_logits_max": 17.735326766967773, "copy_logits_min": -750000000.0, "copy_num_tokens": 758.1875, "epoch": 0.14357926984937452, "gen_logits_max": 14.530332565307617, "gen_logits_mean": -5.015903949737549, "gen_logits_min": -17.964887619018555, "gen_logits_std": 2.5796072483062744, "gen_loss": 0.35981470346450806, "grad_norm": 0.6670693888682159, "learning_rate": 2.8653061224489798e-05, "loss": 0.4246, "mean_copy_accuracy": 0.986241340637207, "mean_gen_accuracy": 0.8301776796579361, "mean_token_accuracy": 0.8702303171157837, "num_tokens": 189944502.0, "sample_num_tokens": 10244.5, "step": 703, "total_num_tokens": 189985480.0, "z_loss": 0.01275806874036789 }, { "copy_logits_max": 16.976375579833984, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.625, "epoch": 0.14378350778657137, "gen_logits_max": 15.321575164794922, "gen_logits_mean": -5.754107475280762, "gen_logits_min": -17.94664192199707, "gen_logits_std": 2.444680690765381, "gen_loss": 0.3667110204696655, "grad_norm": 0.8258195342137516, "learning_rate": 2.8693877551020408e-05, "loss": 0.3838, "mean_copy_accuracy": 0.9868528097867966, "mean_gen_accuracy": 0.8492494821548462, "mean_token_accuracy": 0.8803428560495377, "num_tokens": 190211422.0, "sample_num_tokens": 7852.0, "step": 704, "total_num_tokens": 190242830.0, "z_loss": 0.008266402408480644 }, { "copy_logits_max": 13.894920349121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.3125, "epoch": 0.14398774572376818, "gen_logits_max": 15.109074592590332, "gen_logits_mean": -5.5769500732421875, "gen_logits_min": -18.041934967041016, "gen_logits_std": 2.4808554649353027, "gen_loss": 0.352491170167923, "grad_norm": 0.6276473263865994, "learning_rate": 2.873469387755102e-05, "loss": 0.3764, "mean_copy_accuracy": 0.9834393858909607, "mean_gen_accuracy": 0.8546376824378967, "mean_token_accuracy": 0.8842318505048752, "num_tokens": 190470344.0, "sample_num_tokens": 7689.5, "step": 705, "total_num_tokens": 190501102.0, "z_loss": 0.0070876628160476685 }, { "copy_logits_max": 15.627239227294922, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.875, "epoch": 0.14419198366096503, "gen_logits_max": 15.702882766723633, "gen_logits_mean": -5.696333408355713, "gen_logits_min": -17.571758270263672, "gen_logits_std": 2.4535515308380127, "gen_loss": 0.33131784200668335, "grad_norm": 0.8465427235843217, "learning_rate": 2.877551020408163e-05, "loss": 0.3749, "mean_copy_accuracy": 0.9833149611949921, "mean_gen_accuracy": 0.8499671071767807, "mean_token_accuracy": 0.8836224675178528, "num_tokens": 190712448.0, "sample_num_tokens": 8511.0, "step": 706, "total_num_tokens": 190746492.0, "z_loss": 0.008796049281954765 }, { "copy_logits_max": 17.692548751831055, "copy_logits_min": -687499904.0, "copy_num_tokens": 507.4375, "epoch": 0.14439622159816187, "gen_logits_max": 15.185001373291016, "gen_logits_mean": -5.71529483795166, "gen_logits_min": -18.158939361572266, "gen_logits_std": 2.5014753341674805, "gen_loss": 0.37114405632019043, "grad_norm": 0.8541952732747103, "learning_rate": 2.8816326530612248e-05, "loss": 0.3972, "mean_copy_accuracy": 0.9864793568849564, "mean_gen_accuracy": 0.8439438492059708, "mean_token_accuracy": 0.8784227967262268, "num_tokens": 190983431.0, "sample_num_tokens": 8225.75, "step": 707, "total_num_tokens": 191016334.0, "z_loss": 0.011368284933269024 }, { "copy_logits_max": 12.205418586730957, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.4375, "epoch": 0.1446004595353587, "gen_logits_max": 15.078149795532227, "gen_logits_mean": -5.776904106140137, "gen_logits_min": -17.472787857055664, "gen_logits_std": 2.4422216415405273, "gen_loss": 0.4110023081302643, "grad_norm": 0.6900162891060555, "learning_rate": 2.8857142857142858e-05, "loss": 0.3913, "mean_copy_accuracy": 0.9837510287761688, "mean_gen_accuracy": 0.8503506928682327, "mean_token_accuracy": 0.8779293894767761, "num_tokens": 191246559.0, "sample_num_tokens": 8821.25, "step": 708, "total_num_tokens": 191281844.0, "z_loss": 0.007505887188017368 }, { "copy_logits_max": 15.374702453613281, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.5625, "epoch": 0.14480469747255553, "gen_logits_max": 15.76295280456543, "gen_logits_mean": -5.8479838371276855, "gen_logits_min": -17.383556365966797, "gen_logits_std": 2.3971898555755615, "gen_loss": 0.4007250964641571, "grad_norm": 0.7696798884604762, "learning_rate": 2.889795918367347e-05, "loss": 0.3922, "mean_copy_accuracy": 0.9830949455499649, "mean_gen_accuracy": 0.8413221389055252, "mean_token_accuracy": 0.8789209425449371, "num_tokens": 191535557.0, "sample_num_tokens": 7620.75, "step": 709, "total_num_tokens": 191566040.0, "z_loss": 0.006216224282979965 }, { "copy_logits_max": 15.38410758972168, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5625, "epoch": 0.14500893540975235, "gen_logits_max": 14.946243286132812, "gen_logits_mean": -5.032548904418945, "gen_logits_min": -17.018115997314453, "gen_logits_std": 2.5246291160583496, "gen_loss": 0.4268474280834198, "grad_norm": 0.7767683245616842, "learning_rate": 2.8938775510204082e-05, "loss": 0.4087, "mean_copy_accuracy": 0.9844357520341873, "mean_gen_accuracy": 0.8446390181779861, "mean_token_accuracy": 0.8732353448867798, "num_tokens": 191797143.0, "sample_num_tokens": 7415.75, "step": 710, "total_num_tokens": 191826806.0, "z_loss": 0.008001301437616348 }, { "copy_logits_max": 13.867897987365723, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.5625, "epoch": 0.1452131733469492, "gen_logits_max": 14.5086669921875, "gen_logits_mean": -5.609915733337402, "gen_logits_min": -17.362686157226562, "gen_logits_std": 2.458967447280884, "gen_loss": 0.39602628350257874, "grad_norm": 0.6747930261283399, "learning_rate": 2.8979591836734695e-05, "loss": 0.388, "mean_copy_accuracy": 0.9866326302289963, "mean_gen_accuracy": 0.8473694175481796, "mean_token_accuracy": 0.8784722089767456, "num_tokens": 192055935.0, "sample_num_tokens": 8955.25, "step": 711, "total_num_tokens": 192091756.0, "z_loss": 0.00777058769017458 }, { "copy_logits_max": 13.048730850219727, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.5625, "epoch": 0.14541741128414604, "gen_logits_max": 14.951324462890625, "gen_logits_mean": -5.827409744262695, "gen_logits_min": -17.594825744628906, "gen_logits_std": 2.4744746685028076, "gen_loss": 0.398809015750885, "grad_norm": 0.6489647111491487, "learning_rate": 2.902040816326531e-05, "loss": 0.3964, "mean_copy_accuracy": 0.9865096360445023, "mean_gen_accuracy": 0.8479888141155243, "mean_token_accuracy": 0.8782937079668045, "num_tokens": 192311349.0, "sample_num_tokens": 7636.75, "step": 712, "total_num_tokens": 192341896.0, "z_loss": 0.006227894686162472 }, { "copy_logits_max": 21.881568908691406, "copy_logits_min": -750000000.0, "copy_num_tokens": 812.25, "epoch": 0.14562164922134285, "gen_logits_max": 14.752008438110352, "gen_logits_mean": -5.305381774902344, "gen_logits_min": -18.208688735961914, "gen_logits_std": 2.6195454597473145, "gen_loss": 0.3289693593978882, "grad_norm": 0.7897686197400514, "learning_rate": 2.906122448979592e-05, "loss": 0.3786, "mean_copy_accuracy": 0.9866189211606979, "mean_gen_accuracy": 0.8463084250688553, "mean_token_accuracy": 0.8865612894296646, "num_tokens": 192578664.0, "sample_num_tokens": 9723.0, "step": 713, "total_num_tokens": 192617556.0, "z_loss": 0.012664645910263062 }, { "copy_logits_max": 14.450430870056152, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.25, "epoch": 0.1458258871585397, "gen_logits_max": 14.84466552734375, "gen_logits_mean": -5.923633575439453, "gen_logits_min": -18.023162841796875, "gen_logits_std": 2.4501304626464844, "gen_loss": 0.37943601608276367, "grad_norm": 0.7557109113437517, "learning_rate": 2.9102040816326532e-05, "loss": 0.3878, "mean_copy_accuracy": 0.9866763651371002, "mean_gen_accuracy": 0.8422108590602875, "mean_token_accuracy": 0.8799334317445755, "num_tokens": 192854082.0, "sample_num_tokens": 8921.0, "step": 714, "total_num_tokens": 192889766.0, "z_loss": 0.007885830476880074 }, { "copy_logits_max": 14.83417797088623, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.4375, "epoch": 0.14603012509573654, "gen_logits_max": 14.299617767333984, "gen_logits_mean": -5.828449249267578, "gen_logits_min": -18.597217559814453, "gen_logits_std": 2.5541000366210938, "gen_loss": 0.40003806352615356, "grad_norm": 0.7887508450847243, "learning_rate": 2.9142857142857142e-05, "loss": 0.3715, "mean_copy_accuracy": 0.984528198838234, "mean_gen_accuracy": 0.8543957471847534, "mean_token_accuracy": 0.8869404345750809, "num_tokens": 193126140.0, "sample_num_tokens": 7276.5, "step": 715, "total_num_tokens": 193155246.0, "z_loss": 0.009261137805879116 }, { "copy_logits_max": 15.914398193359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.5625, "epoch": 0.14623436303293336, "gen_logits_max": 14.890026092529297, "gen_logits_mean": -5.972387790679932, "gen_logits_min": -17.98607063293457, "gen_logits_std": 2.4576940536499023, "gen_loss": 0.34674692153930664, "grad_norm": 0.6368335621321147, "learning_rate": 2.9183673469387756e-05, "loss": 0.3782, "mean_copy_accuracy": 0.9857225865125656, "mean_gen_accuracy": 0.8526705801486969, "mean_token_accuracy": 0.885044202208519, "num_tokens": 193397088.0, "sample_num_tokens": 9232.5, "step": 716, "total_num_tokens": 193434018.0, "z_loss": 0.008290981873869896 }, { "copy_logits_max": 12.290307998657227, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.4375, "epoch": 0.1464386009701302, "gen_logits_max": 14.570211410522461, "gen_logits_mean": -6.0942230224609375, "gen_logits_min": -17.648591995239258, "gen_logits_std": 2.3821909427642822, "gen_loss": 0.3794791102409363, "grad_norm": 0.6824639200001196, "learning_rate": 2.9224489795918366e-05, "loss": 0.3582, "mean_copy_accuracy": 0.985664114356041, "mean_gen_accuracy": 0.8584485352039337, "mean_token_accuracy": 0.8892270922660828, "num_tokens": 193673427.0, "sample_num_tokens": 8095.25, "step": 717, "total_num_tokens": 193705808.0, "z_loss": 0.0058520641177892685 }, { "copy_logits_max": 14.486217498779297, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.375, "epoch": 0.14664283890732704, "gen_logits_max": 15.032693862915039, "gen_logits_mean": -5.374190807342529, "gen_logits_min": -17.806591033935547, "gen_logits_std": 2.556370973587036, "gen_loss": 0.40842026472091675, "grad_norm": 0.7235719711920744, "learning_rate": 2.9265306122448982e-05, "loss": 0.3856, "mean_copy_accuracy": 0.9830880463123322, "mean_gen_accuracy": 0.8478512614965439, "mean_token_accuracy": 0.8792405724525452, "num_tokens": 193936226.0, "sample_num_tokens": 8713.5, "step": 718, "total_num_tokens": 193971080.0, "z_loss": 0.007932608015835285 }, { "copy_logits_max": 13.212440490722656, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.14684707684452386, "gen_logits_max": 15.337900161743164, "gen_logits_mean": -5.775143623352051, "gen_logits_min": -17.928207397460938, "gen_logits_std": 2.4466025829315186, "gen_loss": 0.38595259189605713, "grad_norm": 0.652618821583011, "learning_rate": 2.9306122448979593e-05, "loss": 0.381, "mean_copy_accuracy": 0.9891446232795715, "mean_gen_accuracy": 0.8477886468172073, "mean_token_accuracy": 0.8791125267744064, "num_tokens": 194204414.0, "sample_num_tokens": 7971.5, "step": 719, "total_num_tokens": 194236300.0, "z_loss": 0.006372353062033653 }, { "copy_logits_max": 14.763249397277832, "copy_logits_min": -749999936.0, "copy_num_tokens": 408.0, "epoch": 0.1470513147817207, "gen_logits_max": 14.524222373962402, "gen_logits_mean": -6.238888263702393, "gen_logits_min": -18.281787872314453, "gen_logits_std": 2.461207389831543, "gen_loss": 0.3786078691482544, "grad_norm": 0.6575142434459837, "learning_rate": 2.9346938775510206e-05, "loss": 0.3778, "mean_copy_accuracy": 0.9873177260160446, "mean_gen_accuracy": 0.8512063473463058, "mean_token_accuracy": 0.8829395473003387, "num_tokens": 194484003.0, "sample_num_tokens": 7552.25, "step": 720, "total_num_tokens": 194514212.0, "z_loss": 0.007540179882198572 }, { "copy_logits_max": 15.860872268676758, "copy_logits_min": -749999936.0, "copy_num_tokens": 496.125, "epoch": 0.14725555271891755, "gen_logits_max": 15.078238487243652, "gen_logits_mean": -5.666382312774658, "gen_logits_min": -17.790634155273438, "gen_logits_std": 2.483485698699951, "gen_loss": 0.34577420353889465, "grad_norm": 0.6871478134354224, "learning_rate": 2.9387755102040816e-05, "loss": 0.3865, "mean_copy_accuracy": 0.9843804240226746, "mean_gen_accuracy": 0.8446563333272934, "mean_token_accuracy": 0.8775921314954758, "num_tokens": 194751975.0, "sample_num_tokens": 8636.75, "step": 721, "total_num_tokens": 194786522.0, "z_loss": 0.00793151743710041 }, { "copy_logits_max": 15.101481437683105, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.5625, "epoch": 0.14745979065611436, "gen_logits_max": 14.786101341247559, "gen_logits_mean": -5.7080979347229, "gen_logits_min": -18.03740119934082, "gen_logits_std": 2.487881898880005, "gen_loss": 0.363433837890625, "grad_norm": 0.5720289554459719, "learning_rate": 2.942857142857143e-05, "loss": 0.3936, "mean_copy_accuracy": 0.9868974983692169, "mean_gen_accuracy": 0.844952404499054, "mean_token_accuracy": 0.8799229860305786, "num_tokens": 195021087.0, "sample_num_tokens": 8176.25, "step": 722, "total_num_tokens": 195053792.0, "z_loss": 0.008827589452266693 }, { "copy_logits_max": 16.220294952392578, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.875, "epoch": 0.1476640285933112, "gen_logits_max": 14.840518951416016, "gen_logits_mean": -6.403332710266113, "gen_logits_min": -18.728851318359375, "gen_logits_std": 2.4671754837036133, "gen_loss": 0.4006282389163971, "grad_norm": 0.7678993210547853, "learning_rate": 2.9469387755102043e-05, "loss": 0.3934, "mean_copy_accuracy": 0.986205980181694, "mean_gen_accuracy": 0.8422680050134659, "mean_token_accuracy": 0.8787620961666107, "num_tokens": 195283549.0, "sample_num_tokens": 7649.75, "step": 723, "total_num_tokens": 195314148.0, "z_loss": 0.0075143733993172646 }, { "copy_logits_max": 11.745161056518555, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.8125, "epoch": 0.14786826653050805, "gen_logits_max": 14.503826141357422, "gen_logits_mean": -6.43122673034668, "gen_logits_min": -18.875690460205078, "gen_logits_std": 2.5043883323669434, "gen_loss": 0.37128251791000366, "grad_norm": 0.7518405687129464, "learning_rate": 2.9510204081632653e-05, "loss": 0.3696, "mean_copy_accuracy": 0.9864968508481979, "mean_gen_accuracy": 0.8523394167423248, "mean_token_accuracy": 0.8832344710826874, "num_tokens": 195560847.0, "sample_num_tokens": 8255.75, "step": 724, "total_num_tokens": 195593870.0, "z_loss": 0.0062144482508301735 }, { "copy_logits_max": 14.290081977844238, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.125, "epoch": 0.14807250446770487, "gen_logits_max": 14.68499755859375, "gen_logits_mean": -6.365780830383301, "gen_logits_min": -18.590377807617188, "gen_logits_std": 2.477111339569092, "gen_loss": 0.4226762056350708, "grad_norm": 0.6813035660560766, "learning_rate": 2.9551020408163266e-05, "loss": 0.4191, "mean_copy_accuracy": 0.9830149412155151, "mean_gen_accuracy": 0.838153749704361, "mean_token_accuracy": 0.8696786165237427, "num_tokens": 195822980.0, "sample_num_tokens": 7042.5, "step": 725, "total_num_tokens": 195851150.0, "z_loss": 0.007062011398375034 }, { "copy_logits_max": 13.793795585632324, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.3125, "epoch": 0.1482767424049017, "gen_logits_max": 15.086767196655273, "gen_logits_mean": -6.4756011962890625, "gen_logits_min": -18.67879867553711, "gen_logits_std": 2.473660945892334, "gen_loss": 0.3717389702796936, "grad_norm": 0.6563971975278767, "learning_rate": 2.9591836734693876e-05, "loss": 0.3947, "mean_copy_accuracy": 0.987413227558136, "mean_gen_accuracy": 0.8431923389434814, "mean_token_accuracy": 0.875933513045311, "num_tokens": 196063277.0, "sample_num_tokens": 8137.25, "step": 726, "total_num_tokens": 196095826.0, "z_loss": 0.006669684778898954 }, { "copy_logits_max": 15.572919845581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.4375, "epoch": 0.14848098034209856, "gen_logits_max": 14.398111343383789, "gen_logits_mean": -5.606504917144775, "gen_logits_min": -17.97250747680664, "gen_logits_std": 2.5642101764678955, "gen_loss": 0.4054170250892639, "grad_norm": 1.360905929864632, "learning_rate": 2.963265306122449e-05, "loss": 0.3903, "mean_copy_accuracy": 0.9842564612627029, "mean_gen_accuracy": 0.8492732346057892, "mean_token_accuracy": 0.8775084465742111, "num_tokens": 196302177.0, "sample_num_tokens": 8097.75, "step": 727, "total_num_tokens": 196334568.0, "z_loss": 0.009271085262298584 }, { "copy_logits_max": 13.120800018310547, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.3125, "epoch": 0.14868521827929537, "gen_logits_max": 14.672257423400879, "gen_logits_mean": -5.781257629394531, "gen_logits_min": -17.803348541259766, "gen_logits_std": 2.4981608390808105, "gen_loss": 0.3536292314529419, "grad_norm": 0.8127034792494446, "learning_rate": 2.9673469387755103e-05, "loss": 0.3836, "mean_copy_accuracy": 0.9819436967372894, "mean_gen_accuracy": 0.8543304949998856, "mean_token_accuracy": 0.8821301460266113, "num_tokens": 196573977.0, "sample_num_tokens": 8631.75, "step": 728, "total_num_tokens": 196608504.0, "z_loss": 0.00694326451048255 }, { "copy_logits_max": 12.3152437210083, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.4375, "epoch": 0.14888945621649222, "gen_logits_max": 15.199724197387695, "gen_logits_mean": -5.801286220550537, "gen_logits_min": -17.93057632446289, "gen_logits_std": 2.4867911338806152, "gen_loss": 0.42988520860671997, "grad_norm": 0.8360692226975698, "learning_rate": 2.9714285714285717e-05, "loss": 0.4022, "mean_copy_accuracy": 0.9892479032278061, "mean_gen_accuracy": 0.8392055481672287, "mean_token_accuracy": 0.8751060962677002, "num_tokens": 196871268.0, "sample_num_tokens": 7936.0, "step": 729, "total_num_tokens": 196903012.0, "z_loss": 0.0067603555507957935 }, { "copy_logits_max": 17.723501205444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.125, "epoch": 0.14909369415368906, "gen_logits_max": 15.7027587890625, "gen_logits_mean": -6.40434455871582, "gen_logits_min": -18.75411605834961, "gen_logits_std": 2.515974521636963, "gen_loss": 0.3540683686733246, "grad_norm": 0.6662205508980418, "learning_rate": 2.9755102040816327e-05, "loss": 0.3699, "mean_copy_accuracy": 0.9886969029903412, "mean_gen_accuracy": 0.8470710664987564, "mean_token_accuracy": 0.8856833130121231, "num_tokens": 197151614.0, "sample_num_tokens": 8313.5, "step": 730, "total_num_tokens": 197184868.0, "z_loss": 0.008387178182601929 }, { "copy_logits_max": 13.18337345123291, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.0625, "epoch": 0.14929793209088588, "gen_logits_max": 14.677773475646973, "gen_logits_mean": -6.540205001831055, "gen_logits_min": -18.772371292114258, "gen_logits_std": 2.4374494552612305, "gen_loss": 0.34633007645606995, "grad_norm": 0.9350517275592309, "learning_rate": 2.979591836734694e-05, "loss": 0.3744, "mean_copy_accuracy": 0.981505960226059, "mean_gen_accuracy": 0.8559421598911285, "mean_token_accuracy": 0.8849324434995651, "num_tokens": 197431284.0, "sample_num_tokens": 9166.0, "step": 731, "total_num_tokens": 197467948.0, "z_loss": 0.006490145809948444 }, { "copy_logits_max": 12.568863868713379, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.9375, "epoch": 0.14950217002808272, "gen_logits_max": 14.006502151489258, "gen_logits_mean": -6.371945381164551, "gen_logits_min": -19.40958023071289, "gen_logits_std": 2.587756395339966, "gen_loss": 0.3468584418296814, "grad_norm": 0.9263515289266168, "learning_rate": 2.983673469387755e-05, "loss": 0.3878, "mean_copy_accuracy": 0.988629549741745, "mean_gen_accuracy": 0.8469593524932861, "mean_token_accuracy": 0.8808435201644897, "num_tokens": 197722004.0, "sample_num_tokens": 8886.5, "step": 732, "total_num_tokens": 197757550.0, "z_loss": 0.007965131662786007 }, { "copy_logits_max": 14.144147872924805, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.9375, "epoch": 0.14970640796527956, "gen_logits_max": 14.872373580932617, "gen_logits_mean": -6.243548393249512, "gen_logits_min": -18.20482635498047, "gen_logits_std": 2.518911838531494, "gen_loss": 0.40228092670440674, "grad_norm": 0.6417544367026388, "learning_rate": 2.9877551020408164e-05, "loss": 0.3922, "mean_copy_accuracy": 0.9882513135671616, "mean_gen_accuracy": 0.8496626317501068, "mean_token_accuracy": 0.878398209810257, "num_tokens": 197979149.0, "sample_num_tokens": 7631.25, "step": 733, "total_num_tokens": 198009674.0, "z_loss": 0.006639169994741678 }, { "copy_logits_max": 15.523792266845703, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.9375, "epoch": 0.14991064590247638, "gen_logits_max": 14.869302749633789, "gen_logits_mean": -6.24679708480835, "gen_logits_min": -18.80707359313965, "gen_logits_std": 2.5144379138946533, "gen_loss": 0.3886226713657379, "grad_norm": 0.836418549028554, "learning_rate": 2.9918367346938777e-05, "loss": 0.3828, "mean_copy_accuracy": 0.9834315180778503, "mean_gen_accuracy": 0.849275678396225, "mean_token_accuracy": 0.8797389715909958, "num_tokens": 198255580.0, "sample_num_tokens": 8279.5, "step": 734, "total_num_tokens": 198288698.0, "z_loss": 0.008470884524285793 }, { "copy_logits_max": 17.2423038482666, "copy_logits_min": -750000000.0, "copy_num_tokens": 680.1875, "epoch": 0.15011488383967322, "gen_logits_max": 14.302947044372559, "gen_logits_mean": -5.819710731506348, "gen_logits_min": -18.453594207763672, "gen_logits_std": 2.5431313514709473, "gen_loss": 0.3310525417327881, "grad_norm": 0.6824195861787598, "learning_rate": 2.995918367346939e-05, "loss": 0.3788, "mean_copy_accuracy": 0.9826992452144623, "mean_gen_accuracy": 0.8544470220804214, "mean_token_accuracy": 0.8829825669527054, "num_tokens": 198519674.0, "sample_num_tokens": 10355.0, "step": 735, "total_num_tokens": 198561094.0, "z_loss": 0.00886395201086998 }, { "copy_logits_max": 18.610824584960938, "copy_logits_min": -687500032.0, "copy_num_tokens": 672.9375, "epoch": 0.15031912177687004, "gen_logits_max": 15.340940475463867, "gen_logits_mean": -6.220104217529297, "gen_logits_min": -18.916332244873047, "gen_logits_std": 2.5497870445251465, "gen_loss": 0.35783061385154724, "grad_norm": 0.935176486397014, "learning_rate": 3e-05, "loss": 0.3776, "mean_copy_accuracy": 0.9818175882101059, "mean_gen_accuracy": 0.8470329493284225, "mean_token_accuracy": 0.8837122917175293, "num_tokens": 198788265.0, "sample_num_tokens": 8564.75, "step": 736, "total_num_tokens": 198822524.0, "z_loss": 0.010606842115521431 }, { "copy_logits_max": 15.58518123626709, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.6875, "epoch": 0.15052335971406688, "gen_logits_max": 14.68766975402832, "gen_logits_mean": -6.138914108276367, "gen_logits_min": -18.235443115234375, "gen_logits_std": 2.483973979949951, "gen_loss": 0.3555631637573242, "grad_norm": 0.9800379473098927, "learning_rate": 2.9998736842105265e-05, "loss": 0.3646, "mean_copy_accuracy": 0.9874521195888519, "mean_gen_accuracy": 0.8536997884511948, "mean_token_accuracy": 0.8887443691492081, "num_tokens": 199096281.0, "sample_num_tokens": 8633.75, "step": 737, "total_num_tokens": 199130816.0, "z_loss": 0.009813583455979824 }, { "copy_logits_max": 10.54677963256836, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.5625, "epoch": 0.15072759765126373, "gen_logits_max": 13.926936149597168, "gen_logits_mean": -7.063738822937012, "gen_logits_min": -18.48054313659668, "gen_logits_std": 2.406198501586914, "gen_loss": 0.4044690430164337, "grad_norm": 0.8592808570227589, "learning_rate": 2.9997473684210526e-05, "loss": 0.3967, "mean_copy_accuracy": 0.9867976754903793, "mean_gen_accuracy": 0.8422947227954865, "mean_token_accuracy": 0.8770784586668015, "num_tokens": 199376964.0, "sample_num_tokens": 7463.0, "step": 738, "total_num_tokens": 199406816.0, "z_loss": 0.004745436832308769 }, { "copy_logits_max": 9.625487327575684, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.8125, "epoch": 0.15093183558846054, "gen_logits_max": 14.193989753723145, "gen_logits_mean": -6.187805652618408, "gen_logits_min": -18.168506622314453, "gen_logits_std": 2.4414663314819336, "gen_loss": 0.39090102910995483, "grad_norm": 0.7579583385392091, "learning_rate": 2.999621052631579e-05, "loss": 0.4043, "mean_copy_accuracy": 0.9799262583255768, "mean_gen_accuracy": 0.8483767211437225, "mean_token_accuracy": 0.8727231472730637, "num_tokens": 199620318.0, "sample_num_tokens": 8517.5, "step": 739, "total_num_tokens": 199654388.0, "z_loss": 0.005257612094283104 }, { "copy_logits_max": 12.977861404418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.8125, "epoch": 0.1511360735256574, "gen_logits_max": 13.882833480834961, "gen_logits_mean": -6.523890018463135, "gen_logits_min": -18.713661193847656, "gen_logits_std": 2.4764575958251953, "gen_loss": 0.34811148047447205, "grad_norm": 0.933338233235101, "learning_rate": 2.999494736842105e-05, "loss": 0.3792, "mean_copy_accuracy": 0.9826704114675522, "mean_gen_accuracy": 0.8477689027786255, "mean_token_accuracy": 0.8826350569725037, "num_tokens": 199895642.0, "sample_num_tokens": 9640.0, "step": 740, "total_num_tokens": 199934202.0, "z_loss": 0.00789998471736908 }, { "copy_logits_max": 16.125572204589844, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.5, "epoch": 0.15134031146285423, "gen_logits_max": 14.467103958129883, "gen_logits_mean": -6.069939613342285, "gen_logits_min": -17.86351776123047, "gen_logits_std": 2.478776216506958, "gen_loss": 0.41234534978866577, "grad_norm": 0.7404071088049817, "learning_rate": 2.999368421052632e-05, "loss": 0.4158, "mean_copy_accuracy": 0.9862475246191025, "mean_gen_accuracy": 0.8356141299009323, "mean_token_accuracy": 0.8711580485105515, "num_tokens": 200153833.0, "sample_num_tokens": 8376.25, "step": 741, "total_num_tokens": 200187338.0, "z_loss": 0.009029809385538101 }, { "copy_logits_max": 12.817435264587402, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.8125, "epoch": 0.15154454940005105, "gen_logits_max": 14.194707870483398, "gen_logits_mean": -6.686769008636475, "gen_logits_min": -18.395427703857422, "gen_logits_std": 2.4446208477020264, "gen_loss": 0.39551621675491333, "grad_norm": 0.8110348389766823, "learning_rate": 2.999242105263158e-05, "loss": 0.3931, "mean_copy_accuracy": 0.9871925115585327, "mean_gen_accuracy": 0.8441267907619476, "mean_token_accuracy": 0.877932071685791, "num_tokens": 200426290.0, "sample_num_tokens": 8389.0, "step": 742, "total_num_tokens": 200459846.0, "z_loss": 0.006793534848839045 }, { "copy_logits_max": 13.6410493850708, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.9375, "epoch": 0.1517487873372479, "gen_logits_max": 14.405536651611328, "gen_logits_mean": -5.794425010681152, "gen_logits_min": -17.757186889648438, "gen_logits_std": 2.494443416595459, "gen_loss": 0.3349218964576721, "grad_norm": 0.8134872745072199, "learning_rate": 2.9991157894736844e-05, "loss": 0.4029, "mean_copy_accuracy": 0.9854766279459, "mean_gen_accuracy": 0.8411422669887543, "mean_token_accuracy": 0.8778658509254456, "num_tokens": 200683873.0, "sample_num_tokens": 8567.25, "step": 743, "total_num_tokens": 200718142.0, "z_loss": 0.008423741906881332 }, { "copy_logits_max": 14.474480628967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.75, "epoch": 0.15195302527444474, "gen_logits_max": 14.933662414550781, "gen_logits_mean": -6.533925533294678, "gen_logits_min": -18.216350555419922, "gen_logits_std": 2.4309675693511963, "gen_loss": 0.38277480006217957, "grad_norm": 0.8781165836466199, "learning_rate": 2.9989894736842105e-05, "loss": 0.3784, "mean_copy_accuracy": 0.9809329360723495, "mean_gen_accuracy": 0.854962483048439, "mean_token_accuracy": 0.8840741813182831, "num_tokens": 200973724.0, "sample_num_tokens": 7751.0, "step": 744, "total_num_tokens": 201004728.0, "z_loss": 0.006551453843712807 }, { "copy_logits_max": 12.80266284942627, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.75, "epoch": 0.15215726321164155, "gen_logits_max": 14.055909156799316, "gen_logits_mean": -6.275391578674316, "gen_logits_min": -18.214160919189453, "gen_logits_std": 2.452955484390259, "gen_loss": 0.3629028797149658, "grad_norm": 0.6543884347405668, "learning_rate": 2.998863157894737e-05, "loss": 0.3913, "mean_copy_accuracy": 0.9846724718809128, "mean_gen_accuracy": 0.8463518619537354, "mean_token_accuracy": 0.8759759813547134, "num_tokens": 201215759.0, "sample_num_tokens": 9499.75, "step": 745, "total_num_tokens": 201253758.0, "z_loss": 0.006791618652641773 }, { "copy_logits_max": 13.810026168823242, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5625, "epoch": 0.1523615011488384, "gen_logits_max": 14.360469818115234, "gen_logits_mean": -5.517231464385986, "gen_logits_min": -17.658143997192383, "gen_logits_std": 2.5536599159240723, "gen_loss": 0.384105920791626, "grad_norm": 0.789061889039478, "learning_rate": 2.998736842105263e-05, "loss": 0.4071, "mean_copy_accuracy": 0.9842908680438995, "mean_gen_accuracy": 0.8429987728595734, "mean_token_accuracy": 0.8729261159896851, "num_tokens": 201469333.0, "sample_num_tokens": 8781.25, "step": 746, "total_num_tokens": 201504458.0, "z_loss": 0.008723721839487553 }, { "copy_logits_max": 12.207586288452148, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.75, "epoch": 0.15256573908603524, "gen_logits_max": 14.891642570495605, "gen_logits_mean": -6.635359764099121, "gen_logits_min": -18.02492904663086, "gen_logits_std": 2.3884429931640625, "gen_loss": 0.38353753089904785, "grad_norm": 0.5519823411885083, "learning_rate": 2.9986105263157895e-05, "loss": 0.3774, "mean_copy_accuracy": 0.9889048635959625, "mean_gen_accuracy": 0.8490196466445923, "mean_token_accuracy": 0.8802215754985809, "num_tokens": 201727466.0, "sample_num_tokens": 7594.0, "step": 747, "total_num_tokens": 201757842.0, "z_loss": 0.005968652665615082 }, { "copy_logits_max": 10.89742660522461, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.4375, "epoch": 0.15276997702323206, "gen_logits_max": 14.247085571289062, "gen_logits_mean": -6.221645832061768, "gen_logits_min": -17.79000473022461, "gen_logits_std": 2.480257511138916, "gen_loss": 0.39001041650772095, "grad_norm": 0.7642156163601392, "learning_rate": 2.9984842105263156e-05, "loss": 0.3715, "mean_copy_accuracy": 0.9854916334152222, "mean_gen_accuracy": 0.8521615713834763, "mean_token_accuracy": 0.8865432590246201, "num_tokens": 202008572.0, "sample_num_tokens": 7481.0, "step": 748, "total_num_tokens": 202038496.0, "z_loss": 0.005497965961694717 }, { "copy_logits_max": 13.084650993347168, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.375, "epoch": 0.1529742149604289, "gen_logits_max": 14.481523513793945, "gen_logits_mean": -5.995595932006836, "gen_logits_min": -18.11502456665039, "gen_logits_std": 2.5012965202331543, "gen_loss": 0.37006649374961853, "grad_norm": 0.6788442319763711, "learning_rate": 2.9983578947368423e-05, "loss": 0.3875, "mean_copy_accuracy": 0.9860087037086487, "mean_gen_accuracy": 0.8444178551435471, "mean_token_accuracy": 0.8823930621147156, "num_tokens": 202265062.0, "sample_num_tokens": 7920.0, "step": 749, "total_num_tokens": 202296742.0, "z_loss": 0.007483664900064468 }, { "copy_logits_max": 12.493337631225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.875, "epoch": 0.15317845289762574, "gen_logits_max": 14.75352668762207, "gen_logits_mean": -6.304386138916016, "gen_logits_min": -18.195751190185547, "gen_logits_std": 2.499077558517456, "gen_loss": 0.380426824092865, "grad_norm": 0.5559304255503171, "learning_rate": 2.9982315789473688e-05, "loss": 0.3604, "mean_copy_accuracy": 0.9893863350152969, "mean_gen_accuracy": 0.8554991781711578, "mean_token_accuracy": 0.8871216326951981, "num_tokens": 202556635.0, "sample_num_tokens": 7100.25, "step": 750, "total_num_tokens": 202585036.0, "z_loss": 0.007349785882979631 }, { "copy_logits_max": 15.793248176574707, "copy_logits_min": -625000000.0, "copy_num_tokens": 430.375, "epoch": 0.15338269083482256, "gen_logits_max": 15.173410415649414, "gen_logits_mean": -6.40423583984375, "gen_logits_min": -18.378719329833984, "gen_logits_std": 2.485405445098877, "gen_loss": 0.3900512754917145, "grad_norm": 0.888927652363057, "learning_rate": 2.998105263157895e-05, "loss": 0.3953, "mean_copy_accuracy": 0.9855406433343887, "mean_gen_accuracy": 0.8428191095590591, "mean_token_accuracy": 0.8787208050489426, "num_tokens": 202803824.0, "sample_num_tokens": 8270.0, "step": 751, "total_num_tokens": 202836904.0, "z_loss": 0.008584649302065372 }, { "copy_logits_max": 10.168521881103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.6875, "epoch": 0.1535869287720194, "gen_logits_max": 14.030702590942383, "gen_logits_mean": -6.114018440246582, "gen_logits_min": -18.795333862304688, "gen_logits_std": 2.53293514251709, "gen_loss": 0.345331609249115, "grad_norm": 0.6623557250443259, "learning_rate": 2.9979789473684213e-05, "loss": 0.3603, "mean_copy_accuracy": 0.9867283403873444, "mean_gen_accuracy": 0.8564053326845169, "mean_token_accuracy": 0.8866034001111984, "num_tokens": 203083065.0, "sample_num_tokens": 9567.25, "step": 752, "total_num_tokens": 203121334.0, "z_loss": 0.007101144175976515 }, { "copy_logits_max": 12.851492881774902, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.75, "epoch": 0.15379116670921625, "gen_logits_max": 13.323898315429688, "gen_logits_mean": -6.262271881103516, "gen_logits_min": -18.301097869873047, "gen_logits_std": 2.5324082374572754, "gen_loss": 0.34062904119491577, "grad_norm": 0.6347619062573059, "learning_rate": 2.9978526315789474e-05, "loss": 0.3722, "mean_copy_accuracy": 0.9871968328952789, "mean_gen_accuracy": 0.848264068365097, "mean_token_accuracy": 0.8831964284181595, "num_tokens": 203348827.0, "sample_num_tokens": 8915.25, "step": 753, "total_num_tokens": 203384488.0, "z_loss": 0.00829055905342102 }, { "copy_logits_max": 11.945672035217285, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.3125, "epoch": 0.15399540464641306, "gen_logits_max": 14.018957138061523, "gen_logits_mean": -6.449487686157227, "gen_logits_min": -18.506755828857422, "gen_logits_std": 2.4254000186920166, "gen_loss": 0.3699526786804199, "grad_norm": 0.6348434066612914, "learning_rate": 2.9977263157894738e-05, "loss": 0.3771, "mean_copy_accuracy": 0.9854491800069809, "mean_gen_accuracy": 0.8459570109844208, "mean_token_accuracy": 0.8814588636159897, "num_tokens": 203618893.0, "sample_num_tokens": 8136.75, "step": 754, "total_num_tokens": 203651440.0, "z_loss": 0.005240605678409338 }, { "copy_logits_max": 13.517489433288574, "copy_logits_min": -687499904.0, "copy_num_tokens": 704.5, "epoch": 0.1541996425836099, "gen_logits_max": 14.455448150634766, "gen_logits_mean": -5.577798366546631, "gen_logits_min": -17.771059036254883, "gen_logits_std": 2.5638632774353027, "gen_loss": 0.39054790139198303, "grad_norm": 0.7153064635998813, "learning_rate": 2.9976e-05, "loss": 0.3868, "mean_copy_accuracy": 0.9884648621082306, "mean_gen_accuracy": 0.8390441089868546, "mean_token_accuracy": 0.8783122599124908, "num_tokens": 203897576.0, "sample_num_tokens": 10159.0, "step": 755, "total_num_tokens": 203938212.0, "z_loss": 0.00866265781223774 }, { "copy_logits_max": 10.956001281738281, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.9375, "epoch": 0.15440388052080675, "gen_logits_max": 13.843439102172852, "gen_logits_mean": -5.831903457641602, "gen_logits_min": -17.33177947998047, "gen_logits_std": 2.439685821533203, "gen_loss": 0.3066761791706085, "grad_norm": 0.7298491780421659, "learning_rate": 2.9974736842105263e-05, "loss": 0.3486, "mean_copy_accuracy": 0.9887803792953491, "mean_gen_accuracy": 0.8523176461458206, "mean_token_accuracy": 0.8923352956771851, "num_tokens": 204200253.0, "sample_num_tokens": 9464.75, "step": 756, "total_num_tokens": 204238112.0, "z_loss": 0.007612600922584534 }, { "copy_logits_max": 12.710376739501953, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.375, "epoch": 0.15460811845800357, "gen_logits_max": 14.784414291381836, "gen_logits_mean": -6.6486358642578125, "gen_logits_min": -18.525928497314453, "gen_logits_std": 2.4351937770843506, "gen_loss": 0.3842375874519348, "grad_norm": 0.8095208589563303, "learning_rate": 2.9973473684210528e-05, "loss": 0.3867, "mean_copy_accuracy": 0.9856122434139252, "mean_gen_accuracy": 0.8456393480300903, "mean_token_accuracy": 0.8777434676885605, "num_tokens": 204471857.0, "sample_num_tokens": 8187.25, "step": 757, "total_num_tokens": 204504606.0, "z_loss": 0.005900661461055279 }, { "copy_logits_max": 12.205716133117676, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.25, "epoch": 0.1548123563952004, "gen_logits_max": 13.980704307556152, "gen_logits_mean": -6.198227882385254, "gen_logits_min": -18.6063232421875, "gen_logits_std": 2.5032458305358887, "gen_loss": 0.35341256856918335, "grad_norm": 0.746552480402305, "learning_rate": 2.9972210526315792e-05, "loss": 0.4009, "mean_copy_accuracy": 0.9860825538635254, "mean_gen_accuracy": 0.8409654647111893, "mean_token_accuracy": 0.8737445771694183, "num_tokens": 204732924.0, "sample_num_tokens": 9090.0, "step": 758, "total_num_tokens": 204769284.0, "z_loss": 0.00726001663133502 }, { "copy_logits_max": 11.021656036376953, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.4375, "epoch": 0.15501659433239726, "gen_logits_max": 14.009400367736816, "gen_logits_mean": -6.867982864379883, "gen_logits_min": -18.495223999023438, "gen_logits_std": 2.4335696697235107, "gen_loss": 0.34864091873168945, "grad_norm": 0.7511305164577877, "learning_rate": 2.9970947368421053e-05, "loss": 0.3754, "mean_copy_accuracy": 0.9870499819517136, "mean_gen_accuracy": 0.846378743648529, "mean_token_accuracy": 0.8837742060422897, "num_tokens": 205008757.0, "sample_num_tokens": 7635.75, "step": 759, "total_num_tokens": 205039300.0, "z_loss": 0.005574706010520458 }, { "copy_logits_max": 12.808286666870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.375, "epoch": 0.15522083226959407, "gen_logits_max": 13.872913360595703, "gen_logits_mean": -6.225005149841309, "gen_logits_min": -18.162389755249023, "gen_logits_std": 2.4796392917633057, "gen_loss": 0.330558717250824, "grad_norm": 0.6341319026542721, "learning_rate": 2.9969684210526317e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9892991036176682, "mean_gen_accuracy": 0.856649786233902, "mean_token_accuracy": 0.8958569914102554, "num_tokens": 205309744.0, "sample_num_tokens": 9436.0, "step": 760, "total_num_tokens": 205347488.0, "z_loss": 0.006958776619285345 }, { "copy_logits_max": 13.939545631408691, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.25, "epoch": 0.15542507020679092, "gen_logits_max": 14.168766975402832, "gen_logits_mean": -5.644535064697266, "gen_logits_min": -17.668594360351562, "gen_logits_std": 2.4583208560943604, "gen_loss": 0.3604651689529419, "grad_norm": 0.70751523654714, "learning_rate": 2.9968421052631578e-05, "loss": 0.3719, "mean_copy_accuracy": 0.9882105886936188, "mean_gen_accuracy": 0.84980209171772, "mean_token_accuracy": 0.8832238763570786, "num_tokens": 205582316.0, "sample_num_tokens": 9316.5, "step": 761, "total_num_tokens": 205619582.0, "z_loss": 0.007536616176366806 }, { "copy_logits_max": 13.969423294067383, "copy_logits_min": -687499968.0, "copy_num_tokens": 607.5, "epoch": 0.15562930814398773, "gen_logits_max": 14.42188835144043, "gen_logits_mean": -5.858485221862793, "gen_logits_min": -17.579578399658203, "gen_logits_std": 2.463378429412842, "gen_loss": 0.3365831673145294, "grad_norm": 0.8973034597707527, "learning_rate": 2.9967157894736842e-05, "loss": 0.3662, "mean_copy_accuracy": 0.9878886640071869, "mean_gen_accuracy": 0.8485952168703079, "mean_token_accuracy": 0.8865804672241211, "num_tokens": 205874947.0, "sample_num_tokens": 8346.75, "step": 762, "total_num_tokens": 205908334.0, "z_loss": 0.00790602620691061 }, { "copy_logits_max": 12.80394172668457, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.9375, "epoch": 0.15583354608118458, "gen_logits_max": 13.841029167175293, "gen_logits_mean": -6.2458577156066895, "gen_logits_min": -18.18438720703125, "gen_logits_std": 2.4734013080596924, "gen_loss": 0.33111900091171265, "grad_norm": 0.7191128503744624, "learning_rate": 2.9965894736842107e-05, "loss": 0.3527, "mean_copy_accuracy": 0.9862323552370071, "mean_gen_accuracy": 0.8570964336395264, "mean_token_accuracy": 0.8898022025823593, "num_tokens": 206168017.0, "sample_num_tokens": 9571.25, "step": 763, "total_num_tokens": 206206302.0, "z_loss": 0.007263069972395897 }, { "copy_logits_max": 10.196294784545898, "copy_logits_min": -687500032.0, "copy_num_tokens": 456.4375, "epoch": 0.15603778401838142, "gen_logits_max": 14.04588508605957, "gen_logits_mean": -6.012501239776611, "gen_logits_min": -17.682697296142578, "gen_logits_std": 2.4874143600463867, "gen_loss": 0.40362516045570374, "grad_norm": 0.726035298590977, "learning_rate": 2.9964631578947368e-05, "loss": 0.3835, "mean_copy_accuracy": 0.9893728792667389, "mean_gen_accuracy": 0.842060461640358, "mean_token_accuracy": 0.8837868571281433, "num_tokens": 206456507.0, "sample_num_tokens": 8210.75, "step": 764, "total_num_tokens": 206489350.0, "z_loss": 0.005703752860426903 }, { "copy_logits_max": 13.239636421203613, "copy_logits_min": -687500032.0, "copy_num_tokens": 428.0625, "epoch": 0.15624202195557824, "gen_logits_max": 14.232139587402344, "gen_logits_mean": -6.346619606018066, "gen_logits_min": -18.582653045654297, "gen_logits_std": 2.4540209770202637, "gen_loss": 0.35154786705970764, "grad_norm": 0.821756462185967, "learning_rate": 2.9963368421052635e-05, "loss": 0.3746, "mean_copy_accuracy": 0.9842590391635895, "mean_gen_accuracy": 0.8511759787797928, "mean_token_accuracy": 0.8867157101631165, "num_tokens": 206731011.0, "sample_num_tokens": 7880.25, "step": 765, "total_num_tokens": 206762532.0, "z_loss": 0.0063120829872787 }, { "copy_logits_max": 11.43984603881836, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.4375, "epoch": 0.15644625989277508, "gen_logits_max": 13.957256317138672, "gen_logits_mean": -5.6357421875, "gen_logits_min": -17.740562438964844, "gen_logits_std": 2.4397637844085693, "gen_loss": 0.39267563819885254, "grad_norm": 0.653219444521989, "learning_rate": 2.9962105263157896e-05, "loss": 0.3724, "mean_copy_accuracy": 0.989193320274353, "mean_gen_accuracy": 0.8477436155080795, "mean_token_accuracy": 0.8863647133111954, "num_tokens": 207001571.0, "sample_num_tokens": 9191.75, "step": 766, "total_num_tokens": 207038338.0, "z_loss": 0.006244835909456015 }, { "copy_logits_max": 9.590278625488281, "copy_logits_min": -687500032.0, "copy_num_tokens": 291.125, "epoch": 0.15665049782997192, "gen_logits_max": 13.925614356994629, "gen_logits_mean": -6.752070903778076, "gen_logits_min": -17.912263870239258, "gen_logits_std": 2.3306760787963867, "gen_loss": 0.39197850227355957, "grad_norm": 0.6858826696894553, "learning_rate": 2.996084210526316e-05, "loss": 0.3903, "mean_copy_accuracy": 0.9846499264240265, "mean_gen_accuracy": 0.8440845012664795, "mean_token_accuracy": 0.8811196535825729, "num_tokens": 207286577.0, "sample_num_tokens": 9066.25, "step": 767, "total_num_tokens": 207322842.0, "z_loss": 0.004683178849518299 }, { "copy_logits_max": 11.227829933166504, "copy_logits_min": -625000000.0, "copy_num_tokens": 439.5, "epoch": 0.15685473576716874, "gen_logits_max": 14.410161972045898, "gen_logits_mean": -6.812665939331055, "gen_logits_min": -18.319429397583008, "gen_logits_std": 2.3952431678771973, "gen_loss": 0.3565353751182556, "grad_norm": 0.6581918779825058, "learning_rate": 2.995957894736842e-05, "loss": 0.4006, "mean_copy_accuracy": 0.9866242855787277, "mean_gen_accuracy": 0.8431942462921143, "mean_token_accuracy": 0.8740399926900864, "num_tokens": 207544333.0, "sample_num_tokens": 8526.75, "step": 768, "total_num_tokens": 207578440.0, "z_loss": 0.00573487114161253 }, { "copy_logits_max": 11.56136417388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.0, "epoch": 0.15705897370436558, "gen_logits_max": 14.26594352722168, "gen_logits_mean": -6.809422492980957, "gen_logits_min": -18.604543685913086, "gen_logits_std": 2.4320380687713623, "gen_loss": 0.3551705479621887, "grad_norm": 0.6617588490217886, "learning_rate": 2.9958315789473686e-05, "loss": 0.3963, "mean_copy_accuracy": 0.9866102933883667, "mean_gen_accuracy": 0.842461496591568, "mean_token_accuracy": 0.8752113878726959, "num_tokens": 207797125.0, "sample_num_tokens": 8154.75, "step": 769, "total_num_tokens": 207829744.0, "z_loss": 0.005853539332747459 }, { "copy_logits_max": 9.118206024169922, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.3125, "epoch": 0.15726321164156243, "gen_logits_max": 14.446955680847168, "gen_logits_mean": -6.445201873779297, "gen_logits_min": -18.252105712890625, "gen_logits_std": 2.474921226501465, "gen_loss": 0.3724144697189331, "grad_norm": 0.611670835439785, "learning_rate": 2.9957052631578947e-05, "loss": 0.383, "mean_copy_accuracy": 0.9877214431762695, "mean_gen_accuracy": 0.8462741374969482, "mean_token_accuracy": 0.8790462017059326, "num_tokens": 208050571.0, "sample_num_tokens": 8123.25, "step": 770, "total_num_tokens": 208083064.0, "z_loss": 0.005282978527247906 }, { "copy_logits_max": 12.231740951538086, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.625, "epoch": 0.15746744957875924, "gen_logits_max": 14.11008358001709, "gen_logits_mean": -6.912034034729004, "gen_logits_min": -19.0709171295166, "gen_logits_std": 2.462467908859253, "gen_loss": 0.4046977460384369, "grad_norm": 0.7045456306427803, "learning_rate": 2.995578947368421e-05, "loss": 0.3891, "mean_copy_accuracy": 0.9877091497182846, "mean_gen_accuracy": 0.839890256524086, "mean_token_accuracy": 0.8787093609571457, "num_tokens": 208320048.0, "sample_num_tokens": 8061.0, "step": 771, "total_num_tokens": 208352292.0, "z_loss": 0.005857415962964296 }, { "copy_logits_max": 11.61878776550293, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.6875, "epoch": 0.1576716875159561, "gen_logits_max": 14.102765083312988, "gen_logits_mean": -6.847866058349609, "gen_logits_min": -18.79324722290039, "gen_logits_std": 2.4888176918029785, "gen_loss": 0.3633325695991516, "grad_norm": 0.7020101802082256, "learning_rate": 2.9954526315789472e-05, "loss": 0.358, "mean_copy_accuracy": 0.9850299209356308, "mean_gen_accuracy": 0.8582015186548233, "mean_token_accuracy": 0.8888727128505707, "num_tokens": 208590770.0, "sample_num_tokens": 8896.0, "step": 772, "total_num_tokens": 208626354.0, "z_loss": 0.005830750335007906 }, { "copy_logits_max": 8.630301475524902, "copy_logits_min": -687500032.0, "copy_num_tokens": 532.875, "epoch": 0.15787592545315293, "gen_logits_max": 13.309430122375488, "gen_logits_mean": -6.966692924499512, "gen_logits_min": -18.830787658691406, "gen_logits_std": 2.458533763885498, "gen_loss": 0.3496670424938202, "grad_norm": 0.6558523475966385, "learning_rate": 2.995326315789474e-05, "loss": 0.383, "mean_copy_accuracy": 0.9876892268657684, "mean_gen_accuracy": 0.8486060500144958, "mean_token_accuracy": 0.8820066154003143, "num_tokens": 208870610.0, "sample_num_tokens": 9811.0, "step": 773, "total_num_tokens": 208909854.0, "z_loss": 0.004681035876274109 }, { "copy_logits_max": 11.701250076293945, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.6875, "epoch": 0.15808016339034975, "gen_logits_max": 13.288858413696289, "gen_logits_mean": -6.05666446685791, "gen_logits_min": -17.953323364257812, "gen_logits_std": 2.4910454750061035, "gen_loss": 0.3420471251010895, "grad_norm": 0.7808643121115308, "learning_rate": 2.9952e-05, "loss": 0.3833, "mean_copy_accuracy": 0.9847172945737839, "mean_gen_accuracy": 0.8471044898033142, "mean_token_accuracy": 0.8816366940736771, "num_tokens": 209134231.0, "sample_num_tokens": 8956.75, "step": 774, "total_num_tokens": 209170058.0, "z_loss": 0.006623115390539169 }, { "copy_logits_max": 11.099377632141113, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.25, "epoch": 0.1582844013275466, "gen_logits_max": 14.030675888061523, "gen_logits_mean": -6.520918846130371, "gen_logits_min": -18.405078887939453, "gen_logits_std": 2.4453630447387695, "gen_loss": 0.3767417073249817, "grad_norm": 0.6251220761801287, "learning_rate": 2.9950736842105265e-05, "loss": 0.3459, "mean_copy_accuracy": 0.9869165420532227, "mean_gen_accuracy": 0.8541143238544464, "mean_token_accuracy": 0.8928084671497345, "num_tokens": 209423844.0, "sample_num_tokens": 8325.0, "step": 775, "total_num_tokens": 209457144.0, "z_loss": 0.005790120456367731 }, { "copy_logits_max": 9.223716735839844, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.4375, "epoch": 0.15848863926474344, "gen_logits_max": 13.831310272216797, "gen_logits_mean": -6.639402389526367, "gen_logits_min": -18.13799285888672, "gen_logits_std": 2.413478374481201, "gen_loss": 0.40136462450027466, "grad_norm": 0.7109289133148241, "learning_rate": 2.9949473684210526e-05, "loss": 0.3712, "mean_copy_accuracy": 0.9887650310993195, "mean_gen_accuracy": 0.8447344750165939, "mean_token_accuracy": 0.8834945261478424, "num_tokens": 209702791.0, "sample_num_tokens": 7687.75, "step": 776, "total_num_tokens": 209733542.0, "z_loss": 0.004658978432416916 }, { "copy_logits_max": 12.050527572631836, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.5625, "epoch": 0.15869287720194025, "gen_logits_max": 13.873468399047852, "gen_logits_mean": -6.76035737991333, "gen_logits_min": -18.742603302001953, "gen_logits_std": 2.4684839248657227, "gen_loss": 0.34200942516326904, "grad_norm": 0.7066134357184058, "learning_rate": 2.994821052631579e-05, "loss": 0.3757, "mean_copy_accuracy": 0.9875846952199936, "mean_gen_accuracy": 0.8447021842002869, "mean_token_accuracy": 0.8834180235862732, "num_tokens": 209979299.0, "sample_num_tokens": 8947.25, "step": 777, "total_num_tokens": 210015088.0, "z_loss": 0.005935090593993664 }, { "copy_logits_max": 8.845234870910645, "copy_logits_min": -687500032.0, "copy_num_tokens": 276.1875, "epoch": 0.1588971151391371, "gen_logits_max": 14.391355514526367, "gen_logits_mean": -6.994311809539795, "gen_logits_min": -18.166051864624023, "gen_logits_std": 2.3638176918029785, "gen_loss": 0.4260876178741455, "grad_norm": 0.5914655380123618, "learning_rate": 2.9946947368421054e-05, "loss": 0.3963, "mean_copy_accuracy": 0.986687183380127, "mean_gen_accuracy": 0.8485529273748398, "mean_token_accuracy": 0.8740047961473465, "num_tokens": 210212533.0, "sample_num_tokens": 6660.25, "step": 778, "total_num_tokens": 210239174.0, "z_loss": 0.003964412026107311 }, { "copy_logits_max": 8.303472518920898, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 0.15910135307633394, "gen_logits_max": 13.291247367858887, "gen_logits_mean": -6.828243255615234, "gen_logits_min": -18.335521697998047, "gen_logits_std": 2.409304141998291, "gen_loss": 0.3423060178756714, "grad_norm": 0.8324655931369419, "learning_rate": 2.9945684210526315e-05, "loss": 0.3534, "mean_copy_accuracy": 0.987036794424057, "mean_gen_accuracy": 0.8547795712947845, "mean_token_accuracy": 0.8882866948843002, "num_tokens": 210481796.0, "sample_num_tokens": 8489.0, "step": 779, "total_num_tokens": 210515752.0, "z_loss": 0.004618038423359394 }, { "copy_logits_max": 9.761674880981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.6875, "epoch": 0.15930559101353076, "gen_logits_max": 13.883082389831543, "gen_logits_mean": -7.071932315826416, "gen_logits_min": -18.783082962036133, "gen_logits_std": 2.4565722942352295, "gen_loss": 0.37715381383895874, "grad_norm": 0.8299363360125062, "learning_rate": 2.994442105263158e-05, "loss": 0.3868, "mean_copy_accuracy": 0.9812700003385544, "mean_gen_accuracy": 0.8470335602760315, "mean_token_accuracy": 0.8801231533288956, "num_tokens": 210749787.0, "sample_num_tokens": 7720.25, "step": 780, "total_num_tokens": 210780668.0, "z_loss": 0.004114317707717419 }, { "copy_logits_max": 9.742008209228516, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.5, "epoch": 0.1595098289507276, "gen_logits_max": 14.4713716506958, "gen_logits_mean": -6.78459358215332, "gen_logits_min": -18.36998748779297, "gen_logits_std": 2.4214186668395996, "gen_loss": 0.3935058116912842, "grad_norm": 0.5519767083685349, "learning_rate": 2.9943157894736844e-05, "loss": 0.3846, "mean_copy_accuracy": 0.990854874253273, "mean_gen_accuracy": 0.843381941318512, "mean_token_accuracy": 0.8780727386474609, "num_tokens": 211002418.0, "sample_num_tokens": 9535.0, "step": 781, "total_num_tokens": 211040558.0, "z_loss": 0.004914689343422651 }, { "copy_logits_max": 10.437149047851562, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.6875, "epoch": 0.15971406688792444, "gen_logits_max": 13.54904556274414, "gen_logits_mean": -7.092830657958984, "gen_logits_min": -18.82404327392578, "gen_logits_std": 2.409665584564209, "gen_loss": 0.37579038739204407, "grad_norm": 0.7267634476814477, "learning_rate": 2.9941894736842108e-05, "loss": 0.3864, "mean_copy_accuracy": 0.9869567900896072, "mean_gen_accuracy": 0.8455580472946167, "mean_token_accuracy": 0.8804933726787567, "num_tokens": 211272402.0, "sample_num_tokens": 8227.0, "step": 782, "total_num_tokens": 211305310.0, "z_loss": 0.0049221450462937355 }, { "copy_logits_max": 7.286154747009277, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.25, "epoch": 0.15991830482512126, "gen_logits_max": 13.018902778625488, "gen_logits_mean": -6.281667709350586, "gen_logits_min": -18.194110870361328, "gen_logits_std": 2.4791131019592285, "gen_loss": 0.4272616505622864, "grad_norm": 0.6287473103769963, "learning_rate": 2.994063157894737e-05, "loss": 0.3725, "mean_copy_accuracy": 0.9878683090209961, "mean_gen_accuracy": 0.8470492959022522, "mean_token_accuracy": 0.8835978657007217, "num_tokens": 211551436.0, "sample_num_tokens": 8236.5, "step": 783, "total_num_tokens": 211584382.0, "z_loss": 0.004738601855933666 }, { "copy_logits_max": 11.632556915283203, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.3125, "epoch": 0.1601225427623181, "gen_logits_max": 13.802989959716797, "gen_logits_mean": -7.21844482421875, "gen_logits_min": -19.38897705078125, "gen_logits_std": 2.516275644302368, "gen_loss": 0.4072796702384949, "grad_norm": 0.598975759897759, "learning_rate": 2.9939368421052633e-05, "loss": 0.3742, "mean_copy_accuracy": 0.9885268062353134, "mean_gen_accuracy": 0.8472331166267395, "mean_token_accuracy": 0.884006917476654, "num_tokens": 211829550.0, "sample_num_tokens": 8648.0, "step": 784, "total_num_tokens": 211864142.0, "z_loss": 0.005270885769277811 }, { "copy_logits_max": 10.24235725402832, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.6875, "epoch": 0.16032678069951495, "gen_logits_max": 13.351943016052246, "gen_logits_mean": -6.752596855163574, "gen_logits_min": -18.825096130371094, "gen_logits_std": 2.5280210971832275, "gen_loss": 0.3510691821575165, "grad_norm": 0.658519183906413, "learning_rate": 2.9938105263157894e-05, "loss": 0.3591, "mean_copy_accuracy": 0.9883001446723938, "mean_gen_accuracy": 0.852251723408699, "mean_token_accuracy": 0.8895865827798843, "num_tokens": 212124819.0, "sample_num_tokens": 8933.25, "step": 785, "total_num_tokens": 212160552.0, "z_loss": 0.00614626519382 }, { "copy_logits_max": 9.815226554870605, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.9375, "epoch": 0.16053101863671176, "gen_logits_max": 14.383912086486816, "gen_logits_mean": -7.21556282043457, "gen_logits_min": -19.120820999145508, "gen_logits_std": 2.4710254669189453, "gen_loss": 0.418381005525589, "grad_norm": 0.704034098649148, "learning_rate": 2.993684210526316e-05, "loss": 0.3895, "mean_copy_accuracy": 0.9878302365541458, "mean_gen_accuracy": 0.8463006764650345, "mean_token_accuracy": 0.8800448328256607, "num_tokens": 212383055.0, "sample_num_tokens": 7269.25, "step": 786, "total_num_tokens": 212412132.0, "z_loss": 0.005090926308184862 }, { "copy_logits_max": 10.96778392791748, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.375, "epoch": 0.1607352565739086, "gen_logits_max": 13.422155380249023, "gen_logits_mean": -6.819842338562012, "gen_logits_min": -18.565662384033203, "gen_logits_std": 2.4377031326293945, "gen_loss": 0.34993064403533936, "grad_norm": 0.659179892606247, "learning_rate": 2.993557894736842e-05, "loss": 0.3615, "mean_copy_accuracy": 0.9868132770061493, "mean_gen_accuracy": 0.8519572764635086, "mean_token_accuracy": 0.8834411054849625, "num_tokens": 212627485.0, "sample_num_tokens": 8658.25, "step": 787, "total_num_tokens": 212662118.0, "z_loss": 0.00537947379052639 }, { "copy_logits_max": 6.730114936828613, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5625, "epoch": 0.16093949451110542, "gen_logits_max": 13.158454895019531, "gen_logits_mean": -8.164857864379883, "gen_logits_min": -19.496414184570312, "gen_logits_std": 2.3495922088623047, "gen_loss": 0.3420811891555786, "grad_norm": 0.6120164607451964, "learning_rate": 2.9934315789473684e-05, "loss": 0.3705, "mean_copy_accuracy": 0.9878462702035904, "mean_gen_accuracy": 0.850274384021759, "mean_token_accuracy": 0.8826269209384918, "num_tokens": 212889500.0, "sample_num_tokens": 9358.0, "step": 788, "total_num_tokens": 212926932.0, "z_loss": 0.0040308209136128426 }, { "copy_logits_max": 7.788570404052734, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.375, "epoch": 0.16114373244830227, "gen_logits_max": 13.093653678894043, "gen_logits_mean": -6.948248863220215, "gen_logits_min": -18.254486083984375, "gen_logits_std": 2.3721513748168945, "gen_loss": 0.37993308901786804, "grad_norm": 0.6707930711486445, "learning_rate": 2.9933052631578945e-05, "loss": 0.3635, "mean_copy_accuracy": 0.9863573908805847, "mean_gen_accuracy": 0.8540352433919907, "mean_token_accuracy": 0.8873023688793182, "num_tokens": 213152450.0, "sample_num_tokens": 8479.5, "step": 789, "total_num_tokens": 213186368.0, "z_loss": 0.004175262060016394 }, { "copy_logits_max": 8.635372161865234, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.5625, "epoch": 0.1613479703854991, "gen_logits_max": 12.620787620544434, "gen_logits_mean": -7.329740047454834, "gen_logits_min": -19.354835510253906, "gen_logits_std": 2.4143173694610596, "gen_loss": 0.34471842646598816, "grad_norm": 0.6879613660371141, "learning_rate": 2.9931789473684213e-05, "loss": 0.3846, "mean_copy_accuracy": 0.9873746335506439, "mean_gen_accuracy": 0.8463467508554459, "mean_token_accuracy": 0.8810366243124008, "num_tokens": 213412145.0, "sample_num_tokens": 8394.25, "step": 790, "total_num_tokens": 213445722.0, "z_loss": 0.00459597073495388 }, { "copy_logits_max": 7.020535945892334, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.0, "epoch": 0.16155220832269593, "gen_logits_max": 13.222261428833008, "gen_logits_mean": -6.637490749359131, "gen_logits_min": -18.368005752563477, "gen_logits_std": 2.413830280303955, "gen_loss": 0.4113065004348755, "grad_norm": 0.5192725500846403, "learning_rate": 2.9930526315789477e-05, "loss": 0.3613, "mean_copy_accuracy": 0.9892220348119736, "mean_gen_accuracy": 0.8504479080438614, "mean_token_accuracy": 0.8856038749217987, "num_tokens": 213706161.0, "sample_num_tokens": 8389.75, "step": 791, "total_num_tokens": 213739720.0, "z_loss": 0.004144618287682533 }, { "copy_logits_max": 8.99427318572998, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.0625, "epoch": 0.16175644625989277, "gen_logits_max": 13.020463943481445, "gen_logits_mean": -6.879570007324219, "gen_logits_min": -20.004545211791992, "gen_logits_std": 2.5912327766418457, "gen_loss": 0.36247947812080383, "grad_norm": 0.6080993313482336, "learning_rate": 2.9929263157894738e-05, "loss": 0.3777, "mean_copy_accuracy": 0.9885353296995163, "mean_gen_accuracy": 0.849809855222702, "mean_token_accuracy": 0.881385862827301, "num_tokens": 213972667.0, "sample_num_tokens": 8011.25, "step": 792, "total_num_tokens": 214004712.0, "z_loss": 0.004088263493031263 }, { "copy_logits_max": 5.941040992736816, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.0625, "epoch": 0.16196068419708962, "gen_logits_max": 12.538192749023438, "gen_logits_mean": -8.137514114379883, "gen_logits_min": -19.8726749420166, "gen_logits_std": 2.4205245971679688, "gen_loss": 0.35409724712371826, "grad_norm": 0.5632924032952268, "learning_rate": 2.9928000000000002e-05, "loss": 0.345, "mean_copy_accuracy": 0.989321306347847, "mean_gen_accuracy": 0.8564366698265076, "mean_token_accuracy": 0.8890622109174728, "num_tokens": 214254744.0, "sample_num_tokens": 9403.0, "step": 793, "total_num_tokens": 214292356.0, "z_loss": 0.0034926445223391056 }, { "copy_logits_max": 7.067878723144531, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.9375, "epoch": 0.16216492213428643, "gen_logits_max": 13.069398880004883, "gen_logits_mean": -7.616755485534668, "gen_logits_min": -19.392822265625, "gen_logits_std": 2.3918874263763428, "gen_loss": 0.3573603928089142, "grad_norm": 0.5669042529241634, "learning_rate": 2.9926736842105263e-05, "loss": 0.3867, "mean_copy_accuracy": 0.9875789135694504, "mean_gen_accuracy": 0.8477303087711334, "mean_token_accuracy": 0.8778752237558365, "num_tokens": 214500345.0, "sample_num_tokens": 7143.75, "step": 794, "total_num_tokens": 214528920.0, "z_loss": 0.0035523870028555393 }, { "copy_logits_max": 6.91165018081665, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.375, "epoch": 0.16236916007148328, "gen_logits_max": 13.488399505615234, "gen_logits_mean": -7.542065620422363, "gen_logits_min": -19.26679801940918, "gen_logits_std": 2.3909199237823486, "gen_loss": 0.3415966331958771, "grad_norm": 0.7122692509799803, "learning_rate": 2.9925473684210527e-05, "loss": 0.3566, "mean_copy_accuracy": 0.9857781529426575, "mean_gen_accuracy": 0.8553737103939056, "mean_token_accuracy": 0.8869713991880417, "num_tokens": 214777105.0, "sample_num_tokens": 9679.25, "step": 795, "total_num_tokens": 214815822.0, "z_loss": 0.0041785212233662605 }, { "copy_logits_max": 9.329343795776367, "copy_logits_min": -687499968.0, "copy_num_tokens": 804.625, "epoch": 0.16257339800868012, "gen_logits_max": 13.183154106140137, "gen_logits_mean": -6.45075798034668, "gen_logits_min": -18.685142517089844, "gen_logits_std": 2.506004810333252, "gen_loss": 0.3125193119049072, "grad_norm": 0.7021263887485022, "learning_rate": 2.9924210526315788e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9885975271463394, "mean_gen_accuracy": 0.8541502803564072, "mean_token_accuracy": 0.8884385675191879, "num_tokens": 215039299.0, "sample_num_tokens": 10718.25, "step": 796, "total_num_tokens": 215082172.0, "z_loss": 0.005482726730406284 }, { "copy_logits_max": 9.061408996582031, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.375, "epoch": 0.16277763594587694, "gen_logits_max": 13.160345077514648, "gen_logits_mean": -7.126407623291016, "gen_logits_min": -19.13760757446289, "gen_logits_std": 2.502548933029175, "gen_loss": 0.31462791562080383, "grad_norm": 0.6380736525728973, "learning_rate": 2.9922947368421053e-05, "loss": 0.3662, "mean_copy_accuracy": 0.9888790100812912, "mean_gen_accuracy": 0.849895566701889, "mean_token_accuracy": 0.8861532509326935, "num_tokens": 215319764.0, "sample_num_tokens": 9024.5, "step": 797, "total_num_tokens": 215355862.0, "z_loss": 0.0047848704271018505 }, { "copy_logits_max": 8.742374420166016, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.875, "epoch": 0.16298187388307378, "gen_logits_max": 13.100288391113281, "gen_logits_mean": -7.263120174407959, "gen_logits_min": -19.19045639038086, "gen_logits_std": 2.4442458152770996, "gen_loss": 0.3700970709323883, "grad_norm": 0.6461999440469925, "learning_rate": 2.9921684210526317e-05, "loss": 0.3626, "mean_copy_accuracy": 0.9874172061681747, "mean_gen_accuracy": 0.8501386344432831, "mean_token_accuracy": 0.8871908485889435, "num_tokens": 215595595.0, "sample_num_tokens": 8448.25, "step": 798, "total_num_tokens": 215629388.0, "z_loss": 0.004087104927748442 }, { "copy_logits_max": 8.444573402404785, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.5, "epoch": 0.16318611182027062, "gen_logits_max": 13.258426666259766, "gen_logits_mean": -7.113354682922363, "gen_logits_min": -19.192277908325195, "gen_logits_std": 2.4383835792541504, "gen_loss": 0.38351190090179443, "grad_norm": 0.5981331494569502, "learning_rate": 2.992042105263158e-05, "loss": 0.3878, "mean_copy_accuracy": 0.9839881807565689, "mean_gen_accuracy": 0.8536517471075058, "mean_token_accuracy": 0.8789498656988144, "num_tokens": 215861944.0, "sample_num_tokens": 8552.5, "step": 799, "total_num_tokens": 215896154.0, "z_loss": 0.004079345613718033 }, { "copy_logits_max": 9.914201736450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.375, "epoch": 0.16339034975746744, "gen_logits_max": 13.568621635437012, "gen_logits_mean": -7.210421562194824, "gen_logits_min": -19.49221420288086, "gen_logits_std": 2.475783348083496, "gen_loss": 0.37920427322387695, "grad_norm": 0.5859155252420148, "learning_rate": 2.9919157894736842e-05, "loss": 0.3797, "mean_copy_accuracy": 0.9857668578624725, "mean_gen_accuracy": 0.8457463681697845, "mean_token_accuracy": 0.8793686479330063, "num_tokens": 216140444.0, "sample_num_tokens": 8563.0, "step": 800, "total_num_tokens": 216174696.0, "z_loss": 0.004499651491641998 }, { "copy_logits_max": 8.167312622070312, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.0, "epoch": 0.16359458769466428, "gen_logits_max": 12.519787788391113, "gen_logits_mean": -7.578614234924316, "gen_logits_min": -19.349056243896484, "gen_logits_std": 2.4072341918945312, "gen_loss": 0.3547951579093933, "grad_norm": 0.6822523776207636, "learning_rate": 2.9917894736842106e-05, "loss": 0.3836, "mean_copy_accuracy": 0.9870951324701309, "mean_gen_accuracy": 0.8479008972644806, "mean_token_accuracy": 0.881010502576828, "num_tokens": 216406910.0, "sample_num_tokens": 7706.5, "step": 801, "total_num_tokens": 216437736.0, "z_loss": 0.003755017649382353 }, { "copy_logits_max": 8.10764217376709, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.8125, "epoch": 0.16379882563186113, "gen_logits_max": 13.2899169921875, "gen_logits_mean": -6.764920711517334, "gen_logits_min": -20.044898986816406, "gen_logits_std": 2.596830129623413, "gen_loss": 0.3851006031036377, "grad_norm": 0.5900372347264423, "learning_rate": 2.9916631578947367e-05, "loss": 0.3935, "mean_copy_accuracy": 0.985712006688118, "mean_gen_accuracy": 0.8394070118665695, "mean_token_accuracy": 0.8762395232915878, "num_tokens": 216665342.0, "sample_num_tokens": 9076.0, "step": 802, "total_num_tokens": 216701646.0, "z_loss": 0.0048524546436965466 }, { "copy_logits_max": 5.039881706237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.1875, "epoch": 0.16400306356905794, "gen_logits_max": 11.990530967712402, "gen_logits_mean": -7.382742881774902, "gen_logits_min": -20.08365249633789, "gen_logits_std": 2.512706995010376, "gen_loss": 0.3704073131084442, "grad_norm": 0.6003735214713786, "learning_rate": 2.991536842105263e-05, "loss": 0.3675, "mean_copy_accuracy": 0.9877697080373764, "mean_gen_accuracy": 0.8482960164546967, "mean_token_accuracy": 0.8834691792726517, "num_tokens": 216938842.0, "sample_num_tokens": 7834.5, "step": 803, "total_num_tokens": 216970180.0, "z_loss": 0.003073988715186715 }, { "copy_logits_max": 8.584335327148438, "copy_logits_min": -562499968.0, "copy_num_tokens": 581.875, "epoch": 0.1642073015062548, "gen_logits_max": 13.160009384155273, "gen_logits_mean": -7.330808162689209, "gen_logits_min": -19.824508666992188, "gen_logits_std": 2.4905550479888916, "gen_loss": 0.33333027362823486, "grad_norm": 0.6644504987855733, "learning_rate": 2.9914105263157896e-05, "loss": 0.3594, "mean_copy_accuracy": 0.9898000955581665, "mean_gen_accuracy": 0.8538106977939606, "mean_token_accuracy": 0.8853946179151535, "num_tokens": 217220617.0, "sample_num_tokens": 9013.25, "step": 804, "total_num_tokens": 217256670.0, "z_loss": 0.004388227127492428 }, { "copy_logits_max": 7.595123767852783, "copy_logits_min": -687499968.0, "copy_num_tokens": 687.1875, "epoch": 0.16441153944345163, "gen_logits_max": 12.776618003845215, "gen_logits_mean": -6.355245590209961, "gen_logits_min": -18.165889739990234, "gen_logits_std": 2.41926908493042, "gen_loss": 0.3351859152317047, "grad_norm": 0.5555661860925775, "learning_rate": 2.9912842105263157e-05, "loss": 0.3502, "mean_copy_accuracy": 0.9895519316196442, "mean_gen_accuracy": 0.8535808473825455, "mean_token_accuracy": 0.8883102536201477, "num_tokens": 217491041.0, "sample_num_tokens": 9712.75, "step": 805, "total_num_tokens": 217529892.0, "z_loss": 0.005107890348881483 }, { "copy_logits_max": 8.033164978027344, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.125, "epoch": 0.16461577738064845, "gen_logits_max": 13.188478469848633, "gen_logits_mean": -6.1506123542785645, "gen_logits_min": -18.054950714111328, "gen_logits_std": 2.4891347885131836, "gen_loss": 0.3699495196342468, "grad_norm": 0.5826093661637093, "learning_rate": 2.9911578947368425e-05, "loss": 0.3484, "mean_copy_accuracy": 0.989279493689537, "mean_gen_accuracy": 0.8482519686222076, "mean_token_accuracy": 0.890453651547432, "num_tokens": 217771238.0, "sample_num_tokens": 8786.0, "step": 806, "total_num_tokens": 217806382.0, "z_loss": 0.0049988748505711555 }, { "copy_logits_max": 7.9308881759643555, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.5625, "epoch": 0.1648200153178453, "gen_logits_max": 13.300200462341309, "gen_logits_mean": -6.981674671173096, "gen_logits_min": -19.287004470825195, "gen_logits_std": 2.500762701034546, "gen_loss": 0.40952062606811523, "grad_norm": 0.5404006705327548, "learning_rate": 2.9910315789473686e-05, "loss": 0.3915, "mean_copy_accuracy": 0.9870468974113464, "mean_gen_accuracy": 0.8441582769155502, "mean_token_accuracy": 0.8763945996761322, "num_tokens": 218036340.0, "sample_num_tokens": 8103.0, "step": 807, "total_num_tokens": 218068752.0, "z_loss": 0.004053577780723572 }, { "copy_logits_max": 5.067389011383057, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.5625, "epoch": 0.16502425325504214, "gen_logits_max": 12.603620529174805, "gen_logits_mean": -7.537718772888184, "gen_logits_min": -19.612377166748047, "gen_logits_std": 2.4946603775024414, "gen_loss": 0.31764042377471924, "grad_norm": 0.539090973317569, "learning_rate": 2.990905263157895e-05, "loss": 0.3449, "mean_copy_accuracy": 0.9902516305446625, "mean_gen_accuracy": 0.8554992377758026, "mean_token_accuracy": 0.8904474526643753, "num_tokens": 218318884.0, "sample_num_tokens": 7810.5, "step": 808, "total_num_tokens": 218350126.0, "z_loss": 0.0038155773654580116 }, { "copy_logits_max": 6.8564629554748535, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.0625, "epoch": 0.16522849119223895, "gen_logits_max": 12.593002319335938, "gen_logits_mean": -7.437430381774902, "gen_logits_min": -19.812713623046875, "gen_logits_std": 2.518174171447754, "gen_loss": 0.3443559408187866, "grad_norm": 0.5539524256079187, "learning_rate": 2.990778947368421e-05, "loss": 0.3813, "mean_copy_accuracy": 0.9876763373613358, "mean_gen_accuracy": 0.8496025800704956, "mean_token_accuracy": 0.8786998838186264, "num_tokens": 218571939.0, "sample_num_tokens": 8876.75, "step": 809, "total_num_tokens": 218607446.0, "z_loss": 0.0037355695385485888 }, { "copy_logits_max": 3.7309932708740234, "copy_logits_min": -750000000.0, "copy_num_tokens": 233.8125, "epoch": 0.1654327291294358, "gen_logits_max": 12.506485939025879, "gen_logits_mean": -8.395273208618164, "gen_logits_min": -20.27569007873535, "gen_logits_std": 2.4182233810424805, "gen_loss": 0.39971011877059937, "grad_norm": 0.6325885940349312, "learning_rate": 2.9906526315789475e-05, "loss": 0.371, "mean_copy_accuracy": 0.9832004308700562, "mean_gen_accuracy": 0.8546663224697113, "mean_token_accuracy": 0.8826392292976379, "num_tokens": 218839225.0, "sample_num_tokens": 6993.75, "step": 810, "total_num_tokens": 218867200.0, "z_loss": 0.0023460634984076023 }, { "copy_logits_max": 8.06689739227295, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.375, "epoch": 0.16563696706663264, "gen_logits_max": 12.619467735290527, "gen_logits_mean": -7.564435005187988, "gen_logits_min": -19.96432876586914, "gen_logits_std": 2.525308132171631, "gen_loss": 0.35217586159706116, "grad_norm": 0.7478684920381509, "learning_rate": 2.9905263157894736e-05, "loss": 0.3698, "mean_copy_accuracy": 0.9901771992444992, "mean_gen_accuracy": 0.8419667333364487, "mean_token_accuracy": 0.8827647864818573, "num_tokens": 219124739.0, "sample_num_tokens": 8507.75, "step": 811, "total_num_tokens": 219158770.0, "z_loss": 0.004108849912881851 }, { "copy_logits_max": 3.920957088470459, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.5, "epoch": 0.16584120500382946, "gen_logits_max": 11.848736763000488, "gen_logits_mean": -8.239957809448242, "gen_logits_min": -20.078893661499023, "gen_logits_std": 2.4436194896698, "gen_loss": 0.3561672866344452, "grad_norm": 0.657473890262274, "learning_rate": 2.9904e-05, "loss": 0.3391, "mean_copy_accuracy": 0.9890225678682327, "mean_gen_accuracy": 0.8552846908569336, "mean_token_accuracy": 0.8927380740642548, "num_tokens": 219424246.0, "sample_num_tokens": 8583.5, "step": 812, "total_num_tokens": 219458580.0, "z_loss": 0.0029054395854473114 }, { "copy_logits_max": 5.032115936279297, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 0.1660454429410263, "gen_logits_max": 12.244909286499023, "gen_logits_mean": -7.69659423828125, "gen_logits_min": -19.2725772857666, "gen_logits_std": 2.483464479446411, "gen_loss": 0.37604597210884094, "grad_norm": 0.6546939402016614, "learning_rate": 2.990273684210526e-05, "loss": 0.3477, "mean_copy_accuracy": 0.9878165274858475, "mean_gen_accuracy": 0.852972537279129, "mean_token_accuracy": 0.8897842615842819, "num_tokens": 219688495.0, "sample_num_tokens": 8598.25, "step": 813, "total_num_tokens": 219722888.0, "z_loss": 0.0029966984875500202 }, { "copy_logits_max": 10.072921752929688, "copy_logits_min": -750000000.0, "copy_num_tokens": 699.8125, "epoch": 0.16624968087822312, "gen_logits_max": 13.983428955078125, "gen_logits_mean": -6.234326362609863, "gen_logits_min": -19.213397979736328, "gen_logits_std": 2.5925064086914062, "gen_loss": 0.4163469076156616, "grad_norm": 0.7504055489569752, "learning_rate": 2.990147368421053e-05, "loss": 0.3882, "mean_copy_accuracy": 0.9883307665586472, "mean_gen_accuracy": 0.8418740928173065, "mean_token_accuracy": 0.8799142688512802, "num_tokens": 219942924.0, "sample_num_tokens": 9179.0, "step": 814, "total_num_tokens": 219979640.0, "z_loss": 0.004910183604806662 }, { "copy_logits_max": 6.554817199707031, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.8125, "epoch": 0.16645391881541996, "gen_logits_max": 12.804475784301758, "gen_logits_mean": -7.61105489730835, "gen_logits_min": -19.78763771057129, "gen_logits_std": 2.4668385982513428, "gen_loss": 0.346998929977417, "grad_norm": 0.6765979679751262, "learning_rate": 2.990021052631579e-05, "loss": 0.3785, "mean_copy_accuracy": 0.9879859834909439, "mean_gen_accuracy": 0.8479243814945221, "mean_token_accuracy": 0.883628636598587, "num_tokens": 220199587.0, "sample_num_tokens": 8758.25, "step": 815, "total_num_tokens": 220234620.0, "z_loss": 0.003475716570392251 }, { "copy_logits_max": 3.1658382415771484, "copy_logits_min": -750000000.0, "copy_num_tokens": 275.0625, "epoch": 0.1666581567526168, "gen_logits_max": 12.527212142944336, "gen_logits_mean": -8.149362564086914, "gen_logits_min": -19.598556518554688, "gen_logits_std": 2.3685765266418457, "gen_loss": 0.3633902966976166, "grad_norm": 0.6797033372689764, "learning_rate": 2.9898947368421054e-05, "loss": 0.3555, "mean_copy_accuracy": 0.9864488542079926, "mean_gen_accuracy": 0.8537703454494476, "mean_token_accuracy": 0.8848672807216644, "num_tokens": 220465017.0, "sample_num_tokens": 7509.75, "step": 816, "total_num_tokens": 220495056.0, "z_loss": 0.002642896492034197 }, { "copy_logits_max": 4.316131114959717, "copy_logits_min": -750000000.0, "copy_num_tokens": 274.4375, "epoch": 0.16686239468981362, "gen_logits_max": 13.169963836669922, "gen_logits_mean": -6.881683826446533, "gen_logits_min": -18.446916580200195, "gen_logits_std": 2.343757152557373, "gen_loss": 0.4408300817012787, "grad_norm": 0.7663215039329018, "learning_rate": 2.989768421052632e-05, "loss": 0.3845, "mean_copy_accuracy": 0.9887800514698029, "mean_gen_accuracy": 0.8465997576713562, "mean_token_accuracy": 0.879686176776886, "num_tokens": 220746936.0, "sample_num_tokens": 8064.5, "step": 817, "total_num_tokens": 220779194.0, "z_loss": 0.0029954626224935055 }, { "copy_logits_max": 7.432528495788574, "copy_logits_min": -687500032.0, "copy_num_tokens": 710.0625, "epoch": 0.16706663262701046, "gen_logits_max": 12.418130874633789, "gen_logits_mean": -6.7459716796875, "gen_logits_min": -19.113033294677734, "gen_logits_std": 2.5220093727111816, "gen_loss": 0.30557551980018616, "grad_norm": 0.7011073687846189, "learning_rate": 2.989642105263158e-05, "loss": 0.3709, "mean_copy_accuracy": 0.987299770116806, "mean_gen_accuracy": 0.8475787043571472, "mean_token_accuracy": 0.8845296502113342, "num_tokens": 221034259.0, "sample_num_tokens": 9324.75, "step": 818, "total_num_tokens": 221071558.0, "z_loss": 0.004064351320266724 }, { "copy_logits_max": 5.529626369476318, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.875, "epoch": 0.1672708705642073, "gen_logits_max": 12.375368118286133, "gen_logits_mean": -6.305245876312256, "gen_logits_min": -18.18035125732422, "gen_logits_std": 2.479205846786499, "gen_loss": 0.2834327518939972, "grad_norm": 0.5449069411535239, "learning_rate": 2.9895157894736844e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9901136755943298, "mean_gen_accuracy": 0.8527185022830963, "mean_token_accuracy": 0.8892758935689926, "num_tokens": 221305800.0, "sample_num_tokens": 9466.5, "step": 819, "total_num_tokens": 221343666.0, "z_loss": 0.0038598603568971157 }, { "copy_logits_max": 6.485884666442871, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.3125, "epoch": 0.16747510850140412, "gen_logits_max": 13.076929092407227, "gen_logits_mean": -7.605614185333252, "gen_logits_min": -19.990507125854492, "gen_logits_std": 2.554718017578125, "gen_loss": 0.35694098472595215, "grad_norm": 0.6045414035114923, "learning_rate": 2.9893894736842105e-05, "loss": 0.3799, "mean_copy_accuracy": 0.9876131266355515, "mean_gen_accuracy": 0.8471831828355789, "mean_token_accuracy": 0.877942681312561, "num_tokens": 221581948.0, "sample_num_tokens": 10270.0, "step": 820, "total_num_tokens": 221623028.0, "z_loss": 0.0038623290602117777 }, { "copy_logits_max": 7.033395767211914, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.625, "epoch": 0.16767934643860097, "gen_logits_max": 12.616660118103027, "gen_logits_mean": -7.079646587371826, "gen_logits_min": -19.935916900634766, "gen_logits_std": 2.548919677734375, "gen_loss": 0.35156187415122986, "grad_norm": 0.5800320326477926, "learning_rate": 2.989263157894737e-05, "loss": 0.3439, "mean_copy_accuracy": 0.987752765417099, "mean_gen_accuracy": 0.8555137515068054, "mean_token_accuracy": 0.8904023319482803, "num_tokens": 221856118.0, "sample_num_tokens": 8501.5, "step": 821, "total_num_tokens": 221890124.0, "z_loss": 0.0037666703574359417 }, { "copy_logits_max": 8.680121421813965, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.125, "epoch": 0.1678835843757978, "gen_logits_max": 12.884378433227539, "gen_logits_mean": -7.451629638671875, "gen_logits_min": -20.14178466796875, "gen_logits_std": 2.5410571098327637, "gen_loss": 0.388775110244751, "grad_norm": 0.5860685387269451, "learning_rate": 2.9891368421052633e-05, "loss": 0.3781, "mean_copy_accuracy": 0.9870577901601791, "mean_gen_accuracy": 0.8453323543071747, "mean_token_accuracy": 0.8795686066150665, "num_tokens": 222137000.0, "sample_num_tokens": 8849.5, "step": 822, "total_num_tokens": 222172398.0, "z_loss": 0.0034295078366994858 }, { "copy_logits_max": 7.845783710479736, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.25, "epoch": 0.16808782231299463, "gen_logits_max": 12.552040100097656, "gen_logits_mean": -7.9545183181762695, "gen_logits_min": -20.337987899780273, "gen_logits_std": 2.5307912826538086, "gen_loss": 0.38767120242118835, "grad_norm": 0.6104045527539584, "learning_rate": 2.9890105263157898e-05, "loss": 0.358, "mean_copy_accuracy": 0.9898351281881332, "mean_gen_accuracy": 0.8457707017660141, "mean_token_accuracy": 0.885093554854393, "num_tokens": 222420553.0, "sample_num_tokens": 8873.75, "step": 823, "total_num_tokens": 222456048.0, "z_loss": 0.0034578097984194756 }, { "copy_logits_max": 8.69011116027832, "copy_logits_min": -687500032.0, "copy_num_tokens": 577.0, "epoch": 0.16829206025019147, "gen_logits_max": 12.938556671142578, "gen_logits_mean": -7.652237415313721, "gen_logits_min": -20.244590759277344, "gen_logits_std": 2.5135679244995117, "gen_loss": 0.3966687321662903, "grad_norm": 0.5389447713860763, "learning_rate": 2.988884210526316e-05, "loss": 0.3889, "mean_copy_accuracy": 0.9891022592782974, "mean_gen_accuracy": 0.8362838476896286, "mean_token_accuracy": 0.8778194636106491, "num_tokens": 222706932.0, "sample_num_tokens": 8894.0, "step": 824, "total_num_tokens": 222742508.0, "z_loss": 0.0038462220691144466 }, { "copy_logits_max": 6.613982677459717, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.9375, "epoch": 0.16849629818738832, "gen_logits_max": 12.78816032409668, "gen_logits_mean": -7.856884956359863, "gen_logits_min": -19.964445114135742, "gen_logits_std": 2.495213031768799, "gen_loss": 0.3830293118953705, "grad_norm": 0.5907362399451261, "learning_rate": 2.9887578947368423e-05, "loss": 0.3823, "mean_copy_accuracy": 0.9865383207798004, "mean_gen_accuracy": 0.8466041088104248, "mean_token_accuracy": 0.877249002456665, "num_tokens": 222968361.0, "sample_num_tokens": 8446.25, "step": 825, "total_num_tokens": 223002146.0, "z_loss": 0.0032040884252637625 }, { "copy_logits_max": 6.952399730682373, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.5625, "epoch": 0.16870053612458513, "gen_logits_max": 12.915716171264648, "gen_logits_mean": -7.4955339431762695, "gen_logits_min": -20.207889556884766, "gen_logits_std": 2.5455145835876465, "gen_loss": 0.32879242300987244, "grad_norm": 0.6208472890191122, "learning_rate": 2.9886315789473684e-05, "loss": 0.3539, "mean_copy_accuracy": 0.9894207566976547, "mean_gen_accuracy": 0.8527328670024872, "mean_token_accuracy": 0.886642724275589, "num_tokens": 223248560.0, "sample_num_tokens": 8618.0, "step": 826, "total_num_tokens": 223283032.0, "z_loss": 0.0032705413177609444 }, { "copy_logits_max": 5.624955177307129, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.25, "epoch": 0.16890477406178198, "gen_logits_max": 11.663459777832031, "gen_logits_mean": -7.140877723693848, "gen_logits_min": -19.43584442138672, "gen_logits_std": 2.519275188446045, "gen_loss": 0.38582170009613037, "grad_norm": 0.5477578509967335, "learning_rate": 2.9885052631578948e-05, "loss": 0.3625, "mean_copy_accuracy": 0.9893059730529785, "mean_gen_accuracy": 0.845296710729599, "mean_token_accuracy": 0.8859792351722717, "num_tokens": 223545582.0, "sample_num_tokens": 8205.0, "step": 827, "total_num_tokens": 223578402.0, "z_loss": 0.002872820943593979 }, { "copy_logits_max": 4.99778938293457, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.0, "epoch": 0.16910901199897882, "gen_logits_max": 12.839656829833984, "gen_logits_mean": -7.044599533081055, "gen_logits_min": -19.651840209960938, "gen_logits_std": 2.5390782356262207, "gen_loss": 0.3612968623638153, "grad_norm": 0.7690469566493904, "learning_rate": 2.988378947368421e-05, "loss": 0.3808, "mean_copy_accuracy": 0.9852417856454849, "mean_gen_accuracy": 0.845558226108551, "mean_token_accuracy": 0.8792348057031631, "num_tokens": 223806868.0, "sample_num_tokens": 10067.0, "step": 828, "total_num_tokens": 223847136.0, "z_loss": 0.0031615595798939466 }, { "copy_logits_max": 6.766862869262695, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.125, "epoch": 0.16931324993617564, "gen_logits_max": 12.123279571533203, "gen_logits_mean": -8.16424560546875, "gen_logits_min": -20.280059814453125, "gen_logits_std": 2.5308914184570312, "gen_loss": 0.37472257018089294, "grad_norm": 0.5932097957324122, "learning_rate": 2.9882526315789473e-05, "loss": 0.353, "mean_copy_accuracy": 0.9901452213525772, "mean_gen_accuracy": 0.853882446885109, "mean_token_accuracy": 0.8875573128461838, "num_tokens": 224082647.0, "sample_num_tokens": 9735.25, "step": 829, "total_num_tokens": 224121588.0, "z_loss": 0.0032811895944178104 }, { "copy_logits_max": 8.400049209594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.4375, "epoch": 0.16951748787337248, "gen_logits_max": 12.931403160095215, "gen_logits_mean": -8.092875480651855, "gen_logits_min": -20.882240295410156, "gen_logits_std": 2.5632433891296387, "gen_loss": 0.40932950377464294, "grad_norm": 0.8059760418055466, "learning_rate": 2.9881263157894738e-05, "loss": 0.3643, "mean_copy_accuracy": 0.9879180639982224, "mean_gen_accuracy": 0.853031262755394, "mean_token_accuracy": 0.886020690202713, "num_tokens": 224355931.0, "sample_num_tokens": 7876.25, "step": 830, "total_num_tokens": 224387436.0, "z_loss": 0.0034226011484861374 }, { "copy_logits_max": 4.996067047119141, "copy_logits_min": -687500032.0, "copy_num_tokens": 408.3125, "epoch": 0.16972172581056932, "gen_logits_max": 12.63228988647461, "gen_logits_mean": -7.451390266418457, "gen_logits_min": -20.163894653320312, "gen_logits_std": 2.5969207286834717, "gen_loss": 0.39900559186935425, "grad_norm": 1.5932900109696082, "learning_rate": 2.9880000000000002e-05, "loss": 0.3744, "mean_copy_accuracy": 0.9815382361412048, "mean_gen_accuracy": 0.8522024899721146, "mean_token_accuracy": 0.8820811659097672, "num_tokens": 224621082.0, "sample_num_tokens": 8291.0, "step": 831, "total_num_tokens": 224654246.0, "z_loss": 0.002875210251659155 }, { "copy_logits_max": 8.573877334594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.0625, "epoch": 0.16992596374776614, "gen_logits_max": 11.873778343200684, "gen_logits_mean": -7.624459743499756, "gen_logits_min": -20.114192962646484, "gen_logits_std": 2.5889010429382324, "gen_loss": 0.3206424117088318, "grad_norm": 0.8418663980974648, "learning_rate": 2.9878736842105266e-05, "loss": 0.367, "mean_copy_accuracy": 0.9902074038982391, "mean_gen_accuracy": 0.8484146296977997, "mean_token_accuracy": 0.8833905905485153, "num_tokens": 224879631.0, "sample_num_tokens": 9146.25, "step": 832, "total_num_tokens": 224916216.0, "z_loss": 0.0035618222318589687 }, { "copy_logits_max": 5.509146690368652, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.1875, "epoch": 0.17013020168496298, "gen_logits_max": 12.225553512573242, "gen_logits_mean": -8.46257209777832, "gen_logits_min": -20.647361755371094, "gen_logits_std": 2.536182165145874, "gen_loss": 0.36770591139793396, "grad_norm": 0.8086741368503834, "learning_rate": 2.9877473684210527e-05, "loss": 0.3655, "mean_copy_accuracy": 0.988898754119873, "mean_gen_accuracy": 0.8495312184095383, "mean_token_accuracy": 0.8854872137308121, "num_tokens": 225142148.0, "sample_num_tokens": 7416.5, "step": 833, "total_num_tokens": 225171814.0, "z_loss": 0.0027427291497588158 }, { "copy_logits_max": 6.93071985244751, "copy_logits_min": -687500032.0, "copy_num_tokens": 482.3125, "epoch": 0.17033443962215983, "gen_logits_max": 12.25021743774414, "gen_logits_mean": -7.816307544708252, "gen_logits_min": -20.75157928466797, "gen_logits_std": 2.630087375640869, "gen_loss": 0.36653098464012146, "grad_norm": 0.8643320102248737, "learning_rate": 2.987621052631579e-05, "loss": 0.3751, "mean_copy_accuracy": 0.9825048744678497, "mean_gen_accuracy": 0.8474572002887726, "mean_token_accuracy": 0.8806808590888977, "num_tokens": 225395035.0, "sample_num_tokens": 7877.75, "step": 834, "total_num_tokens": 225426546.0, "z_loss": 0.0031074094586074352 }, { "copy_logits_max": 6.9262518882751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.625, "epoch": 0.17053867755935664, "gen_logits_max": 11.629512786865234, "gen_logits_mean": -7.433546543121338, "gen_logits_min": -20.399181365966797, "gen_logits_std": 2.651611804962158, "gen_loss": 0.3707379698753357, "grad_norm": 0.6933247649145543, "learning_rate": 2.9874947368421052e-05, "loss": 0.3711, "mean_copy_accuracy": 0.987607941031456, "mean_gen_accuracy": 0.8519163280725479, "mean_token_accuracy": 0.885438397526741, "num_tokens": 225661715.0, "sample_num_tokens": 7547.25, "step": 835, "total_num_tokens": 225691904.0, "z_loss": 0.003112917300313711 }, { "copy_logits_max": 6.262015342712402, "copy_logits_min": -750000064.0, "copy_num_tokens": 534.625, "epoch": 0.1707429154965535, "gen_logits_max": 11.670991897583008, "gen_logits_mean": -8.159497261047363, "gen_logits_min": -20.624866485595703, "gen_logits_std": 2.5986571311950684, "gen_loss": 0.374194473028183, "grad_norm": 0.5625861003420284, "learning_rate": 2.9873684210526317e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9909227192401886, "mean_gen_accuracy": 0.8548323661088943, "mean_token_accuracy": 0.890939861536026, "num_tokens": 225943603.0, "sample_num_tokens": 9044.25, "step": 836, "total_num_tokens": 225979780.0, "z_loss": 0.0033942312002182007 }, { "copy_logits_max": 6.409367084503174, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.75, "epoch": 0.17094715343375033, "gen_logits_max": 12.035577774047852, "gen_logits_mean": -8.031560897827148, "gen_logits_min": -20.773080825805664, "gen_logits_std": 2.568361282348633, "gen_loss": 0.31526246666908264, "grad_norm": 0.5663536678442379, "learning_rate": 2.9872421052631578e-05, "loss": 0.354, "mean_copy_accuracy": 0.9876427054405212, "mean_gen_accuracy": 0.8580792546272278, "mean_token_accuracy": 0.888044074177742, "num_tokens": 226214760.0, "sample_num_tokens": 8681.5, "step": 837, "total_num_tokens": 226249486.0, "z_loss": 0.003156894352287054 }, { "copy_logits_max": 8.106266021728516, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.75, "epoch": 0.17115139137094715, "gen_logits_max": 12.587831497192383, "gen_logits_mean": -8.468045234680176, "gen_logits_min": -21.036359786987305, "gen_logits_std": 2.559691905975342, "gen_loss": 0.3583199977874756, "grad_norm": 0.7709078798614164, "learning_rate": 2.9871157894736842e-05, "loss": 0.3731, "mean_copy_accuracy": 0.9860324263572693, "mean_gen_accuracy": 0.845968946814537, "mean_token_accuracy": 0.8823921233415604, "num_tokens": 226495986.0, "sample_num_tokens": 8283.0, "step": 838, "total_num_tokens": 226529118.0, "z_loss": 0.0032154512591660023 }, { "copy_logits_max": 5.744717597961426, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.875, "epoch": 0.171355629308144, "gen_logits_max": 12.073204040527344, "gen_logits_mean": -8.8046875, "gen_logits_min": -21.17046356201172, "gen_logits_std": 2.5568737983703613, "gen_loss": 0.35569947957992554, "grad_norm": 0.792025080318659, "learning_rate": 2.9869894736842106e-05, "loss": 0.3822, "mean_copy_accuracy": 0.9891387075185776, "mean_gen_accuracy": 0.8440745323896408, "mean_token_accuracy": 0.8769891858100891, "num_tokens": 226769344.0, "sample_num_tokens": 8630.5, "step": 839, "total_num_tokens": 226803866.0, "z_loss": 0.002850792370736599 }, { "copy_logits_max": 5.6281046867370605, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.125, "epoch": 0.17155986724534084, "gen_logits_max": 12.175110816955566, "gen_logits_mean": -7.33376407623291, "gen_logits_min": -20.272735595703125, "gen_logits_std": 2.6410329341888428, "gen_loss": 0.3849351108074188, "grad_norm": 0.6699433603671201, "learning_rate": 2.986863157894737e-05, "loss": 0.3656, "mean_copy_accuracy": 0.9900181740522385, "mean_gen_accuracy": 0.8415028601884842, "mean_token_accuracy": 0.8855158537626266, "num_tokens": 227055965.0, "sample_num_tokens": 9359.75, "step": 840, "total_num_tokens": 227093404.0, "z_loss": 0.003561732592061162 }, { "copy_logits_max": 6.093506336212158, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.6875, "epoch": 0.17176410518253765, "gen_logits_max": 11.966848373413086, "gen_logits_mean": -7.985236167907715, "gen_logits_min": -20.58449935913086, "gen_logits_std": 2.610577344894409, "gen_loss": 0.3522859811782837, "grad_norm": 0.6182057585394699, "learning_rate": 2.986736842105263e-05, "loss": 0.3494, "mean_copy_accuracy": 0.9860970228910446, "mean_gen_accuracy": 0.854143813252449, "mean_token_accuracy": 0.8884807229042053, "num_tokens": 227337401.0, "sample_num_tokens": 8004.25, "step": 841, "total_num_tokens": 227369418.0, "z_loss": 0.0031412001699209213 }, { "copy_logits_max": 7.556889533996582, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.25, "epoch": 0.1719683431197345, "gen_logits_max": 12.426213264465332, "gen_logits_mean": -8.029924392700195, "gen_logits_min": -20.940105438232422, "gen_logits_std": 2.598055362701416, "gen_loss": 0.3559354245662689, "grad_norm": 0.571654390862508, "learning_rate": 2.9866105263157896e-05, "loss": 0.3591, "mean_copy_accuracy": 0.9881391525268555, "mean_gen_accuracy": 0.8559531420469284, "mean_token_accuracy": 0.8872745037078857, "num_tokens": 227614711.0, "sample_num_tokens": 9319.75, "step": 842, "total_num_tokens": 227651990.0, "z_loss": 0.0031234589405357838 }, { "copy_logits_max": 4.817152976989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.0, "epoch": 0.1721725810569313, "gen_logits_max": 11.58114242553711, "gen_logits_mean": -7.9301300048828125, "gen_logits_min": -20.814685821533203, "gen_logits_std": 2.629748821258545, "gen_loss": 0.3460030257701874, "grad_norm": 0.6597151195830186, "learning_rate": 2.9864842105263157e-05, "loss": 0.3594, "mean_copy_accuracy": 0.986877053976059, "mean_gen_accuracy": 0.8524992018938065, "mean_token_accuracy": 0.8855985403060913, "num_tokens": 227890292.0, "sample_num_tokens": 8264.5, "step": 843, "total_num_tokens": 227923350.0, "z_loss": 0.002883219625800848 }, { "copy_logits_max": 4.830052375793457, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.0, "epoch": 0.17237681899412816, "gen_logits_max": 12.168298721313477, "gen_logits_mean": -7.827972412109375, "gen_logits_min": -20.80941390991211, "gen_logits_std": 2.627990961074829, "gen_loss": 0.3289673924446106, "grad_norm": 0.6616039124158446, "learning_rate": 2.986357894736842e-05, "loss": 0.3466, "mean_copy_accuracy": 0.9870166033506393, "mean_gen_accuracy": 0.8539959490299225, "mean_token_accuracy": 0.8911843746900558, "num_tokens": 228154355.0, "sample_num_tokens": 8935.25, "step": 844, "total_num_tokens": 228190096.0, "z_loss": 0.002710428088903427 }, { "copy_logits_max": 5.087817192077637, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.8125, "epoch": 0.172581056931325, "gen_logits_max": 11.729509353637695, "gen_logits_mean": -8.950174331665039, "gen_logits_min": -21.123733520507812, "gen_logits_std": 2.5347139835357666, "gen_loss": 0.3496537208557129, "grad_norm": 0.6021245595299947, "learning_rate": 2.9862315789473685e-05, "loss": 0.3605, "mean_copy_accuracy": 0.9910245835781097, "mean_gen_accuracy": 0.8549247533082962, "mean_token_accuracy": 0.8854968845844269, "num_tokens": 228418607.0, "sample_num_tokens": 8652.25, "step": 845, "total_num_tokens": 228453216.0, "z_loss": 0.0026254993863403797 }, { "copy_logits_max": 4.44972562789917, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.8125, "epoch": 0.17278529486852182, "gen_logits_max": 12.197039604187012, "gen_logits_mean": -7.7526960372924805, "gen_logits_min": -20.323692321777344, "gen_logits_std": 2.592623233795166, "gen_loss": 0.42597341537475586, "grad_norm": 0.7694050232539285, "learning_rate": 2.9861052631578946e-05, "loss": 0.3587, "mean_copy_accuracy": 0.9846510887145996, "mean_gen_accuracy": 0.857405349612236, "mean_token_accuracy": 0.88620325922966, "num_tokens": 228702118.0, "sample_num_tokens": 7488.5, "step": 846, "total_num_tokens": 228732072.0, "z_loss": 0.00262876832857728 }, { "copy_logits_max": 4.114257335662842, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.1875, "epoch": 0.17298953280571866, "gen_logits_max": 11.835956573486328, "gen_logits_mean": -7.042372226715088, "gen_logits_min": -19.90606689453125, "gen_logits_std": 2.682003974914551, "gen_loss": 0.35710182785987854, "grad_norm": 1.6974247251245482, "learning_rate": 2.9859789473684214e-05, "loss": 0.3652, "mean_copy_accuracy": 0.9862153828144073, "mean_gen_accuracy": 0.853499174118042, "mean_token_accuracy": 0.8861034363508224, "num_tokens": 228976019.0, "sample_num_tokens": 7867.25, "step": 847, "total_num_tokens": 229007488.0, "z_loss": 0.0031210549641400576 }, { "copy_logits_max": 4.695478916168213, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.375, "epoch": 0.1731937707429155, "gen_logits_max": 11.957192420959473, "gen_logits_mean": -7.887243270874023, "gen_logits_min": -20.62744903564453, "gen_logits_std": 2.5841124057769775, "gen_loss": 0.36851608753204346, "grad_norm": 0.8171588845103329, "learning_rate": 2.9858526315789475e-05, "loss": 0.3609, "mean_copy_accuracy": 0.9901221841573715, "mean_gen_accuracy": 0.8478924483060837, "mean_token_accuracy": 0.8840920925140381, "num_tokens": 229249357.0, "sample_num_tokens": 9003.25, "step": 848, "total_num_tokens": 229285370.0, "z_loss": 0.0028850946109741926 }, { "copy_logits_max": 5.852205276489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.0, "epoch": 0.17339800868011232, "gen_logits_max": 12.219231605529785, "gen_logits_mean": -8.028167724609375, "gen_logits_min": -20.513561248779297, "gen_logits_std": 2.587150812149048, "gen_loss": 0.3696794807910919, "grad_norm": 0.5848323941477159, "learning_rate": 2.985726315789474e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9902134090662003, "mean_gen_accuracy": 0.8545837700366974, "mean_token_accuracy": 0.8909560739994049, "num_tokens": 229527774.0, "sample_num_tokens": 8217.0, "step": 849, "total_num_tokens": 229560642.0, "z_loss": 0.003227761946618557 }, { "copy_logits_max": 2.7640457153320312, "copy_logits_min": -750000000.0, "copy_num_tokens": 251.25, "epoch": 0.17360224661730916, "gen_logits_max": 12.479843139648438, "gen_logits_mean": -8.059907913208008, "gen_logits_min": -20.280620574951172, "gen_logits_std": 2.501052141189575, "gen_loss": 0.4446694850921631, "grad_norm": 0.9094690962748877, "learning_rate": 2.9856e-05, "loss": 0.39, "mean_copy_accuracy": 0.9821780771017075, "mean_gen_accuracy": 0.844900518655777, "mean_token_accuracy": 0.8757745623588562, "num_tokens": 229796064.0, "sample_num_tokens": 7389.0, "step": 850, "total_num_tokens": 229825620.0, "z_loss": 0.002444368787109852 }, { "copy_logits_max": 5.630935192108154, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.375, "epoch": 0.173806484554506, "gen_logits_max": 11.883256912231445, "gen_logits_mean": -8.153945922851562, "gen_logits_min": -20.656536102294922, "gen_logits_std": 2.5582332611083984, "gen_loss": 0.39178937673568726, "grad_norm": 0.6962056079535655, "learning_rate": 2.9854736842105264e-05, "loss": 0.3779, "mean_copy_accuracy": 0.9884775429964066, "mean_gen_accuracy": 0.8474100530147552, "mean_token_accuracy": 0.8811797499656677, "num_tokens": 230068087.0, "sample_num_tokens": 8356.75, "step": 851, "total_num_tokens": 230101514.0, "z_loss": 0.0032340665347874165 }, { "copy_logits_max": 6.996910572052002, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.25, "epoch": 0.17401072249170282, "gen_logits_max": 12.32170295715332, "gen_logits_mean": -7.920068740844727, "gen_logits_min": -20.933147430419922, "gen_logits_std": 2.604418992996216, "gen_loss": 0.38980966806411743, "grad_norm": 0.774522379149212, "learning_rate": 2.9853473684210525e-05, "loss": 0.3755, "mean_copy_accuracy": 0.9902890771627426, "mean_gen_accuracy": 0.8433952927589417, "mean_token_accuracy": 0.8832024484872818, "num_tokens": 230350582.0, "sample_num_tokens": 9365.5, "step": 852, "total_num_tokens": 230388044.0, "z_loss": 0.003860433818772435 }, { "copy_logits_max": 6.249935150146484, "copy_logits_min": -750000064.0, "copy_num_tokens": 586.5625, "epoch": 0.17421496042889967, "gen_logits_max": 12.092121124267578, "gen_logits_mean": -8.006295204162598, "gen_logits_min": -20.878520965576172, "gen_logits_std": 2.6043901443481445, "gen_loss": 0.31611040234565735, "grad_norm": 1.0865662648597498, "learning_rate": 2.985221052631579e-05, "loss": 0.3591, "mean_copy_accuracy": 0.9839991927146912, "mean_gen_accuracy": 0.8524639755487442, "mean_token_accuracy": 0.8855841606855392, "num_tokens": 230638161.0, "sample_num_tokens": 8566.25, "step": 853, "total_num_tokens": 230672426.0, "z_loss": 0.0033276542089879513 }, { "copy_logits_max": 6.376540184020996, "copy_logits_min": -687500032.0, "copy_num_tokens": 511.875, "epoch": 0.1744191983660965, "gen_logits_max": 11.918451309204102, "gen_logits_mean": -7.365129470825195, "gen_logits_min": -21.005664825439453, "gen_logits_std": 2.617757558822632, "gen_loss": 0.35834383964538574, "grad_norm": 0.6292341206855482, "learning_rate": 2.985094736842105e-05, "loss": 0.3706, "mean_copy_accuracy": 0.9879658371210098, "mean_gen_accuracy": 0.8486208468675613, "mean_token_accuracy": 0.8823289424180984, "num_tokens": 230933046.0, "sample_num_tokens": 9308.5, "step": 854, "total_num_tokens": 230970280.0, "z_loss": 0.002999647054821253 }, { "copy_logits_max": 4.49688196182251, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.5, "epoch": 0.17462343630329333, "gen_logits_max": 11.906082153320312, "gen_logits_mean": -8.428346633911133, "gen_logits_min": -20.49600601196289, "gen_logits_std": 2.5179450511932373, "gen_loss": 0.4207040071487427, "grad_norm": 0.6417828063769135, "learning_rate": 2.9849684210526318e-05, "loss": 0.3772, "mean_copy_accuracy": 0.9884713739156723, "mean_gen_accuracy": 0.8493158668279648, "mean_token_accuracy": 0.8808392733335495, "num_tokens": 231219926.0, "sample_num_tokens": 7920.0, "step": 855, "total_num_tokens": 231251606.0, "z_loss": 0.0026139242108911276 }, { "copy_logits_max": 6.001630783081055, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.4375, "epoch": 0.17482767424049017, "gen_logits_max": 11.044836044311523, "gen_logits_mean": -7.891465187072754, "gen_logits_min": -20.356658935546875, "gen_logits_std": 2.5825228691101074, "gen_loss": 0.3441801071166992, "grad_norm": 0.5642630345430273, "learning_rate": 2.984842105263158e-05, "loss": 0.3496, "mean_copy_accuracy": 0.9900598078966141, "mean_gen_accuracy": 0.8567372560501099, "mean_token_accuracy": 0.8892088085412979, "num_tokens": 231497904.0, "sample_num_tokens": 8429.5, "step": 856, "total_num_tokens": 231531622.0, "z_loss": 0.002912284340709448 }, { "copy_logits_max": 4.467080593109131, "copy_logits_min": -687500032.0, "copy_num_tokens": 836.25, "epoch": 0.17503191217768702, "gen_logits_max": 11.712797164916992, "gen_logits_mean": -8.075800895690918, "gen_logits_min": -21.24675941467285, "gen_logits_std": 2.581129789352417, "gen_loss": 0.30538833141326904, "grad_norm": 0.5811451651512195, "learning_rate": 2.9847157894736844e-05, "loss": 0.358, "mean_copy_accuracy": 0.9884755313396454, "mean_gen_accuracy": 0.8546241074800491, "mean_token_accuracy": 0.8890529870986938, "num_tokens": 231777339.0, "sample_num_tokens": 10305.25, "step": 857, "total_num_tokens": 231818560.0, "z_loss": 0.0031783157028257847 }, { "copy_logits_max": 4.572513580322266, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.375, "epoch": 0.17523615011488383, "gen_logits_max": 12.438730239868164, "gen_logits_mean": -7.877314567565918, "gen_logits_min": -19.971837997436523, "gen_logits_std": 2.533900260925293, "gen_loss": 0.44259774684906006, "grad_norm": 0.8430510115734903, "learning_rate": 2.9845894736842108e-05, "loss": 0.3832, "mean_copy_accuracy": 0.98463274538517, "mean_gen_accuracy": 0.8465398699045181, "mean_token_accuracy": 0.8757141530513763, "num_tokens": 232042174.0, "sample_num_tokens": 8678.0, "step": 858, "total_num_tokens": 232076886.0, "z_loss": 0.002545962342992425 }, { "copy_logits_max": 3.9224276542663574, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.875, "epoch": 0.17544038805208068, "gen_logits_max": 11.859479904174805, "gen_logits_mean": -9.004156112670898, "gen_logits_min": -21.133712768554688, "gen_logits_std": 2.4866292476654053, "gen_loss": 0.35715681314468384, "grad_norm": 0.8023278469201165, "learning_rate": 2.984463157894737e-05, "loss": 0.3514, "mean_copy_accuracy": 0.9875081926584244, "mean_gen_accuracy": 0.8586505651473999, "mean_token_accuracy": 0.889527291059494, "num_tokens": 232305439.0, "sample_num_tokens": 8416.25, "step": 859, "total_num_tokens": 232339104.0, "z_loss": 0.002493906067684293 }, { "copy_logits_max": 5.685044288635254, "copy_logits_min": -625000064.0, "copy_num_tokens": 594.25, "epoch": 0.17564462598927752, "gen_logits_max": 11.990642547607422, "gen_logits_mean": -7.599017143249512, "gen_logits_min": -20.006851196289062, "gen_logits_std": 2.5637948513031006, "gen_loss": 0.35549747943878174, "grad_norm": 0.7781320855848031, "learning_rate": 2.9843368421052633e-05, "loss": 0.3441, "mean_copy_accuracy": 0.9896700084209442, "mean_gen_accuracy": 0.8548874855041504, "mean_token_accuracy": 0.8907912969589233, "num_tokens": 232582624.0, "sample_num_tokens": 9139.5, "step": 860, "total_num_tokens": 232619182.0, "z_loss": 0.0032473942264914513 }, { "copy_logits_max": 4.022739410400391, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.4375, "epoch": 0.17584886392647434, "gen_logits_max": 11.822174072265625, "gen_logits_mean": -8.680704116821289, "gen_logits_min": -21.225017547607422, "gen_logits_std": 2.546590566635132, "gen_loss": 0.36014389991760254, "grad_norm": 0.5791522562246296, "learning_rate": 2.9842105263157894e-05, "loss": 0.3577, "mean_copy_accuracy": 0.9902870357036591, "mean_gen_accuracy": 0.8420834243297577, "mean_token_accuracy": 0.8840484023094177, "num_tokens": 232876515.0, "sample_num_tokens": 8225.25, "step": 861, "total_num_tokens": 232909416.0, "z_loss": 0.0025478769093751907 }, { "copy_logits_max": 4.146122932434082, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.375, "epoch": 0.17605310186367118, "gen_logits_max": 11.574846267700195, "gen_logits_mean": -8.12830924987793, "gen_logits_min": -20.709827423095703, "gen_logits_std": 2.5743532180786133, "gen_loss": 0.32338160276412964, "grad_norm": 0.7663067639214016, "learning_rate": 2.9840842105263158e-05, "loss": 0.3498, "mean_copy_accuracy": 0.9848058223724365, "mean_gen_accuracy": 0.857173278927803, "mean_token_accuracy": 0.8876082897186279, "num_tokens": 233146968.0, "sample_num_tokens": 8181.5, "step": 862, "total_num_tokens": 233179694.0, "z_loss": 0.0024645416997373104 }, { "copy_logits_max": 7.823272705078125, "copy_logits_min": -687500032.0, "copy_num_tokens": 824.5625, "epoch": 0.17625733980086802, "gen_logits_max": 11.308732986450195, "gen_logits_mean": -8.186141967773438, "gen_logits_min": -21.24669647216797, "gen_logits_std": 2.620593786239624, "gen_loss": 0.32819512486457825, "grad_norm": 0.538187892284984, "learning_rate": 2.9839578947368423e-05, "loss": 0.3268, "mean_copy_accuracy": 0.9893968105316162, "mean_gen_accuracy": 0.8582330793142319, "mean_token_accuracy": 0.8963865488767624, "num_tokens": 233437486.0, "sample_num_tokens": 10211.0, "step": 863, "total_num_tokens": 233478330.0, "z_loss": 0.003664599498733878 }, { "copy_logits_max": 1.5965261459350586, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.375, "epoch": 0.17646157773806484, "gen_logits_max": 11.450193405151367, "gen_logits_mean": -7.881457805633545, "gen_logits_min": -19.638744354248047, "gen_logits_std": 2.50266170501709, "gen_loss": 0.3487337529659271, "grad_norm": 0.6409636228202246, "learning_rate": 2.9838315789473687e-05, "loss": 0.3558, "mean_copy_accuracy": 0.9900713711977005, "mean_gen_accuracy": 0.8564431965351105, "mean_token_accuracy": 0.8885468989610672, "num_tokens": 233719714.0, "sample_num_tokens": 8710.5, "step": 864, "total_num_tokens": 233754556.0, "z_loss": 0.0026420261710882187 }, { "copy_logits_max": 6.588587760925293, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.5625, "epoch": 0.17666581567526168, "gen_logits_max": 12.173845291137695, "gen_logits_mean": -7.298615455627441, "gen_logits_min": -19.549196243286133, "gen_logits_std": 2.5545310974121094, "gen_loss": 0.38144955039024353, "grad_norm": 0.625325014059142, "learning_rate": 2.9837052631578948e-05, "loss": 0.3741, "mean_copy_accuracy": 0.9876866191625595, "mean_gen_accuracy": 0.8447717279195786, "mean_token_accuracy": 0.882356122136116, "num_tokens": 233998628.0, "sample_num_tokens": 9342.5, "step": 865, "total_num_tokens": 234035998.0, "z_loss": 0.003734270343557 }, { "copy_logits_max": 2.4757132530212402, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.9375, "epoch": 0.17687005361245853, "gen_logits_max": 12.45469856262207, "gen_logits_mean": -7.874849796295166, "gen_logits_min": -19.73048973083496, "gen_logits_std": 2.532412052154541, "gen_loss": 0.37861111760139465, "grad_norm": 0.6758387667161738, "learning_rate": 2.9835789473684212e-05, "loss": 0.3695, "mean_copy_accuracy": 0.9862608313560486, "mean_gen_accuracy": 0.8480672389268875, "mean_token_accuracy": 0.8810461610555649, "num_tokens": 234272794.0, "sample_num_tokens": 6809.0, "step": 866, "total_num_tokens": 234300030.0, "z_loss": 0.00268723675981164 }, { "copy_logits_max": 3.5545849800109863, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.875, "epoch": 0.17707429154965534, "gen_logits_max": 11.610315322875977, "gen_logits_mean": -7.7617387771606445, "gen_logits_min": -20.213525772094727, "gen_logits_std": 2.6165738105773926, "gen_loss": 0.3828548192977905, "grad_norm": 0.5762985841283668, "learning_rate": 2.9834526315789473e-05, "loss": 0.3688, "mean_copy_accuracy": 0.9880932867527008, "mean_gen_accuracy": 0.848173588514328, "mean_token_accuracy": 0.8830002844333649, "num_tokens": 234532376.0, "sample_num_tokens": 8261.0, "step": 867, "total_num_tokens": 234565420.0, "z_loss": 0.002908450085669756 }, { "copy_logits_max": 4.084124565124512, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.25, "epoch": 0.1772785294868522, "gen_logits_max": 11.470242500305176, "gen_logits_mean": -8.543460845947266, "gen_logits_min": -20.44304656982422, "gen_logits_std": 2.5372276306152344, "gen_loss": 0.3957473635673523, "grad_norm": 0.6256401321289975, "learning_rate": 2.9833263157894737e-05, "loss": 0.3602, "mean_copy_accuracy": 0.9899415969848633, "mean_gen_accuracy": 0.8470380306243896, "mean_token_accuracy": 0.8871126919984818, "num_tokens": 234838505.0, "sample_num_tokens": 8864.25, "step": 868, "total_num_tokens": 234873962.0, "z_loss": 0.002800500486046076 }, { "copy_logits_max": 4.793622970581055, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.8125, "epoch": 0.177482767424049, "gen_logits_max": 11.398444175720215, "gen_logits_mean": -8.229528427124023, "gen_logits_min": -20.314834594726562, "gen_logits_std": 2.498492479324341, "gen_loss": 0.3691736161708832, "grad_norm": 0.6765121197237866, "learning_rate": 2.9831999999999998e-05, "loss": 0.3634, "mean_copy_accuracy": 0.9879525303840637, "mean_gen_accuracy": 0.8529279679059982, "mean_token_accuracy": 0.8835064321756363, "num_tokens": 235100732.0, "sample_num_tokens": 9150.0, "step": 869, "total_num_tokens": 235137332.0, "z_loss": 0.003001783974468708 }, { "copy_logits_max": 4.060575008392334, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.25, "epoch": 0.17768700536124585, "gen_logits_max": 11.360002517700195, "gen_logits_mean": -8.203316688537598, "gen_logits_min": -21.338768005371094, "gen_logits_std": 2.584277629852295, "gen_loss": 0.36468419432640076, "grad_norm": 0.8148908754987677, "learning_rate": 2.9830736842105263e-05, "loss": 0.373, "mean_copy_accuracy": 0.9853731989860535, "mean_gen_accuracy": 0.8455430418252945, "mean_token_accuracy": 0.8842749446630478, "num_tokens": 235381296.0, "sample_num_tokens": 8011.0, "step": 870, "total_num_tokens": 235413340.0, "z_loss": 0.0026995628140866756 }, { "copy_logits_max": 3.248446464538574, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.5, "epoch": 0.1778912432984427, "gen_logits_max": 11.35068130493164, "gen_logits_mean": -8.266796112060547, "gen_logits_min": -20.837236404418945, "gen_logits_std": 2.5604116916656494, "gen_loss": 0.35448765754699707, "grad_norm": 0.669867419033593, "learning_rate": 2.982947368421053e-05, "loss": 0.3663, "mean_copy_accuracy": 0.990489736199379, "mean_gen_accuracy": 0.845976710319519, "mean_token_accuracy": 0.8845668882131577, "num_tokens": 235652840.0, "sample_num_tokens": 8768.0, "step": 871, "total_num_tokens": 235687912.0, "z_loss": 0.0024687759578227997 }, { "copy_logits_max": 2.3139114379882812, "copy_logits_min": -750000000.0, "copy_num_tokens": 719.9375, "epoch": 0.1780954812356395, "gen_logits_max": 11.221698760986328, "gen_logits_mean": -7.657015323638916, "gen_logits_min": -20.092899322509766, "gen_logits_std": 2.575143575668335, "gen_loss": 0.33626052737236023, "grad_norm": 0.7246329859474202, "learning_rate": 2.982821052631579e-05, "loss": 0.3606, "mean_copy_accuracy": 0.9876425564289093, "mean_gen_accuracy": 0.852599248290062, "mean_token_accuracy": 0.8841877579689026, "num_tokens": 235918577.0, "sample_num_tokens": 10400.75, "step": 872, "total_num_tokens": 235960180.0, "z_loss": 0.002750018611550331 }, { "copy_logits_max": 2.656126022338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.6875, "epoch": 0.17829971917283635, "gen_logits_max": 11.119852066040039, "gen_logits_mean": -8.375953674316406, "gen_logits_min": -20.889799118041992, "gen_logits_std": 2.5361328125, "gen_loss": 0.3667270541191101, "grad_norm": 0.6555913331718678, "learning_rate": 2.9826947368421056e-05, "loss": 0.3752, "mean_copy_accuracy": 0.9897098690271378, "mean_gen_accuracy": 0.845392644405365, "mean_token_accuracy": 0.8832114189863205, "num_tokens": 236212151.0, "sample_num_tokens": 9473.75, "step": 873, "total_num_tokens": 236250046.0, "z_loss": 0.0023866298142820597 }, { "copy_logits_max": 1.2871671915054321, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.8125, "epoch": 0.1785039571100332, "gen_logits_max": 11.422014236450195, "gen_logits_mean": -7.723396301269531, "gen_logits_min": -20.164087295532227, "gen_logits_std": 2.555510997772217, "gen_loss": 0.38764137029647827, "grad_norm": 0.6265470968160768, "learning_rate": 2.9825684210526316e-05, "loss": 0.3704, "mean_copy_accuracy": 0.9864885807037354, "mean_gen_accuracy": 0.8503388166427612, "mean_token_accuracy": 0.8822886645793915, "num_tokens": 236481667.0, "sample_num_tokens": 7882.75, "step": 874, "total_num_tokens": 236513198.0, "z_loss": 0.0027056634426116943 }, { "copy_logits_max": 4.143457412719727, "copy_logits_min": -687500032.0, "copy_num_tokens": 279.125, "epoch": 0.17870819504723, "gen_logits_max": 11.678297996520996, "gen_logits_mean": -8.212055206298828, "gen_logits_min": -20.254724502563477, "gen_logits_std": 2.5557072162628174, "gen_loss": 0.4398069679737091, "grad_norm": 0.5593281530053529, "learning_rate": 2.982442105263158e-05, "loss": 0.3916, "mean_copy_accuracy": 0.9889591187238693, "mean_gen_accuracy": 0.8447064161300659, "mean_token_accuracy": 0.87404765188694, "num_tokens": 236721468.0, "sample_num_tokens": 7101.0, "step": 875, "total_num_tokens": 236749872.0, "z_loss": 0.0025136801414191723 }, { "copy_logits_max": 5.03389835357666, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.8125, "epoch": 0.17891243298442686, "gen_logits_max": 11.3687744140625, "gen_logits_mean": -8.026263236999512, "gen_logits_min": -20.56260108947754, "gen_logits_std": 2.6007299423217773, "gen_loss": 0.3669224679470062, "grad_norm": 0.6493046045613082, "learning_rate": 2.9823157894736842e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9874986559152603, "mean_gen_accuracy": 0.8519958853721619, "mean_token_accuracy": 0.8881471157073975, "num_tokens": 236969102.0, "sample_num_tokens": 7690.0, "step": 876, "total_num_tokens": 236999862.0, "z_loss": 0.003185578156262636 }, { "copy_logits_max": 1.427463173866272, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.875, "epoch": 0.1791166709216237, "gen_logits_max": 10.882216453552246, "gen_logits_mean": -8.639913558959961, "gen_logits_min": -20.355737686157227, "gen_logits_std": 2.5135374069213867, "gen_loss": 0.2972063422203064, "grad_norm": 0.5370910320981838, "learning_rate": 2.9821894736842106e-05, "loss": 0.3359, "mean_copy_accuracy": 0.9884316325187683, "mean_gen_accuracy": 0.8595761358737946, "mean_token_accuracy": 0.8913392424583435, "num_tokens": 237234170.0, "sample_num_tokens": 7820.5, "step": 877, "total_num_tokens": 237265452.0, "z_loss": 0.00223600585013628 }, { "copy_logits_max": 3.1001858711242676, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.0625, "epoch": 0.17932090885882052, "gen_logits_max": 10.488807678222656, "gen_logits_mean": -8.869159698486328, "gen_logits_min": -20.938663482666016, "gen_logits_std": 2.514068603515625, "gen_loss": 0.3146442472934723, "grad_norm": 0.57013669153054, "learning_rate": 2.9820631578947367e-05, "loss": 0.3348, "mean_copy_accuracy": 0.9894106984138489, "mean_gen_accuracy": 0.8555057048797607, "mean_token_accuracy": 0.8927250057458878, "num_tokens": 237519894.0, "sample_num_tokens": 8414.0, "step": 878, "total_num_tokens": 237553550.0, "z_loss": 0.002547535812482238 }, { "copy_logits_max": 4.643823146820068, "copy_logits_min": -687500032.0, "copy_num_tokens": 357.6875, "epoch": 0.17952514679601736, "gen_logits_max": 10.985021591186523, "gen_logits_mean": -8.28952407836914, "gen_logits_min": -21.104286193847656, "gen_logits_std": 2.604644775390625, "gen_loss": 0.4226405918598175, "grad_norm": 0.6056156899191333, "learning_rate": 2.9819368421052635e-05, "loss": 0.3893, "mean_copy_accuracy": 0.9870166927576065, "mean_gen_accuracy": 0.8465886414051056, "mean_token_accuracy": 0.875450000166893, "num_tokens": 237774224.0, "sample_num_tokens": 7757.0, "step": 879, "total_num_tokens": 237805252.0, "z_loss": 0.00242538470774889 }, { "copy_logits_max": 3.7934646606445312, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.0, "epoch": 0.1797293847332142, "gen_logits_max": 11.04344654083252, "gen_logits_mean": -7.561714172363281, "gen_logits_min": -20.092941284179688, "gen_logits_std": 2.5822157859802246, "gen_loss": 0.37801888585090637, "grad_norm": 0.5852772948876582, "learning_rate": 2.9818105263157896e-05, "loss": 0.346, "mean_copy_accuracy": 0.9869158416986465, "mean_gen_accuracy": 0.8597563356161118, "mean_token_accuracy": 0.8902393132448196, "num_tokens": 238031570.0, "sample_num_tokens": 8024.0, "step": 880, "total_num_tokens": 238063666.0, "z_loss": 0.00282686366699636 }, { "copy_logits_max": 0.7590270638465881, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.1875, "epoch": 0.17993362267041102, "gen_logits_max": 11.117061614990234, "gen_logits_mean": -8.251582145690918, "gen_logits_min": -20.633180618286133, "gen_logits_std": 2.5593390464782715, "gen_loss": 0.344976544380188, "grad_norm": 0.7092860416694724, "learning_rate": 2.981684210526316e-05, "loss": 0.4017, "mean_copy_accuracy": 0.9843309968709946, "mean_gen_accuracy": 0.8400063067674637, "mean_token_accuracy": 0.8720641732215881, "num_tokens": 238288547.0, "sample_num_tokens": 8380.75, "step": 881, "total_num_tokens": 238322070.0, "z_loss": 0.0022356435656547546 }, { "copy_logits_max": 4.155538558959961, "copy_logits_min": -750000064.0, "copy_num_tokens": 482.625, "epoch": 0.18013786060760786, "gen_logits_max": 11.504446029663086, "gen_logits_mean": -7.618938446044922, "gen_logits_min": -20.198776245117188, "gen_logits_std": 2.5447583198547363, "gen_loss": 0.3586311340332031, "grad_norm": 0.5826465578581432, "learning_rate": 2.981557894736842e-05, "loss": 0.3454, "mean_copy_accuracy": 0.9905620515346527, "mean_gen_accuracy": 0.8511113226413727, "mean_token_accuracy": 0.890000507235527, "num_tokens": 238584426.0, "sample_num_tokens": 8213.0, "step": 882, "total_num_tokens": 238617278.0, "z_loss": 0.002653706818819046 }, { "copy_logits_max": 5.224908828735352, "copy_logits_min": -750000064.0, "copy_num_tokens": 507.625, "epoch": 0.1803420985448047, "gen_logits_max": 11.758302688598633, "gen_logits_mean": -8.296112060546875, "gen_logits_min": -20.67390251159668, "gen_logits_std": 2.5764498710632324, "gen_loss": 0.39424288272857666, "grad_norm": 0.5912174639133284, "learning_rate": 2.9814315789473685e-05, "loss": 0.3481, "mean_copy_accuracy": 0.9915855824947357, "mean_gen_accuracy": 0.8496796488761902, "mean_token_accuracy": 0.88962621986866, "num_tokens": 238866524.0, "sample_num_tokens": 8129.5, "step": 883, "total_num_tokens": 238899042.0, "z_loss": 0.002749408595263958 }, { "copy_logits_max": 3.5038509368896484, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.8125, "epoch": 0.18054633648200152, "gen_logits_max": 11.907062530517578, "gen_logits_mean": -8.79505729675293, "gen_logits_min": -20.70914649963379, "gen_logits_std": 2.48405122756958, "gen_loss": 0.42632585763931274, "grad_norm": 0.6279721211758642, "learning_rate": 2.981305263157895e-05, "loss": 0.3822, "mean_copy_accuracy": 0.9877202808856964, "mean_gen_accuracy": 0.84697425365448, "mean_token_accuracy": 0.8784815222024918, "num_tokens": 239133910.0, "sample_num_tokens": 7058.0, "step": 884, "total_num_tokens": 239162142.0, "z_loss": 0.0022132634185254574 }, { "copy_logits_max": -0.8312608003616333, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.3125, "epoch": 0.18075057441919837, "gen_logits_max": 11.684328079223633, "gen_logits_mean": -8.603185653686523, "gen_logits_min": -20.879226684570312, "gen_logits_std": 2.528714895248413, "gen_loss": 0.33973070979118347, "grad_norm": 1.0313181181342796, "learning_rate": 2.981178947368421e-05, "loss": 0.368, "mean_copy_accuracy": 0.985835388302803, "mean_gen_accuracy": 0.8527236580848694, "mean_token_accuracy": 0.88460873067379, "num_tokens": 239392854.0, "sample_num_tokens": 6495.5, "step": 885, "total_num_tokens": 239418836.0, "z_loss": 0.001947588985785842 }, { "copy_logits_max": 5.237123966217041, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.625, "epoch": 0.1809548123563952, "gen_logits_max": 10.869933128356934, "gen_logits_mean": -8.267230033874512, "gen_logits_min": -20.833415985107422, "gen_logits_std": 2.5313000679016113, "gen_loss": 0.3544122278690338, "grad_norm": 0.6540655756309818, "learning_rate": 2.9810526315789475e-05, "loss": 0.3483, "mean_copy_accuracy": 0.9914889931678772, "mean_gen_accuracy": 0.855166032910347, "mean_token_accuracy": 0.8886289149522781, "num_tokens": 239660318.0, "sample_num_tokens": 7823.5, "step": 886, "total_num_tokens": 239691612.0, "z_loss": 0.003091286402195692 }, { "copy_logits_max": 4.699279308319092, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.1875, "epoch": 0.18115905029359203, "gen_logits_max": 11.465049743652344, "gen_logits_mean": -7.595084190368652, "gen_logits_min": -19.32459259033203, "gen_logits_std": 2.487826347351074, "gen_loss": 0.4076504707336426, "grad_norm": 0.8294744636137943, "learning_rate": 2.9809263157894736e-05, "loss": 0.3729, "mean_copy_accuracy": 0.9880334734916687, "mean_gen_accuracy": 0.847383052110672, "mean_token_accuracy": 0.8825421035289764, "num_tokens": 239917035.0, "sample_num_tokens": 9391.75, "step": 887, "total_num_tokens": 239954602.0, "z_loss": 0.0037856013514101505 }, { "copy_logits_max": 3.7048192024230957, "copy_logits_min": -625000000.0, "copy_num_tokens": 391.4375, "epoch": 0.18136328823078887, "gen_logits_max": 11.249883651733398, "gen_logits_mean": -8.624160766601562, "gen_logits_min": -20.609283447265625, "gen_logits_std": 2.4904534816741943, "gen_loss": 0.36851000785827637, "grad_norm": 0.6101421636862251, "learning_rate": 2.9808000000000003e-05, "loss": 0.3582, "mean_copy_accuracy": 0.9869719743728638, "mean_gen_accuracy": 0.852316677570343, "mean_token_accuracy": 0.8855413496494293, "num_tokens": 240205408.0, "sample_num_tokens": 8502.5, "step": 888, "total_num_tokens": 240239418.0, "z_loss": 0.002996967639774084 }, { "copy_logits_max": 4.250343322753906, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 0.18156752616798572, "gen_logits_max": 11.316113471984863, "gen_logits_mean": -8.921979904174805, "gen_logits_min": -20.902812957763672, "gen_logits_std": 2.509270191192627, "gen_loss": 0.37537771463394165, "grad_norm": 0.6901991705441883, "learning_rate": 2.9806736842105264e-05, "loss": 0.3683, "mean_copy_accuracy": 0.9857222437858582, "mean_gen_accuracy": 0.8513045459985733, "mean_token_accuracy": 0.8839450627565384, "num_tokens": 240487052.0, "sample_num_tokens": 8885.0, "step": 889, "total_num_tokens": 240522592.0, "z_loss": 0.003072878113016486 }, { "copy_logits_max": 4.436020851135254, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.375, "epoch": 0.18177176410518253, "gen_logits_max": 11.14462661743164, "gen_logits_mean": -8.973570823669434, "gen_logits_min": -20.992406845092773, "gen_logits_std": 2.513397216796875, "gen_loss": 0.36406636238098145, "grad_norm": 0.6828223065198206, "learning_rate": 2.980547368421053e-05, "loss": 0.3649, "mean_copy_accuracy": 0.9869189560413361, "mean_gen_accuracy": 0.8525232374668121, "mean_token_accuracy": 0.8848587572574615, "num_tokens": 240768191.0, "sample_num_tokens": 8229.25, "step": 890, "total_num_tokens": 240801108.0, "z_loss": 0.003068037796765566 }, { "copy_logits_max": 2.3959856033325195, "copy_logits_min": -687500032.0, "copy_num_tokens": 485.375, "epoch": 0.18197600204237938, "gen_logits_max": 10.999452590942383, "gen_logits_mean": -7.7171950340271, "gen_logits_min": -19.61021614074707, "gen_logits_std": 2.5161781311035156, "gen_loss": 0.32962673902511597, "grad_norm": 0.6561110182880673, "learning_rate": 2.980421052631579e-05, "loss": 0.3804, "mean_copy_accuracy": 0.9880551397800446, "mean_gen_accuracy": 0.8433532565832138, "mean_token_accuracy": 0.8778183162212372, "num_tokens": 241037079.0, "sample_num_tokens": 8677.25, "step": 891, "total_num_tokens": 241071788.0, "z_loss": 0.0033096903935074806 }, { "copy_logits_max": 3.056394577026367, "copy_logits_min": -687500032.0, "copy_num_tokens": 397.1875, "epoch": 0.18218023997957622, "gen_logits_max": 11.027364730834961, "gen_logits_mean": -8.408056259155273, "gen_logits_min": -20.345781326293945, "gen_logits_std": 2.4530560970306396, "gen_loss": 0.3386232852935791, "grad_norm": 0.6943782025750287, "learning_rate": 2.9802947368421054e-05, "loss": 0.3581, "mean_copy_accuracy": 0.9850070923566818, "mean_gen_accuracy": 0.849643811583519, "mean_token_accuracy": 0.8848271071910858, "num_tokens": 241318114.0, "sample_num_tokens": 8603.0, "step": 892, "total_num_tokens": 241352526.0, "z_loss": 0.0028473809361457825 }, { "copy_logits_max": 1.9551939964294434, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.0, "epoch": 0.18238447791677304, "gen_logits_max": 10.619743347167969, "gen_logits_mean": -7.706075668334961, "gen_logits_min": -19.576507568359375, "gen_logits_std": 2.497468948364258, "gen_loss": 0.3509279787540436, "grad_norm": 0.5777529933385431, "learning_rate": 2.9801684210526315e-05, "loss": 0.3624, "mean_copy_accuracy": 0.9896747320890427, "mean_gen_accuracy": 0.8494659215211868, "mean_token_accuracy": 0.8825008273124695, "num_tokens": 241574773.0, "sample_num_tokens": 8321.75, "step": 893, "total_num_tokens": 241608060.0, "z_loss": 0.003005626145750284 }, { "copy_logits_max": 3.375598907470703, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.375, "epoch": 0.18258871585396988, "gen_logits_max": 11.35491943359375, "gen_logits_mean": -8.613384246826172, "gen_logits_min": -20.475976943969727, "gen_logits_std": 2.4937644004821777, "gen_loss": 0.3632006049156189, "grad_norm": 0.632666016208713, "learning_rate": 2.980042105263158e-05, "loss": 0.3701, "mean_copy_accuracy": 0.9884313642978668, "mean_gen_accuracy": 0.8484165370464325, "mean_token_accuracy": 0.8832533359527588, "num_tokens": 241849331.0, "sample_num_tokens": 8798.25, "step": 894, "total_num_tokens": 241884524.0, "z_loss": 0.0028421576134860516 }, { "copy_logits_max": 2.921945810317993, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.8125, "epoch": 0.1827929537911667, "gen_logits_max": 10.962575912475586, "gen_logits_mean": -7.922208786010742, "gen_logits_min": -20.173248291015625, "gen_logits_std": 2.5429916381835938, "gen_loss": 0.3484194278717041, "grad_norm": 0.6268486630327872, "learning_rate": 2.979915789473684e-05, "loss": 0.3719, "mean_copy_accuracy": 0.9874079376459122, "mean_gen_accuracy": 0.8480547219514847, "mean_token_accuracy": 0.882528617978096, "num_tokens": 242113687.0, "sample_num_tokens": 9026.25, "step": 895, "total_num_tokens": 242149792.0, "z_loss": 0.0028981599025428295 }, { "copy_logits_max": 1.9272044897079468, "copy_logits_min": -687500032.0, "copy_num_tokens": 349.0, "epoch": 0.18299719172836354, "gen_logits_max": 11.362770080566406, "gen_logits_mean": -8.659685134887695, "gen_logits_min": -20.389888763427734, "gen_logits_std": 2.4975733757019043, "gen_loss": 0.4291517436504364, "grad_norm": 0.574806921312874, "learning_rate": 2.9797894736842108e-05, "loss": 0.3794, "mean_copy_accuracy": 0.9893249869346619, "mean_gen_accuracy": 0.8464978039264679, "mean_token_accuracy": 0.8822988271713257, "num_tokens": 242402401.0, "sample_num_tokens": 7362.25, "step": 896, "total_num_tokens": 242431850.0, "z_loss": 0.0023801960051059723 }, { "copy_logits_max": 2.4425902366638184, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.25, "epoch": 0.18320142966556038, "gen_logits_max": 10.897546768188477, "gen_logits_mean": -9.239628791809082, "gen_logits_min": -21.35836410522461, "gen_logits_std": 2.4783196449279785, "gen_loss": 0.36920735239982605, "grad_norm": 0.6838351704808278, "learning_rate": 2.979663157894737e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9874899983406067, "mean_gen_accuracy": 0.8533691316843033, "mean_token_accuracy": 0.8877654671669006, "num_tokens": 242692667.0, "sample_num_tokens": 7239.75, "step": 897, "total_num_tokens": 242721626.0, "z_loss": 0.0021408640313893557 }, { "copy_logits_max": 1.9294177293777466, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.0, "epoch": 0.1834056676027572, "gen_logits_max": 11.095239639282227, "gen_logits_mean": -8.526556015014648, "gen_logits_min": -20.792741775512695, "gen_logits_std": 2.5321044921875, "gen_loss": 0.3816624879837036, "grad_norm": 0.5424834136215261, "learning_rate": 2.9795368421052633e-05, "loss": 0.3452, "mean_copy_accuracy": 0.9863482862710953, "mean_gen_accuracy": 0.8593825995922089, "mean_token_accuracy": 0.8888841420412064, "num_tokens": 242963921.0, "sample_num_tokens": 7565.75, "step": 898, "total_num_tokens": 242994184.0, "z_loss": 0.0022471670527011156 }, { "copy_logits_max": 1.7628700733184814, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.5625, "epoch": 0.18360990553995404, "gen_logits_max": 10.821795463562012, "gen_logits_mean": -8.520552635192871, "gen_logits_min": -20.64326286315918, "gen_logits_std": 2.5261001586914062, "gen_loss": 0.3511643707752228, "grad_norm": 0.5129052427822312, "learning_rate": 2.9794105263157897e-05, "loss": 0.37, "mean_copy_accuracy": 0.9884684532880783, "mean_gen_accuracy": 0.8510631769895554, "mean_token_accuracy": 0.8804785907268524, "num_tokens": 243216984.0, "sample_num_tokens": 8761.5, "step": 899, "total_num_tokens": 243252030.0, "z_loss": 0.002522039460018277 }, { "copy_logits_max": 4.6941752433776855, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.0, "epoch": 0.1838141434771509, "gen_logits_max": 11.317586898803711, "gen_logits_mean": -8.215561866760254, "gen_logits_min": -21.07893180847168, "gen_logits_std": 2.582179546356201, "gen_loss": 0.3289709687232971, "grad_norm": 1.2142543343379457, "learning_rate": 2.9792842105263158e-05, "loss": 0.3519, "mean_copy_accuracy": 0.9907692670822144, "mean_gen_accuracy": 0.8503703325986862, "mean_token_accuracy": 0.8878145068883896, "num_tokens": 243504906.0, "sample_num_tokens": 8600.0, "step": 900, "total_num_tokens": 243539306.0, "z_loss": 0.003409370081499219 }, { "copy_logits_max": 5.025672435760498, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.5625, "epoch": 0.1840183814143477, "gen_logits_max": 10.859694480895996, "gen_logits_mean": -9.09814167022705, "gen_logits_min": -21.54027557373047, "gen_logits_std": 2.544160842895508, "gen_loss": 0.3119298219680786, "grad_norm": 0.5497104748025317, "learning_rate": 2.9791578947368422e-05, "loss": 0.3627, "mean_copy_accuracy": 0.9897553473711014, "mean_gen_accuracy": 0.8537263721227646, "mean_token_accuracy": 0.8860190063714981, "num_tokens": 243780469.0, "sample_num_tokens": 7891.25, "step": 901, "total_num_tokens": 243812034.0, "z_loss": 0.0031287414021790028 }, { "copy_logits_max": 2.7982540130615234, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.8125, "epoch": 0.18422261935154455, "gen_logits_max": 10.669633865356445, "gen_logits_mean": -9.596923828125, "gen_logits_min": -21.5263614654541, "gen_logits_std": 2.493678092956543, "gen_loss": 0.3703712224960327, "grad_norm": 0.5829955988789488, "learning_rate": 2.9790315789473683e-05, "loss": 0.3691, "mean_copy_accuracy": 0.9901927262544632, "mean_gen_accuracy": 0.8408558219671249, "mean_token_accuracy": 0.8816013038158417, "num_tokens": 244055986.0, "sample_num_tokens": 7643.5, "step": 902, "total_num_tokens": 244086560.0, "z_loss": 0.002299108076840639 }, { "copy_logits_max": 1.8906989097595215, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.4375, "epoch": 0.1844268572887414, "gen_logits_max": 11.009166717529297, "gen_logits_mean": -8.627082824707031, "gen_logits_min": -20.470964431762695, "gen_logits_std": 2.48769211769104, "gen_loss": 0.3745582699775696, "grad_norm": 0.6324684547560339, "learning_rate": 2.9789052631578948e-05, "loss": 0.3418, "mean_copy_accuracy": 0.9871686846017838, "mean_gen_accuracy": 0.8572552651166916, "mean_token_accuracy": 0.8923983722925186, "num_tokens": 244344386.0, "sample_num_tokens": 7231.0, "step": 903, "total_num_tokens": 244373310.0, "z_loss": 0.0023795166052877903 }, { "copy_logits_max": 2.3735899925231934, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.0, "epoch": 0.1846310952259382, "gen_logits_max": 11.207568168640137, "gen_logits_mean": -7.838400840759277, "gen_logits_min": -19.486705780029297, "gen_logits_std": 2.4771053791046143, "gen_loss": 0.3541888892650604, "grad_norm": 0.6106193879374051, "learning_rate": 2.9787789473684212e-05, "loss": 0.3472, "mean_copy_accuracy": 0.988445520401001, "mean_gen_accuracy": 0.8561491519212723, "mean_token_accuracy": 0.8895753771066666, "num_tokens": 244615129.0, "sample_num_tokens": 8213.75, "step": 904, "total_num_tokens": 244647984.0, "z_loss": 0.0025984281674027443 }, { "copy_logits_max": 0.6583384275436401, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.25, "epoch": 0.18483533316313505, "gen_logits_max": 10.462141036987305, "gen_logits_mean": -9.129579544067383, "gen_logits_min": -20.388572692871094, "gen_logits_std": 2.4156665802001953, "gen_loss": 0.37709447741508484, "grad_norm": 0.5821155272904601, "learning_rate": 2.9786526315789476e-05, "loss": 0.3622, "mean_copy_accuracy": 0.9909286499023438, "mean_gen_accuracy": 0.8499727100133896, "mean_token_accuracy": 0.8836942166090012, "num_tokens": 244888553.0, "sample_num_tokens": 8837.75, "step": 905, "total_num_tokens": 244923904.0, "z_loss": 0.002085794461891055 }, { "copy_logits_max": 3.092000722885132, "copy_logits_min": -687500032.0, "copy_num_tokens": 580.3125, "epoch": 0.1850395711003319, "gen_logits_max": 10.795465469360352, "gen_logits_mean": -7.967262268066406, "gen_logits_min": -20.40385627746582, "gen_logits_std": 2.5537362098693848, "gen_loss": 0.3785317540168762, "grad_norm": 0.9058402458861706, "learning_rate": 2.9785263157894737e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9857455641031265, "mean_gen_accuracy": 0.8579388409852982, "mean_token_accuracy": 0.8874096274375916, "num_tokens": 245156148.0, "sample_num_tokens": 9461.5, "step": 906, "total_num_tokens": 245193994.0, "z_loss": 0.0025532355066388845 }, { "copy_logits_max": 2.191335439682007, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.375, "epoch": 0.1852438090375287, "gen_logits_max": 10.715238571166992, "gen_logits_mean": -8.577106475830078, "gen_logits_min": -20.013046264648438, "gen_logits_std": 2.4757370948791504, "gen_loss": 0.3415291905403137, "grad_norm": 0.652892139611963, "learning_rate": 2.9784e-05, "loss": 0.3538, "mean_copy_accuracy": 0.9892867356538773, "mean_gen_accuracy": 0.853954941034317, "mean_token_accuracy": 0.8877811878919601, "num_tokens": 245446782.0, "sample_num_tokens": 8430.0, "step": 907, "total_num_tokens": 245480502.0, "z_loss": 0.002596277743577957 }, { "copy_logits_max": 3.732240676879883, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.5, "epoch": 0.18544804697472556, "gen_logits_max": 10.153178215026855, "gen_logits_mean": -8.110372543334961, "gen_logits_min": -19.741769790649414, "gen_logits_std": 2.488499164581299, "gen_loss": 0.4110046327114105, "grad_norm": 0.6192956628485364, "learning_rate": 2.9782736842105262e-05, "loss": 0.371, "mean_copy_accuracy": 0.9895588308572769, "mean_gen_accuracy": 0.850539967417717, "mean_token_accuracy": 0.8840353637933731, "num_tokens": 245721748.0, "sample_num_tokens": 8148.5, "step": 908, "total_num_tokens": 245754342.0, "z_loss": 0.0027222102507948875 }, { "copy_logits_max": 3.857997417449951, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.0625, "epoch": 0.1856522849119224, "gen_logits_max": 9.953531265258789, "gen_logits_mean": -9.597773551940918, "gen_logits_min": -21.110763549804688, "gen_logits_std": 2.464261531829834, "gen_loss": 0.37075352668762207, "grad_norm": 0.5861992544247583, "learning_rate": 2.9781473684210527e-05, "loss": 0.3672, "mean_copy_accuracy": 0.9866140782833099, "mean_gen_accuracy": 0.8510380834341049, "mean_token_accuracy": 0.8832943290472031, "num_tokens": 245991774.0, "sample_num_tokens": 8851.0, "step": 909, "total_num_tokens": 246027178.0, "z_loss": 0.002387342508882284 }, { "copy_logits_max": 0.5961793661117554, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.6875, "epoch": 0.18585652284911922, "gen_logits_max": 10.026778221130371, "gen_logits_mean": -10.208097457885742, "gen_logits_min": -21.609142303466797, "gen_logits_std": 2.442251205444336, "gen_loss": 0.36336639523506165, "grad_norm": 0.6227038320415336, "learning_rate": 2.9780210526315788e-05, "loss": 0.3401, "mean_copy_accuracy": 0.9856613129377365, "mean_gen_accuracy": 0.8627730309963226, "mean_token_accuracy": 0.8915069103240967, "num_tokens": 246269297.0, "sample_num_tokens": 8584.25, "step": 910, "total_num_tokens": 246303634.0, "z_loss": 0.0021216757595539093 }, { "copy_logits_max": 4.5969929695129395, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.3125, "epoch": 0.18606076078631606, "gen_logits_max": 10.621362686157227, "gen_logits_mean": -8.401012420654297, "gen_logits_min": -20.779573440551758, "gen_logits_std": 2.5276269912719727, "gen_loss": 0.36910223960876465, "grad_norm": 0.649584153800489, "learning_rate": 2.9778947368421052e-05, "loss": 0.3577, "mean_copy_accuracy": 0.9883603155612946, "mean_gen_accuracy": 0.8447893410921097, "mean_token_accuracy": 0.8863040208816528, "num_tokens": 246555548.0, "sample_num_tokens": 8432.5, "step": 911, "total_num_tokens": 246589278.0, "z_loss": 0.0034316156525164843 }, { "copy_logits_max": 1.7641634941101074, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.75, "epoch": 0.1862649987235129, "gen_logits_max": 10.248848915100098, "gen_logits_mean": -9.543313980102539, "gen_logits_min": -21.070152282714844, "gen_logits_std": 2.451636791229248, "gen_loss": 0.34479692578315735, "grad_norm": 0.7108596459231137, "learning_rate": 2.977768421052632e-05, "loss": 0.3657, "mean_copy_accuracy": 0.9889168441295624, "mean_gen_accuracy": 0.8546218872070312, "mean_token_accuracy": 0.8830764591693878, "num_tokens": 246824749.0, "sample_num_tokens": 9367.25, "step": 912, "total_num_tokens": 246862218.0, "z_loss": 0.0025932230055332184 }, { "copy_logits_max": 4.979848384857178, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.4375, "epoch": 0.18646923666070972, "gen_logits_max": 11.195795059204102, "gen_logits_mean": -8.223108291625977, "gen_logits_min": -20.266916275024414, "gen_logits_std": 2.5203890800476074, "gen_loss": 0.3505628705024719, "grad_norm": 0.616648092911703, "learning_rate": 2.977642105263158e-05, "loss": 0.341, "mean_copy_accuracy": 0.9882919788360596, "mean_gen_accuracy": 0.8577169924974442, "mean_token_accuracy": 0.8900591433048248, "num_tokens": 247092488.0, "sample_num_tokens": 8359.5, "step": 913, "total_num_tokens": 247125926.0, "z_loss": 0.0032402777578681707 }, { "copy_logits_max": 2.3077149391174316, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.125, "epoch": 0.18667347459790656, "gen_logits_max": 10.540658950805664, "gen_logits_mean": -8.048616409301758, "gen_logits_min": -20.030956268310547, "gen_logits_std": 2.528261661529541, "gen_loss": 0.33903518319129944, "grad_norm": 0.6524002857950137, "learning_rate": 2.9775157894736845e-05, "loss": 0.3447, "mean_copy_accuracy": 0.9866075366735458, "mean_gen_accuracy": 0.8581981807947159, "mean_token_accuracy": 0.889201745390892, "num_tokens": 247360365.0, "sample_num_tokens": 9997.75, "step": 914, "total_num_tokens": 247400356.0, "z_loss": 0.002539990935474634 }, { "copy_logits_max": 4.558374404907227, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.625, "epoch": 0.1868777125351034, "gen_logits_max": 11.018035888671875, "gen_logits_mean": -9.340286254882812, "gen_logits_min": -21.49509048461914, "gen_logits_std": 2.500643730163574, "gen_loss": 0.3573974668979645, "grad_norm": 0.5533588375407277, "learning_rate": 2.9773894736842106e-05, "loss": 0.368, "mean_copy_accuracy": 0.9869149774312973, "mean_gen_accuracy": 0.8525388687849045, "mean_token_accuracy": 0.8821728527545929, "num_tokens": 247632140.0, "sample_num_tokens": 8850.5, "step": 915, "total_num_tokens": 247667542.0, "z_loss": 0.002535332925617695 }, { "copy_logits_max": 0.13383108377456665, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.25, "epoch": 0.18708195047230022, "gen_logits_max": 10.975271224975586, "gen_logits_mean": -8.099997520446777, "gen_logits_min": -19.574506759643555, "gen_logits_std": 2.4840526580810547, "gen_loss": 0.4069370627403259, "grad_norm": 0.5879610221002299, "learning_rate": 2.977263157894737e-05, "loss": 0.3848, "mean_copy_accuracy": 0.9907552599906921, "mean_gen_accuracy": 0.8422045409679413, "mean_token_accuracy": 0.8777641355991364, "num_tokens": 247894112.0, "sample_num_tokens": 7188.0, "step": 916, "total_num_tokens": 247922864.0, "z_loss": 0.0022719488479197025 }, { "copy_logits_max": 2.692124605178833, "copy_logits_min": -750000064.0, "copy_num_tokens": 474.5625, "epoch": 0.18728618840949707, "gen_logits_max": 11.568018913269043, "gen_logits_mean": -7.273922920227051, "gen_logits_min": -18.787302017211914, "gen_logits_std": 2.4680395126342773, "gen_loss": 0.41200345754623413, "grad_norm": 0.5040909238818404, "learning_rate": 2.977136842105263e-05, "loss": 0.3635, "mean_copy_accuracy": 0.9911955147981644, "mean_gen_accuracy": 0.8460298627614975, "mean_token_accuracy": 0.8844320625066757, "num_tokens": 248190802.0, "sample_num_tokens": 8756.5, "step": 917, "total_num_tokens": 248225828.0, "z_loss": 0.002615204080939293 }, { "copy_logits_max": 1.852696180343628, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.375, "epoch": 0.1874904263466939, "gen_logits_max": 10.670607566833496, "gen_logits_mean": -8.23130989074707, "gen_logits_min": -19.90377426147461, "gen_logits_std": 2.5454602241516113, "gen_loss": 0.3470662832260132, "grad_norm": 0.7054163729944601, "learning_rate": 2.9770105263157895e-05, "loss": 0.357, "mean_copy_accuracy": 0.9880202412605286, "mean_gen_accuracy": 0.8497075438499451, "mean_token_accuracy": 0.8848462253808975, "num_tokens": 248439956.0, "sample_num_tokens": 9275.5, "step": 918, "total_num_tokens": 248477058.0, "z_loss": 0.0023179228883236647 }, { "copy_logits_max": 2.193321466445923, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.4375, "epoch": 0.18769466428389073, "gen_logits_max": 10.754836082458496, "gen_logits_mean": -8.757119178771973, "gen_logits_min": -20.62796401977539, "gen_logits_std": 2.5301904678344727, "gen_loss": 0.3690875172615051, "grad_norm": 0.5283543530026625, "learning_rate": 2.9768842105263156e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9889430552721024, "mean_gen_accuracy": 0.857572540640831, "mean_token_accuracy": 0.8884508907794952, "num_tokens": 248715674.0, "sample_num_tokens": 7625.0, "step": 919, "total_num_tokens": 248746174.0, "z_loss": 0.0022112280130386353 }, { "copy_logits_max": 0.1623389720916748, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.125, "epoch": 0.18789890222108757, "gen_logits_max": 10.142074584960938, "gen_logits_mean": -7.8668742179870605, "gen_logits_min": -19.54281234741211, "gen_logits_std": 2.4552032947540283, "gen_loss": 0.34005051851272583, "grad_norm": 0.5591233140170458, "learning_rate": 2.9767578947368424e-05, "loss": 0.3603, "mean_copy_accuracy": 0.9883532375097275, "mean_gen_accuracy": 0.85604128241539, "mean_token_accuracy": 0.8852168470621109, "num_tokens": 248987513.0, "sample_num_tokens": 10497.75, "step": 920, "total_num_tokens": 249029504.0, "z_loss": 0.002528594573959708 }, { "copy_logits_max": 1.0811421871185303, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.75, "epoch": 0.1881031401582844, "gen_logits_max": 10.361234664916992, "gen_logits_mean": -8.310455322265625, "gen_logits_min": -20.220386505126953, "gen_logits_std": 2.5189623832702637, "gen_loss": 0.3369694948196411, "grad_norm": 0.5636029395901159, "learning_rate": 2.9766315789473685e-05, "loss": 0.3524, "mean_copy_accuracy": 0.989019125699997, "mean_gen_accuracy": 0.8502727448940277, "mean_token_accuracy": 0.8872446715831757, "num_tokens": 249283556.0, "sample_num_tokens": 9269.5, "step": 921, "total_num_tokens": 249320634.0, "z_loss": 0.002277777995914221 }, { "copy_logits_max": 1.5120375156402588, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.375, "epoch": 0.18830737809548123, "gen_logits_max": 10.448254585266113, "gen_logits_mean": -8.057294845581055, "gen_logits_min": -19.975204467773438, "gen_logits_std": 2.519752264022827, "gen_loss": 0.3640320301055908, "grad_norm": 0.5930617355657394, "learning_rate": 2.976505263157895e-05, "loss": 0.3641, "mean_copy_accuracy": 0.9892961084842682, "mean_gen_accuracy": 0.843870997428894, "mean_token_accuracy": 0.8868773728609085, "num_tokens": 249558953.0, "sample_num_tokens": 8973.25, "step": 922, "total_num_tokens": 249594846.0, "z_loss": 0.002322041429579258 }, { "copy_logits_max": 4.829921245574951, "copy_logits_min": -750000000.0, "copy_num_tokens": 836.25, "epoch": 0.18851161603267808, "gen_logits_max": 10.06956672668457, "gen_logits_mean": -8.76824951171875, "gen_logits_min": -21.32248306274414, "gen_logits_std": 2.589315891265869, "gen_loss": 0.28694555163383484, "grad_norm": 0.5299153771083926, "learning_rate": 2.976378947368421e-05, "loss": 0.3388, "mean_copy_accuracy": 0.9904991537332535, "mean_gen_accuracy": 0.8559021800756454, "mean_token_accuracy": 0.8907963037490845, "num_tokens": 249847906.0, "sample_num_tokens": 10453.0, "step": 923, "total_num_tokens": 249889718.0, "z_loss": 0.0027160034514963627 }, { "copy_logits_max": -0.2675163745880127, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.1875, "epoch": 0.1887158539698749, "gen_logits_max": 10.32387924194336, "gen_logits_mean": -8.442411422729492, "gen_logits_min": -20.442678451538086, "gen_logits_std": 2.538259744644165, "gen_loss": 0.3786714971065521, "grad_norm": 0.5555878450918578, "learning_rate": 2.9762526315789474e-05, "loss": 0.3615, "mean_copy_accuracy": 0.9904681444168091, "mean_gen_accuracy": 0.8489028364419937, "mean_token_accuracy": 0.8843124806880951, "num_tokens": 250133021.0, "sample_num_tokens": 10197.75, "step": 924, "total_num_tokens": 250173812.0, "z_loss": 0.002153162844479084 }, { "copy_logits_max": 1.9105579853057861, "copy_logits_min": -687500032.0, "copy_num_tokens": 326.0625, "epoch": 0.18892009190707174, "gen_logits_max": 10.571171760559082, "gen_logits_mean": -8.971369743347168, "gen_logits_min": -20.967939376831055, "gen_logits_std": 2.484860420227051, "gen_loss": 0.38402867317199707, "grad_norm": 0.5185477026418255, "learning_rate": 2.976126315789474e-05, "loss": 0.3728, "mean_copy_accuracy": 0.990831658244133, "mean_gen_accuracy": 0.8472241163253784, "mean_token_accuracy": 0.8805250525474548, "num_tokens": 250424307.0, "sample_num_tokens": 7886.25, "step": 925, "total_num_tokens": 250455852.0, "z_loss": 0.001953147817403078 }, { "copy_logits_max": -0.6203573346138, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.75, "epoch": 0.18912432984426858, "gen_logits_max": 9.429132461547852, "gen_logits_mean": -9.875555038452148, "gen_logits_min": -21.421951293945312, "gen_logits_std": 2.4519970417022705, "gen_loss": 0.31785857677459717, "grad_norm": 0.78225759012352, "learning_rate": 2.976e-05, "loss": 0.3631, "mean_copy_accuracy": 0.9861244708299637, "mean_gen_accuracy": 0.8559803515672684, "mean_token_accuracy": 0.8845186829566956, "num_tokens": 250677075.0, "sample_num_tokens": 7654.75, "step": 926, "total_num_tokens": 250707694.0, "z_loss": 0.0016098563792183995 }, { "copy_logits_max": 2.1275463104248047, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.875, "epoch": 0.1893285677814654, "gen_logits_max": 10.150922775268555, "gen_logits_mean": -8.387961387634277, "gen_logits_min": -20.499671936035156, "gen_logits_std": 2.549281597137451, "gen_loss": 0.2991670072078705, "grad_norm": 0.5510123921039987, "learning_rate": 2.9758736842105264e-05, "loss": 0.3294, "mean_copy_accuracy": 0.990705132484436, "mean_gen_accuracy": 0.861829936504364, "mean_token_accuracy": 0.8937734663486481, "num_tokens": 250934080.0, "sample_num_tokens": 7881.5, "step": 927, "total_num_tokens": 250965606.0, "z_loss": 0.0024599512107670307 }, { "copy_logits_max": 1.5643670558929443, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.5, "epoch": 0.18953280571866224, "gen_logits_max": 11.369138717651367, "gen_logits_mean": -7.792227745056152, "gen_logits_min": -19.844802856445312, "gen_logits_std": 2.5320074558258057, "gen_loss": 0.3166605830192566, "grad_norm": 0.6669474917535227, "learning_rate": 2.9757473684210528e-05, "loss": 0.3413, "mean_copy_accuracy": 0.992609053850174, "mean_gen_accuracy": 0.8585474044084549, "mean_token_accuracy": 0.8939039707183838, "num_tokens": 251192061.0, "sample_num_tokens": 7294.25, "step": 928, "total_num_tokens": 251221238.0, "z_loss": 0.0025196780916303396 }, { "copy_logits_max": 2.1713781356811523, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.5, "epoch": 0.18973704365585908, "gen_logits_max": 10.4392728805542, "gen_logits_mean": -9.019533157348633, "gen_logits_min": -21.250961303710938, "gen_logits_std": 2.557298421859741, "gen_loss": 0.3656514883041382, "grad_norm": 0.5672611891182655, "learning_rate": 2.9756210526315793e-05, "loss": 0.3759, "mean_copy_accuracy": 0.9883472770452499, "mean_gen_accuracy": 0.850138857960701, "mean_token_accuracy": 0.8811528980731964, "num_tokens": 251455191.0, "sample_num_tokens": 8693.25, "step": 929, "total_num_tokens": 251489964.0, "z_loss": 0.0024400365073233843 }, { "copy_logits_max": 2.3636770248413086, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.6875, "epoch": 0.1899412815930559, "gen_logits_max": 10.95350456237793, "gen_logits_mean": -9.370903015136719, "gen_logits_min": -21.069416046142578, "gen_logits_std": 2.411454200744629, "gen_loss": 0.43251752853393555, "grad_norm": 0.670143171205468, "learning_rate": 2.9754947368421054e-05, "loss": 0.3876, "mean_copy_accuracy": 0.9849465787410736, "mean_gen_accuracy": 0.848218560218811, "mean_token_accuracy": 0.8767342418432236, "num_tokens": 251712880.0, "sample_num_tokens": 7471.0, "step": 930, "total_num_tokens": 251742764.0, "z_loss": 0.002173761371523142 }, { "copy_logits_max": 3.988807201385498, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.1875, "epoch": 0.19014551953025274, "gen_logits_max": 10.906791687011719, "gen_logits_mean": -8.414043426513672, "gen_logits_min": -20.684104919433594, "gen_logits_std": 2.573596954345703, "gen_loss": 0.34380966424942017, "grad_norm": 0.5176583336431758, "learning_rate": 2.9753684210526318e-05, "loss": 0.3412, "mean_copy_accuracy": 0.9892837703227997, "mean_gen_accuracy": 0.8554679900407791, "mean_token_accuracy": 0.8917952328920364, "num_tokens": 251987193.0, "sample_num_tokens": 8503.25, "step": 931, "total_num_tokens": 252021206.0, "z_loss": 0.002677856246009469 }, { "copy_logits_max": 3.2959280014038086, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.375, "epoch": 0.1903497574674496, "gen_logits_max": 10.29415512084961, "gen_logits_mean": -8.332029342651367, "gen_logits_min": -20.765636444091797, "gen_logits_std": 2.561068058013916, "gen_loss": 0.33292073011398315, "grad_norm": 0.7338995293534093, "learning_rate": 2.975242105263158e-05, "loss": 0.3405, "mean_copy_accuracy": 0.9905448704957962, "mean_gen_accuracy": 0.8595177531242371, "mean_token_accuracy": 0.892880842089653, "num_tokens": 252275476.0, "sample_num_tokens": 8784.0, "step": 932, "total_num_tokens": 252310612.0, "z_loss": 0.002728318329900503 }, { "copy_logits_max": 2.038987636566162, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.1875, "epoch": 0.1905539954046464, "gen_logits_max": 10.45945930480957, "gen_logits_mean": -7.964666843414307, "gen_logits_min": -20.005277633666992, "gen_logits_std": 2.493180751800537, "gen_loss": 0.3409222960472107, "grad_norm": 0.6594117189042626, "learning_rate": 2.9751157894736843e-05, "loss": 0.3462, "mean_copy_accuracy": 0.9890398383140564, "mean_gen_accuracy": 0.8571336418390274, "mean_token_accuracy": 0.8880463093519211, "num_tokens": 252540162.0, "sample_num_tokens": 8751.0, "step": 933, "total_num_tokens": 252575166.0, "z_loss": 0.0027634943835437298 }, { "copy_logits_max": 1.5221318006515503, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.8125, "epoch": 0.19075823334184325, "gen_logits_max": 10.67513656616211, "gen_logits_mean": -9.18714714050293, "gen_logits_min": -21.074520111083984, "gen_logits_std": 2.4985618591308594, "gen_loss": 0.3807910084724426, "grad_norm": 0.5239758853202481, "learning_rate": 2.9749894736842104e-05, "loss": 0.3732, "mean_copy_accuracy": 0.9881898760795593, "mean_gen_accuracy": 0.8468160927295685, "mean_token_accuracy": 0.8799785077571869, "num_tokens": 252810852.0, "sample_num_tokens": 7528.0, "step": 934, "total_num_tokens": 252840964.0, "z_loss": 0.0021234401501715183 }, { "copy_logits_max": 0.981104850769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.75, "epoch": 0.1909624712790401, "gen_logits_max": 9.92863655090332, "gen_logits_mean": -9.678716659545898, "gen_logits_min": -21.56924819946289, "gen_logits_std": 2.5026564598083496, "gen_loss": 0.34798574447631836, "grad_norm": 0.5851815847413998, "learning_rate": 2.974863157894737e-05, "loss": 0.3733, "mean_copy_accuracy": 0.9873041212558746, "mean_gen_accuracy": 0.8492943346500397, "mean_token_accuracy": 0.8807788491249084, "num_tokens": 253076612.0, "sample_num_tokens": 8310.0, "step": 935, "total_num_tokens": 253109852.0, "z_loss": 0.002024194924160838 }, { "copy_logits_max": 1.7759082317352295, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.3125, "epoch": 0.1911667092162369, "gen_logits_max": 9.576292037963867, "gen_logits_mean": -9.770793914794922, "gen_logits_min": -22.110233306884766, "gen_logits_std": 2.4988837242126465, "gen_loss": 0.32663494348526, "grad_norm": 0.5174346667950972, "learning_rate": 2.974736842105263e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9866753667593002, "mean_gen_accuracy": 0.856509268283844, "mean_token_accuracy": 0.8854335099458694, "num_tokens": 253339783.0, "sample_num_tokens": 8713.25, "step": 936, "total_num_tokens": 253374636.0, "z_loss": 0.002021582331508398 }, { "copy_logits_max": 1.9978705644607544, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.8125, "epoch": 0.19137094715343375, "gen_logits_max": 10.205836296081543, "gen_logits_mean": -9.491167068481445, "gen_logits_min": -21.613719940185547, "gen_logits_std": 2.521141529083252, "gen_loss": 0.3498217463493347, "grad_norm": 0.5390506488391211, "learning_rate": 2.9746105263157897e-05, "loss": 0.3594, "mean_copy_accuracy": 0.9886531233787537, "mean_gen_accuracy": 0.8543533980846405, "mean_token_accuracy": 0.8856001645326614, "num_tokens": 253603286.0, "sample_num_tokens": 8939.5, "step": 937, "total_num_tokens": 253639044.0, "z_loss": 0.0020390914287418127 }, { "copy_logits_max": 3.068760633468628, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.75, "epoch": 0.1915751850906306, "gen_logits_max": 10.31714153289795, "gen_logits_mean": -8.233672142028809, "gen_logits_min": -20.30570411682129, "gen_logits_std": 2.5239477157592773, "gen_loss": 0.368442177772522, "grad_norm": 0.5894501706389522, "learning_rate": 2.974484210526316e-05, "loss": 0.3631, "mean_copy_accuracy": 0.9908730089664459, "mean_gen_accuracy": 0.8493241667747498, "mean_token_accuracy": 0.8840350061655045, "num_tokens": 253888618.0, "sample_num_tokens": 8201.5, "step": 938, "total_num_tokens": 253921424.0, "z_loss": 0.002343276981264353 }, { "copy_logits_max": 2.1537201404571533, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.25, "epoch": 0.1917794230278274, "gen_logits_max": 10.48017406463623, "gen_logits_mean": -8.685070037841797, "gen_logits_min": -20.779769897460938, "gen_logits_std": 2.5550689697265625, "gen_loss": 0.3488420844078064, "grad_norm": 0.563490880356395, "learning_rate": 2.9743578947368422e-05, "loss": 0.3711, "mean_copy_accuracy": 0.9895646274089813, "mean_gen_accuracy": 0.8452233225107193, "mean_token_accuracy": 0.8803568780422211, "num_tokens": 254146719.0, "sample_num_tokens": 9179.25, "step": 939, "total_num_tokens": 254183436.0, "z_loss": 0.0022509826812893152 }, { "copy_logits_max": 3.947725534439087, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.625, "epoch": 0.19198366096502426, "gen_logits_max": 10.925748825073242, "gen_logits_mean": -7.289186477661133, "gen_logits_min": -19.617034912109375, "gen_logits_std": 2.536735773086548, "gen_loss": 0.36122092604637146, "grad_norm": 0.522397045531925, "learning_rate": 2.9742315789473686e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9906223118305206, "mean_gen_accuracy": 0.8461935818195343, "mean_token_accuracy": 0.8873616456985474, "num_tokens": 254435350.0, "sample_num_tokens": 8843.0, "step": 940, "total_num_tokens": 254470722.0, "z_loss": 0.0023419070057570934 }, { "copy_logits_max": 2.1576144695281982, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.625, "epoch": 0.1921878989022211, "gen_logits_max": 10.505647659301758, "gen_logits_mean": -8.196992874145508, "gen_logits_min": -20.054609298706055, "gen_logits_std": 2.464961290359497, "gen_loss": 0.34450554847717285, "grad_norm": 0.5236011040582571, "learning_rate": 2.9741052631578947e-05, "loss": 0.3485, "mean_copy_accuracy": 0.9896539151668549, "mean_gen_accuracy": 0.8516097217798233, "mean_token_accuracy": 0.8884985446929932, "num_tokens": 254743157.0, "sample_num_tokens": 8551.75, "step": 941, "total_num_tokens": 254777364.0, "z_loss": 0.0021680393256247044 }, { "copy_logits_max": 4.208239555358887, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.8125, "epoch": 0.19239213683941792, "gen_logits_max": 10.633373260498047, "gen_logits_mean": -8.83138656616211, "gen_logits_min": -21.246475219726562, "gen_logits_std": 2.571901798248291, "gen_loss": 0.35852205753326416, "grad_norm": 0.5490206015314277, "learning_rate": 2.9739789473684212e-05, "loss": 0.3844, "mean_copy_accuracy": 0.9905966371297836, "mean_gen_accuracy": 0.8418972045183182, "mean_token_accuracy": 0.877228781580925, "num_tokens": 255018486.0, "sample_num_tokens": 8216.5, "step": 942, "total_num_tokens": 255051352.0, "z_loss": 0.0023526984732598066 }, { "copy_logits_max": 2.057007312774658, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 0.19259637477661476, "gen_logits_max": 10.429031372070312, "gen_logits_mean": -8.508228302001953, "gen_logits_min": -20.411128997802734, "gen_logits_std": 2.5449185371398926, "gen_loss": 0.3649771213531494, "grad_norm": 0.5671148574477394, "learning_rate": 2.9738526315789473e-05, "loss": 0.3658, "mean_copy_accuracy": 0.9892848879098892, "mean_gen_accuracy": 0.8472824692726135, "mean_token_accuracy": 0.8830956071615219, "num_tokens": 255294394.0, "sample_num_tokens": 8802.0, "step": 943, "total_num_tokens": 255329602.0, "z_loss": 0.002107572741806507 }, { "copy_logits_max": 3.8926427364349365, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.875, "epoch": 0.1928006127138116, "gen_logits_max": 10.74816608428955, "gen_logits_mean": -8.008354187011719, "gen_logits_min": -20.002403259277344, "gen_logits_std": 2.5344605445861816, "gen_loss": 0.35289424657821655, "grad_norm": 0.5052975017066582, "learning_rate": 2.9737263157894737e-05, "loss": 0.3586, "mean_copy_accuracy": 0.9896829426288605, "mean_gen_accuracy": 0.8487791866064072, "mean_token_accuracy": 0.8839813321828842, "num_tokens": 255566281.0, "sample_num_tokens": 7098.75, "step": 944, "total_num_tokens": 255594676.0, "z_loss": 0.0021678281482309103 }, { "copy_logits_max": 2.8940131664276123, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.375, "epoch": 0.19300485065100842, "gen_logits_max": 10.64528751373291, "gen_logits_mean": -8.167905807495117, "gen_logits_min": -20.244544982910156, "gen_logits_std": 2.53960919380188, "gen_loss": 0.37819594144821167, "grad_norm": 0.6214343239776138, "learning_rate": 2.9736e-05, "loss": 0.3679, "mean_copy_accuracy": 0.9880287349224091, "mean_gen_accuracy": 0.8442784547805786, "mean_token_accuracy": 0.8818963766098022, "num_tokens": 255837074.0, "sample_num_tokens": 7881.5, "step": 945, "total_num_tokens": 255868600.0, "z_loss": 0.0021841549314558506 }, { "copy_logits_max": 1.8937268257141113, "copy_logits_min": -750000064.0, "copy_num_tokens": 481.9375, "epoch": 0.19320908858820526, "gen_logits_max": 10.431778907775879, "gen_logits_mean": -8.854856491088867, "gen_logits_min": -20.665483474731445, "gen_logits_std": 2.493889331817627, "gen_loss": 0.37172648310661316, "grad_norm": 0.5154714708280294, "learning_rate": 2.9734736842105266e-05, "loss": 0.3532, "mean_copy_accuracy": 0.9909444600343704, "mean_gen_accuracy": 0.8536811172962189, "mean_token_accuracy": 0.8867639899253845, "num_tokens": 256106364.0, "sample_num_tokens": 8618.5, "step": 946, "total_num_tokens": 256140838.0, "z_loss": 0.0021341918036341667 }, { "copy_logits_max": 2.5699429512023926, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.6875, "epoch": 0.1934133265254021, "gen_logits_max": 9.890045166015625, "gen_logits_mean": -8.594608306884766, "gen_logits_min": -20.96094512939453, "gen_logits_std": 2.484044075012207, "gen_loss": 0.32586464285850525, "grad_norm": 0.5301121359956027, "learning_rate": 2.9733473684210526e-05, "loss": 0.3517, "mean_copy_accuracy": 0.9908143877983093, "mean_gen_accuracy": 0.8491527140140533, "mean_token_accuracy": 0.8876058906316757, "num_tokens": 256376231.0, "sample_num_tokens": 9702.25, "step": 947, "total_num_tokens": 256415040.0, "z_loss": 0.0022581894882023335 }, { "copy_logits_max": -0.09436443448066711, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.3125, "epoch": 0.19361756446259892, "gen_logits_max": 9.96198844909668, "gen_logits_mean": -9.016568183898926, "gen_logits_min": -20.65936851501465, "gen_logits_std": 2.4545116424560547, "gen_loss": 0.3466638922691345, "grad_norm": 0.5866095879857254, "learning_rate": 2.973221052631579e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9896716326475143, "mean_gen_accuracy": 0.8556943088769913, "mean_token_accuracy": 0.893168568611145, "num_tokens": 256670247.0, "sample_num_tokens": 8913.75, "step": 948, "total_num_tokens": 256705902.0, "z_loss": 0.0018843101570382714 }, { "copy_logits_max": 0.3250522017478943, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.75, "epoch": 0.19382180239979577, "gen_logits_max": 10.505782127380371, "gen_logits_mean": -8.179856300354004, "gen_logits_min": -20.218643188476562, "gen_logits_std": 2.5580530166625977, "gen_loss": 0.36219173669815063, "grad_norm": 0.5081709590052913, "learning_rate": 2.9730947368421052e-05, "loss": 0.3556, "mean_copy_accuracy": 0.9886549264192581, "mean_gen_accuracy": 0.8584764748811722, "mean_token_accuracy": 0.8869768232107162, "num_tokens": 256923769.0, "sample_num_tokens": 8119.75, "step": 949, "total_num_tokens": 256956248.0, "z_loss": 0.002181285060942173 }, { "copy_logits_max": 3.184572458267212, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.4375, "epoch": 0.19402604033699258, "gen_logits_max": 10.26169204711914, "gen_logits_mean": -9.246618270874023, "gen_logits_min": -21.481163024902344, "gen_logits_std": 2.5415706634521484, "gen_loss": 0.36282339692115784, "grad_norm": 0.497321638747251, "learning_rate": 2.9729684210526316e-05, "loss": 0.344, "mean_copy_accuracy": 0.9908750504255295, "mean_gen_accuracy": 0.8532115817070007, "mean_token_accuracy": 0.8892938941717148, "num_tokens": 257208339.0, "sample_num_tokens": 8921.75, "step": 950, "total_num_tokens": 257244026.0, "z_loss": 0.0022058242466300726 }, { "copy_logits_max": 1.466109275817871, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.9375, "epoch": 0.19423027827418943, "gen_logits_max": 9.979381561279297, "gen_logits_mean": -9.66604232788086, "gen_logits_min": -21.381542205810547, "gen_logits_std": 2.433938980102539, "gen_loss": 0.3711757957935333, "grad_norm": 0.5383565096185188, "learning_rate": 2.9728421052631577e-05, "loss": 0.3618, "mean_copy_accuracy": 0.9905485063791275, "mean_gen_accuracy": 0.8484105467796326, "mean_token_accuracy": 0.8829602152109146, "num_tokens": 257486162.0, "sample_num_tokens": 8531.0, "step": 951, "total_num_tokens": 257520286.0, "z_loss": 0.0020627276971936226 }, { "copy_logits_max": 0.18150067329406738, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.5, "epoch": 0.19443451621138627, "gen_logits_max": 10.004796981811523, "gen_logits_mean": -9.353273391723633, "gen_logits_min": -21.26820182800293, "gen_logits_std": 2.493891954421997, "gen_loss": 0.3340573012828827, "grad_norm": 0.5524595640384085, "learning_rate": 2.972715789473684e-05, "loss": 0.3427, "mean_copy_accuracy": 0.9898330867290497, "mean_gen_accuracy": 0.861962303519249, "mean_token_accuracy": 0.8895103633403778, "num_tokens": 257741050.0, "sample_num_tokens": 8367.0, "step": 952, "total_num_tokens": 257774518.0, "z_loss": 0.0019276984967291355 }, { "copy_logits_max": 2.063467502593994, "copy_logits_min": -750000064.0, "copy_num_tokens": 520.25, "epoch": 0.1946387541485831, "gen_logits_max": 10.417570114135742, "gen_logits_mean": -8.80947494506836, "gen_logits_min": -21.099468231201172, "gen_logits_std": 2.535244941711426, "gen_loss": 0.3585382103919983, "grad_norm": 0.6063059062808496, "learning_rate": 2.972589473684211e-05, "loss": 0.3703, "mean_copy_accuracy": 0.9875776469707489, "mean_gen_accuracy": 0.8490587621927261, "mean_token_accuracy": 0.8819780945777893, "num_tokens": 257993419.0, "sample_num_tokens": 8532.75, "step": 953, "total_num_tokens": 258027550.0, "z_loss": 0.0022206061985343695 }, { "copy_logits_max": 2.8507730960845947, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.3125, "epoch": 0.19484299208577993, "gen_logits_max": 10.151350021362305, "gen_logits_mean": -8.049656867980957, "gen_logits_min": -20.323043823242188, "gen_logits_std": 2.537604570388794, "gen_loss": 0.3711095452308655, "grad_norm": 0.5501400072125195, "learning_rate": 2.972463157894737e-05, "loss": 0.3716, "mean_copy_accuracy": 0.9911895394325256, "mean_gen_accuracy": 0.8439278453588486, "mean_token_accuracy": 0.8828074336051941, "num_tokens": 258260328.0, "sample_num_tokens": 8689.0, "step": 954, "total_num_tokens": 258295084.0, "z_loss": 0.002278480678796768 }, { "copy_logits_max": 1.6182105541229248, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.6875, "epoch": 0.19504723002297678, "gen_logits_max": 10.137886047363281, "gen_logits_mean": -8.933330535888672, "gen_logits_min": -21.12598419189453, "gen_logits_std": 2.4822797775268555, "gen_loss": 0.3764703571796417, "grad_norm": 0.5467627779155162, "learning_rate": 2.9723368421052634e-05, "loss": 0.3582, "mean_copy_accuracy": 0.9907035231590271, "mean_gen_accuracy": 0.8485045433044434, "mean_token_accuracy": 0.8849401026964188, "num_tokens": 258534727.0, "sample_num_tokens": 8867.25, "step": 955, "total_num_tokens": 258570196.0, "z_loss": 0.0021494533866643906 }, { "copy_logits_max": 2.700089931488037, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.8125, "epoch": 0.1952514679601736, "gen_logits_max": 10.599113464355469, "gen_logits_mean": -9.345027923583984, "gen_logits_min": -21.21858787536621, "gen_logits_std": 2.49530291557312, "gen_loss": 0.4084722697734833, "grad_norm": 0.6387315130814517, "learning_rate": 2.9722105263157895e-05, "loss": 0.3733, "mean_copy_accuracy": 0.9895905256271362, "mean_gen_accuracy": 0.8440877348184586, "mean_token_accuracy": 0.8850481659173965, "num_tokens": 258804797.0, "sample_num_tokens": 7483.75, "step": 956, "total_num_tokens": 258834732.0, "z_loss": 0.002052243333309889 }, { "copy_logits_max": 1.7943826913833618, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.875, "epoch": 0.19545570589737044, "gen_logits_max": 10.068657875061035, "gen_logits_mean": -9.472558975219727, "gen_logits_min": -21.45402717590332, "gen_logits_std": 2.465817928314209, "gen_loss": 0.37382787466049194, "grad_norm": 0.5381809747838402, "learning_rate": 2.972084210526316e-05, "loss": 0.3676, "mean_copy_accuracy": 0.9898691326379776, "mean_gen_accuracy": 0.8520835787057877, "mean_token_accuracy": 0.8814848214387894, "num_tokens": 259068609.0, "sample_num_tokens": 8169.25, "step": 957, "total_num_tokens": 259101286.0, "z_loss": 0.0019470097031444311 }, { "copy_logits_max": 1.7253456115722656, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.5625, "epoch": 0.19565994383456728, "gen_logits_max": 10.765083312988281, "gen_logits_mean": -8.728530883789062, "gen_logits_min": -20.919858932495117, "gen_logits_std": 2.503365993499756, "gen_loss": 0.4064634442329407, "grad_norm": 0.6251083047169887, "learning_rate": 2.971957894736842e-05, "loss": 0.3892, "mean_copy_accuracy": 0.9888664036989212, "mean_gen_accuracy": 0.8419199734926224, "mean_token_accuracy": 0.8762586265802383, "num_tokens": 259339137.0, "sample_num_tokens": 7244.75, "step": 958, "total_num_tokens": 259368116.0, "z_loss": 0.0019075032323598862 }, { "copy_logits_max": 0.9813402891159058, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.6875, "epoch": 0.1958641817717641, "gen_logits_max": 10.559046745300293, "gen_logits_mean": -8.70506477355957, "gen_logits_min": -20.468772888183594, "gen_logits_std": 2.4553091526031494, "gen_loss": 0.3768264055252075, "grad_norm": 0.6359548206793931, "learning_rate": 2.9718315789473685e-05, "loss": 0.349, "mean_copy_accuracy": 0.989033654332161, "mean_gen_accuracy": 0.8573740720748901, "mean_token_accuracy": 0.8895649015903473, "num_tokens": 259620204.0, "sample_num_tokens": 9364.0, "step": 959, "total_num_tokens": 259657660.0, "z_loss": 0.0019979586359113455 }, { "copy_logits_max": 2.456778049468994, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.25, "epoch": 0.19606841970896094, "gen_logits_max": 9.798646926879883, "gen_logits_mean": -8.454081535339355, "gen_logits_min": -20.44279670715332, "gen_logits_std": 2.4846627712249756, "gen_loss": 0.34716179966926575, "grad_norm": 0.5382616645912955, "learning_rate": 2.9717052631578946e-05, "loss": 0.3651, "mean_copy_accuracy": 0.9891225099563599, "mean_gen_accuracy": 0.85023133456707, "mean_token_accuracy": 0.8830409795045853, "num_tokens": 259867383.0, "sample_num_tokens": 9369.25, "step": 960, "total_num_tokens": 259904860.0, "z_loss": 0.0022574623581022024 }, { "copy_logits_max": 0.8178321123123169, "copy_logits_min": -687500032.0, "copy_num_tokens": 725.1875, "epoch": 0.19627265764615778, "gen_logits_max": 9.873428344726562, "gen_logits_mean": -8.296314239501953, "gen_logits_min": -19.97598648071289, "gen_logits_std": 2.4359331130981445, "gen_loss": 0.3179870843887329, "grad_norm": 0.6279157054921537, "learning_rate": 2.9715789473684213e-05, "loss": 0.3509, "mean_copy_accuracy": 0.9889701306819916, "mean_gen_accuracy": 0.8455851823091507, "mean_token_accuracy": 0.8868038058280945, "num_tokens": 260149922.0, "sample_num_tokens": 9600.5, "step": 961, "total_num_tokens": 260188324.0, "z_loss": 0.002213770058006048 }, { "copy_logits_max": 2.9323182106018066, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.8125, "epoch": 0.1964768955833546, "gen_logits_max": 9.767436027526855, "gen_logits_mean": -9.530557632446289, "gen_logits_min": -21.33795928955078, "gen_logits_std": 2.425307512283325, "gen_loss": 0.32491111755371094, "grad_norm": 0.5176555458810156, "learning_rate": 2.9714526315789474e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9922253340482712, "mean_gen_accuracy": 0.8490384221076965, "mean_token_accuracy": 0.8924068063497543, "num_tokens": 260447591.0, "sample_num_tokens": 9247.25, "step": 962, "total_num_tokens": 260484580.0, "z_loss": 0.002229201141744852 }, { "copy_logits_max": -0.05316799879074097, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.25, "epoch": 0.19668113352055144, "gen_logits_max": 10.326855659484863, "gen_logits_mean": -8.41003131866455, "gen_logits_min": -20.377758026123047, "gen_logits_std": 2.492133378982544, "gen_loss": 0.39018481969833374, "grad_norm": 0.6106882776242254, "learning_rate": 2.971326315789474e-05, "loss": 0.3534, "mean_copy_accuracy": 0.9878525286912918, "mean_gen_accuracy": 0.8559559881687164, "mean_token_accuracy": 0.8845381587743759, "num_tokens": 260694404.0, "sample_num_tokens": 7559.0, "step": 963, "total_num_tokens": 260724640.0, "z_loss": 0.001995435683056712 }, { "copy_logits_max": -0.16906329989433289, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.875, "epoch": 0.1968853714577483, "gen_logits_max": 9.95452880859375, "gen_logits_mean": -7.787285804748535, "gen_logits_min": -19.36410140991211, "gen_logits_std": 2.437986135482788, "gen_loss": 0.38181352615356445, "grad_norm": 0.5707644827227106, "learning_rate": 2.9712e-05, "loss": 0.3561, "mean_copy_accuracy": 0.9900869429111481, "mean_gen_accuracy": 0.8506709784269333, "mean_token_accuracy": 0.8854635208845139, "num_tokens": 260945170.0, "sample_num_tokens": 8321.0, "step": 964, "total_num_tokens": 260978454.0, "z_loss": 0.002037856262177229 }, { "copy_logits_max": -1.3382560014724731, "copy_logits_min": -687500032.0, "copy_num_tokens": 217.5625, "epoch": 0.1970896093949451, "gen_logits_max": 10.037069320678711, "gen_logits_mean": -8.814042091369629, "gen_logits_min": -20.447895050048828, "gen_logits_std": 2.466862916946411, "gen_loss": 0.365251362323761, "grad_norm": 0.5085273094075, "learning_rate": 2.9710736842105264e-05, "loss": 0.3538, "mean_copy_accuracy": 0.990532636642456, "mean_gen_accuracy": 0.8585423976182938, "mean_token_accuracy": 0.8844197690486908, "num_tokens": 261198715.0, "sample_num_tokens": 7086.25, "step": 965, "total_num_tokens": 261227060.0, "z_loss": 0.0017227384960278869 }, { "copy_logits_max": 2.113527774810791, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.8125, "epoch": 0.19729384733214195, "gen_logits_max": 9.694435119628906, "gen_logits_mean": -9.080196380615234, "gen_logits_min": -21.290271759033203, "gen_logits_std": 2.5266330242156982, "gen_loss": 0.38037195801734924, "grad_norm": 0.5631940456259623, "learning_rate": 2.9709473684210528e-05, "loss": 0.3621, "mean_copy_accuracy": 0.9899841994047165, "mean_gen_accuracy": 0.8532556593418121, "mean_token_accuracy": 0.8869838565587997, "num_tokens": 261463763.0, "sample_num_tokens": 8159.75, "step": 966, "total_num_tokens": 261496402.0, "z_loss": 0.001986632589250803 }, { "copy_logits_max": 2.025446653366089, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.0, "epoch": 0.1974980852693388, "gen_logits_max": 9.698402404785156, "gen_logits_mean": -9.591442108154297, "gen_logits_min": -21.464439392089844, "gen_logits_std": 2.4846038818359375, "gen_loss": 0.36902475357055664, "grad_norm": 0.49854346060062865, "learning_rate": 2.970821052631579e-05, "loss": 0.3441, "mean_copy_accuracy": 0.9917770773172379, "mean_gen_accuracy": 0.8482529073953629, "mean_token_accuracy": 0.8897117972373962, "num_tokens": 261760501.0, "sample_num_tokens": 8228.25, "step": 967, "total_num_tokens": 261793414.0, "z_loss": 0.00208309106528759 }, { "copy_logits_max": 1.9629888534545898, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.4375, "epoch": 0.1977023232065356, "gen_logits_max": 9.847395896911621, "gen_logits_mean": -8.890403747558594, "gen_logits_min": -21.419103622436523, "gen_logits_std": 2.565791368484497, "gen_loss": 0.3628406822681427, "grad_norm": 0.6788057768490499, "learning_rate": 2.9706947368421053e-05, "loss": 0.3665, "mean_copy_accuracy": 0.990618571639061, "mean_gen_accuracy": 0.8485892117023468, "mean_token_accuracy": 0.8816175013780594, "num_tokens": 262024656.0, "sample_num_tokens": 9092.0, "step": 968, "total_num_tokens": 262061024.0, "z_loss": 0.0021372302435338497 }, { "copy_logits_max": 0.41802841424942017, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.3125, "epoch": 0.19790656114373245, "gen_logits_max": 9.9109525680542, "gen_logits_mean": -8.518013000488281, "gen_logits_min": -21.182615280151367, "gen_logits_std": 2.5171637535095215, "gen_loss": 0.3333929181098938, "grad_norm": 0.6403456998295792, "learning_rate": 2.9705684210526318e-05, "loss": 0.3439, "mean_copy_accuracy": 0.9889155775308609, "mean_gen_accuracy": 0.8505611568689346, "mean_token_accuracy": 0.8885448426008224, "num_tokens": 262301734.0, "sample_num_tokens": 8918.5, "step": 969, "total_num_tokens": 262337408.0, "z_loss": 0.0019375784322619438 }, { "copy_logits_max": 0.06043684482574463, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.6875, "epoch": 0.1981107990809293, "gen_logits_max": 10.11275863647461, "gen_logits_mean": -8.385505676269531, "gen_logits_min": -20.723834991455078, "gen_logits_std": 2.517017364501953, "gen_loss": 0.3488408923149109, "grad_norm": 0.5688643071437535, "learning_rate": 2.9704421052631582e-05, "loss": 0.356, "mean_copy_accuracy": 0.99001844227314, "mean_gen_accuracy": 0.8515735864639282, "mean_token_accuracy": 0.8859285265207291, "num_tokens": 262556124.0, "sample_num_tokens": 8688.0, "step": 970, "total_num_tokens": 262590876.0, "z_loss": 0.0019534502644091845 }, { "copy_logits_max": -0.6755502820014954, "copy_logits_min": -687500032.0, "copy_num_tokens": 238.3125, "epoch": 0.1983150370181261, "gen_logits_max": 10.350420951843262, "gen_logits_mean": -10.091971397399902, "gen_logits_min": -21.794218063354492, "gen_logits_std": 2.441452980041504, "gen_loss": 0.4047589600086212, "grad_norm": 0.5075021903604047, "learning_rate": 2.9703157894736843e-05, "loss": 0.348, "mean_copy_accuracy": 0.9905158877372742, "mean_gen_accuracy": 0.8578065186738968, "mean_token_accuracy": 0.8883659839630127, "num_tokens": 262837928.0, "sample_num_tokens": 6590.5, "step": 971, "total_num_tokens": 262864290.0, "z_loss": 0.0018492767121642828 }, { "copy_logits_max": 1.517416000366211, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.5625, "epoch": 0.19851927495532296, "gen_logits_max": 9.64765739440918, "gen_logits_mean": -9.15570068359375, "gen_logits_min": -21.13275146484375, "gen_logits_std": 2.4986581802368164, "gen_loss": 0.32193681597709656, "grad_norm": 0.5830664220916028, "learning_rate": 2.9701894736842107e-05, "loss": 0.3704, "mean_copy_accuracy": 0.9898913353681564, "mean_gen_accuracy": 0.8481111377477646, "mean_token_accuracy": 0.8806380927562714, "num_tokens": 263077182.0, "sample_num_tokens": 9295.0, "step": 972, "total_num_tokens": 263114362.0, "z_loss": 0.0021055247634649277 }, { "copy_logits_max": -0.588291347026825, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.625, "epoch": 0.1987235128925198, "gen_logits_max": 10.02825927734375, "gen_logits_mean": -9.052084922790527, "gen_logits_min": -20.48737335205078, "gen_logits_std": 2.421229362487793, "gen_loss": 0.343498557806015, "grad_norm": 0.5078495561880336, "learning_rate": 2.9700631578947368e-05, "loss": 0.3521, "mean_copy_accuracy": 0.9882113933563232, "mean_gen_accuracy": 0.856902539730072, "mean_token_accuracy": 0.8858052790164948, "num_tokens": 263336626.0, "sample_num_tokens": 8067.0, "step": 973, "total_num_tokens": 263368894.0, "z_loss": 0.0018725725822150707 }, { "copy_logits_max": 0.17624205350875854, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.5625, "epoch": 0.19892775082971662, "gen_logits_max": 9.899704933166504, "gen_logits_mean": -9.259161949157715, "gen_logits_min": -22.16944122314453, "gen_logits_std": 2.5541703701019287, "gen_loss": 0.3496692180633545, "grad_norm": 0.5214949942419334, "learning_rate": 2.9699368421052632e-05, "loss": 0.3722, "mean_copy_accuracy": 0.9884505420923233, "mean_gen_accuracy": 0.8465733528137207, "mean_token_accuracy": 0.8795957267284393, "num_tokens": 263615344.0, "sample_num_tokens": 9832.0, "step": 974, "total_num_tokens": 263654672.0, "z_loss": 0.0018469325732439756 }, { "copy_logits_max": 0.6929829120635986, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.25, "epoch": 0.19913198876691346, "gen_logits_max": 10.148260116577148, "gen_logits_mean": -10.320058822631836, "gen_logits_min": -22.554607391357422, "gen_logits_std": 2.518120288848877, "gen_loss": 0.33519232273101807, "grad_norm": 0.9486884303277624, "learning_rate": 2.9698105263157893e-05, "loss": 0.3718, "mean_copy_accuracy": 0.987487405538559, "mean_gen_accuracy": 0.8509652763605118, "mean_token_accuracy": 0.8817910552024841, "num_tokens": 263871057.0, "sample_num_tokens": 7790.75, "step": 975, "total_num_tokens": 263902220.0, "z_loss": 0.0018901245202869177 }, { "copy_logits_max": 5.690830707550049, "copy_logits_min": -687500032.0, "copy_num_tokens": 772.5, "epoch": 0.19933622670411028, "gen_logits_max": 9.367324829101562, "gen_logits_mean": -9.462149620056152, "gen_logits_min": -22.41596031188965, "gen_logits_std": 2.584453582763672, "gen_loss": 0.33967214822769165, "grad_norm": 0.5567644753078395, "learning_rate": 2.9696842105263158e-05, "loss": 0.3462, "mean_copy_accuracy": 0.990486666560173, "mean_gen_accuracy": 0.8485732674598694, "mean_token_accuracy": 0.8894951045513153, "num_tokens": 264143851.0, "sample_num_tokens": 9841.75, "step": 976, "total_num_tokens": 264183218.0, "z_loss": 0.002362336963415146 }, { "copy_logits_max": 1.8365634679794312, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.9375, "epoch": 0.19954046464130712, "gen_logits_max": 10.033881187438965, "gen_logits_mean": -9.754753112792969, "gen_logits_min": -22.235633850097656, "gen_logits_std": 2.600823402404785, "gen_loss": 0.34922462701797485, "grad_norm": 0.544298236899337, "learning_rate": 2.9695578947368422e-05, "loss": 0.36, "mean_copy_accuracy": 0.9896432161331177, "mean_gen_accuracy": 0.8463251739740372, "mean_token_accuracy": 0.8845691382884979, "num_tokens": 264436443.0, "sample_num_tokens": 9119.25, "step": 977, "total_num_tokens": 264472920.0, "z_loss": 0.002096375450491905 }, { "copy_logits_max": -0.17600077390670776, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.6875, "epoch": 0.19974470257850396, "gen_logits_max": 10.102799415588379, "gen_logits_mean": -9.872724533081055, "gen_logits_min": -22.06805992126465, "gen_logits_std": 2.5127248764038086, "gen_loss": 0.3671894669532776, "grad_norm": 0.7230038712997322, "learning_rate": 2.9694315789473686e-05, "loss": 0.3618, "mean_copy_accuracy": 0.9870802462100983, "mean_gen_accuracy": 0.8508462309837341, "mean_token_accuracy": 0.8821248859167099, "num_tokens": 264714591.0, "sample_num_tokens": 7447.25, "step": 978, "total_num_tokens": 264744380.0, "z_loss": 0.0015745642594993114 }, { "copy_logits_max": 2.5487210750579834, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.125, "epoch": 0.19994894051570078, "gen_logits_max": 9.866680145263672, "gen_logits_mean": -9.39747428894043, "gen_logits_min": -21.401241302490234, "gen_logits_std": 2.472257137298584, "gen_loss": 0.3934152126312256, "grad_norm": 0.5606402309811477, "learning_rate": 2.969305263157895e-05, "loss": 0.3959, "mean_copy_accuracy": 0.9887462705373764, "mean_gen_accuracy": 0.8393884599208832, "mean_token_accuracy": 0.8743199557065964, "num_tokens": 264964400.0, "sample_num_tokens": 9629.0, "step": 979, "total_num_tokens": 265002916.0, "z_loss": 0.002041025087237358 }, { "copy_logits_max": 2.3513500690460205, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.125, "epoch": 0.20015317845289762, "gen_logits_max": 10.010944366455078, "gen_logits_mean": -8.505134582519531, "gen_logits_min": -20.95604705810547, "gen_logits_std": 2.550259590148926, "gen_loss": 0.372447669506073, "grad_norm": 0.6556997711220381, "learning_rate": 2.969178947368421e-05, "loss": 0.3675, "mean_copy_accuracy": 0.9883140474557877, "mean_gen_accuracy": 0.8487273007631302, "mean_token_accuracy": 0.8815856277942657, "num_tokens": 265237069.0, "sample_num_tokens": 8915.75, "step": 980, "total_num_tokens": 265272732.0, "z_loss": 0.0023720450699329376 }, { "copy_logits_max": -0.3202613592147827, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.125, "epoch": 0.20035741639009447, "gen_logits_max": 10.593969345092773, "gen_logits_mean": -9.0770263671875, "gen_logits_min": -21.513347625732422, "gen_logits_std": 2.5108542442321777, "gen_loss": 0.3878645896911621, "grad_norm": 0.5124159214502215, "learning_rate": 2.9690526315789476e-05, "loss": 0.3496, "mean_copy_accuracy": 0.988139271736145, "mean_gen_accuracy": 0.8569713085889816, "mean_token_accuracy": 0.8869130611419678, "num_tokens": 265500376.0, "sample_num_tokens": 8151.5, "step": 981, "total_num_tokens": 265532982.0, "z_loss": 0.0019291001372039318 }, { "copy_logits_max": 1.55901300907135, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.125, "epoch": 0.20056165432729128, "gen_logits_max": 9.916173934936523, "gen_logits_mean": -8.267614364624023, "gen_logits_min": -21.067977905273438, "gen_logits_std": 2.614401340484619, "gen_loss": 0.3143560588359833, "grad_norm": 0.6227426385015135, "learning_rate": 2.9689263157894737e-05, "loss": 0.3525, "mean_copy_accuracy": 0.988527774810791, "mean_gen_accuracy": 0.8476445227861404, "mean_token_accuracy": 0.8857108801603317, "num_tokens": 265779213.0, "sample_num_tokens": 9745.25, "step": 982, "total_num_tokens": 265818194.0, "z_loss": 0.0021382251288741827 }, { "copy_logits_max": -0.7758195996284485, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.125, "epoch": 0.20076589226448813, "gen_logits_max": 9.93724250793457, "gen_logits_mean": -10.391094207763672, "gen_logits_min": -22.26782989501953, "gen_logits_std": 2.493755340576172, "gen_loss": 0.38237255811691284, "grad_norm": 0.5627348210257548, "learning_rate": 2.9688e-05, "loss": 0.3656, "mean_copy_accuracy": 0.9874483793973923, "mean_gen_accuracy": 0.8516964018344879, "mean_token_accuracy": 0.883604034781456, "num_tokens": 266069392.0, "sample_num_tokens": 7859.5, "step": 983, "total_num_tokens": 266100830.0, "z_loss": 0.0018643904477357864 }, { "copy_logits_max": 1.8882122039794922, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.125, "epoch": 0.20097013020168497, "gen_logits_max": 10.37370491027832, "gen_logits_mean": -8.956451416015625, "gen_logits_min": -21.190364837646484, "gen_logits_std": 2.566075086593628, "gen_loss": 0.36963513493537903, "grad_norm": 0.570130818258039, "learning_rate": 2.9686736842105262e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9904422610998154, "mean_gen_accuracy": 0.8619711399078369, "mean_token_accuracy": 0.8928974568843842, "num_tokens": 266341717.0, "sample_num_tokens": 9149.75, "step": 984, "total_num_tokens": 266378316.0, "z_loss": 0.0022747735492885113 }, { "copy_logits_max": 1.579312801361084, "copy_logits_min": -687500032.0, "copy_num_tokens": 507.3125, "epoch": 0.2011743681388818, "gen_logits_max": 10.66530990600586, "gen_logits_mean": -9.329176902770996, "gen_logits_min": -21.918415069580078, "gen_logits_std": 2.5493690967559814, "gen_loss": 0.39532914757728577, "grad_norm": 0.6327411371893876, "learning_rate": 2.9685473684210526e-05, "loss": 0.3804, "mean_copy_accuracy": 0.9882256835699081, "mean_gen_accuracy": 0.8405026346445084, "mean_token_accuracy": 0.8787427097558975, "num_tokens": 266626390.0, "sample_num_tokens": 9035.0, "step": 985, "total_num_tokens": 266662530.0, "z_loss": 0.002171127125620842 }, { "copy_logits_max": -0.39584270119667053, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.8125, "epoch": 0.20137860607607863, "gen_logits_max": 10.546388626098633, "gen_logits_mean": -9.060758590698242, "gen_logits_min": -21.647090911865234, "gen_logits_std": 2.5233089923858643, "gen_loss": 0.34915629029273987, "grad_norm": 0.7360211306554955, "learning_rate": 2.968421052631579e-05, "loss": 0.3463, "mean_copy_accuracy": 0.9892337322235107, "mean_gen_accuracy": 0.8542172163724899, "mean_token_accuracy": 0.8907938450574875, "num_tokens": 266904480.0, "sample_num_tokens": 7825.5, "step": 986, "total_num_tokens": 266935782.0, "z_loss": 0.0017438005888834596 }, { "copy_logits_max": 2.6272711753845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.125, "epoch": 0.20158284401327548, "gen_logits_max": 9.515447616577148, "gen_logits_mean": -9.663763999938965, "gen_logits_min": -22.458581924438477, "gen_logits_std": 2.5876221656799316, "gen_loss": 0.32742539048194885, "grad_norm": 0.5779943148857586, "learning_rate": 2.9682947368421055e-05, "loss": 0.3447, "mean_copy_accuracy": 0.9912147223949432, "mean_gen_accuracy": 0.8524498641490936, "mean_token_accuracy": 0.8901773244142532, "num_tokens": 267192455.0, "sample_num_tokens": 9122.75, "step": 987, "total_num_tokens": 267228946.0, "z_loss": 0.00208359956741333 }, { "copy_logits_max": 2.4551610946655273, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.875, "epoch": 0.2017870819504723, "gen_logits_max": 10.663002014160156, "gen_logits_mean": -9.07278823852539, "gen_logits_min": -21.425098419189453, "gen_logits_std": 2.587435483932495, "gen_loss": 0.37630385160446167, "grad_norm": 0.7780044364198799, "learning_rate": 2.9681684210526316e-05, "loss": 0.3732, "mean_copy_accuracy": 0.9896656423807144, "mean_gen_accuracy": 0.8464847505092621, "mean_token_accuracy": 0.8786395341157913, "num_tokens": 267447840.0, "sample_num_tokens": 7184.0, "step": 988, "total_num_tokens": 267476576.0, "z_loss": 0.002283566165715456 }, { "copy_logits_max": 1.9038821458816528, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.8125, "epoch": 0.20199131988766914, "gen_logits_max": 9.527061462402344, "gen_logits_mean": -9.42745590209961, "gen_logits_min": -21.892711639404297, "gen_logits_std": 2.5319466590881348, "gen_loss": 0.3160886764526367, "grad_norm": 0.563423582535954, "learning_rate": 2.968042105263158e-05, "loss": 0.3426, "mean_copy_accuracy": 0.9906088411808014, "mean_gen_accuracy": 0.8541813492774963, "mean_token_accuracy": 0.8894860595464706, "num_tokens": 267708186.0, "sample_num_tokens": 8689.5, "step": 989, "total_num_tokens": 267742944.0, "z_loss": 0.002062997780740261 }, { "copy_logits_max": 3.55808162689209, "copy_logits_min": -750000000.0, "copy_num_tokens": 814.5625, "epoch": 0.20219555782486598, "gen_logits_max": 9.55060863494873, "gen_logits_mean": -9.256610870361328, "gen_logits_min": -22.46088981628418, "gen_logits_std": 2.574439525604248, "gen_loss": 0.3025684952735901, "grad_norm": 0.8466865518916546, "learning_rate": 2.967915789473684e-05, "loss": 0.3319, "mean_copy_accuracy": 0.984903410077095, "mean_gen_accuracy": 0.8582673072814941, "mean_token_accuracy": 0.891977995634079, "num_tokens": 267987874.0, "sample_num_tokens": 10450.0, "step": 990, "total_num_tokens": 268029674.0, "z_loss": 0.0020476235076785088 }, { "copy_logits_max": 2.332068920135498, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.8125, "epoch": 0.2023997957620628, "gen_logits_max": 9.664356231689453, "gen_logits_mean": -10.792118072509766, "gen_logits_min": -22.982805252075195, "gen_logits_std": 2.520068407058716, "gen_loss": 0.352876752614975, "grad_norm": 0.564413109726524, "learning_rate": 2.9677894736842105e-05, "loss": 0.3371, "mean_copy_accuracy": 0.9900414943695068, "mean_gen_accuracy": 0.8606764376163483, "mean_token_accuracy": 0.8916207849979401, "num_tokens": 268263022.0, "sample_num_tokens": 7446.5, "step": 991, "total_num_tokens": 268292808.0, "z_loss": 0.0019996571354568005 }, { "copy_logits_max": 0.8244427442550659, "copy_logits_min": -625000064.0, "copy_num_tokens": 354.3125, "epoch": 0.20260403369925964, "gen_logits_max": 10.481178283691406, "gen_logits_mean": -9.44898509979248, "gen_logits_min": -21.56662368774414, "gen_logits_std": 2.55429744720459, "gen_loss": 0.3749402165412903, "grad_norm": 0.594804595663937, "learning_rate": 2.967663157894737e-05, "loss": 0.334, "mean_copy_accuracy": 0.9919039458036423, "mean_gen_accuracy": 0.8654917925596237, "mean_token_accuracy": 0.8915065228939056, "num_tokens": 268521992.0, "sample_num_tokens": 7762.5, "step": 992, "total_num_tokens": 268553042.0, "z_loss": 0.002259515691548586 }, { "copy_logits_max": 4.464613914489746, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.1875, "epoch": 0.20280827163645648, "gen_logits_max": 9.441116333007812, "gen_logits_mean": -9.678337097167969, "gen_logits_min": -22.31785011291504, "gen_logits_std": 2.54119610786438, "gen_loss": 0.3523758351802826, "grad_norm": 0.5957806632693805, "learning_rate": 2.967536842105263e-05, "loss": 0.3449, "mean_copy_accuracy": 0.9899183809757233, "mean_gen_accuracy": 0.8495037853717804, "mean_token_accuracy": 0.8912231177091599, "num_tokens": 268811004.0, "sample_num_tokens": 8733.5, "step": 993, "total_num_tokens": 268845938.0, "z_loss": 0.002267182804644108 }, { "copy_logits_max": 0.4550474286079407, "copy_logits_min": -625000000.0, "copy_num_tokens": 260.25, "epoch": 0.2030125095736533, "gen_logits_max": 10.28856086730957, "gen_logits_mean": -10.343487739562988, "gen_logits_min": -22.30951499938965, "gen_logits_std": 2.513723611831665, "gen_loss": 0.3428301215171814, "grad_norm": 0.6638885342800998, "learning_rate": 2.9674105263157898e-05, "loss": 0.3559, "mean_copy_accuracy": 0.9829023331403732, "mean_gen_accuracy": 0.8542311787605286, "mean_token_accuracy": 0.8843999207019806, "num_tokens": 269071013.0, "sample_num_tokens": 6814.25, "step": 994, "total_num_tokens": 269098270.0, "z_loss": 0.0016847579972818494 }, { "copy_logits_max": 1.4327082633972168, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.9375, "epoch": 0.20321674751085014, "gen_logits_max": 9.079045295715332, "gen_logits_mean": -10.198580741882324, "gen_logits_min": -22.46170425415039, "gen_logits_std": 2.5266714096069336, "gen_loss": 0.31723853945732117, "grad_norm": 0.6050456817906182, "learning_rate": 2.967284210526316e-05, "loss": 0.332, "mean_copy_accuracy": 0.9883032888174057, "mean_gen_accuracy": 0.8539038151502609, "mean_token_accuracy": 0.8922922015190125, "num_tokens": 269348367.0, "sample_num_tokens": 8477.75, "step": 995, "total_num_tokens": 269382278.0, "z_loss": 0.001881716656498611 }, { "copy_logits_max": 2.659076690673828, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.5625, "epoch": 0.203420985448047, "gen_logits_max": 9.640358924865723, "gen_logits_mean": -9.527639389038086, "gen_logits_min": -21.930023193359375, "gen_logits_std": 2.5592265129089355, "gen_loss": 0.32637250423431396, "grad_norm": 0.6037720410949748, "learning_rate": 2.9671578947368424e-05, "loss": 0.3392, "mean_copy_accuracy": 0.9865798205137253, "mean_gen_accuracy": 0.8609525859355927, "mean_token_accuracy": 0.8942577093839645, "num_tokens": 269610915.0, "sample_num_tokens": 7991.75, "step": 996, "total_num_tokens": 269642882.0, "z_loss": 0.0019410587847232819 }, { "copy_logits_max": 1.3804996013641357, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.9375, "epoch": 0.2036252233852438, "gen_logits_max": 9.798906326293945, "gen_logits_mean": -9.88510799407959, "gen_logits_min": -22.115074157714844, "gen_logits_std": 2.522844076156616, "gen_loss": 0.3781793415546417, "grad_norm": 0.5491721079707268, "learning_rate": 2.9670315789473684e-05, "loss": 0.3625, "mean_copy_accuracy": 0.988812118768692, "mean_gen_accuracy": 0.8514668196439743, "mean_token_accuracy": 0.8825113475322723, "num_tokens": 269882689.0, "sample_num_tokens": 9029.25, "step": 997, "total_num_tokens": 269918806.0, "z_loss": 0.0018190096598118544 }, { "copy_logits_max": 2.204447031021118, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.125, "epoch": 0.20382946132244065, "gen_logits_max": 10.432201385498047, "gen_logits_mean": -9.17862319946289, "gen_logits_min": -21.85017204284668, "gen_logits_std": 2.5715718269348145, "gen_loss": 0.3677677810192108, "grad_norm": 0.5747593913706984, "learning_rate": 2.966905263157895e-05, "loss": 0.3704, "mean_copy_accuracy": 0.9889702200889587, "mean_gen_accuracy": 0.8478560000658035, "mean_token_accuracy": 0.8796997517347336, "num_tokens": 270142299.0, "sample_num_tokens": 8382.25, "step": 998, "total_num_tokens": 270175828.0, "z_loss": 0.001848528510890901 }, { "copy_logits_max": 0.7052402496337891, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.5, "epoch": 0.2040336992596375, "gen_logits_max": 9.499751091003418, "gen_logits_mean": -9.809439659118652, "gen_logits_min": -21.935239791870117, "gen_logits_std": 2.522853136062622, "gen_loss": 0.3362445831298828, "grad_norm": 0.5521430083504613, "learning_rate": 2.966778947368421e-05, "loss": 0.3659, "mean_copy_accuracy": 0.9926120787858963, "mean_gen_accuracy": 0.8490884453058243, "mean_token_accuracy": 0.8852756023406982, "num_tokens": 270423433.0, "sample_num_tokens": 8845.25, "step": 999, "total_num_tokens": 270458814.0, "z_loss": 0.0017113095382228494 }, { "epoch": 0.2042379371968343, "grad_norm": 0.5684603467406164, "learning_rate": 2.9666526315789474e-05, "loss": 0.3437, "step": 1000 }, { "epoch": 0.2042379371968343, "eval_copy_logits_max": -0.4373815655708313, "eval_copy_logits_min": -51.15595245361328, "eval_gen_logits_max": 9.691875457763672, "eval_gen_logits_mean": -12.404125213623047, "eval_gen_logits_min": -23.876148223876953, "eval_gen_logits_std": 2.4052798748016357, "eval_gen_loss": 0.4073115587234497, "eval_loss": 0.4020393192768097, "eval_mean_copy_accuracy": 0.9863116145133972, "eval_mean_gen_accuracy": 0.8493835926055908, "eval_mean_token_accuracy": 0.8668412268161774, "eval_num_tokens": 270723476.0, "eval_runtime": 0.7968, "eval_samples_per_second": 10.04, "eval_steps_per_second": 2.51, "eval_total_num_tokens": 270723476.0, "eval_z_loss": 0.0015904126921668649, "step": 1000 }, { "copy_logits_max": 0.5041590332984924, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5, "epoch": 0.20444217513403115, "gen_logits_max": 9.783952713012695, "gen_logits_mean": -9.164934158325195, "gen_logits_min": -21.30755615234375, "gen_logits_std": 2.4464571475982666, "gen_loss": 0.3843972086906433, "grad_norm": 0.5973443780989792, "learning_rate": 2.9665263157894735e-05, "loss": 0.3673, "mean_copy_accuracy": 0.9882952868938446, "mean_gen_accuracy": 0.8522695004940033, "mean_token_accuracy": 0.8849338889122009, "num_tokens": 270952009.0, "sample_num_tokens": 7275.25, "step": 1001, "total_num_tokens": 270981110.0, "z_loss": 0.001849965425208211 }, { "copy_logits_max": 1.8805744647979736, "copy_logits_min": -750000128.0, "copy_num_tokens": 518.9375, "epoch": 0.20464641307122797, "gen_logits_max": 9.514060020446777, "gen_logits_mean": -9.497854232788086, "gen_logits_min": -21.399436950683594, "gen_logits_std": 2.4998912811279297, "gen_loss": 0.3198140263557434, "grad_norm": 0.526637679967826, "learning_rate": 2.9664000000000003e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9909635931253433, "mean_gen_accuracy": 0.865042045712471, "mean_token_accuracy": 0.8972422182559967, "num_tokens": 271214107.0, "sample_num_tokens": 9200.75, "step": 1002, "total_num_tokens": 271250910.0, "z_loss": 0.0018899659626185894 }, { "copy_logits_max": 2.099616527557373, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.9375, "epoch": 0.2048506510084248, "gen_logits_max": 9.584754943847656, "gen_logits_mean": -9.709694862365723, "gen_logits_min": -21.627975463867188, "gen_logits_std": 2.5096240043640137, "gen_loss": 0.333035945892334, "grad_norm": 1.4679567346313425, "learning_rate": 2.9662736842105264e-05, "loss": 0.3577, "mean_copy_accuracy": 0.988963857293129, "mean_gen_accuracy": 0.8508299142122269, "mean_token_accuracy": 0.8854046314954758, "num_tokens": 271489413.0, "sample_num_tokens": 7617.25, "step": 1003, "total_num_tokens": 271519882.0, "z_loss": 0.0019107834668830037 }, { "copy_logits_max": 2.432274580001831, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.6875, "epoch": 0.20505488894562166, "gen_logits_max": 10.217506408691406, "gen_logits_mean": -9.248855590820312, "gen_logits_min": -21.574546813964844, "gen_logits_std": 2.540782928466797, "gen_loss": 0.3310174345970154, "grad_norm": 0.5519974007068377, "learning_rate": 2.9661473684210528e-05, "loss": 0.3605, "mean_copy_accuracy": 0.9905873239040375, "mean_gen_accuracy": 0.851751521229744, "mean_token_accuracy": 0.8848343789577484, "num_tokens": 271761976.0, "sample_num_tokens": 8117.0, "step": 1004, "total_num_tokens": 271794444.0, "z_loss": 0.0019587413407862186 }, { "copy_logits_max": -1.2862119674682617, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.5, "epoch": 0.20525912688281847, "gen_logits_max": 9.913597106933594, "gen_logits_mean": -10.163253784179688, "gen_logits_min": -21.462108612060547, "gen_logits_std": 2.446237802505493, "gen_loss": 0.3687679171562195, "grad_norm": 0.5681168827087442, "learning_rate": 2.9660210526315792e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9877952486276627, "mean_gen_accuracy": 0.8621341437101364, "mean_token_accuracy": 0.8934646993875504, "num_tokens": 272053008.0, "sample_num_tokens": 7861.5, "step": 1005, "total_num_tokens": 272084454.0, "z_loss": 0.0016400105087086558 }, { "copy_logits_max": 1.4109748601913452, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.5625, "epoch": 0.20546336482001532, "gen_logits_max": 9.874519348144531, "gen_logits_mean": -8.87144947052002, "gen_logits_min": -21.152467727661133, "gen_logits_std": 2.5928523540496826, "gen_loss": 0.3594951033592224, "grad_norm": 0.7238081816665247, "learning_rate": 2.9658947368421053e-05, "loss": 0.3832, "mean_copy_accuracy": 0.9889824241399765, "mean_gen_accuracy": 0.8440062254667282, "mean_token_accuracy": 0.878475233912468, "num_tokens": 272306599.0, "sample_num_tokens": 8830.25, "step": 1006, "total_num_tokens": 272341920.0, "z_loss": 0.0021366016007959843 }, { "copy_logits_max": 1.2730515003204346, "copy_logits_min": -687500032.0, "copy_num_tokens": 509.3125, "epoch": 0.20566760275721216, "gen_logits_max": 9.816347122192383, "gen_logits_mean": -9.722738265991211, "gen_logits_min": -21.82157325744629, "gen_logits_std": 2.508683919906616, "gen_loss": 0.32577699422836304, "grad_norm": 0.5443662127455067, "learning_rate": 2.9657684210526317e-05, "loss": 0.3451, "mean_copy_accuracy": 0.989013746380806, "mean_gen_accuracy": 0.8568213284015656, "mean_token_accuracy": 0.889379233121872, "num_tokens": 272577339.0, "sample_num_tokens": 8788.75, "step": 1007, "total_num_tokens": 272612494.0, "z_loss": 0.0019221839029341936 }, { "copy_logits_max": 3.2770416736602783, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.8125, "epoch": 0.20587184069440898, "gen_logits_max": 10.008482933044434, "gen_logits_mean": -9.775751113891602, "gen_logits_min": -22.223867416381836, "gen_logits_std": 2.5547361373901367, "gen_loss": 0.3221546411514282, "grad_norm": 0.5925498986695436, "learning_rate": 2.965642105263158e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9893787354230881, "mean_gen_accuracy": 0.8592717796564102, "mean_token_accuracy": 0.8918530195951462, "num_tokens": 272837544.0, "sample_num_tokens": 8716.0, "step": 1008, "total_num_tokens": 272872408.0, "z_loss": 0.0021237204782664776 }, { "copy_logits_max": 2.696472644805908, "copy_logits_min": -687500032.0, "copy_num_tokens": 433.5625, "epoch": 0.20607607863160582, "gen_logits_max": 9.866125106811523, "gen_logits_mean": -9.747841835021973, "gen_logits_min": -22.469701766967773, "gen_logits_std": 2.5781383514404297, "gen_loss": 0.30447784066200256, "grad_norm": 0.5382497931269806, "learning_rate": 2.9655157894736843e-05, "loss": 0.3401, "mean_copy_accuracy": 0.9896785616874695, "mean_gen_accuracy": 0.8558375239372253, "mean_token_accuracy": 0.8894034773111343, "num_tokens": 273100470.0, "sample_num_tokens": 7385.0, "step": 1009, "total_num_tokens": 273130010.0, "z_loss": 0.0019071748247370124 }, { "copy_logits_max": 0.7648319602012634, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.5, "epoch": 0.20628031656880266, "gen_logits_max": 9.855688095092773, "gen_logits_mean": -9.62844181060791, "gen_logits_min": -21.123579025268555, "gen_logits_std": 2.448550224304199, "gen_loss": 0.3684975206851959, "grad_norm": 0.6911744163358295, "learning_rate": 2.9653894736842107e-05, "loss": 0.3832, "mean_copy_accuracy": 0.9899139702320099, "mean_gen_accuracy": 0.8411771208047867, "mean_token_accuracy": 0.8751765638589859, "num_tokens": 273341512.0, "sample_num_tokens": 6867.5, "step": 1010, "total_num_tokens": 273368982.0, "z_loss": 0.0017456155037507415 }, { "copy_logits_max": -0.1003718376159668, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.625, "epoch": 0.20648455450599948, "gen_logits_max": 10.262296676635742, "gen_logits_mean": -8.468305587768555, "gen_logits_min": -20.182212829589844, "gen_logits_std": 2.5043325424194336, "gen_loss": 0.361025333404541, "grad_norm": 0.5420771912404326, "learning_rate": 2.965263157894737e-05, "loss": 0.3783, "mean_copy_accuracy": 0.9885598868131638, "mean_gen_accuracy": 0.8501652181148529, "mean_token_accuracy": 0.8802663832902908, "num_tokens": 273594253.0, "sample_num_tokens": 7817.75, "step": 1011, "total_num_tokens": 273625524.0, "z_loss": 0.0017469029407948256 }, { "copy_logits_max": 0.4752388000488281, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.8125, "epoch": 0.20668879244319632, "gen_logits_max": 10.063047409057617, "gen_logits_mean": -9.783601760864258, "gen_logits_min": -21.77601432800293, "gen_logits_std": 2.4856550693511963, "gen_loss": 0.39792704582214355, "grad_norm": 0.8457252994566393, "learning_rate": 2.9651368421052632e-05, "loss": 0.3853, "mean_copy_accuracy": 0.9853380620479584, "mean_gen_accuracy": 0.8413901329040527, "mean_token_accuracy": 0.8749188184738159, "num_tokens": 273851225.0, "sample_num_tokens": 7418.75, "step": 1012, "total_num_tokens": 273880900.0, "z_loss": 0.0016852919943630695 }, { "copy_logits_max": 2.6425623893737793, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.3125, "epoch": 0.20689303038039317, "gen_logits_max": 9.616515159606934, "gen_logits_mean": -9.598644256591797, "gen_logits_min": -21.608963012695312, "gen_logits_std": 2.503396987915039, "gen_loss": 0.38941746950149536, "grad_norm": 0.5340210221875765, "learning_rate": 2.9650105263157896e-05, "loss": 0.3597, "mean_copy_accuracy": 0.9896959662437439, "mean_gen_accuracy": 0.8520069569349289, "mean_token_accuracy": 0.8866532891988754, "num_tokens": 274127635.0, "sample_num_tokens": 9098.75, "step": 1013, "total_num_tokens": 274164030.0, "z_loss": 0.0019428508821874857 }, { "copy_logits_max": 1.6216306686401367, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.25, "epoch": 0.20709726831758998, "gen_logits_max": 9.444950103759766, "gen_logits_mean": -9.72878360748291, "gen_logits_min": -21.651350021362305, "gen_logits_std": 2.50345516204834, "gen_loss": 0.369146466255188, "grad_norm": 0.5614801200473791, "learning_rate": 2.9648842105263157e-05, "loss": 0.3729, "mean_copy_accuracy": 0.9899841099977493, "mean_gen_accuracy": 0.8466906249523163, "mean_token_accuracy": 0.8799647986888885, "num_tokens": 274401305.0, "sample_num_tokens": 7209.25, "step": 1014, "total_num_tokens": 274430142.0, "z_loss": 0.00210908567532897 }, { "copy_logits_max": 2.3168842792510986, "copy_logits_min": -750000128.0, "copy_num_tokens": 666.8125, "epoch": 0.20730150625478683, "gen_logits_max": 9.674543380737305, "gen_logits_mean": -9.937851905822754, "gen_logits_min": -22.288414001464844, "gen_logits_std": 2.536722183227539, "gen_loss": 0.2932758331298828, "grad_norm": 0.5417023714762966, "learning_rate": 2.9647578947368422e-05, "loss": 0.3465, "mean_copy_accuracy": 0.9908647686243057, "mean_gen_accuracy": 0.8547096252441406, "mean_token_accuracy": 0.8899306952953339, "num_tokens": 274669955.0, "sample_num_tokens": 10768.75, "step": 1015, "total_num_tokens": 274713030.0, "z_loss": 0.0022304290905594826 }, { "copy_logits_max": 1.044984221458435, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.75, "epoch": 0.20750574419198367, "gen_logits_max": 10.406829833984375, "gen_logits_mean": -9.077106475830078, "gen_logits_min": -21.581768035888672, "gen_logits_std": 2.5194263458251953, "gen_loss": 0.3554479479789734, "grad_norm": 0.6027316895287422, "learning_rate": 2.9646315789473683e-05, "loss": 0.3561, "mean_copy_accuracy": 0.9882480502128601, "mean_gen_accuracy": 0.8497537970542908, "mean_token_accuracy": 0.8851500600576401, "num_tokens": 274973201.0, "sample_num_tokens": 9155.25, "step": 1016, "total_num_tokens": 275009822.0, "z_loss": 0.0019773405510932207 }, { "copy_logits_max": 1.0568795204162598, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.8125, "epoch": 0.2077099821291805, "gen_logits_max": 9.541011810302734, "gen_logits_mean": -10.394356727600098, "gen_logits_min": -21.963855743408203, "gen_logits_std": 2.424083709716797, "gen_loss": 0.35932019352912903, "grad_norm": 0.5566241395021096, "learning_rate": 2.9645052631578947e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9909293353557587, "mean_gen_accuracy": 0.8579871207475662, "mean_token_accuracy": 0.8953501433134079, "num_tokens": 275274127.0, "sample_num_tokens": 7713.75, "step": 1017, "total_num_tokens": 275304982.0, "z_loss": 0.001763330539688468 }, { "copy_logits_max": 2.039332628250122, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 0.20791422006637733, "gen_logits_max": 10.101685523986816, "gen_logits_mean": -9.551457405090332, "gen_logits_min": -21.515005111694336, "gen_logits_std": 2.506190061569214, "gen_loss": 0.36966484785079956, "grad_norm": 0.6288200925467465, "learning_rate": 2.964378947368421e-05, "loss": 0.3692, "mean_copy_accuracy": 0.9919330328702927, "mean_gen_accuracy": 0.8440709114074707, "mean_token_accuracy": 0.8805634379386902, "num_tokens": 275539904.0, "sample_num_tokens": 7507.0, "step": 1018, "total_num_tokens": 275569932.0, "z_loss": 0.0018104222835972905 }, { "copy_logits_max": 0.476593017578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.875, "epoch": 0.20811845800357417, "gen_logits_max": 10.72299575805664, "gen_logits_mean": -9.07243824005127, "gen_logits_min": -21.053321838378906, "gen_logits_std": 2.496626377105713, "gen_loss": 0.3998195230960846, "grad_norm": 0.6223623596682585, "learning_rate": 2.9642526315789476e-05, "loss": 0.391, "mean_copy_accuracy": 0.9911251813173294, "mean_gen_accuracy": 0.8403235077857971, "mean_token_accuracy": 0.8731474131345749, "num_tokens": 275806453.0, "sample_num_tokens": 7828.75, "step": 1019, "total_num_tokens": 275837768.0, "z_loss": 0.0016809742664918303 }, { "copy_logits_max": 2.5233869552612305, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.875, "epoch": 0.208322695940771, "gen_logits_max": 10.249602317810059, "gen_logits_mean": -9.64035415649414, "gen_logits_min": -21.929710388183594, "gen_logits_std": 2.5652847290039062, "gen_loss": 0.37022852897644043, "grad_norm": 0.6687953372068375, "learning_rate": 2.964126315789474e-05, "loss": 0.3803, "mean_copy_accuracy": 0.9878663271665573, "mean_gen_accuracy": 0.8464118391275406, "mean_token_accuracy": 0.8782160580158234, "num_tokens": 276058506.0, "sample_num_tokens": 8316.5, "step": 1020, "total_num_tokens": 276091772.0, "z_loss": 0.0016193806659430265 }, { "copy_logits_max": 1.465500831604004, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.875, "epoch": 0.20852693387796783, "gen_logits_max": 10.446557998657227, "gen_logits_mean": -9.600515365600586, "gen_logits_min": -21.813262939453125, "gen_logits_std": 2.4988560676574707, "gen_loss": 0.3485073745250702, "grad_norm": 0.6828546229635296, "learning_rate": 2.964e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9878110140562057, "mean_gen_accuracy": 0.8546665608882904, "mean_token_accuracy": 0.8876439779996872, "num_tokens": 276324699.0, "sample_num_tokens": 8282.75, "step": 1021, "total_num_tokens": 276357830.0, "z_loss": 0.001783972606062889 }, { "copy_logits_max": 1.5443525314331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.125, "epoch": 0.20873117181516468, "gen_logits_max": 10.783174514770508, "gen_logits_mean": -8.965553283691406, "gen_logits_min": -21.113983154296875, "gen_logits_std": 2.4977195262908936, "gen_loss": 0.38218486309051514, "grad_norm": 0.6940572907756365, "learning_rate": 2.9638736842105265e-05, "loss": 0.3535, "mean_copy_accuracy": 0.9914190173149109, "mean_gen_accuracy": 0.8489668965339661, "mean_token_accuracy": 0.8869280368089676, "num_tokens": 276606904.0, "sample_num_tokens": 7764.0, "step": 1022, "total_num_tokens": 276637960.0, "z_loss": 0.00197475403547287 }, { "copy_logits_max": 3.1119284629821777, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.5625, "epoch": 0.2089354097523615, "gen_logits_max": 10.078128814697266, "gen_logits_mean": -8.799854278564453, "gen_logits_min": -20.809955596923828, "gen_logits_std": 2.4850525856018066, "gen_loss": 0.4040295481681824, "grad_norm": 0.5520217715040195, "learning_rate": 2.9637473684210526e-05, "loss": 0.3648, "mean_copy_accuracy": 0.9899449646472931, "mean_gen_accuracy": 0.8469074666500092, "mean_token_accuracy": 0.8836921602487564, "num_tokens": 276882380.0, "sample_num_tokens": 7697.5, "step": 1023, "total_num_tokens": 276913170.0, "z_loss": 0.002009811345487833 }, { "copy_logits_max": 0.8755048513412476, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.4375, "epoch": 0.20913964768955834, "gen_logits_max": 10.057905197143555, "gen_logits_mean": -9.98232650756836, "gen_logits_min": -23.02736473083496, "gen_logits_std": 2.525136947631836, "gen_loss": 0.33758723735809326, "grad_norm": 0.610782779393148, "learning_rate": 2.963621052631579e-05, "loss": 0.3732, "mean_copy_accuracy": 0.9866911619901657, "mean_gen_accuracy": 0.8495101183652878, "mean_token_accuracy": 0.8817823380231857, "num_tokens": 277138109.0, "sample_num_tokens": 7848.25, "step": 1024, "total_num_tokens": 277169502.0, "z_loss": 0.001895032823085785 }, { "copy_logits_max": 2.43162202835083, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.5, "epoch": 0.20934388562675518, "gen_logits_max": 10.206869125366211, "gen_logits_mean": -10.105839729309082, "gen_logits_min": -22.301631927490234, "gen_logits_std": 2.4958667755126953, "gen_loss": 0.32271477580070496, "grad_norm": 0.5139991732837558, "learning_rate": 2.963494736842105e-05, "loss": 0.3638, "mean_copy_accuracy": 0.9862681776285172, "mean_gen_accuracy": 0.8526889234781265, "mean_token_accuracy": 0.8797153383493423, "num_tokens": 277390646.0, "sample_num_tokens": 8381.0, "step": 1025, "total_num_tokens": 277424170.0, "z_loss": 0.002027597976848483 }, { "copy_logits_max": 3.1739416122436523, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.5625, "epoch": 0.209548123563952, "gen_logits_max": 11.103290557861328, "gen_logits_mean": -8.958534240722656, "gen_logits_min": -21.272144317626953, "gen_logits_std": 2.529327869415283, "gen_loss": 0.35794615745544434, "grad_norm": 0.4852598426620212, "learning_rate": 2.9633684210526316e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9925217032432556, "mean_gen_accuracy": 0.8528473228216171, "mean_token_accuracy": 0.8914300948381424, "num_tokens": 277683302.0, "sample_num_tokens": 8454.0, "step": 1026, "total_num_tokens": 277717118.0, "z_loss": 0.0023194507230073214 }, { "copy_logits_max": 3.3581161499023438, "copy_logits_min": -750000000.0, "copy_num_tokens": 627.0, "epoch": 0.20975236150114884, "gen_logits_max": 10.064615249633789, "gen_logits_mean": -9.481222152709961, "gen_logits_min": -21.876113891601562, "gen_logits_std": 2.5326590538024902, "gen_loss": 0.34547609090805054, "grad_norm": 0.47894606245813154, "learning_rate": 2.963242105263158e-05, "loss": 0.3437, "mean_copy_accuracy": 0.9922524839639664, "mean_gen_accuracy": 0.8532985299825668, "mean_token_accuracy": 0.8910426795482635, "num_tokens": 277972166.0, "sample_num_tokens": 9892.0, "step": 1027, "total_num_tokens": 278011734.0, "z_loss": 0.002146680373698473 }, { "copy_logits_max": 3.5992238521575928, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.3125, "epoch": 0.20995659943834566, "gen_logits_max": 10.874720573425293, "gen_logits_mean": -8.4940185546875, "gen_logits_min": -20.994136810302734, "gen_logits_std": 2.528172492980957, "gen_loss": 0.3238568902015686, "grad_norm": 0.5167197994976231, "learning_rate": 2.9631157894736844e-05, "loss": 0.3737, "mean_copy_accuracy": 0.9912949353456497, "mean_gen_accuracy": 0.8482838124036789, "mean_token_accuracy": 0.8818386346101761, "num_tokens": 278244539.0, "sample_num_tokens": 8131.75, "step": 1028, "total_num_tokens": 278277066.0, "z_loss": 0.001979559427127242 }, { "copy_logits_max": 2.7971982955932617, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.0625, "epoch": 0.2101608373755425, "gen_logits_max": 10.474851608276367, "gen_logits_mean": -8.784538269042969, "gen_logits_min": -21.239402770996094, "gen_logits_std": 2.5392377376556396, "gen_loss": 0.4127430319786072, "grad_norm": 0.5528357928512417, "learning_rate": 2.9629894736842105e-05, "loss": 0.37, "mean_copy_accuracy": 0.9886071979999542, "mean_gen_accuracy": 0.8504527658224106, "mean_token_accuracy": 0.8838438540697098, "num_tokens": 278526989.0, "sample_num_tokens": 8010.25, "step": 1029, "total_num_tokens": 278559030.0, "z_loss": 0.001918851281516254 }, { "copy_logits_max": 2.9976272583007812, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.125, "epoch": 0.21036507531273935, "gen_logits_max": 11.150552749633789, "gen_logits_mean": -8.148009300231934, "gen_logits_min": -20.903182983398438, "gen_logits_std": 2.57025146484375, "gen_loss": 0.4083884656429291, "grad_norm": 0.6595014187412205, "learning_rate": 2.962863157894737e-05, "loss": 0.3589, "mean_copy_accuracy": 0.9895207434892654, "mean_gen_accuracy": 0.8490816801786423, "mean_token_accuracy": 0.8837766796350479, "num_tokens": 278799801.0, "sample_num_tokens": 7212.75, "step": 1030, "total_num_tokens": 278828652.0, "z_loss": 0.0019889811519533396 }, { "copy_logits_max": 1.262930154800415, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.0625, "epoch": 0.21056931324993616, "gen_logits_max": 10.771933555603027, "gen_logits_mean": -9.573809623718262, "gen_logits_min": -21.82056999206543, "gen_logits_std": 2.5113635063171387, "gen_loss": 0.4176776707172394, "grad_norm": 0.5543488405612311, "learning_rate": 2.962736842105263e-05, "loss": 0.3579, "mean_copy_accuracy": 0.9915364682674408, "mean_gen_accuracy": 0.8506347239017487, "mean_token_accuracy": 0.8845353126525879, "num_tokens": 279060037.0, "sample_num_tokens": 7724.75, "step": 1031, "total_num_tokens": 279090936.0, "z_loss": 0.001872434513643384 }, { "copy_logits_max": 1.6508845090866089, "copy_logits_min": -687500032.0, "copy_num_tokens": 348.25, "epoch": 0.210773551187133, "gen_logits_max": 10.933075904846191, "gen_logits_mean": -9.12473201751709, "gen_logits_min": -21.495338439941406, "gen_logits_std": 2.5438036918640137, "gen_loss": 0.38258570432662964, "grad_norm": 0.5547043493400415, "learning_rate": 2.9626105263157895e-05, "loss": 0.3667, "mean_copy_accuracy": 0.9878305047750473, "mean_gen_accuracy": 0.8536933660507202, "mean_token_accuracy": 0.8822765350341797, "num_tokens": 279316718.0, "sample_num_tokens": 7454.5, "step": 1032, "total_num_tokens": 279346536.0, "z_loss": 0.0017311194678768516 }, { "copy_logits_max": 1.2893227338790894, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.25, "epoch": 0.21097778912432985, "gen_logits_max": 10.437902450561523, "gen_logits_mean": -9.497533798217773, "gen_logits_min": -21.877592086791992, "gen_logits_std": 2.5516998767852783, "gen_loss": 0.3711128532886505, "grad_norm": 0.6779056522621546, "learning_rate": 2.962484210526316e-05, "loss": 0.3561, "mean_copy_accuracy": 0.9870323538780212, "mean_gen_accuracy": 0.8514614105224609, "mean_token_accuracy": 0.8849780708551407, "num_tokens": 279588863.0, "sample_num_tokens": 9141.25, "step": 1033, "total_num_tokens": 279625428.0, "z_loss": 0.0016758881974965334 }, { "copy_logits_max": -0.7707219123840332, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.25, "epoch": 0.21118202706152667, "gen_logits_max": 10.234796524047852, "gen_logits_mean": -9.561725616455078, "gen_logits_min": -22.205350875854492, "gen_logits_std": 2.513382911682129, "gen_loss": 0.37346452474594116, "grad_norm": 0.5374721924920246, "learning_rate": 2.962357894736842e-05, "loss": 0.386, "mean_copy_accuracy": 0.99039326608181, "mean_gen_accuracy": 0.8453768640756607, "mean_token_accuracy": 0.8777939677238464, "num_tokens": 279870301.0, "sample_num_tokens": 7873.75, "step": 1034, "total_num_tokens": 279901796.0, "z_loss": 0.0018187026726081967 }, { "copy_logits_max": -0.9982553720474243, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.0, "epoch": 0.2113862649987235, "gen_logits_max": 10.004057884216309, "gen_logits_mean": -9.16494369506836, "gen_logits_min": -22.059757232666016, "gen_logits_std": 2.582655191421509, "gen_loss": 0.3375745415687561, "grad_norm": 0.7026731254993314, "learning_rate": 2.9622315789473688e-05, "loss": 0.3353, "mean_copy_accuracy": 0.9902003556489944, "mean_gen_accuracy": 0.8593555390834808, "mean_token_accuracy": 0.8929257243871689, "num_tokens": 280137841.0, "sample_num_tokens": 8180.75, "step": 1035, "total_num_tokens": 280170564.0, "z_loss": 0.001839108532294631 }, { "copy_logits_max": 1.440246820449829, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.9375, "epoch": 0.21159050293592035, "gen_logits_max": 9.5866060256958, "gen_logits_mean": -10.871477127075195, "gen_logits_min": -23.063129425048828, "gen_logits_std": 2.502009391784668, "gen_loss": 0.3438910245895386, "grad_norm": 0.5336041998698723, "learning_rate": 2.962105263157895e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9893453866243362, "mean_gen_accuracy": 0.8589199483394623, "mean_token_accuracy": 0.8929726332426071, "num_tokens": 280425356.0, "sample_num_tokens": 9850.5, "step": 1036, "total_num_tokens": 280464758.0, "z_loss": 0.0018215901218354702 }, { "copy_logits_max": 0.8871786594390869, "copy_logits_min": -687500032.0, "copy_num_tokens": 295.8125, "epoch": 0.21179474087311717, "gen_logits_max": 10.858190536499023, "gen_logits_mean": -9.515165328979492, "gen_logits_min": -21.627685546875, "gen_logits_std": 2.501467227935791, "gen_loss": 0.45751243829727173, "grad_norm": 0.6327628634742847, "learning_rate": 2.9619789473684213e-05, "loss": 0.3865, "mean_copy_accuracy": 0.9869646579027176, "mean_gen_accuracy": 0.845332533121109, "mean_token_accuracy": 0.8783423751592636, "num_tokens": 280700163.0, "sample_num_tokens": 7718.25, "step": 1037, "total_num_tokens": 280731036.0, "z_loss": 0.0019798956345766783 }, { "copy_logits_max": -0.8294252753257751, "copy_logits_min": -625000064.0, "copy_num_tokens": 373.5625, "epoch": 0.21199897881031401, "gen_logits_max": 9.860755920410156, "gen_logits_mean": -9.72469425201416, "gen_logits_min": -21.844755172729492, "gen_logits_std": 2.533048629760742, "gen_loss": 0.35649019479751587, "grad_norm": 0.5280858161634948, "learning_rate": 2.9618526315789474e-05, "loss": 0.3451, "mean_copy_accuracy": 0.9913718551397324, "mean_gen_accuracy": 0.8593776375055313, "mean_token_accuracy": 0.8906141519546509, "num_tokens": 280959996.0, "sample_num_tokens": 8213.5, "step": 1038, "total_num_tokens": 280992850.0, "z_loss": 0.0017438280628994107 }, { "copy_logits_max": -2.67673921585083, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.0625, "epoch": 0.21220321674751086, "gen_logits_max": 9.449287414550781, "gen_logits_mean": -10.502342224121094, "gen_logits_min": -22.011032104492188, "gen_logits_std": 2.4749503135681152, "gen_loss": 0.3578457534313202, "grad_norm": 0.5233957700996917, "learning_rate": 2.9617263157894738e-05, "loss": 0.3658, "mean_copy_accuracy": 0.991302415728569, "mean_gen_accuracy": 0.8509992808103561, "mean_token_accuracy": 0.8806125521659851, "num_tokens": 281221231.0, "sample_num_tokens": 8798.25, "step": 1039, "total_num_tokens": 281256424.0, "z_loss": 0.001497464720159769 }, { "copy_logits_max": -1.178505778312683, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.9375, "epoch": 0.21240745468470767, "gen_logits_max": 10.139339447021484, "gen_logits_mean": -9.92988109588623, "gen_logits_min": -22.113502502441406, "gen_logits_std": 2.508165121078491, "gen_loss": 0.316949725151062, "grad_norm": 0.5177418083459849, "learning_rate": 2.9616e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9891715049743652, "mean_gen_accuracy": 0.8583597242832184, "mean_token_accuracy": 0.8913031220436096, "num_tokens": 281503252.0, "sample_num_tokens": 9112.5, "step": 1040, "total_num_tokens": 281539702.0, "z_loss": 0.0016410702373832464 }, { "copy_logits_max": 0.7583026885986328, "copy_logits_min": -750000064.0, "copy_num_tokens": 716.8125, "epoch": 0.21261169262190452, "gen_logits_max": 9.525716781616211, "gen_logits_mean": -10.193260192871094, "gen_logits_min": -22.905685424804688, "gen_logits_std": 2.573547840118408, "gen_loss": 0.30280888080596924, "grad_norm": 0.6049479244171578, "learning_rate": 2.9614736842105263e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9891434609889984, "mean_gen_accuracy": 0.8569933176040649, "mean_token_accuracy": 0.8924655765295029, "num_tokens": 281793465.0, "sample_num_tokens": 10371.25, "step": 1041, "total_num_tokens": 281834950.0, "z_loss": 0.0015928067732602358 }, { "copy_logits_max": 0.6718724966049194, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.5, "epoch": 0.21281593055910136, "gen_logits_max": 9.842557907104492, "gen_logits_mean": -9.14966869354248, "gen_logits_min": -21.39396858215332, "gen_logits_std": 2.563086748123169, "gen_loss": 0.3362139165401459, "grad_norm": 0.5715268705917758, "learning_rate": 2.9613473684210524e-05, "loss": 0.3485, "mean_copy_accuracy": 0.9911402016878128, "mean_gen_accuracy": 0.8504372835159302, "mean_token_accuracy": 0.8878380060195923, "num_tokens": 282081809.0, "sample_num_tokens": 8110.25, "step": 1042, "total_num_tokens": 282114250.0, "z_loss": 0.0016666608862578869 }, { "copy_logits_max": -0.06956583261489868, "copy_logits_min": -750000064.0, "copy_num_tokens": 506.4375, "epoch": 0.21302016849629818, "gen_logits_max": 8.995737075805664, "gen_logits_mean": -11.071521759033203, "gen_logits_min": -23.078393936157227, "gen_logits_std": 2.482542037963867, "gen_loss": 0.3577694892883301, "grad_norm": 1.168977155713146, "learning_rate": 2.9612210526315792e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9926983267068863, "mean_gen_accuracy": 0.851229265332222, "mean_token_accuracy": 0.8859118819236755, "num_tokens": 282356297.0, "sample_num_tokens": 8909.25, "step": 1043, "total_num_tokens": 282391934.0, "z_loss": 0.001604277640581131 }, { "copy_logits_max": -0.6940694451332092, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5, "epoch": 0.21322440643349502, "gen_logits_max": 9.858707427978516, "gen_logits_mean": -8.92221450805664, "gen_logits_min": -21.798486709594727, "gen_logits_std": 2.5572357177734375, "gen_loss": 0.3053131699562073, "grad_norm": 0.5274867347975604, "learning_rate": 2.9610947368421053e-05, "loss": 0.3295, "mean_copy_accuracy": 0.9906708300113678, "mean_gen_accuracy": 0.862092450261116, "mean_token_accuracy": 0.8930259346961975, "num_tokens": 282629896.0, "sample_num_tokens": 8460.0, "step": 1044, "total_num_tokens": 282663736.0, "z_loss": 0.0016666812589392066 }, { "copy_logits_max": -2.1540567874908447, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.875, "epoch": 0.21342864437069187, "gen_logits_max": 9.49129867553711, "gen_logits_mean": -10.783076286315918, "gen_logits_min": -22.712665557861328, "gen_logits_std": 2.486375093460083, "gen_loss": 0.3776235282421112, "grad_norm": 0.6103905378861414, "learning_rate": 2.9609684210526317e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9912218749523163, "mean_gen_accuracy": 0.8537757843732834, "mean_token_accuracy": 0.888169452548027, "num_tokens": 282901903.0, "sample_num_tokens": 9075.25, "step": 1045, "total_num_tokens": 282938204.0, "z_loss": 0.001481405459344387 }, { "copy_logits_max": -1.0712720155715942, "copy_logits_min": -750000000.0, "copy_num_tokens": 642.5625, "epoch": 0.21363288230788868, "gen_logits_max": 9.394498825073242, "gen_logits_mean": -8.853379249572754, "gen_logits_min": -22.19072723388672, "gen_logits_std": 2.545503616333008, "gen_loss": 0.29464614391326904, "grad_norm": 0.5346455919492299, "learning_rate": 2.960842105263158e-05, "loss": 0.329, "mean_copy_accuracy": 0.990039125084877, "mean_gen_accuracy": 0.8576787859201431, "mean_token_accuracy": 0.8930794596672058, "num_tokens": 283182167.0, "sample_num_tokens": 9621.25, "step": 1046, "total_num_tokens": 283220652.0, "z_loss": 0.002030325122177601 }, { "copy_logits_max": -0.5895711183547974, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.4375, "epoch": 0.21383712024508553, "gen_logits_max": 9.727849960327148, "gen_logits_mean": -10.298795700073242, "gen_logits_min": -23.129789352416992, "gen_logits_std": 2.5163495540618896, "gen_loss": 0.40373334288597107, "grad_norm": 0.5272609607814851, "learning_rate": 2.9607157894736842e-05, "loss": 0.3474, "mean_copy_accuracy": 0.9894306659698486, "mean_gen_accuracy": 0.8607377856969833, "mean_token_accuracy": 0.8867768049240112, "num_tokens": 283458783.0, "sample_num_tokens": 8616.25, "step": 1047, "total_num_tokens": 283493248.0, "z_loss": 0.0018935413099825382 }, { "copy_logits_max": 0.746131181716919, "copy_logits_min": -625000064.0, "copy_num_tokens": 492.8125, "epoch": 0.21404135818228237, "gen_logits_max": 9.763280868530273, "gen_logits_mean": -9.205998420715332, "gen_logits_min": -21.948108673095703, "gen_logits_std": 2.590244770050049, "gen_loss": 0.37500545382499695, "grad_norm": 0.527257275843199, "learning_rate": 2.9605894736842107e-05, "loss": 0.3714, "mean_copy_accuracy": 0.9892231971025467, "mean_gen_accuracy": 0.8474310040473938, "mean_token_accuracy": 0.88063745200634, "num_tokens": 283705814.0, "sample_num_tokens": 8975.5, "step": 1048, "total_num_tokens": 283741716.0, "z_loss": 0.0023590405471622944 }, { "copy_logits_max": 2.7067742347717285, "copy_logits_min": -750000000.0, "copy_num_tokens": 779.0, "epoch": 0.2142455961194792, "gen_logits_max": 9.992140769958496, "gen_logits_mean": -9.819242477416992, "gen_logits_min": -22.3934383392334, "gen_logits_std": 2.5284714698791504, "gen_loss": 0.3322293758392334, "grad_norm": 0.5209223656125603, "learning_rate": 2.9604631578947368e-05, "loss": 0.3499, "mean_copy_accuracy": 0.9875660389661789, "mean_gen_accuracy": 0.8558892458677292, "mean_token_accuracy": 0.8874930739402771, "num_tokens": 283965112.0, "sample_num_tokens": 10971.0, "step": 1049, "total_num_tokens": 284008996.0, "z_loss": 0.002402116311714053 }, { "copy_logits_max": 0.5292452573776245, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.0625, "epoch": 0.21444983405667603, "gen_logits_max": 10.294242858886719, "gen_logits_mean": -9.356579780578613, "gen_logits_min": -21.550918579101562, "gen_logits_std": 2.4992544651031494, "gen_loss": 0.37458810210227966, "grad_norm": 0.6011904837633254, "learning_rate": 2.9603368421052632e-05, "loss": 0.3648, "mean_copy_accuracy": 0.9899373799562454, "mean_gen_accuracy": 0.8481946140527725, "mean_token_accuracy": 0.8834964781999588, "num_tokens": 284256465.0, "sample_num_tokens": 7704.25, "step": 1050, "total_num_tokens": 284287282.0, "z_loss": 0.002283484674990177 }, { "copy_logits_max": 1.2255326509475708, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.375, "epoch": 0.21465407199387287, "gen_logits_max": 9.17569351196289, "gen_logits_mean": -9.697087287902832, "gen_logits_min": -22.621196746826172, "gen_logits_std": 2.595061779022217, "gen_loss": 0.32396450638771057, "grad_norm": 0.503205454513031, "learning_rate": 2.9602105263157896e-05, "loss": 0.3531, "mean_copy_accuracy": 0.9920714646577835, "mean_gen_accuracy": 0.845300018787384, "mean_token_accuracy": 0.8855235278606415, "num_tokens": 284526627.0, "sample_num_tokens": 9296.75, "step": 1051, "total_num_tokens": 284563814.0, "z_loss": 0.0021987413056194782 }, { "copy_logits_max": 0.013983547687530518, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.0625, "epoch": 0.2148583099310697, "gen_logits_max": 9.704187393188477, "gen_logits_mean": -9.386152267456055, "gen_logits_min": -21.60451316833496, "gen_logits_std": 2.5514566898345947, "gen_loss": 0.30605369806289673, "grad_norm": 0.5057766707496688, "learning_rate": 2.960084210526316e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9909844100475311, "mean_gen_accuracy": 0.8602015972137451, "mean_token_accuracy": 0.8942018151283264, "num_tokens": 284813739.0, "sample_num_tokens": 8944.25, "step": 1052, "total_num_tokens": 284849516.0, "z_loss": 0.0017391711007803679 }, { "copy_logits_max": -1.8958359956741333, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.75, "epoch": 0.21506254786826653, "gen_logits_max": 9.199603080749512, "gen_logits_mean": -9.438654899597168, "gen_logits_min": -22.320741653442383, "gen_logits_std": 2.521601676940918, "gen_loss": 0.3209648132324219, "grad_norm": 0.6057965281202323, "learning_rate": 2.959957894736842e-05, "loss": 0.3409, "mean_copy_accuracy": 0.9891849160194397, "mean_gen_accuracy": 0.8596051782369614, "mean_token_accuracy": 0.8905349671840668, "num_tokens": 285091178.0, "sample_num_tokens": 9346.0, "step": 1053, "total_num_tokens": 285128562.0, "z_loss": 0.0015377074014395475 }, { "copy_logits_max": -1.834068775177002, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.625, "epoch": 0.21526678580546335, "gen_logits_max": 9.007193565368652, "gen_logits_mean": -10.308286666870117, "gen_logits_min": -22.496822357177734, "gen_logits_std": 2.50593638420105, "gen_loss": 0.35287803411483765, "grad_norm": 0.5626146834887008, "learning_rate": 2.9598315789473686e-05, "loss": 0.3735, "mean_copy_accuracy": 0.9902539700269699, "mean_gen_accuracy": 0.8488618284463882, "mean_token_accuracy": 0.8785152435302734, "num_tokens": 285348878.0, "sample_num_tokens": 9703.5, "step": 1054, "total_num_tokens": 285387692.0, "z_loss": 0.001577582093887031 }, { "copy_logits_max": -1.8683336973190308, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.6875, "epoch": 0.2154710237426602, "gen_logits_max": 9.33687973022461, "gen_logits_mean": -9.588991165161133, "gen_logits_min": -22.048059463500977, "gen_logits_std": 2.559957981109619, "gen_loss": 0.36435437202453613, "grad_norm": 0.8161801083153402, "learning_rate": 2.9597052631578947e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9925515651702881, "mean_gen_accuracy": 0.8557899743318558, "mean_token_accuracy": 0.8955727517604828, "num_tokens": 285640372.0, "sample_num_tokens": 10359.5, "step": 1055, "total_num_tokens": 285681810.0, "z_loss": 0.0016710220370441675 }, { "copy_logits_max": 0.5116561651229858, "copy_logits_min": -625000064.0, "copy_num_tokens": 612.125, "epoch": 0.21567526167985704, "gen_logits_max": 9.364633560180664, "gen_logits_mean": -9.751787185668945, "gen_logits_min": -22.509674072265625, "gen_logits_std": 2.5933666229248047, "gen_loss": 0.3201861083507538, "grad_norm": 0.540065841612882, "learning_rate": 2.959578947368421e-05, "loss": 0.3509, "mean_copy_accuracy": 0.9884035885334015, "mean_gen_accuracy": 0.8611755669116974, "mean_token_accuracy": 0.890681117773056, "num_tokens": 285907281.0, "sample_num_tokens": 9461.25, "step": 1056, "total_num_tokens": 285945126.0, "z_loss": 0.0016231376212090254 }, { "copy_logits_max": 0.9023913145065308, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.5625, "epoch": 0.21587949961705385, "gen_logits_max": 8.44925308227539, "gen_logits_mean": -9.850225448608398, "gen_logits_min": -22.286752700805664, "gen_logits_std": 2.602663993835449, "gen_loss": 0.33095747232437134, "grad_norm": 0.5846887884876911, "learning_rate": 2.9594526315789472e-05, "loss": 0.3522, "mean_copy_accuracy": 0.9909133464097977, "mean_gen_accuracy": 0.8508092164993286, "mean_token_accuracy": 0.889038160443306, "num_tokens": 286192713.0, "sample_num_tokens": 9320.25, "step": 1057, "total_num_tokens": 286229994.0, "z_loss": 0.001667198957875371 }, { "copy_logits_max": -0.5443406105041504, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.3125, "epoch": 0.2160837375542507, "gen_logits_max": 8.931985855102539, "gen_logits_mean": -9.871254920959473, "gen_logits_min": -22.098068237304688, "gen_logits_std": 2.4636409282684326, "gen_loss": 0.34243571758270264, "grad_norm": 0.5309827898875724, "learning_rate": 2.9593263157894736e-05, "loss": 0.344, "mean_copy_accuracy": 0.990107536315918, "mean_gen_accuracy": 0.8578974455595016, "mean_token_accuracy": 0.8872172236442566, "num_tokens": 286465664.0, "sample_num_tokens": 8758.0, "step": 1058, "total_num_tokens": 286500696.0, "z_loss": 0.001413378631696105 }, { "copy_logits_max": -1.5711779594421387, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.0625, "epoch": 0.21628797549144754, "gen_logits_max": 9.486108779907227, "gen_logits_mean": -9.621031761169434, "gen_logits_min": -22.498064041137695, "gen_logits_std": 2.568213939666748, "gen_loss": 0.32445579767227173, "grad_norm": 0.5014445723072112, "learning_rate": 2.9592000000000004e-05, "loss": 0.3407, "mean_copy_accuracy": 0.9889429211616516, "mean_gen_accuracy": 0.8590565621852875, "mean_token_accuracy": 0.8911807090044022, "num_tokens": 286758222.0, "sample_num_tokens": 8375.0, "step": 1059, "total_num_tokens": 286791722.0, "z_loss": 0.0015527064679190516 }, { "copy_logits_max": 1.3113542795181274, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.875, "epoch": 0.21649221342864436, "gen_logits_max": 9.41707706451416, "gen_logits_mean": -9.81990909576416, "gen_logits_min": -22.44651222229004, "gen_logits_std": 2.545971393585205, "gen_loss": 0.3531988263130188, "grad_norm": 0.5953183807354324, "learning_rate": 2.9590736842105265e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9904083907604218, "mean_gen_accuracy": 0.8570453822612762, "mean_token_accuracy": 0.8936119228601456, "num_tokens": 287041270.0, "sample_num_tokens": 8257.5, "step": 1060, "total_num_tokens": 287074300.0, "z_loss": 0.0019529522396624088 }, { "copy_logits_max": -3.3078246116638184, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.75, "epoch": 0.2166964513658412, "gen_logits_max": 10.056705474853516, "gen_logits_mean": -10.459495544433594, "gen_logits_min": -22.564979553222656, "gen_logits_std": 2.496094226837158, "gen_loss": 0.3653455972671509, "grad_norm": 0.5172103285387574, "learning_rate": 2.958947368421053e-05, "loss": 0.3496, "mean_copy_accuracy": 0.9900424480438232, "mean_gen_accuracy": 0.8575224727392197, "mean_token_accuracy": 0.8880881071090698, "num_tokens": 287316866.0, "sample_num_tokens": 7790.5, "step": 1061, "total_num_tokens": 287348028.0, "z_loss": 0.0015908016357570887 }, { "copy_logits_max": -2.688364028930664, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.5, "epoch": 0.21690068930303805, "gen_logits_max": 9.918011665344238, "gen_logits_mean": -9.377731323242188, "gen_logits_min": -21.184972763061523, "gen_logits_std": 2.492292881011963, "gen_loss": 0.39015984535217285, "grad_norm": 0.6292180393571871, "learning_rate": 2.958821052631579e-05, "loss": 0.356, "mean_copy_accuracy": 0.9887480288743973, "mean_gen_accuracy": 0.8500395864248276, "mean_token_accuracy": 0.8845589011907578, "num_tokens": 287570209.0, "sample_num_tokens": 7382.75, "step": 1062, "total_num_tokens": 287599740.0, "z_loss": 0.0016640231478959322 }, { "copy_logits_max": 0.3323618173599243, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.5, "epoch": 0.21710492724023486, "gen_logits_max": 9.548317909240723, "gen_logits_mean": -10.157352447509766, "gen_logits_min": -22.96127700805664, "gen_logits_std": 2.522193670272827, "gen_loss": 0.3313091993331909, "grad_norm": 0.5852132862444441, "learning_rate": 2.9586947368421054e-05, "loss": 0.354, "mean_copy_accuracy": 0.9897353798151016, "mean_gen_accuracy": 0.8520668745040894, "mean_token_accuracy": 0.887191504240036, "num_tokens": 287822121.0, "sample_num_tokens": 9900.75, "step": 1063, "total_num_tokens": 287861724.0, "z_loss": 0.0018634756561368704 }, { "copy_logits_max": -0.3969813585281372, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.75, "epoch": 0.2173091651774317, "gen_logits_max": 9.71242904663086, "gen_logits_mean": -9.674793243408203, "gen_logits_min": -22.231225967407227, "gen_logits_std": 2.523505449295044, "gen_loss": 0.35316869616508484, "grad_norm": 0.48834970556967794, "learning_rate": 2.9585684210526315e-05, "loss": 0.339, "mean_copy_accuracy": 0.99198317527771, "mean_gen_accuracy": 0.8532910645008087, "mean_token_accuracy": 0.889945849776268, "num_tokens": 288098613.0, "sample_num_tokens": 9296.25, "step": 1064, "total_num_tokens": 288135798.0, "z_loss": 0.0018654644954949617 }, { "copy_logits_max": 0.1095060408115387, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.25, "epoch": 0.21751340311462855, "gen_logits_max": 10.347306251525879, "gen_logits_mean": -9.150848388671875, "gen_logits_min": -21.592039108276367, "gen_logits_std": 2.5448946952819824, "gen_loss": 0.3673861026763916, "grad_norm": 0.6153400283025323, "learning_rate": 2.958442105263158e-05, "loss": 0.3527, "mean_copy_accuracy": 0.9880606830120087, "mean_gen_accuracy": 0.8499006181955338, "mean_token_accuracy": 0.8878171294927597, "num_tokens": 288375553.0, "sample_num_tokens": 8708.25, "step": 1065, "total_num_tokens": 288410386.0, "z_loss": 0.0020901435054838657 }, { "copy_logits_max": 0.5157018899917603, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.5, "epoch": 0.21771764105182537, "gen_logits_max": 9.7158203125, "gen_logits_mean": -9.96823501586914, "gen_logits_min": -22.566043853759766, "gen_logits_std": 2.569744825363159, "gen_loss": 0.3138030171394348, "grad_norm": 0.5159292121261145, "learning_rate": 2.958315789473684e-05, "loss": 0.3575, "mean_copy_accuracy": 0.9898334741592407, "mean_gen_accuracy": 0.8521300405263901, "mean_token_accuracy": 0.8877818435430527, "num_tokens": 288649641.0, "sample_num_tokens": 9276.25, "step": 1066, "total_num_tokens": 288686746.0, "z_loss": 0.0018650905694812536 }, { "copy_logits_max": 0.061690524220466614, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.0, "epoch": 0.2179218789890222, "gen_logits_max": 9.750911712646484, "gen_logits_mean": -9.891119956970215, "gen_logits_min": -22.600719451904297, "gen_logits_std": 2.533066749572754, "gen_loss": 0.34947484731674194, "grad_norm": 0.6017742992004331, "learning_rate": 2.958189473684211e-05, "loss": 0.3575, "mean_copy_accuracy": 0.9872282594442368, "mean_gen_accuracy": 0.8541410267353058, "mean_token_accuracy": 0.8838609606027603, "num_tokens": 288905768.0, "sample_num_tokens": 6703.5, "step": 1067, "total_num_tokens": 288932582.0, "z_loss": 0.0017094726208597422 }, { "copy_logits_max": 0.4963120222091675, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.625, "epoch": 0.21812611692621905, "gen_logits_max": 10.0205078125, "gen_logits_mean": -9.295778274536133, "gen_logits_min": -21.930727005004883, "gen_logits_std": 2.5692687034606934, "gen_loss": 0.338289737701416, "grad_norm": 0.5118537498917366, "learning_rate": 2.958063157894737e-05, "loss": 0.339, "mean_copy_accuracy": 0.9896917045116425, "mean_gen_accuracy": 0.8543237596750259, "mean_token_accuracy": 0.8887675255537033, "num_tokens": 289171584.0, "sample_num_tokens": 9933.0, "step": 1068, "total_num_tokens": 289211316.0, "z_loss": 0.00185544160194695 }, { "copy_logits_max": -0.36820271611213684, "copy_logits_min": -625000000.0, "copy_num_tokens": 605.5625, "epoch": 0.21833035486341587, "gen_logits_max": 9.411666870117188, "gen_logits_mean": -9.819255828857422, "gen_logits_min": -22.362335205078125, "gen_logits_std": 2.5872700214385986, "gen_loss": 0.3134293258190155, "grad_norm": 0.6172389941095481, "learning_rate": 2.9579368421052634e-05, "loss": 0.367, "mean_copy_accuracy": 0.9881022423505783, "mean_gen_accuracy": 0.8458252996206284, "mean_token_accuracy": 0.88410584628582, "num_tokens": 289459269.0, "sample_num_tokens": 9139.75, "step": 1069, "total_num_tokens": 289495828.0, "z_loss": 0.0016322543378919363 }, { "copy_logits_max": -3.2986295223236084, "copy_logits_min": -687500032.0, "copy_num_tokens": 382.8125, "epoch": 0.21853459280061271, "gen_logits_max": 9.228826522827148, "gen_logits_mean": -10.246437072753906, "gen_logits_min": -22.64643096923828, "gen_logits_std": 2.562025785446167, "gen_loss": 0.36590683460235596, "grad_norm": 0.5360066004895319, "learning_rate": 2.9578105263157894e-05, "loss": 0.374, "mean_copy_accuracy": 0.989354208111763, "mean_gen_accuracy": 0.848840594291687, "mean_token_accuracy": 0.8789850324392319, "num_tokens": 289727267.0, "sample_num_tokens": 8332.25, "step": 1070, "total_num_tokens": 289760596.0, "z_loss": 0.0015456171240657568 }, { "copy_logits_max": 1.2979689836502075, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.5625, "epoch": 0.21873883073780956, "gen_logits_max": 9.9371976852417, "gen_logits_mean": -10.003307342529297, "gen_logits_min": -22.949684143066406, "gen_logits_std": 2.6031382083892822, "gen_loss": 0.30743494629859924, "grad_norm": 0.5144829911328372, "learning_rate": 2.957684210526316e-05, "loss": 0.3563, "mean_copy_accuracy": 0.9905057549476624, "mean_gen_accuracy": 0.851044699549675, "mean_token_accuracy": 0.883560910820961, "num_tokens": 290004715.0, "sample_num_tokens": 9633.25, "step": 1071, "total_num_tokens": 290043248.0, "z_loss": 0.0016347009222954512 }, { "copy_logits_max": -1.4128899574279785, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.0625, "epoch": 0.21894306867500637, "gen_logits_max": 9.697723388671875, "gen_logits_mean": -10.98769760131836, "gen_logits_min": -23.00377655029297, "gen_logits_std": 2.4854040145874023, "gen_loss": 0.3304339051246643, "grad_norm": 0.5361442481383699, "learning_rate": 2.957557894736842e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9900783747434616, "mean_gen_accuracy": 0.8623296469449997, "mean_token_accuracy": 0.8938805013895035, "num_tokens": 290274207.0, "sample_num_tokens": 7996.25, "step": 1072, "total_num_tokens": 290306192.0, "z_loss": 0.0015290247974917293 }, { "copy_logits_max": -2.5566353797912598, "copy_logits_min": -687500032.0, "copy_num_tokens": 259.5625, "epoch": 0.21914730661220322, "gen_logits_max": 10.624833106994629, "gen_logits_mean": -9.634525299072266, "gen_logits_min": -22.284488677978516, "gen_logits_std": 2.5790066719055176, "gen_loss": 0.3827623426914215, "grad_norm": 0.5683250198656746, "learning_rate": 2.9574315789473684e-05, "loss": 0.347, "mean_copy_accuracy": 0.9891041368246078, "mean_gen_accuracy": 0.8554603159427643, "mean_token_accuracy": 0.8859435468912125, "num_tokens": 290529695.0, "sample_num_tokens": 6755.75, "step": 1073, "total_num_tokens": 290556718.0, "z_loss": 0.0016003758646547794 }, { "copy_logits_max": -1.6406846046447754, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.75, "epoch": 0.21935154454940006, "gen_logits_max": 9.833139419555664, "gen_logits_mean": -10.191559791564941, "gen_logits_min": -22.384090423583984, "gen_logits_std": 2.529728889465332, "gen_loss": 0.35767537355422974, "grad_norm": 0.6003784137902289, "learning_rate": 2.957305263157895e-05, "loss": 0.3586, "mean_copy_accuracy": 0.9904572665691376, "mean_gen_accuracy": 0.8457490801811218, "mean_token_accuracy": 0.8837934136390686, "num_tokens": 290829228.0, "sample_num_tokens": 8787.5, "step": 1074, "total_num_tokens": 290864378.0, "z_loss": 0.0015324023552238941 }, { "copy_logits_max": -0.6439144611358643, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.4375, "epoch": 0.21955578248659688, "gen_logits_max": 9.771499633789062, "gen_logits_mean": -9.842247009277344, "gen_logits_min": -22.325210571289062, "gen_logits_std": 2.542722702026367, "gen_loss": 0.28770214319229126, "grad_norm": 0.4728804386339366, "learning_rate": 2.957178947368421e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9905062764883041, "mean_gen_accuracy": 0.8606927990913391, "mean_token_accuracy": 0.889655351638794, "num_tokens": 291101686.0, "sample_num_tokens": 8945.0, "step": 1075, "total_num_tokens": 291137466.0, "z_loss": 0.0015193340368568897 }, { "copy_logits_max": -2.132774829864502, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.5, "epoch": 0.21976002042379372, "gen_logits_max": 11.026028633117676, "gen_logits_mean": -9.406034469604492, "gen_logits_min": -22.423763275146484, "gen_logits_std": 2.537489891052246, "gen_loss": 0.35293683409690857, "grad_norm": 0.5134087324436386, "learning_rate": 2.9570526315789477e-05, "loss": 0.3476, "mean_copy_accuracy": 0.9876924306154251, "mean_gen_accuracy": 0.8655433654785156, "mean_token_accuracy": 0.8875925093889236, "num_tokens": 291355994.0, "sample_num_tokens": 8531.5, "step": 1076, "total_num_tokens": 291390120.0, "z_loss": 0.0016014758730307221 }, { "copy_logits_max": 1.4129433631896973, "copy_logits_min": -750000000.0, "copy_num_tokens": 574.6875, "epoch": 0.21996425836099057, "gen_logits_max": 10.458403587341309, "gen_logits_mean": -8.919336318969727, "gen_logits_min": -22.189790725708008, "gen_logits_std": 2.566585063934326, "gen_loss": 0.3742907643318176, "grad_norm": 0.5561732475287404, "learning_rate": 2.9569263157894738e-05, "loss": 0.3692, "mean_copy_accuracy": 0.9875152558088303, "mean_gen_accuracy": 0.8473668843507767, "mean_token_accuracy": 0.8803085684776306, "num_tokens": 291611242.0, "sample_num_tokens": 9266.5, "step": 1077, "total_num_tokens": 291648308.0, "z_loss": 0.001952496124431491 }, { "copy_logits_max": 1.29646635055542, "copy_logits_min": -750000000.0, "copy_num_tokens": 679.375, "epoch": 0.22016849629818738, "gen_logits_max": 9.870073318481445, "gen_logits_mean": -9.31340217590332, "gen_logits_min": -22.236270904541016, "gen_logits_std": 2.5861406326293945, "gen_loss": 0.3302813768386841, "grad_norm": 0.6833227713012614, "learning_rate": 2.9568000000000002e-05, "loss": 0.3531, "mean_copy_accuracy": 0.9890480190515518, "mean_gen_accuracy": 0.8478010445833206, "mean_token_accuracy": 0.8867393136024475, "num_tokens": 291889973.0, "sample_num_tokens": 9044.25, "step": 1078, "total_num_tokens": 291926150.0, "z_loss": 0.0021818068344146013 }, { "copy_logits_max": 0.912308931350708, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.8125, "epoch": 0.22037273423538423, "gen_logits_max": 11.039791107177734, "gen_logits_mean": -8.988014221191406, "gen_logits_min": -22.484699249267578, "gen_logits_std": 2.586392402648926, "gen_loss": 0.35614943504333496, "grad_norm": 0.5469992694938426, "learning_rate": 2.9566736842105263e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9919467121362686, "mean_gen_accuracy": 0.8526674062013626, "mean_token_accuracy": 0.8898676633834839, "num_tokens": 292170135.0, "sample_num_tokens": 9194.25, "step": 1079, "total_num_tokens": 292206912.0, "z_loss": 0.002094268798828125 }, { "copy_logits_max": -1.051733374595642, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.9375, "epoch": 0.22057697217258107, "gen_logits_max": 10.852256774902344, "gen_logits_mean": -8.530047416687012, "gen_logits_min": -20.902538299560547, "gen_logits_std": 2.476978063583374, "gen_loss": 0.41813528537750244, "grad_norm": 0.5259061460425632, "learning_rate": 2.9565473684210527e-05, "loss": 0.3524, "mean_copy_accuracy": 0.9900275468826294, "mean_gen_accuracy": 0.8540296852588654, "mean_token_accuracy": 0.8855431973934174, "num_tokens": 292449694.0, "sample_num_tokens": 7352.0, "step": 1080, "total_num_tokens": 292479102.0, "z_loss": 0.0019145077094435692 }, { "copy_logits_max": -1.0804765224456787, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.0625, "epoch": 0.2207812101097779, "gen_logits_max": 10.396781921386719, "gen_logits_mean": -9.551239013671875, "gen_logits_min": -22.81648826599121, "gen_logits_std": 2.5943613052368164, "gen_loss": 0.3230527937412262, "grad_norm": 0.6350446076478838, "learning_rate": 2.956421052631579e-05, "loss": 0.3259, "mean_copy_accuracy": 0.9898601323366165, "mean_gen_accuracy": 0.8589783012866974, "mean_token_accuracy": 0.897626593708992, "num_tokens": 292717460.0, "sample_num_tokens": 8828.5, "step": 1081, "total_num_tokens": 292752774.0, "z_loss": 0.0016953423619270325 }, { "copy_logits_max": -0.7960673570632935, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.4375, "epoch": 0.22098544804697473, "gen_logits_max": 9.918766021728516, "gen_logits_mean": -10.318986892700195, "gen_logits_min": -23.159713745117188, "gen_logits_std": 2.538971185684204, "gen_loss": 0.3225175440311432, "grad_norm": 0.5421907746590056, "learning_rate": 2.9562947368421053e-05, "loss": 0.3599, "mean_copy_accuracy": 0.9903082400560379, "mean_gen_accuracy": 0.8533067852258682, "mean_token_accuracy": 0.884246751666069, "num_tokens": 292989767.0, "sample_num_tokens": 9407.25, "step": 1082, "total_num_tokens": 293027396.0, "z_loss": 0.0016299095004796982 }, { "copy_logits_max": -1.0091028213500977, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.3125, "epoch": 0.22118968598417155, "gen_logits_max": 11.316219329833984, "gen_logits_mean": -8.821770668029785, "gen_logits_min": -21.023658752441406, "gen_logits_std": 2.5455260276794434, "gen_loss": 0.4005306363105774, "grad_norm": 0.6005520262865229, "learning_rate": 2.9561684210526314e-05, "loss": 0.3836, "mean_copy_accuracy": 0.9902172535657883, "mean_gen_accuracy": 0.8443079888820648, "mean_token_accuracy": 0.8769011944532394, "num_tokens": 293256852.0, "sample_num_tokens": 7900.5, "step": 1083, "total_num_tokens": 293288454.0, "z_loss": 0.001664182636886835 }, { "copy_logits_max": -2.613126277923584, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.4375, "epoch": 0.2213939239213684, "gen_logits_max": 9.529809951782227, "gen_logits_mean": -10.260635375976562, "gen_logits_min": -22.901229858398438, "gen_logits_std": 2.497081756591797, "gen_loss": 0.39831632375717163, "grad_norm": 0.5618211093600781, "learning_rate": 2.956042105263158e-05, "loss": 0.38, "mean_copy_accuracy": 0.9865504205226898, "mean_gen_accuracy": 0.8494482934474945, "mean_token_accuracy": 0.8765542209148407, "num_tokens": 293515530.0, "sample_num_tokens": 7735.5, "step": 1084, "total_num_tokens": 293546472.0, "z_loss": 0.0015017754631116986 }, { "copy_logits_max": -0.23741519451141357, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.875, "epoch": 0.22159816185856523, "gen_logits_max": 10.613417625427246, "gen_logits_mean": -9.935662269592285, "gen_logits_min": -22.8233642578125, "gen_logits_std": 2.5532021522521973, "gen_loss": 0.3986138701438904, "grad_norm": 0.5389719866296891, "learning_rate": 2.9559157894736842e-05, "loss": 0.3567, "mean_copy_accuracy": 0.9906846135854721, "mean_gen_accuracy": 0.8524731546640396, "mean_token_accuracy": 0.8860030025243759, "num_tokens": 293772823.0, "sample_num_tokens": 8807.75, "step": 1085, "total_num_tokens": 293808054.0, "z_loss": 0.0015764011768624187 }, { "copy_logits_max": -1.324509859085083, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.125, "epoch": 0.22180239979576205, "gen_logits_max": 10.297298431396484, "gen_logits_mean": -9.510294914245605, "gen_logits_min": -22.258769989013672, "gen_logits_std": 2.6015186309814453, "gen_loss": 0.3436044752597809, "grad_norm": 0.5277976777653934, "learning_rate": 2.9557894736842107e-05, "loss": 0.3535, "mean_copy_accuracy": 0.9926280528306961, "mean_gen_accuracy": 0.8457996249198914, "mean_token_accuracy": 0.8865365982055664, "num_tokens": 294044121.0, "sample_num_tokens": 9093.75, "step": 1086, "total_num_tokens": 294080496.0, "z_loss": 0.0017562046414241195 }, { "copy_logits_max": -2.308155059814453, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.875, "epoch": 0.2220066377329589, "gen_logits_max": 9.809560775756836, "gen_logits_mean": -11.231047630310059, "gen_logits_min": -23.593612670898438, "gen_logits_std": 2.4818148612976074, "gen_loss": 0.3475828468799591, "grad_norm": 0.4763731175145563, "learning_rate": 2.955663157894737e-05, "loss": 0.3575, "mean_copy_accuracy": 0.9896983951330185, "mean_gen_accuracy": 0.8537950813770294, "mean_token_accuracy": 0.8844036012887955, "num_tokens": 294322627.0, "sample_num_tokens": 8447.75, "step": 1087, "total_num_tokens": 294356418.0, "z_loss": 0.0014865768607705832 }, { "copy_logits_max": -0.152021586894989, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.125, "epoch": 0.22221087567015574, "gen_logits_max": 9.760632514953613, "gen_logits_mean": -10.446701049804688, "gen_logits_min": -23.215381622314453, "gen_logits_std": 2.53196382522583, "gen_loss": 0.3560805320739746, "grad_norm": 0.5556987361208144, "learning_rate": 2.9555368421052632e-05, "loss": 0.3496, "mean_copy_accuracy": 0.9902435839176178, "mean_gen_accuracy": 0.8510431051254272, "mean_token_accuracy": 0.8904422521591187, "num_tokens": 294593475.0, "sample_num_tokens": 8519.25, "step": 1088, "total_num_tokens": 294627552.0, "z_loss": 0.0018214780138805509 }, { "copy_logits_max": -0.8302338123321533, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.75, "epoch": 0.22241511360735255, "gen_logits_max": 10.134380340576172, "gen_logits_mean": -10.067930221557617, "gen_logits_min": -22.64406394958496, "gen_logits_std": 2.518153667449951, "gen_loss": 0.31899434328079224, "grad_norm": 0.5283218466869565, "learning_rate": 2.9554105263157896e-05, "loss": 0.338, "mean_copy_accuracy": 0.9894538968801498, "mean_gen_accuracy": 0.8615613281726837, "mean_token_accuracy": 0.8905785083770752, "num_tokens": 294850364.0, "sample_num_tokens": 8395.5, "step": 1089, "total_num_tokens": 294883946.0, "z_loss": 0.0016205853316932917 }, { "copy_logits_max": -1.896974802017212, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.4375, "epoch": 0.2226193515445494, "gen_logits_max": 9.35473346710205, "gen_logits_mean": -10.131023406982422, "gen_logits_min": -22.711889266967773, "gen_logits_std": 2.5213825702667236, "gen_loss": 0.3525319993495941, "grad_norm": 0.5486361154624285, "learning_rate": 2.9552842105263157e-05, "loss": 0.3363, "mean_copy_accuracy": 0.9922586530447006, "mean_gen_accuracy": 0.8554339557886124, "mean_token_accuracy": 0.8913060128688812, "num_tokens": 295119082.0, "sample_num_tokens": 8013.5, "step": 1090, "total_num_tokens": 295151136.0, "z_loss": 0.0015957481227815151 }, { "copy_logits_max": -0.4876554310321808, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.0, "epoch": 0.22282358948174624, "gen_logits_max": 9.24423599243164, "gen_logits_mean": -10.328360557556152, "gen_logits_min": -22.417043685913086, "gen_logits_std": 2.522256374359131, "gen_loss": 0.3887181282043457, "grad_norm": 0.5606139200986435, "learning_rate": 2.955157894736842e-05, "loss": 0.3552, "mean_copy_accuracy": 0.9886434227228165, "mean_gen_accuracy": 0.8516110330820084, "mean_token_accuracy": 0.8849347680807114, "num_tokens": 295378687.0, "sample_num_tokens": 8133.25, "step": 1091, "total_num_tokens": 295411220.0, "z_loss": 0.001656288979575038 }, { "copy_logits_max": -2.3128795623779297, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.25, "epoch": 0.22302782741894306, "gen_logits_max": 10.746223449707031, "gen_logits_mean": -8.959218978881836, "gen_logits_min": -21.225290298461914, "gen_logits_std": 2.544618606567383, "gen_loss": 0.331237256526947, "grad_norm": 0.5388199853181029, "learning_rate": 2.9550315789473686e-05, "loss": 0.3232, "mean_copy_accuracy": 0.9904211908578873, "mean_gen_accuracy": 0.863366961479187, "mean_token_accuracy": 0.8957875221967697, "num_tokens": 295644881.0, "sample_num_tokens": 9282.25, "step": 1092, "total_num_tokens": 295682010.0, "z_loss": 0.0015607387758791447 }, { "copy_logits_max": -4.64632511138916, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.625, "epoch": 0.2232320653561399, "gen_logits_max": 9.924322128295898, "gen_logits_mean": -10.830078125, "gen_logits_min": -22.937583923339844, "gen_logits_std": 2.4635462760925293, "gen_loss": 0.335521399974823, "grad_norm": 0.5031016167979944, "learning_rate": 2.954905263157895e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9905851185321808, "mean_gen_accuracy": 0.8609124273061752, "mean_token_accuracy": 0.8935238867998123, "num_tokens": 295916334.0, "sample_num_tokens": 9621.5, "step": 1093, "total_num_tokens": 295954820.0, "z_loss": 0.0013619603123515844 }, { "copy_logits_max": -2.4019775390625, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.9375, "epoch": 0.22343630329333675, "gen_logits_max": 9.343792915344238, "gen_logits_mean": -10.764824867248535, "gen_logits_min": -22.79910659790039, "gen_logits_std": 2.513115167617798, "gen_loss": 0.3389478325843811, "grad_norm": 0.5274135880195779, "learning_rate": 2.954778947368421e-05, "loss": 0.3554, "mean_copy_accuracy": 0.990787535905838, "mean_gen_accuracy": 0.855892077088356, "mean_token_accuracy": 0.8843083828687668, "num_tokens": 296188119.0, "sample_num_tokens": 9053.75, "step": 1094, "total_num_tokens": 296224334.0, "z_loss": 0.0015153929125517607 }, { "copy_logits_max": -0.18833988904953003, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.9375, "epoch": 0.22364054123053356, "gen_logits_max": 10.806947708129883, "gen_logits_mean": -9.472822189331055, "gen_logits_min": -22.197399139404297, "gen_logits_std": 2.5951297283172607, "gen_loss": 0.3494354486465454, "grad_norm": 0.5830575903397269, "learning_rate": 2.9546526315789475e-05, "loss": 0.3504, "mean_copy_accuracy": 0.9879824072122574, "mean_gen_accuracy": 0.8525955677032471, "mean_token_accuracy": 0.8863599896430969, "num_tokens": 296448892.0, "sample_num_tokens": 8405.0, "step": 1095, "total_num_tokens": 296482512.0, "z_loss": 0.0016697593964636326 }, { "copy_logits_max": -2.937215566635132, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.4375, "epoch": 0.2238447791677304, "gen_logits_max": 9.707967758178711, "gen_logits_mean": -10.81340217590332, "gen_logits_min": -23.344276428222656, "gen_logits_std": 2.4804844856262207, "gen_loss": 0.3541354537010193, "grad_norm": 0.5194860103955596, "learning_rate": 2.9545263157894736e-05, "loss": 0.3354, "mean_copy_accuracy": 0.9901737570762634, "mean_gen_accuracy": 0.8568982183933258, "mean_token_accuracy": 0.8900085538625717, "num_tokens": 296731056.0, "sample_num_tokens": 6906.0, "step": 1096, "total_num_tokens": 296758680.0, "z_loss": 0.0014658812433481216 }, { "copy_logits_max": 0.5220539569854736, "copy_logits_min": -625000064.0, "copy_num_tokens": 485.25, "epoch": 0.22404901710492725, "gen_logits_max": 9.573598861694336, "gen_logits_mean": -10.078628540039062, "gen_logits_min": -22.95818519592285, "gen_logits_std": 2.5845792293548584, "gen_loss": 0.32936984300613403, "grad_norm": 0.535063535351923, "learning_rate": 2.9544e-05, "loss": 0.3415, "mean_copy_accuracy": 0.9903688430786133, "mean_gen_accuracy": 0.8592270612716675, "mean_token_accuracy": 0.8927666246891022, "num_tokens": 297003668.0, "sample_num_tokens": 8325.0, "step": 1097, "total_num_tokens": 297036968.0, "z_loss": 0.0017726465594023466 }, { "copy_logits_max": -0.5703986883163452, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.0, "epoch": 0.22425325504212407, "gen_logits_max": 8.939456939697266, "gen_logits_mean": -9.97714900970459, "gen_logits_min": -22.32171630859375, "gen_logits_std": 2.5311901569366455, "gen_loss": 0.36458319425582886, "grad_norm": 0.5174804961009516, "learning_rate": 2.954273684210526e-05, "loss": 0.377, "mean_copy_accuracy": 0.9917764663696289, "mean_gen_accuracy": 0.8425103724002838, "mean_token_accuracy": 0.8789500445127487, "num_tokens": 297267705.0, "sample_num_tokens": 8944.75, "step": 1098, "total_num_tokens": 297303484.0, "z_loss": 0.001705981558188796 }, { "copy_logits_max": -1.9361153841018677, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.875, "epoch": 0.2244574929793209, "gen_logits_max": 9.511932373046875, "gen_logits_mean": -9.526409149169922, "gen_logits_min": -22.064472198486328, "gen_logits_std": 2.568783760070801, "gen_loss": 0.3188285827636719, "grad_norm": 0.5968544384511504, "learning_rate": 2.9541473684210526e-05, "loss": 0.349, "mean_copy_accuracy": 0.9863921999931335, "mean_gen_accuracy": 0.8554823249578476, "mean_token_accuracy": 0.8857758939266205, "num_tokens": 297525759.0, "sample_num_tokens": 7830.75, "step": 1099, "total_num_tokens": 297557082.0, "z_loss": 0.001551199471578002 }, { "copy_logits_max": -0.324565052986145, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.3125, "epoch": 0.22466173091651775, "gen_logits_max": 10.403809547424316, "gen_logits_mean": -9.816967010498047, "gen_logits_min": -21.981868743896484, "gen_logits_std": 2.578226327896118, "gen_loss": 0.3795855641365051, "grad_norm": 0.5606408024081082, "learning_rate": 2.9540210526315793e-05, "loss": 0.3446, "mean_copy_accuracy": 0.9904931634664536, "mean_gen_accuracy": 0.8569173365831375, "mean_token_accuracy": 0.8883701115846634, "num_tokens": 297787603.0, "sample_num_tokens": 8050.75, "step": 1100, "total_num_tokens": 297819806.0, "z_loss": 0.0016892170533537865 }, { "copy_logits_max": -1.7540216445922852, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.9375, "epoch": 0.22486596885371457, "gen_logits_max": 9.217044830322266, "gen_logits_mean": -10.424553871154785, "gen_logits_min": -22.944856643676758, "gen_logits_std": 2.49198055267334, "gen_loss": 0.34027352929115295, "grad_norm": 0.6887032875824501, "learning_rate": 2.9538947368421054e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9914081692695618, "mean_gen_accuracy": 0.8586784303188324, "mean_token_accuracy": 0.8960987776517868, "num_tokens": 298051717.0, "sample_num_tokens": 8203.25, "step": 1101, "total_num_tokens": 298084530.0, "z_loss": 0.0014774377923458815 }, { "copy_logits_max": -2.406607151031494, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.6875, "epoch": 0.22507020679091141, "gen_logits_max": 9.270780563354492, "gen_logits_mean": -9.292375564575195, "gen_logits_min": -21.903730392456055, "gen_logits_std": 2.5066967010498047, "gen_loss": 0.34673723578453064, "grad_norm": 0.6287345628799687, "learning_rate": 2.953768421052632e-05, "loss": 0.3324, "mean_copy_accuracy": 0.987798810005188, "mean_gen_accuracy": 0.8595902919769287, "mean_token_accuracy": 0.8914221674203873, "num_tokens": 298318080.0, "sample_num_tokens": 7315.5, "step": 1102, "total_num_tokens": 298347342.0, "z_loss": 0.0013866747030988336 }, { "copy_logits_max": -0.7713189125061035, "copy_logits_min": -750000064.0, "copy_num_tokens": 543.9375, "epoch": 0.22527444472810826, "gen_logits_max": 9.501276016235352, "gen_logits_mean": -9.487981796264648, "gen_logits_min": -22.403486251831055, "gen_logits_std": 2.585923910140991, "gen_loss": 0.3602473735809326, "grad_norm": 0.5480088862492137, "learning_rate": 2.953642105263158e-05, "loss": 0.3459, "mean_copy_accuracy": 0.9916726350784302, "mean_gen_accuracy": 0.8505108654499054, "mean_token_accuracy": 0.8895845264196396, "num_tokens": 298606245.0, "sample_num_tokens": 8041.25, "step": 1103, "total_num_tokens": 298638410.0, "z_loss": 0.0016728760674595833 }, { "copy_logits_max": -1.5764305591583252, "copy_logits_min": -687500032.0, "copy_num_tokens": 555.5, "epoch": 0.22547868266530507, "gen_logits_max": 9.172991752624512, "gen_logits_mean": -9.612669944763184, "gen_logits_min": -21.852096557617188, "gen_logits_std": 2.565495729446411, "gen_loss": 0.3660890758037567, "grad_norm": 0.5395472274419237, "learning_rate": 2.9535157894736844e-05, "loss": 0.3449, "mean_copy_accuracy": 0.9907573461532593, "mean_gen_accuracy": 0.8532016575336456, "mean_token_accuracy": 0.8868376165628433, "num_tokens": 298887573.0, "sample_num_tokens": 9426.25, "step": 1104, "total_num_tokens": 298925278.0, "z_loss": 0.001638730987906456 }, { "copy_logits_max": -1.8736279010772705, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.75, "epoch": 0.22568292060250192, "gen_logits_max": 9.59614372253418, "gen_logits_mean": -10.227506637573242, "gen_logits_min": -21.963939666748047, "gen_logits_std": 2.450080156326294, "gen_loss": 0.3795362710952759, "grad_norm": 0.5356020370120738, "learning_rate": 2.9533894736842105e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9877306520938873, "mean_gen_accuracy": 0.8640477061271667, "mean_token_accuracy": 0.8936064690351486, "num_tokens": 299169049.0, "sample_num_tokens": 8002.75, "step": 1105, "total_num_tokens": 299201060.0, "z_loss": 0.0014273449778556824 }, { "copy_logits_max": -3.432098388671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.0625, "epoch": 0.22588715853969876, "gen_logits_max": 9.809805870056152, "gen_logits_mean": -8.609295845031738, "gen_logits_min": -21.8090877532959, "gen_logits_std": 2.5770044326782227, "gen_loss": 0.36894673109054565, "grad_norm": 0.5836303865742694, "learning_rate": 2.953263157894737e-05, "loss": 0.365, "mean_copy_accuracy": 0.9886937588453293, "mean_gen_accuracy": 0.8470708876848221, "mean_token_accuracy": 0.8796445429325104, "num_tokens": 299450334.0, "sample_num_tokens": 9277.5, "step": 1106, "total_num_tokens": 299487444.0, "z_loss": 0.0015457362169399858 }, { "copy_logits_max": -0.7292912006378174, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.3125, "epoch": 0.22609139647689558, "gen_logits_max": 9.619110107421875, "gen_logits_mean": -10.76849365234375, "gen_logits_min": -23.00613784790039, "gen_logits_std": 2.539747714996338, "gen_loss": 0.36345866322517395, "grad_norm": 0.5528822567896303, "learning_rate": 2.953136842105263e-05, "loss": 0.3697, "mean_copy_accuracy": 0.9908533841371536, "mean_gen_accuracy": 0.84827920794487, "mean_token_accuracy": 0.8787118494510651, "num_tokens": 299696567.0, "sample_num_tokens": 7703.25, "step": 1107, "total_num_tokens": 299727380.0, "z_loss": 0.0016420070314779878 }, { "copy_logits_max": -2.1377878189086914, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.4375, "epoch": 0.22629563441409242, "gen_logits_max": 9.960931777954102, "gen_logits_mean": -10.018664360046387, "gen_logits_min": -22.12270736694336, "gen_logits_std": 2.5206072330474854, "gen_loss": 0.37550926208496094, "grad_norm": 0.5889100825408676, "learning_rate": 2.9530105263157898e-05, "loss": 0.3876, "mean_copy_accuracy": 0.9909132868051529, "mean_gen_accuracy": 0.840042844414711, "mean_token_accuracy": 0.8771793693304062, "num_tokens": 299958310.0, "sample_num_tokens": 7981.5, "step": 1108, "total_num_tokens": 299990236.0, "z_loss": 0.0015832518693059683 }, { "copy_logits_max": -1.24723219871521, "copy_logits_min": -687500032.0, "copy_num_tokens": 582.75, "epoch": 0.22649987235128924, "gen_logits_max": 9.088733673095703, "gen_logits_mean": -10.108333587646484, "gen_logits_min": -22.523330688476562, "gen_logits_std": 2.5898306369781494, "gen_loss": 0.32975879311561584, "grad_norm": 0.6056809133480361, "learning_rate": 2.952884210526316e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9898130297660828, "mean_gen_accuracy": 0.8605745285749435, "mean_token_accuracy": 0.8938266634941101, "num_tokens": 300216568.0, "sample_num_tokens": 8886.0, "step": 1109, "total_num_tokens": 300252112.0, "z_loss": 0.0015391232445836067 }, { "copy_logits_max": -1.330507516860962, "copy_logits_min": -625000064.0, "copy_num_tokens": 626.625, "epoch": 0.22670411028848608, "gen_logits_max": 9.031559944152832, "gen_logits_mean": -9.316265106201172, "gen_logits_min": -21.6794376373291, "gen_logits_std": 2.560856342315674, "gen_loss": 0.329601526260376, "grad_norm": 0.4418041059850436, "learning_rate": 2.9527578947368423e-05, "loss": 0.3433, "mean_copy_accuracy": 0.9899798482656479, "mean_gen_accuracy": 0.8559257984161377, "mean_token_accuracy": 0.8908681720495224, "num_tokens": 300502138.0, "sample_num_tokens": 9965.5, "step": 1110, "total_num_tokens": 300542000.0, "z_loss": 0.0016234266804531217 }, { "copy_logits_max": -1.824663758277893, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.75, "epoch": 0.22690834822568293, "gen_logits_max": 10.197322845458984, "gen_logits_mean": -8.968696594238281, "gen_logits_min": -20.603538513183594, "gen_logits_std": 2.516730308532715, "gen_loss": 0.36470121145248413, "grad_norm": 0.5365758383972046, "learning_rate": 2.9526315789473684e-05, "loss": 0.3461, "mean_copy_accuracy": 0.9892862886190414, "mean_gen_accuracy": 0.8528156727552414, "mean_token_accuracy": 0.8888494074344635, "num_tokens": 300784225.0, "sample_num_tokens": 8954.25, "step": 1111, "total_num_tokens": 300820042.0, "z_loss": 0.0017362824873998761 }, { "copy_logits_max": -1.692596673965454, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.5625, "epoch": 0.22711258616287974, "gen_logits_max": 9.37388801574707, "gen_logits_mean": -9.461654663085938, "gen_logits_min": -21.76042366027832, "gen_logits_std": 2.56188702583313, "gen_loss": 0.3294604420661926, "grad_norm": 0.5574562233342268, "learning_rate": 2.9525052631578948e-05, "loss": 0.3509, "mean_copy_accuracy": 0.9890340566635132, "mean_gen_accuracy": 0.858571320772171, "mean_token_accuracy": 0.8881004899740219, "num_tokens": 301048662.0, "sample_num_tokens": 9371.0, "step": 1112, "total_num_tokens": 301086146.0, "z_loss": 0.001881559262983501 }, { "copy_logits_max": -0.3375549912452698, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.4375, "epoch": 0.2273168241000766, "gen_logits_max": 8.94780158996582, "gen_logits_mean": -10.300752639770508, "gen_logits_min": -22.452041625976562, "gen_logits_std": 2.5099806785583496, "gen_loss": 0.3603483736515045, "grad_norm": 0.5018717492789871, "learning_rate": 2.9523789473684212e-05, "loss": 0.3456, "mean_copy_accuracy": 0.9901258647441864, "mean_gen_accuracy": 0.8585207164287567, "mean_token_accuracy": 0.8881758004426956, "num_tokens": 301328566.0, "sample_num_tokens": 9099.5, "step": 1113, "total_num_tokens": 301364964.0, "z_loss": 0.0015712802996858954 }, { "copy_logits_max": -1.3709728717803955, "copy_logits_min": -687500032.0, "copy_num_tokens": 532.75, "epoch": 0.22752106203727343, "gen_logits_max": 8.010862350463867, "gen_logits_mean": -10.843987464904785, "gen_logits_min": -22.884252548217773, "gen_logits_std": 2.514824628829956, "gen_loss": 0.360541969537735, "grad_norm": 0.5741251748300978, "learning_rate": 2.9522526315789473e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9888391196727753, "mean_gen_accuracy": 0.8598349690437317, "mean_token_accuracy": 0.8900293558835983, "num_tokens": 301585546.0, "sample_num_tokens": 8770.5, "step": 1114, "total_num_tokens": 301620628.0, "z_loss": 0.0014700992032885551 }, { "copy_logits_max": -2.415285587310791, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.1875, "epoch": 0.22772529997447025, "gen_logits_max": 8.189931869506836, "gen_logits_mean": -9.723264694213867, "gen_logits_min": -21.663555145263672, "gen_logits_std": 2.476172924041748, "gen_loss": 0.3051207661628723, "grad_norm": 0.57203206337856, "learning_rate": 2.9521263157894738e-05, "loss": 0.3402, "mean_copy_accuracy": 0.9903652220964432, "mean_gen_accuracy": 0.8579502105712891, "mean_token_accuracy": 0.8915528208017349, "num_tokens": 301850206.0, "sample_num_tokens": 9119.5, "step": 1115, "total_num_tokens": 301886684.0, "z_loss": 0.0014160911086946726 }, { "copy_logits_max": -0.8031783103942871, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.5, "epoch": 0.2279295379116671, "gen_logits_max": 9.076091766357422, "gen_logits_mean": -9.943303108215332, "gen_logits_min": -22.218000411987305, "gen_logits_std": 2.5461864471435547, "gen_loss": 0.3600155711174011, "grad_norm": 0.5280108630040445, "learning_rate": 2.9520000000000002e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9916521310806274, "mean_gen_accuracy": 0.8524216264486313, "mean_token_accuracy": 0.8927188664674759, "num_tokens": 302139897.0, "sample_num_tokens": 7317.25, "step": 1116, "total_num_tokens": 302169166.0, "z_loss": 0.0015184159856289625 }, { "copy_logits_max": -2.561884641647339, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.75, "epoch": 0.22813377584886393, "gen_logits_max": 8.70661735534668, "gen_logits_mean": -10.862397193908691, "gen_logits_min": -22.773738861083984, "gen_logits_std": 2.476539134979248, "gen_loss": 0.36347976326942444, "grad_norm": 0.6909136953809889, "learning_rate": 2.9518736842105266e-05, "loss": 0.3821, "mean_copy_accuracy": 0.9902411997318268, "mean_gen_accuracy": 0.8420321196317673, "mean_token_accuracy": 0.8801357746124268, "num_tokens": 302392509.0, "sample_num_tokens": 7796.25, "step": 1117, "total_num_tokens": 302423694.0, "z_loss": 0.0014999615959823132 }, { "copy_logits_max": -1.1261272430419922, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.8125, "epoch": 0.22833801378606075, "gen_logits_max": 8.767335891723633, "gen_logits_mean": -10.237430572509766, "gen_logits_min": -21.946762084960938, "gen_logits_std": 2.4740045070648193, "gen_loss": 0.3671007454395294, "grad_norm": 0.534024616261435, "learning_rate": 2.9517473684210527e-05, "loss": 0.3462, "mean_copy_accuracy": 0.9915062040090561, "mean_gen_accuracy": 0.8503884673118591, "mean_token_accuracy": 0.8881139755249023, "num_tokens": 302691558.0, "sample_num_tokens": 7816.5, "step": 1118, "total_num_tokens": 302722824.0, "z_loss": 0.0014635883271694183 }, { "copy_logits_max": -1.3260982036590576, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.625, "epoch": 0.2285422517232576, "gen_logits_max": 9.130387306213379, "gen_logits_mean": -9.406564712524414, "gen_logits_min": -21.43808364868164, "gen_logits_std": 2.5177955627441406, "gen_loss": 0.3688895106315613, "grad_norm": 0.536214250222575, "learning_rate": 2.951621052631579e-05, "loss": 0.3548, "mean_copy_accuracy": 0.9897903650999069, "mean_gen_accuracy": 0.8502259403467178, "mean_token_accuracy": 0.8851721733808517, "num_tokens": 302975918.0, "sample_num_tokens": 8718.0, "step": 1119, "total_num_tokens": 303010790.0, "z_loss": 0.0016472884453833103 }, { "copy_logits_max": -0.4120732545852661, "copy_logits_min": -750000000.0, "copy_num_tokens": 266.5625, "epoch": 0.22874648966045444, "gen_logits_max": 8.622817039489746, "gen_logits_mean": -10.633647918701172, "gen_logits_min": -22.367525100708008, "gen_logits_std": 2.421581745147705, "gen_loss": 0.3968404531478882, "grad_norm": 0.5242543017398762, "learning_rate": 2.9514947368421052e-05, "loss": 0.3586, "mean_copy_accuracy": 0.9908465296030045, "mean_gen_accuracy": 0.8546687513589859, "mean_token_accuracy": 0.8840978592634201, "num_tokens": 303241026.0, "sample_num_tokens": 6621.5, "step": 1120, "total_num_tokens": 303267512.0, "z_loss": 0.0015899611171334982 }, { "copy_logits_max": -0.15046632289886475, "copy_logits_min": -687500032.0, "copy_num_tokens": 356.375, "epoch": 0.22895072759765125, "gen_logits_max": 8.666109085083008, "gen_logits_mean": -11.350486755371094, "gen_logits_min": -23.075145721435547, "gen_logits_std": 2.4556174278259277, "gen_loss": 0.3450695276260376, "grad_norm": 0.5961372909153733, "learning_rate": 2.9513684210526317e-05, "loss": 0.3678, "mean_copy_accuracy": 0.9900158494710922, "mean_gen_accuracy": 0.8518030494451523, "mean_token_accuracy": 0.8820698261260986, "num_tokens": 303482269.0, "sample_num_tokens": 7646.25, "step": 1121, "total_num_tokens": 303512854.0, "z_loss": 0.0016398290172219276 }, { "copy_logits_max": 0.0423465371131897, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.8125, "epoch": 0.2291549655348481, "gen_logits_max": 8.826926231384277, "gen_logits_mean": -9.764871597290039, "gen_logits_min": -21.755332946777344, "gen_logits_std": 2.492542028427124, "gen_loss": 0.3590700626373291, "grad_norm": 0.6212158709234175, "learning_rate": 2.9512421052631578e-05, "loss": 0.3594, "mean_copy_accuracy": 0.9893069714307785, "mean_gen_accuracy": 0.8479964733123779, "mean_token_accuracy": 0.8850659281015396, "num_tokens": 303744177.0, "sample_num_tokens": 7273.75, "step": 1122, "total_num_tokens": 303773272.0, "z_loss": 0.0016991092124953866 }, { "copy_logits_max": -0.6024185419082642, "copy_logits_min": -687500032.0, "copy_num_tokens": 382.3125, "epoch": 0.22935920347204494, "gen_logits_max": 8.813879013061523, "gen_logits_mean": -10.2476224899292, "gen_logits_min": -22.075923919677734, "gen_logits_std": 2.482475757598877, "gen_loss": 0.37884819507598877, "grad_norm": 0.6269945196508304, "learning_rate": 2.9511157894736842e-05, "loss": 0.3579, "mean_copy_accuracy": 0.9890977889299393, "mean_gen_accuracy": 0.8507500439882278, "mean_token_accuracy": 0.8850390017032623, "num_tokens": 304010078.0, "sample_num_tokens": 8364.0, "step": 1123, "total_num_tokens": 304043534.0, "z_loss": 0.0016079164342954755 }, { "copy_logits_max": 0.21131965517997742, "copy_logits_min": -750000000.0, "copy_num_tokens": 583.8125, "epoch": 0.22956344140924176, "gen_logits_max": 8.482307434082031, "gen_logits_mean": -10.254895210266113, "gen_logits_min": -22.217361450195312, "gen_logits_std": 2.503216028213501, "gen_loss": 0.352558970451355, "grad_norm": 0.5528000576301981, "learning_rate": 2.9509894736842103e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9905786365270615, "mean_gen_accuracy": 0.8556922823190689, "mean_token_accuracy": 0.8877019435167313, "num_tokens": 304268077.0, "sample_num_tokens": 9638.25, "step": 1124, "total_num_tokens": 304306630.0, "z_loss": 0.0017105634324252605 }, { "copy_logits_max": -0.6828374266624451, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.4375, "epoch": 0.2297676793464386, "gen_logits_max": 9.118316650390625, "gen_logits_mean": -9.994057655334473, "gen_logits_min": -21.692176818847656, "gen_logits_std": 2.453960418701172, "gen_loss": 0.37641555070877075, "grad_norm": 0.5153978047499737, "learning_rate": 2.950863157894737e-05, "loss": 0.3618, "mean_copy_accuracy": 0.9902622699737549, "mean_gen_accuracy": 0.8518550544977188, "mean_token_accuracy": 0.8818733245134354, "num_tokens": 304540895.0, "sample_num_tokens": 7241.75, "step": 1125, "total_num_tokens": 304569862.0, "z_loss": 0.0016171637689694762 }, { "copy_logits_max": -2.362255573272705, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 0.22997191728363545, "gen_logits_max": 9.454293251037598, "gen_logits_mean": -9.562220573425293, "gen_logits_min": -21.834304809570312, "gen_logits_std": 2.5357718467712402, "gen_loss": 0.3499123752117157, "grad_norm": 0.6784144631556713, "learning_rate": 2.950736842105263e-05, "loss": 0.313, "mean_copy_accuracy": 0.9909267425537109, "mean_gen_accuracy": 0.8600040525197983, "mean_token_accuracy": 0.8975463211536407, "num_tokens": 304819545.0, "sample_num_tokens": 8761.75, "step": 1126, "total_num_tokens": 304854592.0, "z_loss": 0.0016308617778122425 }, { "copy_logits_max": -0.1996278166770935, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.0, "epoch": 0.23017615522083226, "gen_logits_max": 9.054884910583496, "gen_logits_mean": -8.96180534362793, "gen_logits_min": -21.293521881103516, "gen_logits_std": 2.540095806121826, "gen_loss": 0.3544788956642151, "grad_norm": 0.6344865366134667, "learning_rate": 2.9506105263157896e-05, "loss": 0.3694, "mean_copy_accuracy": 0.9905581176280975, "mean_gen_accuracy": 0.8497786819934845, "mean_token_accuracy": 0.8840308338403702, "num_tokens": 305081863.0, "sample_num_tokens": 8953.25, "step": 1127, "total_num_tokens": 305117676.0, "z_loss": 0.0020487026777118444 }, { "copy_logits_max": -1.127323031425476, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.8125, "epoch": 0.2303803931580291, "gen_logits_max": 9.477380752563477, "gen_logits_mean": -9.087181091308594, "gen_logits_min": -20.903709411621094, "gen_logits_std": 2.5027575492858887, "gen_loss": 0.31831037998199463, "grad_norm": 0.5176347021672248, "learning_rate": 2.950484210526316e-05, "loss": 0.3586, "mean_copy_accuracy": 0.9923423677682877, "mean_gen_accuracy": 0.8459834307432175, "mean_token_accuracy": 0.8843473047018051, "num_tokens": 305345386.0, "sample_num_tokens": 8796.0, "step": 1128, "total_num_tokens": 305380570.0, "z_loss": 0.0021622921340167522 }, { "copy_logits_max": -1.0267505645751953, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.5625, "epoch": 0.23058463109522595, "gen_logits_max": 9.161513328552246, "gen_logits_mean": -9.6737642288208, "gen_logits_min": -21.84505844116211, "gen_logits_std": 2.479093313217163, "gen_loss": 0.366133451461792, "grad_norm": 0.5567707217472222, "learning_rate": 2.950357894736842e-05, "loss": 0.3544, "mean_copy_accuracy": 0.989356055855751, "mean_gen_accuracy": 0.8561819940805435, "mean_token_accuracy": 0.8864751160144806, "num_tokens": 305615125.0, "sample_num_tokens": 7697.25, "step": 1129, "total_num_tokens": 305645914.0, "z_loss": 0.002123043639585376 }, { "copy_logits_max": 1.9845845699310303, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.375, "epoch": 0.23078886903242277, "gen_logits_max": 8.751199722290039, "gen_logits_mean": -9.312829971313477, "gen_logits_min": -21.742815017700195, "gen_logits_std": 2.560478925704956, "gen_loss": 0.33279740810394287, "grad_norm": 0.5560604536266364, "learning_rate": 2.9502315789473685e-05, "loss": 0.3458, "mean_copy_accuracy": 0.9907363951206207, "mean_gen_accuracy": 0.8529186993837357, "mean_token_accuracy": 0.8893183320760727, "num_tokens": 305898846.0, "sample_num_tokens": 9605.5, "step": 1130, "total_num_tokens": 305937268.0, "z_loss": 0.0022187726572155952 }, { "copy_logits_max": -0.7963358759880066, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.875, "epoch": 0.2309931069696196, "gen_logits_max": 8.70469856262207, "gen_logits_mean": -10.67834186553955, "gen_logits_min": -22.7578067779541, "gen_logits_std": 2.4782979488372803, "gen_loss": 0.33066338300704956, "grad_norm": 0.5215698034679427, "learning_rate": 2.9501052631578946e-05, "loss": 0.359, "mean_copy_accuracy": 0.9898350983858109, "mean_gen_accuracy": 0.8569908440113068, "mean_token_accuracy": 0.8860212564468384, "num_tokens": 306173043.0, "sample_num_tokens": 8436.25, "step": 1131, "total_num_tokens": 306206788.0, "z_loss": 0.0019524461822584271 }, { "copy_logits_max": 0.24815088510513306, "copy_logits_min": -625000000.0, "copy_num_tokens": 407.5, "epoch": 0.23119734490681645, "gen_logits_max": 9.031352043151855, "gen_logits_mean": -9.545637130737305, "gen_logits_min": -21.361968994140625, "gen_logits_std": 2.450824737548828, "gen_loss": 0.3856949210166931, "grad_norm": 0.5731223708627267, "learning_rate": 2.949978947368421e-05, "loss": 0.3754, "mean_copy_accuracy": 0.989528477191925, "mean_gen_accuracy": 0.8469913005828857, "mean_token_accuracy": 0.8807033598423004, "num_tokens": 306427878.0, "sample_num_tokens": 8597.0, "step": 1132, "total_num_tokens": 306462266.0, "z_loss": 0.0019393876427784562 }, { "copy_logits_max": 0.40648168325424194, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.8125, "epoch": 0.23140158284401327, "gen_logits_max": 9.208658218383789, "gen_logits_mean": -9.589578628540039, "gen_logits_min": -21.755556106567383, "gen_logits_std": 2.5346016883850098, "gen_loss": 0.3511500358581543, "grad_norm": 0.5446178655566771, "learning_rate": 2.9498526315789475e-05, "loss": 0.3622, "mean_copy_accuracy": 0.9910148978233337, "mean_gen_accuracy": 0.8461022228002548, "mean_token_accuracy": 0.8796751499176025, "num_tokens": 306683658.0, "sample_num_tokens": 6885.5, "step": 1133, "total_num_tokens": 306711200.0, "z_loss": 0.0017425676342099905 }, { "copy_logits_max": -1.6306138038635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 625.3125, "epoch": 0.23160582078121011, "gen_logits_max": 8.041486740112305, "gen_logits_mean": -9.193105697631836, "gen_logits_min": -21.249887466430664, "gen_logits_std": 2.4511873722076416, "gen_loss": 0.3583366870880127, "grad_norm": 0.6339805594500978, "learning_rate": 2.949726315789474e-05, "loss": 0.358, "mean_copy_accuracy": 0.9918917864561081, "mean_gen_accuracy": 0.8520492017269135, "mean_token_accuracy": 0.8862878531217575, "num_tokens": 306940622.0, "sample_num_tokens": 10262.0, "step": 1134, "total_num_tokens": 306981670.0, "z_loss": 0.001620055758394301 }, { "copy_logits_max": -3.588073253631592, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.4375, "epoch": 0.23181005871840693, "gen_logits_max": 8.102030754089355, "gen_logits_mean": -10.481854438781738, "gen_logits_min": -22.38802146911621, "gen_logits_std": 2.473775863647461, "gen_loss": 0.29809290170669556, "grad_norm": 0.4928237064971948, "learning_rate": 2.9496e-05, "loss": 0.3333, "mean_copy_accuracy": 0.9895037859678268, "mean_gen_accuracy": 0.860725998878479, "mean_token_accuracy": 0.8908811509609222, "num_tokens": 307207842.0, "sample_num_tokens": 8697.5, "step": 1135, "total_num_tokens": 307242632.0, "z_loss": 0.0012017064727842808 }, { "copy_logits_max": -0.7778365612030029, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.5625, "epoch": 0.23201429665560377, "gen_logits_max": 8.699311256408691, "gen_logits_mean": -9.850530624389648, "gen_logits_min": -22.071510314941406, "gen_logits_std": 2.52231764793396, "gen_loss": 0.36597949266433716, "grad_norm": 0.5418692729850121, "learning_rate": 2.9494736842105264e-05, "loss": 0.3521, "mean_copy_accuracy": 0.9907879382371902, "mean_gen_accuracy": 0.855619415640831, "mean_token_accuracy": 0.8849009722471237, "num_tokens": 307465882.0, "sample_num_tokens": 8078.0, "step": 1136, "total_num_tokens": 307498194.0, "z_loss": 0.0015151422703638673 }, { "copy_logits_max": -1.8544301986694336, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.75, "epoch": 0.23221853459280062, "gen_logits_max": 8.805397033691406, "gen_logits_mean": -10.06389045715332, "gen_logits_min": -21.789012908935547, "gen_logits_std": 2.4636893272399902, "gen_loss": 0.3952197730541229, "grad_norm": 0.5741685520908226, "learning_rate": 2.9493473684210525e-05, "loss": 0.362, "mean_copy_accuracy": 0.9892590939998627, "mean_gen_accuracy": 0.8523771017789841, "mean_token_accuracy": 0.8825730979442596, "num_tokens": 307737984.0, "sample_num_tokens": 9123.0, "step": 1137, "total_num_tokens": 307774476.0, "z_loss": 0.0014184832107275724 }, { "copy_logits_max": -1.052569031715393, "copy_logits_min": -750000064.0, "copy_num_tokens": 481.6875, "epoch": 0.23242277252999743, "gen_logits_max": 8.350311279296875, "gen_logits_mean": -10.91566276550293, "gen_logits_min": -22.944496154785156, "gen_logits_std": 2.5067596435546875, "gen_loss": 0.38747745752334595, "grad_norm": 0.5012556850913297, "learning_rate": 2.949221052631579e-05, "loss": 0.4054, "mean_copy_accuracy": 0.9899354726076126, "mean_gen_accuracy": 0.8353438526391983, "mean_token_accuracy": 0.8692442327737808, "num_tokens": 307999284.0, "sample_num_tokens": 8358.5, "step": 1138, "total_num_tokens": 308032718.0, "z_loss": 0.0014566433383151889 }, { "copy_logits_max": -1.0729079246520996, "copy_logits_min": -687500032.0, "copy_num_tokens": 804.8125, "epoch": 0.23262701046719428, "gen_logits_max": 7.961200714111328, "gen_logits_mean": -9.759612083435059, "gen_logits_min": -22.20786476135254, "gen_logits_std": 2.567577362060547, "gen_loss": 0.2704240381717682, "grad_norm": 0.5203781588952487, "learning_rate": 2.949094736842105e-05, "loss": 0.3081, "mean_copy_accuracy": 0.9911412000656128, "mean_gen_accuracy": 0.865393802523613, "mean_token_accuracy": 0.8992937356233597, "num_tokens": 308261182.0, "sample_num_tokens": 10833.5, "step": 1139, "total_num_tokens": 308304516.0, "z_loss": 0.0014898102963343263 }, { "copy_logits_max": -0.7157195806503296, "copy_logits_min": -687500032.0, "copy_num_tokens": 635.5, "epoch": 0.23283124840439112, "gen_logits_max": 8.516223907470703, "gen_logits_mean": -9.502522468566895, "gen_logits_min": -21.918142318725586, "gen_logits_std": 2.5246152877807617, "gen_loss": 0.3089194893836975, "grad_norm": 0.5235595909236636, "learning_rate": 2.9489684210526315e-05, "loss": 0.333, "mean_copy_accuracy": 0.9898610860109329, "mean_gen_accuracy": 0.8635441809892654, "mean_token_accuracy": 0.8920682221651077, "num_tokens": 308549099.0, "sample_num_tokens": 9805.25, "step": 1140, "total_num_tokens": 308588320.0, "z_loss": 0.0015808974858373404 }, { "copy_logits_max": -0.18835043907165527, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.375, "epoch": 0.23303548634158794, "gen_logits_max": 8.55624008178711, "gen_logits_mean": -9.749363899230957, "gen_logits_min": -22.286767959594727, "gen_logits_std": 2.534364700317383, "gen_loss": 0.3351590633392334, "grad_norm": 0.5009695925711257, "learning_rate": 2.9488421052631583e-05, "loss": 0.3415, "mean_copy_accuracy": 0.991107165813446, "mean_gen_accuracy": 0.8568664193153381, "mean_token_accuracy": 0.8902351558208466, "num_tokens": 308828544.0, "sample_num_tokens": 7694.0, "step": 1141, "total_num_tokens": 308859320.0, "z_loss": 0.0015687126433476806 }, { "copy_logits_max": 0.019363924860954285, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.1875, "epoch": 0.23323972427878478, "gen_logits_max": 9.364989280700684, "gen_logits_mean": -9.882855415344238, "gen_logits_min": -21.950469970703125, "gen_logits_std": 2.5149784088134766, "gen_loss": 0.3506237864494324, "grad_norm": 0.5057810674533544, "learning_rate": 2.9487157894736844e-05, "loss": 0.3629, "mean_copy_accuracy": 0.9905172139406204, "mean_gen_accuracy": 0.8511337488889694, "mean_token_accuracy": 0.8820692896842957, "num_tokens": 309090408.0, "sample_num_tokens": 8254.0, "step": 1142, "total_num_tokens": 309123424.0, "z_loss": 0.0016689938493072987 }, { "copy_logits_max": -1.055208683013916, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.5625, "epoch": 0.23344396221598163, "gen_logits_max": 9.460268020629883, "gen_logits_mean": -9.048927307128906, "gen_logits_min": -20.861026763916016, "gen_logits_std": 2.4910449981689453, "gen_loss": 0.37985578179359436, "grad_norm": 0.6548691548067829, "learning_rate": 2.9485894736842108e-05, "loss": 0.3839, "mean_copy_accuracy": 0.9901761561632156, "mean_gen_accuracy": 0.8423631191253662, "mean_token_accuracy": 0.8779601901769638, "num_tokens": 309373921.0, "sample_num_tokens": 9134.25, "step": 1143, "total_num_tokens": 309410458.0, "z_loss": 0.0016992473974823952 }, { "copy_logits_max": -0.37779757380485535, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.25, "epoch": 0.23364820015317844, "gen_logits_max": 8.78443717956543, "gen_logits_mean": -9.700325012207031, "gen_logits_min": -21.725881576538086, "gen_logits_std": 2.4982688426971436, "gen_loss": 0.32809799909591675, "grad_norm": 0.5314007266365322, "learning_rate": 2.948463157894737e-05, "loss": 0.3525, "mean_copy_accuracy": 0.9895092248916626, "mean_gen_accuracy": 0.855770617723465, "mean_token_accuracy": 0.8869968503713608, "num_tokens": 309634365.0, "sample_num_tokens": 9202.25, "step": 1144, "total_num_tokens": 309671174.0, "z_loss": 0.001829147688113153 }, { "copy_logits_max": -2.2203330993652344, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.875, "epoch": 0.2338524380903753, "gen_logits_max": 9.178352355957031, "gen_logits_mean": -10.107515335083008, "gen_logits_min": -21.687219619750977, "gen_logits_std": 2.474853038787842, "gen_loss": 0.3500468134880066, "grad_norm": 0.4866304828843984, "learning_rate": 2.9483368421052633e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9927762001752853, "mean_gen_accuracy": 0.8513533174991608, "mean_token_accuracy": 0.8842209130525589, "num_tokens": 309900362.0, "sample_num_tokens": 7827.5, "step": 1145, "total_num_tokens": 309931672.0, "z_loss": 0.0015063988976180553 }, { "copy_logits_max": -0.5447630286216736, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.1875, "epoch": 0.23405667602757213, "gen_logits_max": 9.384729385375977, "gen_logits_mean": -10.429612159729004, "gen_logits_min": -22.701473236083984, "gen_logits_std": 2.5117712020874023, "gen_loss": 0.3252831697463989, "grad_norm": 0.5876889294459529, "learning_rate": 2.9482105263157894e-05, "loss": 0.3394, "mean_copy_accuracy": 0.9901206642389297, "mean_gen_accuracy": 0.8583519011735916, "mean_token_accuracy": 0.8918797671794891, "num_tokens": 310178537.0, "sample_num_tokens": 9049.75, "step": 1146, "total_num_tokens": 310214736.0, "z_loss": 0.0014783763326704502 }, { "copy_logits_max": -0.9250333309173584, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.625, "epoch": 0.23426091396476895, "gen_logits_max": 8.884437561035156, "gen_logits_mean": -9.610978126525879, "gen_logits_min": -21.881498336791992, "gen_logits_std": 2.5341107845306396, "gen_loss": 0.30584728717803955, "grad_norm": 0.4992802840045353, "learning_rate": 2.948084210526316e-05, "loss": 0.3289, "mean_copy_accuracy": 0.9893599152565002, "mean_gen_accuracy": 0.8650055527687073, "mean_token_accuracy": 0.8929306417703629, "num_tokens": 310454083.0, "sample_num_tokens": 9225.25, "step": 1147, "total_num_tokens": 310490984.0, "z_loss": 0.0013921104837208986 }, { "copy_logits_max": -0.08655518293380737, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.125, "epoch": 0.2344651519019658, "gen_logits_max": 9.007913589477539, "gen_logits_mean": -9.330877304077148, "gen_logits_min": -22.559648513793945, "gen_logits_std": 2.584089517593384, "gen_loss": 0.3129342198371887, "grad_norm": 0.5735727993577645, "learning_rate": 2.947957894736842e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9891352951526642, "mean_gen_accuracy": 0.8528084754943848, "mean_token_accuracy": 0.8907178044319153, "num_tokens": 310721350.0, "sample_num_tokens": 8175.0, "step": 1148, "total_num_tokens": 310754050.0, "z_loss": 0.0014862597454339266 }, { "copy_logits_max": 1.1483713388442993, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.0, "epoch": 0.23466938983916263, "gen_logits_max": 8.993080139160156, "gen_logits_mean": -9.317887306213379, "gen_logits_min": -21.94519805908203, "gen_logits_std": 2.576490640640259, "gen_loss": 0.35392308235168457, "grad_norm": 0.5210723612133364, "learning_rate": 2.9478315789473687e-05, "loss": 0.3438, "mean_copy_accuracy": 0.9908387809991837, "mean_gen_accuracy": 0.8481380790472031, "mean_token_accuracy": 0.8872855454683304, "num_tokens": 310997049.0, "sample_num_tokens": 9747.75, "step": 1149, "total_num_tokens": 311036040.0, "z_loss": 0.001647773664444685 }, { "copy_logits_max": 1.192122220993042, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.8125, "epoch": 0.23487362777635945, "gen_logits_max": 8.250093460083008, "gen_logits_mean": -11.21756362915039, "gen_logits_min": -23.361101150512695, "gen_logits_std": 2.5088131427764893, "gen_loss": 0.3185332715511322, "grad_norm": 0.47977356323157405, "learning_rate": 2.9477052631578948e-05, "loss": 0.3343, "mean_copy_accuracy": 0.9920918643474579, "mean_gen_accuracy": 0.8626367300748825, "mean_token_accuracy": 0.8925266116857529, "num_tokens": 311265491.0, "sample_num_tokens": 8219.25, "step": 1150, "total_num_tokens": 311298368.0, "z_loss": 0.0019424305064603686 }, { "copy_logits_max": 0.18561846017837524, "copy_logits_min": -750000000.0, "copy_num_tokens": 652.0625, "epoch": 0.2350778657135563, "gen_logits_max": 8.196444511413574, "gen_logits_mean": -9.857038497924805, "gen_logits_min": -22.426761627197266, "gen_logits_std": 2.5521745681762695, "gen_loss": 0.274649441242218, "grad_norm": 0.5262891400046442, "learning_rate": 2.9475789473684212e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9911312162876129, "mean_gen_accuracy": 0.8604269921779633, "mean_token_accuracy": 0.8929868787527084, "num_tokens": 311516229.0, "sample_num_tokens": 9017.25, "step": 1151, "total_num_tokens": 311552298.0, "z_loss": 0.0018320860108360648 }, { "copy_logits_max": -1.6408360004425049, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.0, "epoch": 0.23528210365075314, "gen_logits_max": 9.31714153289795, "gen_logits_mean": -10.143449783325195, "gen_logits_min": -22.306636810302734, "gen_logits_std": 2.5209436416625977, "gen_loss": 0.3530082106590271, "grad_norm": 0.56621979451708, "learning_rate": 2.9474526315789473e-05, "loss": 0.3638, "mean_copy_accuracy": 0.9875444024801254, "mean_gen_accuracy": 0.8523143976926804, "mean_token_accuracy": 0.8799237161874771, "num_tokens": 311786442.0, "sample_num_tokens": 9157.5, "step": 1152, "total_num_tokens": 311823072.0, "z_loss": 0.0016424893401563168 }, { "copy_logits_max": -1.564422607421875, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.6875, "epoch": 0.23548634158794995, "gen_logits_max": 8.410755157470703, "gen_logits_mean": -10.922048568725586, "gen_logits_min": -22.70742416381836, "gen_logits_std": 2.5181679725646973, "gen_loss": 0.30779945850372314, "grad_norm": 0.503338553550188, "learning_rate": 2.9473263157894737e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9905869215726852, "mean_gen_accuracy": 0.8705413639545441, "mean_token_accuracy": 0.9010362327098846, "num_tokens": 312063297.0, "sample_num_tokens": 6959.25, "step": 1153, "total_num_tokens": 312091134.0, "z_loss": 0.0013874201104044914 }, { "copy_logits_max": -0.19357551634311676, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.0625, "epoch": 0.2356905795251468, "gen_logits_max": 7.6026411056518555, "gen_logits_mean": -11.733712196350098, "gen_logits_min": -23.808177947998047, "gen_logits_std": 2.4871091842651367, "gen_loss": 0.3264102339744568, "grad_norm": 0.6730216796790058, "learning_rate": 2.9472000000000002e-05, "loss": 0.3409, "mean_copy_accuracy": 0.9895376265048981, "mean_gen_accuracy": 0.8580209910869598, "mean_token_accuracy": 0.8918885290622711, "num_tokens": 312327587.0, "sample_num_tokens": 9095.25, "step": 1154, "total_num_tokens": 312363968.0, "z_loss": 0.001611854531802237 }, { "copy_logits_max": -0.48097702860832214, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.25, "epoch": 0.23589481746234364, "gen_logits_max": 9.090719223022461, "gen_logits_mean": -10.253727912902832, "gen_logits_min": -22.6010684967041, "gen_logits_std": 2.5467867851257324, "gen_loss": 0.33401140570640564, "grad_norm": 0.5062797977996065, "learning_rate": 2.9470736842105263e-05, "loss": 0.343, "mean_copy_accuracy": 0.9922926425933838, "mean_gen_accuracy": 0.8534178733825684, "mean_token_accuracy": 0.8900642246007919, "num_tokens": 312606926.0, "sample_num_tokens": 8747.5, "step": 1155, "total_num_tokens": 312641916.0, "z_loss": 0.0016105571994557977 }, { "copy_logits_max": 0.3458763659000397, "copy_logits_min": -687500032.0, "copy_num_tokens": 737.25, "epoch": 0.23609905539954046, "gen_logits_max": 8.920453071594238, "gen_logits_mean": -9.59952163696289, "gen_logits_min": -21.86655044555664, "gen_logits_std": 2.524622917175293, "gen_loss": 0.3428114652633667, "grad_norm": 0.7896596295856195, "learning_rate": 2.9469473684210527e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9866489917039871, "mean_gen_accuracy": 0.8573978990316391, "mean_token_accuracy": 0.8881596922874451, "num_tokens": 312877771.0, "sample_num_tokens": 9880.25, "step": 1156, "total_num_tokens": 312917292.0, "z_loss": 0.001556484610773623 }, { "copy_logits_max": -2.4956843852996826, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.5625, "epoch": 0.2363032933367373, "gen_logits_max": 8.547087669372559, "gen_logits_mean": -10.546716690063477, "gen_logits_min": -22.215600967407227, "gen_logits_std": 2.4558820724487305, "gen_loss": 0.35510775446891785, "grad_norm": 0.5642706592788479, "learning_rate": 2.946821052631579e-05, "loss": 0.3572, "mean_copy_accuracy": 0.992350161075592, "mean_gen_accuracy": 0.8480297923088074, "mean_token_accuracy": 0.8808615207672119, "num_tokens": 313145236.0, "sample_num_tokens": 8311.0, "step": 1157, "total_num_tokens": 313178480.0, "z_loss": 0.0014676630962640047 }, { "copy_logits_max": -0.10827982425689697, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.1875, "epoch": 0.23650753127393415, "gen_logits_max": 9.120996475219727, "gen_logits_mean": -10.44915771484375, "gen_logits_min": -22.48239517211914, "gen_logits_std": 2.4790637493133545, "gen_loss": 0.3813396096229553, "grad_norm": 0.8927195541330747, "learning_rate": 2.9466947368421056e-05, "loss": 0.3525, "mean_copy_accuracy": 0.9909047931432724, "mean_gen_accuracy": 0.8485053926706314, "mean_token_accuracy": 0.8864712715148926, "num_tokens": 313408640.0, "sample_num_tokens": 7096.0, "step": 1158, "total_num_tokens": 313437024.0, "z_loss": 0.0016672855708748102 }, { "copy_logits_max": -1.3107811212539673, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.8125, "epoch": 0.23671176921113096, "gen_logits_max": 9.621492385864258, "gen_logits_mean": -9.734922409057617, "gen_logits_min": -21.532556533813477, "gen_logits_std": 2.499880790710449, "gen_loss": 0.3860318660736084, "grad_norm": 0.9552247218821021, "learning_rate": 2.9465684210526317e-05, "loss": 0.3687, "mean_copy_accuracy": 0.9875736236572266, "mean_gen_accuracy": 0.8438731282949448, "mean_token_accuracy": 0.8839118629693985, "num_tokens": 313685121.0, "sample_num_tokens": 8140.75, "step": 1159, "total_num_tokens": 313717684.0, "z_loss": 0.0015481733717024326 }, { "copy_logits_max": -0.5502371788024902, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.9375, "epoch": 0.2369160071483278, "gen_logits_max": 9.293718338012695, "gen_logits_mean": -8.415252685546875, "gen_logits_min": -20.63410758972168, "gen_logits_std": 2.566964626312256, "gen_loss": 0.3726327419281006, "grad_norm": 0.6619458117634544, "learning_rate": 2.946442105263158e-05, "loss": 0.3706, "mean_copy_accuracy": 0.9894945472478867, "mean_gen_accuracy": 0.8480051308870316, "mean_token_accuracy": 0.8819383233785629, "num_tokens": 313948280.0, "sample_num_tokens": 9501.5, "step": 1160, "total_num_tokens": 313986286.0, "z_loss": 0.00172066455706954 }, { "copy_logits_max": 0.0330776572227478, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.625, "epoch": 0.23712024508552462, "gen_logits_max": 8.332687377929688, "gen_logits_mean": -10.573493957519531, "gen_logits_min": -22.810056686401367, "gen_logits_std": 2.5035390853881836, "gen_loss": 0.34524497389793396, "grad_norm": 0.6768038917249585, "learning_rate": 2.9463157894736842e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9909368604421616, "mean_gen_accuracy": 0.8479841649532318, "mean_token_accuracy": 0.8866255432367325, "num_tokens": 314210386.0, "sample_num_tokens": 7479.5, "step": 1161, "total_num_tokens": 314240304.0, "z_loss": 0.0016670033801347017 }, { "copy_logits_max": -1.2634708881378174, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.25, "epoch": 0.23732448302272147, "gen_logits_max": 8.963406562805176, "gen_logits_mean": -9.41102409362793, "gen_logits_min": -21.833593368530273, "gen_logits_std": 2.5398976802825928, "gen_loss": 0.3107109069824219, "grad_norm": 0.5987055148166899, "learning_rate": 2.9461894736842106e-05, "loss": 0.336, "mean_copy_accuracy": 0.9884628802537918, "mean_gen_accuracy": 0.8616666793823242, "mean_token_accuracy": 0.8908724337816238, "num_tokens": 314449333.0, "sample_num_tokens": 9283.25, "step": 1162, "total_num_tokens": 314486466.0, "z_loss": 0.001530101173557341 }, { "copy_logits_max": -2.3382606506347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.0, "epoch": 0.2375287209599183, "gen_logits_max": 8.672111511230469, "gen_logits_mean": -10.242536544799805, "gen_logits_min": -22.667781829833984, "gen_logits_std": 2.514882802963257, "gen_loss": 0.34906935691833496, "grad_norm": 0.627530234356713, "learning_rate": 2.9460631578947367e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9879079908132553, "mean_gen_accuracy": 0.8577995002269745, "mean_token_accuracy": 0.8860465735197067, "num_tokens": 314706729.0, "sample_num_tokens": 8020.75, "step": 1163, "total_num_tokens": 314738812.0, "z_loss": 0.0013217173982411623 }, { "copy_logits_max": -2.274040937423706, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.375, "epoch": 0.23773295889711513, "gen_logits_max": 8.497142791748047, "gen_logits_mean": -10.195878982543945, "gen_logits_min": -22.0260009765625, "gen_logits_std": 2.4598140716552734, "gen_loss": 0.3691169023513794, "grad_norm": 0.5249891382367078, "learning_rate": 2.945936842105263e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9890104979276657, "mean_gen_accuracy": 0.8596836179494858, "mean_token_accuracy": 0.888707846403122, "num_tokens": 314973671.0, "sample_num_tokens": 7949.75, "step": 1164, "total_num_tokens": 315005470.0, "z_loss": 0.0014873349573463202 }, { "copy_logits_max": -1.0638470649719238, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.625, "epoch": 0.23793719683431197, "gen_logits_max": 8.065741539001465, "gen_logits_mean": -9.797774314880371, "gen_logits_min": -22.453689575195312, "gen_logits_std": 2.5429935455322266, "gen_loss": 0.2922055721282959, "grad_norm": 0.5675227312127452, "learning_rate": 2.9458105263157896e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9903731942176819, "mean_gen_accuracy": 0.8604232519865036, "mean_token_accuracy": 0.8958407342433929, "num_tokens": 315251116.0, "sample_num_tokens": 8709.5, "step": 1165, "total_num_tokens": 315285954.0, "z_loss": 0.0018527174834161997 }, { "copy_logits_max": 1.7949318885803223, "copy_logits_min": -687500032.0, "copy_num_tokens": 634.1875, "epoch": 0.23814143477150881, "gen_logits_max": 8.550647735595703, "gen_logits_mean": -9.790752410888672, "gen_logits_min": -22.146305084228516, "gen_logits_std": 2.549241781234741, "gen_loss": 0.34666162729263306, "grad_norm": 0.5138050087266433, "learning_rate": 2.945684210526316e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9904067367315292, "mean_gen_accuracy": 0.8531298935413361, "mean_token_accuracy": 0.8919372111558914, "num_tokens": 315536685.0, "sample_num_tokens": 9229.25, "step": 1166, "total_num_tokens": 315573602.0, "z_loss": 0.001979718916118145 }, { "copy_logits_max": 0.6604405641555786, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.375, "epoch": 0.23834567270870563, "gen_logits_max": 9.208053588867188, "gen_logits_mean": -9.806910514831543, "gen_logits_min": -21.820297241210938, "gen_logits_std": 2.4956817626953125, "gen_loss": 0.42240968346595764, "grad_norm": 0.5316443811650641, "learning_rate": 2.9455578947368424e-05, "loss": 0.3855, "mean_copy_accuracy": 0.9887826591730118, "mean_gen_accuracy": 0.8489603251218796, "mean_token_accuracy": 0.877769723534584, "num_tokens": 315816461.0, "sample_num_tokens": 8176.75, "step": 1167, "total_num_tokens": 315849168.0, "z_loss": 0.0018165252404287457 }, { "copy_logits_max": -1.2922513484954834, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.1875, "epoch": 0.23854991064590247, "gen_logits_max": 8.690006256103516, "gen_logits_mean": -9.126399040222168, "gen_logits_min": -21.299468994140625, "gen_logits_std": 2.5181665420532227, "gen_loss": 0.35706034302711487, "grad_norm": 0.5620598701266297, "learning_rate": 2.9454315789473685e-05, "loss": 0.3542, "mean_copy_accuracy": 0.9901202619075775, "mean_gen_accuracy": 0.8490954339504242, "mean_token_accuracy": 0.8865758180618286, "num_tokens": 316080875.0, "sample_num_tokens": 8198.75, "step": 1168, "total_num_tokens": 316113670.0, "z_loss": 0.0016953282756730914 }, { "copy_logits_max": -1.5213208198547363, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.875, "epoch": 0.23875414858309932, "gen_logits_max": 8.846505165100098, "gen_logits_mean": -9.0303955078125, "gen_logits_min": -21.224523544311523, "gen_logits_std": 2.5223050117492676, "gen_loss": 0.348556786775589, "grad_norm": 0.5858978815552259, "learning_rate": 2.945305263157895e-05, "loss": 0.3577, "mean_copy_accuracy": 0.9926963895559311, "mean_gen_accuracy": 0.8533816486597061, "mean_token_accuracy": 0.8876344412565231, "num_tokens": 316359083.0, "sample_num_tokens": 7710.25, "step": 1169, "total_num_tokens": 316389924.0, "z_loss": 0.0017734671710059047 }, { "copy_logits_max": -2.336846113204956, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.1875, "epoch": 0.23895838652029613, "gen_logits_max": 8.74354076385498, "gen_logits_mean": -10.596410751342773, "gen_logits_min": -22.35547637939453, "gen_logits_std": 2.4867916107177734, "gen_loss": 0.3424605131149292, "grad_norm": 0.501458720932367, "learning_rate": 2.945178947368421e-05, "loss": 0.38, "mean_copy_accuracy": 0.9918844252824783, "mean_gen_accuracy": 0.8441557437181473, "mean_token_accuracy": 0.878716379404068, "num_tokens": 316633732.0, "sample_num_tokens": 7810.5, "step": 1170, "total_num_tokens": 316664974.0, "z_loss": 0.001414105761796236 }, { "copy_logits_max": -0.2866622805595398, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.4375, "epoch": 0.23916262445749298, "gen_logits_max": 8.78296947479248, "gen_logits_mean": -9.907130241394043, "gen_logits_min": -22.416156768798828, "gen_logits_std": 2.527985095977783, "gen_loss": 0.3641641438007355, "grad_norm": 0.5419640018202597, "learning_rate": 2.9450526315789475e-05, "loss": 0.3525, "mean_copy_accuracy": 0.989768385887146, "mean_gen_accuracy": 0.8542805016040802, "mean_token_accuracy": 0.8862264007329941, "num_tokens": 316890599.0, "sample_num_tokens": 7459.25, "step": 1171, "total_num_tokens": 316920436.0, "z_loss": 0.0016109831631183624 }, { "copy_logits_max": -0.3332346975803375, "copy_logits_min": -687500032.0, "copy_num_tokens": 474.3125, "epoch": 0.23936686239468982, "gen_logits_max": 8.772971153259277, "gen_logits_mean": -10.167765617370605, "gen_logits_min": -22.19228172302246, "gen_logits_std": 2.4973907470703125, "gen_loss": 0.37457531690597534, "grad_norm": 0.5094173688297331, "learning_rate": 2.9449263157894736e-05, "loss": 0.3683, "mean_copy_accuracy": 0.9901164323091507, "mean_gen_accuracy": 0.8504528701305389, "mean_token_accuracy": 0.8802593350410461, "num_tokens": 317155117.0, "sample_num_tokens": 8442.25, "step": 1172, "total_num_tokens": 317188886.0, "z_loss": 0.0016041398048400879 }, { "copy_logits_max": -2.9817967414855957, "copy_logits_min": -750000064.0, "copy_num_tokens": 242.375, "epoch": 0.23957110033188664, "gen_logits_max": 8.685104370117188, "gen_logits_mean": -11.897454261779785, "gen_logits_min": -23.14165687561035, "gen_logits_std": 2.389449119567871, "gen_loss": 0.3837388753890991, "grad_norm": 0.5541756865806073, "learning_rate": 2.9448e-05, "loss": 0.355, "mean_copy_accuracy": 0.9877657443284988, "mean_gen_accuracy": 0.851537674665451, "mean_token_accuracy": 0.8850230425596237, "num_tokens": 317417446.0, "sample_num_tokens": 7037.5, "step": 1173, "total_num_tokens": 317445596.0, "z_loss": 0.001410533906891942 }, { "copy_logits_max": -1.393652319908142, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.6875, "epoch": 0.23977533826908348, "gen_logits_max": 8.675139427185059, "gen_logits_mean": -10.720748901367188, "gen_logits_min": -22.646150588989258, "gen_logits_std": 2.449523448944092, "gen_loss": 0.3614034354686737, "grad_norm": 0.5764828928989076, "learning_rate": 2.9446736842105264e-05, "loss": 0.3686, "mean_copy_accuracy": 0.989457905292511, "mean_gen_accuracy": 0.8481398075819016, "mean_token_accuracy": 0.8831634223461151, "num_tokens": 317681590.0, "sample_num_tokens": 8169.0, "step": 1174, "total_num_tokens": 317714266.0, "z_loss": 0.001561178476549685 }, { "copy_logits_max": -2.059865951538086, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.0625, "epoch": 0.23997957620628033, "gen_logits_max": 8.29513931274414, "gen_logits_mean": -10.179373741149902, "gen_logits_min": -21.844881057739258, "gen_logits_std": 2.4874327182769775, "gen_loss": 0.3435792326927185, "grad_norm": 0.5253036261627086, "learning_rate": 2.944547368421053e-05, "loss": 0.3494, "mean_copy_accuracy": 0.9910416454076767, "mean_gen_accuracy": 0.8531412184238434, "mean_token_accuracy": 0.88564732670784, "num_tokens": 317952275.0, "sample_num_tokens": 8629.75, "step": 1175, "total_num_tokens": 317986794.0, "z_loss": 0.00143230683170259 }, { "copy_logits_max": -1.7938976287841797, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 0.24018381414347714, "gen_logits_max": 8.24053955078125, "gen_logits_mean": -10.838717460632324, "gen_logits_min": -23.199954986572266, "gen_logits_std": 2.4926862716674805, "gen_loss": 0.35470372438430786, "grad_norm": 0.5592500877730903, "learning_rate": 2.944421052631579e-05, "loss": 0.3483, "mean_copy_accuracy": 0.9882680773735046, "mean_gen_accuracy": 0.8567862957715988, "mean_token_accuracy": 0.8843693435192108, "num_tokens": 318194513.0, "sample_num_tokens": 7679.75, "step": 1176, "total_num_tokens": 318225232.0, "z_loss": 0.0012935185804963112 }, { "copy_logits_max": -2.3502938747406006, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.5625, "epoch": 0.240388052080674, "gen_logits_max": 8.502544403076172, "gen_logits_mean": -9.587244033813477, "gen_logits_min": -21.307350158691406, "gen_logits_std": 2.5070364475250244, "gen_loss": 0.32779204845428467, "grad_norm": 0.6508055905286686, "learning_rate": 2.9442947368421054e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9920713156461716, "mean_gen_accuracy": 0.8560093194246292, "mean_token_accuracy": 0.8933264464139938, "num_tokens": 318463532.0, "sample_num_tokens": 9498.5, "step": 1177, "total_num_tokens": 318501526.0, "z_loss": 0.001386943506076932 }, { "copy_logits_max": -0.3465409278869629, "copy_logits_min": -687500032.0, "copy_num_tokens": 601.5, "epoch": 0.24059229001787083, "gen_logits_max": 8.209272384643555, "gen_logits_mean": -10.17049789428711, "gen_logits_min": -22.778270721435547, "gen_logits_std": 2.5676088333129883, "gen_loss": 0.308743417263031, "grad_norm": 0.5073617933706386, "learning_rate": 2.9441684210526315e-05, "loss": 0.3443, "mean_copy_accuracy": 0.9891572296619415, "mean_gen_accuracy": 0.855040043592453, "mean_token_accuracy": 0.8885803520679474, "num_tokens": 318726898.0, "sample_num_tokens": 9501.5, "step": 1178, "total_num_tokens": 318764904.0, "z_loss": 0.0014602603623643517 }, { "copy_logits_max": -0.24418491125106812, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.75, "epoch": 0.24079652795506765, "gen_logits_max": 7.571094512939453, "gen_logits_mean": -11.169370651245117, "gen_logits_min": -23.256019592285156, "gen_logits_std": 2.513028860092163, "gen_loss": 0.3093155026435852, "grad_norm": 0.539670192163063, "learning_rate": 2.944042105263158e-05, "loss": 0.3282, "mean_copy_accuracy": 0.990411564707756, "mean_gen_accuracy": 0.8570056557655334, "mean_token_accuracy": 0.8936269879341125, "num_tokens": 319003611.0, "sample_num_tokens": 7887.75, "step": 1179, "total_num_tokens": 319035162.0, "z_loss": 0.0014079648535698652 }, { "copy_logits_max": 0.8051411509513855, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.125, "epoch": 0.2410007658922645, "gen_logits_max": 8.173086166381836, "gen_logits_mean": -10.095817565917969, "gen_logits_min": -22.353458404541016, "gen_logits_std": 2.4924821853637695, "gen_loss": 0.34785521030426025, "grad_norm": 0.5585175085495261, "learning_rate": 2.9439157894736843e-05, "loss": 0.3593, "mean_copy_accuracy": 0.9903584867715836, "mean_gen_accuracy": 0.8525094836950302, "mean_token_accuracy": 0.8840509802103043, "num_tokens": 319254580.0, "sample_num_tokens": 8220.5, "step": 1180, "total_num_tokens": 319287462.0, "z_loss": 0.0017571677453815937 }, { "copy_logits_max": 0.4683414101600647, "copy_logits_min": -625000000.0, "copy_num_tokens": 297.0625, "epoch": 0.24120500382946133, "gen_logits_max": 9.023860931396484, "gen_logits_mean": -10.165185928344727, "gen_logits_min": -22.32571792602539, "gen_logits_std": 2.5242836475372314, "gen_loss": 0.39422091841697693, "grad_norm": 0.5731485148424422, "learning_rate": 2.9437894736842104e-05, "loss": 0.3427, "mean_copy_accuracy": 0.9912205338478088, "mean_gen_accuracy": 0.8560630083084106, "mean_token_accuracy": 0.8870613127946854, "num_tokens": 319540317.0, "sample_num_tokens": 7675.25, "step": 1181, "total_num_tokens": 319571018.0, "z_loss": 0.0015740913804620504 }, { "copy_logits_max": -1.6129021644592285, "copy_logits_min": -625000064.0, "copy_num_tokens": 296.125, "epoch": 0.24140924176665815, "gen_logits_max": 8.74140453338623, "gen_logits_mean": -10.723945617675781, "gen_logits_min": -22.587007522583008, "gen_logits_std": 2.469468832015991, "gen_loss": 0.3776283860206604, "grad_norm": 0.5600625827810473, "learning_rate": 2.9436631578947372e-05, "loss": 0.3549, "mean_copy_accuracy": 0.990460678935051, "mean_gen_accuracy": 0.8539132326841354, "mean_token_accuracy": 0.8846303075551987, "num_tokens": 319792219.0, "sample_num_tokens": 7106.25, "step": 1182, "total_num_tokens": 319820644.0, "z_loss": 0.001520434976555407 }, { "copy_logits_max": -0.3635861277580261, "copy_logits_min": -687500032.0, "copy_num_tokens": 624.625, "epoch": 0.241613479703855, "gen_logits_max": 8.50058364868164, "gen_logits_mean": -9.379597663879395, "gen_logits_min": -21.968780517578125, "gen_logits_std": 2.578864812850952, "gen_loss": 0.30688026547431946, "grad_norm": 0.5122397112561363, "learning_rate": 2.9435368421052633e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9907861649990082, "mean_gen_accuracy": 0.8553169965744019, "mean_token_accuracy": 0.8929576426744461, "num_tokens": 320069439.0, "sample_num_tokens": 8898.75, "step": 1183, "total_num_tokens": 320105034.0, "z_loss": 0.0016969458665698767 }, { "copy_logits_max": -1.438668966293335, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.5625, "epoch": 0.24181771764105184, "gen_logits_max": 8.532611846923828, "gen_logits_mean": -10.219481468200684, "gen_logits_min": -22.365232467651367, "gen_logits_std": 2.5344176292419434, "gen_loss": 0.35271626710891724, "grad_norm": 0.5218775946389222, "learning_rate": 2.9434105263157897e-05, "loss": 0.3528, "mean_copy_accuracy": 0.9903622567653656, "mean_gen_accuracy": 0.8511506766080856, "mean_token_accuracy": 0.884462982416153, "num_tokens": 320336152.0, "sample_num_tokens": 8818.0, "step": 1184, "total_num_tokens": 320371424.0, "z_loss": 0.0014197409618645906 }, { "copy_logits_max": -1.1122026443481445, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.5, "epoch": 0.24202195557824865, "gen_logits_max": 9.253423690795898, "gen_logits_mean": -9.401192665100098, "gen_logits_min": -21.060806274414062, "gen_logits_std": 2.523355007171631, "gen_loss": 0.3871394991874695, "grad_norm": 0.6899056038335801, "learning_rate": 2.9432842105263158e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9920863658189774, "mean_gen_accuracy": 0.8531630635261536, "mean_token_accuracy": 0.8860495388507843, "num_tokens": 320630614.0, "sample_num_tokens": 8172.5, "step": 1185, "total_num_tokens": 320663304.0, "z_loss": 0.0016316084656864405 }, { "copy_logits_max": -3.277737617492676, "copy_logits_min": -750000000.0, "copy_num_tokens": 227.0625, "epoch": 0.2422261935154455, "gen_logits_max": 9.317323684692383, "gen_logits_mean": -10.184775352478027, "gen_logits_min": -21.52252197265625, "gen_logits_std": 2.4583547115325928, "gen_loss": 0.3626624643802643, "grad_norm": 0.6210685532296043, "learning_rate": 2.9431578947368422e-05, "loss": 0.369, "mean_copy_accuracy": 0.9897345304489136, "mean_gen_accuracy": 0.8496203720569611, "mean_token_accuracy": 0.8799134194850922, "num_tokens": 320891648.0, "sample_num_tokens": 7040.0, "step": 1186, "total_num_tokens": 320919808.0, "z_loss": 0.0014019230147823691 }, { "copy_logits_max": -0.872590184211731, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.8125, "epoch": 0.24243043145264234, "gen_logits_max": 8.963157653808594, "gen_logits_mean": -9.933369636535645, "gen_logits_min": -21.785921096801758, "gen_logits_std": 2.5298216342926025, "gen_loss": 0.3506239652633667, "grad_norm": 0.48444135242508585, "learning_rate": 2.9430315789473683e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9916338175535202, "mean_gen_accuracy": 0.8570794761180878, "mean_token_accuracy": 0.8864270895719528, "num_tokens": 321156954.0, "sample_num_tokens": 8817.5, "step": 1187, "total_num_tokens": 321192224.0, "z_loss": 0.0015033771051093936 }, { "copy_logits_max": -1.9991066455841064, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.1875, "epoch": 0.24263466938983916, "gen_logits_max": 8.883481979370117, "gen_logits_mean": -9.810200691223145, "gen_logits_min": -21.50833511352539, "gen_logits_std": 2.4792630672454834, "gen_loss": 0.3857342600822449, "grad_norm": 0.4968247190686723, "learning_rate": 2.9429052631578948e-05, "loss": 0.3689, "mean_copy_accuracy": 0.9890233874320984, "mean_gen_accuracy": 0.8544933646917343, "mean_token_accuracy": 0.8815701305866241, "num_tokens": 321430109.0, "sample_num_tokens": 9158.25, "step": 1188, "total_num_tokens": 321466742.0, "z_loss": 0.0014567720936611295 }, { "copy_logits_max": 1.5339076519012451, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.875, "epoch": 0.242838907327036, "gen_logits_max": 8.31771469116211, "gen_logits_mean": -9.49919605255127, "gen_logits_min": -21.94194793701172, "gen_logits_std": 2.5284299850463867, "gen_loss": 0.33854806423187256, "grad_norm": 0.5308800644018972, "learning_rate": 2.942778947368421e-05, "loss": 0.3658, "mean_copy_accuracy": 0.9890046119689941, "mean_gen_accuracy": 0.8472726047039032, "mean_token_accuracy": 0.8826215863227844, "num_tokens": 321697421.0, "sample_num_tokens": 9077.25, "step": 1189, "total_num_tokens": 321733730.0, "z_loss": 0.001552205765619874 }, { "copy_logits_max": -0.1876876950263977, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.8125, "epoch": 0.24304314526423282, "gen_logits_max": 8.468818664550781, "gen_logits_mean": -10.708905220031738, "gen_logits_min": -22.44464683532715, "gen_logits_std": 2.4809789657592773, "gen_loss": 0.3462895154953003, "grad_norm": 0.5916499844528414, "learning_rate": 2.9426526315789476e-05, "loss": 0.3516, "mean_copy_accuracy": 0.9907230138778687, "mean_gen_accuracy": 0.8538082242012024, "mean_token_accuracy": 0.8861117660999298, "num_tokens": 321959101.0, "sample_num_tokens": 8953.25, "step": 1190, "total_num_tokens": 321994914.0, "z_loss": 0.0014664160553365946 }, { "copy_logits_max": -0.4665917754173279, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.4375, "epoch": 0.24324738320142966, "gen_logits_max": 9.699939727783203, "gen_logits_mean": -10.13686752319336, "gen_logits_min": -22.073043823242188, "gen_logits_std": 2.4941062927246094, "gen_loss": 0.36986666917800903, "grad_norm": 0.5320665038764333, "learning_rate": 2.9425263157894737e-05, "loss": 0.3598, "mean_copy_accuracy": 0.9905885010957718, "mean_gen_accuracy": 0.8532680422067642, "mean_token_accuracy": 0.8840716332197189, "num_tokens": 322217520.0, "sample_num_tokens": 8154.5, "step": 1191, "total_num_tokens": 322250138.0, "z_loss": 0.001489237416535616 }, { "copy_logits_max": 0.9227676391601562, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.0, "epoch": 0.2434516211386265, "gen_logits_max": 8.31760311126709, "gen_logits_mean": -11.245134353637695, "gen_logits_min": -23.24016761779785, "gen_logits_std": 2.4950101375579834, "gen_loss": 0.3499460220336914, "grad_norm": 0.4771130622043569, "learning_rate": 2.9424e-05, "loss": 0.3641, "mean_copy_accuracy": 0.9937066733837128, "mean_gen_accuracy": 0.8433556854724884, "mean_token_accuracy": 0.88081094622612, "num_tokens": 322488360.0, "sample_num_tokens": 8043.5, "step": 1192, "total_num_tokens": 322520534.0, "z_loss": 0.001586124999448657 }, { "copy_logits_max": 0.485262393951416, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.375, "epoch": 0.24365585907582332, "gen_logits_max": 8.827251434326172, "gen_logits_mean": -9.758771896362305, "gen_logits_min": -22.003955841064453, "gen_logits_std": 2.5530476570129395, "gen_loss": 0.3852161169052124, "grad_norm": 0.5092877117165161, "learning_rate": 2.9422736842105262e-05, "loss": 0.3379, "mean_copy_accuracy": 0.9912461787462234, "mean_gen_accuracy": 0.8594262003898621, "mean_token_accuracy": 0.8931261450052261, "num_tokens": 322764862.0, "sample_num_tokens": 8699.0, "step": 1193, "total_num_tokens": 322799658.0, "z_loss": 0.0016328375786542892 }, { "copy_logits_max": 0.389323353767395, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.625, "epoch": 0.24386009701302017, "gen_logits_max": 8.989315032958984, "gen_logits_mean": -10.100893020629883, "gen_logits_min": -22.299705505371094, "gen_logits_std": 2.5466744899749756, "gen_loss": 0.3510802388191223, "grad_norm": 0.4669445357762279, "learning_rate": 2.9421473684210527e-05, "loss": 0.3481, "mean_copy_accuracy": 0.9900976568460464, "mean_gen_accuracy": 0.8551078736782074, "mean_token_accuracy": 0.8857300877571106, "num_tokens": 323021832.0, "sample_num_tokens": 8433.0, "step": 1194, "total_num_tokens": 323055564.0, "z_loss": 0.0015605366788804531 }, { "copy_logits_max": -0.7006688117980957, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.3125, "epoch": 0.244064334950217, "gen_logits_max": 8.97797966003418, "gen_logits_mean": -10.751480102539062, "gen_logits_min": -22.920673370361328, "gen_logits_std": 2.492274761199951, "gen_loss": 0.3193509578704834, "grad_norm": 0.5123842631271538, "learning_rate": 2.942021052631579e-05, "loss": 0.34, "mean_copy_accuracy": 0.9901316612958908, "mean_gen_accuracy": 0.8558025360107422, "mean_token_accuracy": 0.887064203619957, "num_tokens": 323288670.0, "sample_num_tokens": 9290.5, "step": 1195, "total_num_tokens": 323325832.0, "z_loss": 0.0015160879120230675 }, { "copy_logits_max": 2.4076833724975586, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.75, "epoch": 0.24426857288741383, "gen_logits_max": 8.648439407348633, "gen_logits_mean": -10.040185928344727, "gen_logits_min": -22.487655639648438, "gen_logits_std": 2.5926268100738525, "gen_loss": 0.31996554136276245, "grad_norm": 0.502809871823478, "learning_rate": 2.9418947368421052e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9927137047052383, "mean_gen_accuracy": 0.8557028472423553, "mean_token_accuracy": 0.8938673138618469, "num_tokens": 323567242.0, "sample_num_tokens": 8165.5, "step": 1196, "total_num_tokens": 323599904.0, "z_loss": 0.001759285107254982 }, { "copy_logits_max": -0.8131722807884216, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.5625, "epoch": 0.24447281082461067, "gen_logits_max": 9.431678771972656, "gen_logits_mean": -9.60061264038086, "gen_logits_min": -21.705402374267578, "gen_logits_std": 2.516490936279297, "gen_loss": 0.33761581778526306, "grad_norm": 0.5196527276095415, "learning_rate": 2.9417684210526316e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9898788183927536, "mean_gen_accuracy": 0.8630559891462326, "mean_token_accuracy": 0.8909498155117035, "num_tokens": 323824281.0, "sample_num_tokens": 9066.75, "step": 1197, "total_num_tokens": 323860548.0, "z_loss": 0.0014929529279470444 }, { "copy_logits_max": -0.859477162361145, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.1875, "epoch": 0.24467704876180751, "gen_logits_max": 8.73487663269043, "gen_logits_mean": -10.884632110595703, "gen_logits_min": -23.127925872802734, "gen_logits_std": 2.502492904663086, "gen_loss": 0.32639792561531067, "grad_norm": 0.5752075268440228, "learning_rate": 2.941642105263158e-05, "loss": 0.3504, "mean_copy_accuracy": 0.9897922277450562, "mean_gen_accuracy": 0.8558490425348282, "mean_token_accuracy": 0.8858699649572372, "num_tokens": 324073068.0, "sample_num_tokens": 7650.0, "step": 1198, "total_num_tokens": 324103668.0, "z_loss": 0.0014489279128611088 }, { "copy_logits_max": -1.9397509098052979, "copy_logits_min": -750000000.0, "copy_num_tokens": 233.6875, "epoch": 0.24488128669900433, "gen_logits_max": 9.29439926147461, "gen_logits_mean": -10.032423973083496, "gen_logits_min": -21.69968032836914, "gen_logits_std": 2.4786376953125, "gen_loss": 0.39384347200393677, "grad_norm": 0.553375307246405, "learning_rate": 2.9415157894736845e-05, "loss": 0.3476, "mean_copy_accuracy": 0.9905566424131393, "mean_gen_accuracy": 0.8593287914991379, "mean_token_accuracy": 0.887321263551712, "num_tokens": 324343424.0, "sample_num_tokens": 6709.5, "step": 1199, "total_num_tokens": 324370262.0, "z_loss": 0.0013565780827775598 }, { "copy_logits_max": -3.343925714492798, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.625, "epoch": 0.24508552463620117, "gen_logits_max": 8.858726501464844, "gen_logits_mean": -10.841278076171875, "gen_logits_min": -22.379135131835938, "gen_logits_std": 2.4498090744018555, "gen_loss": 0.41939085721969604, "grad_norm": 0.5528768589970435, "learning_rate": 2.9413894736842106e-05, "loss": 0.3707, "mean_copy_accuracy": 0.9903808832168579, "mean_gen_accuracy": 0.8503700345754623, "mean_token_accuracy": 0.8802988231182098, "num_tokens": 324585842.0, "sample_num_tokens": 7338.5, "step": 1200, "total_num_tokens": 324615196.0, "z_loss": 0.0012292055180296302 }, { "copy_logits_max": -2.5354137420654297, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.375, "epoch": 0.24528976257339802, "gen_logits_max": 9.751945495605469, "gen_logits_mean": -8.880752563476562, "gen_logits_min": -21.392684936523438, "gen_logits_std": 2.546860694885254, "gen_loss": 0.30944719910621643, "grad_norm": 0.5506607665772391, "learning_rate": 2.941263157894737e-05, "loss": 0.3403, "mean_copy_accuracy": 0.9902379959821701, "mean_gen_accuracy": 0.8557243943214417, "mean_token_accuracy": 0.8904858827590942, "num_tokens": 324869146.0, "sample_num_tokens": 10767.0, "step": 1201, "total_num_tokens": 324912214.0, "z_loss": 0.0012815678492188454 }, { "copy_logits_max": -0.2942214012145996, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.4375, "epoch": 0.24549400051059483, "gen_logits_max": 8.774351119995117, "gen_logits_mean": -9.723037719726562, "gen_logits_min": -21.793750762939453, "gen_logits_std": 2.519510269165039, "gen_loss": 0.3478633761405945, "grad_norm": 0.5401536151006915, "learning_rate": 2.941136842105263e-05, "loss": 0.3394, "mean_copy_accuracy": 0.9907201379537582, "mean_gen_accuracy": 0.8559763431549072, "mean_token_accuracy": 0.8893002718687057, "num_tokens": 325144914.0, "sample_num_tokens": 7453.5, "step": 1202, "total_num_tokens": 325174728.0, "z_loss": 0.0013673114590346813 }, { "copy_logits_max": -0.850953996181488, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.9375, "epoch": 0.24569823844779168, "gen_logits_max": 8.505149841308594, "gen_logits_mean": -10.293804168701172, "gen_logits_min": -22.46455955505371, "gen_logits_std": 2.5001769065856934, "gen_loss": 0.3341508209705353, "grad_norm": 0.5225038126893605, "learning_rate": 2.9410105263157895e-05, "loss": 0.3417, "mean_copy_accuracy": 0.9904748350381851, "mean_gen_accuracy": 0.8554811775684357, "mean_token_accuracy": 0.8885440677404404, "num_tokens": 325396304.0, "sample_num_tokens": 7884.5, "step": 1203, "total_num_tokens": 325427842.0, "z_loss": 0.0013424973003566265 }, { "copy_logits_max": -0.22715669870376587, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.25, "epoch": 0.24590247638498852, "gen_logits_max": 8.90446662902832, "gen_logits_mean": -10.627771377563477, "gen_logits_min": -22.969648361206055, "gen_logits_std": 2.5244455337524414, "gen_loss": 0.3499129116535187, "grad_norm": 0.4548137911776149, "learning_rate": 2.9408842105263156e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9922362416982651, "mean_gen_accuracy": 0.8541791290044785, "mean_token_accuracy": 0.8900905847549438, "num_tokens": 325665298.0, "sample_num_tokens": 6825.0, "step": 1204, "total_num_tokens": 325692598.0, "z_loss": 0.0014672012766823173 }, { "copy_logits_max": -1.7908217906951904, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.0625, "epoch": 0.24610671432218534, "gen_logits_max": 8.648506164550781, "gen_logits_mean": -10.147186279296875, "gen_logits_min": -22.269329071044922, "gen_logits_std": 2.4992337226867676, "gen_loss": 0.3079484701156616, "grad_norm": 0.49670858892985476, "learning_rate": 2.940757894736842e-05, "loss": 0.3409, "mean_copy_accuracy": 0.9902977049350739, "mean_gen_accuracy": 0.8550791442394257, "mean_token_accuracy": 0.8886556029319763, "num_tokens": 325934840.0, "sample_num_tokens": 7242.5, "step": 1205, "total_num_tokens": 325963810.0, "z_loss": 0.0013294753152877092 }, { "copy_logits_max": -1.240821123123169, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.25, "epoch": 0.24631095225938218, "gen_logits_max": 9.998170852661133, "gen_logits_mean": -9.31990909576416, "gen_logits_min": -21.31212615966797, "gen_logits_std": 2.52063250541687, "gen_loss": 0.3788292706012726, "grad_norm": 0.4968355955361023, "learning_rate": 2.9406315789473685e-05, "loss": 0.342, "mean_copy_accuracy": 0.9895743280649185, "mean_gen_accuracy": 0.8557944893836975, "mean_token_accuracy": 0.8894422501325607, "num_tokens": 326214301.0, "sample_num_tokens": 7810.25, "step": 1206, "total_num_tokens": 326245542.0, "z_loss": 0.0016518877819180489 }, { "copy_logits_max": 0.06147308647632599, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.625, "epoch": 0.24651519019657903, "gen_logits_max": 8.542665481567383, "gen_logits_mean": -9.654765129089355, "gen_logits_min": -21.87980842590332, "gen_logits_std": 2.5449328422546387, "gen_loss": 0.3120775818824768, "grad_norm": 0.4728086752939248, "learning_rate": 2.940505263157895e-05, "loss": 0.3214, "mean_copy_accuracy": 0.991372287273407, "mean_gen_accuracy": 0.8656671494245529, "mean_token_accuracy": 0.8953986912965775, "num_tokens": 326482480.0, "sample_num_tokens": 10325.5, "step": 1207, "total_num_tokens": 326523782.0, "z_loss": 0.0015417211689054966 }, { "copy_logits_max": -0.09757378697395325, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.0, "epoch": 0.24671942813377584, "gen_logits_max": 9.516181945800781, "gen_logits_mean": -9.931571960449219, "gen_logits_min": -22.442420959472656, "gen_logits_std": 2.541313648223877, "gen_loss": 0.3289417326450348, "grad_norm": 0.5546718606082864, "learning_rate": 2.9403789473684214e-05, "loss": 0.3567, "mean_copy_accuracy": 0.9887212514877319, "mean_gen_accuracy": 0.85616335272789, "mean_token_accuracy": 0.8846741914749146, "num_tokens": 326751857.0, "sample_num_tokens": 9009.75, "step": 1208, "total_num_tokens": 326787896.0, "z_loss": 0.0014998071128502488 }, { "copy_logits_max": -3.0395946502685547, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.375, "epoch": 0.24692366607097269, "gen_logits_max": 9.63235092163086, "gen_logits_mean": -9.760625839233398, "gen_logits_min": -21.963581085205078, "gen_logits_std": 2.506338119506836, "gen_loss": 0.33844661712646484, "grad_norm": 0.5286202369605022, "learning_rate": 2.9402526315789474e-05, "loss": 0.3779, "mean_copy_accuracy": 0.990348607301712, "mean_gen_accuracy": 0.8455081731081009, "mean_token_accuracy": 0.8786936402320862, "num_tokens": 327024647.0, "sample_num_tokens": 9021.75, "step": 1209, "total_num_tokens": 327060734.0, "z_loss": 0.0013085742248222232 }, { "copy_logits_max": -0.33000776171684265, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.75, "epoch": 0.24712790400816953, "gen_logits_max": 8.965311050415039, "gen_logits_mean": -9.996990203857422, "gen_logits_min": -22.258323669433594, "gen_logits_std": 2.5153980255126953, "gen_loss": 0.33687877655029297, "grad_norm": 0.5189631149606173, "learning_rate": 2.940126315789474e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9900085031986237, "mean_gen_accuracy": 0.8607174307107925, "mean_token_accuracy": 0.8939193338155746, "num_tokens": 327297255.0, "sample_num_tokens": 8923.75, "step": 1210, "total_num_tokens": 327332950.0, "z_loss": 0.0015114390989765525 }, { "copy_logits_max": -0.5588958263397217, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.875, "epoch": 0.24733214194536635, "gen_logits_max": 8.999787330627441, "gen_logits_mean": -9.676946640014648, "gen_logits_min": -21.7224063873291, "gen_logits_std": 2.510545253753662, "gen_loss": 0.36457404494285583, "grad_norm": 0.5107085487238228, "learning_rate": 2.94e-05, "loss": 0.33, "mean_copy_accuracy": 0.9911714047193527, "mean_gen_accuracy": 0.8639229983091354, "mean_token_accuracy": 0.8937383145093918, "num_tokens": 327551371.0, "sample_num_tokens": 8200.25, "step": 1211, "total_num_tokens": 327584172.0, "z_loss": 0.0014804506208747625 }, { "copy_logits_max": 0.026443839073181152, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.1875, "epoch": 0.2475363798825632, "gen_logits_max": 8.828250885009766, "gen_logits_mean": -10.447942733764648, "gen_logits_min": -22.508695602416992, "gen_logits_std": 2.495182514190674, "gen_loss": 0.3334440290927887, "grad_norm": 0.6761733098244931, "learning_rate": 2.9398736842105264e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9911738485097885, "mean_gen_accuracy": 0.8603281378746033, "mean_token_accuracy": 0.8939499855041504, "num_tokens": 327809440.0, "sample_num_tokens": 7352.0, "step": 1212, "total_num_tokens": 327838848.0, "z_loss": 0.0016023787902668118 }, { "copy_logits_max": -3.0838568210601807, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.8125, "epoch": 0.24774061781976003, "gen_logits_max": 8.901754379272461, "gen_logits_mean": -11.218633651733398, "gen_logits_min": -22.76162338256836, "gen_logits_std": 2.4692938327789307, "gen_loss": 0.3032141923904419, "grad_norm": 0.5830179978198494, "learning_rate": 2.9397473684210525e-05, "loss": 0.3445, "mean_copy_accuracy": 0.9897018373012543, "mean_gen_accuracy": 0.8601411879062653, "mean_token_accuracy": 0.8896729946136475, "num_tokens": 328075786.0, "sample_num_tokens": 9073.5, "step": 1213, "total_num_tokens": 328112080.0, "z_loss": 0.0012262096861377358 }, { "copy_logits_max": -2.6109166145324707, "copy_logits_min": -625000000.0, "copy_num_tokens": 442.125, "epoch": 0.24794485575695685, "gen_logits_max": 8.929059982299805, "gen_logits_mean": -10.431756973266602, "gen_logits_min": -22.414236068725586, "gen_logits_std": 2.4948151111602783, "gen_loss": 0.342639684677124, "grad_norm": 0.8028528101210999, "learning_rate": 2.9396210526315793e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9876494407653809, "mean_gen_accuracy": 0.8595872968435287, "mean_token_accuracy": 0.8926034420728683, "num_tokens": 328347047.0, "sample_num_tokens": 9475.25, "step": 1214, "total_num_tokens": 328384948.0, "z_loss": 0.0013063657097518444 }, { "copy_logits_max": -0.9548343420028687, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.1875, "epoch": 0.2481490936941537, "gen_logits_max": 8.913553237915039, "gen_logits_mean": -10.294300079345703, "gen_logits_min": -22.110172271728516, "gen_logits_std": 2.479498863220215, "gen_loss": 0.3797951936721802, "grad_norm": 0.6108978217782197, "learning_rate": 2.9394947368421054e-05, "loss": 0.3447, "mean_copy_accuracy": 0.992353767156601, "mean_gen_accuracy": 0.8505130708217621, "mean_token_accuracy": 0.8886879235506058, "num_tokens": 328634583.0, "sample_num_tokens": 7581.25, "step": 1215, "total_num_tokens": 328664908.0, "z_loss": 0.0015716699417680502 }, { "copy_logits_max": 0.9023123979568481, "copy_logits_min": -750000000.0, "copy_num_tokens": 574.8125, "epoch": 0.2483533316313505, "gen_logits_max": 9.335979461669922, "gen_logits_mean": -9.447758674621582, "gen_logits_min": -22.083099365234375, "gen_logits_std": 2.53330135345459, "gen_loss": 0.31164395809173584, "grad_norm": 0.6861515947641655, "learning_rate": 2.9393684210526318e-05, "loss": 0.3482, "mean_copy_accuracy": 0.9895633608102798, "mean_gen_accuracy": 0.8546198308467865, "mean_token_accuracy": 0.8882909417152405, "num_tokens": 328891452.0, "sample_num_tokens": 9064.5, "step": 1216, "total_num_tokens": 328927710.0, "z_loss": 0.0020624673925340176 }, { "copy_logits_max": 0.11101359128952026, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.375, "epoch": 0.24855756956854735, "gen_logits_max": 8.774105072021484, "gen_logits_mean": -11.129297256469727, "gen_logits_min": -23.25063705444336, "gen_logits_std": 2.471935749053955, "gen_loss": 0.31100404262542725, "grad_norm": 0.4943794977911824, "learning_rate": 2.939242105263158e-05, "loss": 0.3366, "mean_copy_accuracy": 0.9896546304225922, "mean_gen_accuracy": 0.8592297434806824, "mean_token_accuracy": 0.8909813463687897, "num_tokens": 329174748.0, "sample_num_tokens": 8758.0, "step": 1217, "total_num_tokens": 329209780.0, "z_loss": 0.0018986284267157316 }, { "copy_logits_max": 2.4151439666748047, "copy_logits_min": -750000064.0, "copy_num_tokens": 716.875, "epoch": 0.2487618075057442, "gen_logits_max": 8.65471076965332, "gen_logits_mean": -9.989206314086914, "gen_logits_min": -22.675765991210938, "gen_logits_std": 2.5846312046051025, "gen_loss": 0.27904027700424194, "grad_norm": 0.5879028619168525, "learning_rate": 2.9391157894736843e-05, "loss": 0.3297, "mean_copy_accuracy": 0.9901147186756134, "mean_gen_accuracy": 0.8619736135005951, "mean_token_accuracy": 0.8952279984951019, "num_tokens": 329435065.0, "sample_num_tokens": 9777.75, "step": 1218, "total_num_tokens": 329474176.0, "z_loss": 0.002129210624843836 }, { "copy_logits_max": 1.1103829145431519, "copy_logits_min": -687500032.0, "copy_num_tokens": 567.4375, "epoch": 0.24896604544294101, "gen_logits_max": 9.466201782226562, "gen_logits_mean": -9.68789291381836, "gen_logits_min": -21.68183135986328, "gen_logits_std": 2.53163480758667, "gen_loss": 0.3377993702888489, "grad_norm": 0.49589009165074965, "learning_rate": 2.9389894736842104e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9915602803230286, "mean_gen_accuracy": 0.8537364900112152, "mean_token_accuracy": 0.892524778842926, "num_tokens": 329727379.0, "sample_num_tokens": 9276.25, "step": 1219, "total_num_tokens": 329764484.0, "z_loss": 0.002004362177103758 }, { "copy_logits_max": 0.5163230895996094, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.125, "epoch": 0.24917028338013786, "gen_logits_max": 8.985565185546875, "gen_logits_mean": -10.405946731567383, "gen_logits_min": -22.33344268798828, "gen_logits_std": 2.476642608642578, "gen_loss": 0.3582727313041687, "grad_norm": 0.590566871063957, "learning_rate": 2.938863157894737e-05, "loss": 0.3357, "mean_copy_accuracy": 0.9922716617584229, "mean_gen_accuracy": 0.8506868481636047, "mean_token_accuracy": 0.8911673575639725, "num_tokens": 330042166.0, "sample_num_tokens": 8092.5, "step": 1220, "total_num_tokens": 330074536.0, "z_loss": 0.0018689699936658144 }, { "copy_logits_max": 0.9033030271530151, "copy_logits_min": -750000064.0, "copy_num_tokens": 527.75, "epoch": 0.2493745213173347, "gen_logits_max": 7.899658679962158, "gen_logits_mean": -10.063426971435547, "gen_logits_min": -22.42474365234375, "gen_logits_std": 2.544069528579712, "gen_loss": 0.33800458908081055, "grad_norm": 0.6615055274279215, "learning_rate": 2.9387368421052633e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9895409345626831, "mean_gen_accuracy": 0.8499544709920883, "mean_token_accuracy": 0.8874567300081253, "num_tokens": 330315805.0, "sample_num_tokens": 7972.75, "step": 1221, "total_num_tokens": 330347696.0, "z_loss": 0.0019243209389969707 }, { "copy_logits_max": -0.3117368817329407, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.125, "epoch": 0.24957875925453152, "gen_logits_max": 8.737860679626465, "gen_logits_mean": -10.376862525939941, "gen_logits_min": -22.054790496826172, "gen_logits_std": 2.482464551925659, "gen_loss": 0.3383161723613739, "grad_norm": 0.623157292285255, "learning_rate": 2.9386105263157894e-05, "loss": 0.3594, "mean_copy_accuracy": 0.9883474707603455, "mean_gen_accuracy": 0.8542848974466324, "mean_token_accuracy": 0.8826750665903091, "num_tokens": 330569097.0, "sample_num_tokens": 7243.25, "step": 1222, "total_num_tokens": 330598070.0, "z_loss": 0.0014910558238625526 }, { "copy_logits_max": 1.6229413747787476, "copy_logits_min": -750000064.0, "copy_num_tokens": 705.0, "epoch": 0.24978299719172836, "gen_logits_max": 8.566776275634766, "gen_logits_mean": -10.210205078125, "gen_logits_min": -22.736797332763672, "gen_logits_std": 2.5573296546936035, "gen_loss": 0.3116520345211029, "grad_norm": 0.7797746318985967, "learning_rate": 2.938484210526316e-05, "loss": 0.3351, "mean_copy_accuracy": 0.9898416996002197, "mean_gen_accuracy": 0.8583655655384064, "mean_token_accuracy": 0.892180547118187, "num_tokens": 330846303.0, "sample_num_tokens": 10229.25, "step": 1223, "total_num_tokens": 330887220.0, "z_loss": 0.0016827750951051712 }, { "copy_logits_max": -0.45688048005104065, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.3125, "epoch": 0.2499872351289252, "gen_logits_max": 8.666336059570312, "gen_logits_mean": -9.68371868133545, "gen_logits_min": -21.376846313476562, "gen_logits_std": 2.4843087196350098, "gen_loss": 0.3163336217403412, "grad_norm": 0.5459355776503549, "learning_rate": 2.9383578947368422e-05, "loss": 0.3452, "mean_copy_accuracy": 0.9916904717683792, "mean_gen_accuracy": 0.8482588231563568, "mean_token_accuracy": 0.8868440091609955, "num_tokens": 331123824.0, "sample_num_tokens": 8498.5, "step": 1224, "total_num_tokens": 331157818.0, "z_loss": 0.0014766850508749485 }, { "copy_logits_max": -2.005354404449463, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.625, "epoch": 0.25019147306612205, "gen_logits_max": 8.96687126159668, "gen_logits_mean": -10.660038948059082, "gen_logits_min": -22.34505844116211, "gen_logits_std": 2.490926742553711, "gen_loss": 0.3460286259651184, "grad_norm": 0.5334191066372937, "learning_rate": 2.9382315789473687e-05, "loss": 0.3429, "mean_copy_accuracy": 0.9909527748823166, "mean_gen_accuracy": 0.8557514399290085, "mean_token_accuracy": 0.887891486287117, "num_tokens": 331389515.0, "sample_num_tokens": 7293.25, "step": 1225, "total_num_tokens": 331418688.0, "z_loss": 0.0012811594642698765 }, { "copy_logits_max": -1.363349199295044, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.8125, "epoch": 0.25039571100331887, "gen_logits_max": 8.667381286621094, "gen_logits_mean": -10.538154602050781, "gen_logits_min": -22.078323364257812, "gen_logits_std": 2.447676658630371, "gen_loss": 0.39971259236335754, "grad_norm": 0.6642909975985267, "learning_rate": 2.9381052631578947e-05, "loss": 0.3807, "mean_copy_accuracy": 0.9875767976045609, "mean_gen_accuracy": 0.8475204259157181, "mean_token_accuracy": 0.880036473274231, "num_tokens": 331649232.0, "sample_num_tokens": 8905.0, "step": 1226, "total_num_tokens": 331684852.0, "z_loss": 0.0014382441295310855 }, { "copy_logits_max": 0.6245485544204712, "copy_logits_min": -687500032.0, "copy_num_tokens": 629.6875, "epoch": 0.2505999489405157, "gen_logits_max": 8.209878921508789, "gen_logits_mean": -10.66533374786377, "gen_logits_min": -22.547645568847656, "gen_logits_std": 2.5026817321777344, "gen_loss": 0.3011946678161621, "grad_norm": 0.5427013654698439, "learning_rate": 2.9379789473684212e-05, "loss": 0.3349, "mean_copy_accuracy": 0.989379271864891, "mean_gen_accuracy": 0.8553563803434372, "mean_token_accuracy": 0.8913584053516388, "num_tokens": 331928013.0, "sample_num_tokens": 8913.25, "step": 1227, "total_num_tokens": 331963666.0, "z_loss": 0.0015153184067457914 }, { "copy_logits_max": -1.1828670501708984, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.8125, "epoch": 0.25080418687771255, "gen_logits_max": 8.166288375854492, "gen_logits_mean": -9.999958038330078, "gen_logits_min": -21.77903938293457, "gen_logits_std": 2.4807639122009277, "gen_loss": 0.3530794382095337, "grad_norm": 0.5119575803459956, "learning_rate": 2.9378526315789473e-05, "loss": 0.3592, "mean_copy_accuracy": 0.9896251559257507, "mean_gen_accuracy": 0.8529220670461655, "mean_token_accuracy": 0.883274182677269, "num_tokens": 332188787.0, "sample_num_tokens": 8744.25, "step": 1228, "total_num_tokens": 332223764.0, "z_loss": 0.0014757656026631594 }, { "copy_logits_max": -0.25896674394607544, "copy_logits_min": -687500032.0, "copy_num_tokens": 779.3125, "epoch": 0.25100842481490937, "gen_logits_max": 8.182060241699219, "gen_logits_mean": -9.801138877868652, "gen_logits_min": -22.21515464782715, "gen_logits_std": 2.519188165664673, "gen_loss": 0.35032564401626587, "grad_norm": 0.5261673033750944, "learning_rate": 2.9377263157894737e-05, "loss": 0.3558, "mean_copy_accuracy": 0.9917613118886948, "mean_gen_accuracy": 0.8473777025938034, "mean_token_accuracy": 0.8844897300004959, "num_tokens": 332464478.0, "sample_num_tokens": 10357.5, "step": 1229, "total_num_tokens": 332505908.0, "z_loss": 0.0016344116302207112 }, { "copy_logits_max": -3.036774158477783, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.4375, "epoch": 0.2512126627521062, "gen_logits_max": 8.475959777832031, "gen_logits_mean": -12.260217666625977, "gen_logits_min": -23.403852462768555, "gen_logits_std": 2.4073078632354736, "gen_loss": 0.3608042001724243, "grad_norm": 0.5429253910890496, "learning_rate": 2.9375999999999998e-05, "loss": 0.3497, "mean_copy_accuracy": 0.9903726279735565, "mean_gen_accuracy": 0.8549673408269882, "mean_token_accuracy": 0.8844375759363174, "num_tokens": 332738042.0, "sample_num_tokens": 7886.5, "step": 1230, "total_num_tokens": 332769588.0, "z_loss": 0.0012967254733666778 }, { "copy_logits_max": -1.4478392601013184, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.125, "epoch": 0.25141690068930306, "gen_logits_max": 8.555554389953613, "gen_logits_mean": -10.140519142150879, "gen_logits_min": -21.99205207824707, "gen_logits_std": 2.4820284843444824, "gen_loss": 0.30979204177856445, "grad_norm": 0.5303668694371046, "learning_rate": 2.9374736842105266e-05, "loss": 0.3549, "mean_copy_accuracy": 0.9889366179704666, "mean_gen_accuracy": 0.849799171090126, "mean_token_accuracy": 0.8845319151878357, "num_tokens": 333001968.0, "sample_num_tokens": 8884.0, "step": 1231, "total_num_tokens": 333037504.0, "z_loss": 0.0015428719343617558 }, { "copy_logits_max": -2.8880953788757324, "copy_logits_min": -625000064.0, "copy_num_tokens": 594.9375, "epoch": 0.2516211386264999, "gen_logits_max": 8.5009183883667, "gen_logits_mean": -9.356353759765625, "gen_logits_min": -21.45651626586914, "gen_logits_std": 2.480604887008667, "gen_loss": 0.3032148778438568, "grad_norm": 0.5422792197988454, "learning_rate": 2.9373473684210527e-05, "loss": 0.3405, "mean_copy_accuracy": 0.9883155673742294, "mean_gen_accuracy": 0.8588200360536575, "mean_token_accuracy": 0.8886187821626663, "num_tokens": 333273665.0, "sample_num_tokens": 9953.25, "step": 1232, "total_num_tokens": 333313478.0, "z_loss": 0.0014338104519993067 }, { "copy_logits_max": -2.3650474548339844, "copy_logits_min": -687500032.0, "copy_num_tokens": 395.3125, "epoch": 0.2518253765636967, "gen_logits_max": 8.420249938964844, "gen_logits_mean": -10.175064086914062, "gen_logits_min": -21.88819694519043, "gen_logits_std": 2.4349865913391113, "gen_loss": 0.30959489941596985, "grad_norm": 0.5092038313967847, "learning_rate": 2.937221052631579e-05, "loss": 0.3405, "mean_copy_accuracy": 0.9896287620067596, "mean_gen_accuracy": 0.8613044619560242, "mean_token_accuracy": 0.8893546760082245, "num_tokens": 333540317.0, "sample_num_tokens": 8425.75, "step": 1233, "total_num_tokens": 333574020.0, "z_loss": 0.001309349201619625 }, { "copy_logits_max": -0.8735089302062988, "copy_logits_min": -750000000.0, "copy_num_tokens": 744.4375, "epoch": 0.25202961450089356, "gen_logits_max": 8.668523788452148, "gen_logits_mean": -9.703256607055664, "gen_logits_min": -21.517532348632812, "gen_logits_std": 2.459881067276001, "gen_loss": 0.31197118759155273, "grad_norm": 0.46407447829678644, "learning_rate": 2.9370947368421055e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9901664704084396, "mean_gen_accuracy": 0.8643803298473358, "mean_token_accuracy": 0.8925916254520416, "num_tokens": 333836563.0, "sample_num_tokens": 10437.75, "step": 1234, "total_num_tokens": 333878314.0, "z_loss": 0.0016405710484832525 }, { "copy_logits_max": 0.6254050731658936, "copy_logits_min": -750000064.0, "copy_num_tokens": 611.75, "epoch": 0.2522338524380904, "gen_logits_max": 8.849020004272461, "gen_logits_mean": -8.863357543945312, "gen_logits_min": -20.514965057373047, "gen_logits_std": 2.4831700325012207, "gen_loss": 0.3032464385032654, "grad_norm": 0.5791332066071745, "learning_rate": 2.9369684210526316e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9910802990198135, "mean_gen_accuracy": 0.8621376603841782, "mean_token_accuracy": 0.8958570212125778, "num_tokens": 334121919.0, "sample_num_tokens": 8650.25, "step": 1235, "total_num_tokens": 334156520.0, "z_loss": 0.0016931367572396994 }, { "copy_logits_max": -0.02459271252155304, "copy_logits_min": -750000000.0, "copy_num_tokens": 673.875, "epoch": 0.2524380903752872, "gen_logits_max": 8.602705001831055, "gen_logits_mean": -9.417509078979492, "gen_logits_min": -21.95809555053711, "gen_logits_std": 2.5103912353515625, "gen_loss": 0.32096847891807556, "grad_norm": 0.47318908799975273, "learning_rate": 2.936842105263158e-05, "loss": 0.3373, "mean_copy_accuracy": 0.9907423555850983, "mean_gen_accuracy": 0.8587985634803772, "mean_token_accuracy": 0.8918543457984924, "num_tokens": 334405184.0, "sample_num_tokens": 9997.5, "step": 1236, "total_num_tokens": 334445174.0, "z_loss": 0.0018211505375802517 }, { "copy_logits_max": -1.247190237045288, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.375, "epoch": 0.25264232831248407, "gen_logits_max": 9.090705871582031, "gen_logits_mean": -10.008975982666016, "gen_logits_min": -21.578025817871094, "gen_logits_std": 2.426236629486084, "gen_loss": 0.3242298364639282, "grad_norm": 0.5010625456696061, "learning_rate": 2.936715789473684e-05, "loss": 0.3632, "mean_copy_accuracy": 0.9907685220241547, "mean_gen_accuracy": 0.8485767692327499, "mean_token_accuracy": 0.8831315189599991, "num_tokens": 334686204.0, "sample_num_tokens": 8671.5, "step": 1237, "total_num_tokens": 334720890.0, "z_loss": 0.0016667405143380165 }, { "copy_logits_max": -0.6265460252761841, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.1875, "epoch": 0.2528465662496809, "gen_logits_max": 8.80030345916748, "gen_logits_mean": -9.882692337036133, "gen_logits_min": -21.832443237304688, "gen_logits_std": 2.4273040294647217, "gen_loss": 0.34581565856933594, "grad_norm": 0.4788927595683135, "learning_rate": 2.9365894736842106e-05, "loss": 0.3679, "mean_copy_accuracy": 0.9907782673835754, "mean_gen_accuracy": 0.8515893816947937, "mean_token_accuracy": 0.8829844892024994, "num_tokens": 334949261.0, "sample_num_tokens": 8564.25, "step": 1238, "total_num_tokens": 334983518.0, "z_loss": 0.0016088703414425254 }, { "copy_logits_max": -0.2786986529827118, "copy_logits_min": -625000000.0, "copy_num_tokens": 545.0625, "epoch": 0.2530508041868777, "gen_logits_max": 8.934389114379883, "gen_logits_mean": -9.813705444335938, "gen_logits_min": -22.205158233642578, "gen_logits_std": 2.5566325187683105, "gen_loss": 0.3489153981208801, "grad_norm": 0.4725955941906446, "learning_rate": 2.936463157894737e-05, "loss": 0.334, "mean_copy_accuracy": 0.9921560138463974, "mean_gen_accuracy": 0.8549933731555939, "mean_token_accuracy": 0.8911330699920654, "num_tokens": 335217526.0, "sample_num_tokens": 8847.5, "step": 1239, "total_num_tokens": 335252916.0, "z_loss": 0.0015914442483335733 }, { "copy_logits_max": -2.426396608352661, "copy_logits_min": -687500032.0, "copy_num_tokens": 349.125, "epoch": 0.25325504212407457, "gen_logits_max": 8.270347595214844, "gen_logits_mean": -12.022004127502441, "gen_logits_min": -23.60200309753418, "gen_logits_std": 2.4273834228515625, "gen_loss": 0.3335683345794678, "grad_norm": 0.5701988785321954, "learning_rate": 2.9363368421052634e-05, "loss": 0.3629, "mean_copy_accuracy": 0.9884997010231018, "mean_gen_accuracy": 0.8490772098302841, "mean_token_accuracy": 0.8840106576681137, "num_tokens": 335475461.0, "sample_num_tokens": 7357.75, "step": 1240, "total_num_tokens": 335504892.0, "z_loss": 0.00133865000680089 }, { "copy_logits_max": -1.5801293849945068, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.0, "epoch": 0.2534592800612714, "gen_logits_max": 8.208305358886719, "gen_logits_mean": -10.679230690002441, "gen_logits_min": -22.19757080078125, "gen_logits_std": 2.435709238052368, "gen_loss": 0.3694855570793152, "grad_norm": 0.49012998116458284, "learning_rate": 2.9362105263157895e-05, "loss": 0.3473, "mean_copy_accuracy": 0.990104079246521, "mean_gen_accuracy": 0.8586159348487854, "mean_token_accuracy": 0.8853108733892441, "num_tokens": 335737730.0, "sample_num_tokens": 7833.5, "step": 1241, "total_num_tokens": 335769064.0, "z_loss": 0.00131086609326303 }, { "copy_logits_max": -1.6877176761627197, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.6875, "epoch": 0.2536635179984682, "gen_logits_max": 8.831385612487793, "gen_logits_mean": -9.049245834350586, "gen_logits_min": -20.478763580322266, "gen_logits_std": 2.414938449859619, "gen_loss": 0.2824629545211792, "grad_norm": 0.6097201100838772, "learning_rate": 2.936084210526316e-05, "loss": 0.3327, "mean_copy_accuracy": 0.988875687122345, "mean_gen_accuracy": 0.8593353033065796, "mean_token_accuracy": 0.8924538940191269, "num_tokens": 335980457.0, "sample_num_tokens": 9654.75, "step": 1242, "total_num_tokens": 336019076.0, "z_loss": 0.0012810224434360862 }, { "copy_logits_max": 1.2521631717681885, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.6875, "epoch": 0.2538677559356651, "gen_logits_max": 8.791656494140625, "gen_logits_mean": -9.650404930114746, "gen_logits_min": -21.944217681884766, "gen_logits_std": 2.5231313705444336, "gen_loss": 0.36253243684768677, "grad_norm": 0.5125558209907107, "learning_rate": 2.935957894736842e-05, "loss": 0.3484, "mean_copy_accuracy": 0.9915789812803268, "mean_gen_accuracy": 0.8531651049852371, "mean_token_accuracy": 0.8885546773672104, "num_tokens": 336257025.0, "sample_num_tokens": 7923.25, "step": 1243, "total_num_tokens": 336288718.0, "z_loss": 0.0015538339503109455 }, { "copy_logits_max": -1.9532840251922607, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.3125, "epoch": 0.2540719938728619, "gen_logits_max": 7.663420677185059, "gen_logits_mean": -11.10966682434082, "gen_logits_min": -22.728809356689453, "gen_logits_std": 2.4379477500915527, "gen_loss": 0.336370587348938, "grad_norm": 0.5312622379513964, "learning_rate": 2.9358315789473685e-05, "loss": 0.3667, "mean_copy_accuracy": 0.9904698133468628, "mean_gen_accuracy": 0.8489863276481628, "mean_token_accuracy": 0.8818506598472595, "num_tokens": 336516989.0, "sample_num_tokens": 8374.75, "step": 1244, "total_num_tokens": 336550488.0, "z_loss": 0.0012477452401071787 }, { "copy_logits_max": -0.558067262172699, "copy_logits_min": -750000000.0, "copy_num_tokens": 740.125, "epoch": 0.2542762318100587, "gen_logits_max": 8.075736999511719, "gen_logits_mean": -10.322503089904785, "gen_logits_min": -22.52289581298828, "gen_logits_std": 2.5236427783966064, "gen_loss": 0.3241819739341736, "grad_norm": 0.552204322756687, "learning_rate": 2.9357052631578946e-05, "loss": 0.3398, "mean_copy_accuracy": 0.9898636043071747, "mean_gen_accuracy": 0.858199879527092, "mean_token_accuracy": 0.8902370631694794, "num_tokens": 336801884.0, "sample_num_tokens": 10516.0, "step": 1245, "total_num_tokens": 336843948.0, "z_loss": 0.0014566038735210896 }, { "copy_logits_max": -2.3684802055358887, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5, "epoch": 0.2544804697472556, "gen_logits_max": 9.618882179260254, "gen_logits_mean": -10.301433563232422, "gen_logits_min": -21.795215606689453, "gen_logits_std": 2.443058490753174, "gen_loss": 0.35768628120422363, "grad_norm": 0.4926062279179519, "learning_rate": 2.935578947368421e-05, "loss": 0.3468, "mean_copy_accuracy": 0.9897726327180862, "mean_gen_accuracy": 0.8567953109741211, "mean_token_accuracy": 0.8898663818836212, "num_tokens": 337099389.0, "sample_num_tokens": 8849.75, "step": 1246, "total_num_tokens": 337134788.0, "z_loss": 0.001331612467765808 }, { "copy_logits_max": -1.4865508079528809, "copy_logits_min": -625000064.0, "copy_num_tokens": 511.375, "epoch": 0.2546847076844524, "gen_logits_max": 9.154953002929688, "gen_logits_mean": -10.047682762145996, "gen_logits_min": -21.827762603759766, "gen_logits_std": 2.459808826446533, "gen_loss": 0.350929319858551, "grad_norm": 0.5463928353992757, "learning_rate": 2.9354526315789474e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9912146478891373, "mean_gen_accuracy": 0.8492912501096725, "mean_token_accuracy": 0.8874301910400391, "num_tokens": 337380346.0, "sample_num_tokens": 8721.5, "step": 1247, "total_num_tokens": 337415232.0, "z_loss": 0.0014972805511206388 }, { "copy_logits_max": -3.903332233428955, "copy_logits_min": -750000000.0, "copy_num_tokens": 280.5, "epoch": 0.2548889456216492, "gen_logits_max": 8.350251197814941, "gen_logits_mean": -10.623466491699219, "gen_logits_min": -22.180131912231445, "gen_logits_std": 2.423842191696167, "gen_loss": 0.3289182484149933, "grad_norm": 0.488267412160818, "learning_rate": 2.935326315789474e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9910511672496796, "mean_gen_accuracy": 0.8624320030212402, "mean_token_accuracy": 0.8922707289457321, "num_tokens": 337657934.0, "sample_num_tokens": 7701.5, "step": 1248, "total_num_tokens": 337688740.0, "z_loss": 0.00123180216178298 }, { "copy_logits_max": -0.2667417526245117, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.25, "epoch": 0.2550931835588461, "gen_logits_max": 8.683700561523438, "gen_logits_mean": -9.883768081665039, "gen_logits_min": -21.95907211303711, "gen_logits_std": 2.5048608779907227, "gen_loss": 0.28376591205596924, "grad_norm": 0.6479880115163639, "learning_rate": 2.9352000000000003e-05, "loss": 0.3395, "mean_copy_accuracy": 0.9896877110004425, "mean_gen_accuracy": 0.8539343178272247, "mean_token_accuracy": 0.8902439475059509, "num_tokens": 337921030.0, "sample_num_tokens": 9210.0, "step": 1249, "total_num_tokens": 337957870.0, "z_loss": 0.0017042640829458833 }, { "copy_logits_max": -4.094449043273926, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.375, "epoch": 0.2552974214960429, "gen_logits_max": 8.769659996032715, "gen_logits_mean": -10.606693267822266, "gen_logits_min": -22.317293167114258, "gen_logits_std": 2.4495794773101807, "gen_loss": 0.36401045322418213, "grad_norm": 0.5756273364698867, "learning_rate": 2.9350736842105264e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9879839271306992, "mean_gen_accuracy": 0.8584937155246735, "mean_token_accuracy": 0.8877121359109879, "num_tokens": 338196987.0, "sample_num_tokens": 7600.25, "step": 1250, "total_num_tokens": 338227388.0, "z_loss": 0.0013087575789541006 }, { "copy_logits_max": -3.6282105445861816, "copy_logits_min": -750000064.0, "copy_num_tokens": 337.5625, "epoch": 0.2555016594332397, "gen_logits_max": 9.003996849060059, "gen_logits_mean": -9.53232479095459, "gen_logits_min": -21.217636108398438, "gen_logits_std": 2.473468542098999, "gen_loss": 0.3705185055732727, "grad_norm": 0.5431628267741506, "learning_rate": 2.9349473684210528e-05, "loss": 0.357, "mean_copy_accuracy": 0.989558219909668, "mean_gen_accuracy": 0.848390057682991, "mean_token_accuracy": 0.8847565948963165, "num_tokens": 338458989.0, "sample_num_tokens": 7642.75, "step": 1251, "total_num_tokens": 338489560.0, "z_loss": 0.0013402223121374846 }, { "copy_logits_max": 1.0506857633590698, "copy_logits_min": -750000064.0, "copy_num_tokens": 583.8125, "epoch": 0.2557058973704366, "gen_logits_max": 8.736230850219727, "gen_logits_mean": -10.224312782287598, "gen_logits_min": -22.15920639038086, "gen_logits_std": 2.518016815185547, "gen_loss": 0.3729802370071411, "grad_norm": 0.5692260450657749, "learning_rate": 2.934821052631579e-05, "loss": 0.366, "mean_copy_accuracy": 0.991378664970398, "mean_gen_accuracy": 0.8430958092212677, "mean_token_accuracy": 0.8816457539796829, "num_tokens": 338708263.0, "sample_num_tokens": 8510.75, "step": 1252, "total_num_tokens": 338742306.0, "z_loss": 0.0016582420794293284 }, { "copy_logits_max": -2.268350124359131, "copy_logits_min": -750000000.0, "copy_num_tokens": 234.0, "epoch": 0.2559101353076334, "gen_logits_max": 9.695537567138672, "gen_logits_mean": -9.773713111877441, "gen_logits_min": -21.056753158569336, "gen_logits_std": 2.450134038925171, "gen_loss": 0.43276992440223694, "grad_norm": 0.5475346741234631, "learning_rate": 2.9346947368421053e-05, "loss": 0.3487, "mean_copy_accuracy": 0.9918920993804932, "mean_gen_accuracy": 0.8496358543634415, "mean_token_accuracy": 0.8849081844091415, "num_tokens": 338971985.0, "sample_num_tokens": 6896.25, "step": 1253, "total_num_tokens": 338999570.0, "z_loss": 0.0015366575680673122 }, { "copy_logits_max": -0.7545501589775085, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.5625, "epoch": 0.2561143732448302, "gen_logits_max": 8.85049819946289, "gen_logits_mean": -10.61404800415039, "gen_logits_min": -22.718149185180664, "gen_logits_std": 2.5217039585113525, "gen_loss": 0.35930880904197693, "grad_norm": 0.4757654605780034, "learning_rate": 2.9345684210526314e-05, "loss": 0.329, "mean_copy_accuracy": 0.9904337823390961, "mean_gen_accuracy": 0.8571558147668839, "mean_token_accuracy": 0.8921160250902176, "num_tokens": 339247630.0, "sample_num_tokens": 9383.0, "step": 1254, "total_num_tokens": 339285162.0, "z_loss": 0.001450094860047102 }, { "copy_logits_max": -2.9058823585510254, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.3125, "epoch": 0.25631861118202703, "gen_logits_max": 8.652676582336426, "gen_logits_mean": -11.347221374511719, "gen_logits_min": -22.95269775390625, "gen_logits_std": 2.447535514831543, "gen_loss": 0.38769465684890747, "grad_norm": 0.5530580818368598, "learning_rate": 2.9344421052631582e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9898999780416489, "mean_gen_accuracy": 0.8578883409500122, "mean_token_accuracy": 0.8902306407690048, "num_tokens": 339534481.0, "sample_num_tokens": 7226.75, "step": 1255, "total_num_tokens": 339563388.0, "z_loss": 0.0013655750080943108 }, { "copy_logits_max": 0.6608033180236816, "copy_logits_min": -750000000.0, "copy_num_tokens": 668.5625, "epoch": 0.2565228491192239, "gen_logits_max": 8.114683151245117, "gen_logits_mean": -10.582286834716797, "gen_logits_min": -22.61012840270996, "gen_logits_std": 2.5247292518615723, "gen_loss": 0.3195381164550781, "grad_norm": 0.5274941534317535, "learning_rate": 2.9343157894736843e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9895710349082947, "mean_gen_accuracy": 0.8570642322301865, "mean_token_accuracy": 0.8871098756790161, "num_tokens": 339802000.0, "sample_num_tokens": 9790.0, "step": 1256, "total_num_tokens": 339841160.0, "z_loss": 0.0014159699203446507 }, { "copy_logits_max": -1.9862394332885742, "copy_logits_min": -750000064.0, "copy_num_tokens": 506.6875, "epoch": 0.2567270870564207, "gen_logits_max": 9.022708892822266, "gen_logits_mean": -10.54501724243164, "gen_logits_min": -22.387176513671875, "gen_logits_std": 2.4940388202667236, "gen_loss": 0.3116583824157715, "grad_norm": 0.47996819263554313, "learning_rate": 2.9341894736842107e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9907245337963104, "mean_gen_accuracy": 0.8546176254749298, "mean_token_accuracy": 0.8865983635187149, "num_tokens": 340076027.0, "sample_num_tokens": 8771.75, "step": 1257, "total_num_tokens": 340111114.0, "z_loss": 0.0014242705656215549 }, { "copy_logits_max": -0.3644520044326782, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.375, "epoch": 0.25693132499361754, "gen_logits_max": 9.115638732910156, "gen_logits_mean": -9.595440864562988, "gen_logits_min": -21.656021118164062, "gen_logits_std": 2.5514073371887207, "gen_loss": 0.31241607666015625, "grad_norm": 0.5296404656727384, "learning_rate": 2.9340631578947368e-05, "loss": 0.3317, "mean_copy_accuracy": 0.9904604405164719, "mean_gen_accuracy": 0.8585629016160965, "mean_token_accuracy": 0.8907400667667389, "num_tokens": 340338621.0, "sample_num_tokens": 7020.25, "step": 1258, "total_num_tokens": 340366702.0, "z_loss": 0.0015465167816728354 }, { "copy_logits_max": -0.7371699810028076, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.75, "epoch": 0.2571355629308144, "gen_logits_max": 7.718568801879883, "gen_logits_mean": -12.27833366394043, "gen_logits_min": -24.078731536865234, "gen_logits_std": 2.4623827934265137, "gen_loss": 0.3124096691608429, "grad_norm": 0.5421074774137702, "learning_rate": 2.9339368421052632e-05, "loss": 0.3579, "mean_copy_accuracy": 0.9915327280759811, "mean_gen_accuracy": 0.8513299822807312, "mean_token_accuracy": 0.8848194926977158, "num_tokens": 340589015.0, "sample_num_tokens": 9043.75, "step": 1259, "total_num_tokens": 340625190.0, "z_loss": 0.0014345680829137564 }, { "copy_logits_max": -1.5717790126800537, "copy_logits_min": -750000128.0, "copy_num_tokens": 547.9375, "epoch": 0.2573398008680112, "gen_logits_max": 8.114248275756836, "gen_logits_mean": -10.291653633117676, "gen_logits_min": -22.130233764648438, "gen_logits_std": 2.5008387565612793, "gen_loss": 0.3155955970287323, "grad_norm": 0.5072783089593159, "learning_rate": 2.9338105263157893e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9901819676160812, "mean_gen_accuracy": 0.858291432261467, "mean_token_accuracy": 0.890692800283432, "num_tokens": 340873058.0, "sample_num_tokens": 8813.0, "step": 1260, "total_num_tokens": 340908310.0, "z_loss": 0.00136895093601197 }, { "copy_logits_max": -1.6865791082382202, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.25, "epoch": 0.25754403880520804, "gen_logits_max": 9.094703674316406, "gen_logits_mean": -9.685476303100586, "gen_logits_min": -21.841413497924805, "gen_logits_std": 2.5356857776641846, "gen_loss": 0.36032986640930176, "grad_norm": 0.4975038138010467, "learning_rate": 2.9336842105263158e-05, "loss": 0.3421, "mean_copy_accuracy": 0.9906090199947357, "mean_gen_accuracy": 0.8538264781236649, "mean_token_accuracy": 0.8877014517784119, "num_tokens": 341142138.0, "sample_num_tokens": 8060.0, "step": 1261, "total_num_tokens": 341174378.0, "z_loss": 0.001526382751762867 }, { "copy_logits_max": -1.526305913925171, "copy_logits_min": -687500032.0, "copy_num_tokens": 339.625, "epoch": 0.2577482767424049, "gen_logits_max": 8.819026947021484, "gen_logits_mean": -11.158184051513672, "gen_logits_min": -23.114681243896484, "gen_logits_std": 2.487056255340576, "gen_loss": 0.3355846405029297, "grad_norm": 0.5667045821207627, "learning_rate": 2.9335578947368422e-05, "loss": 0.354, "mean_copy_accuracy": 0.9912243634462357, "mean_gen_accuracy": 0.851428359746933, "mean_token_accuracy": 0.8853438943624496, "num_tokens": 341431090.0, "sample_num_tokens": 6921.5, "step": 1262, "total_num_tokens": 341458776.0, "z_loss": 0.0013444744981825352 }, { "copy_logits_max": -2.079615592956543, "copy_logits_min": -687500032.0, "copy_num_tokens": 263.3125, "epoch": 0.25795251467960173, "gen_logits_max": 9.315074920654297, "gen_logits_mean": -10.479820251464844, "gen_logits_min": -21.903369903564453, "gen_logits_std": 2.4502885341644287, "gen_loss": 0.39539986848831177, "grad_norm": 0.4981772375915588, "learning_rate": 2.9334315789473686e-05, "loss": 0.3754, "mean_copy_accuracy": 0.9899294823408127, "mean_gen_accuracy": 0.8495922535657883, "mean_token_accuracy": 0.8793090581893921, "num_tokens": 341682755.0, "sample_num_tokens": 7631.75, "step": 1263, "total_num_tokens": 341713282.0, "z_loss": 0.001313815708272159 }, { "copy_logits_max": -1.673975944519043, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.3125, "epoch": 0.25815675261679855, "gen_logits_max": 8.741473197937012, "gen_logits_mean": -10.279587745666504, "gen_logits_min": -21.569543838500977, "gen_logits_std": 2.4326884746551514, "gen_loss": 0.36271530389785767, "grad_norm": 0.6183968596310413, "learning_rate": 2.933305263157895e-05, "loss": 0.3637, "mean_copy_accuracy": 0.9884715974330902, "mean_gen_accuracy": 0.8570951521396637, "mean_token_accuracy": 0.8831650465726852, "num_tokens": 341929502.0, "sample_num_tokens": 7962.0, "step": 1264, "total_num_tokens": 341961350.0, "z_loss": 0.001256289193406701 }, { "copy_logits_max": -0.3245754837989807, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.25, "epoch": 0.2583609905539954, "gen_logits_max": 8.909134864807129, "gen_logits_mean": -10.094267845153809, "gen_logits_min": -21.305423736572266, "gen_logits_std": 2.4127302169799805, "gen_loss": 0.3470740020275116, "grad_norm": 0.5211505136028061, "learning_rate": 2.933178947368421e-05, "loss": 0.3509, "mean_copy_accuracy": 0.9891892820596695, "mean_gen_accuracy": 0.8554661571979523, "mean_token_accuracy": 0.8843524307012558, "num_tokens": 342191536.0, "sample_num_tokens": 7972.0, "step": 1265, "total_num_tokens": 342223424.0, "z_loss": 0.0012842449359595776 }, { "copy_logits_max": -4.46385383605957, "copy_logits_min": -625000000.0, "copy_num_tokens": 193.75, "epoch": 0.25856522849119223, "gen_logits_max": 10.18271255493164, "gen_logits_mean": -9.69512939453125, "gen_logits_min": -20.950735092163086, "gen_logits_std": 2.407559394836426, "gen_loss": 0.37609851360321045, "grad_norm": 0.5423349510700105, "learning_rate": 2.9330526315789476e-05, "loss": 0.373, "mean_copy_accuracy": 0.9900853037834167, "mean_gen_accuracy": 0.8530961275100708, "mean_token_accuracy": 0.8783201575279236, "num_tokens": 342421982.0, "sample_num_tokens": 6911.5, "step": 1266, "total_num_tokens": 342449628.0, "z_loss": 0.001250583678483963 }, { "copy_logits_max": -0.750485897064209, "copy_logits_min": -687500032.0, "copy_num_tokens": 728.9375, "epoch": 0.25876946642838905, "gen_logits_max": 8.947441101074219, "gen_logits_mean": -9.159770965576172, "gen_logits_min": -20.991527557373047, "gen_logits_std": 2.485761880874634, "gen_loss": 0.29789143800735474, "grad_norm": 0.5066664564018519, "learning_rate": 2.9329263157894737e-05, "loss": 0.3238, "mean_copy_accuracy": 0.991065725684166, "mean_gen_accuracy": 0.8591307401657104, "mean_token_accuracy": 0.8948757648468018, "num_tokens": 342703160.0, "sample_num_tokens": 9903.5, "step": 1267, "total_num_tokens": 342742774.0, "z_loss": 0.00147646211553365 }, { "copy_logits_max": -1.580587387084961, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.5625, "epoch": 0.2589737043655859, "gen_logits_max": 8.547988891601562, "gen_logits_mean": -10.451091766357422, "gen_logits_min": -22.156343460083008, "gen_logits_std": 2.4112000465393066, "gen_loss": 0.3197796940803528, "grad_norm": 0.5913585753592376, "learning_rate": 2.9328e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9903391152620316, "mean_gen_accuracy": 0.8570323884487152, "mean_token_accuracy": 0.8903877586126328, "num_tokens": 342995300.0, "sample_num_tokens": 10069.0, "step": 1268, "total_num_tokens": 343035576.0, "z_loss": 0.0012928526848554611 }, { "copy_logits_max": -0.17972174286842346, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.0625, "epoch": 0.25917794230278274, "gen_logits_max": 10.0159330368042, "gen_logits_mean": -8.702610969543457, "gen_logits_min": -20.775972366333008, "gen_logits_std": 2.484027147293091, "gen_loss": 0.3356739282608032, "grad_norm": 0.5533754670380145, "learning_rate": 2.9326736842105262e-05, "loss": 0.3665, "mean_copy_accuracy": 0.9883337020874023, "mean_gen_accuracy": 0.8518776297569275, "mean_token_accuracy": 0.8859833329916, "num_tokens": 343273189.0, "sample_num_tokens": 8863.75, "step": 1269, "total_num_tokens": 343308644.0, "z_loss": 0.001561000943183899 }, { "copy_logits_max": -1.2990341186523438, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.75, "epoch": 0.25938218023997955, "gen_logits_max": 8.937437057495117, "gen_logits_mean": -10.438899993896484, "gen_logits_min": -22.28351593017578, "gen_logits_std": 2.404265880584717, "gen_loss": 0.37286680936813354, "grad_norm": 0.5836080685926567, "learning_rate": 2.9325473684210526e-05, "loss": 0.349, "mean_copy_accuracy": 0.9906797856092453, "mean_gen_accuracy": 0.8547509163618088, "mean_token_accuracy": 0.8865207731723785, "num_tokens": 343542672.0, "sample_num_tokens": 8874.0, "step": 1270, "total_num_tokens": 343578168.0, "z_loss": 0.0014705169014632702 }, { "copy_logits_max": -0.03513902425765991, "copy_logits_min": -750000000.0, "copy_num_tokens": 819.6875, "epoch": 0.2595864181771764, "gen_logits_max": 7.926612854003906, "gen_logits_mean": -9.948660850524902, "gen_logits_min": -22.018295288085938, "gen_logits_std": 2.4791181087493896, "gen_loss": 0.28357797861099243, "grad_norm": 0.5784895590241359, "learning_rate": 2.9324210526315787e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9892863035202026, "mean_gen_accuracy": 0.8562241047620773, "mean_token_accuracy": 0.8941575884819031, "num_tokens": 343815404.0, "sample_num_tokens": 10540.5, "step": 1271, "total_num_tokens": 343857566.0, "z_loss": 0.0016122168162837625 }, { "copy_logits_max": -0.9657160043716431, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.375, "epoch": 0.25979065611437324, "gen_logits_max": 9.107934951782227, "gen_logits_mean": -10.107463836669922, "gen_logits_min": -22.48019027709961, "gen_logits_std": 2.484414577484131, "gen_loss": 0.2983579635620117, "grad_norm": 0.4936054331522937, "learning_rate": 2.9322947368421055e-05, "loss": 0.3433, "mean_copy_accuracy": 0.9915434867143631, "mean_gen_accuracy": 0.8499195128679276, "mean_token_accuracy": 0.8895975351333618, "num_tokens": 344116779.0, "sample_num_tokens": 9890.75, "step": 1272, "total_num_tokens": 344156342.0, "z_loss": 0.0013555781915783882 }, { "copy_logits_max": -1.1409879922866821, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.5, "epoch": 0.25999489405157006, "gen_logits_max": 8.445188522338867, "gen_logits_mean": -10.465400695800781, "gen_logits_min": -21.574054718017578, "gen_logits_std": 2.3927202224731445, "gen_loss": 0.3400114178657532, "grad_norm": 0.4847376271872097, "learning_rate": 2.9321684210526316e-05, "loss": 0.3453, "mean_copy_accuracy": 0.99378402531147, "mean_gen_accuracy": 0.8539588004350662, "mean_token_accuracy": 0.8879261910915375, "num_tokens": 344408491.0, "sample_num_tokens": 8488.75, "step": 1273, "total_num_tokens": 344442446.0, "z_loss": 0.001431592390872538 }, { "copy_logits_max": -1.6017584800720215, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.375, "epoch": 0.26019913198876693, "gen_logits_max": 7.421926975250244, "gen_logits_mean": -10.786248207092285, "gen_logits_min": -22.44802474975586, "gen_logits_std": 2.45721173286438, "gen_loss": 0.3412407338619232, "grad_norm": 0.5819670623041672, "learning_rate": 2.932042105263158e-05, "loss": 0.3422, "mean_copy_accuracy": 0.9912324994802475, "mean_gen_accuracy": 0.8551365286111832, "mean_token_accuracy": 0.8865867406129837, "num_tokens": 344678451.0, "sample_num_tokens": 7899.25, "step": 1274, "total_num_tokens": 344710048.0, "z_loss": 0.001274164766073227 }, { "copy_logits_max": -1.7098554372787476, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.8125, "epoch": 0.26040336992596375, "gen_logits_max": 9.062127113342285, "gen_logits_mean": -10.263033866882324, "gen_logits_min": -22.44330406188965, "gen_logits_std": 2.473623752593994, "gen_loss": 0.33703047037124634, "grad_norm": 0.5805961497716117, "learning_rate": 2.9319157894736844e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9875029623508453, "mean_gen_accuracy": 0.8603373467922211, "mean_token_accuracy": 0.8942295461893082, "num_tokens": 344945269.0, "sample_num_tokens": 8142.25, "step": 1275, "total_num_tokens": 344977838.0, "z_loss": 0.0012549214297905564 }, { "copy_logits_max": -1.021817684173584, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.375, "epoch": 0.26060760786316056, "gen_logits_max": 8.240222930908203, "gen_logits_mean": -9.860945701599121, "gen_logits_min": -21.46932601928711, "gen_logits_std": 2.425787925720215, "gen_loss": 0.3696848154067993, "grad_norm": 0.5504475128962654, "learning_rate": 2.9317894736842105e-05, "loss": 0.3739, "mean_copy_accuracy": 0.9894202351570129, "mean_gen_accuracy": 0.8476530462503433, "mean_token_accuracy": 0.880776584148407, "num_tokens": 345196027.0, "sample_num_tokens": 8714.25, "step": 1276, "total_num_tokens": 345230884.0, "z_loss": 0.0014373963931575418 }, { "copy_logits_max": 0.1281641721725464, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.0, "epoch": 0.26081184580035743, "gen_logits_max": 8.992385864257812, "gen_logits_mean": -9.777536392211914, "gen_logits_min": -21.783584594726562, "gen_logits_std": 2.5265188217163086, "gen_loss": 0.379278302192688, "grad_norm": 0.5226923123092887, "learning_rate": 2.931663157894737e-05, "loss": 0.3423, "mean_copy_accuracy": 0.9909473061561584, "mean_gen_accuracy": 0.8594307154417038, "mean_token_accuracy": 0.8886268138885498, "num_tokens": 345462891.0, "sample_num_tokens": 7846.75, "step": 1277, "total_num_tokens": 345494278.0, "z_loss": 0.00151577009819448 }, { "copy_logits_max": -1.425168752670288, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.375, "epoch": 0.26101608373755425, "gen_logits_max": 8.414921760559082, "gen_logits_mean": -11.449679374694824, "gen_logits_min": -22.83077049255371, "gen_logits_std": 2.408491849899292, "gen_loss": 0.3898943364620209, "grad_norm": 0.5278771731895295, "learning_rate": 2.931536842105263e-05, "loss": 0.332, "mean_copy_accuracy": 0.9915996044874191, "mean_gen_accuracy": 0.864647701382637, "mean_token_accuracy": 0.8925400078296661, "num_tokens": 345711594.0, "sample_num_tokens": 7172.0, "step": 1278, "total_num_tokens": 345740282.0, "z_loss": 0.0013833499979227781 }, { "copy_logits_max": -0.7966477870941162, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.125, "epoch": 0.26122032167475107, "gen_logits_max": 9.332799911499023, "gen_logits_mean": -9.418977737426758, "gen_logits_min": -20.90411949157715, "gen_logits_std": 2.430309295654297, "gen_loss": 0.3219297230243683, "grad_norm": 0.49893116627459366, "learning_rate": 2.9314105263157895e-05, "loss": 0.3563, "mean_copy_accuracy": 0.9898672699928284, "mean_gen_accuracy": 0.850836843252182, "mean_token_accuracy": 0.884411409497261, "num_tokens": 345971529.0, "sample_num_tokens": 8904.75, "step": 1279, "total_num_tokens": 346007148.0, "z_loss": 0.0014285201905295253 }, { "copy_logits_max": 0.33219146728515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.3125, "epoch": 0.26142455961194794, "gen_logits_max": 8.945022583007812, "gen_logits_mean": -10.078398704528809, "gen_logits_min": -22.606414794921875, "gen_logits_std": 2.5686564445495605, "gen_loss": 0.39323699474334717, "grad_norm": 0.7143161130747022, "learning_rate": 2.931284210526316e-05, "loss": 0.3749, "mean_copy_accuracy": 0.9892767518758774, "mean_gen_accuracy": 0.8438528478145599, "mean_token_accuracy": 0.8789142668247223, "num_tokens": 346258013.0, "sample_num_tokens": 7523.75, "step": 1280, "total_num_tokens": 346288108.0, "z_loss": 0.0015341058606281877 }, { "copy_logits_max": -0.7522004842758179, "copy_logits_min": -750000064.0, "copy_num_tokens": 343.25, "epoch": 0.26162879754914475, "gen_logits_max": 9.835612297058105, "gen_logits_mean": -8.989564895629883, "gen_logits_min": -20.50368881225586, "gen_logits_std": 2.42501163482666, "gen_loss": 0.35623374581336975, "grad_norm": 0.49284435790576603, "learning_rate": 2.9311578947368424e-05, "loss": 0.3462, "mean_copy_accuracy": 0.9919488281011581, "mean_gen_accuracy": 0.858154833316803, "mean_token_accuracy": 0.8868456780910492, "num_tokens": 346517195.0, "sample_num_tokens": 7630.75, "step": 1281, "total_num_tokens": 346547718.0, "z_loss": 0.001423774752765894 }, { "copy_logits_max": -0.8979578614234924, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.5625, "epoch": 0.26183303548634157, "gen_logits_max": 8.813703536987305, "gen_logits_mean": -10.498379707336426, "gen_logits_min": -22.286161422729492, "gen_logits_std": 2.460409164428711, "gen_loss": 0.37243497371673584, "grad_norm": 0.7934662750391476, "learning_rate": 2.9310315789473685e-05, "loss": 0.3763, "mean_copy_accuracy": 0.9929408878087997, "mean_gen_accuracy": 0.8501400649547577, "mean_token_accuracy": 0.8783742785453796, "num_tokens": 346775639.0, "sample_num_tokens": 7668.75, "step": 1282, "total_num_tokens": 346806314.0, "z_loss": 0.0012772271875292063 }, { "copy_logits_max": -1.9499807357788086, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.4375, "epoch": 0.26203727342353844, "gen_logits_max": 9.253683090209961, "gen_logits_mean": -10.121830940246582, "gen_logits_min": -21.614864349365234, "gen_logits_std": 2.4371566772460938, "gen_loss": 0.35165131092071533, "grad_norm": 0.5594784190333508, "learning_rate": 2.930905263157895e-05, "loss": 0.3555, "mean_copy_accuracy": 0.9934504181146622, "mean_gen_accuracy": 0.8464294821023941, "mean_token_accuracy": 0.8847556412220001, "num_tokens": 347037126.0, "sample_num_tokens": 8569.0, "step": 1283, "total_num_tokens": 347071402.0, "z_loss": 0.0013475490268319845 }, { "copy_logits_max": -2.200131416320801, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.9375, "epoch": 0.26224151136073526, "gen_logits_max": 9.011838912963867, "gen_logits_mean": -10.179178237915039, "gen_logits_min": -21.436016082763672, "gen_logits_std": 2.384723663330078, "gen_loss": 0.35099440813064575, "grad_norm": 0.6926849809975975, "learning_rate": 2.930778947368421e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9917585104703903, "mean_gen_accuracy": 0.8614620864391327, "mean_token_accuracy": 0.8925963640213013, "num_tokens": 347308397.0, "sample_num_tokens": 9227.75, "step": 1284, "total_num_tokens": 347345308.0, "z_loss": 0.0012649837881326675 }, { "copy_logits_max": 0.47390758991241455, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.25, "epoch": 0.2624457492979321, "gen_logits_max": 8.51936149597168, "gen_logits_mean": -11.500499725341797, "gen_logits_min": -23.3228759765625, "gen_logits_std": 2.471177101135254, "gen_loss": 0.3651232123374939, "grad_norm": 0.5077355711677043, "learning_rate": 2.9306526315789474e-05, "loss": 0.3635, "mean_copy_accuracy": 0.9925041049718857, "mean_gen_accuracy": 0.8528848439455032, "mean_token_accuracy": 0.8821309953927994, "num_tokens": 347578419.0, "sample_num_tokens": 8405.25, "step": 1285, "total_num_tokens": 347612040.0, "z_loss": 0.0016555492766201496 }, { "copy_logits_max": 1.9693503379821777, "copy_logits_min": -687500032.0, "copy_num_tokens": 616.0625, "epoch": 0.26264998723512895, "gen_logits_max": 8.643163681030273, "gen_logits_mean": -10.54373550415039, "gen_logits_min": -22.52279281616211, "gen_logits_std": 2.4358363151550293, "gen_loss": 0.2971872389316559, "grad_norm": 0.544082488356879, "learning_rate": 2.9305263157894735e-05, "loss": 0.3409, "mean_copy_accuracy": 0.9914616048336029, "mean_gen_accuracy": 0.8565185070037842, "mean_token_accuracy": 0.8906971216201782, "num_tokens": 347845368.0, "sample_num_tokens": 9343.5, "step": 1286, "total_num_tokens": 347882742.0, "z_loss": 0.0021142871119081974 }, { "copy_logits_max": 2.144711494445801, "copy_logits_min": -750000000.0, "copy_num_tokens": 640.5625, "epoch": 0.26285422517232576, "gen_logits_max": 8.431100845336914, "gen_logits_mean": -10.062246322631836, "gen_logits_min": -21.705286026000977, "gen_logits_std": 2.3976383209228516, "gen_loss": 0.29443538188934326, "grad_norm": 0.5673086790327759, "learning_rate": 2.9304e-05, "loss": 0.3232, "mean_copy_accuracy": 0.9910683631896973, "mean_gen_accuracy": 0.8595211505889893, "mean_token_accuracy": 0.8943373262882233, "num_tokens": 348141604.0, "sample_num_tokens": 10191.0, "step": 1287, "total_num_tokens": 348182368.0, "z_loss": 0.002105526626110077 }, { "copy_logits_max": 1.0895899534225464, "copy_logits_min": -625000064.0, "copy_num_tokens": 585.625, "epoch": 0.2630584631095226, "gen_logits_max": 8.686508178710938, "gen_logits_mean": -9.784475326538086, "gen_logits_min": -21.840412139892578, "gen_logits_std": 2.463778495788574, "gen_loss": 0.3212001919746399, "grad_norm": 0.49073316965351665, "learning_rate": 2.9302736842105267e-05, "loss": 0.322, "mean_copy_accuracy": 0.9921620488166809, "mean_gen_accuracy": 0.8583012223243713, "mean_token_accuracy": 0.8947610408067703, "num_tokens": 348441285.0, "sample_num_tokens": 8954.75, "step": 1288, "total_num_tokens": 348477104.0, "z_loss": 0.002252396661788225 }, { "copy_logits_max": -1.2561256885528564, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.375, "epoch": 0.26326270104671945, "gen_logits_max": 9.395689010620117, "gen_logits_mean": -10.210959434509277, "gen_logits_min": -21.566246032714844, "gen_logits_std": 2.3902130126953125, "gen_loss": 0.3825627267360687, "grad_norm": 0.5815991034845222, "learning_rate": 2.9301473684210528e-05, "loss": 0.3515, "mean_copy_accuracy": 0.9881114065647125, "mean_gen_accuracy": 0.8527508527040482, "mean_token_accuracy": 0.8856132924556732, "num_tokens": 348706154.0, "sample_num_tokens": 8005.0, "step": 1289, "total_num_tokens": 348738174.0, "z_loss": 0.001817054464481771 }, { "copy_logits_max": 0.0017116069793701172, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.875, "epoch": 0.26346693898391627, "gen_logits_max": 7.5416035652160645, "gen_logits_mean": -11.329634666442871, "gen_logits_min": -22.840190887451172, "gen_logits_std": 2.3964667320251465, "gen_loss": 0.2873976230621338, "grad_norm": 0.5098007985325229, "learning_rate": 2.9300210526315792e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9896837025880814, "mean_gen_accuracy": 0.8646250367164612, "mean_token_accuracy": 0.8946997821331024, "num_tokens": 348959031.0, "sample_num_tokens": 8847.75, "step": 1290, "total_num_tokens": 348994422.0, "z_loss": 0.0016984569374471903 }, { "copy_logits_max": -1.6262603998184204, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.5, "epoch": 0.2636711769211131, "gen_logits_max": 8.785577774047852, "gen_logits_mean": -9.590190887451172, "gen_logits_min": -20.829357147216797, "gen_logits_std": 2.4163599014282227, "gen_loss": 0.3371376395225525, "grad_norm": 0.5214812462430256, "learning_rate": 2.9298947368421053e-05, "loss": 0.3334, "mean_copy_accuracy": 0.992594987154007, "mean_gen_accuracy": 0.860208049416542, "mean_token_accuracy": 0.8933550417423248, "num_tokens": 349237118.0, "sample_num_tokens": 7074.5, "step": 1291, "total_num_tokens": 349265416.0, "z_loss": 0.0015778439119458199 }, { "copy_logits_max": -1.676931381225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.9375, "epoch": 0.26387541485830995, "gen_logits_max": 8.402935028076172, "gen_logits_mean": -10.315465927124023, "gen_logits_min": -21.476078033447266, "gen_logits_std": 2.370645761489868, "gen_loss": 0.3431042432785034, "grad_norm": 0.542410190524912, "learning_rate": 2.9297684210526317e-05, "loss": 0.3444, "mean_copy_accuracy": 0.9898060709238052, "mean_gen_accuracy": 0.8592473417520523, "mean_token_accuracy": 0.8870984613895416, "num_tokens": 349482010.0, "sample_num_tokens": 8642.0, "step": 1292, "total_num_tokens": 349516578.0, "z_loss": 0.0013382502365857363 }, { "copy_logits_max": -2.562957763671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.75, "epoch": 0.26407965279550677, "gen_logits_max": 8.032620429992676, "gen_logits_mean": -10.664787292480469, "gen_logits_min": -22.47998809814453, "gen_logits_std": 2.4127414226531982, "gen_loss": 0.2873639166355133, "grad_norm": 0.46572326643364076, "learning_rate": 2.929642105263158e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9918105751276016, "mean_gen_accuracy": 0.8591875284910202, "mean_token_accuracy": 0.892477884888649, "num_tokens": 349770607.0, "sample_num_tokens": 8725.25, "step": 1293, "total_num_tokens": 349805508.0, "z_loss": 0.001410831231623888 }, { "copy_logits_max": -1.0814406871795654, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.125, "epoch": 0.2642838907327036, "gen_logits_max": 8.248449325561523, "gen_logits_mean": -11.00333309173584, "gen_logits_min": -22.283832550048828, "gen_logits_std": 2.3933591842651367, "gen_loss": 0.35187703371047974, "grad_norm": 0.6650162749358075, "learning_rate": 2.9295157894736843e-05, "loss": 0.3391, "mean_copy_accuracy": 0.9912514239549637, "mean_gen_accuracy": 0.8579907864332199, "mean_token_accuracy": 0.8904783427715302, "num_tokens": 350039124.0, "sample_num_tokens": 7760.0, "step": 1294, "total_num_tokens": 350070164.0, "z_loss": 0.001486883731558919 }, { "copy_logits_max": -0.6699637174606323, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.0625, "epoch": 0.26448812866990046, "gen_logits_max": 8.311120986938477, "gen_logits_mean": -10.482047080993652, "gen_logits_min": -22.192062377929688, "gen_logits_std": 2.41898250579834, "gen_loss": 0.32155841588974, "grad_norm": 0.5295483197084631, "learning_rate": 2.9293894736842104e-05, "loss": 0.3354, "mean_copy_accuracy": 0.991700679063797, "mean_gen_accuracy": 0.8513340204954147, "mean_token_accuracy": 0.8892358243465424, "num_tokens": 350303513.0, "sample_num_tokens": 8954.25, "step": 1295, "total_num_tokens": 350339330.0, "z_loss": 0.0014979710103943944 }, { "copy_logits_max": -1.5415079593658447, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.5625, "epoch": 0.2646923666070973, "gen_logits_max": 8.116668701171875, "gen_logits_mean": -11.03225326538086, "gen_logits_min": -22.296234130859375, "gen_logits_std": 2.379530429840088, "gen_loss": 0.3792388439178467, "grad_norm": 0.6587508501819059, "learning_rate": 2.929263157894737e-05, "loss": 0.3587, "mean_copy_accuracy": 0.9874171912670135, "mean_gen_accuracy": 0.8517027646303177, "mean_token_accuracy": 0.8840499222278595, "num_tokens": 350562118.0, "sample_num_tokens": 7366.0, "step": 1296, "total_num_tokens": 350591582.0, "z_loss": 0.0013002557680010796 }, { "copy_logits_max": -0.11168718338012695, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.75, "epoch": 0.2648966045442941, "gen_logits_max": 8.414436340332031, "gen_logits_mean": -10.720462799072266, "gen_logits_min": -22.82157325744629, "gen_logits_std": 2.5146217346191406, "gen_loss": 0.3521299362182617, "grad_norm": 0.5453187585192429, "learning_rate": 2.9291368421052632e-05, "loss": 0.3491, "mean_copy_accuracy": 0.9916294515132904, "mean_gen_accuracy": 0.8525340706110001, "mean_token_accuracy": 0.8879431337118149, "num_tokens": 350829573.0, "sample_num_tokens": 9057.25, "step": 1297, "total_num_tokens": 350865802.0, "z_loss": 0.0015596742741763592 }, { "copy_logits_max": -1.6168608665466309, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.375, "epoch": 0.26510084248149096, "gen_logits_max": 8.696985244750977, "gen_logits_mean": -10.528475761413574, "gen_logits_min": -22.30899429321289, "gen_logits_std": 2.4649839401245117, "gen_loss": 0.30278605222702026, "grad_norm": 0.5185469742967918, "learning_rate": 2.9290105263157897e-05, "loss": 0.3432, "mean_copy_accuracy": 0.9922567754983902, "mean_gen_accuracy": 0.8567662239074707, "mean_token_accuracy": 0.8880612850189209, "num_tokens": 351086239.0, "sample_num_tokens": 8497.75, "step": 1298, "total_num_tokens": 351120230.0, "z_loss": 0.0013557072961702943 }, { "copy_logits_max": -2.4026808738708496, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.9375, "epoch": 0.2653050804186878, "gen_logits_max": 8.586769104003906, "gen_logits_mean": -9.393860816955566, "gen_logits_min": -20.73422622680664, "gen_logits_std": 2.4131453037261963, "gen_loss": 0.3053819537162781, "grad_norm": 0.47312107352997357, "learning_rate": 2.9288842105263157e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9921193718910217, "mean_gen_accuracy": 0.857823982834816, "mean_token_accuracy": 0.8909232914447784, "num_tokens": 351369558.0, "sample_num_tokens": 8232.0, "step": 1299, "total_num_tokens": 351402486.0, "z_loss": 0.001414319733157754 }, { "copy_logits_max": -1.1292853355407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.6875, "epoch": 0.2655093183558846, "gen_logits_max": 8.62438678741455, "gen_logits_mean": -9.922367095947266, "gen_logits_min": -22.023269653320312, "gen_logits_std": 2.450798988342285, "gen_loss": 0.37454304099082947, "grad_norm": 0.5516465977083536, "learning_rate": 2.9287578947368422e-05, "loss": 0.3668, "mean_copy_accuracy": 0.9897526055574417, "mean_gen_accuracy": 0.8486602306365967, "mean_token_accuracy": 0.88203065097332, "num_tokens": 351634403.0, "sample_num_tokens": 7436.25, "step": 1300, "total_num_tokens": 351664148.0, "z_loss": 0.0016054497100412846 }, { "copy_logits_max": -1.0456578731536865, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.125, "epoch": 0.26571355629308147, "gen_logits_max": 7.494032859802246, "gen_logits_mean": -11.74176025390625, "gen_logits_min": -23.61408233642578, "gen_logits_std": 2.463010549545288, "gen_loss": 0.31096476316452026, "grad_norm": 0.507230119897773, "learning_rate": 2.9286315789473686e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9902258664369583, "mean_gen_accuracy": 0.8623638451099396, "mean_token_accuracy": 0.8911471962928772, "num_tokens": 351892871.0, "sample_num_tokens": 7465.25, "step": 1301, "total_num_tokens": 351922732.0, "z_loss": 0.0014315617736428976 }, { "copy_logits_max": -1.1360903978347778, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.0, "epoch": 0.2659177942302783, "gen_logits_max": 8.390913963317871, "gen_logits_mean": -10.808276176452637, "gen_logits_min": -23.185272216796875, "gen_logits_std": 2.5100183486938477, "gen_loss": 0.3316042423248291, "grad_norm": 0.5616806962054979, "learning_rate": 2.9285052631578947e-05, "loss": 0.3673, "mean_copy_accuracy": 0.9918014407157898, "mean_gen_accuracy": 0.8508569151163101, "mean_token_accuracy": 0.8807939291000366, "num_tokens": 352147281.0, "sample_num_tokens": 8580.25, "step": 1302, "total_num_tokens": 352181602.0, "z_loss": 0.0014086300507187843 }, { "copy_logits_max": -0.6767787933349609, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.1875, "epoch": 0.2661220321674751, "gen_logits_max": 8.162282943725586, "gen_logits_mean": -11.15369987487793, "gen_logits_min": -23.136293411254883, "gen_logits_std": 2.496800184249878, "gen_loss": 0.3590589761734009, "grad_norm": 0.5512832950730298, "learning_rate": 2.928378947368421e-05, "loss": 0.329, "mean_copy_accuracy": 0.9893815666437149, "mean_gen_accuracy": 0.8630286902189255, "mean_token_accuracy": 0.8931962698698044, "num_tokens": 352420836.0, "sample_num_tokens": 7608.5, "step": 1303, "total_num_tokens": 352451270.0, "z_loss": 0.0015107624931260943 }, { "copy_logits_max": -1.7248071432113647, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.9375, "epoch": 0.26632627010467197, "gen_logits_max": 8.905048370361328, "gen_logits_mean": -10.403069496154785, "gen_logits_min": -22.362844467163086, "gen_logits_std": 2.479830503463745, "gen_loss": 0.3760872483253479, "grad_norm": 0.5449801524192037, "learning_rate": 2.9282526315789476e-05, "loss": 0.3394, "mean_copy_accuracy": 0.9898511320352554, "mean_gen_accuracy": 0.8534975051879883, "mean_token_accuracy": 0.8900698721408844, "num_tokens": 352682447.0, "sample_num_tokens": 7616.75, "step": 1304, "total_num_tokens": 352712914.0, "z_loss": 0.0014269406674429774 }, { "copy_logits_max": 1.8297845125198364, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.125, "epoch": 0.2665305080418688, "gen_logits_max": 7.669125556945801, "gen_logits_mean": -11.304698944091797, "gen_logits_min": -23.4969539642334, "gen_logits_std": 2.511587619781494, "gen_loss": 0.33880263566970825, "grad_norm": 0.5082604880309824, "learning_rate": 2.928126315789474e-05, "loss": 0.3356, "mean_copy_accuracy": 0.9919798672199249, "mean_gen_accuracy": 0.8541980087757111, "mean_token_accuracy": 0.8915486484766006, "num_tokens": 352967082.0, "sample_num_tokens": 7834.0, "step": 1305, "total_num_tokens": 352998418.0, "z_loss": 0.0015256137121468782 }, { "copy_logits_max": -0.46927666664123535, "copy_logits_min": -687500096.0, "copy_num_tokens": 496.5, "epoch": 0.2667347459790656, "gen_logits_max": 8.660905838012695, "gen_logits_mean": -10.072649002075195, "gen_logits_min": -22.504337310791016, "gen_logits_std": 2.4978911876678467, "gen_loss": 0.3520966172218323, "grad_norm": 0.4721139490833549, "learning_rate": 2.928e-05, "loss": 0.3412, "mean_copy_accuracy": 0.9938125014305115, "mean_gen_accuracy": 0.8503607660531998, "mean_token_accuracy": 0.8897598534822464, "num_tokens": 353247627.0, "sample_num_tokens": 9029.25, "step": 1306, "total_num_tokens": 353283744.0, "z_loss": 0.0015705630648881197 }, { "copy_logits_max": -3.9993155002593994, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.125, "epoch": 0.2669389839162625, "gen_logits_max": 7.997278213500977, "gen_logits_mean": -12.222871780395508, "gen_logits_min": -23.26952362060547, "gen_logits_std": 2.3732802867889404, "gen_loss": 0.3209129869937897, "grad_norm": 0.4625717475355226, "learning_rate": 2.9278736842105265e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9921280294656754, "mean_gen_accuracy": 0.8646928668022156, "mean_token_accuracy": 0.8944253027439117, "num_tokens": 353538157.0, "sample_num_tokens": 9462.25, "step": 1307, "total_num_tokens": 353576006.0, "z_loss": 0.0011834998149424791 }, { "copy_logits_max": -1.9482427835464478, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.125, "epoch": 0.2671432218534593, "gen_logits_max": 7.9305219650268555, "gen_logits_mean": -11.145071983337402, "gen_logits_min": -22.868701934814453, "gen_logits_std": 2.461595058441162, "gen_loss": 0.319125771522522, "grad_norm": 0.526701060885528, "learning_rate": 2.9277473684210526e-05, "loss": 0.3674, "mean_copy_accuracy": 0.9902036339044571, "mean_gen_accuracy": 0.8518993705511093, "mean_token_accuracy": 0.8806984573602676, "num_tokens": 353803693.0, "sample_num_tokens": 10353.75, "step": 1308, "total_num_tokens": 353845108.0, "z_loss": 0.0012923995964229107 }, { "copy_logits_max": -2.1004583835601807, "copy_logits_min": -750000064.0, "copy_num_tokens": 464.875, "epoch": 0.2673474597906561, "gen_logits_max": 7.758843421936035, "gen_logits_mean": -11.240157127380371, "gen_logits_min": -23.32776641845703, "gen_logits_std": 2.418891668319702, "gen_loss": 0.3358483910560608, "grad_norm": 0.5477471493373494, "learning_rate": 2.927621052631579e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9924726486206055, "mean_gen_accuracy": 0.8606463074684143, "mean_token_accuracy": 0.893407329916954, "num_tokens": 354074715.0, "sample_num_tokens": 8362.75, "step": 1309, "total_num_tokens": 354108166.0, "z_loss": 0.0013605747371912003 }, { "copy_logits_max": -1.7089648246765137, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.1875, "epoch": 0.2675516977278529, "gen_logits_max": 8.214434623718262, "gen_logits_mean": -11.817703247070312, "gen_logits_min": -23.757312774658203, "gen_logits_std": 2.474822521209717, "gen_loss": 0.37257397174835205, "grad_norm": 0.5094594204086412, "learning_rate": 2.927494736842105e-05, "loss": 0.3565, "mean_copy_accuracy": 0.9912779033184052, "mean_gen_accuracy": 0.848794549703598, "mean_token_accuracy": 0.8859078884124756, "num_tokens": 354343663.0, "sample_num_tokens": 7965.25, "step": 1310, "total_num_tokens": 354375524.0, "z_loss": 0.001288724597543478 }, { "copy_logits_max": -2.9538800716400146, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.875, "epoch": 0.2677559356650498, "gen_logits_max": 8.146245956420898, "gen_logits_mean": -11.280269622802734, "gen_logits_min": -23.152090072631836, "gen_logits_std": 2.4987027645111084, "gen_loss": 0.3657075762748718, "grad_norm": 0.5174196803586432, "learning_rate": 2.9273684210526316e-05, "loss": 0.3382, "mean_copy_accuracy": 0.9905691891908646, "mean_gen_accuracy": 0.857682541012764, "mean_token_accuracy": 0.8892780244350433, "num_tokens": 354624711.0, "sample_num_tokens": 6840.75, "step": 1311, "total_num_tokens": 354652074.0, "z_loss": 0.0012178707402199507 }, { "copy_logits_max": -1.8052759170532227, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.125, "epoch": 0.2679601736022466, "gen_logits_max": 7.5701727867126465, "gen_logits_mean": -10.943827629089355, "gen_logits_min": -23.43769645690918, "gen_logits_std": 2.4911282062530518, "gen_loss": 0.3209960460662842, "grad_norm": 0.5347737469012699, "learning_rate": 2.927242105263158e-05, "loss": 0.3374, "mean_copy_accuracy": 0.9913944602012634, "mean_gen_accuracy": 0.8510810434818268, "mean_token_accuracy": 0.8899528980255127, "num_tokens": 354916207.0, "sample_num_tokens": 8534.75, "step": 1312, "total_num_tokens": 354950346.0, "z_loss": 0.0013430570252239704 }, { "copy_logits_max": -2.2755701541900635, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.8125, "epoch": 0.2681644115394434, "gen_logits_max": 8.846041679382324, "gen_logits_mean": -10.694863319396973, "gen_logits_min": -22.85778045654297, "gen_logits_std": 2.4809436798095703, "gen_loss": 0.38232851028442383, "grad_norm": 0.5615025051249034, "learning_rate": 2.9271157894736844e-05, "loss": 0.3505, "mean_copy_accuracy": 0.9920483231544495, "mean_gen_accuracy": 0.8485414981842041, "mean_token_accuracy": 0.8878055065870285, "num_tokens": 355190004.0, "sample_num_tokens": 9905.5, "step": 1313, "total_num_tokens": 355229626.0, "z_loss": 0.001456219470128417 }, { "copy_logits_max": -2.4078211784362793, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.5625, "epoch": 0.2683686494766403, "gen_logits_max": 7.374665260314941, "gen_logits_mean": -11.237260818481445, "gen_logits_min": -23.436046600341797, "gen_logits_std": 2.481635332107544, "gen_loss": 0.3014145791530609, "grad_norm": 0.48008947976684757, "learning_rate": 2.9269894736842105e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9931509643793106, "mean_gen_accuracy": 0.8631160408258438, "mean_token_accuracy": 0.8993968367576599, "num_tokens": 355471543.0, "sample_num_tokens": 8554.25, "step": 1314, "total_num_tokens": 355505760.0, "z_loss": 0.0012995493598282337 }, { "copy_logits_max": -3.482389450073242, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.375, "epoch": 0.2685728874138371, "gen_logits_max": 9.413382530212402, "gen_logits_mean": -10.021130561828613, "gen_logits_min": -21.913776397705078, "gen_logits_std": 2.480350971221924, "gen_loss": 0.30607321858406067, "grad_norm": 0.573958130254803, "learning_rate": 2.926863157894737e-05, "loss": 0.3158, "mean_copy_accuracy": 0.990360215306282, "mean_gen_accuracy": 0.8647077232599258, "mean_token_accuracy": 0.8977021723985672, "num_tokens": 355750610.0, "sample_num_tokens": 7103.5, "step": 1315, "total_num_tokens": 355779024.0, "z_loss": 0.001329769496805966 }, { "copy_logits_max": -1.5144821405410767, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.875, "epoch": 0.26877712535103393, "gen_logits_max": 8.20726490020752, "gen_logits_mean": -10.102828979492188, "gen_logits_min": -22.10924530029297, "gen_logits_std": 2.4715521335601807, "gen_loss": 0.31848815083503723, "grad_norm": 0.5996539182308732, "learning_rate": 2.9267368421052634e-05, "loss": 0.3561, "mean_copy_accuracy": 0.9909934997558594, "mean_gen_accuracy": 0.8522464036941528, "mean_token_accuracy": 0.8877203911542892, "num_tokens": 356019123.0, "sample_num_tokens": 7870.25, "step": 1316, "total_num_tokens": 356050604.0, "z_loss": 0.0013617349322885275 }, { "copy_logits_max": 1.7147159576416016, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.6875, "epoch": 0.2689813632882308, "gen_logits_max": 7.941302299499512, "gen_logits_mean": -10.080375671386719, "gen_logits_min": -23.107574462890625, "gen_logits_std": 2.539066791534424, "gen_loss": 0.2891254425048828, "grad_norm": 0.500903235175148, "learning_rate": 2.9266105263157895e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9923326224088669, "mean_gen_accuracy": 0.8603402674198151, "mean_token_accuracy": 0.8973831236362457, "num_tokens": 356294793.0, "sample_num_tokens": 8041.75, "step": 1317, "total_num_tokens": 356326960.0, "z_loss": 0.0015912235248833895 }, { "copy_logits_max": -2.0338876247406006, "copy_logits_min": -687499968.0, "copy_num_tokens": 349.875, "epoch": 0.2691856012254276, "gen_logits_max": 9.643038749694824, "gen_logits_mean": -9.911373138427734, "gen_logits_min": -22.501882553100586, "gen_logits_std": 2.5057411193847656, "gen_loss": 0.39168137311935425, "grad_norm": 0.6476648332901352, "learning_rate": 2.926484210526316e-05, "loss": 0.3767, "mean_copy_accuracy": 0.9905735701322556, "mean_gen_accuracy": 0.8496126681566238, "mean_token_accuracy": 0.8791357278823853, "num_tokens": 356557288.0, "sample_num_tokens": 7746.0, "step": 1318, "total_num_tokens": 356588272.0, "z_loss": 0.0014116595266386867 }, { "copy_logits_max": -1.6928520202636719, "copy_logits_min": -687500032.0, "copy_num_tokens": 733.75, "epoch": 0.26938983916262443, "gen_logits_max": 8.39225959777832, "gen_logits_mean": -9.990690231323242, "gen_logits_min": -22.72100830078125, "gen_logits_std": 2.4741735458374023, "gen_loss": 0.3301679491996765, "grad_norm": 0.7599570444294309, "learning_rate": 2.926357894736842e-05, "loss": 0.3443, "mean_copy_accuracy": 0.9914561063051224, "mean_gen_accuracy": 0.8554343432188034, "mean_token_accuracy": 0.8882823139429092, "num_tokens": 356825005.0, "sample_num_tokens": 10225.25, "step": 1319, "total_num_tokens": 356865906.0, "z_loss": 0.0012413484510034323 }, { "copy_logits_max": -1.0641086101531982, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.625, "epoch": 0.2695940770998213, "gen_logits_max": 8.478470802307129, "gen_logits_mean": -10.165255546569824, "gen_logits_min": -22.385364532470703, "gen_logits_std": 2.46002197265625, "gen_loss": 0.3469931185245514, "grad_norm": 0.49078906014669227, "learning_rate": 2.9262315789473684e-05, "loss": 0.346, "mean_copy_accuracy": 0.9919873923063278, "mean_gen_accuracy": 0.8527791202068329, "mean_token_accuracy": 0.8882744461297989, "num_tokens": 357110350.0, "sample_num_tokens": 8467.0, "step": 1320, "total_num_tokens": 357144218.0, "z_loss": 0.0018096768762916327 }, { "copy_logits_max": -0.05704805254936218, "copy_logits_min": -750000128.0, "copy_num_tokens": 491.4375, "epoch": 0.2697983150370181, "gen_logits_max": 8.847749710083008, "gen_logits_mean": -9.727317810058594, "gen_logits_min": -22.34276580810547, "gen_logits_std": 2.516244649887085, "gen_loss": 0.349170446395874, "grad_norm": 0.5558485661446949, "learning_rate": 2.926105263157895e-05, "loss": 0.3529, "mean_copy_accuracy": 0.9912416785955429, "mean_gen_accuracy": 0.8557703793048859, "mean_token_accuracy": 0.8864439576864243, "num_tokens": 357390962.0, "sample_num_tokens": 9449.0, "step": 1321, "total_num_tokens": 357428758.0, "z_loss": 0.0020315665751695633 }, { "copy_logits_max": 2.09380841255188, "copy_logits_min": -750000064.0, "copy_num_tokens": 557.3125, "epoch": 0.27000255297421494, "gen_logits_max": 7.901706218719482, "gen_logits_mean": -10.6640043258667, "gen_logits_min": -23.061616897583008, "gen_logits_std": 2.5031650066375732, "gen_loss": 0.3469727039337158, "grad_norm": 0.5173962067200474, "learning_rate": 2.9259789473684213e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9914229661226273, "mean_gen_accuracy": 0.8560387492179871, "mean_token_accuracy": 0.8931645303964615, "num_tokens": 357695438.0, "sample_num_tokens": 9408.0, "step": 1322, "total_num_tokens": 357733070.0, "z_loss": 0.0023812642320990562 }, { "copy_logits_max": -0.7489123940467834, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.4375, "epoch": 0.2702067909114118, "gen_logits_max": 8.034300804138184, "gen_logits_mean": -11.655593872070312, "gen_logits_min": -23.33226203918457, "gen_logits_std": 2.435046672821045, "gen_loss": 0.33528950810432434, "grad_norm": 0.5606318961733157, "learning_rate": 2.9258526315789474e-05, "loss": 0.368, "mean_copy_accuracy": 0.9885713905096054, "mean_gen_accuracy": 0.8505334258079529, "mean_token_accuracy": 0.8813177645206451, "num_tokens": 357977564.0, "sample_num_tokens": 9129.5, "step": 1323, "total_num_tokens": 358014082.0, "z_loss": 0.0018351738108322024 }, { "copy_logits_max": -0.5494166016578674, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.5, "epoch": 0.2704110288486086, "gen_logits_max": 8.01351547241211, "gen_logits_mean": -10.924764633178711, "gen_logits_min": -22.512035369873047, "gen_logits_std": 2.422344207763672, "gen_loss": 0.352004736661911, "grad_norm": 0.5284300342097781, "learning_rate": 2.9257263157894738e-05, "loss": 0.3514, "mean_copy_accuracy": 0.9900740534067154, "mean_gen_accuracy": 0.858483299612999, "mean_token_accuracy": 0.887299969792366, "num_tokens": 358249186.0, "sample_num_tokens": 8773.5, "step": 1324, "total_num_tokens": 358284280.0, "z_loss": 0.001748852082528174 }, { "copy_logits_max": 2.822531223297119, "copy_logits_min": -687500032.0, "copy_num_tokens": 606.875, "epoch": 0.27061526678580544, "gen_logits_max": 7.762571334838867, "gen_logits_mean": -11.66463851928711, "gen_logits_min": -23.88782501220703, "gen_logits_std": 2.482701301574707, "gen_loss": 0.3725052773952484, "grad_norm": 0.5660305172088009, "learning_rate": 2.9256e-05, "loss": 0.3744, "mean_copy_accuracy": 0.9896534830331802, "mean_gen_accuracy": 0.8421728610992432, "mean_token_accuracy": 0.879597544670105, "num_tokens": 358509441.0, "sample_num_tokens": 9423.75, "step": 1325, "total_num_tokens": 358547136.0, "z_loss": 0.0019911816343665123 }, { "copy_logits_max": 0.7433618307113647, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.3125, "epoch": 0.2708195047230023, "gen_logits_max": 7.942749977111816, "gen_logits_mean": -10.57823371887207, "gen_logits_min": -23.429954528808594, "gen_logits_std": 2.4480814933776855, "gen_loss": 0.3277736306190491, "grad_norm": 0.5311230326153387, "learning_rate": 2.9254736842105263e-05, "loss": 0.3488, "mean_copy_accuracy": 0.9928385615348816, "mean_gen_accuracy": 0.8522372394800186, "mean_token_accuracy": 0.8850962221622467, "num_tokens": 358770117.0, "sample_num_tokens": 9124.25, "step": 1326, "total_num_tokens": 358806614.0, "z_loss": 0.0017170014325529337 }, { "copy_logits_max": 1.6605801582336426, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.8125, "epoch": 0.27102374266019913, "gen_logits_max": 8.12001895904541, "gen_logits_mean": -10.796098709106445, "gen_logits_min": -23.474239349365234, "gen_logits_std": 2.471794605255127, "gen_loss": 0.32966580986976624, "grad_norm": 0.5603589042254062, "learning_rate": 2.9253473684210524e-05, "loss": 0.3546, "mean_copy_accuracy": 0.9912118315696716, "mean_gen_accuracy": 0.8516693562269211, "mean_token_accuracy": 0.88582943379879, "num_tokens": 359035875.0, "sample_num_tokens": 9237.25, "step": 1327, "total_num_tokens": 359072824.0, "z_loss": 0.0017336152959614992 }, { "copy_logits_max": -1.3396333456039429, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.0625, "epoch": 0.27122798059739595, "gen_logits_max": 9.29046630859375, "gen_logits_mean": -9.156494140625, "gen_logits_min": -21.32730484008789, "gen_logits_std": 2.432568073272705, "gen_loss": 0.3555290102958679, "grad_norm": 0.5354348383871367, "learning_rate": 2.925221052631579e-05, "loss": 0.349, "mean_copy_accuracy": 0.9906144589185715, "mean_gen_accuracy": 0.8577756881713867, "mean_token_accuracy": 0.8898332566022873, "num_tokens": 359316946.0, "sample_num_tokens": 8865.5, "step": 1328, "total_num_tokens": 359352408.0, "z_loss": 0.0015207139076665044 }, { "copy_logits_max": -2.054093837738037, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.4375, "epoch": 0.2714322185345928, "gen_logits_max": 8.441994667053223, "gen_logits_mean": -10.545798301696777, "gen_logits_min": -22.116714477539062, "gen_logits_std": 2.4010419845581055, "gen_loss": 0.3727629780769348, "grad_norm": 0.5709556176472086, "learning_rate": 2.9250947368421056e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9905549585819244, "mean_gen_accuracy": 0.8548617660999298, "mean_token_accuracy": 0.8871931433677673, "num_tokens": 359587438.0, "sample_num_tokens": 7219.0, "step": 1329, "total_num_tokens": 359616314.0, "z_loss": 0.0013453715946525335 }, { "copy_logits_max": -0.7314387559890747, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.5, "epoch": 0.27163645647178963, "gen_logits_max": 8.641209602355957, "gen_logits_mean": -9.908615112304688, "gen_logits_min": -22.381072998046875, "gen_logits_std": 2.491299629211426, "gen_loss": 0.3196171522140503, "grad_norm": 0.6787438378890385, "learning_rate": 2.9249684210526317e-05, "loss": 0.3209, "mean_copy_accuracy": 0.9932754933834076, "mean_gen_accuracy": 0.8599870055913925, "mean_token_accuracy": 0.8966614156961441, "num_tokens": 359861853.0, "sample_num_tokens": 8311.75, "step": 1330, "total_num_tokens": 359895100.0, "z_loss": 0.001501570688560605 }, { "copy_logits_max": 0.16778558492660522, "copy_logits_min": -687500032.0, "copy_num_tokens": 614.125, "epoch": 0.27184069440898645, "gen_logits_max": 7.381868362426758, "gen_logits_mean": -11.29010009765625, "gen_logits_min": -23.82124900817871, "gen_logits_std": 2.498922348022461, "gen_loss": 0.2966413199901581, "grad_norm": 0.5916013486726743, "learning_rate": 2.924842105263158e-05, "loss": 0.3259, "mean_copy_accuracy": 0.9927180707454681, "mean_gen_accuracy": 0.8566259294748306, "mean_token_accuracy": 0.8929805904626846, "num_tokens": 360127467.0, "sample_num_tokens": 9063.25, "step": 1331, "total_num_tokens": 360163720.0, "z_loss": 0.001478732330724597 }, { "copy_logits_max": -1.2670071125030518, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0625, "epoch": 0.2720449323461833, "gen_logits_max": 7.6970696449279785, "gen_logits_mean": -11.312641143798828, "gen_logits_min": -24.314613342285156, "gen_logits_std": 2.459929943084717, "gen_loss": 0.3189171254634857, "grad_norm": 0.5568297724250443, "learning_rate": 2.9247157894736842e-05, "loss": 0.346, "mean_copy_accuracy": 0.9908994138240814, "mean_gen_accuracy": 0.8523350954055786, "mean_token_accuracy": 0.8865616321563721, "num_tokens": 360402586.0, "sample_num_tokens": 8278.5, "step": 1332, "total_num_tokens": 360435700.0, "z_loss": 0.0012363260611891747 }, { "copy_logits_max": -1.7819912433624268, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.9375, "epoch": 0.27224917028338014, "gen_logits_max": 8.358772277832031, "gen_logits_mean": -10.62814712524414, "gen_logits_min": -23.294456481933594, "gen_logits_std": 2.479832649230957, "gen_loss": 0.3615275025367737, "grad_norm": 0.6626763162803664, "learning_rate": 2.9245894736842107e-05, "loss": 0.3495, "mean_copy_accuracy": 0.9905525892972946, "mean_gen_accuracy": 0.8503738194704056, "mean_token_accuracy": 0.886243611574173, "num_tokens": 360684537.0, "sample_num_tokens": 8297.25, "step": 1333, "total_num_tokens": 360717726.0, "z_loss": 0.0012651989236474037 }, { "copy_logits_max": -0.32118964195251465, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.3125, "epoch": 0.27245340822057695, "gen_logits_max": 8.591377258300781, "gen_logits_mean": -11.190430641174316, "gen_logits_min": -24.111392974853516, "gen_logits_std": 2.475693702697754, "gen_loss": 0.32987070083618164, "grad_norm": 0.5656264187456616, "learning_rate": 2.9244631578947368e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9912426620721817, "mean_gen_accuracy": 0.8608495891094208, "mean_token_accuracy": 0.8934634476900101, "num_tokens": 360952534.0, "sample_num_tokens": 7813.0, "step": 1334, "total_num_tokens": 360983786.0, "z_loss": 0.0013030220288783312 }, { "copy_logits_max": -2.589151620864868, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.9375, "epoch": 0.2726576461577738, "gen_logits_max": 8.397116661071777, "gen_logits_mean": -12.32614803314209, "gen_logits_min": -24.244403839111328, "gen_logits_std": 2.4067702293395996, "gen_loss": 0.3435226082801819, "grad_norm": 0.5424042102355726, "learning_rate": 2.9243368421052632e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9914427697658539, "mean_gen_accuracy": 0.859344556927681, "mean_token_accuracy": 0.8899482935667038, "num_tokens": 361228838.0, "sample_num_tokens": 7967.0, "step": 1335, "total_num_tokens": 361260706.0, "z_loss": 0.001188443275168538 }, { "copy_logits_max": 0.05410847067832947, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.9375, "epoch": 0.27286188409497064, "gen_logits_max": 8.407047271728516, "gen_logits_mean": -10.600647926330566, "gen_logits_min": -24.18785285949707, "gen_logits_std": 2.526729106903076, "gen_loss": 0.30860987305641174, "grad_norm": 0.5164245266474851, "learning_rate": 2.9242105263157893e-05, "loss": 0.3249, "mean_copy_accuracy": 0.9909962266683578, "mean_gen_accuracy": 0.8601285964250565, "mean_token_accuracy": 0.8952386528253555, "num_tokens": 361496854.0, "sample_num_tokens": 9853.0, "step": 1336, "total_num_tokens": 361536266.0, "z_loss": 0.0013691586209461093 }, { "copy_logits_max": 0.025486618280410767, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.625, "epoch": 0.27306612203216746, "gen_logits_max": 7.880358695983887, "gen_logits_mean": -10.951089859008789, "gen_logits_min": -23.90268898010254, "gen_logits_std": 2.482407808303833, "gen_loss": 0.2959775924682617, "grad_norm": 0.4837529024420316, "learning_rate": 2.924084210526316e-05, "loss": 0.3226, "mean_copy_accuracy": 0.9919965863227844, "mean_gen_accuracy": 0.861776277422905, "mean_token_accuracy": 0.8972087204456329, "num_tokens": 361788498.0, "sample_num_tokens": 9035.5, "step": 1337, "total_num_tokens": 361824640.0, "z_loss": 0.0011965723242610693 }, { "copy_logits_max": -0.2859764099121094, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.625, "epoch": 0.27327035996936433, "gen_logits_max": 8.4036865234375, "gen_logits_mean": -9.558821678161621, "gen_logits_min": -22.765291213989258, "gen_logits_std": 2.499786376953125, "gen_loss": 0.3635706305503845, "grad_norm": 0.4873274923716054, "learning_rate": 2.923957894736842e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9934652149677277, "mean_gen_accuracy": 0.8517527729272842, "mean_token_accuracy": 0.8936061859130859, "num_tokens": 362058423.0, "sample_num_tokens": 8130.25, "step": 1338, "total_num_tokens": 362090944.0, "z_loss": 0.0014544855803251266 }, { "copy_logits_max": -1.5056450366973877, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.6875, "epoch": 0.27347459790656115, "gen_logits_max": 8.594179153442383, "gen_logits_mean": -10.66897964477539, "gen_logits_min": -23.338600158691406, "gen_logits_std": 2.463127851486206, "gen_loss": 0.3374904990196228, "grad_norm": 0.5085264214175851, "learning_rate": 2.9238315789473686e-05, "loss": 0.3771, "mean_copy_accuracy": 0.9915415048599243, "mean_gen_accuracy": 0.8450622707605362, "mean_token_accuracy": 0.8771592527627945, "num_tokens": 362321438.0, "sample_num_tokens": 10399.5, "step": 1339, "total_num_tokens": 362363036.0, "z_loss": 0.001477228943258524 }, { "copy_logits_max": -2.4215686321258545, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.125, "epoch": 0.27367883584375796, "gen_logits_max": 8.37314224243164, "gen_logits_mean": -11.809484481811523, "gen_logits_min": -23.69169807434082, "gen_logits_std": 2.4097671508789062, "gen_loss": 0.37240278720855713, "grad_norm": 0.5074642234710544, "learning_rate": 2.9237052631578947e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9909958988428116, "mean_gen_accuracy": 0.8511238992214203, "mean_token_accuracy": 0.8858982771635056, "num_tokens": 362591803.0, "sample_num_tokens": 6777.25, "step": 1340, "total_num_tokens": 362618912.0, "z_loss": 0.0013691227650269866 }, { "copy_logits_max": -1.2351263761520386, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.3125, "epoch": 0.27388307378095483, "gen_logits_max": 8.210943222045898, "gen_logits_mean": -10.712987899780273, "gen_logits_min": -23.215259552001953, "gen_logits_std": 2.440133571624756, "gen_loss": 0.3157409429550171, "grad_norm": 0.6029094516207082, "learning_rate": 2.923578947368421e-05, "loss": 0.3579, "mean_copy_accuracy": 0.9881028532981873, "mean_gen_accuracy": 0.8535888642072678, "mean_token_accuracy": 0.8841080069541931, "num_tokens": 362838839.0, "sample_num_tokens": 8827.25, "step": 1341, "total_num_tokens": 362874148.0, "z_loss": 0.0013846809742972255 }, { "copy_logits_max": -0.6466751098632812, "copy_logits_min": -750000064.0, "copy_num_tokens": 484.125, "epoch": 0.27408731171815165, "gen_logits_max": 7.426352500915527, "gen_logits_mean": -11.543481826782227, "gen_logits_min": -23.77215576171875, "gen_logits_std": 2.421095132827759, "gen_loss": 0.3718557357788086, "grad_norm": 0.4670866154288012, "learning_rate": 2.9234526315789475e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9930751919746399, "mean_gen_accuracy": 0.8570828437805176, "mean_token_accuracy": 0.8909816592931747, "num_tokens": 363110335.0, "sample_num_tokens": 9378.25, "step": 1342, "total_num_tokens": 363147848.0, "z_loss": 0.0013444590149447322 }, { "copy_logits_max": -1.0684778690338135, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.75, "epoch": 0.27429154965534847, "gen_logits_max": 8.073810577392578, "gen_logits_mean": -10.763248443603516, "gen_logits_min": -22.976295471191406, "gen_logits_std": 2.432943344116211, "gen_loss": 0.3498672842979431, "grad_norm": 0.5247451092246999, "learning_rate": 2.9233263157894736e-05, "loss": 0.3421, "mean_copy_accuracy": 0.9933781027793884, "mean_gen_accuracy": 0.8562742173671722, "mean_token_accuracy": 0.8874504566192627, "num_tokens": 363380425.0, "sample_num_tokens": 9172.25, "step": 1343, "total_num_tokens": 363417114.0, "z_loss": 0.0014090694021433592 }, { "copy_logits_max": 1.9271485805511475, "copy_logits_min": -687500032.0, "copy_num_tokens": 705.3125, "epoch": 0.27449578759254534, "gen_logits_max": 8.131540298461914, "gen_logits_mean": -9.92544937133789, "gen_logits_min": -23.28079605102539, "gen_logits_std": 2.5340042114257812, "gen_loss": 0.3228340148925781, "grad_norm": 0.5337112040180538, "learning_rate": 2.9232e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9936844110488892, "mean_gen_accuracy": 0.8567528426647186, "mean_token_accuracy": 0.8926328718662262, "num_tokens": 363651611.0, "sample_num_tokens": 9731.25, "step": 1344, "total_num_tokens": 363690536.0, "z_loss": 0.0015595222357660532 }, { "copy_logits_max": -0.17736393213272095, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.5625, "epoch": 0.27470002552974215, "gen_logits_max": 7.954318046569824, "gen_logits_mean": -10.427217483520508, "gen_logits_min": -23.201162338256836, "gen_logits_std": 2.486290693283081, "gen_loss": 0.3365747332572937, "grad_norm": 0.5464029116955643, "learning_rate": 2.9230736842105265e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9910916537046432, "mean_gen_accuracy": 0.8597415536642075, "mean_token_accuracy": 0.8953841179609299, "num_tokens": 363923627.0, "sample_num_tokens": 7568.25, "step": 1345, "total_num_tokens": 363953900.0, "z_loss": 0.0013217268278822303 }, { "copy_logits_max": 1.1244134902954102, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.5625, "epoch": 0.27490426346693897, "gen_logits_max": 8.372371673583984, "gen_logits_mean": -9.863654136657715, "gen_logits_min": -23.17276954650879, "gen_logits_std": 2.5471155643463135, "gen_loss": 0.3637900650501251, "grad_norm": 0.6330690557066744, "learning_rate": 2.922947368421053e-05, "loss": 0.3444, "mean_copy_accuracy": 0.9913278371095657, "mean_gen_accuracy": 0.8557140827178955, "mean_token_accuracy": 0.8885829299688339, "num_tokens": 364181896.0, "sample_num_tokens": 7167.5, "step": 1346, "total_num_tokens": 364210566.0, "z_loss": 0.0013986262492835522 }, { "copy_logits_max": -1.9348493814468384, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.875, "epoch": 0.27510850140413584, "gen_logits_max": 8.304883003234863, "gen_logits_mean": -10.180119514465332, "gen_logits_min": -22.39785385131836, "gen_logits_std": 2.4552841186523438, "gen_loss": 0.3433426022529602, "grad_norm": 0.5487891975879132, "learning_rate": 2.922821052631579e-05, "loss": 0.3298, "mean_copy_accuracy": 0.9924668818712234, "mean_gen_accuracy": 0.8578871339559555, "mean_token_accuracy": 0.8927681148052216, "num_tokens": 364471622.0, "sample_num_tokens": 9224.5, "step": 1347, "total_num_tokens": 364508520.0, "z_loss": 0.0013879011385142803 }, { "copy_logits_max": -2.426913261413574, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.8125, "epoch": 0.27531273934133266, "gen_logits_max": 8.525079727172852, "gen_logits_mean": -10.419639587402344, "gen_logits_min": -22.514266967773438, "gen_logits_std": 2.4345338344573975, "gen_loss": 0.37463632225990295, "grad_norm": 0.49837363586454836, "learning_rate": 2.9226947368421055e-05, "loss": 0.3657, "mean_copy_accuracy": 0.9919861108064651, "mean_gen_accuracy": 0.8543675243854523, "mean_token_accuracy": 0.8816717863082886, "num_tokens": 364735667.0, "sample_num_tokens": 7647.75, "step": 1348, "total_num_tokens": 364766258.0, "z_loss": 0.001394578954204917 }, { "copy_logits_max": -2.0375757217407227, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.375, "epoch": 0.2755169772785295, "gen_logits_max": 7.512758255004883, "gen_logits_mean": -11.549810409545898, "gen_logits_min": -23.274646759033203, "gen_logits_std": 2.478342056274414, "gen_loss": 0.3109208941459656, "grad_norm": 0.5152845864716223, "learning_rate": 2.9225684210526315e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9913614690303802, "mean_gen_accuracy": 0.8580760210752487, "mean_token_accuracy": 0.8935459554195404, "num_tokens": 365014183.0, "sample_num_tokens": 9258.75, "step": 1349, "total_num_tokens": 365051218.0, "z_loss": 0.0012949908850714564 }, { "copy_logits_max": -0.10507076978683472, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.3125, "epoch": 0.27572121521572635, "gen_logits_max": 8.243892669677734, "gen_logits_mean": -10.29941177368164, "gen_logits_min": -23.160762786865234, "gen_logits_std": 2.503291606903076, "gen_loss": 0.3506854772567749, "grad_norm": 0.4923649413422824, "learning_rate": 2.922442105263158e-05, "loss": 0.35, "mean_copy_accuracy": 0.9897443950176239, "mean_gen_accuracy": 0.8570073693990707, "mean_token_accuracy": 0.8861372023820877, "num_tokens": 365268617.0, "sample_num_tokens": 8543.25, "step": 1350, "total_num_tokens": 365302790.0, "z_loss": 0.001299750991165638 }, { "copy_logits_max": -0.5666540861129761, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.0, "epoch": 0.27592545315292316, "gen_logits_max": 7.622625350952148, "gen_logits_mean": -10.862117767333984, "gen_logits_min": -23.52935028076172, "gen_logits_std": 2.50370454788208, "gen_loss": 0.3207358717918396, "grad_norm": 0.5099655537324176, "learning_rate": 2.922315789473684e-05, "loss": 0.3328, "mean_copy_accuracy": 0.991475835442543, "mean_gen_accuracy": 0.8536769598722458, "mean_token_accuracy": 0.8902412056922913, "num_tokens": 365517800.0, "sample_num_tokens": 8411.5, "step": 1351, "total_num_tokens": 365551446.0, "z_loss": 0.001190266339108348 }, { "copy_logits_max": -3.1340513229370117, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.8125, "epoch": 0.27612969109012, "gen_logits_max": 8.615644454956055, "gen_logits_mean": -10.609675407409668, "gen_logits_min": -22.895719528198242, "gen_logits_std": 2.4780635833740234, "gen_loss": 0.36538487672805786, "grad_norm": 0.4915237505213685, "learning_rate": 2.9221894736842105e-05, "loss": 0.3525, "mean_copy_accuracy": 0.9893328249454498, "mean_gen_accuracy": 0.8604174852371216, "mean_token_accuracy": 0.8839500993490219, "num_tokens": 365780607.0, "sample_num_tokens": 8003.25, "step": 1352, "total_num_tokens": 365812620.0, "z_loss": 0.0012393697397783399 }, { "copy_logits_max": 1.8017739057540894, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.625, "epoch": 0.27633392902731685, "gen_logits_max": 8.797409057617188, "gen_logits_mean": -10.010520935058594, "gen_logits_min": -23.488868713378906, "gen_logits_std": 2.551856517791748, "gen_loss": 0.33849671483039856, "grad_norm": 0.5717676135895756, "learning_rate": 2.922063157894737e-05, "loss": 0.3522, "mean_copy_accuracy": 0.9892980605363846, "mean_gen_accuracy": 0.8584182858467102, "mean_token_accuracy": 0.8888523280620575, "num_tokens": 366044101.0, "sample_num_tokens": 10310.75, "step": 1353, "total_num_tokens": 366085344.0, "z_loss": 0.0013065071543678641 }, { "copy_logits_max": -0.4375542998313904, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.3125, "epoch": 0.27653816696451367, "gen_logits_max": 7.997638702392578, "gen_logits_mean": -10.005640029907227, "gen_logits_min": -22.613384246826172, "gen_logits_std": 2.472475528717041, "gen_loss": 0.3700098395347595, "grad_norm": 0.832212719229971, "learning_rate": 2.9219368421052634e-05, "loss": 0.3558, "mean_copy_accuracy": 0.9910132586956024, "mean_gen_accuracy": 0.8544937372207642, "mean_token_accuracy": 0.8869960457086563, "num_tokens": 366309740.0, "sample_num_tokens": 7168.0, "step": 1354, "total_num_tokens": 366338412.0, "z_loss": 0.001320218201726675 }, { "copy_logits_max": -1.443196177482605, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.8125, "epoch": 0.2767424049017105, "gen_logits_max": 7.421555519104004, "gen_logits_mean": -11.839357376098633, "gen_logits_min": -24.005748748779297, "gen_logits_std": 2.4856624603271484, "gen_loss": 0.3430837392807007, "grad_norm": 0.5432427238872743, "learning_rate": 2.9218105263157898e-05, "loss": 0.3425, "mean_copy_accuracy": 0.990884393453598, "mean_gen_accuracy": 0.8570788353681564, "mean_token_accuracy": 0.8911004513502121, "num_tokens": 366583460.0, "sample_num_tokens": 7183.0, "step": 1355, "total_num_tokens": 366612192.0, "z_loss": 0.0011301012709736824 }, { "copy_logits_max": -0.1779937744140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.9375, "epoch": 0.27694664283890735, "gen_logits_max": 7.991229057312012, "gen_logits_mean": -10.392629623413086, "gen_logits_min": -23.00799560546875, "gen_logits_std": 2.5142693519592285, "gen_loss": 0.38135337829589844, "grad_norm": 0.48868024258842546, "learning_rate": 2.921684210526316e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9927007257938385, "mean_gen_accuracy": 0.8554768264293671, "mean_token_accuracy": 0.8914404213428497, "num_tokens": 366839715.0, "sample_num_tokens": 7565.75, "step": 1356, "total_num_tokens": 366869978.0, "z_loss": 0.0013458130415529013 }, { "copy_logits_max": -1.668744444847107, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.0625, "epoch": 0.27715088077610417, "gen_logits_max": 9.226417541503906, "gen_logits_mean": -9.859834671020508, "gen_logits_min": -21.60446548461914, "gen_logits_std": 2.4661450386047363, "gen_loss": 0.3946398198604584, "grad_norm": 0.5451360370249425, "learning_rate": 2.9215578947368423e-05, "loss": 0.3401, "mean_copy_accuracy": 0.990623340010643, "mean_gen_accuracy": 0.8580280393362045, "mean_token_accuracy": 0.8887223899364471, "num_tokens": 367109064.0, "sample_num_tokens": 8394.5, "step": 1357, "total_num_tokens": 367142642.0, "z_loss": 0.0012850076891481876 }, { "copy_logits_max": -1.8955923318862915, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.4375, "epoch": 0.277355118713301, "gen_logits_max": 8.020405769348145, "gen_logits_mean": -11.567716598510742, "gen_logits_min": -23.78765106201172, "gen_logits_std": 2.4868569374084473, "gen_loss": 0.28924763202667236, "grad_norm": 0.48686585797670784, "learning_rate": 2.9214315789473684e-05, "loss": 0.3226, "mean_copy_accuracy": 0.9920594394207001, "mean_gen_accuracy": 0.8581754863262177, "mean_token_accuracy": 0.8946632295846939, "num_tokens": 367379093.0, "sample_num_tokens": 9867.25, "step": 1358, "total_num_tokens": 367418562.0, "z_loss": 0.0011619769502431154 }, { "copy_logits_max": -1.7131061553955078, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.375, "epoch": 0.27755935665049786, "gen_logits_max": 7.325163841247559, "gen_logits_mean": -11.665760040283203, "gen_logits_min": -23.546859741210938, "gen_logits_std": 2.4653141498565674, "gen_loss": 0.3290424644947052, "grad_norm": 0.4941821647135167, "learning_rate": 2.921305263157895e-05, "loss": 0.3263, "mean_copy_accuracy": 0.9910313487052917, "mean_gen_accuracy": 0.8624284118413925, "mean_token_accuracy": 0.893002524971962, "num_tokens": 367667286.0, "sample_num_tokens": 8900.5, "step": 1359, "total_num_tokens": 367702888.0, "z_loss": 0.001160487998276949 }, { "copy_logits_max": -1.5998942852020264, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.4375, "epoch": 0.2777635945876947, "gen_logits_max": 8.352059364318848, "gen_logits_mean": -9.378111839294434, "gen_logits_min": -21.404083251953125, "gen_logits_std": 2.4887967109680176, "gen_loss": 0.35564225912094116, "grad_norm": 0.4705084706126165, "learning_rate": 2.921178947368421e-05, "loss": 0.33, "mean_copy_accuracy": 0.9915079921483994, "mean_gen_accuracy": 0.8551790416240692, "mean_token_accuracy": 0.8930769562721252, "num_tokens": 367958664.0, "sample_num_tokens": 8602.0, "step": 1360, "total_num_tokens": 367993072.0, "z_loss": 0.0013110456056892872 }, { "copy_logits_max": -0.5505924224853516, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.0, "epoch": 0.2779678325248915, "gen_logits_max": 8.172025680541992, "gen_logits_mean": -10.165155410766602, "gen_logits_min": -23.005626678466797, "gen_logits_std": 2.476468563079834, "gen_loss": 0.3760892152786255, "grad_norm": 0.5439580619515708, "learning_rate": 2.9210526315789474e-05, "loss": 0.3439, "mean_copy_accuracy": 0.9922718107700348, "mean_gen_accuracy": 0.8545296639204025, "mean_token_accuracy": 0.8887089937925339, "num_tokens": 368220426.0, "sample_num_tokens": 9138.5, "step": 1361, "total_num_tokens": 368256980.0, "z_loss": 0.0013844476779922843 }, { "copy_logits_max": -0.3052114248275757, "copy_logits_min": -625000000.0, "copy_num_tokens": 732.625, "epoch": 0.2781720704620883, "gen_logits_max": 8.918146133422852, "gen_logits_mean": -10.303129196166992, "gen_logits_min": -23.03122329711914, "gen_logits_std": 2.562119960784912, "gen_loss": 0.3213949203491211, "grad_norm": 0.533647050966059, "learning_rate": 2.9209263157894738e-05, "loss": 0.3377, "mean_copy_accuracy": 0.9924528151750565, "mean_gen_accuracy": 0.8594580739736557, "mean_token_accuracy": 0.8906010389328003, "num_tokens": 368471877.0, "sample_num_tokens": 9748.25, "step": 1362, "total_num_tokens": 368510870.0, "z_loss": 0.0014539259718731046 }, { "copy_logits_max": -0.7422221302986145, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.9375, "epoch": 0.2783763083992852, "gen_logits_max": 8.959158897399902, "gen_logits_mean": -10.425399780273438, "gen_logits_min": -22.658191680908203, "gen_logits_std": 2.490067958831787, "gen_loss": 0.3370177149772644, "grad_norm": 0.5258039625753005, "learning_rate": 2.9208000000000002e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9920763820409775, "mean_gen_accuracy": 0.8524989932775497, "mean_token_accuracy": 0.8869968354701996, "num_tokens": 368752852.0, "sample_num_tokens": 7492.5, "step": 1363, "total_num_tokens": 368782822.0, "z_loss": 0.0013542971573770046 }, { "copy_logits_max": 0.25500184297561646, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.375, "epoch": 0.278580546336482, "gen_logits_max": 7.476144313812256, "gen_logits_mean": -10.323888778686523, "gen_logits_min": -22.439430236816406, "gen_logits_std": 2.502659320831299, "gen_loss": 0.3049600124359131, "grad_norm": 0.5204946640668255, "learning_rate": 2.9206736842105263e-05, "loss": 0.331, "mean_copy_accuracy": 0.9914427250623703, "mean_gen_accuracy": 0.8612987995147705, "mean_token_accuracy": 0.8930111676454544, "num_tokens": 369011181.0, "sample_num_tokens": 8891.25, "step": 1364, "total_num_tokens": 369046746.0, "z_loss": 0.0014702683547511697 }, { "copy_logits_max": -0.7695326209068298, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.875, "epoch": 0.2787847842736788, "gen_logits_max": 9.426997184753418, "gen_logits_mean": -10.112855911254883, "gen_logits_min": -22.334365844726562, "gen_logits_std": 2.5154030323028564, "gen_loss": 0.35790979862213135, "grad_norm": 0.4768997469170631, "learning_rate": 2.9205473684210527e-05, "loss": 0.3244, "mean_copy_accuracy": 0.9919109642505646, "mean_gen_accuracy": 0.8636552840471268, "mean_token_accuracy": 0.8952982574701309, "num_tokens": 369293224.0, "sample_num_tokens": 8523.5, "step": 1365, "total_num_tokens": 369327318.0, "z_loss": 0.0013543273089453578 }, { "copy_logits_max": -2.0819637775421143, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.9375, "epoch": 0.2789890222108757, "gen_logits_max": 8.092312812805176, "gen_logits_mean": -11.732419967651367, "gen_logits_min": -23.61993408203125, "gen_logits_std": 2.5038931369781494, "gen_loss": 0.33313480019569397, "grad_norm": 0.4789196550531067, "learning_rate": 2.920421052631579e-05, "loss": 0.3249, "mean_copy_accuracy": 0.9929621517658234, "mean_gen_accuracy": 0.8596919476985931, "mean_token_accuracy": 0.8953016549348831, "num_tokens": 369560012.0, "sample_num_tokens": 7734.0, "step": 1366, "total_num_tokens": 369590948.0, "z_loss": 0.0011774440063163638 }, { "copy_logits_max": -2.4118335247039795, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.125, "epoch": 0.2791932601480725, "gen_logits_max": 7.517396926879883, "gen_logits_mean": -10.888700485229492, "gen_logits_min": -22.86940574645996, "gen_logits_std": 2.4948554039001465, "gen_loss": 0.3034905791282654, "grad_norm": 0.5561353835185764, "learning_rate": 2.9202947368421053e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9894680678844452, "mean_gen_accuracy": 0.8646526038646698, "mean_token_accuracy": 0.8958979845046997, "num_tokens": 369803092.0, "sample_num_tokens": 8502.5, "step": 1367, "total_num_tokens": 369837102.0, "z_loss": 0.001173265976831317 }, { "copy_logits_max": -1.5073914527893066, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0, "epoch": 0.2793974980852693, "gen_logits_max": 8.029609680175781, "gen_logits_mean": -11.927295684814453, "gen_logits_min": -23.732675552368164, "gen_logits_std": 2.4551610946655273, "gen_loss": 0.3907761573791504, "grad_norm": 0.5406990063984967, "learning_rate": 2.9201684210526314e-05, "loss": 0.3652, "mean_copy_accuracy": 0.992255300283432, "mean_gen_accuracy": 0.8495715409517288, "mean_token_accuracy": 0.881473183631897, "num_tokens": 370054531.0, "sample_num_tokens": 7651.75, "step": 1368, "total_num_tokens": 370085138.0, "z_loss": 0.0013332974631339312 }, { "copy_logits_max": -1.7650868892669678, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.25, "epoch": 0.2796017360224662, "gen_logits_max": 7.740096569061279, "gen_logits_mean": -11.196537017822266, "gen_logits_min": -23.34386444091797, "gen_logits_std": 2.481689453125, "gen_loss": 0.31832155585289, "grad_norm": 0.48284788292946623, "learning_rate": 2.9200421052631578e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9915091693401337, "mean_gen_accuracy": 0.8677926361560822, "mean_token_accuracy": 0.8966751396656036, "num_tokens": 370299740.0, "sample_num_tokens": 7425.0, "step": 1369, "total_num_tokens": 370329440.0, "z_loss": 0.001187809742987156 }, { "copy_logits_max": -2.1552679538726807, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.875, "epoch": 0.279805973959663, "gen_logits_max": 8.723848342895508, "gen_logits_mean": -9.809398651123047, "gen_logits_min": -22.148235321044922, "gen_logits_std": 2.576789140701294, "gen_loss": 0.323358952999115, "grad_norm": 0.562303486773202, "learning_rate": 2.9199157894736846e-05, "loss": 0.345, "mean_copy_accuracy": 0.9879878163337708, "mean_gen_accuracy": 0.8583182990550995, "mean_token_accuracy": 0.8880622386932373, "num_tokens": 370550568.0, "sample_num_tokens": 8568.5, "step": 1370, "total_num_tokens": 370584842.0, "z_loss": 0.0011495902435854077 }, { "copy_logits_max": -1.3918578624725342, "copy_logits_min": -625000064.0, "copy_num_tokens": 602.625, "epoch": 0.2800102118968598, "gen_logits_max": 7.652972221374512, "gen_logits_mean": -10.728793144226074, "gen_logits_min": -23.312362670898438, "gen_logits_std": 2.5492053031921387, "gen_loss": 0.3358829617500305, "grad_norm": 0.5415610078804579, "learning_rate": 2.9197894736842107e-05, "loss": 0.3518, "mean_copy_accuracy": 0.9914019256830215, "mean_gen_accuracy": 0.848931610584259, "mean_token_accuracy": 0.8860449343919754, "num_tokens": 370811715.0, "sample_num_tokens": 9417.25, "step": 1371, "total_num_tokens": 370849384.0, "z_loss": 0.0013281553983688354 }, { "copy_logits_max": -1.033771276473999, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.375, "epoch": 0.2802144498340567, "gen_logits_max": 8.126951217651367, "gen_logits_mean": -11.316980361938477, "gen_logits_min": -23.33865737915039, "gen_logits_std": 2.5059218406677246, "gen_loss": 0.38341155648231506, "grad_norm": 0.5846996843844556, "learning_rate": 2.919663157894737e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9934435784816742, "mean_gen_accuracy": 0.856436163187027, "mean_token_accuracy": 0.8916175663471222, "num_tokens": 371106982.0, "sample_num_tokens": 7772.5, "step": 1372, "total_num_tokens": 371138072.0, "z_loss": 0.0012946148635819554 }, { "copy_logits_max": -0.7362391948699951, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.75, "epoch": 0.2804186877712535, "gen_logits_max": 8.670759201049805, "gen_logits_mean": -10.84757137298584, "gen_logits_min": -22.966114044189453, "gen_logits_std": 2.5244550704956055, "gen_loss": 0.3506844639778137, "grad_norm": 0.526068737849622, "learning_rate": 2.9195368421052632e-05, "loss": 0.3544, "mean_copy_accuracy": 0.9917362928390503, "mean_gen_accuracy": 0.8519328832626343, "mean_token_accuracy": 0.8843053579330444, "num_tokens": 371368133.0, "sample_num_tokens": 7869.75, "step": 1373, "total_num_tokens": 371399612.0, "z_loss": 0.0011767602991312742 }, { "copy_logits_max": -2.119669198989868, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.1875, "epoch": 0.2806229257084503, "gen_logits_max": 7.437498092651367, "gen_logits_mean": -11.453655242919922, "gen_logits_min": -23.49199104309082, "gen_logits_std": 2.507056713104248, "gen_loss": 0.3512326180934906, "grad_norm": 0.5580400932648913, "learning_rate": 2.9194105263157896e-05, "loss": 0.3366, "mean_copy_accuracy": 0.9913793802261353, "mean_gen_accuracy": 0.8598331809043884, "mean_token_accuracy": 0.890275627374649, "num_tokens": 371620055.0, "sample_num_tokens": 8080.25, "step": 1374, "total_num_tokens": 371652376.0, "z_loss": 0.0011062112171202898 }, { "copy_logits_max": -1.7659125328063965, "copy_logits_min": -750000064.0, "copy_num_tokens": 330.0, "epoch": 0.2808271636456472, "gen_logits_max": 8.126615524291992, "gen_logits_mean": -10.85566234588623, "gen_logits_min": -22.957735061645508, "gen_logits_std": 2.5097014904022217, "gen_loss": 0.30120813846588135, "grad_norm": 0.45527657667195226, "learning_rate": 2.9192842105263157e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9909782111644745, "mean_gen_accuracy": 0.8608015179634094, "mean_token_accuracy": 0.891509935259819, "num_tokens": 371898414.0, "sample_num_tokens": 7619.0, "step": 1375, "total_num_tokens": 371928890.0, "z_loss": 0.0010218604002147913 }, { "copy_logits_max": -2.1302905082702637, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.125, "epoch": 0.281031401582844, "gen_logits_max": 8.305745124816895, "gen_logits_mean": -10.878880500793457, "gen_logits_min": -22.533771514892578, "gen_logits_std": 2.462954044342041, "gen_loss": 0.3969176411628723, "grad_norm": 0.5102822724048918, "learning_rate": 2.919157894736842e-05, "loss": 0.3543, "mean_copy_accuracy": 0.991170659661293, "mean_gen_accuracy": 0.8556567281484604, "mean_token_accuracy": 0.8856415450572968, "num_tokens": 372178399.0, "sample_num_tokens": 8486.25, "step": 1376, "total_num_tokens": 372212344.0, "z_loss": 0.0013118770439177752 }, { "copy_logits_max": -0.6165794134140015, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.4375, "epoch": 0.2812356395200408, "gen_logits_max": 8.195403099060059, "gen_logits_mean": -11.890288352966309, "gen_logits_min": -24.009326934814453, "gen_logits_std": 2.4959402084350586, "gen_loss": 0.3153475224971771, "grad_norm": 0.4931400001161752, "learning_rate": 2.9190315789473682e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9931418597698212, "mean_gen_accuracy": 0.8531478047370911, "mean_token_accuracy": 0.8869340270757675, "num_tokens": 372444312.0, "sample_num_tokens": 8283.5, "step": 1377, "total_num_tokens": 372477446.0, "z_loss": 0.0011017294600605965 }, { "copy_logits_max": -2.4143495559692383, "copy_logits_min": -687500032.0, "copy_num_tokens": 277.9375, "epoch": 0.2814398774572377, "gen_logits_max": 8.591352462768555, "gen_logits_mean": -11.63892650604248, "gen_logits_min": -23.563812255859375, "gen_logits_std": 2.4742136001586914, "gen_loss": 0.33809801936149597, "grad_norm": 0.5181882303071288, "learning_rate": 2.918905263157895e-05, "loss": 0.3604, "mean_copy_accuracy": 0.9904774278402328, "mean_gen_accuracy": 0.8534657657146454, "mean_token_accuracy": 0.8827531486749649, "num_tokens": 372691148.0, "sample_num_tokens": 7078.0, "step": 1378, "total_num_tokens": 372719460.0, "z_loss": 0.0010780536103993654 }, { "copy_logits_max": -2.3287835121154785, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.75, "epoch": 0.2816441153944345, "gen_logits_max": 8.225786209106445, "gen_logits_mean": -10.750530242919922, "gen_logits_min": -22.76058006286621, "gen_logits_std": 2.536090135574341, "gen_loss": 0.30421310663223267, "grad_norm": 0.555033744212144, "learning_rate": 2.918778947368421e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9916107654571533, "mean_gen_accuracy": 0.8690702170133591, "mean_token_accuracy": 0.8995369970798492, "num_tokens": 372981834.0, "sample_num_tokens": 9617.5, "step": 1379, "total_num_tokens": 373020304.0, "z_loss": 0.001117910142056644 }, { "copy_logits_max": -0.5220051407814026, "copy_logits_min": -750000000.0, "copy_num_tokens": 723.8125, "epoch": 0.28184835333163133, "gen_logits_max": 8.234119415283203, "gen_logits_mean": -11.25654411315918, "gen_logits_min": -23.936447143554688, "gen_logits_std": 2.515583038330078, "gen_loss": 0.3372722864151001, "grad_norm": 0.5805912374895457, "learning_rate": 2.9186526315789475e-05, "loss": 0.3516, "mean_copy_accuracy": 0.9902792721986771, "mean_gen_accuracy": 0.8488456010818481, "mean_token_accuracy": 0.8865236639976501, "num_tokens": 373277672.0, "sample_num_tokens": 11091.5, "step": 1380, "total_num_tokens": 373322038.0, "z_loss": 0.0013138862559571862 }, { "copy_logits_max": 0.7496294379234314, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.4375, "epoch": 0.2820525912688282, "gen_logits_max": 8.588407516479492, "gen_logits_mean": -10.94857120513916, "gen_logits_min": -22.860565185546875, "gen_logits_std": 2.5162198543548584, "gen_loss": 0.35397425293922424, "grad_norm": 0.5597964790471074, "learning_rate": 2.9185263157894736e-05, "loss": 0.3481, "mean_copy_accuracy": 0.9938144832849503, "mean_gen_accuracy": 0.8500972390174866, "mean_token_accuracy": 0.8881330639123917, "num_tokens": 373552540.0, "sample_num_tokens": 7633.5, "step": 1381, "total_num_tokens": 373583074.0, "z_loss": 0.0013615796342492104 }, { "copy_logits_max": -0.15452685952186584, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.6875, "epoch": 0.282256829206025, "gen_logits_max": 8.068069458007812, "gen_logits_mean": -11.501864433288574, "gen_logits_min": -23.54545783996582, "gen_logits_std": 2.485361099243164, "gen_loss": 0.36924800276756287, "grad_norm": 0.5138133550954134, "learning_rate": 2.9184e-05, "loss": 0.3166, "mean_copy_accuracy": 0.9924411922693253, "mean_gen_accuracy": 0.864951953291893, "mean_token_accuracy": 0.8957079350948334, "num_tokens": 373837665.0, "sample_num_tokens": 7838.75, "step": 1382, "total_num_tokens": 373869020.0, "z_loss": 0.0013740388676524162 }, { "copy_logits_max": -0.6412261724472046, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.5625, "epoch": 0.28246106714322183, "gen_logits_max": 8.557908058166504, "gen_logits_mean": -10.024291038513184, "gen_logits_min": -22.642257690429688, "gen_logits_std": 2.5751290321350098, "gen_loss": 0.3339918553829193, "grad_norm": 0.5376378270999452, "learning_rate": 2.9182736842105265e-05, "loss": 0.3398, "mean_copy_accuracy": 0.9918126314878464, "mean_gen_accuracy": 0.8520902842283249, "mean_token_accuracy": 0.8880306482315063, "num_tokens": 374093995.0, "sample_num_tokens": 7653.75, "step": 1383, "total_num_tokens": 374124610.0, "z_loss": 0.0013200382236391306 }, { "copy_logits_max": -0.65670245885849, "copy_logits_min": -687500032.0, "copy_num_tokens": 375.375, "epoch": 0.2826653050804187, "gen_logits_max": 8.884039878845215, "gen_logits_mean": -10.972872734069824, "gen_logits_min": -22.751262664794922, "gen_logits_std": 2.498380184173584, "gen_loss": 0.36363649368286133, "grad_norm": 0.6009472200348595, "learning_rate": 2.9181473684210526e-05, "loss": 0.3781, "mean_copy_accuracy": 0.9882558137178421, "mean_gen_accuracy": 0.8465090543031693, "mean_token_accuracy": 0.8781624436378479, "num_tokens": 374349373.0, "sample_num_tokens": 8011.25, "step": 1384, "total_num_tokens": 374381418.0, "z_loss": 0.001295367255806923 }, { "copy_logits_max": 1.6684324741363525, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.3125, "epoch": 0.2828695430176155, "gen_logits_max": 7.245565891265869, "gen_logits_mean": -10.93176555633545, "gen_logits_min": -23.52574920654297, "gen_logits_std": 2.524322032928467, "gen_loss": 0.3358812928199768, "grad_norm": 0.6198874196356725, "learning_rate": 2.918021052631579e-05, "loss": 0.3531, "mean_copy_accuracy": 0.990504652261734, "mean_gen_accuracy": 0.8514300286769867, "mean_token_accuracy": 0.8870556056499481, "num_tokens": 374614037.0, "sample_num_tokens": 9020.75, "step": 1385, "total_num_tokens": 374650120.0, "z_loss": 0.0015140733448788524 }, { "copy_logits_max": 1.223280668258667, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.625, "epoch": 0.28307378095481234, "gen_logits_max": 7.719277858734131, "gen_logits_mean": -11.248291015625, "gen_logits_min": -23.891666412353516, "gen_logits_std": 2.5453052520751953, "gen_loss": 0.33338049054145813, "grad_norm": 0.4978507965725255, "learning_rate": 2.9178947368421054e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9922425448894501, "mean_gen_accuracy": 0.854917123913765, "mean_token_accuracy": 0.89004285633564, "num_tokens": 374904991.0, "sample_num_tokens": 7822.75, "step": 1386, "total_num_tokens": 374936282.0, "z_loss": 0.0014739391626790166 }, { "copy_logits_max": -1.0059480667114258, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.0, "epoch": 0.2832780188920092, "gen_logits_max": 7.312073230743408, "gen_logits_mean": -11.729391098022461, "gen_logits_min": -23.687915802001953, "gen_logits_std": 2.462482213973999, "gen_loss": 0.3255891799926758, "grad_norm": 0.5451625388380243, "learning_rate": 2.917768421052632e-05, "loss": 0.3493, "mean_copy_accuracy": 0.9901001155376434, "mean_gen_accuracy": 0.8580734133720398, "mean_token_accuracy": 0.885221317410469, "num_tokens": 375157759.0, "sample_num_tokens": 7662.75, "step": 1387, "total_num_tokens": 375188410.0, "z_loss": 0.0012912938836961985 }, { "copy_logits_max": 0.3035324513912201, "copy_logits_min": -750000000.0, "copy_num_tokens": 689.6875, "epoch": 0.283482256829206, "gen_logits_max": 6.868453025817871, "gen_logits_mean": -12.197434425354004, "gen_logits_min": -24.762027740478516, "gen_logits_std": 2.495011329650879, "gen_loss": 0.3064700961112976, "grad_norm": 0.6332773343428745, "learning_rate": 2.917642105263158e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9917431473731995, "mean_gen_accuracy": 0.8593376129865646, "mean_token_accuracy": 0.8986342698335648, "num_tokens": 375431042.0, "sample_num_tokens": 9255.5, "step": 1388, "total_num_tokens": 375468064.0, "z_loss": 0.001197419362142682 }, { "copy_logits_max": -1.9692978858947754, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.4375, "epoch": 0.28368649476640284, "gen_logits_max": 8.3485107421875, "gen_logits_mean": -11.206750869750977, "gen_logits_min": -23.364229202270508, "gen_logits_std": 2.490889072418213, "gen_loss": 0.36696958541870117, "grad_norm": 0.5110687812761546, "learning_rate": 2.9175157894736844e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9910879582166672, "mean_gen_accuracy": 0.8591378480195999, "mean_token_accuracy": 0.8932022899389267, "num_tokens": 375710353.0, "sample_num_tokens": 7900.75, "step": 1389, "total_num_tokens": 375741956.0, "z_loss": 0.0012222863733768463 }, { "copy_logits_max": -0.6767330169677734, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.75, "epoch": 0.2838907327035997, "gen_logits_max": 9.505301475524902, "gen_logits_mean": -8.847795486450195, "gen_logits_min": -21.839126586914062, "gen_logits_std": 2.566253185272217, "gen_loss": 0.32160860300064087, "grad_norm": 0.5781002880264681, "learning_rate": 2.9173894736842105e-05, "loss": 0.335, "mean_copy_accuracy": 0.9910590648651123, "mean_gen_accuracy": 0.8571184426546097, "mean_token_accuracy": 0.8926263302564621, "num_tokens": 375972773.0, "sample_num_tokens": 8405.75, "step": 1390, "total_num_tokens": 376006396.0, "z_loss": 0.0011813801247626543 }, { "copy_logits_max": -0.4118781089782715, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.875, "epoch": 0.28409497064079653, "gen_logits_max": 7.825830459594727, "gen_logits_mean": -10.631296157836914, "gen_logits_min": -23.044795989990234, "gen_logits_std": 2.469214916229248, "gen_loss": 0.3168696165084839, "grad_norm": 0.5368402949525897, "learning_rate": 2.917263157894737e-05, "loss": 0.3343, "mean_copy_accuracy": 0.9904609769582748, "mean_gen_accuracy": 0.8608029633760452, "mean_token_accuracy": 0.8911413252353668, "num_tokens": 376213084.0, "sample_num_tokens": 7408.0, "step": 1391, "total_num_tokens": 376242716.0, "z_loss": 0.0011261417530477047 }, { "copy_logits_max": -1.0725891590118408, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.875, "epoch": 0.28429920857799335, "gen_logits_max": 7.56876277923584, "gen_logits_mean": -11.744152069091797, "gen_logits_min": -24.07183837890625, "gen_logits_std": 2.462909698486328, "gen_loss": 0.30801326036453247, "grad_norm": 0.5210174922145298, "learning_rate": 2.917136842105263e-05, "loss": 0.321, "mean_copy_accuracy": 0.9906343072652817, "mean_gen_accuracy": 0.8645908981561661, "mean_token_accuracy": 0.8960762172937393, "num_tokens": 376485041.0, "sample_num_tokens": 9039.75, "step": 1392, "total_num_tokens": 376521200.0, "z_loss": 0.001202587503939867 }, { "copy_logits_max": -0.6607158184051514, "copy_logits_min": -625000000.0, "copy_num_tokens": 469.75, "epoch": 0.2845034465151902, "gen_logits_max": 7.674648761749268, "gen_logits_mean": -11.316203117370605, "gen_logits_min": -23.56459617614746, "gen_logits_std": 2.485924482345581, "gen_loss": 0.3401769995689392, "grad_norm": 0.5320441422313295, "learning_rate": 2.9170105263157894e-05, "loss": 0.354, "mean_copy_accuracy": 0.9906459897756577, "mean_gen_accuracy": 0.8506028950214386, "mean_token_accuracy": 0.8839839100837708, "num_tokens": 376745193.0, "sample_num_tokens": 8364.25, "step": 1393, "total_num_tokens": 376778650.0, "z_loss": 0.001236163778230548 }, { "copy_logits_max": -0.06900426745414734, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.625, "epoch": 0.28470768445238703, "gen_logits_max": 7.59152889251709, "gen_logits_mean": -10.945398330688477, "gen_logits_min": -23.216392517089844, "gen_logits_std": 2.5015182495117188, "gen_loss": 0.317359060049057, "grad_norm": 0.4674199260524457, "learning_rate": 2.916884210526316e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9911062270402908, "mean_gen_accuracy": 0.8600892424583435, "mean_token_accuracy": 0.8952485620975494, "num_tokens": 377028064.0, "sample_num_tokens": 8883.5, "step": 1394, "total_num_tokens": 377063598.0, "z_loss": 0.0012433391530066729 }, { "copy_logits_max": -0.13022303581237793, "copy_logits_min": -750000064.0, "copy_num_tokens": 476.1875, "epoch": 0.28491192238958385, "gen_logits_max": 7.956942081451416, "gen_logits_mean": -11.242738723754883, "gen_logits_min": -23.53716468811035, "gen_logits_std": 2.4793484210968018, "gen_loss": 0.3441605269908905, "grad_norm": 0.5193686258519142, "learning_rate": 2.9167578947368423e-05, "loss": 0.3545, "mean_copy_accuracy": 0.9896243363618851, "mean_gen_accuracy": 0.8539027273654938, "mean_token_accuracy": 0.8825295120477676, "num_tokens": 377276794.0, "sample_num_tokens": 8465.0, "step": 1395, "total_num_tokens": 377310654.0, "z_loss": 0.0013790915254503489 }, { "copy_logits_max": -0.8581746816635132, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.6875, "epoch": 0.2851161603267807, "gen_logits_max": 7.835978031158447, "gen_logits_mean": -10.833064079284668, "gen_logits_min": -23.24810028076172, "gen_logits_std": 2.480262041091919, "gen_loss": 0.3220783472061157, "grad_norm": 0.6435692198862689, "learning_rate": 2.9166315789473687e-05, "loss": 0.3625, "mean_copy_accuracy": 0.9904997795820236, "mean_gen_accuracy": 0.8522628098726273, "mean_token_accuracy": 0.8823502361774445, "num_tokens": 377520766.0, "sample_num_tokens": 7796.5, "step": 1396, "total_num_tokens": 377551952.0, "z_loss": 0.0012126018991693854 }, { "copy_logits_max": -1.1892187595367432, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.1875, "epoch": 0.28532039826397754, "gen_logits_max": 7.696938991546631, "gen_logits_mean": -11.092373847961426, "gen_logits_min": -23.217636108398438, "gen_logits_std": 2.4642958641052246, "gen_loss": 0.35202160477638245, "grad_norm": 0.4878084422422818, "learning_rate": 2.9165052631578948e-05, "loss": 0.3201, "mean_copy_accuracy": 0.990487277507782, "mean_gen_accuracy": 0.8675834685564041, "mean_token_accuracy": 0.8946671634912491, "num_tokens": 377791076.0, "sample_num_tokens": 8834.0, "step": 1397, "total_num_tokens": 377826412.0, "z_loss": 0.0013223718851804733 }, { "copy_logits_max": 0.09750235080718994, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.0, "epoch": 0.28552463620117435, "gen_logits_max": 7.6612677574157715, "gen_logits_mean": -11.61955738067627, "gen_logits_min": -23.72119140625, "gen_logits_std": 2.5037665367126465, "gen_loss": 0.3716139495372772, "grad_norm": 0.5487925442774665, "learning_rate": 2.9163789473684212e-05, "loss": 0.3491, "mean_copy_accuracy": 0.9905262291431427, "mean_gen_accuracy": 0.8508425354957581, "mean_token_accuracy": 0.8870174586772919, "num_tokens": 378060450.0, "sample_num_tokens": 8786.5, "step": 1398, "total_num_tokens": 378095596.0, "z_loss": 0.0013002387713640928 }, { "copy_logits_max": -0.6240198612213135, "copy_logits_min": -750000064.0, "copy_num_tokens": 424.3125, "epoch": 0.2857288741383712, "gen_logits_max": 8.442696571350098, "gen_logits_mean": -11.24852180480957, "gen_logits_min": -23.373247146606445, "gen_logits_std": 2.509335517883301, "gen_loss": 0.33901339769363403, "grad_norm": 0.5028096814848918, "learning_rate": 2.9162526315789473e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9917822182178497, "mean_gen_accuracy": 0.8597228974103928, "mean_token_accuracy": 0.891325518488884, "num_tokens": 378340627.0, "sample_num_tokens": 8410.75, "step": 1399, "total_num_tokens": 378374270.0, "z_loss": 0.0012891353107988834 }, { "copy_logits_max": 0.056869328022003174, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.4375, "epoch": 0.28593311207556804, "gen_logits_max": 8.210826873779297, "gen_logits_mean": -10.845584869384766, "gen_logits_min": -23.243343353271484, "gen_logits_std": 2.5275001525878906, "gen_loss": 0.3354324698448181, "grad_norm": 0.5399722012824779, "learning_rate": 2.9161263157894738e-05, "loss": 0.3243, "mean_copy_accuracy": 0.989368125796318, "mean_gen_accuracy": 0.8604418039321899, "mean_token_accuracy": 0.8940806686878204, "num_tokens": 378607605.0, "sample_num_tokens": 7752.25, "step": 1400, "total_num_tokens": 378638614.0, "z_loss": 0.0011833163443952799 }, { "copy_logits_max": -0.8407673835754395, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.9375, "epoch": 0.28613735001276486, "gen_logits_max": 8.071554183959961, "gen_logits_mean": -10.188098907470703, "gen_logits_min": -22.667766571044922, "gen_logits_std": 2.528473138809204, "gen_loss": 0.3382403254508972, "grad_norm": 0.5865314685607593, "learning_rate": 2.916e-05, "loss": 0.3289, "mean_copy_accuracy": 0.9921273738145828, "mean_gen_accuracy": 0.8554368168115616, "mean_token_accuracy": 0.8950193226337433, "num_tokens": 378892852.0, "sample_num_tokens": 8482.0, "step": 1401, "total_num_tokens": 378926780.0, "z_loss": 0.0012757166987285018 }, { "copy_logits_max": -0.3399454951286316, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5625, "epoch": 0.28634158794996173, "gen_logits_max": 8.534357070922852, "gen_logits_mean": -11.326412200927734, "gen_logits_min": -23.451282501220703, "gen_logits_std": 2.5236215591430664, "gen_loss": 0.3258039951324463, "grad_norm": 0.5239516561369908, "learning_rate": 2.9158736842105266e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9911472648382187, "mean_gen_accuracy": 0.8575010299682617, "mean_token_accuracy": 0.8906370252370834, "num_tokens": 379166167.0, "sample_num_tokens": 7450.25, "step": 1402, "total_num_tokens": 379195968.0, "z_loss": 0.0010536399204283953 }, { "copy_logits_max": -1.3228892087936401, "copy_logits_min": -750000064.0, "copy_num_tokens": 444.3125, "epoch": 0.28654582588715855, "gen_logits_max": 8.068981170654297, "gen_logits_mean": -10.782463073730469, "gen_logits_min": -23.380538940429688, "gen_logits_std": 2.542696952819824, "gen_loss": 0.31760841608047485, "grad_norm": 0.5508505664555576, "learning_rate": 2.9157473684210527e-05, "loss": 0.3583, "mean_copy_accuracy": 0.9899052232503891, "mean_gen_accuracy": 0.8531533181667328, "mean_token_accuracy": 0.8830578625202179, "num_tokens": 379416435.0, "sample_num_tokens": 7798.25, "step": 1403, "total_num_tokens": 379447628.0, "z_loss": 0.0011809291318058968 }, { "copy_logits_max": -1.1245200634002686, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.0625, "epoch": 0.28675006382435536, "gen_logits_max": 7.771688461303711, "gen_logits_mean": -11.197977066040039, "gen_logits_min": -23.273666381835938, "gen_logits_std": 2.4958715438842773, "gen_loss": 0.34545642137527466, "grad_norm": 0.4956768526099592, "learning_rate": 2.915621052631579e-05, "loss": 0.3165, "mean_copy_accuracy": 0.9912592172622681, "mean_gen_accuracy": 0.8636591136455536, "mean_token_accuracy": 0.8967527002096176, "num_tokens": 379674811.0, "sample_num_tokens": 8977.25, "step": 1404, "total_num_tokens": 379710720.0, "z_loss": 0.001150342170149088 }, { "copy_logits_max": -1.0201129913330078, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.5, "epoch": 0.28695430176155223, "gen_logits_max": 7.520119667053223, "gen_logits_mean": -11.153556823730469, "gen_logits_min": -23.508834838867188, "gen_logits_std": 2.524698495864868, "gen_loss": 0.3434724509716034, "grad_norm": 0.5043498718023157, "learning_rate": 2.9154947368421052e-05, "loss": 0.3473, "mean_copy_accuracy": 0.9923252910375595, "mean_gen_accuracy": 0.8534234911203384, "mean_token_accuracy": 0.8851105570793152, "num_tokens": 379935470.0, "sample_num_tokens": 7730.0, "step": 1405, "total_num_tokens": 379966390.0, "z_loss": 0.0012269193539395928 }, { "copy_logits_max": -0.16198799014091492, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.3125, "epoch": 0.28715853969874905, "gen_logits_max": 7.698741912841797, "gen_logits_mean": -10.189706802368164, "gen_logits_min": -23.061708450317383, "gen_logits_std": 2.562500238418579, "gen_loss": 0.30304577946662903, "grad_norm": 0.5276624376314792, "learning_rate": 2.9153684210526317e-05, "loss": 0.3379, "mean_copy_accuracy": 0.9902020841836929, "mean_gen_accuracy": 0.8610261231660843, "mean_token_accuracy": 0.8914686590433121, "num_tokens": 380197591.0, "sample_num_tokens": 9325.25, "step": 1406, "total_num_tokens": 380234892.0, "z_loss": 0.0013376008719205856 }, { "copy_logits_max": 0.2286493182182312, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.375, "epoch": 0.28736277763594587, "gen_logits_max": 7.215501308441162, "gen_logits_mean": -10.892122268676758, "gen_logits_min": -23.622966766357422, "gen_logits_std": 2.5478997230529785, "gen_loss": 0.32406115531921387, "grad_norm": 0.5572614837973333, "learning_rate": 2.9152421052631578e-05, "loss": 0.3459, "mean_copy_accuracy": 0.9908046126365662, "mean_gen_accuracy": 0.8511447459459305, "mean_token_accuracy": 0.8868100047111511, "num_tokens": 380469878.0, "sample_num_tokens": 9149.0, "step": 1407, "total_num_tokens": 380506474.0, "z_loss": 0.0012743771076202393 }, { "copy_logits_max": 0.042222291231155396, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.1875, "epoch": 0.28756701557314274, "gen_logits_max": 8.004640579223633, "gen_logits_mean": -9.970985412597656, "gen_logits_min": -22.53188705444336, "gen_logits_std": 2.5369181632995605, "gen_loss": 0.36679714918136597, "grad_norm": 0.5016702105177118, "learning_rate": 2.9151157894736842e-05, "loss": 0.3528, "mean_copy_accuracy": 0.9932983815670013, "mean_gen_accuracy": 0.8514537364244461, "mean_token_accuracy": 0.886859804391861, "num_tokens": 380750381.0, "sample_num_tokens": 7829.25, "step": 1408, "total_num_tokens": 380781698.0, "z_loss": 0.0012907549971714616 }, { "copy_logits_max": -3.4621829986572266, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.4375, "epoch": 0.28777125351033955, "gen_logits_max": 8.101332664489746, "gen_logits_mean": -11.455456733703613, "gen_logits_min": -23.989316940307617, "gen_logits_std": 2.4987926483154297, "gen_loss": 0.29529526829719543, "grad_norm": 0.5885134369772902, "learning_rate": 2.9149894736842106e-05, "loss": 0.3259, "mean_copy_accuracy": 0.988971397280693, "mean_gen_accuracy": 0.8668983727693558, "mean_token_accuracy": 0.8952824473381042, "num_tokens": 381028930.0, "sample_num_tokens": 8372.0, "step": 1409, "total_num_tokens": 381062418.0, "z_loss": 0.0010409944225102663 }, { "copy_logits_max": -2.0248329639434814, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.375, "epoch": 0.28797549144753637, "gen_logits_max": 7.282938003540039, "gen_logits_mean": -12.111557006835938, "gen_logits_min": -24.182764053344727, "gen_logits_std": 2.4669535160064697, "gen_loss": 0.3781706690788269, "grad_norm": 0.530929417079513, "learning_rate": 2.9148631578947367e-05, "loss": 0.3419, "mean_copy_accuracy": 0.9917825162410736, "mean_gen_accuracy": 0.8543383926153183, "mean_token_accuracy": 0.8881273418664932, "num_tokens": 381301927.0, "sample_num_tokens": 9315.75, "step": 1410, "total_num_tokens": 381339190.0, "z_loss": 0.0012029390782117844 }, { "copy_logits_max": -2.364147186279297, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.9375, "epoch": 0.28817972938473324, "gen_logits_max": 7.792546272277832, "gen_logits_mean": -11.191164016723633, "gen_logits_min": -23.589702606201172, "gen_logits_std": 2.5433669090270996, "gen_loss": 0.3181511163711548, "grad_norm": 0.553405714583064, "learning_rate": 2.9147368421052635e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9915321171283722, "mean_gen_accuracy": 0.8587521910667419, "mean_token_accuracy": 0.8955762684345245, "num_tokens": 381601390.0, "sample_num_tokens": 8357.5, "step": 1411, "total_num_tokens": 381634820.0, "z_loss": 0.0012130232062190771 }, { "copy_logits_max": -1.4235601425170898, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.3125, "epoch": 0.28838396732193006, "gen_logits_max": 7.732317924499512, "gen_logits_mean": -10.787741661071777, "gen_logits_min": -23.493988037109375, "gen_logits_std": 2.5429539680480957, "gen_loss": 0.35103827714920044, "grad_norm": 0.5354541536902536, "learning_rate": 2.9146105263157896e-05, "loss": 0.3461, "mean_copy_accuracy": 0.9910152107477188, "mean_gen_accuracy": 0.8624936193227768, "mean_token_accuracy": 0.8913269490003586, "num_tokens": 381865864.0, "sample_num_tokens": 8880.0, "step": 1412, "total_num_tokens": 381901384.0, "z_loss": 0.001214589225128293 }, { "copy_logits_max": -1.797415018081665, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.4375, "epoch": 0.2885882052591269, "gen_logits_max": 8.271780014038086, "gen_logits_mean": -11.084123611450195, "gen_logits_min": -22.99625015258789, "gen_logits_std": 2.4821360111236572, "gen_loss": 0.3515758514404297, "grad_norm": 0.5817390669270841, "learning_rate": 2.914484210526316e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9899269491434097, "mean_gen_accuracy": 0.8596110641956329, "mean_token_accuracy": 0.8925262242555618, "num_tokens": 382144775.0, "sample_num_tokens": 7448.25, "step": 1413, "total_num_tokens": 382174568.0, "z_loss": 0.001248617540113628 }, { "copy_logits_max": -1.5621339082717896, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.4375, "epoch": 0.28879244319632374, "gen_logits_max": 8.328230857849121, "gen_logits_mean": -10.598572731018066, "gen_logits_min": -22.898168563842773, "gen_logits_std": 2.48795223236084, "gen_loss": 0.33274105191230774, "grad_norm": 0.5003082917828724, "learning_rate": 2.914357894736842e-05, "loss": 0.3641, "mean_copy_accuracy": 0.9890428930521011, "mean_gen_accuracy": 0.8555520474910736, "mean_token_accuracy": 0.8805472701787949, "num_tokens": 382385643.0, "sample_num_tokens": 6927.75, "step": 1414, "total_num_tokens": 382413354.0, "z_loss": 0.0012390321353450418 }, { "copy_logits_max": 0.32554399967193604, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.8125, "epoch": 0.28899668113352056, "gen_logits_max": 7.105607032775879, "gen_logits_mean": -11.111699104309082, "gen_logits_min": -23.84937286376953, "gen_logits_std": 2.539039134979248, "gen_loss": 0.27941232919692993, "grad_norm": 0.5094407665586971, "learning_rate": 2.9142315789473685e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9914797246456146, "mean_gen_accuracy": 0.8666882216930389, "mean_token_accuracy": 0.899339035153389, "num_tokens": 382674854.0, "sample_num_tokens": 7820.5, "step": 1415, "total_num_tokens": 382706136.0, "z_loss": 0.0011777093168348074 }, { "copy_logits_max": 0.354734867811203, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.625, "epoch": 0.2892009190707174, "gen_logits_max": 7.85332727432251, "gen_logits_mean": -11.658411979675293, "gen_logits_min": -23.716230392456055, "gen_logits_std": 2.5039710998535156, "gen_loss": 0.38659775257110596, "grad_norm": 0.5317980522744213, "learning_rate": 2.9141052631578946e-05, "loss": 0.3544, "mean_copy_accuracy": 0.9894629120826721, "mean_gen_accuracy": 0.8550947159528732, "mean_token_accuracy": 0.8843093067407608, "num_tokens": 382928402.0, "sample_num_tokens": 6812.5, "step": 1416, "total_num_tokens": 382955652.0, "z_loss": 0.0013690909836441278 }, { "copy_logits_max": -0.14296826720237732, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.25, "epoch": 0.2894051570079142, "gen_logits_max": 7.647526264190674, "gen_logits_mean": -10.652466773986816, "gen_logits_min": -23.14242935180664, "gen_logits_std": 2.5138392448425293, "gen_loss": 0.37935373187065125, "grad_norm": 0.49630548571945976, "learning_rate": 2.913978947368421e-05, "loss": 0.3465, "mean_copy_accuracy": 0.9924226403236389, "mean_gen_accuracy": 0.8496908843517303, "mean_token_accuracy": 0.8881687968969345, "num_tokens": 383214697.0, "sample_num_tokens": 8823.75, "step": 1417, "total_num_tokens": 383249992.0, "z_loss": 0.0014299256727099419 }, { "copy_logits_max": -0.4947918653488159, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.625, "epoch": 0.28960939494511106, "gen_logits_max": 7.531728267669678, "gen_logits_mean": -10.680936813354492, "gen_logits_min": -23.08068084716797, "gen_logits_std": 2.563714027404785, "gen_loss": 0.28148746490478516, "grad_norm": 0.5121236421883849, "learning_rate": 2.913852631578947e-05, "loss": 0.328, "mean_copy_accuracy": 0.9906423389911652, "mean_gen_accuracy": 0.8596044033765793, "mean_token_accuracy": 0.893102690577507, "num_tokens": 383483900.0, "sample_num_tokens": 8878.0, "step": 1418, "total_num_tokens": 383519412.0, "z_loss": 0.0012124557979404926 }, { "copy_logits_max": -0.05724482238292694, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.1875, "epoch": 0.2898136328823079, "gen_logits_max": 7.8406982421875, "gen_logits_mean": -9.751932144165039, "gen_logits_min": -22.550769805908203, "gen_logits_std": 2.546811103820801, "gen_loss": 0.3459222614765167, "grad_norm": 0.49935409061837327, "learning_rate": 2.913726315789474e-05, "loss": 0.3701, "mean_copy_accuracy": 0.9923514127731323, "mean_gen_accuracy": 0.8457587361335754, "mean_token_accuracy": 0.8800472766160965, "num_tokens": 383767331.0, "sample_num_tokens": 10216.75, "step": 1419, "total_num_tokens": 383808198.0, "z_loss": 0.0012685897527262568 }, { "copy_logits_max": -2.16483211517334, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.2900178708195047, "gen_logits_max": 7.552491188049316, "gen_logits_mean": -10.462181091308594, "gen_logits_min": -22.85009765625, "gen_logits_std": 2.4944562911987305, "gen_loss": 0.3472529649734497, "grad_norm": 0.47241896973582426, "learning_rate": 2.9136e-05, "loss": 0.3421, "mean_copy_accuracy": 0.9937779754400253, "mean_gen_accuracy": 0.8491430133581161, "mean_token_accuracy": 0.8908044099807739, "num_tokens": 384061537.0, "sample_num_tokens": 7476.25, "step": 1420, "total_num_tokens": 384091442.0, "z_loss": 0.0011776754399761558 }, { "copy_logits_max": -0.27664411067962646, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.5625, "epoch": 0.29022210875670157, "gen_logits_max": 7.390163421630859, "gen_logits_mean": -10.236538887023926, "gen_logits_min": -22.572834014892578, "gen_logits_std": 2.534302234649658, "gen_loss": 0.33563530445098877, "grad_norm": 0.4970655677578261, "learning_rate": 2.9134736842105265e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9936306178569794, "mean_gen_accuracy": 0.8550662994384766, "mean_token_accuracy": 0.8889949917793274, "num_tokens": 384323557.0, "sample_num_tokens": 8763.25, "step": 1421, "total_num_tokens": 384358610.0, "z_loss": 0.0013258276740089059 }, { "copy_logits_max": -1.2392476797103882, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.1875, "epoch": 0.2904263466938984, "gen_logits_max": 7.020323753356934, "gen_logits_mean": -11.335996627807617, "gen_logits_min": -23.512042999267578, "gen_logits_std": 2.51603102684021, "gen_loss": 0.2988288402557373, "grad_norm": 0.5350027983727499, "learning_rate": 2.913347368421053e-05, "loss": 0.3351, "mean_copy_accuracy": 0.9913253039121628, "mean_gen_accuracy": 0.8609421402215958, "mean_token_accuracy": 0.8900568187236786, "num_tokens": 384592946.0, "sample_num_tokens": 7941.5, "step": 1422, "total_num_tokens": 384624712.0, "z_loss": 0.0011270721442997456 }, { "copy_logits_max": -1.6207751035690308, "copy_logits_min": -562500032.0, "copy_num_tokens": 537.0, "epoch": 0.2906305846310952, "gen_logits_max": 8.083624839782715, "gen_logits_mean": -9.715787887573242, "gen_logits_min": -22.541595458984375, "gen_logits_std": 2.550630807876587, "gen_loss": 0.342950701713562, "grad_norm": 0.5936557619141715, "learning_rate": 2.913221052631579e-05, "loss": 0.3574, "mean_copy_accuracy": 0.9899089634418488, "mean_gen_accuracy": 0.8532966524362564, "mean_token_accuracy": 0.8837892264127731, "num_tokens": 384865899.0, "sample_num_tokens": 8898.75, "step": 1423, "total_num_tokens": 384901494.0, "z_loss": 0.0011925275903195143 }, { "copy_logits_max": -0.09834368526935577, "copy_logits_min": -750000000.0, "copy_num_tokens": 720.6875, "epoch": 0.2908348225682921, "gen_logits_max": 7.199986934661865, "gen_logits_mean": -10.633554458618164, "gen_logits_min": -23.785289764404297, "gen_logits_std": 2.6007113456726074, "gen_loss": 0.2981630563735962, "grad_norm": 0.5039797203362061, "learning_rate": 2.9130947368421054e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9920562654733658, "mean_gen_accuracy": 0.8541447073221207, "mean_token_accuracy": 0.891779363155365, "num_tokens": 385165127.0, "sample_num_tokens": 9955.25, "step": 1424, "total_num_tokens": 385204948.0, "z_loss": 0.0011911863693967462 }, { "copy_logits_max": 0.5923606157302856, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.5625, "epoch": 0.2910390605054889, "gen_logits_max": 7.894501209259033, "gen_logits_mean": -10.661046981811523, "gen_logits_min": -23.19717025756836, "gen_logits_std": 2.539968490600586, "gen_loss": 0.3420639634132385, "grad_norm": 0.5228448203772246, "learning_rate": 2.9129684210526315e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9924224019050598, "mean_gen_accuracy": 0.8584482222795486, "mean_token_accuracy": 0.892359659075737, "num_tokens": 385445909.0, "sample_num_tokens": 9040.75, "step": 1425, "total_num_tokens": 385482072.0, "z_loss": 0.0013629914028570056 }, { "copy_logits_max": 0.3810983896255493, "copy_logits_min": -687500032.0, "copy_num_tokens": 530.4375, "epoch": 0.2912432984426857, "gen_logits_max": 7.855124473571777, "gen_logits_mean": -10.407146453857422, "gen_logits_min": -22.581974029541016, "gen_logits_std": 2.556260108947754, "gen_loss": 0.3830397129058838, "grad_norm": 0.552663810605157, "learning_rate": 2.912842105263158e-05, "loss": 0.3486, "mean_copy_accuracy": 0.9931527376174927, "mean_gen_accuracy": 0.8493296056985855, "mean_token_accuracy": 0.8873695433139801, "num_tokens": 385711134.0, "sample_num_tokens": 9016.5, "step": 1426, "total_num_tokens": 385747200.0, "z_loss": 0.0014378208434209228 }, { "copy_logits_max": 0.06531167030334473, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.6875, "epoch": 0.2914475363798826, "gen_logits_max": 7.756743431091309, "gen_logits_mean": -11.420852661132812, "gen_logits_min": -23.924732208251953, "gen_logits_std": 2.5431790351867676, "gen_loss": 0.3528839647769928, "grad_norm": 0.5509415767330021, "learning_rate": 2.9127157894736844e-05, "loss": 0.3421, "mean_copy_accuracy": 0.9905135482549667, "mean_gen_accuracy": 0.8571315258741379, "mean_token_accuracy": 0.8915923833847046, "num_tokens": 385994322.0, "sample_num_tokens": 7247.0, "step": 1427, "total_num_tokens": 386023310.0, "z_loss": 0.0011992482468485832 }, { "copy_logits_max": 0.6969397068023682, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.0, "epoch": 0.2916517743170794, "gen_logits_max": 6.985345840454102, "gen_logits_mean": -11.146045684814453, "gen_logits_min": -23.71562957763672, "gen_logits_std": 2.582876682281494, "gen_loss": 0.30862489342689514, "grad_norm": 0.5090453146464905, "learning_rate": 2.9125894736842108e-05, "loss": 0.3208, "mean_copy_accuracy": 0.9915381371974945, "mean_gen_accuracy": 0.8575390875339508, "mean_token_accuracy": 0.8946828246116638, "num_tokens": 386248616.0, "sample_num_tokens": 8586.0, "step": 1428, "total_num_tokens": 386282960.0, "z_loss": 0.0012298040091991425 }, { "copy_logits_max": -0.9879716634750366, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.3125, "epoch": 0.2918560122542762, "gen_logits_max": 7.524563789367676, "gen_logits_mean": -10.44438362121582, "gen_logits_min": -22.733501434326172, "gen_logits_std": 2.57922101020813, "gen_loss": 0.29721301794052124, "grad_norm": 0.47124746671806345, "learning_rate": 2.912463157894737e-05, "loss": 0.3458, "mean_copy_accuracy": 0.9931323230266571, "mean_gen_accuracy": 0.8543899804353714, "mean_token_accuracy": 0.8890822976827621, "num_tokens": 386545424.0, "sample_num_tokens": 10303.5, "step": 1429, "total_num_tokens": 386586638.0, "z_loss": 0.0011964819859713316 }, { "copy_logits_max": -1.0564494132995605, "copy_logits_min": -687500032.0, "copy_num_tokens": 373.875, "epoch": 0.2920602501914731, "gen_logits_max": 7.7694993019104, "gen_logits_mean": -10.563959121704102, "gen_logits_min": -22.409799575805664, "gen_logits_std": 2.4863882064819336, "gen_loss": 0.3339012563228607, "grad_norm": 0.5470411404236116, "learning_rate": 2.9123368421052633e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9926333129405975, "mean_gen_accuracy": 0.8558157086372375, "mean_token_accuracy": 0.8878999948501587, "num_tokens": 386800873.0, "sample_num_tokens": 8848.75, "step": 1430, "total_num_tokens": 386836268.0, "z_loss": 0.0010627289302647114 }, { "copy_logits_max": 2.264273166656494, "copy_logits_min": -750000000.0, "copy_num_tokens": 704.375, "epoch": 0.2922644881286699, "gen_logits_max": 6.827861309051514, "gen_logits_mean": -10.721063613891602, "gen_logits_min": -23.513629913330078, "gen_logits_std": 2.578509569168091, "gen_loss": 0.31690746545791626, "grad_norm": 0.501231366217943, "learning_rate": 2.9122105263157894e-05, "loss": 0.3252, "mean_copy_accuracy": 0.9906930327415466, "mean_gen_accuracy": 0.8619134426116943, "mean_token_accuracy": 0.8936382234096527, "num_tokens": 387055975.0, "sample_num_tokens": 9818.75, "step": 1431, "total_num_tokens": 387095250.0, "z_loss": 0.001207580091431737 }, { "copy_logits_max": -2.430408477783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.3125, "epoch": 0.2924687260658667, "gen_logits_max": 7.745105743408203, "gen_logits_mean": -12.26872444152832, "gen_logits_min": -24.35382080078125, "gen_logits_std": 2.499061107635498, "gen_loss": 0.316161185503006, "grad_norm": 0.5136212862375551, "learning_rate": 2.912084210526316e-05, "loss": 0.3632, "mean_copy_accuracy": 0.991117998957634, "mean_gen_accuracy": 0.8523519784212112, "mean_token_accuracy": 0.8835101574659348, "num_tokens": 387328987.0, "sample_num_tokens": 7844.25, "step": 1432, "total_num_tokens": 387360364.0, "z_loss": 0.0010917356703430414 }, { "copy_logits_max": -0.9847126603126526, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.75, "epoch": 0.2926729640030636, "gen_logits_max": 7.392573833465576, "gen_logits_mean": -11.535396575927734, "gen_logits_min": -23.529138565063477, "gen_logits_std": 2.5535078048706055, "gen_loss": 0.296440064907074, "grad_norm": 0.456134823790628, "learning_rate": 2.911957894736842e-05, "loss": 0.3142, "mean_copy_accuracy": 0.9934130609035492, "mean_gen_accuracy": 0.8619637340307236, "mean_token_accuracy": 0.8977353572845459, "num_tokens": 387602624.0, "sample_num_tokens": 10483.5, "step": 1433, "total_num_tokens": 387644558.0, "z_loss": 0.0010289851343259215 }, { "copy_logits_max": 2.155712366104126, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.625, "epoch": 0.2928772019402604, "gen_logits_max": 7.5747504234313965, "gen_logits_mean": -10.762280464172363, "gen_logits_min": -23.59709930419922, "gen_logits_std": 2.584291934967041, "gen_loss": 0.2865527868270874, "grad_norm": 0.48885829695474586, "learning_rate": 2.9118315789473684e-05, "loss": 0.3398, "mean_copy_accuracy": 0.9928176552057266, "mean_gen_accuracy": 0.855593204498291, "mean_token_accuracy": 0.8898216038942337, "num_tokens": 387885484.0, "sample_num_tokens": 8175.5, "step": 1434, "total_num_tokens": 387918186.0, "z_loss": 0.0012858349364250898 }, { "copy_logits_max": -1.0627365112304688, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.3125, "epoch": 0.2930814398774572, "gen_logits_max": 7.283863544464111, "gen_logits_mean": -12.474365234375, "gen_logits_min": -24.51572036743164, "gen_logits_std": 2.5099005699157715, "gen_loss": 0.30849915742874146, "grad_norm": 0.49280748003400093, "learning_rate": 2.9117052631578948e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9923005700111389, "mean_gen_accuracy": 0.860793724656105, "mean_token_accuracy": 0.8940725177526474, "num_tokens": 388180785.0, "sample_num_tokens": 9092.75, "step": 1435, "total_num_tokens": 388217156.0, "z_loss": 0.0011745941592380404 }, { "copy_logits_max": -0.26151028275489807, "copy_logits_min": -750000000.0, "copy_num_tokens": 732.125, "epoch": 0.2932856778146541, "gen_logits_max": 7.2329182624816895, "gen_logits_mean": -10.872198104858398, "gen_logits_min": -23.9693603515625, "gen_logits_std": 2.5341148376464844, "gen_loss": 0.2850000560283661, "grad_norm": 0.5359354664309793, "learning_rate": 2.9115789473684212e-05, "loss": 0.3415, "mean_copy_accuracy": 0.9905140846967697, "mean_gen_accuracy": 0.8539405167102814, "mean_token_accuracy": 0.8896487653255463, "num_tokens": 388458025.0, "sample_num_tokens": 10458.75, "step": 1436, "total_num_tokens": 388499860.0, "z_loss": 0.0011040025856345892 }, { "copy_logits_max": -0.4526657462120056, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.875, "epoch": 0.2934899157518509, "gen_logits_max": 7.4908928871154785, "gen_logits_mean": -11.844514846801758, "gen_logits_min": -23.940662384033203, "gen_logits_std": 2.5261282920837402, "gen_loss": 0.34013497829437256, "grad_norm": 0.5131403408098205, "learning_rate": 2.9114526315789477e-05, "loss": 0.349, "mean_copy_accuracy": 0.9909407645463943, "mean_gen_accuracy": 0.8511659353971481, "mean_token_accuracy": 0.8869380056858063, "num_tokens": 388731441.0, "sample_num_tokens": 9021.25, "step": 1437, "total_num_tokens": 388767526.0, "z_loss": 0.0012261363444849849 }, { "copy_logits_max": -1.0306005477905273, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.875, "epoch": 0.2936941536890477, "gen_logits_max": 7.694975852966309, "gen_logits_mean": -11.95485782623291, "gen_logits_min": -24.428306579589844, "gen_logits_std": 2.518124580383301, "gen_loss": 0.31999510526657104, "grad_norm": 0.4538846258819745, "learning_rate": 2.9113263157894737e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9907517582178116, "mean_gen_accuracy": 0.8731191158294678, "mean_token_accuracy": 0.8956776708364487, "num_tokens": 388998890.0, "sample_num_tokens": 8691.0, "step": 1438, "total_num_tokens": 389033654.0, "z_loss": 0.0011395541951060295 }, { "copy_logits_max": 0.6482599973678589, "copy_logits_min": -625000064.0, "copy_num_tokens": 604.875, "epoch": 0.2938983916262446, "gen_logits_max": 7.307710647583008, "gen_logits_mean": -11.287191390991211, "gen_logits_min": -23.7110595703125, "gen_logits_std": 2.5214462280273438, "gen_loss": 0.34581488370895386, "grad_norm": 0.4730556978908783, "learning_rate": 2.9112000000000002e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9923379272222519, "mean_gen_accuracy": 0.8529060184955597, "mean_token_accuracy": 0.8925142586231232, "num_tokens": 389280232.0, "sample_num_tokens": 9519.5, "step": 1439, "total_num_tokens": 389318310.0, "z_loss": 0.0013262615539133549 }, { "copy_logits_max": 1.52638840675354, "copy_logits_min": -750000064.0, "copy_num_tokens": 540.25, "epoch": 0.2941026295634414, "gen_logits_max": 6.727537155151367, "gen_logits_mean": -11.87717056274414, "gen_logits_min": -24.04425811767578, "gen_logits_std": 2.550168037414551, "gen_loss": 0.3293251693248749, "grad_norm": 0.45388147542219176, "learning_rate": 2.9110736842105263e-05, "loss": 0.3254, "mean_copy_accuracy": 0.9932298213243484, "mean_gen_accuracy": 0.8574763089418411, "mean_token_accuracy": 0.8948542475700378, "num_tokens": 389570171.0, "sample_num_tokens": 8530.75, "step": 1440, "total_num_tokens": 389604294.0, "z_loss": 0.0012480308068916202 }, { "copy_logits_max": -0.9747474193572998, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.3125, "epoch": 0.2943068675006382, "gen_logits_max": 7.15079927444458, "gen_logits_mean": -13.063405990600586, "gen_logits_min": -25.323139190673828, "gen_logits_std": 2.504204273223877, "gen_loss": 0.34794536232948303, "grad_norm": 0.5061838901857447, "learning_rate": 2.9109473684210527e-05, "loss": 0.3345, "mean_copy_accuracy": 0.9927456229925156, "mean_gen_accuracy": 0.8556395620107651, "mean_token_accuracy": 0.8904892951250076, "num_tokens": 389841365.0, "sample_num_tokens": 8740.25, "step": 1441, "total_num_tokens": 389876326.0, "z_loss": 0.001253834692761302 }, { "copy_logits_max": -0.921319842338562, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.1875, "epoch": 0.2945111054378351, "gen_logits_max": 7.853212833404541, "gen_logits_mean": -10.834796905517578, "gen_logits_min": -23.33684730529785, "gen_logits_std": 2.5666584968566895, "gen_loss": 0.3619307279586792, "grad_norm": 0.5023983386801019, "learning_rate": 2.9108210526315788e-05, "loss": 0.3536, "mean_copy_accuracy": 0.9930692166090012, "mean_gen_accuracy": 0.8532165586948395, "mean_token_accuracy": 0.8866194635629654, "num_tokens": 390109237.0, "sample_num_tokens": 8999.25, "step": 1442, "total_num_tokens": 390145234.0, "z_loss": 0.001427776413038373 }, { "copy_logits_max": -0.23325592279434204, "copy_logits_min": -750000064.0, "copy_num_tokens": 406.5625, "epoch": 0.2947153433750319, "gen_logits_max": 8.007661819458008, "gen_logits_mean": -11.12880802154541, "gen_logits_min": -22.95279312133789, "gen_logits_std": 2.5264956951141357, "gen_loss": 0.36116230487823486, "grad_norm": 0.5259307925247543, "learning_rate": 2.9106947368421056e-05, "loss": 0.3283, "mean_copy_accuracy": 0.9925205409526825, "mean_gen_accuracy": 0.8529448956251144, "mean_token_accuracy": 0.8925861567258835, "num_tokens": 390394708.0, "sample_num_tokens": 8228.0, "step": 1443, "total_num_tokens": 390427620.0, "z_loss": 0.0011946939630433917 }, { "copy_logits_max": -1.9522576332092285, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.75, "epoch": 0.29491958131222873, "gen_logits_max": 7.675240516662598, "gen_logits_mean": -11.735669136047363, "gen_logits_min": -23.640180587768555, "gen_logits_std": 2.4938082695007324, "gen_loss": 0.36848363280296326, "grad_norm": 0.44486870338929413, "learning_rate": 2.9105684210526317e-05, "loss": 0.3462, "mean_copy_accuracy": 0.993170753121376, "mean_gen_accuracy": 0.8538317978382111, "mean_token_accuracy": 0.8869512677192688, "num_tokens": 390669820.0, "sample_num_tokens": 8323.0, "step": 1444, "total_num_tokens": 390703112.0, "z_loss": 0.001215985044836998 }, { "copy_logits_max": -1.4350413084030151, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.0, "epoch": 0.2951238192494256, "gen_logits_max": 7.757157325744629, "gen_logits_mean": -11.308693885803223, "gen_logits_min": -23.02789306640625, "gen_logits_std": 2.5048019886016846, "gen_loss": 0.359011173248291, "grad_norm": 0.48803400768543675, "learning_rate": 2.910442105263158e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9930964112281799, "mean_gen_accuracy": 0.8635808378458023, "mean_token_accuracy": 0.8988817930221558, "num_tokens": 390962947.0, "sample_num_tokens": 8722.25, "step": 1445, "total_num_tokens": 390997836.0, "z_loss": 0.0011633469257503748 }, { "copy_logits_max": -2.243408203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.875, "epoch": 0.2953280571866224, "gen_logits_max": 7.76862907409668, "gen_logits_mean": -11.760374069213867, "gen_logits_min": -24.213390350341797, "gen_logits_std": 2.515774965286255, "gen_loss": 0.3304727077484131, "grad_norm": 0.636238100889495, "learning_rate": 2.9103157894736842e-05, "loss": 0.3369, "mean_copy_accuracy": 0.991742417216301, "mean_gen_accuracy": 0.8548228889703751, "mean_token_accuracy": 0.8902315348386765, "num_tokens": 391225176.0, "sample_num_tokens": 8301.0, "step": 1446, "total_num_tokens": 391258380.0, "z_loss": 0.0011293792631477118 }, { "copy_logits_max": -0.5832805633544922, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.9375, "epoch": 0.29553229512381923, "gen_logits_max": 7.743033409118652, "gen_logits_mean": -11.758694648742676, "gen_logits_min": -24.286006927490234, "gen_logits_std": 2.5424113273620605, "gen_loss": 0.3117002248764038, "grad_norm": 0.5199525161036371, "learning_rate": 2.9101894736842106e-05, "loss": 0.348, "mean_copy_accuracy": 0.9921212494373322, "mean_gen_accuracy": 0.8544367253780365, "mean_token_accuracy": 0.8886447101831436, "num_tokens": 391478428.0, "sample_num_tokens": 7756.5, "step": 1447, "total_num_tokens": 391509454.0, "z_loss": 0.001101782312616706 }, { "copy_logits_max": 0.5855137705802917, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.4375, "epoch": 0.2957365330610161, "gen_logits_max": 7.492722511291504, "gen_logits_mean": -10.546171188354492, "gen_logits_min": -23.151657104492188, "gen_logits_std": 2.572780132293701, "gen_loss": 0.3306972086429596, "grad_norm": 0.47333030199581905, "learning_rate": 2.9100631578947367e-05, "loss": 0.3437, "mean_copy_accuracy": 0.991940975189209, "mean_gen_accuracy": 0.8568592071533203, "mean_token_accuracy": 0.8878372460603714, "num_tokens": 391749933.0, "sample_num_tokens": 9797.25, "step": 1448, "total_num_tokens": 391789122.0, "z_loss": 0.001151148695498705 }, { "copy_logits_max": -2.1596007347106934, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.0, "epoch": 0.2959407709982129, "gen_logits_max": 7.621282577514648, "gen_logits_mean": -11.318984985351562, "gen_logits_min": -23.888227462768555, "gen_logits_std": 2.528080940246582, "gen_loss": 0.3156190812587738, "grad_norm": 0.4857333825198645, "learning_rate": 2.909936842105263e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9904177337884903, "mean_gen_accuracy": 0.8671231716871262, "mean_token_accuracy": 0.8984788060188293, "num_tokens": 392035795.0, "sample_num_tokens": 7509.25, "step": 1449, "total_num_tokens": 392065832.0, "z_loss": 0.0010539579670876265 }, { "copy_logits_max": -2.698424816131592, "copy_logits_min": -750000064.0, "copy_num_tokens": 350.8125, "epoch": 0.29614500893540974, "gen_logits_max": 7.560380935668945, "gen_logits_mean": -10.299559593200684, "gen_logits_min": -22.82331085205078, "gen_logits_std": 2.547250747680664, "gen_loss": 0.38159871101379395, "grad_norm": 0.47408920338848154, "learning_rate": 2.9098105263157896e-05, "loss": 0.3587, "mean_copy_accuracy": 0.992599755525589, "mean_gen_accuracy": 0.8452364653348923, "mean_token_accuracy": 0.8835802972316742, "num_tokens": 392294908.0, "sample_num_tokens": 7425.5, "step": 1450, "total_num_tokens": 392324610.0, "z_loss": 0.0011625918559730053 }, { "copy_logits_max": 1.704859972000122, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.8125, "epoch": 0.2963492468726066, "gen_logits_max": 7.764825820922852, "gen_logits_mean": -10.989768028259277, "gen_logits_min": -23.54300308227539, "gen_logits_std": 2.557945728302002, "gen_loss": 0.37322521209716797, "grad_norm": 0.5280328965484291, "learning_rate": 2.909684210526316e-05, "loss": 0.3505, "mean_copy_accuracy": 0.9901984483003616, "mean_gen_accuracy": 0.8566258102655411, "mean_token_accuracy": 0.8861154466867447, "num_tokens": 392533718.0, "sample_num_tokens": 8023.5, "step": 1451, "total_num_tokens": 392565812.0, "z_loss": 0.001306611578911543 }, { "copy_logits_max": -1.2067046165466309, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.0, "epoch": 0.2965534848098034, "gen_logits_max": 6.865249156951904, "gen_logits_mean": -12.436705589294434, "gen_logits_min": -24.152816772460938, "gen_logits_std": 2.494751453399658, "gen_loss": 0.3352600932121277, "grad_norm": 0.5104697873884162, "learning_rate": 2.9095578947368424e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9925422519445419, "mean_gen_accuracy": 0.8555559515953064, "mean_token_accuracy": 0.8875317275524139, "num_tokens": 392805542.0, "sample_num_tokens": 8926.5, "step": 1452, "total_num_tokens": 392841248.0, "z_loss": 0.001126638613641262 }, { "copy_logits_max": -2.329780340194702, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.5625, "epoch": 0.29675772274700024, "gen_logits_max": 7.802823066711426, "gen_logits_mean": -11.267136573791504, "gen_logits_min": -23.117523193359375, "gen_logits_std": 2.4720494747161865, "gen_loss": 0.3236773610115051, "grad_norm": 0.5347637254814968, "learning_rate": 2.9094315789473685e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9904337078332901, "mean_gen_accuracy": 0.8559994399547577, "mean_token_accuracy": 0.8934763967990875, "num_tokens": 393083965.0, "sample_num_tokens": 9802.25, "step": 1453, "total_num_tokens": 393123174.0, "z_loss": 0.0010540641378611326 }, { "copy_logits_max": -1.279278039932251, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.625, "epoch": 0.2969619606841971, "gen_logits_max": 6.617250919342041, "gen_logits_mean": -12.080315589904785, "gen_logits_min": -24.326862335205078, "gen_logits_std": 2.511868476867676, "gen_loss": 0.2934982180595398, "grad_norm": 0.46391063229785306, "learning_rate": 2.909305263157895e-05, "loss": 0.3215, "mean_copy_accuracy": 0.991043895483017, "mean_gen_accuracy": 0.8622185438871384, "mean_token_accuracy": 0.8941790908575058, "num_tokens": 393364690.0, "sample_num_tokens": 8834.5, "step": 1454, "total_num_tokens": 393400028.0, "z_loss": 0.0010789677035063505 }, { "copy_logits_max": -0.4418286085128784, "copy_logits_min": -750000064.0, "copy_num_tokens": 514.9375, "epoch": 0.29716619862139393, "gen_logits_max": 7.567554473876953, "gen_logits_mean": -10.735581398010254, "gen_logits_min": -23.638996124267578, "gen_logits_std": 2.549532651901245, "gen_loss": 0.3841255009174347, "grad_norm": 1.9059397312172948, "learning_rate": 2.909178947368421e-05, "loss": 0.3389, "mean_copy_accuracy": 0.9937297999858856, "mean_gen_accuracy": 0.8511317074298859, "mean_token_accuracy": 0.8892549425363541, "num_tokens": 393651597.0, "sample_num_tokens": 8816.25, "step": 1455, "total_num_tokens": 393686862.0, "z_loss": 0.0014129066839814186 }, { "copy_logits_max": -1.7216427326202393, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.125, "epoch": 0.29737043655859074, "gen_logits_max": 7.530505180358887, "gen_logits_mean": -12.328669548034668, "gen_logits_min": -24.41623306274414, "gen_logits_std": 2.4911937713623047, "gen_loss": 0.33484411239624023, "grad_norm": 0.5404991971328063, "learning_rate": 2.9090526315789475e-05, "loss": 0.3526, "mean_copy_accuracy": 0.9927389621734619, "mean_gen_accuracy": 0.8513798117637634, "mean_token_accuracy": 0.8852712363004684, "num_tokens": 393939646.0, "sample_num_tokens": 8515.0, "step": 1456, "total_num_tokens": 393973706.0, "z_loss": 0.0012106320355087519 }, { "copy_logits_max": -0.2311854064464569, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.5625, "epoch": 0.2975746744957876, "gen_logits_max": 8.148470878601074, "gen_logits_mean": -9.999109268188477, "gen_logits_min": -22.150611877441406, "gen_logits_std": 2.5370709896087646, "gen_loss": 0.32329171895980835, "grad_norm": 0.476818819838726, "learning_rate": 2.9089263157894736e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9916402250528336, "mean_gen_accuracy": 0.8653899431228638, "mean_token_accuracy": 0.8948546499013901, "num_tokens": 394235334.0, "sample_num_tokens": 9615.5, "step": 1457, "total_num_tokens": 394273796.0, "z_loss": 0.0012428363552317023 }, { "copy_logits_max": -2.32967472076416, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.4375, "epoch": 0.29777891243298443, "gen_logits_max": 7.4751877784729, "gen_logits_mean": -12.168524742126465, "gen_logits_min": -24.34520721435547, "gen_logits_std": 2.506340980529785, "gen_loss": 0.33049720525741577, "grad_norm": 0.5887497522438581, "learning_rate": 2.9088e-05, "loss": 0.332, "mean_copy_accuracy": 0.9897902607917786, "mean_gen_accuracy": 0.8599455803632736, "mean_token_accuracy": 0.892316535115242, "num_tokens": 394503845.0, "sample_num_tokens": 7688.75, "step": 1458, "total_num_tokens": 394534600.0, "z_loss": 0.0011027096770703793 }, { "copy_logits_max": -0.4621339738368988, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.8125, "epoch": 0.29798315037018125, "gen_logits_max": 8.041898727416992, "gen_logits_mean": -10.087605476379395, "gen_logits_min": -22.373767852783203, "gen_logits_std": 2.5198638439178467, "gen_loss": 0.3408249616622925, "grad_norm": 0.4913162973666342, "learning_rate": 2.9086736842105264e-05, "loss": 0.3566, "mean_copy_accuracy": 0.9914873093366623, "mean_gen_accuracy": 0.8537945449352264, "mean_token_accuracy": 0.8823797702789307, "num_tokens": 394754199.0, "sample_num_tokens": 8767.75, "step": 1459, "total_num_tokens": 394789270.0, "z_loss": 0.001249097054824233 }, { "copy_logits_max": 0.4769532084465027, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.9375, "epoch": 0.2981873883073781, "gen_logits_max": 7.586820602416992, "gen_logits_mean": -10.29442024230957, "gen_logits_min": -21.97176742553711, "gen_logits_std": 2.48646879196167, "gen_loss": 0.3353543281555176, "grad_norm": 0.5062843142005475, "learning_rate": 2.908547368421053e-05, "loss": 0.3359, "mean_copy_accuracy": 0.9919697493314743, "mean_gen_accuracy": 0.8562658131122589, "mean_token_accuracy": 0.8891400396823883, "num_tokens": 395026782.0, "sample_num_tokens": 10144.5, "step": 1460, "total_num_tokens": 395067360.0, "z_loss": 0.0012800986878573895 }, { "copy_logits_max": -0.47336477041244507, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.0625, "epoch": 0.29839162624457494, "gen_logits_max": 7.221287727355957, "gen_logits_mean": -11.231245040893555, "gen_logits_min": -22.98625373840332, "gen_logits_std": 2.4133124351501465, "gen_loss": 0.31347042322158813, "grad_norm": 0.4394720546679011, "learning_rate": 2.908421052631579e-05, "loss": 0.3284, "mean_copy_accuracy": 0.992644414305687, "mean_gen_accuracy": 0.8589385151863098, "mean_token_accuracy": 0.8944251984357834, "num_tokens": 395324925.0, "sample_num_tokens": 8460.25, "step": 1461, "total_num_tokens": 395358766.0, "z_loss": 0.0012249356368556619 }, { "copy_logits_max": -0.30169758200645447, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.3125, "epoch": 0.29859586418177175, "gen_logits_max": 7.800551891326904, "gen_logits_mean": -10.342084884643555, "gen_logits_min": -22.284282684326172, "gen_logits_std": 2.451916456222534, "gen_loss": 0.351252943277359, "grad_norm": 0.5756315716651498, "learning_rate": 2.9082947368421054e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9915432184934616, "mean_gen_accuracy": 0.8620217442512512, "mean_token_accuracy": 0.8944234102964401, "num_tokens": 395594444.0, "sample_num_tokens": 8954.5, "step": 1462, "total_num_tokens": 395630262.0, "z_loss": 0.0012648075353354216 }, { "copy_logits_max": -0.8951088190078735, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.25, "epoch": 0.2988001021189686, "gen_logits_max": 6.776180267333984, "gen_logits_mean": -11.656006813049316, "gen_logits_min": -23.32725715637207, "gen_logits_std": 2.461329936981201, "gen_loss": 0.29784587025642395, "grad_norm": 0.4252618638806231, "learning_rate": 2.9081684210526318e-05, "loss": 0.3344, "mean_copy_accuracy": 0.9929293394088745, "mean_gen_accuracy": 0.8583292812108994, "mean_token_accuracy": 0.8913251310586929, "num_tokens": 395869367.0, "sample_num_tokens": 9192.75, "step": 1463, "total_num_tokens": 395906138.0, "z_loss": 0.0012116790749132633 }, { "copy_logits_max": -2.224606990814209, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.9375, "epoch": 0.29900434005616544, "gen_logits_max": 7.141211986541748, "gen_logits_mean": -12.383867263793945, "gen_logits_min": -23.720388412475586, "gen_logits_std": 2.4315600395202637, "gen_loss": 0.32438576221466064, "grad_norm": 0.5158144669888698, "learning_rate": 2.908042105263158e-05, "loss": 0.3606, "mean_copy_accuracy": 0.992155060172081, "mean_gen_accuracy": 0.8553355038166046, "mean_token_accuracy": 0.8819335550069809, "num_tokens": 396129021.0, "sample_num_tokens": 7968.75, "step": 1464, "total_num_tokens": 396160896.0, "z_loss": 0.0011005762498825788 }, { "copy_logits_max": -1.5996493101119995, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.125, "epoch": 0.29920857799336226, "gen_logits_max": 7.083107948303223, "gen_logits_mean": -11.711736679077148, "gen_logits_min": -23.232664108276367, "gen_logits_std": 2.4102110862731934, "gen_loss": 0.32792285084724426, "grad_norm": 0.5252047964982867, "learning_rate": 2.9079157894736843e-05, "loss": 0.31, "mean_copy_accuracy": 0.9926002025604248, "mean_gen_accuracy": 0.8685804903507233, "mean_token_accuracy": 0.8972857147455215, "num_tokens": 396375001.0, "sample_num_tokens": 7605.75, "step": 1465, "total_num_tokens": 396405424.0, "z_loss": 0.001103412127122283 }, { "copy_logits_max": -2.1120831966400146, "copy_logits_min": -750000000.0, "copy_num_tokens": 274.9375, "epoch": 0.29941281593055913, "gen_logits_max": 7.7144365310668945, "gen_logits_mean": -11.351058959960938, "gen_logits_min": -22.683414459228516, "gen_logits_std": 2.4314515590667725, "gen_loss": 0.4117485284805298, "grad_norm": 0.6027812422052095, "learning_rate": 2.9077894736842104e-05, "loss": 0.3899, "mean_copy_accuracy": 0.9887992888689041, "mean_gen_accuracy": 0.8442800790071487, "mean_token_accuracy": 0.8742297142744064, "num_tokens": 396629835.0, "sample_num_tokens": 7148.25, "step": 1466, "total_num_tokens": 396658428.0, "z_loss": 0.0011435628402978182 }, { "copy_logits_max": -0.9516098499298096, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.4375, "epoch": 0.29961705386775594, "gen_logits_max": 6.375495433807373, "gen_logits_mean": -12.37984848022461, "gen_logits_min": -24.069740295410156, "gen_logits_std": 2.3927855491638184, "gen_loss": 0.321670800447464, "grad_norm": 2.235930026751437, "learning_rate": 2.907663157894737e-05, "loss": 0.3417, "mean_copy_accuracy": 0.9920683056116104, "mean_gen_accuracy": 0.8563530147075653, "mean_token_accuracy": 0.8875794857740402, "num_tokens": 396894935.0, "sample_num_tokens": 8368.75, "step": 1467, "total_num_tokens": 396928410.0, "z_loss": 0.0010556589113548398 }, { "copy_logits_max": -0.508442223072052, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.5, "epoch": 0.29982129180495276, "gen_logits_max": 6.938192367553711, "gen_logits_mean": -11.794692039489746, "gen_logits_min": -23.46514892578125, "gen_logits_std": 2.4426674842834473, "gen_loss": 0.33408141136169434, "grad_norm": 0.5313197681932771, "learning_rate": 2.9075368421052633e-05, "loss": 0.3259, "mean_copy_accuracy": 0.9919145107269287, "mean_gen_accuracy": 0.8601197600364685, "mean_token_accuracy": 0.8943309187889099, "num_tokens": 397179886.0, "sample_num_tokens": 7266.0, "step": 1468, "total_num_tokens": 397208950.0, "z_loss": 0.001223083003424108 }, { "copy_logits_max": -2.176487445831299, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.375, "epoch": 0.3000255297421496, "gen_logits_max": 7.374176502227783, "gen_logits_mean": -10.171740531921387, "gen_logits_min": -21.528324127197266, "gen_logits_std": 2.4158577919006348, "gen_loss": 0.33900827169418335, "grad_norm": 0.582430749517677, "learning_rate": 2.9074105263157897e-05, "loss": 0.3519, "mean_copy_accuracy": 0.9921348989009857, "mean_gen_accuracy": 0.8492019474506378, "mean_token_accuracy": 0.8860085606575012, "num_tokens": 397436370.0, "sample_num_tokens": 7265.0, "step": 1469, "total_num_tokens": 397465430.0, "z_loss": 0.0011340490309521556 }, { "copy_logits_max": -3.614217519760132, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.6875, "epoch": 0.30022976767934645, "gen_logits_max": 7.445095539093018, "gen_logits_mean": -12.050273895263672, "gen_logits_min": -23.34223175048828, "gen_logits_std": 2.371541738510132, "gen_loss": 0.36196091771125793, "grad_norm": 0.606805577144709, "learning_rate": 2.9072842105263158e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9890643805265427, "mean_gen_accuracy": 0.8572690635919571, "mean_token_accuracy": 0.887668788433075, "num_tokens": 397696001.0, "sample_num_tokens": 6821.75, "step": 1470, "total_num_tokens": 397723288.0, "z_loss": 0.0010753462556749582 }, { "copy_logits_max": -2.4559085369110107, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.0625, "epoch": 0.30043400561654326, "gen_logits_max": 6.677408695220947, "gen_logits_mean": -11.684154510498047, "gen_logits_min": -23.338544845581055, "gen_logits_std": 2.407721757888794, "gen_loss": 0.31415820121765137, "grad_norm": 0.6159235670031347, "learning_rate": 2.9071578947368422e-05, "loss": 0.3522, "mean_copy_accuracy": 0.9894779026508331, "mean_gen_accuracy": 0.8525509387254715, "mean_token_accuracy": 0.8854663074016571, "num_tokens": 397954681.0, "sample_num_tokens": 7945.75, "step": 1471, "total_num_tokens": 397986464.0, "z_loss": 0.0010323937749490142 }, { "copy_logits_max": -0.2489355504512787, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.1875, "epoch": 0.3006382435537401, "gen_logits_max": 7.432529926300049, "gen_logits_mean": -10.972146987915039, "gen_logits_min": -22.762165069580078, "gen_logits_std": 2.4335126876831055, "gen_loss": 0.34440386295318604, "grad_norm": 1.1809625398318526, "learning_rate": 2.9070315789473683e-05, "loss": 0.3404, "mean_copy_accuracy": 0.9913780987262726, "mean_gen_accuracy": 0.8534639179706573, "mean_token_accuracy": 0.8908833116292953, "num_tokens": 398221343.0, "sample_num_tokens": 8236.75, "step": 1472, "total_num_tokens": 398254290.0, "z_loss": 0.0012374694924801588 }, { "copy_logits_max": -3.9766292572021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.4375, "epoch": 0.30084248149093695, "gen_logits_max": 8.020378112792969, "gen_logits_mean": -10.380195617675781, "gen_logits_min": -21.341550827026367, "gen_logits_std": 2.331077814102173, "gen_loss": 0.3319532871246338, "grad_norm": 0.4918990296560125, "learning_rate": 2.9069052631578948e-05, "loss": 0.3541, "mean_copy_accuracy": 0.9920443594455719, "mean_gen_accuracy": 0.8550013899803162, "mean_token_accuracy": 0.8861856758594513, "num_tokens": 398501922.0, "sample_num_tokens": 8091.0, "step": 1473, "total_num_tokens": 398534286.0, "z_loss": 0.0012437503319233656 }, { "copy_logits_max": -1.7703299522399902, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.125, "epoch": 0.30104671942813377, "gen_logits_max": 7.6917405128479, "gen_logits_mean": -10.341299057006836, "gen_logits_min": -22.34014320373535, "gen_logits_std": 2.482431173324585, "gen_loss": 0.39135974645614624, "grad_norm": 0.6629141566380328, "learning_rate": 2.906778947368421e-05, "loss": 0.3447, "mean_copy_accuracy": 0.9890689849853516, "mean_gen_accuracy": 0.8529156446456909, "mean_token_accuracy": 0.8868447840213776, "num_tokens": 398773122.0, "sample_num_tokens": 9339.0, "step": 1474, "total_num_tokens": 398810478.0, "z_loss": 0.0012628805125132203 }, { "copy_logits_max": -1.76984703540802, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.9375, "epoch": 0.3012509573653306, "gen_logits_max": 7.468276023864746, "gen_logits_mean": -10.879560470581055, "gen_logits_min": -23.443889617919922, "gen_logits_std": 2.466110944747925, "gen_loss": 0.32798197865486145, "grad_norm": 0.4762912271242153, "learning_rate": 2.9066526315789473e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9918878376483917, "mean_gen_accuracy": 0.8634667545557022, "mean_token_accuracy": 0.8942533284425735, "num_tokens": 399049419.0, "sample_num_tokens": 8626.75, "step": 1475, "total_num_tokens": 399083926.0, "z_loss": 0.0013783962931483984 }, { "copy_logits_max": 0.6168093681335449, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.0625, "epoch": 0.30145519530252746, "gen_logits_max": 7.374814033508301, "gen_logits_mean": -10.446516036987305, "gen_logits_min": -22.016027450561523, "gen_logits_std": 2.389216661453247, "gen_loss": 0.3145832419395447, "grad_norm": 0.48901769949744883, "learning_rate": 2.906526315789474e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9936432987451553, "mean_gen_accuracy": 0.8654833436012268, "mean_token_accuracy": 0.9003161638975143, "num_tokens": 399332069.0, "sample_num_tokens": 7809.75, "step": 1476, "total_num_tokens": 399363308.0, "z_loss": 0.0015388978645205498 }, { "copy_logits_max": 0.03796875476837158, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.75, "epoch": 0.3016594332397243, "gen_logits_max": 6.851681709289551, "gen_logits_mean": -11.666034698486328, "gen_logits_min": -23.55539321899414, "gen_logits_std": 2.4020042419433594, "gen_loss": 0.3671356439590454, "grad_norm": 0.488541825269757, "learning_rate": 2.9064e-05, "loss": 0.3451, "mean_copy_accuracy": 0.9919472634792328, "mean_gen_accuracy": 0.8522892743349075, "mean_token_accuracy": 0.8880624026060104, "num_tokens": 399611518.0, "sample_num_tokens": 8389.0, "step": 1477, "total_num_tokens": 399645074.0, "z_loss": 0.0015676121693104506 }, { "copy_logits_max": 1.2216087579727173, "copy_logits_min": -687500032.0, "copy_num_tokens": 640.0, "epoch": 0.3018636711769211, "gen_logits_max": 6.569868087768555, "gen_logits_mean": -11.38275146484375, "gen_logits_min": -23.516481399536133, "gen_logits_std": 2.471442461013794, "gen_loss": 0.3374074399471283, "grad_norm": 0.5196253282265154, "learning_rate": 2.9062736842105266e-05, "loss": 0.3421, "mean_copy_accuracy": 0.9919801503419876, "mean_gen_accuracy": 0.8534743934869766, "mean_token_accuracy": 0.8883474171161652, "num_tokens": 399892398.0, "sample_num_tokens": 9500.0, "step": 1478, "total_num_tokens": 399930398.0, "z_loss": 0.001694665988907218 }, { "copy_logits_max": 0.48646819591522217, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.25, "epoch": 0.30206790911411796, "gen_logits_max": 7.053750991821289, "gen_logits_mean": -10.831753730773926, "gen_logits_min": -22.357574462890625, "gen_logits_std": 2.4221158027648926, "gen_loss": 0.3149678409099579, "grad_norm": 0.5611243087354812, "learning_rate": 2.9061473684210527e-05, "loss": 0.3431, "mean_copy_accuracy": 0.9913601577281952, "mean_gen_accuracy": 0.8517867922782898, "mean_token_accuracy": 0.8869338780641556, "num_tokens": 400158006.0, "sample_num_tokens": 9403.0, "step": 1479, "total_num_tokens": 400195618.0, "z_loss": 0.0014365589013323188 }, { "copy_logits_max": -1.6741697788238525, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.625, "epoch": 0.3022721470513148, "gen_logits_max": 6.926294803619385, "gen_logits_mean": -10.287294387817383, "gen_logits_min": -21.034889221191406, "gen_logits_std": 2.3053412437438965, "gen_loss": 0.33961087465286255, "grad_norm": 0.5391209915554007, "learning_rate": 2.906021052631579e-05, "loss": 0.3529, "mean_copy_accuracy": 0.9895059168338776, "mean_gen_accuracy": 0.8524553626775742, "mean_token_accuracy": 0.8840009570121765, "num_tokens": 400420411.0, "sample_num_tokens": 7496.25, "step": 1480, "total_num_tokens": 400450396.0, "z_loss": 0.001231688424013555 }, { "copy_logits_max": -3.4444334506988525, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.8125, "epoch": 0.3024763849885116, "gen_logits_max": 6.639967918395996, "gen_logits_mean": -12.204607963562012, "gen_logits_min": -22.963634490966797, "gen_logits_std": 2.2859764099121094, "gen_loss": 0.3241689205169678, "grad_norm": 0.516477527760776, "learning_rate": 2.9058947368421052e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9922089725732803, "mean_gen_accuracy": 0.8650211542844772, "mean_token_accuracy": 0.8969685137271881, "num_tokens": 400684406.0, "sample_num_tokens": 7477.5, "step": 1481, "total_num_tokens": 400714316.0, "z_loss": 0.0010459143668413162 }, { "copy_logits_max": -0.7768265008926392, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.3125, "epoch": 0.30268062292570846, "gen_logits_max": 7.153432369232178, "gen_logits_mean": -11.90407943725586, "gen_logits_min": -23.401437759399414, "gen_logits_std": 2.4123144149780273, "gen_loss": 0.31452131271362305, "grad_norm": 0.5346055472187907, "learning_rate": 2.9057684210526316e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9925040900707245, "mean_gen_accuracy": 0.8572369962930679, "mean_token_accuracy": 0.8920891135931015, "num_tokens": 400950101.0, "sample_num_tokens": 9560.75, "step": 1482, "total_num_tokens": 400988344.0, "z_loss": 0.0012160675833001733 }, { "copy_logits_max": 1.634533405303955, "copy_logits_min": -750000000.0, "copy_num_tokens": 686.0, "epoch": 0.3028848608629053, "gen_logits_max": 6.366794109344482, "gen_logits_mean": -11.854557991027832, "gen_logits_min": -23.53205680847168, "gen_logits_std": 2.425102710723877, "gen_loss": 0.2923738956451416, "grad_norm": 0.48883337323906795, "learning_rate": 2.9056421052631577e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9936084151268005, "mean_gen_accuracy": 0.8473767042160034, "mean_token_accuracy": 0.8883141130208969, "num_tokens": 401250955.0, "sample_num_tokens": 9694.25, "step": 1483, "total_num_tokens": 401289732.0, "z_loss": 0.0011599769350141287 }, { "copy_logits_max": -2.148192882537842, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.3125, "epoch": 0.3030890988001021, "gen_logits_max": 7.198911666870117, "gen_logits_mean": -11.864638328552246, "gen_logits_min": -23.336246490478516, "gen_logits_std": 2.4182488918304443, "gen_loss": 0.33870694041252136, "grad_norm": 0.5849632475287684, "learning_rate": 2.9055157894736845e-05, "loss": 0.3708, "mean_copy_accuracy": 0.9914061576128006, "mean_gen_accuracy": 0.8472176492214203, "mean_token_accuracy": 0.8819815218448639, "num_tokens": 401518371.0, "sample_num_tokens": 7894.75, "step": 1484, "total_num_tokens": 401549950.0, "z_loss": 0.0011338668409734964 }, { "copy_logits_max": -1.4080531597137451, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.5, "epoch": 0.30329333673729897, "gen_logits_max": 7.4698286056518555, "gen_logits_mean": -10.058503150939941, "gen_logits_min": -21.195232391357422, "gen_logits_std": 2.3203248977661133, "gen_loss": 0.3401229679584503, "grad_norm": 0.4981405175832859, "learning_rate": 2.9053894736842106e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9922041893005371, "mean_gen_accuracy": 0.8593368977308273, "mean_token_accuracy": 0.8923074156045914, "num_tokens": 401798412.0, "sample_num_tokens": 9778.5, "step": 1485, "total_num_tokens": 401837526.0, "z_loss": 0.001045483397319913 }, { "copy_logits_max": -2.0220746994018555, "copy_logits_min": -625000064.0, "copy_num_tokens": 388.8125, "epoch": 0.3034975746744958, "gen_logits_max": 7.203089714050293, "gen_logits_mean": -12.431547164916992, "gen_logits_min": -24.31868553161621, "gen_logits_std": 2.441631317138672, "gen_loss": 0.36891400814056396, "grad_norm": 0.5862351104161522, "learning_rate": 2.905263157894737e-05, "loss": 0.3493, "mean_copy_accuracy": 0.9908888936042786, "mean_gen_accuracy": 0.8518609553575516, "mean_token_accuracy": 0.8857134580612183, "num_tokens": 402049709.0, "sample_num_tokens": 7621.75, "step": 1486, "total_num_tokens": 402080196.0, "z_loss": 0.00109146349132061 }, { "copy_logits_max": -1.8052784204483032, "copy_logits_min": -625000064.0, "copy_num_tokens": 564.25, "epoch": 0.3037018126116926, "gen_logits_max": 7.606700897216797, "gen_logits_mean": -11.101641654968262, "gen_logits_min": -22.68385124206543, "gen_logits_std": 2.3792896270751953, "gen_loss": 0.3239842653274536, "grad_norm": 0.4636099486526823, "learning_rate": 2.905136842105263e-05, "loss": 0.3187, "mean_copy_accuracy": 0.9917598515748978, "mean_gen_accuracy": 0.8660306036472321, "mean_token_accuracy": 0.8984931707382202, "num_tokens": 402324861.0, "sample_num_tokens": 8985.25, "step": 1487, "total_num_tokens": 402360802.0, "z_loss": 0.001180598046630621 }, { "copy_logits_max": -0.40950846672058105, "copy_logits_min": -562500032.0, "copy_num_tokens": 656.8125, "epoch": 0.3039060505488895, "gen_logits_max": 6.868182182312012, "gen_logits_mean": -10.484347343444824, "gen_logits_min": -21.952104568481445, "gen_logits_std": 2.3957483768463135, "gen_loss": 0.31317615509033203, "grad_norm": 0.5347799792626645, "learning_rate": 2.9050105263157895e-05, "loss": 0.3306, "mean_copy_accuracy": 0.991001307964325, "mean_gen_accuracy": 0.8600171953439713, "mean_token_accuracy": 0.8917786180973053, "num_tokens": 402612692.0, "sample_num_tokens": 9791.0, "step": 1488, "total_num_tokens": 402651856.0, "z_loss": 0.0012041525915265083 }, { "copy_logits_max": -0.7704880833625793, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.75, "epoch": 0.3041102884860863, "gen_logits_max": 6.946169853210449, "gen_logits_mean": -11.533258438110352, "gen_logits_min": -23.158578872680664, "gen_logits_std": 2.401233434677124, "gen_loss": 0.3526003658771515, "grad_norm": 0.5327041947065514, "learning_rate": 2.9048842105263156e-05, "loss": 0.3787, "mean_copy_accuracy": 0.9908313602209091, "mean_gen_accuracy": 0.8488444685935974, "mean_token_accuracy": 0.8778854459524155, "num_tokens": 402862006.0, "sample_num_tokens": 7651.0, "step": 1489, "total_num_tokens": 402892610.0, "z_loss": 0.00121751194819808 }, { "copy_logits_max": -0.455262154340744, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.75, "epoch": 0.3043145264232831, "gen_logits_max": 7.328098297119141, "gen_logits_mean": -10.897407531738281, "gen_logits_min": -22.189258575439453, "gen_logits_std": 2.378912925720215, "gen_loss": 0.35374581813812256, "grad_norm": 0.4667079800137762, "learning_rate": 2.904757894736842e-05, "loss": 0.3486, "mean_copy_accuracy": 0.9924899488687515, "mean_gen_accuracy": 0.853977307677269, "mean_token_accuracy": 0.8878212720155716, "num_tokens": 403127832.0, "sample_num_tokens": 9303.5, "step": 1490, "total_num_tokens": 403165046.0, "z_loss": 0.0012033542152494192 }, { "copy_logits_max": -1.0400125980377197, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.0625, "epoch": 0.30451876436048, "gen_logits_max": 8.164908409118652, "gen_logits_mean": -10.33798885345459, "gen_logits_min": -21.753704071044922, "gen_logits_std": 2.4175314903259277, "gen_loss": 0.3315068483352661, "grad_norm": 0.5174101669537091, "learning_rate": 2.9046315789473685e-05, "loss": 0.3359, "mean_copy_accuracy": 0.9929509311914444, "mean_gen_accuracy": 0.8604281544685364, "mean_token_accuracy": 0.8893091976642609, "num_tokens": 403372157.0, "sample_num_tokens": 8459.25, "step": 1491, "total_num_tokens": 403405994.0, "z_loss": 0.001217942568473518 }, { "copy_logits_max": 1.0286641120910645, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.125, "epoch": 0.3047230022976768, "gen_logits_max": 7.7636003494262695, "gen_logits_mean": -10.942593574523926, "gen_logits_min": -23.057907104492188, "gen_logits_std": 2.5056068897247314, "gen_loss": 0.3624398708343506, "grad_norm": 0.5183955590843294, "learning_rate": 2.904505263157895e-05, "loss": 0.3517, "mean_copy_accuracy": 0.9921685755252838, "mean_gen_accuracy": 0.8497547805309296, "mean_token_accuracy": 0.8857974708080292, "num_tokens": 403643398.0, "sample_num_tokens": 8432.0, "step": 1492, "total_num_tokens": 403677126.0, "z_loss": 0.0012967794900760055 }, { "copy_logits_max": -1.4304049015045166, "copy_logits_min": -687500032.0, "copy_num_tokens": 567.4375, "epoch": 0.3049272402348736, "gen_logits_max": 7.61968994140625, "gen_logits_mean": -11.752262115478516, "gen_logits_min": -23.473587036132812, "gen_logits_std": 2.434262275695801, "gen_loss": 0.3273109197616577, "grad_norm": 0.4948013483275801, "learning_rate": 2.9043789473684214e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9912269562482834, "mean_gen_accuracy": 0.8560087531805038, "mean_token_accuracy": 0.8907408863306046, "num_tokens": 403919348.0, "sample_num_tokens": 9988.0, "step": 1493, "total_num_tokens": 403959300.0, "z_loss": 0.0011704051867127419 }, { "copy_logits_max": -2.8480870723724365, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.75, "epoch": 0.3051314781720705, "gen_logits_max": 8.23882007598877, "gen_logits_mean": -11.662830352783203, "gen_logits_min": -22.700485229492188, "gen_logits_std": 2.3554153442382812, "gen_loss": 0.3711096942424774, "grad_norm": 0.49984460966500527, "learning_rate": 2.9042526315789475e-05, "loss": 0.345, "mean_copy_accuracy": 0.9916660785675049, "mean_gen_accuracy": 0.8546237647533417, "mean_token_accuracy": 0.8871872276067734, "num_tokens": 404195875.0, "sample_num_tokens": 7883.25, "step": 1494, "total_num_tokens": 404227408.0, "z_loss": 0.0011033602058887482 }, { "copy_logits_max": -1.6870771646499634, "copy_logits_min": -750000064.0, "copy_num_tokens": 455.1875, "epoch": 0.3053357161092673, "gen_logits_max": 7.717619895935059, "gen_logits_mean": -11.829774856567383, "gen_logits_min": -23.379030227661133, "gen_logits_std": 2.434826374053955, "gen_loss": 0.3564453721046448, "grad_norm": 0.4616692755115178, "learning_rate": 2.904126315789474e-05, "loss": 0.3484, "mean_copy_accuracy": 0.9929995834827423, "mean_gen_accuracy": 0.8586249202489853, "mean_token_accuracy": 0.8865430653095245, "num_tokens": 404458166.0, "sample_num_tokens": 8768.0, "step": 1495, "total_num_tokens": 404493238.0, "z_loss": 0.001201232778839767 }, { "copy_logits_max": -0.19136595726013184, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.3125, "epoch": 0.3055399540464641, "gen_logits_max": 8.23438835144043, "gen_logits_mean": -11.14737319946289, "gen_logits_min": -23.384357452392578, "gen_logits_std": 2.5075583457946777, "gen_loss": 0.34447991847991943, "grad_norm": 0.48737037661180016, "learning_rate": 2.904e-05, "loss": 0.3196, "mean_copy_accuracy": 0.9917029291391373, "mean_gen_accuracy": 0.8627075850963593, "mean_token_accuracy": 0.8963239043951035, "num_tokens": 404759875.0, "sample_num_tokens": 8561.75, "step": 1496, "total_num_tokens": 404794122.0, "z_loss": 0.0013693968066945672 }, { "copy_logits_max": -0.8872292637825012, "copy_logits_min": -687500032.0, "copy_num_tokens": 535.5, "epoch": 0.305744191983661, "gen_logits_max": 8.20206069946289, "gen_logits_mean": -11.212905883789062, "gen_logits_min": -23.22347640991211, "gen_logits_std": 2.486530303955078, "gen_loss": 0.3111218810081482, "grad_norm": 0.51350720378195, "learning_rate": 2.9038736842105264e-05, "loss": 0.3303, "mean_copy_accuracy": 0.9914744794368744, "mean_gen_accuracy": 0.8600997626781464, "mean_token_accuracy": 0.8936078548431396, "num_tokens": 405020924.0, "sample_num_tokens": 9076.5, "step": 1497, "total_num_tokens": 405057230.0, "z_loss": 0.0012621877249330282 }, { "copy_logits_max": -2.593517780303955, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.75, "epoch": 0.3059484299208578, "gen_logits_max": 8.119832992553711, "gen_logits_mean": -10.831279754638672, "gen_logits_min": -22.66590690612793, "gen_logits_std": 2.4728500843048096, "gen_loss": 0.31174224615097046, "grad_norm": 0.47225426265094833, "learning_rate": 2.9037473684210525e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9938455075025558, "mean_gen_accuracy": 0.8572586327791214, "mean_token_accuracy": 0.8946230113506317, "num_tokens": 405300610.0, "sample_num_tokens": 8537.5, "step": 1498, "total_num_tokens": 405334760.0, "z_loss": 0.0011074105277657509 }, { "copy_logits_max": -1.0733283758163452, "copy_logits_min": -687500032.0, "copy_num_tokens": 595.125, "epoch": 0.3061526678580546, "gen_logits_max": 8.05622673034668, "gen_logits_mean": -10.63608455657959, "gen_logits_min": -22.66067886352539, "gen_logits_std": 2.4560506343841553, "gen_loss": 0.3459123373031616, "grad_norm": 0.5765819712120971, "learning_rate": 2.903621052631579e-05, "loss": 0.3388, "mean_copy_accuracy": 0.9930212646722794, "mean_gen_accuracy": 0.8514773994684219, "mean_token_accuracy": 0.8913548439741135, "num_tokens": 405578522.0, "sample_num_tokens": 9191.0, "step": 1499, "total_num_tokens": 405615286.0, "z_loss": 0.0011559760896489024 }, { "epoch": 0.3063569057952515, "grad_norm": 0.5806071768936331, "learning_rate": 2.9034947368421054e-05, "loss": 0.3413, "step": 1500 }, { "epoch": 0.3063569057952515, "eval_copy_logits_max": -4.076389789581299, "eval_copy_logits_min": -63.533546447753906, "eval_gen_logits_max": 7.162979602813721, "eval_gen_logits_mean": -15.162225723266602, "eval_gen_logits_min": -26.288597106933594, "eval_gen_logits_std": 2.402681350708008, "eval_gen_loss": 0.3812113404273987, "eval_loss": 0.3668663501739502, "eval_mean_copy_accuracy": 0.9878777861595154, "eval_mean_gen_accuracy": 0.8593238294124603, "eval_mean_token_accuracy": 0.8756709396839142, "eval_num_tokens": 405896428.0, "eval_runtime": 0.7634, "eval_samples_per_second": 10.48, "eval_steps_per_second": 2.62, "eval_total_num_tokens": 405896428.0, "eval_z_loss": 0.0010907729156315327, "step": 1500 }, { "copy_logits_max": -1.5884603261947632, "copy_logits_min": -687500032.0, "copy_num_tokens": 446.3125, "epoch": 0.3065611437324483, "gen_logits_max": 7.964679718017578, "gen_logits_mean": -11.619146347045898, "gen_logits_min": -23.713462829589844, "gen_logits_std": 2.463597297668457, "gen_loss": 0.3649177551269531, "grad_norm": 0.5254384801874644, "learning_rate": 2.9033684210526318e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9921604916453362, "mean_gen_accuracy": 0.8566028848290443, "mean_token_accuracy": 0.8905528411269188, "num_tokens": 406143039.0, "sample_num_tokens": 8440.75, "step": 1501, "total_num_tokens": 406176802.0, "z_loss": 0.0012303143739700317 }, { "copy_logits_max": -1.9229114055633545, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.5625, "epoch": 0.3067653816696451, "gen_logits_max": 8.731681823730469, "gen_logits_mean": -10.211212158203125, "gen_logits_min": -21.982810974121094, "gen_logits_std": 2.4520115852355957, "gen_loss": 0.3055751919746399, "grad_norm": 0.5159653985504259, "learning_rate": 2.903242105263158e-05, "loss": 0.3298, "mean_copy_accuracy": 0.9922604560852051, "mean_gen_accuracy": 0.8558722734451294, "mean_token_accuracy": 0.8938905596733093, "num_tokens": 406414518.0, "sample_num_tokens": 7957.5, "step": 1502, "total_num_tokens": 406446348.0, "z_loss": 0.0012867228360846639 }, { "copy_logits_max": -1.2718446254730225, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.8125, "epoch": 0.306969619606842, "gen_logits_max": 8.065301895141602, "gen_logits_mean": -11.546673774719238, "gen_logits_min": -23.760656356811523, "gen_logits_std": 2.486778497695923, "gen_loss": 0.3307443857192993, "grad_norm": 0.7043785487629839, "learning_rate": 2.9031157894736843e-05, "loss": 0.3864, "mean_copy_accuracy": 0.9902873784303665, "mean_gen_accuracy": 0.8419418931007385, "mean_token_accuracy": 0.8775585293769836, "num_tokens": 406688997.0, "sample_num_tokens": 7659.25, "step": 1503, "total_num_tokens": 406719634.0, "z_loss": 0.0011586627224460244 }, { "copy_logits_max": -0.25123900175094604, "copy_logits_min": -750000000.0, "copy_num_tokens": 659.5625, "epoch": 0.3071738575440388, "gen_logits_max": 6.101280212402344, "gen_logits_mean": -11.687766075134277, "gen_logits_min": -23.41960906982422, "gen_logits_std": 2.425929069519043, "gen_loss": 0.3026354908943176, "grad_norm": 0.45669030028050983, "learning_rate": 2.9029894736842108e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9923244416713715, "mean_gen_accuracy": 0.860401064157486, "mean_token_accuracy": 0.8928365111351013, "num_tokens": 406953912.0, "sample_num_tokens": 9602.0, "step": 1504, "total_num_tokens": 406992320.0, "z_loss": 0.0011078992392867804 }, { "copy_logits_max": -0.989867627620697, "copy_logits_min": -750000000.0, "copy_num_tokens": 795.5, "epoch": 0.3073780954812356, "gen_logits_max": 7.076534271240234, "gen_logits_mean": -11.762189865112305, "gen_logits_min": -23.809804916381836, "gen_logits_std": 2.488150119781494, "gen_loss": 0.29309144616127014, "grad_norm": 0.5076838444450564, "learning_rate": 2.902863157894737e-05, "loss": 0.313, "mean_copy_accuracy": 0.9927307218313217, "mean_gen_accuracy": 0.8584993481636047, "mean_token_accuracy": 0.8973740935325623, "num_tokens": 407242101.0, "sample_num_tokens": 10111.25, "step": 1505, "total_num_tokens": 407282546.0, "z_loss": 0.0011369348503649235 }, { "copy_logits_max": -0.9550051689147949, "copy_logits_min": -625000000.0, "copy_num_tokens": 587.1875, "epoch": 0.3075823334184325, "gen_logits_max": 7.199338912963867, "gen_logits_mean": -11.145214080810547, "gen_logits_min": -22.898887634277344, "gen_logits_std": 2.4687955379486084, "gen_loss": 0.3458474278450012, "grad_norm": 0.4732732060533969, "learning_rate": 2.9027368421052633e-05, "loss": 0.3384, "mean_copy_accuracy": 0.9922423809766769, "mean_gen_accuracy": 0.8535721600055695, "mean_token_accuracy": 0.8888476192951202, "num_tokens": 407528592.0, "sample_num_tokens": 8736.5, "step": 1506, "total_num_tokens": 407563538.0, "z_loss": 0.0011920877732336521 }, { "copy_logits_max": -1.7786089181900024, "copy_logits_min": -687500032.0, "copy_num_tokens": 735.3125, "epoch": 0.3077865713556293, "gen_logits_max": 7.301288604736328, "gen_logits_mean": -11.41020393371582, "gen_logits_min": -23.243091583251953, "gen_logits_std": 2.4452219009399414, "gen_loss": 0.31422358751296997, "grad_norm": 0.5504767422638145, "learning_rate": 2.9026105263157894e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9904194474220276, "mean_gen_accuracy": 0.8672500550746918, "mean_token_accuracy": 0.8968517631292343, "num_tokens": 407799319.0, "sample_num_tokens": 10696.25, "step": 1507, "total_num_tokens": 407842104.0, "z_loss": 0.001180147286504507 }, { "copy_logits_max": -0.3809899687767029, "copy_logits_min": -687500032.0, "copy_num_tokens": 695.625, "epoch": 0.30799080929282613, "gen_logits_max": 6.647730827331543, "gen_logits_mean": -11.272109031677246, "gen_logits_min": -23.249786376953125, "gen_logits_std": 2.4734601974487305, "gen_loss": 0.3077320456504822, "grad_norm": 0.5160368954139971, "learning_rate": 2.9024842105263158e-05, "loss": 0.3447, "mean_copy_accuracy": 0.9916824847459793, "mean_gen_accuracy": 0.8495054244995117, "mean_token_accuracy": 0.8864818662405014, "num_tokens": 408082116.0, "sample_num_tokens": 10334.0, "step": 1508, "total_num_tokens": 408123452.0, "z_loss": 0.001161111518740654 }, { "copy_logits_max": -0.1353483498096466, "copy_logits_min": -687500032.0, "copy_num_tokens": 591.6875, "epoch": 0.308195047230023, "gen_logits_max": 6.4598164558410645, "gen_logits_mean": -11.345293045043945, "gen_logits_min": -23.373666763305664, "gen_logits_std": 2.4852328300476074, "gen_loss": 0.3268709182739258, "grad_norm": 0.4932315468985231, "learning_rate": 2.9023578947368422e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9919683039188385, "mean_gen_accuracy": 0.8563797175884247, "mean_token_accuracy": 0.8901435881853104, "num_tokens": 408348166.0, "sample_num_tokens": 8094.5, "step": 1509, "total_num_tokens": 408380544.0, "z_loss": 0.0011974782682955265 }, { "copy_logits_max": -2.79168701171875, "copy_logits_min": -687500032.0, "copy_num_tokens": 530.625, "epoch": 0.3083992851672198, "gen_logits_max": 7.166333198547363, "gen_logits_mean": -11.460205078125, "gen_logits_min": -22.534273147583008, "gen_logits_std": 2.3701281547546387, "gen_loss": 0.324973464012146, "grad_norm": 0.4364141071582717, "learning_rate": 2.9022315789473687e-05, "loss": 0.33, "mean_copy_accuracy": 0.9930402487516403, "mean_gen_accuracy": 0.8591407239437103, "mean_token_accuracy": 0.8937042951583862, "num_tokens": 408633819.0, "sample_num_tokens": 8762.75, "step": 1510, "total_num_tokens": 408668870.0, "z_loss": 0.0010620846878737211 }, { "copy_logits_max": -3.5967013835906982, "copy_logits_min": -750000000.0, "copy_num_tokens": 202.0, "epoch": 0.30860352310441663, "gen_logits_max": 7.474208831787109, "gen_logits_mean": -11.88478946685791, "gen_logits_min": -23.049964904785156, "gen_logits_std": 2.379016160964966, "gen_loss": 0.41415148973464966, "grad_norm": 0.5615324806263183, "learning_rate": 2.9021052631578948e-05, "loss": 0.3639, "mean_copy_accuracy": 0.9886988550424576, "mean_gen_accuracy": 0.8520862311124802, "mean_token_accuracy": 0.8810379952192307, "num_tokens": 408884101.0, "sample_num_tokens": 6771.25, "step": 1511, "total_num_tokens": 408911186.0, "z_loss": 0.0010700805578380823 }, { "copy_logits_max": -2.138319730758667, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.3125, "epoch": 0.3088077610416135, "gen_logits_max": 6.6289520263671875, "gen_logits_mean": -11.546146392822266, "gen_logits_min": -23.1201114654541, "gen_logits_std": 2.4066107273101807, "gen_loss": 0.3107210397720337, "grad_norm": 0.4692552798514937, "learning_rate": 2.9019789473684212e-05, "loss": 0.3232, "mean_copy_accuracy": 0.9926670044660568, "mean_gen_accuracy": 0.862568199634552, "mean_token_accuracy": 0.8976694345474243, "num_tokens": 409145963.0, "sample_num_tokens": 7332.25, "step": 1512, "total_num_tokens": 409175292.0, "z_loss": 0.0011494203936308622 }, { "copy_logits_max": -5.400927543640137, "copy_logits_min": -750000000.0, "copy_num_tokens": 203.0, "epoch": 0.3090119989788103, "gen_logits_max": 7.660521507263184, "gen_logits_mean": -13.353681564331055, "gen_logits_min": -24.187175750732422, "gen_logits_std": 2.3404688835144043, "gen_loss": 0.35347798466682434, "grad_norm": 0.468712540172094, "learning_rate": 2.9018526315789473e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9919628500938416, "mean_gen_accuracy": 0.8659442663192749, "mean_token_accuracy": 0.893826812505722, "num_tokens": 409388206.0, "sample_num_tokens": 6912.5, "step": 1513, "total_num_tokens": 409415856.0, "z_loss": 0.0010873469291254878 }, { "copy_logits_max": -2.428084373474121, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.625, "epoch": 0.30921623691600714, "gen_logits_max": 7.8273468017578125, "gen_logits_mean": -11.510327339172363, "gen_logits_min": -22.720111846923828, "gen_logits_std": 2.4256842136383057, "gen_loss": 0.39271020889282227, "grad_norm": 0.4994952270038237, "learning_rate": 2.9017263157894737e-05, "loss": 0.3422, "mean_copy_accuracy": 0.992096483707428, "mean_gen_accuracy": 0.8552907556295395, "mean_token_accuracy": 0.8887676149606705, "num_tokens": 409660254.0, "sample_num_tokens": 8223.0, "step": 1514, "total_num_tokens": 409693146.0, "z_loss": 0.001248059095814824 }, { "copy_logits_max": -1.9156031608581543, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.5625, "epoch": 0.309420474853204, "gen_logits_max": 7.803732395172119, "gen_logits_mean": -12.137589454650879, "gen_logits_min": -24.126426696777344, "gen_logits_std": 2.4579880237579346, "gen_loss": 0.32486042380332947, "grad_norm": 0.4920661550759028, "learning_rate": 2.9015999999999998e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9931240975856781, "mean_gen_accuracy": 0.8573254346847534, "mean_token_accuracy": 0.8941191285848618, "num_tokens": 409939597.0, "sample_num_tokens": 8670.25, "step": 1515, "total_num_tokens": 409974278.0, "z_loss": 0.0010988520225510001 }, { "copy_logits_max": -0.9899948835372925, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.5, "epoch": 0.3096247127904008, "gen_logits_max": 8.295055389404297, "gen_logits_mean": -10.718467712402344, "gen_logits_min": -22.82994842529297, "gen_logits_std": 2.4977059364318848, "gen_loss": 0.32528382539749146, "grad_norm": 0.47170571044447884, "learning_rate": 2.9014736842105262e-05, "loss": 0.337, "mean_copy_accuracy": 0.9929351955652237, "mean_gen_accuracy": 0.8525726795196533, "mean_token_accuracy": 0.8880138993263245, "num_tokens": 410213897.0, "sample_num_tokens": 8531.75, "step": 1516, "total_num_tokens": 410248024.0, "z_loss": 0.0012561697512865067 }, { "copy_logits_max": 0.10118915140628815, "copy_logits_min": -750000000.0, "copy_num_tokens": 692.25, "epoch": 0.30982895072759764, "gen_logits_max": 7.515646457672119, "gen_logits_mean": -11.318683624267578, "gen_logits_min": -23.853548049926758, "gen_logits_std": 2.523712396621704, "gen_loss": 0.27670958638191223, "grad_norm": 0.5082813685094677, "learning_rate": 2.901347368421053e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9916147589683533, "mean_gen_accuracy": 0.855663537979126, "mean_token_accuracy": 0.8982433527708054, "num_tokens": 410497721.0, "sample_num_tokens": 9566.75, "step": 1517, "total_num_tokens": 410535988.0, "z_loss": 0.0012096428545191884 }, { "copy_logits_max": -1.2522788047790527, "copy_logits_min": -625000064.0, "copy_num_tokens": 552.625, "epoch": 0.3100331886647945, "gen_logits_max": 7.317330360412598, "gen_logits_mean": -11.145230293273926, "gen_logits_min": -23.594270706176758, "gen_logits_std": 2.5586471557617188, "gen_loss": 0.33810681104660034, "grad_norm": 0.5061063644762614, "learning_rate": 2.901221052631579e-05, "loss": 0.3513, "mean_copy_accuracy": 0.9907980561256409, "mean_gen_accuracy": 0.8551700115203857, "mean_token_accuracy": 0.8851263672113419, "num_tokens": 410768994.0, "sample_num_tokens": 8682.0, "step": 1518, "total_num_tokens": 410803722.0, "z_loss": 0.001187309273518622 }, { "copy_logits_max": 0.5284843444824219, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.3125, "epoch": 0.31023742660199133, "gen_logits_max": 7.93762731552124, "gen_logits_mean": -10.491634368896484, "gen_logits_min": -22.611053466796875, "gen_logits_std": 2.5288639068603516, "gen_loss": 0.34101858735084534, "grad_norm": 0.550746164356308, "learning_rate": 2.9010947368421055e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9917209893465042, "mean_gen_accuracy": 0.8575197905302048, "mean_token_accuracy": 0.891491025686264, "num_tokens": 411038393.0, "sample_num_tokens": 8891.25, "step": 1519, "total_num_tokens": 411073958.0, "z_loss": 0.0012020327849313617 }, { "copy_logits_max": -1.7729969024658203, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.9375, "epoch": 0.31044166453918814, "gen_logits_max": 7.310055732727051, "gen_logits_mean": -12.456366539001465, "gen_logits_min": -24.091175079345703, "gen_logits_std": 2.4752368927001953, "gen_loss": 0.33449774980545044, "grad_norm": 0.48410252254676367, "learning_rate": 2.9009684210526316e-05, "loss": 0.3416, "mean_copy_accuracy": 0.9914164990186691, "mean_gen_accuracy": 0.8581209927797318, "mean_token_accuracy": 0.8884227126836777, "num_tokens": 411302745.0, "sample_num_tokens": 8164.75, "step": 1520, "total_num_tokens": 411335404.0, "z_loss": 0.0010314414976164699 }, { "copy_logits_max": -0.6762399673461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.1875, "epoch": 0.31064590247638496, "gen_logits_max": 7.71356201171875, "gen_logits_mean": -10.69925594329834, "gen_logits_min": -22.867839813232422, "gen_logits_std": 2.5581777095794678, "gen_loss": 0.32966893911361694, "grad_norm": 0.5539067448908647, "learning_rate": 2.900842105263158e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9909805655479431, "mean_gen_accuracy": 0.8613512217998505, "mean_token_accuracy": 0.8972201645374298, "num_tokens": 411580898.0, "sample_num_tokens": 8916.5, "step": 1521, "total_num_tokens": 411616564.0, "z_loss": 0.0010965842520818114 }, { "copy_logits_max": -2.5555953979492188, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.625, "epoch": 0.31085014041358183, "gen_logits_max": 7.432534694671631, "gen_logits_mean": -11.427106857299805, "gen_logits_min": -23.219802856445312, "gen_logits_std": 2.4704127311706543, "gen_loss": 0.3401795029640198, "grad_norm": 0.44007579627632826, "learning_rate": 2.900715789473684e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9919398874044418, "mean_gen_accuracy": 0.8613721281290054, "mean_token_accuracy": 0.892421156167984, "num_tokens": 411853823.0, "sample_num_tokens": 9068.75, "step": 1522, "total_num_tokens": 411890098.0, "z_loss": 0.001122012035921216 }, { "copy_logits_max": -1.5850141048431396, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.8125, "epoch": 0.31105437835077865, "gen_logits_max": 8.975252151489258, "gen_logits_mean": -10.770489692687988, "gen_logits_min": -22.22157096862793, "gen_logits_std": 2.4402413368225098, "gen_loss": 0.37056148052215576, "grad_norm": 0.6016986698404044, "learning_rate": 2.9005894736842106e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9911809861660004, "mean_gen_accuracy": 0.8610400557518005, "mean_token_accuracy": 0.8923281878232956, "num_tokens": 412141059.0, "sample_num_tokens": 7884.75, "step": 1523, "total_num_tokens": 412172598.0, "z_loss": 0.0010948240524157882 }, { "copy_logits_max": 0.1285107135772705, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.3125, "epoch": 0.31125861628797546, "gen_logits_max": 7.2507710456848145, "gen_logits_mean": -11.105911254882812, "gen_logits_min": -22.99246597290039, "gen_logits_std": 2.5367538928985596, "gen_loss": 0.31144803762435913, "grad_norm": 0.5047629325027709, "learning_rate": 2.9004631578947367e-05, "loss": 0.3275, "mean_copy_accuracy": 0.990941047668457, "mean_gen_accuracy": 0.8641447424888611, "mean_token_accuracy": 0.8934390693902969, "num_tokens": 412403397.0, "sample_num_tokens": 9115.75, "step": 1524, "total_num_tokens": 412439860.0, "z_loss": 0.0012357537634670734 }, { "copy_logits_max": -1.6882781982421875, "copy_logits_min": -687500032.0, "copy_num_tokens": 628.6875, "epoch": 0.31146285422517234, "gen_logits_max": 7.221409320831299, "gen_logits_mean": -10.993497848510742, "gen_logits_min": -23.064605712890625, "gen_logits_std": 2.4704675674438477, "gen_loss": 0.32167956233024597, "grad_norm": 0.542505418512238, "learning_rate": 2.9003368421052634e-05, "loss": 0.3638, "mean_copy_accuracy": 0.9928104430437088, "mean_gen_accuracy": 0.8482116311788559, "mean_token_accuracy": 0.8816761076450348, "num_tokens": 412658701.0, "sample_num_tokens": 9431.25, "step": 1525, "total_num_tokens": 412696426.0, "z_loss": 0.001174121629446745 }, { "copy_logits_max": 2.0425238609313965, "copy_logits_min": -750000064.0, "copy_num_tokens": 525.3125, "epoch": 0.31166709216236915, "gen_logits_max": 8.53921127319336, "gen_logits_mean": -10.392999649047852, "gen_logits_min": -22.68456268310547, "gen_logits_std": 2.5487518310546875, "gen_loss": 0.3600732088088989, "grad_norm": 0.5210393593646027, "learning_rate": 2.9002105263157895e-05, "loss": 0.3492, "mean_copy_accuracy": 0.991378977894783, "mean_gen_accuracy": 0.8491461277008057, "mean_token_accuracy": 0.8871863335371017, "num_tokens": 412941445.0, "sample_num_tokens": 8582.75, "step": 1526, "total_num_tokens": 412975776.0, "z_loss": 0.0012643655063584447 }, { "copy_logits_max": -1.5150375366210938, "copy_logits_min": -687500032.0, "copy_num_tokens": 612.3125, "epoch": 0.31187133009956597, "gen_logits_max": 6.934441566467285, "gen_logits_mean": -11.326391220092773, "gen_logits_min": -23.407745361328125, "gen_logits_std": 2.4632349014282227, "gen_loss": 0.32906633615493774, "grad_norm": 0.45235215270677287, "learning_rate": 2.900084210526316e-05, "loss": 0.3375, "mean_copy_accuracy": 0.9923841208219528, "mean_gen_accuracy": 0.8563457578420639, "mean_token_accuracy": 0.8906634449958801, "num_tokens": 413207222.0, "sample_num_tokens": 9298.0, "step": 1527, "total_num_tokens": 413244414.0, "z_loss": 0.001076816115528345 }, { "copy_logits_max": -1.3577373027801514, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.125, "epoch": 0.31207556803676284, "gen_logits_max": 7.291866779327393, "gen_logits_mean": -11.828516960144043, "gen_logits_min": -23.553298950195312, "gen_logits_std": 2.5165152549743652, "gen_loss": 0.3085673451423645, "grad_norm": 0.5100065556907352, "learning_rate": 2.899957894736842e-05, "loss": 0.3402, "mean_copy_accuracy": 0.9896280914545059, "mean_gen_accuracy": 0.8579529076814651, "mean_token_accuracy": 0.8899147808551788, "num_tokens": 413485958.0, "sample_num_tokens": 8648.5, "step": 1528, "total_num_tokens": 413520552.0, "z_loss": 0.0010499585187062621 }, { "copy_logits_max": -1.3274931907653809, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.875, "epoch": 0.31227980597395966, "gen_logits_max": 7.95849609375, "gen_logits_mean": -11.351972579956055, "gen_logits_min": -23.221359252929688, "gen_logits_std": 2.4842982292175293, "gen_loss": 0.31256869435310364, "grad_norm": 0.4989175345635912, "learning_rate": 2.8998315789473685e-05, "loss": 0.338, "mean_copy_accuracy": 0.9915243089199066, "mean_gen_accuracy": 0.8579068332910538, "mean_token_accuracy": 0.889245480298996, "num_tokens": 413766660.0, "sample_num_tokens": 8799.0, "step": 1529, "total_num_tokens": 413801856.0, "z_loss": 0.001248276443220675 }, { "copy_logits_max": 1.8272266387939453, "copy_logits_min": -687500032.0, "copy_num_tokens": 689.625, "epoch": 0.3124840439111565, "gen_logits_max": 7.572720527648926, "gen_logits_mean": -10.313882827758789, "gen_logits_min": -22.616769790649414, "gen_logits_std": 2.517902135848999, "gen_loss": 0.34534573554992676, "grad_norm": 0.6003912927484767, "learning_rate": 2.899705263157895e-05, "loss": 0.3523, "mean_copy_accuracy": 0.9925680905580521, "mean_gen_accuracy": 0.8497153669595718, "mean_token_accuracy": 0.8856168240308762, "num_tokens": 414034660.0, "sample_num_tokens": 9302.0, "step": 1530, "total_num_tokens": 414071868.0, "z_loss": 0.0013872252311557531 }, { "copy_logits_max": -1.9758963584899902, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.4375, "epoch": 0.31268828184835334, "gen_logits_max": 6.885088920593262, "gen_logits_mean": -12.60934829711914, "gen_logits_min": -24.372472763061523, "gen_logits_std": 2.4643917083740234, "gen_loss": 0.302293062210083, "grad_norm": 0.8232295278889807, "learning_rate": 2.899578947368421e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9921650439500809, "mean_gen_accuracy": 0.8626693040132523, "mean_token_accuracy": 0.8974648416042328, "num_tokens": 414322520.0, "sample_num_tokens": 9109.0, "step": 1531, "total_num_tokens": 414358956.0, "z_loss": 0.0010741068981587887 }, { "copy_logits_max": 0.9991129636764526, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.75, "epoch": 0.31289251978555016, "gen_logits_max": 8.002044677734375, "gen_logits_mean": -10.801618576049805, "gen_logits_min": -22.90838623046875, "gen_logits_std": 2.4959425926208496, "gen_loss": 0.33745571970939636, "grad_norm": 0.5216438523136709, "learning_rate": 2.8994526315789474e-05, "loss": 0.3454, "mean_copy_accuracy": 0.9925014972686768, "mean_gen_accuracy": 0.8606747835874557, "mean_token_accuracy": 0.8874355405569077, "num_tokens": 414599438.0, "sample_num_tokens": 9001.5, "step": 1532, "total_num_tokens": 414635444.0, "z_loss": 0.001522420672699809 }, { "copy_logits_max": 1.5063344240188599, "copy_logits_min": -625000000.0, "copy_num_tokens": 417.0625, "epoch": 0.313096757722747, "gen_logits_max": 7.624362468719482, "gen_logits_mean": -11.891290664672852, "gen_logits_min": -23.57479476928711, "gen_logits_std": 2.459132194519043, "gen_loss": 0.3350716829299927, "grad_norm": 0.6748306865155866, "learning_rate": 2.899326315789474e-05, "loss": 0.3465, "mean_copy_accuracy": 0.9898916929960251, "mean_gen_accuracy": 0.8575872033834457, "mean_token_accuracy": 0.8868147134780884, "num_tokens": 414860185.0, "sample_num_tokens": 8037.25, "step": 1533, "total_num_tokens": 414892334.0, "z_loss": 0.001909540151245892 }, { "copy_logits_max": 1.5842359066009521, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.375, "epoch": 0.31330099565994385, "gen_logits_max": 7.631167411804199, "gen_logits_mean": -11.261823654174805, "gen_logits_min": -23.430774688720703, "gen_logits_std": 2.521840810775757, "gen_loss": 0.3475191593170166, "grad_norm": 0.49054574613553314, "learning_rate": 2.8992000000000003e-05, "loss": 0.3575, "mean_copy_accuracy": 0.9917171150445938, "mean_gen_accuracy": 0.8523854613304138, "mean_token_accuracy": 0.8847669363021851, "num_tokens": 415127075.0, "sample_num_tokens": 8014.25, "step": 1534, "total_num_tokens": 415159132.0, "z_loss": 0.0020015472546219826 }, { "copy_logits_max": -0.4920186400413513, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.375, "epoch": 0.31350523359714066, "gen_logits_max": 7.646631240844727, "gen_logits_mean": -12.672163009643555, "gen_logits_min": -23.882753372192383, "gen_logits_std": 2.413384437561035, "gen_loss": 0.3248057961463928, "grad_norm": 0.5309700657067024, "learning_rate": 2.8990736842105264e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9887984842061996, "mean_gen_accuracy": 0.864735335111618, "mean_token_accuracy": 0.8922483772039413, "num_tokens": 415386453.0, "sample_num_tokens": 7053.25, "step": 1535, "total_num_tokens": 415414666.0, "z_loss": 0.0014819472562521696 }, { "copy_logits_max": 1.8329226970672607, "copy_logits_min": -687500032.0, "copy_num_tokens": 544.75, "epoch": 0.3137094715343375, "gen_logits_max": 8.454605102539062, "gen_logits_mean": -10.588848114013672, "gen_logits_min": -22.340030670166016, "gen_logits_std": 2.4759647846221924, "gen_loss": 0.36185693740844727, "grad_norm": 0.4967090872813255, "learning_rate": 2.8989473684210528e-05, "loss": 0.3463, "mean_copy_accuracy": 0.9905325323343277, "mean_gen_accuracy": 0.8534348011016846, "mean_token_accuracy": 0.8879329115152359, "num_tokens": 415663673.0, "sample_num_tokens": 9777.75, "step": 1536, "total_num_tokens": 415702784.0, "z_loss": 0.0018847264582291245 }, { "copy_logits_max": 1.2781782150268555, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.625, "epoch": 0.31391370947153435, "gen_logits_max": 7.689446926116943, "gen_logits_mean": -10.782878875732422, "gen_logits_min": -22.90349578857422, "gen_logits_std": 2.476100206375122, "gen_loss": 0.3372582495212555, "grad_norm": 0.5463825841866686, "learning_rate": 2.898821052631579e-05, "loss": 0.3459, "mean_copy_accuracy": 0.9897917956113815, "mean_gen_accuracy": 0.8552180677652359, "mean_token_accuracy": 0.8859180361032486, "num_tokens": 415932226.0, "sample_num_tokens": 7894.5, "step": 1537, "total_num_tokens": 415963804.0, "z_loss": 0.0015779085224494338 }, { "copy_logits_max": 3.7407121658325195, "copy_logits_min": -687500032.0, "copy_num_tokens": 571.25, "epoch": 0.31411794740873117, "gen_logits_max": 7.573033332824707, "gen_logits_mean": -11.18912124633789, "gen_logits_min": -23.219396591186523, "gen_logits_std": 2.5117599964141846, "gen_loss": 0.3250136375427246, "grad_norm": 0.5173478371326914, "learning_rate": 2.8986947368421053e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9925577640533447, "mean_gen_accuracy": 0.8603581786155701, "mean_token_accuracy": 0.8993491530418396, "num_tokens": 416222555.0, "sample_num_tokens": 8626.75, "step": 1538, "total_num_tokens": 416257062.0, "z_loss": 0.0020532459020614624 }, { "copy_logits_max": 2.161097526550293, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.25, "epoch": 0.314322185345928, "gen_logits_max": 7.620327472686768, "gen_logits_mean": -11.216272354125977, "gen_logits_min": -23.686824798583984, "gen_logits_std": 2.5226364135742188, "gen_loss": 0.3063790798187256, "grad_norm": 0.5954360842172526, "learning_rate": 2.8985684210526314e-05, "loss": 0.3407, "mean_copy_accuracy": 0.9909058958292007, "mean_gen_accuracy": 0.8534543961286545, "mean_token_accuracy": 0.8890673667192459, "num_tokens": 416490670.0, "sample_num_tokens": 7877.0, "step": 1539, "total_num_tokens": 416522178.0, "z_loss": 0.0018312346655875444 }, { "copy_logits_max": 0.5605276823043823, "copy_logits_min": -750000064.0, "copy_num_tokens": 504.5625, "epoch": 0.31452642328312486, "gen_logits_max": 7.2823591232299805, "gen_logits_mean": -12.481006622314453, "gen_logits_min": -24.047231674194336, "gen_logits_std": 2.461885929107666, "gen_loss": 0.3487318158149719, "grad_norm": 0.5075316402946013, "learning_rate": 2.898442105263158e-05, "loss": 0.3533, "mean_copy_accuracy": 0.9902258068323135, "mean_gen_accuracy": 0.8524960577487946, "mean_token_accuracy": 0.8858962506055832, "num_tokens": 416761318.0, "sample_num_tokens": 9400.0, "step": 1540, "total_num_tokens": 416798918.0, "z_loss": 0.0013611470349133015 }, { "copy_logits_max": 2.119035243988037, "copy_logits_min": -625000064.0, "copy_num_tokens": 510.75, "epoch": 0.3147306612203217, "gen_logits_max": 7.676130771636963, "gen_logits_mean": -11.97722053527832, "gen_logits_min": -24.08885955810547, "gen_logits_std": 2.50504207611084, "gen_loss": 0.34205758571624756, "grad_norm": 0.5423575885745245, "learning_rate": 2.8983157894736843e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9910176545381546, "mean_gen_accuracy": 0.8631441444158554, "mean_token_accuracy": 0.9000886827707291, "num_tokens": 417051640.0, "sample_num_tokens": 8898.0, "step": 1541, "total_num_tokens": 417087232.0, "z_loss": 0.0016575523186475039 }, { "copy_logits_max": 4.279501914978027, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.4375, "epoch": 0.3149348991575185, "gen_logits_max": 7.136228084564209, "gen_logits_mean": -11.03817367553711, "gen_logits_min": -23.489288330078125, "gen_logits_std": 2.55277681350708, "gen_loss": 0.3065983057022095, "grad_norm": 0.5009099546838944, "learning_rate": 2.8981894736842107e-05, "loss": 0.3184, "mean_copy_accuracy": 0.9926356077194214, "mean_gen_accuracy": 0.8572642803192139, "mean_token_accuracy": 0.8977543860673904, "num_tokens": 417330200.0, "sample_num_tokens": 9459.0, "step": 1542, "total_num_tokens": 417368036.0, "z_loss": 0.0015925598563626409 }, { "copy_logits_max": 1.3024442195892334, "copy_logits_min": -750000064.0, "copy_num_tokens": 442.8125, "epoch": 0.31513913709471536, "gen_logits_max": 7.384576797485352, "gen_logits_mean": -12.061912536621094, "gen_logits_min": -24.00874900817871, "gen_logits_std": 2.5015249252319336, "gen_loss": 0.35330724716186523, "grad_norm": 0.4813858371780505, "learning_rate": 2.898063157894737e-05, "loss": 0.3356, "mean_copy_accuracy": 0.9927422553300858, "mean_gen_accuracy": 0.8540207743644714, "mean_token_accuracy": 0.891323059797287, "num_tokens": 417609841.0, "sample_num_tokens": 7812.75, "step": 1543, "total_num_tokens": 417641092.0, "z_loss": 0.001586497644893825 }, { "copy_logits_max": 0.07745856046676636, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.25, "epoch": 0.3153433750319122, "gen_logits_max": 6.8958024978637695, "gen_logits_mean": -12.068235397338867, "gen_logits_min": -24.082767486572266, "gen_logits_std": 2.5022530555725098, "gen_loss": 0.32891860604286194, "grad_norm": 0.6806456985953031, "learning_rate": 2.8979368421052633e-05, "loss": 0.3287, "mean_copy_accuracy": 0.9914553910493851, "mean_gen_accuracy": 0.8574943095445633, "mean_token_accuracy": 0.8915236294269562, "num_tokens": 417857025.0, "sample_num_tokens": 8631.75, "step": 1544, "total_num_tokens": 417891552.0, "z_loss": 0.0012555575231090188 }, { "copy_logits_max": 3.5262513160705566, "copy_logits_min": -749999936.0, "copy_num_tokens": 692.5625, "epoch": 0.315547612969109, "gen_logits_max": 7.488126754760742, "gen_logits_mean": -10.790019035339355, "gen_logits_min": -23.35595703125, "gen_logits_std": 2.576795816421509, "gen_loss": 0.3267807364463806, "grad_norm": 0.53524432321457, "learning_rate": 2.8978105263157897e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9937191009521484, "mean_gen_accuracy": 0.8496364206075668, "mean_token_accuracy": 0.8886612057685852, "num_tokens": 418135646.0, "sample_num_tokens": 9454.5, "step": 1545, "total_num_tokens": 418173464.0, "z_loss": 0.0018894672393798828 }, { "copy_logits_max": 1.8445725440979004, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.375, "epoch": 0.31575185090630586, "gen_logits_max": 8.167464256286621, "gen_logits_mean": -9.980996131896973, "gen_logits_min": -22.57428741455078, "gen_logits_std": 2.5139760971069336, "gen_loss": 0.3410664200782776, "grad_norm": 0.8898961824259273, "learning_rate": 2.8976842105263158e-05, "loss": 0.3479, "mean_copy_accuracy": 0.9880625158548355, "mean_gen_accuracy": 0.8549728989601135, "mean_token_accuracy": 0.8848460763692856, "num_tokens": 418381153.0, "sample_num_tokens": 6294.75, "step": 1546, "total_num_tokens": 418406332.0, "z_loss": 0.001605857047252357 }, { "copy_logits_max": -0.5572095513343811, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.3125, "epoch": 0.3159560888435027, "gen_logits_max": 7.705989837646484, "gen_logits_mean": -10.701275825500488, "gen_logits_min": -23.054401397705078, "gen_logits_std": 2.4871888160705566, "gen_loss": 0.3054107129573822, "grad_norm": 0.6931949073263478, "learning_rate": 2.8975578947368422e-05, "loss": 0.3193, "mean_copy_accuracy": 0.9882030040025711, "mean_gen_accuracy": 0.8642020970582962, "mean_token_accuracy": 0.8955647200345993, "num_tokens": 418645390.0, "sample_num_tokens": 7281.0, "step": 1547, "total_num_tokens": 418674514.0, "z_loss": 0.00142430339474231 }, { "copy_logits_max": -0.052590250968933105, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.875, "epoch": 0.3161603267806995, "gen_logits_max": 7.3527703285217285, "gen_logits_mean": -10.903645515441895, "gen_logits_min": -22.363426208496094, "gen_logits_std": 2.4611966609954834, "gen_loss": 0.2866005599498749, "grad_norm": 0.5620490193838303, "learning_rate": 2.8974315789473683e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9920919090509415, "mean_gen_accuracy": 0.8591669797897339, "mean_token_accuracy": 0.8918372243642807, "num_tokens": 418922738.0, "sample_num_tokens": 8706.0, "step": 1548, "total_num_tokens": 418957562.0, "z_loss": 0.0013280529528856277 }, { "copy_logits_max": 2.908578395843506, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.1875, "epoch": 0.31636456471789637, "gen_logits_max": 7.42453670501709, "gen_logits_mean": -12.249551773071289, "gen_logits_min": -24.08131980895996, "gen_logits_std": 2.459747314453125, "gen_loss": 0.32441121339797974, "grad_norm": 0.5835420958320969, "learning_rate": 2.897305263157895e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9918073564767838, "mean_gen_accuracy": 0.8648659586906433, "mean_token_accuracy": 0.8962607830762863, "num_tokens": 419208985.0, "sample_num_tokens": 10155.75, "step": 1549, "total_num_tokens": 419249608.0, "z_loss": 0.0018532357644289732 }, { "copy_logits_max": -1.4187860488891602, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.875, "epoch": 0.3165688026550932, "gen_logits_max": 8.082433700561523, "gen_logits_mean": -11.633195877075195, "gen_logits_min": -23.78618049621582, "gen_logits_std": 2.492293119430542, "gen_loss": 0.30000191926956177, "grad_norm": 0.5358088281166375, "learning_rate": 2.897178947368421e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9904672056436539, "mean_gen_accuracy": 0.8583154529333115, "mean_token_accuracy": 0.8915451765060425, "num_tokens": 419476906.0, "sample_num_tokens": 6905.0, "step": 1550, "total_num_tokens": 419504526.0, "z_loss": 0.001398702384904027 }, { "copy_logits_max": 0.7138986587524414, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.0, "epoch": 0.31677304059229, "gen_logits_max": 8.728289604187012, "gen_logits_mean": -9.660872459411621, "gen_logits_min": -21.8565673828125, "gen_logits_std": 2.543667793273926, "gen_loss": 0.3551071584224701, "grad_norm": 0.7379377632895974, "learning_rate": 2.8970526315789476e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9885705262422562, "mean_gen_accuracy": 0.8607657849788666, "mean_token_accuracy": 0.8918183147907257, "num_tokens": 419741560.0, "sample_num_tokens": 8012.5, "step": 1551, "total_num_tokens": 419773610.0, "z_loss": 0.002016443759202957 }, { "copy_logits_max": -0.9752156734466553, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.5625, "epoch": 0.31697727852948687, "gen_logits_max": 8.317983627319336, "gen_logits_mean": -11.61629867553711, "gen_logits_min": -23.244680404663086, "gen_logits_std": 2.432168483734131, "gen_loss": 0.3721897006034851, "grad_norm": 0.45822830913833806, "learning_rate": 2.8969263157894737e-05, "loss": 0.3267, "mean_copy_accuracy": 0.9935326278209686, "mean_gen_accuracy": 0.8626087158918381, "mean_token_accuracy": 0.89413221180439, "num_tokens": 420032638.0, "sample_num_tokens": 8237.0, "step": 1552, "total_num_tokens": 420065586.0, "z_loss": 0.0015443665906786919 }, { "copy_logits_max": 6.351892471313477, "copy_logits_min": -687500032.0, "copy_num_tokens": 782.375, "epoch": 0.3171815164666837, "gen_logits_max": 7.336573600769043, "gen_logits_mean": -11.389324188232422, "gen_logits_min": -23.83141326904297, "gen_logits_std": 2.531276226043701, "gen_loss": 0.2977585196495056, "grad_norm": 0.5950103151084554, "learning_rate": 2.8968e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9923708736896515, "mean_gen_accuracy": 0.8593744486570358, "mean_token_accuracy": 0.898442953824997, "num_tokens": 420304300.0, "sample_num_tokens": 9761.5, "step": 1553, "total_num_tokens": 420343346.0, "z_loss": 0.0027194484136998653 }, { "copy_logits_max": 1.5062748193740845, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.3125, "epoch": 0.3173857544038805, "gen_logits_max": 7.663610458374023, "gen_logits_mean": -11.581132888793945, "gen_logits_min": -23.504287719726562, "gen_logits_std": 2.481584310531616, "gen_loss": 0.3282478451728821, "grad_norm": 0.5192081324875087, "learning_rate": 2.8966736842105262e-05, "loss": 0.325, "mean_copy_accuracy": 0.9908474534749985, "mean_gen_accuracy": 0.8616281300783157, "mean_token_accuracy": 0.8951113373041153, "num_tokens": 420578053.0, "sample_num_tokens": 8487.75, "step": 1554, "total_num_tokens": 420612004.0, "z_loss": 0.0018119015730917454 }, { "copy_logits_max": -0.9929929375648499, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.1875, "epoch": 0.3175899923410774, "gen_logits_max": 9.225784301757812, "gen_logits_mean": -10.135149955749512, "gen_logits_min": -21.971193313598633, "gen_logits_std": 2.49029541015625, "gen_loss": 0.34768763184547424, "grad_norm": 1.2547763175029523, "learning_rate": 2.8965473684210526e-05, "loss": 0.331, "mean_copy_accuracy": 0.9921168386936188, "mean_gen_accuracy": 0.85893714427948, "mean_token_accuracy": 0.8932293355464935, "num_tokens": 420851641.0, "sample_num_tokens": 8397.25, "step": 1555, "total_num_tokens": 420885230.0, "z_loss": 0.001444501569494605 }, { "copy_logits_max": 0.6509718298912048, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.75, "epoch": 0.3177942302782742, "gen_logits_max": 8.216569900512695, "gen_logits_mean": -11.104772567749023, "gen_logits_min": -22.815837860107422, "gen_logits_std": 2.448744058609009, "gen_loss": 0.31363198161125183, "grad_norm": 0.5471258318198622, "learning_rate": 2.8964210526315787e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9918888956308365, "mean_gen_accuracy": 0.8597417324781418, "mean_token_accuracy": 0.8952105492353439, "num_tokens": 421147802.0, "sample_num_tokens": 8672.5, "step": 1556, "total_num_tokens": 421182492.0, "z_loss": 0.001772158546373248 }, { "copy_logits_max": 1.1363856792449951, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.25, "epoch": 0.317998468215471, "gen_logits_max": 8.427816390991211, "gen_logits_mean": -10.39545726776123, "gen_logits_min": -22.075946807861328, "gen_logits_std": 2.488762855529785, "gen_loss": 0.3730865716934204, "grad_norm": 0.558286374405796, "learning_rate": 2.896294736842105e-05, "loss": 0.3612, "mean_copy_accuracy": 0.9909011274576187, "mean_gen_accuracy": 0.8439951986074448, "mean_token_accuracy": 0.8805019557476044, "num_tokens": 421414869.0, "sample_num_tokens": 8337.25, "step": 1557, "total_num_tokens": 421448218.0, "z_loss": 0.0019537867046892643 }, { "copy_logits_max": -1.4486427307128906, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.5625, "epoch": 0.3182027061526679, "gen_logits_max": 8.226661682128906, "gen_logits_mean": -10.919319152832031, "gen_logits_min": -23.001876831054688, "gen_logits_std": 2.470001697540283, "gen_loss": 0.36718863248825073, "grad_norm": 0.5810970055107442, "learning_rate": 2.896168421052632e-05, "loss": 0.3322, "mean_copy_accuracy": 0.991849422454834, "mean_gen_accuracy": 0.860405445098877, "mean_token_accuracy": 0.8929162472486496, "num_tokens": 421689048.0, "sample_num_tokens": 7522.5, "step": 1558, "total_num_tokens": 421719138.0, "z_loss": 0.001607790938578546 }, { "copy_logits_max": -3.4877498149871826, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.1875, "epoch": 0.3184069440898647, "gen_logits_max": 8.840675354003906, "gen_logits_mean": -11.033735275268555, "gen_logits_min": -22.72288703918457, "gen_logits_std": 2.4751226902008057, "gen_loss": 0.3712354004383087, "grad_norm": 0.6482543948210643, "learning_rate": 2.896042105263158e-05, "loss": 0.3391, "mean_copy_accuracy": 0.9918316453695297, "mean_gen_accuracy": 0.8596033453941345, "mean_token_accuracy": 0.8918276876211166, "num_tokens": 421950419.0, "sample_num_tokens": 7317.75, "step": 1559, "total_num_tokens": 421979690.0, "z_loss": 0.0012676448095589876 }, { "copy_logits_max": -1.8136706352233887, "copy_logits_min": -750000128.0, "copy_num_tokens": 419.5625, "epoch": 0.3186111820270615, "gen_logits_max": 8.38560676574707, "gen_logits_mean": -10.213335037231445, "gen_logits_min": -22.37005615234375, "gen_logits_std": 2.525526762008667, "gen_loss": 0.28355976939201355, "grad_norm": 0.5506389305805747, "learning_rate": 2.8959157894736845e-05, "loss": 0.3292, "mean_copy_accuracy": 0.991937443614006, "mean_gen_accuracy": 0.8624315112829208, "mean_token_accuracy": 0.8936866372823715, "num_tokens": 422205643.0, "sample_num_tokens": 7833.25, "step": 1560, "total_num_tokens": 422236976.0, "z_loss": 0.001543697202578187 }, { "copy_logits_max": -2.372753620147705, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.8125, "epoch": 0.3188154199642584, "gen_logits_max": 8.705122947692871, "gen_logits_mean": -10.631153106689453, "gen_logits_min": -22.323455810546875, "gen_logits_std": 2.4753146171569824, "gen_loss": 0.29552000761032104, "grad_norm": 0.9540809901173289, "learning_rate": 2.8957894736842105e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9901513755321503, "mean_gen_accuracy": 0.8587872684001923, "mean_token_accuracy": 0.8931503593921661, "num_tokens": 422489044.0, "sample_num_tokens": 9395.0, "step": 1561, "total_num_tokens": 422526624.0, "z_loss": 0.0015240224311128259 }, { "copy_logits_max": -0.6954317688941956, "copy_logits_min": -687500032.0, "copy_num_tokens": 703.8125, "epoch": 0.3190196579014552, "gen_logits_max": 7.705731391906738, "gen_logits_mean": -10.591346740722656, "gen_logits_min": -22.20604705810547, "gen_logits_std": 2.3960204124450684, "gen_loss": 0.34459248185157776, "grad_norm": 1.0778121578849134, "learning_rate": 2.895663157894737e-05, "loss": 0.3369, "mean_copy_accuracy": 0.9854211062192917, "mean_gen_accuracy": 0.8598116338253021, "mean_token_accuracy": 0.8910615742206573, "num_tokens": 422788764.0, "sample_num_tokens": 10426.0, "step": 1562, "total_num_tokens": 422830468.0, "z_loss": 0.0016168678412213922 }, { "copy_logits_max": 0.0717369019985199, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.8125, "epoch": 0.319223895838652, "gen_logits_max": 7.57452917098999, "gen_logits_mean": -10.876386642456055, "gen_logits_min": -22.343128204345703, "gen_logits_std": 2.4165172576904297, "gen_loss": 0.35792815685272217, "grad_norm": 0.5436363760495706, "learning_rate": 2.895536842105263e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9921664148569107, "mean_gen_accuracy": 0.8629120588302612, "mean_token_accuracy": 0.8992751091718674, "num_tokens": 423103747.0, "sample_num_tokens": 8459.25, "step": 1563, "total_num_tokens": 423137584.0, "z_loss": 0.0017621174920350313 }, { "copy_logits_max": -0.3015349209308624, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.3125, "epoch": 0.3194281337758489, "gen_logits_max": 8.776238441467285, "gen_logits_mean": -9.718196868896484, "gen_logits_min": -21.48012351989746, "gen_logits_std": 2.397891044616699, "gen_loss": 0.3117656409740448, "grad_norm": 0.6404442716494958, "learning_rate": 2.8954105263157895e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9908934533596039, "mean_gen_accuracy": 0.86216601729393, "mean_token_accuracy": 0.8980110883712769, "num_tokens": 423376559.0, "sample_num_tokens": 9072.25, "step": 1564, "total_num_tokens": 423412848.0, "z_loss": 0.0018267970299348235 }, { "copy_logits_max": -0.281681627035141, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.3125, "epoch": 0.3196323717130457, "gen_logits_max": 7.676636695861816, "gen_logits_mean": -10.890474319458008, "gen_logits_min": -22.24691390991211, "gen_logits_std": 2.383362054824829, "gen_loss": 0.33245253562927246, "grad_norm": 0.7984695893901006, "learning_rate": 2.8952842105263156e-05, "loss": 0.323, "mean_copy_accuracy": 0.9875055402517319, "mean_gen_accuracy": 0.8682811260223389, "mean_token_accuracy": 0.8942514210939407, "num_tokens": 423650817.0, "sample_num_tokens": 7567.75, "step": 1565, "total_num_tokens": 423681088.0, "z_loss": 0.0016467496752738953 }, { "copy_logits_max": 2.186152696609497, "copy_logits_min": -750000000.0, "copy_num_tokens": 686.6875, "epoch": 0.3198366096502425, "gen_logits_max": 7.378717422485352, "gen_logits_mean": -11.019906044006348, "gen_logits_min": -22.865158081054688, "gen_logits_std": 2.4418833255767822, "gen_loss": 0.32917869091033936, "grad_norm": 0.8439775854991316, "learning_rate": 2.8951578947368424e-05, "loss": 0.3653, "mean_copy_accuracy": 0.9815555512905121, "mean_gen_accuracy": 0.8563563823699951, "mean_token_accuracy": 0.8855815082788467, "num_tokens": 423902970.0, "sample_num_tokens": 9913.0, "step": 1566, "total_num_tokens": 423942622.0, "z_loss": 0.002022803295403719 }, { "copy_logits_max": -1.9417015314102173, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.6875, "epoch": 0.3200408475874394, "gen_logits_max": 7.031026840209961, "gen_logits_mean": -11.683820724487305, "gen_logits_min": -22.659385681152344, "gen_logits_std": 2.313356637954712, "gen_loss": 0.3211551904678345, "grad_norm": 0.5156411983272672, "learning_rate": 2.8950315789473685e-05, "loss": 0.3483, "mean_copy_accuracy": 0.991562083363533, "mean_gen_accuracy": 0.8564081192016602, "mean_token_accuracy": 0.8896110653877258, "num_tokens": 424177249.0, "sample_num_tokens": 8548.25, "step": 1567, "total_num_tokens": 424211442.0, "z_loss": 0.001786328386515379 }, { "copy_logits_max": -2.3232524394989014, "copy_logits_min": -750000000.0, "copy_num_tokens": 273.4375, "epoch": 0.3202450855246362, "gen_logits_max": 8.474609375, "gen_logits_mean": -11.09821891784668, "gen_logits_min": -22.263845443725586, "gen_logits_std": 2.3633055686950684, "gen_loss": 0.34462544322013855, "grad_norm": 0.6886580448297597, "learning_rate": 2.894905263157895e-05, "loss": 0.3917, "mean_copy_accuracy": 0.986903965473175, "mean_gen_accuracy": 0.846995934844017, "mean_token_accuracy": 0.8732063621282578, "num_tokens": 424426623.0, "sample_num_tokens": 7348.75, "step": 1568, "total_num_tokens": 424456018.0, "z_loss": 0.0016307344194501638 }, { "copy_logits_max": -0.3164200186729431, "copy_logits_min": -750000128.0, "copy_num_tokens": 489.625, "epoch": 0.320449323461833, "gen_logits_max": 8.071137428283691, "gen_logits_mean": -11.654034614562988, "gen_logits_min": -23.20761489868164, "gen_logits_std": 2.4015231132507324, "gen_loss": 0.3622470796108246, "grad_norm": 0.5735850185878301, "learning_rate": 2.894778947368421e-05, "loss": 0.3481, "mean_copy_accuracy": 0.9906259626150131, "mean_gen_accuracy": 0.8527317196130753, "mean_token_accuracy": 0.8880411088466644, "num_tokens": 424711026.0, "sample_num_tokens": 8237.0, "step": 1569, "total_num_tokens": 424743974.0, "z_loss": 0.0018795933574438095 }, { "copy_logits_max": -1.595178246498108, "copy_logits_min": -750000064.0, "copy_num_tokens": 344.3125, "epoch": 0.3206535613990299, "gen_logits_max": 8.480650901794434, "gen_logits_mean": -10.868032455444336, "gen_logits_min": -22.15058135986328, "gen_logits_std": 2.412355899810791, "gen_loss": 0.3395247459411621, "grad_norm": 0.6698869101054925, "learning_rate": 2.8946526315789474e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9881312996149063, "mean_gen_accuracy": 0.8656329363584518, "mean_token_accuracy": 0.8970710933208466, "num_tokens": 424965233.0, "sample_num_tokens": 7183.75, "step": 1570, "total_num_tokens": 424993968.0, "z_loss": 0.0016320927534252405 }, { "copy_logits_max": -1.3213034868240356, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.9375, "epoch": 0.3208577993362267, "gen_logits_max": 8.31033706665039, "gen_logits_mean": -10.611587524414062, "gen_logits_min": -21.757850646972656, "gen_logits_std": 2.379227638244629, "gen_loss": 0.3388512134552002, "grad_norm": 0.49300853489024654, "learning_rate": 2.894526315789474e-05, "loss": 0.339, "mean_copy_accuracy": 0.9908097982406616, "mean_gen_accuracy": 0.8602148741483688, "mean_token_accuracy": 0.8920140713453293, "num_tokens": 425228432.0, "sample_num_tokens": 7973.0, "step": 1571, "total_num_tokens": 425260324.0, "z_loss": 0.0017431253800168633 }, { "copy_logits_max": -0.056993842124938965, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.5, "epoch": 0.32106203727342353, "gen_logits_max": 6.920726299285889, "gen_logits_mean": -11.25731086730957, "gen_logits_min": -23.207483291625977, "gen_logits_std": 2.39705228805542, "gen_loss": 0.3202987015247345, "grad_norm": 0.6366699172878891, "learning_rate": 2.8944e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9918485432863235, "mean_gen_accuracy": 0.856652557849884, "mean_token_accuracy": 0.8903241902589798, "num_tokens": 425486456.0, "sample_num_tokens": 8348.5, "step": 1572, "total_num_tokens": 425519850.0, "z_loss": 0.0017939055105671287 }, { "copy_logits_max": -0.6919726133346558, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.125, "epoch": 0.3212662752106204, "gen_logits_max": 7.719123363494873, "gen_logits_mean": -11.151325225830078, "gen_logits_min": -22.99248504638672, "gen_logits_std": 2.4302539825439453, "gen_loss": 0.35371094942092896, "grad_norm": 0.540454172153263, "learning_rate": 2.8942736842105264e-05, "loss": 0.3354, "mean_copy_accuracy": 0.9919235557317734, "mean_gen_accuracy": 0.8538133203983307, "mean_token_accuracy": 0.8898548185825348, "num_tokens": 425743024.0, "sample_num_tokens": 8187.5, "step": 1573, "total_num_tokens": 425775774.0, "z_loss": 0.0016168145230039954 }, { "copy_logits_max": -3.412324905395508, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.9375, "epoch": 0.3214705131478172, "gen_logits_max": 7.5291948318481445, "gen_logits_mean": -11.22998332977295, "gen_logits_min": -22.725418090820312, "gen_logits_std": 2.408935308456421, "gen_loss": 0.33523985743522644, "grad_norm": 0.8711845903252513, "learning_rate": 2.8941473684210528e-05, "loss": 0.3376, "mean_copy_accuracy": 0.9902090579271317, "mean_gen_accuracy": 0.8550614416599274, "mean_token_accuracy": 0.8912394493818283, "num_tokens": 426014096.0, "sample_num_tokens": 7378.5, "step": 1574, "total_num_tokens": 426043610.0, "z_loss": 0.001246243016794324 }, { "copy_logits_max": -3.1524105072021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.5, "epoch": 0.32167475108501403, "gen_logits_max": 7.449305534362793, "gen_logits_mean": -10.772889137268066, "gen_logits_min": -21.707317352294922, "gen_logits_std": 2.3084704875946045, "gen_loss": 0.38365861773490906, "grad_norm": 0.5495763968982333, "learning_rate": 2.8940210526315792e-05, "loss": 0.3675, "mean_copy_accuracy": 0.9932760000228882, "mean_gen_accuracy": 0.8457904607057571, "mean_token_accuracy": 0.8808062225580215, "num_tokens": 426296944.0, "sample_num_tokens": 9161.5, "step": 1575, "total_num_tokens": 426333590.0, "z_loss": 0.0015325592830777168 }, { "copy_logits_max": -3.317450761795044, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.1875, "epoch": 0.32187898902221085, "gen_logits_max": 7.000640869140625, "gen_logits_mean": -12.171735763549805, "gen_logits_min": -23.21126937866211, "gen_logits_std": 2.328195095062256, "gen_loss": 0.3173735737800598, "grad_norm": 0.5438014144449049, "learning_rate": 2.8938947368421053e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9915741235017776, "mean_gen_accuracy": 0.861874133348465, "mean_token_accuracy": 0.8897164165973663, "num_tokens": 426573121.0, "sample_num_tokens": 9602.75, "step": 1576, "total_num_tokens": 426611532.0, "z_loss": 0.0014772220747545362 }, { "copy_logits_max": -0.32738497853279114, "copy_logits_min": -687500096.0, "copy_num_tokens": 680.25, "epoch": 0.3220832269594077, "gen_logits_max": 7.008284091949463, "gen_logits_mean": -10.848196029663086, "gen_logits_min": -22.582895278930664, "gen_logits_std": 2.3718888759613037, "gen_loss": 0.28732770681381226, "grad_norm": 0.8058501378172079, "learning_rate": 2.8937684210526318e-05, "loss": 0.3314, "mean_copy_accuracy": 0.9908682405948639, "mean_gen_accuracy": 0.8542399853467941, "mean_token_accuracy": 0.8935331404209137, "num_tokens": 426869516.0, "sample_num_tokens": 9161.0, "step": 1577, "total_num_tokens": 426906160.0, "z_loss": 0.00214123772457242 }, { "copy_logits_max": -2.815246105194092, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.3125, "epoch": 0.32228746489660454, "gen_logits_max": 7.485476493835449, "gen_logits_mean": -10.926239013671875, "gen_logits_min": -21.92477798461914, "gen_logits_std": 2.3207085132598877, "gen_loss": 0.3152226209640503, "grad_norm": 0.48850035734363856, "learning_rate": 2.893642105263158e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9892363548278809, "mean_gen_accuracy": 0.8622264415025711, "mean_token_accuracy": 0.8904240131378174, "num_tokens": 427130230.0, "sample_num_tokens": 7475.0, "step": 1578, "total_num_tokens": 427160130.0, "z_loss": 0.0016572915483266115 }, { "copy_logits_max": 0.5690007209777832, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.875, "epoch": 0.32249170283380135, "gen_logits_max": 7.249037742614746, "gen_logits_mean": -12.24303913116455, "gen_logits_min": -23.59691619873047, "gen_logits_std": 2.378655433654785, "gen_loss": 0.32687342166900635, "grad_norm": 0.8817815148131549, "learning_rate": 2.8935157894736843e-05, "loss": 0.3371, "mean_copy_accuracy": 0.9869690984487534, "mean_gen_accuracy": 0.8613942861557007, "mean_token_accuracy": 0.8900071382522583, "num_tokens": 427402116.0, "sample_num_tokens": 7222.0, "step": 1579, "total_num_tokens": 427431004.0, "z_loss": 0.002471614396199584 }, { "copy_logits_max": -3.023373603820801, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.0625, "epoch": 0.3226959407709982, "gen_logits_max": 7.18754243850708, "gen_logits_mean": -11.825803756713867, "gen_logits_min": -22.624773025512695, "gen_logits_std": 2.3290843963623047, "gen_loss": 0.27972230315208435, "grad_norm": 0.47981526528889384, "learning_rate": 2.8933894736842104e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9894085824489594, "mean_gen_accuracy": 0.8690421432256699, "mean_token_accuracy": 0.8971875607967377, "num_tokens": 427681941.0, "sample_num_tokens": 8749.25, "step": 1580, "total_num_tokens": 427716938.0, "z_loss": 0.0019329529022797942 }, { "copy_logits_max": -0.00772935152053833, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.5, "epoch": 0.32290017870819504, "gen_logits_max": 7.715836524963379, "gen_logits_mean": -11.092559814453125, "gen_logits_min": -22.083984375, "gen_logits_std": 2.344278335571289, "gen_loss": 0.3435346484184265, "grad_norm": 0.4895855670054081, "learning_rate": 2.8932631578947368e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9908055663108826, "mean_gen_accuracy": 0.8589097410440445, "mean_token_accuracy": 0.8902367949485779, "num_tokens": 427961833.0, "sample_num_tokens": 8041.75, "step": 1581, "total_num_tokens": 427994000.0, "z_loss": 0.002624625340104103 }, { "copy_logits_max": -1.4326766729354858, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.625, "epoch": 0.32310441664539186, "gen_logits_max": 8.095125198364258, "gen_logits_mean": -10.95379638671875, "gen_logits_min": -21.931175231933594, "gen_logits_std": 2.290968418121338, "gen_loss": 0.37176835536956787, "grad_norm": 0.6059628990894562, "learning_rate": 2.8931368421052632e-05, "loss": 0.3474, "mean_copy_accuracy": 0.9898669272661209, "mean_gen_accuracy": 0.8603415638208389, "mean_token_accuracy": 0.8879233151674271, "num_tokens": 428226037.0, "sample_num_tokens": 7643.75, "step": 1582, "total_num_tokens": 428256612.0, "z_loss": 0.002034353092312813 }, { "copy_logits_max": -0.8652201294898987, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.0625, "epoch": 0.32330865458258873, "gen_logits_max": 7.67987060546875, "gen_logits_mean": -10.5125150680542, "gen_logits_min": -21.531099319458008, "gen_logits_std": 2.312680244445801, "gen_loss": 0.34997719526290894, "grad_norm": 0.5639793381181216, "learning_rate": 2.8930105263157897e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9896300137042999, "mean_gen_accuracy": 0.8609271943569183, "mean_token_accuracy": 0.8952023088932037, "num_tokens": 428516079.0, "sample_num_tokens": 9751.75, "step": 1583, "total_num_tokens": 428555086.0, "z_loss": 0.0019964133389294147 }, { "copy_logits_max": -0.007881313562393188, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.625, "epoch": 0.32351289251978554, "gen_logits_max": 7.54398775100708, "gen_logits_mean": -12.071256637573242, "gen_logits_min": -23.6751766204834, "gen_logits_std": 2.387341022491455, "gen_loss": 0.3267074525356293, "grad_norm": 0.5763421718003645, "learning_rate": 2.892884210526316e-05, "loss": 0.3223, "mean_copy_accuracy": 0.9889553189277649, "mean_gen_accuracy": 0.8613858371973038, "mean_token_accuracy": 0.8953531831502914, "num_tokens": 428793297.0, "sample_num_tokens": 7433.25, "step": 1584, "total_num_tokens": 428823030.0, "z_loss": 0.0017181083094328642 }, { "copy_logits_max": -0.8523099422454834, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.375, "epoch": 0.32371713045698236, "gen_logits_max": 8.355766296386719, "gen_logits_mean": -11.315961837768555, "gen_logits_min": -22.790485382080078, "gen_logits_std": 2.430154800415039, "gen_loss": 0.35654371976852417, "grad_norm": 0.591422562653998, "learning_rate": 2.8927578947368422e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9893947541713715, "mean_gen_accuracy": 0.8551691919565201, "mean_token_accuracy": 0.8901240527629852, "num_tokens": 429064257.0, "sample_num_tokens": 7901.75, "step": 1585, "total_num_tokens": 429095864.0, "z_loss": 0.0016711762873455882 }, { "copy_logits_max": -3.772406578063965, "copy_logits_min": -687500032.0, "copy_num_tokens": 426.625, "epoch": 0.32392136839417923, "gen_logits_max": 8.307767868041992, "gen_logits_mean": -11.251206398010254, "gen_logits_min": -22.272008895874023, "gen_logits_std": 2.3419861793518066, "gen_loss": 0.34102898836135864, "grad_norm": 0.580661152637546, "learning_rate": 2.8926315789473686e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9901420772075653, "mean_gen_accuracy": 0.8564469814300537, "mean_token_accuracy": 0.8905841708183289, "num_tokens": 429338191.0, "sample_num_tokens": 8792.75, "step": 1586, "total_num_tokens": 429373362.0, "z_loss": 0.0015294395852833986 }, { "copy_logits_max": -3.2085514068603516, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.3125, "epoch": 0.32412560633137605, "gen_logits_max": 8.41878890991211, "gen_logits_mean": -10.594902038574219, "gen_logits_min": -21.531103134155273, "gen_logits_std": 2.3498902320861816, "gen_loss": 0.29994648694992065, "grad_norm": 0.5585761140106835, "learning_rate": 2.8925052631578947e-05, "loss": 0.341, "mean_copy_accuracy": 0.9897678047418594, "mean_gen_accuracy": 0.854970321059227, "mean_token_accuracy": 0.8909771591424942, "num_tokens": 429621027.0, "sample_num_tokens": 8084.75, "step": 1587, "total_num_tokens": 429653366.0, "z_loss": 0.001420414773747325 }, { "copy_logits_max": -3.6936087608337402, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.75, "epoch": 0.32432984426857286, "gen_logits_max": 8.754215240478516, "gen_logits_mean": -10.996389389038086, "gen_logits_min": -21.940582275390625, "gen_logits_std": 2.353250026702881, "gen_loss": 0.3709297776222229, "grad_norm": 1.151937722890635, "learning_rate": 2.892378947368421e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9874125421047211, "mean_gen_accuracy": 0.8622853010892868, "mean_token_accuracy": 0.8880752325057983, "num_tokens": 429869688.0, "sample_num_tokens": 8113.0, "step": 1588, "total_num_tokens": 429902140.0, "z_loss": 0.0015162695199251175 }, { "copy_logits_max": 2.0638437271118164, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.1875, "epoch": 0.32453408220576974, "gen_logits_max": 7.85820198059082, "gen_logits_mean": -10.189216613769531, "gen_logits_min": -21.50649642944336, "gen_logits_std": 2.3717761039733887, "gen_loss": 0.30268335342407227, "grad_norm": 0.7321356848939303, "learning_rate": 2.8922526315789472e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9922620356082916, "mean_gen_accuracy": 0.8637511879205704, "mean_token_accuracy": 0.8974367380142212, "num_tokens": 430129801.0, "sample_num_tokens": 8503.25, "step": 1589, "total_num_tokens": 430163814.0, "z_loss": 0.003435170976445079 }, { "copy_logits_max": 1.3053958415985107, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.4375, "epoch": 0.32473832014296655, "gen_logits_max": 8.588338851928711, "gen_logits_mean": -11.571897506713867, "gen_logits_min": -22.845012664794922, "gen_logits_std": 2.4152650833129883, "gen_loss": 0.3437873125076294, "grad_norm": 0.8154535532453788, "learning_rate": 2.892126315789474e-05, "loss": 0.3476, "mean_copy_accuracy": 0.9893473386764526, "mean_gen_accuracy": 0.8545615077018738, "mean_token_accuracy": 0.88907590508461, "num_tokens": 430407178.0, "sample_num_tokens": 7333.5, "step": 1590, "total_num_tokens": 430436512.0, "z_loss": 0.0031490763649344444 }, { "copy_logits_max": 4.161608695983887, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.625, "epoch": 0.32494255808016337, "gen_logits_max": 8.07300090789795, "gen_logits_mean": -9.698783874511719, "gen_logits_min": -21.41484832763672, "gen_logits_std": 2.470578193664551, "gen_loss": 0.3135261833667755, "grad_norm": 0.5386516595472125, "learning_rate": 2.892e-05, "loss": 0.3344, "mean_copy_accuracy": 0.992457926273346, "mean_gen_accuracy": 0.85291887819767, "mean_token_accuracy": 0.8924786895513535, "num_tokens": 430692082.0, "sample_num_tokens": 9127.0, "step": 1591, "total_num_tokens": 430728590.0, "z_loss": 0.005038864444941282 }, { "copy_logits_max": 0.04242616891860962, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.8125, "epoch": 0.32514679601736024, "gen_logits_max": 8.730947494506836, "gen_logits_mean": -10.112008094787598, "gen_logits_min": -21.24393653869629, "gen_logits_std": 2.3368523120880127, "gen_loss": 0.36027124524116516, "grad_norm": 0.7602724956722512, "learning_rate": 2.8918736842105265e-05, "loss": 0.3548, "mean_copy_accuracy": 0.9847377985715866, "mean_gen_accuracy": 0.855350449681282, "mean_token_accuracy": 0.8877551257610321, "num_tokens": 430978537.0, "sample_num_tokens": 8267.75, "step": 1592, "total_num_tokens": 431011608.0, "z_loss": 0.0029672267846763134 }, { "copy_logits_max": 4.468803405761719, "copy_logits_min": -750000000.0, "copy_num_tokens": 766.5625, "epoch": 0.32535103395455706, "gen_logits_max": 7.561987400054932, "gen_logits_mean": -11.196784019470215, "gen_logits_min": -22.92946434020996, "gen_logits_std": 2.4190988540649414, "gen_loss": 0.3096616566181183, "grad_norm": 0.544968106624731, "learning_rate": 2.8917473684210526e-05, "loss": 0.345, "mean_copy_accuracy": 0.9895409047603607, "mean_gen_accuracy": 0.8558122962713242, "mean_token_accuracy": 0.890967071056366, "num_tokens": 431286307.0, "sample_num_tokens": 11190.75, "step": 1593, "total_num_tokens": 431331070.0, "z_loss": 0.004325975198298693 }, { "copy_logits_max": 2.8992884159088135, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.0, "epoch": 0.3255552718917539, "gen_logits_max": 8.395172119140625, "gen_logits_mean": -11.209920883178711, "gen_logits_min": -22.53802490234375, "gen_logits_std": 2.4142088890075684, "gen_loss": 0.3266077935695648, "grad_norm": 0.6724779240261932, "learning_rate": 2.891621052631579e-05, "loss": 0.336, "mean_copy_accuracy": 0.9880847781896591, "mean_gen_accuracy": 0.8651245683431625, "mean_token_accuracy": 0.8937005698680878, "num_tokens": 431559063.0, "sample_num_tokens": 6825.25, "step": 1594, "total_num_tokens": 431586364.0, "z_loss": 0.0034688254818320274 }, { "copy_logits_max": 1.1353672742843628, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.8125, "epoch": 0.32575950982895074, "gen_logits_max": 7.836747646331787, "gen_logits_mean": -9.538566589355469, "gen_logits_min": -20.63066864013672, "gen_logits_std": 2.3541786670684814, "gen_loss": 0.2674112915992737, "grad_norm": 0.6820876979469572, "learning_rate": 2.891494736842105e-05, "loss": 0.3222, "mean_copy_accuracy": 0.9898138791322708, "mean_gen_accuracy": 0.8638685643672943, "mean_token_accuracy": 0.8989506959915161, "num_tokens": 431821489.0, "sample_num_tokens": 9656.75, "step": 1595, "total_num_tokens": 431860116.0, "z_loss": 0.003853677539154887 }, { "copy_logits_max": 2.8134398460388184, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.125, "epoch": 0.32596374776614756, "gen_logits_max": 8.01986026763916, "gen_logits_mean": -10.91046142578125, "gen_logits_min": -22.161426544189453, "gen_logits_std": 2.4027609825134277, "gen_loss": 0.3767557144165039, "grad_norm": 0.559940646995396, "learning_rate": 2.8913684210526316e-05, "loss": 0.3519, "mean_copy_accuracy": 0.9900752305984497, "mean_gen_accuracy": 0.8534626662731171, "mean_token_accuracy": 0.8870524168014526, "num_tokens": 432102512.0, "sample_num_tokens": 7977.5, "step": 1596, "total_num_tokens": 432134422.0, "z_loss": 0.0028929472900927067 }, { "copy_logits_max": -0.42311984300613403, "copy_logits_min": -687500032.0, "copy_num_tokens": 603.75, "epoch": 0.3261679857033444, "gen_logits_max": 7.440983295440674, "gen_logits_mean": -11.209234237670898, "gen_logits_min": -22.56290054321289, "gen_logits_std": 2.4139416217803955, "gen_loss": 0.33359891176223755, "grad_norm": 0.5522725305652705, "learning_rate": 2.891242105263158e-05, "loss": 0.3426, "mean_copy_accuracy": 0.9873272925615311, "mean_gen_accuracy": 0.857877716422081, "mean_token_accuracy": 0.8885830044746399, "num_tokens": 432360327.0, "sample_num_tokens": 9580.25, "step": 1597, "total_num_tokens": 432398648.0, "z_loss": 0.002528171520680189 }, { "copy_logits_max": -1.077297568321228, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.1875, "epoch": 0.32637222364054125, "gen_logits_max": 7.570192337036133, "gen_logits_mean": -11.458720207214355, "gen_logits_min": -22.643991470336914, "gen_logits_std": 2.3504910469055176, "gen_loss": 0.3354794979095459, "grad_norm": 0.5443468784834261, "learning_rate": 2.8911157894736844e-05, "loss": 0.3336, "mean_copy_accuracy": 0.9903614073991776, "mean_gen_accuracy": 0.8563751131296158, "mean_token_accuracy": 0.8907154500484467, "num_tokens": 432655633.0, "sample_num_tokens": 9005.25, "step": 1598, "total_num_tokens": 432691654.0, "z_loss": 0.0019208159064874053 }, { "copy_logits_max": -3.7812376022338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 220.6875, "epoch": 0.32657646157773806, "gen_logits_max": 8.942256927490234, "gen_logits_mean": -10.104560852050781, "gen_logits_min": -21.515857696533203, "gen_logits_std": 2.3680810928344727, "gen_loss": 0.3628072142601013, "grad_norm": 0.571305732905098, "learning_rate": 2.890989473684211e-05, "loss": 0.3616, "mean_copy_accuracy": 0.9881958812475204, "mean_gen_accuracy": 0.8542762249708176, "mean_token_accuracy": 0.8827462494373322, "num_tokens": 432903524.0, "sample_num_tokens": 6447.0, "step": 1599, "total_num_tokens": 432929312.0, "z_loss": 0.0013108913553878665 }, { "copy_logits_max": -0.40833890438079834, "copy_logits_min": -625000064.0, "copy_num_tokens": 652.5, "epoch": 0.3267806995149349, "gen_logits_max": 7.907557487487793, "gen_logits_mean": -10.165149688720703, "gen_logits_min": -21.194786071777344, "gen_logits_std": 2.3384361267089844, "gen_loss": 0.34928780794143677, "grad_norm": 0.5209901685835479, "learning_rate": 2.890863157894737e-05, "loss": 0.3457, "mean_copy_accuracy": 0.9882658123970032, "mean_gen_accuracy": 0.8607503175735474, "mean_token_accuracy": 0.8884276896715164, "num_tokens": 433161307.0, "sample_num_tokens": 10274.75, "step": 1600, "total_num_tokens": 433202406.0, "z_loss": 0.0022322931326925755 }, { "copy_logits_max": -2.9161181449890137, "copy_logits_min": -750000000.0, "copy_num_tokens": 226.0, "epoch": 0.32698493745213175, "gen_logits_max": 8.477150917053223, "gen_logits_mean": -10.947530746459961, "gen_logits_min": -21.715770721435547, "gen_logits_std": 2.3338611125946045, "gen_loss": 0.3602280020713806, "grad_norm": 0.511776803065985, "learning_rate": 2.8907368421052634e-05, "loss": 0.3445, "mean_copy_accuracy": 0.987541139125824, "mean_gen_accuracy": 0.8584116399288177, "mean_token_accuracy": 0.8861086964607239, "num_tokens": 433413386.0, "sample_num_tokens": 6946.0, "step": 1601, "total_num_tokens": 433441170.0, "z_loss": 0.001576162874698639 }, { "copy_logits_max": -0.5056377649307251, "copy_logits_min": -687500032.0, "copy_num_tokens": 331.3125, "epoch": 0.32718917538932857, "gen_logits_max": 8.612045288085938, "gen_logits_mean": -10.477937698364258, "gen_logits_min": -21.625991821289062, "gen_logits_std": 2.341482639312744, "gen_loss": 0.33172476291656494, "grad_norm": 0.5509913973882504, "learning_rate": 2.8906105263157895e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9902012497186661, "mean_gen_accuracy": 0.8589982241392136, "mean_token_accuracy": 0.8947702348232269, "num_tokens": 433696068.0, "sample_num_tokens": 7139.0, "step": 1602, "total_num_tokens": 433724624.0, "z_loss": 0.002009960124269128 }, { "copy_logits_max": -0.8851206302642822, "copy_logits_min": -750000064.0, "copy_num_tokens": 607.1875, "epoch": 0.3273934133265254, "gen_logits_max": 8.134371757507324, "gen_logits_mean": -10.333274841308594, "gen_logits_min": -21.484474182128906, "gen_logits_std": 2.3677752017974854, "gen_loss": 0.33386075496673584, "grad_norm": 0.6514176893223474, "learning_rate": 2.890484210526316e-05, "loss": 0.318, "mean_copy_accuracy": 0.9919632822275162, "mean_gen_accuracy": 0.8581512868404388, "mean_token_accuracy": 0.8974481672048569, "num_tokens": 433989352.0, "sample_num_tokens": 10074.5, "step": 1603, "total_num_tokens": 434029650.0, "z_loss": 0.0021370097529143095 }, { "copy_logits_max": 1.814119577407837, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.625, "epoch": 0.32759765126372226, "gen_logits_max": 7.625654697418213, "gen_logits_mean": -11.364875793457031, "gen_logits_min": -23.166292190551758, "gen_logits_std": 2.4317336082458496, "gen_loss": 0.30646055936813354, "grad_norm": 0.5321863645294299, "learning_rate": 2.890357894736842e-05, "loss": 0.347, "mean_copy_accuracy": 0.9916476309299469, "mean_gen_accuracy": 0.8531283438205719, "mean_token_accuracy": 0.8877504616975784, "num_tokens": 434273810.0, "sample_num_tokens": 8520.5, "step": 1604, "total_num_tokens": 434307892.0, "z_loss": 0.0020963819697499275 }, { "copy_logits_max": -1.0029442310333252, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.0, "epoch": 0.32780188920091907, "gen_logits_max": 7.313997268676758, "gen_logits_mean": -11.968222618103027, "gen_logits_min": -23.50347137451172, "gen_logits_std": 2.4000205993652344, "gen_loss": 0.3416309356689453, "grad_norm": 1.1664394098550361, "learning_rate": 2.8902315789473684e-05, "loss": 0.3497, "mean_copy_accuracy": 0.9911794662475586, "mean_gen_accuracy": 0.8514096587896347, "mean_token_accuracy": 0.8880755305290222, "num_tokens": 434537527.0, "sample_num_tokens": 8688.75, "step": 1605, "total_num_tokens": 434572282.0, "z_loss": 0.0017452409956604242 }, { "copy_logits_max": -4.544132232666016, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.3125, "epoch": 0.3280061271381159, "gen_logits_max": 8.34638500213623, "gen_logits_mean": -11.301198959350586, "gen_logits_min": -22.575145721435547, "gen_logits_std": 2.3564400672912598, "gen_loss": 0.37091612815856934, "grad_norm": 0.5019704082015646, "learning_rate": 2.8901052631578945e-05, "loss": 0.3297, "mean_copy_accuracy": 0.9909149557352066, "mean_gen_accuracy": 0.8611752688884735, "mean_token_accuracy": 0.8925967216491699, "num_tokens": 434803015.0, "sample_num_tokens": 7950.25, "step": 1606, "total_num_tokens": 434834816.0, "z_loss": 0.0013774435501545668 }, { "copy_logits_max": -2.1709303855895996, "copy_logits_min": -625000064.0, "copy_num_tokens": 357.5, "epoch": 0.32821036507531276, "gen_logits_max": 7.815752983093262, "gen_logits_mean": -11.319818496704102, "gen_logits_min": -22.672588348388672, "gen_logits_std": 2.388606071472168, "gen_loss": 0.35240286588668823, "grad_norm": 0.5262840136521605, "learning_rate": 2.8899789473684213e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9886609315872192, "mean_gen_accuracy": 0.8621739447116852, "mean_token_accuracy": 0.8867885023355484, "num_tokens": 435046713.0, "sample_num_tokens": 7377.25, "step": 1607, "total_num_tokens": 435076222.0, "z_loss": 0.0015301681123673916 }, { "copy_logits_max": -2.667720079421997, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.3125, "epoch": 0.3284146030125096, "gen_logits_max": 7.188079357147217, "gen_logits_mean": -12.292021751403809, "gen_logits_min": -23.740550994873047, "gen_logits_std": 2.404508590698242, "gen_loss": 0.3175662159919739, "grad_norm": 0.5279863139982144, "learning_rate": 2.8898526315789474e-05, "loss": 0.3445, "mean_copy_accuracy": 0.9880082756280899, "mean_gen_accuracy": 0.8589107692241669, "mean_token_accuracy": 0.8894055783748627, "num_tokens": 435317672.0, "sample_num_tokens": 8356.0, "step": 1608, "total_num_tokens": 435351096.0, "z_loss": 0.0016263246070593596 }, { "copy_logits_max": -5.2835693359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.0, "epoch": 0.3286188409497064, "gen_logits_max": 7.306205749511719, "gen_logits_mean": -12.131326675415039, "gen_logits_min": -23.10637855529785, "gen_logits_std": 2.354922294616699, "gen_loss": 0.358837366104126, "grad_norm": 0.49177740362960826, "learning_rate": 2.8897263157894738e-05, "loss": 0.3397, "mean_copy_accuracy": 0.9912473857402802, "mean_gen_accuracy": 0.8587256073951721, "mean_token_accuracy": 0.8896198868751526, "num_tokens": 435597232.0, "sample_num_tokens": 7339.0, "step": 1609, "total_num_tokens": 435626588.0, "z_loss": 0.0012596171582117677 }, { "copy_logits_max": -1.0877550840377808, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.4375, "epoch": 0.32882307888690326, "gen_logits_max": 7.501110076904297, "gen_logits_mean": -11.441854476928711, "gen_logits_min": -22.989051818847656, "gen_logits_std": 2.418670654296875, "gen_loss": 0.3041495680809021, "grad_norm": 0.5312076052846167, "learning_rate": 2.8896e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9887341558933258, "mean_gen_accuracy": 0.8632048815488815, "mean_token_accuracy": 0.8914323151111603, "num_tokens": 435825226.0, "sample_num_tokens": 7967.5, "step": 1610, "total_num_tokens": 435857096.0, "z_loss": 0.0016871410189196467 }, { "copy_logits_max": -4.58351993560791, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.5, "epoch": 0.3290273168241001, "gen_logits_max": 7.217021942138672, "gen_logits_mean": -12.840042114257812, "gen_logits_min": -23.745655059814453, "gen_logits_std": 2.3521602153778076, "gen_loss": 0.39521247148513794, "grad_norm": 0.5937028402343763, "learning_rate": 2.8894736842105263e-05, "loss": 0.371, "mean_copy_accuracy": 0.9870457202196121, "mean_gen_accuracy": 0.8501271903514862, "mean_token_accuracy": 0.8805023282766342, "num_tokens": 436079542.0, "sample_num_tokens": 8723.5, "step": 1611, "total_num_tokens": 436114436.0, "z_loss": 0.0013792063109576702 }, { "copy_logits_max": -3.6290783882141113, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.125, "epoch": 0.3292315547612969, "gen_logits_max": 8.268636703491211, "gen_logits_mean": -10.555742263793945, "gen_logits_min": -21.744171142578125, "gen_logits_std": 2.348498582839966, "gen_loss": 0.3188859820365906, "grad_norm": 0.5145542327132898, "learning_rate": 2.8893473684210528e-05, "loss": 0.3306, "mean_copy_accuracy": 0.9903802424669266, "mean_gen_accuracy": 0.8620174378156662, "mean_token_accuracy": 0.8917591571807861, "num_tokens": 436334985.0, "sample_num_tokens": 8194.25, "step": 1612, "total_num_tokens": 436367762.0, "z_loss": 0.0012314399937167764 }, { "copy_logits_max": -2.2706079483032227, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.75, "epoch": 0.32943579269849377, "gen_logits_max": 7.457466125488281, "gen_logits_mean": -11.25636100769043, "gen_logits_min": -22.71590805053711, "gen_logits_std": 2.405031204223633, "gen_loss": 0.30377769470214844, "grad_norm": 0.4751955224360536, "learning_rate": 2.889221052631579e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9915085136890411, "mean_gen_accuracy": 0.8544911891222, "mean_token_accuracy": 0.887297198176384, "num_tokens": 436599258.0, "sample_num_tokens": 8911.5, "step": 1613, "total_num_tokens": 436634904.0, "z_loss": 0.0014857032801955938 }, { "copy_logits_max": -3.285900115966797, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.625, "epoch": 0.3296400306356906, "gen_logits_max": 7.533535003662109, "gen_logits_mean": -11.083168029785156, "gen_logits_min": -22.811492919921875, "gen_logits_std": 2.430309295654297, "gen_loss": 0.35426968336105347, "grad_norm": 0.5253467292769389, "learning_rate": 2.8890947368421053e-05, "loss": 0.3471, "mean_copy_accuracy": 0.9925369173288345, "mean_gen_accuracy": 0.8533926010131836, "mean_token_accuracy": 0.8893505781888962, "num_tokens": 436873783.0, "sample_num_tokens": 8046.75, "step": 1614, "total_num_tokens": 436905970.0, "z_loss": 0.0013991311425343156 }, { "copy_logits_max": -2.985720157623291, "copy_logits_min": -687500032.0, "copy_num_tokens": 386.125, "epoch": 0.3298442685728874, "gen_logits_max": 7.3616838455200195, "gen_logits_mean": -11.654899597167969, "gen_logits_min": -23.068857192993164, "gen_logits_std": 2.4009461402893066, "gen_loss": 0.3318105936050415, "grad_norm": 0.4844877492132032, "learning_rate": 2.8889684210526317e-05, "loss": 0.314, "mean_copy_accuracy": 0.9908072352409363, "mean_gen_accuracy": 0.8705072999000549, "mean_token_accuracy": 0.897544652223587, "num_tokens": 437146857.0, "sample_num_tokens": 7666.75, "step": 1615, "total_num_tokens": 437177524.0, "z_loss": 0.0013322398299351335 }, { "copy_logits_max": 1.0652315616607666, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.375, "epoch": 0.33004850651008427, "gen_logits_max": 7.6006269454956055, "gen_logits_mean": -10.374122619628906, "gen_logits_min": -21.98189926147461, "gen_logits_std": 2.4218244552612305, "gen_loss": 0.37366074323654175, "grad_norm": 0.5152557329137406, "learning_rate": 2.888842105263158e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9920980930328369, "mean_gen_accuracy": 0.8545773774385452, "mean_token_accuracy": 0.8904073238372803, "num_tokens": 437402601.0, "sample_num_tokens": 8836.25, "step": 1616, "total_num_tokens": 437437946.0, "z_loss": 0.0017025473061949015 }, { "copy_logits_max": -2.660612106323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.375, "epoch": 0.3302527444472811, "gen_logits_max": 7.899256229400635, "gen_logits_mean": -10.149856567382812, "gen_logits_min": -21.084346771240234, "gen_logits_std": 2.3531620502471924, "gen_loss": 0.3092620372772217, "grad_norm": 0.5313328364870633, "learning_rate": 2.8887157894736843e-05, "loss": 0.3466, "mean_copy_accuracy": 0.9891432821750641, "mean_gen_accuracy": 0.8561189770698547, "mean_token_accuracy": 0.888518899679184, "num_tokens": 437678544.0, "sample_num_tokens": 8791.5, "step": 1617, "total_num_tokens": 437713710.0, "z_loss": 0.0012137824669480324 }, { "copy_logits_max": -2.2650275230407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.5625, "epoch": 0.3304569823844779, "gen_logits_max": 7.53884220123291, "gen_logits_mean": -11.278154373168945, "gen_logits_min": -22.717355728149414, "gen_logits_std": 2.4181418418884277, "gen_loss": 0.3228437602519989, "grad_norm": 0.5590413224102279, "learning_rate": 2.8885894736842107e-05, "loss": 0.334, "mean_copy_accuracy": 0.9901013523340225, "mean_gen_accuracy": 0.8597890585660934, "mean_token_accuracy": 0.8901169896125793, "num_tokens": 437956309.0, "sample_num_tokens": 8978.25, "step": 1618, "total_num_tokens": 437992222.0, "z_loss": 0.0013584229163825512 }, { "copy_logits_max": -3.292820453643799, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.6875, "epoch": 0.3306612203216748, "gen_logits_max": 7.807931900024414, "gen_logits_mean": -9.312596321105957, "gen_logits_min": -20.480621337890625, "gen_logits_std": 2.391146421432495, "gen_loss": 0.37473154067993164, "grad_norm": 0.4660504498150723, "learning_rate": 2.8884631578947368e-05, "loss": 0.3243, "mean_copy_accuracy": 0.992432028055191, "mean_gen_accuracy": 0.8640831261873245, "mean_token_accuracy": 0.8947473168373108, "num_tokens": 438229966.0, "sample_num_tokens": 9015.0, "step": 1619, "total_num_tokens": 438266026.0, "z_loss": 0.0015751319006085396 }, { "copy_logits_max": -1.9746854305267334, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.4375, "epoch": 0.3308654582588716, "gen_logits_max": 7.9510908126831055, "gen_logits_mean": -10.847146987915039, "gen_logits_min": -22.57978057861328, "gen_logits_std": 2.439228057861328, "gen_loss": 0.335536926984787, "grad_norm": 0.4572740784708241, "learning_rate": 2.8883368421052632e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9919924587011337, "mean_gen_accuracy": 0.8548468798398972, "mean_token_accuracy": 0.8901029676198959, "num_tokens": 438506916.0, "sample_num_tokens": 7342.0, "step": 1620, "total_num_tokens": 438536284.0, "z_loss": 0.0012936524581164122 }, { "copy_logits_max": -2.600294589996338, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.1875, "epoch": 0.3310696961960684, "gen_logits_max": 7.4198126792907715, "gen_logits_mean": -10.835583686828613, "gen_logits_min": -22.97470474243164, "gen_logits_std": 2.4734787940979004, "gen_loss": 0.27445653080940247, "grad_norm": 0.497628844794392, "learning_rate": 2.8882105263157893e-05, "loss": 0.3252, "mean_copy_accuracy": 0.9919266998767853, "mean_gen_accuracy": 0.8585918545722961, "mean_token_accuracy": 0.8931166529655457, "num_tokens": 438768761.0, "sample_num_tokens": 8526.75, "step": 1621, "total_num_tokens": 438802868.0, "z_loss": 0.001184162450954318 }, { "copy_logits_max": -4.6013689041137695, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.875, "epoch": 0.3312739341332653, "gen_logits_max": 9.316405296325684, "gen_logits_mean": -10.052972793579102, "gen_logits_min": -21.40920639038086, "gen_logits_std": 2.363534927368164, "gen_loss": 0.3855407238006592, "grad_norm": 0.5106494932071983, "learning_rate": 2.8880842105263157e-05, "loss": 0.3571, "mean_copy_accuracy": 0.9903390556573868, "mean_gen_accuracy": 0.8533650189638138, "mean_token_accuracy": 0.8826262056827545, "num_tokens": 439040378.0, "sample_num_tokens": 7563.0, "step": 1622, "total_num_tokens": 439070630.0, "z_loss": 0.0012907093623653054 }, { "copy_logits_max": -0.711921215057373, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.5, "epoch": 0.3314781720704621, "gen_logits_max": 8.11512279510498, "gen_logits_mean": -10.389074325561523, "gen_logits_min": -22.155242919921875, "gen_logits_std": 2.4477100372314453, "gen_loss": 0.42448434233665466, "grad_norm": 0.48207397138184827, "learning_rate": 2.887957894736842e-05, "loss": 0.3601, "mean_copy_accuracy": 0.9918211996555328, "mean_gen_accuracy": 0.8493051677942276, "mean_token_accuracy": 0.8883087635040283, "num_tokens": 439312472.0, "sample_num_tokens": 8565.5, "step": 1623, "total_num_tokens": 439346734.0, "z_loss": 0.001506877364590764 }, { "copy_logits_max": -1.7379111051559448, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.6875, "epoch": 0.3316824100076589, "gen_logits_max": 7.026393890380859, "gen_logits_mean": -11.918903350830078, "gen_logits_min": -23.792362213134766, "gen_logits_std": 2.453531503677368, "gen_loss": 0.3214540481567383, "grad_norm": 0.5660420041043066, "learning_rate": 2.8878315789473686e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9909865409135818, "mean_gen_accuracy": 0.8579195439815521, "mean_token_accuracy": 0.8888312876224518, "num_tokens": 439586364.0, "sample_num_tokens": 7889.5, "step": 1624, "total_num_tokens": 439617922.0, "z_loss": 0.0012776432558894157 }, { "copy_logits_max": -2.440751791000366, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.25, "epoch": 0.3318866479448558, "gen_logits_max": 7.569458961486816, "gen_logits_mean": -11.294477462768555, "gen_logits_min": -22.621885299682617, "gen_logits_std": 2.4391016960144043, "gen_loss": 0.355785071849823, "grad_norm": 0.5344926222455525, "learning_rate": 2.887705263157895e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9916019439697266, "mean_gen_accuracy": 0.8568583577871323, "mean_token_accuracy": 0.8901944011449814, "num_tokens": 439869548.0, "sample_num_tokens": 7455.0, "step": 1625, "total_num_tokens": 439899368.0, "z_loss": 0.001286619226448238 }, { "copy_logits_max": -0.5286793112754822, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.8125, "epoch": 0.3320908858820526, "gen_logits_max": 6.700095176696777, "gen_logits_mean": -10.992183685302734, "gen_logits_min": -23.025983810424805, "gen_logits_std": 2.440359592437744, "gen_loss": 0.31797853112220764, "grad_norm": 0.9263368241443014, "learning_rate": 2.887578947368421e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9909852743148804, "mean_gen_accuracy": 0.8526695072650909, "mean_token_accuracy": 0.8926311731338501, "num_tokens": 440153256.0, "sample_num_tokens": 8602.0, "step": 1626, "total_num_tokens": 440187664.0, "z_loss": 0.0014035656349733472 }, { "copy_logits_max": -1.1388846635818481, "copy_logits_min": -687500032.0, "copy_num_tokens": 797.5, "epoch": 0.3322951238192494, "gen_logits_max": 7.024485111236572, "gen_logits_mean": -10.06015396118164, "gen_logits_min": -21.57941436767578, "gen_logits_std": 2.408444404602051, "gen_loss": 0.2751710116863251, "grad_norm": 0.4583363647946626, "learning_rate": 2.8874526315789475e-05, "loss": 0.313, "mean_copy_accuracy": 0.9919780939817429, "mean_gen_accuracy": 0.8644433319568634, "mean_token_accuracy": 0.8974235653877258, "num_tokens": 440420080.0, "sample_num_tokens": 9707.5, "step": 1627, "total_num_tokens": 440458910.0, "z_loss": 0.0013864366337656975 }, { "copy_logits_max": -4.561285018920898, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.0, "epoch": 0.33249936175644623, "gen_logits_max": 7.645839214324951, "gen_logits_mean": -11.096138000488281, "gen_logits_min": -22.95061492919922, "gen_logits_std": 2.465823173522949, "gen_loss": 0.30265918374061584, "grad_norm": 0.5391268186806045, "learning_rate": 2.8873263157894736e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9922409653663635, "mean_gen_accuracy": 0.8579407036304474, "mean_token_accuracy": 0.8934226185083389, "num_tokens": 440709738.0, "sample_num_tokens": 9733.0, "step": 1628, "total_num_tokens": 440748670.0, "z_loss": 0.0014716439181938767 }, { "copy_logits_max": -3.6105611324310303, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.5, "epoch": 0.3327035996936431, "gen_logits_max": 7.595808506011963, "gen_logits_mean": -10.983959197998047, "gen_logits_min": -22.588836669921875, "gen_logits_std": 2.4331302642822266, "gen_loss": 0.3305017352104187, "grad_norm": 0.6738194932511228, "learning_rate": 2.8872e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9905354231595993, "mean_gen_accuracy": 0.8596846461296082, "mean_token_accuracy": 0.8984679728746414, "num_tokens": 440998015.0, "sample_num_tokens": 8429.75, "step": 1629, "total_num_tokens": 441031734.0, "z_loss": 0.0011722252238541842 }, { "copy_logits_max": -5.686519622802734, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.0625, "epoch": 0.3329078376308399, "gen_logits_max": 7.702354431152344, "gen_logits_mean": -11.25426959991455, "gen_logits_min": -22.522178649902344, "gen_logits_std": 2.413609504699707, "gen_loss": 0.3392215669155121, "grad_norm": 0.5624506906114449, "learning_rate": 2.887073684210526e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9917854070663452, "mean_gen_accuracy": 0.852911964058876, "mean_token_accuracy": 0.8866220265626907, "num_tokens": 441252184.0, "sample_num_tokens": 7111.0, "step": 1630, "total_num_tokens": 441280628.0, "z_loss": 0.0011238340521231294 }, { "copy_logits_max": -0.9549487233161926, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.8125, "epoch": 0.33311207556803674, "gen_logits_max": 7.40371036529541, "gen_logits_mean": -11.383491516113281, "gen_logits_min": -23.850780487060547, "gen_logits_std": 2.5138638019561768, "gen_loss": 0.30123794078826904, "grad_norm": 0.5030294807664675, "learning_rate": 2.886947368421053e-05, "loss": 0.3451, "mean_copy_accuracy": 0.9918467849493027, "mean_gen_accuracy": 0.8510201275348663, "mean_token_accuracy": 0.8865782022476196, "num_tokens": 441513743.0, "sample_num_tokens": 8416.25, "step": 1631, "total_num_tokens": 441547408.0, "z_loss": 0.0012191589921712875 }, { "copy_logits_max": -2.5946474075317383, "copy_logits_min": -750000000.0, "copy_num_tokens": 724.3125, "epoch": 0.3333163135052336, "gen_logits_max": 7.331665515899658, "gen_logits_mean": -11.147199630737305, "gen_logits_min": -22.85399627685547, "gen_logits_std": 2.4353554248809814, "gen_loss": 0.2903260886669159, "grad_norm": 0.49386527654010387, "learning_rate": 2.886821052631579e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9918251633644104, "mean_gen_accuracy": 0.8572267293930054, "mean_token_accuracy": 0.8923801779747009, "num_tokens": 441783283.0, "sample_num_tokens": 9947.75, "step": 1632, "total_num_tokens": 441823074.0, "z_loss": 0.0013395196292549372 }, { "copy_logits_max": -2.3158936500549316, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.1875, "epoch": 0.3335205514424304, "gen_logits_max": 8.062203407287598, "gen_logits_mean": -9.782777786254883, "gen_logits_min": -21.363706588745117, "gen_logits_std": 2.408559799194336, "gen_loss": 0.3415874242782593, "grad_norm": 1.4081646008142548, "learning_rate": 2.8866947368421055e-05, "loss": 0.3409, "mean_copy_accuracy": 0.9903731048107147, "mean_gen_accuracy": 0.85674849152565, "mean_token_accuracy": 0.889173686504364, "num_tokens": 442050246.0, "sample_num_tokens": 9394.0, "step": 1633, "total_num_tokens": 442087822.0, "z_loss": 0.0015062771271914244 }, { "copy_logits_max": -3.037051200866699, "copy_logits_min": -750000064.0, "copy_num_tokens": 404.375, "epoch": 0.33372478937962724, "gen_logits_max": 7.896135330200195, "gen_logits_mean": -11.269975662231445, "gen_logits_min": -22.782747268676758, "gen_logits_std": 2.4254887104034424, "gen_loss": 0.3468313217163086, "grad_norm": 0.5470669890034775, "learning_rate": 2.8865684210526315e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9906410127878189, "mean_gen_accuracy": 0.8572425246238708, "mean_token_accuracy": 0.8930850625038147, "num_tokens": 442348669.0, "sample_num_tokens": 8533.25, "step": 1634, "total_num_tokens": 442382802.0, "z_loss": 0.0013805897906422615 }, { "copy_logits_max": -5.207770824432373, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 0.3339290273168241, "gen_logits_max": 8.170589447021484, "gen_logits_mean": -11.707560539245605, "gen_logits_min": -22.983612060546875, "gen_logits_std": 2.3808493614196777, "gen_loss": 0.3493232727050781, "grad_norm": 0.5431517056189589, "learning_rate": 2.886442105263158e-05, "loss": 0.344, "mean_copy_accuracy": 0.990425780415535, "mean_gen_accuracy": 0.8546890169382095, "mean_token_accuracy": 0.8886680454015732, "num_tokens": 442636515.0, "sample_num_tokens": 9414.25, "step": 1635, "total_num_tokens": 442674172.0, "z_loss": 0.0012927516363561153 }, { "copy_logits_max": -3.6041617393493652, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.6875, "epoch": 0.33413326525402093, "gen_logits_max": 8.193498611450195, "gen_logits_mean": -10.813124656677246, "gen_logits_min": -22.164594650268555, "gen_logits_std": 2.410381317138672, "gen_loss": 0.33832553029060364, "grad_norm": 0.5676425258881559, "learning_rate": 2.886315789473684e-05, "loss": 0.3586, "mean_copy_accuracy": 0.9899365305900574, "mean_gen_accuracy": 0.8556516021490097, "mean_token_accuracy": 0.8831753730773926, "num_tokens": 442882535.0, "sample_num_tokens": 7777.25, "step": 1636, "total_num_tokens": 442913644.0, "z_loss": 0.001413532067090273 }, { "copy_logits_max": -3.9091012477874756, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.75, "epoch": 0.33433750319121774, "gen_logits_max": 8.062984466552734, "gen_logits_mean": -10.461905479431152, "gen_logits_min": -21.93099594116211, "gen_logits_std": 2.399559736251831, "gen_loss": 0.2781970500946045, "grad_norm": 1.022419961302367, "learning_rate": 2.8861894736842105e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9918225854635239, "mean_gen_accuracy": 0.8633589595556259, "mean_token_accuracy": 0.8971902579069138, "num_tokens": 443151094.0, "sample_num_tokens": 9366.5, "step": 1637, "total_num_tokens": 443188560.0, "z_loss": 0.0014021012466400862 }, { "copy_logits_max": -3.6047167778015137, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.1875, "epoch": 0.3345417411284146, "gen_logits_max": 7.250107765197754, "gen_logits_mean": -12.190251350402832, "gen_logits_min": -23.9167423248291, "gen_logits_std": 2.443136215209961, "gen_loss": 0.3499876856803894, "grad_norm": 0.4448645315107685, "learning_rate": 2.886063157894737e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9939824640750885, "mean_gen_accuracy": 0.8680742383003235, "mean_token_accuracy": 0.9004068076610565, "num_tokens": 443437733.0, "sample_num_tokens": 8734.75, "step": 1638, "total_num_tokens": 443472672.0, "z_loss": 0.0013661966659128666 }, { "copy_logits_max": -6.007340908050537, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.5625, "epoch": 0.33474597906561143, "gen_logits_max": 7.196323394775391, "gen_logits_mean": -12.85634994506836, "gen_logits_min": -23.96773910522461, "gen_logits_std": 2.3746848106384277, "gen_loss": 0.3184082806110382, "grad_norm": 1.3955273240274118, "learning_rate": 2.8859368421052634e-05, "loss": 0.3337, "mean_copy_accuracy": 0.9917604327201843, "mean_gen_accuracy": 0.8546061664819717, "mean_token_accuracy": 0.890251561999321, "num_tokens": 443712777.0, "sample_num_tokens": 7694.75, "step": 1639, "total_num_tokens": 443743556.0, "z_loss": 0.000986090861260891 }, { "copy_logits_max": -4.889103889465332, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.9375, "epoch": 0.33495021700280825, "gen_logits_max": 7.933485984802246, "gen_logits_mean": -10.528406143188477, "gen_logits_min": -21.827667236328125, "gen_logits_std": 2.387773036956787, "gen_loss": 0.3740333318710327, "grad_norm": 0.7561506894207171, "learning_rate": 2.8858105263157898e-05, "loss": 0.3598, "mean_copy_accuracy": 0.98977030813694, "mean_gen_accuracy": 0.8501304984092712, "mean_token_accuracy": 0.8851563036441803, "num_tokens": 443980169.0, "sample_num_tokens": 8902.75, "step": 1640, "total_num_tokens": 444015780.0, "z_loss": 0.0012476284755393863 }, { "copy_logits_max": -3.8588461875915527, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.0, "epoch": 0.3351544549400051, "gen_logits_max": 7.002851486206055, "gen_logits_mean": -11.889718055725098, "gen_logits_min": -22.923524856567383, "gen_logits_std": 2.3578529357910156, "gen_loss": 0.34949803352355957, "grad_norm": 0.5344527823913491, "learning_rate": 2.885684210526316e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9916345626115799, "mean_gen_accuracy": 0.8587311059236526, "mean_token_accuracy": 0.8956645429134369, "num_tokens": 444280783.0, "sample_num_tokens": 8452.75, "step": 1641, "total_num_tokens": 444314594.0, "z_loss": 0.0011888889130204916 }, { "copy_logits_max": -4.866097927093506, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.625, "epoch": 0.33535869287720194, "gen_logits_max": 7.182544708251953, "gen_logits_mean": -11.795536994934082, "gen_logits_min": -23.290653228759766, "gen_logits_std": 2.41843843460083, "gen_loss": 0.3143547475337982, "grad_norm": 0.538078374408685, "learning_rate": 2.8855578947368423e-05, "loss": 0.3427, "mean_copy_accuracy": 0.9923418909311295, "mean_gen_accuracy": 0.8529984354972839, "mean_token_accuracy": 0.8885541707277298, "num_tokens": 444556059.0, "sample_num_tokens": 9170.75, "step": 1642, "total_num_tokens": 444592742.0, "z_loss": 0.0012077277060598135 }, { "copy_logits_max": -4.826129913330078, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.0, "epoch": 0.33556293081439875, "gen_logits_max": 7.421723365783691, "gen_logits_mean": -11.05972671508789, "gen_logits_min": -22.908876419067383, "gen_logits_std": 2.4194540977478027, "gen_loss": 0.3337697982788086, "grad_norm": 0.5134296061516169, "learning_rate": 2.8854315789473684e-05, "loss": 0.3528, "mean_copy_accuracy": 0.9908579289913177, "mean_gen_accuracy": 0.8536703735589981, "mean_token_accuracy": 0.8864179700613022, "num_tokens": 444810985.0, "sample_num_tokens": 7759.75, "step": 1643, "total_num_tokens": 444842024.0, "z_loss": 0.0012006403412669897 }, { "copy_logits_max": -5.882440567016602, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.3125, "epoch": 0.3357671687515956, "gen_logits_max": 8.392961502075195, "gen_logits_mean": -9.94776439666748, "gen_logits_min": -21.359535217285156, "gen_logits_std": 2.40879487991333, "gen_loss": 0.3740108907222748, "grad_norm": 0.4994939267557885, "learning_rate": 2.885305263157895e-05, "loss": 0.3267, "mean_copy_accuracy": 0.9886698126792908, "mean_gen_accuracy": 0.8584564477205276, "mean_token_accuracy": 0.8941709846258163, "num_tokens": 445082406.0, "sample_num_tokens": 7605.5, "step": 1644, "total_num_tokens": 445112828.0, "z_loss": 0.0012125617358833551 }, { "copy_logits_max": -7.33576774597168, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.875, "epoch": 0.33597140668879244, "gen_logits_max": 7.529055595397949, "gen_logits_mean": -11.569173812866211, "gen_logits_min": -22.83228874206543, "gen_logits_std": 2.421998977661133, "gen_loss": 0.32147711515426636, "grad_norm": 0.8684908541062402, "learning_rate": 2.885178947368421e-05, "loss": 0.3507, "mean_copy_accuracy": 0.9881951212882996, "mean_gen_accuracy": 0.8571336567401886, "mean_token_accuracy": 0.8909032046794891, "num_tokens": 445345261.0, "sample_num_tokens": 8588.75, "step": 1645, "total_num_tokens": 445379616.0, "z_loss": 0.0009069910156540573 }, { "copy_logits_max": -5.541553974151611, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.375, "epoch": 0.33617564462598926, "gen_logits_max": 8.407630920410156, "gen_logits_mean": -10.345861434936523, "gen_logits_min": -21.897144317626953, "gen_logits_std": 2.4348597526550293, "gen_loss": 0.34981584548950195, "grad_norm": 0.5762843138115771, "learning_rate": 2.8850526315789474e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9912913292646408, "mean_gen_accuracy": 0.8579007238149643, "mean_token_accuracy": 0.8901818543672562, "num_tokens": 445611727.0, "sample_num_tokens": 8006.25, "step": 1646, "total_num_tokens": 445643752.0, "z_loss": 0.0012202711077407002 }, { "copy_logits_max": -3.7359070777893066, "copy_logits_min": -625000000.0, "copy_num_tokens": 711.1875, "epoch": 0.33637988256318613, "gen_logits_max": 7.53217077255249, "gen_logits_mean": -11.455926895141602, "gen_logits_min": -23.55701446533203, "gen_logits_std": 2.4734082221984863, "gen_loss": 0.27731889486312866, "grad_norm": 0.5747719728034979, "learning_rate": 2.8849263157894738e-05, "loss": 0.3418, "mean_copy_accuracy": 0.9910584539175034, "mean_gen_accuracy": 0.8536590933799744, "mean_token_accuracy": 0.8875574320554733, "num_tokens": 445868188.0, "sample_num_tokens": 10256.0, "step": 1647, "total_num_tokens": 445909212.0, "z_loss": 0.0013108381535857916 }, { "copy_logits_max": -2.8126492500305176, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.5, "epoch": 0.33658412050038294, "gen_logits_max": 7.1821160316467285, "gen_logits_mean": -11.662710189819336, "gen_logits_min": -23.483470916748047, "gen_logits_std": 2.451920986175537, "gen_loss": 0.33607953786849976, "grad_norm": 0.5683376531943287, "learning_rate": 2.8848000000000002e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9897398352622986, "mean_gen_accuracy": 0.8623306751251221, "mean_token_accuracy": 0.8942272812128067, "num_tokens": 446132856.0, "sample_num_tokens": 8632.0, "step": 1648, "total_num_tokens": 446167384.0, "z_loss": 0.0013275621458888054 }, { "copy_logits_max": -5.236918926239014, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.25, "epoch": 0.33678835843757976, "gen_logits_max": 8.057660102844238, "gen_logits_mean": -11.993553161621094, "gen_logits_min": -23.83384132385254, "gen_logits_std": 2.527067184448242, "gen_loss": 0.32314610481262207, "grad_norm": 0.5408653284532485, "learning_rate": 2.8846736842105263e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9886205792427063, "mean_gen_accuracy": 0.8577143549919128, "mean_token_accuracy": 0.8893611878156662, "num_tokens": 446398059.0, "sample_num_tokens": 7796.25, "step": 1649, "total_num_tokens": 446429244.0, "z_loss": 0.0012713521718978882 }, { "copy_logits_max": -4.256239414215088, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.5625, "epoch": 0.33699259637477663, "gen_logits_max": 8.15924072265625, "gen_logits_mean": -11.467645645141602, "gen_logits_min": -23.7981014251709, "gen_logits_std": 2.5467026233673096, "gen_loss": 0.3445678651332855, "grad_norm": 0.5046015951606453, "learning_rate": 2.8845473684210528e-05, "loss": 0.3443, "mean_copy_accuracy": 0.9904189854860306, "mean_gen_accuracy": 0.8603718727827072, "mean_token_accuracy": 0.8913093656301498, "num_tokens": 446657371.0, "sample_num_tokens": 6616.25, "step": 1650, "total_num_tokens": 446683836.0, "z_loss": 0.0011991560459136963 }, { "copy_logits_max": -5.452864646911621, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.625, "epoch": 0.33719683431197345, "gen_logits_max": 7.786219120025635, "gen_logits_mean": -11.902556419372559, "gen_logits_min": -23.640335083007812, "gen_logits_std": 2.4750916957855225, "gen_loss": 0.31461334228515625, "grad_norm": 0.4550171860164756, "learning_rate": 2.8844210526315792e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9936345815658569, "mean_gen_accuracy": 0.8638313710689545, "mean_token_accuracy": 0.8994615077972412, "num_tokens": 446933961.0, "sample_num_tokens": 7766.75, "step": 1651, "total_num_tokens": 446965028.0, "z_loss": 0.001207762979902327 }, { "copy_logits_max": -5.048978805541992, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.3125, "epoch": 0.33740107224917026, "gen_logits_max": 8.542835235595703, "gen_logits_mean": -10.736468315124512, "gen_logits_min": -22.712379455566406, "gen_logits_std": 2.497568368911743, "gen_loss": 0.34580397605895996, "grad_norm": 0.497958269806878, "learning_rate": 2.8842947368421053e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9924972653388977, "mean_gen_accuracy": 0.8561434596776962, "mean_token_accuracy": 0.8912785351276398, "num_tokens": 447211211.0, "sample_num_tokens": 8224.75, "step": 1652, "total_num_tokens": 447244110.0, "z_loss": 0.0012229151325300336 }, { "copy_logits_max": -5.416499137878418, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.125, "epoch": 0.33760531018636714, "gen_logits_max": 8.428446769714355, "gen_logits_mean": -10.521809577941895, "gen_logits_min": -22.875816345214844, "gen_logits_std": 2.575789451599121, "gen_loss": 0.30349159240722656, "grad_norm": 0.703510369742306, "learning_rate": 2.8841684210526317e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9903899282217026, "mean_gen_accuracy": 0.8629797101020813, "mean_token_accuracy": 0.8960239738225937, "num_tokens": 447508660.0, "sample_num_tokens": 8995.5, "step": 1653, "total_num_tokens": 447544642.0, "z_loss": 0.0010967471171170473 }, { "copy_logits_max": -2.4810900688171387, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.4375, "epoch": 0.33780954812356395, "gen_logits_max": 8.4899320602417, "gen_logits_mean": -10.951600074768066, "gen_logits_min": -22.652788162231445, "gen_logits_std": 2.4651975631713867, "gen_loss": 0.3735823631286621, "grad_norm": 0.5090130636593145, "learning_rate": 2.8840421052631578e-05, "loss": 0.3508, "mean_copy_accuracy": 0.9925210773944855, "mean_gen_accuracy": 0.8452001214027405, "mean_token_accuracy": 0.8855016678571701, "num_tokens": 447782913.0, "sample_num_tokens": 7730.75, "step": 1654, "total_num_tokens": 447813836.0, "z_loss": 0.001474105636589229 }, { "copy_logits_max": -3.3184680938720703, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.8125, "epoch": 0.33801378606076077, "gen_logits_max": 8.388696670532227, "gen_logits_mean": -10.64406967163086, "gen_logits_min": -22.637226104736328, "gen_logits_std": 2.521359443664551, "gen_loss": 0.3264027535915375, "grad_norm": 0.4944755491825922, "learning_rate": 2.8839157894736842e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9931683391332626, "mean_gen_accuracy": 0.862308531999588, "mean_token_accuracy": 0.8932584524154663, "num_tokens": 448047893.0, "sample_num_tokens": 8971.25, "step": 1655, "total_num_tokens": 448083778.0, "z_loss": 0.001568060601130128 }, { "copy_logits_max": -3.1068899631500244, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.3125, "epoch": 0.33821802399795764, "gen_logits_max": 8.028299331665039, "gen_logits_mean": -10.687051773071289, "gen_logits_min": -23.05752182006836, "gen_logits_std": 2.5037715435028076, "gen_loss": 0.32225608825683594, "grad_norm": 0.4693270348598354, "learning_rate": 2.8837894736842107e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9941737949848175, "mean_gen_accuracy": 0.8611819893121719, "mean_token_accuracy": 0.8925175666809082, "num_tokens": 448333724.0, "sample_num_tokens": 8766.5, "step": 1656, "total_num_tokens": 448368790.0, "z_loss": 0.0017857224447652698 }, { "copy_logits_max": -3.034301280975342, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5625, "epoch": 0.33842226193515446, "gen_logits_max": 8.126302719116211, "gen_logits_mean": -12.041236877441406, "gen_logits_min": -23.683944702148438, "gen_logits_std": 2.446479320526123, "gen_loss": 0.36166912317276, "grad_norm": 0.6027991977003551, "learning_rate": 2.883663157894737e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9880848526954651, "mean_gen_accuracy": 0.8529545664787292, "mean_token_accuracy": 0.8859591782093048, "num_tokens": 448587996.0, "sample_num_tokens": 7747.0, "step": 1657, "total_num_tokens": 448618984.0, "z_loss": 0.0016384045593440533 }, { "copy_logits_max": -2.102861166000366, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.125, "epoch": 0.33862649987235127, "gen_logits_max": 7.317147731781006, "gen_logits_mean": -11.412588119506836, "gen_logits_min": -23.49787139892578, "gen_logits_std": 2.474503755569458, "gen_loss": 0.3488486409187317, "grad_norm": 0.4731233414632751, "learning_rate": 2.8835368421052632e-05, "loss": 0.3306, "mean_copy_accuracy": 0.9920715391635895, "mean_gen_accuracy": 0.8573799580335617, "mean_token_accuracy": 0.8916445225477219, "num_tokens": 448859388.0, "sample_num_tokens": 8445.5, "step": 1658, "total_num_tokens": 448893170.0, "z_loss": 0.0016284673474729061 }, { "copy_logits_max": -6.198495864868164, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.9375, "epoch": 0.33883073780954814, "gen_logits_max": 7.917288780212402, "gen_logits_mean": -11.766672134399414, "gen_logits_min": -23.055490493774414, "gen_logits_std": 2.434758186340332, "gen_loss": 0.3164003789424896, "grad_norm": 0.5206102352355141, "learning_rate": 2.8834105263157896e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9903841018676758, "mean_gen_accuracy": 0.8658235520124435, "mean_token_accuracy": 0.8964205533266068, "num_tokens": 449137217.0, "sample_num_tokens": 7571.25, "step": 1659, "total_num_tokens": 449167502.0, "z_loss": 0.0010363432811573148 }, { "copy_logits_max": -3.548389196395874, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.6875, "epoch": 0.33903497574674496, "gen_logits_max": 7.19785213470459, "gen_logits_mean": -13.072877883911133, "gen_logits_min": -24.62735366821289, "gen_logits_std": 2.437732219696045, "gen_loss": 0.34534788131713867, "grad_norm": 0.519675588561916, "learning_rate": 2.8832842105263157e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9909464716911316, "mean_gen_accuracy": 0.8637096583843231, "mean_token_accuracy": 0.8918018788099289, "num_tokens": 449394161.0, "sample_num_tokens": 7929.75, "step": 1660, "total_num_tokens": 449425880.0, "z_loss": 0.001232003909535706 }, { "copy_logits_max": -5.258544445037842, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.0, "epoch": 0.3392392136839418, "gen_logits_max": 6.9934983253479, "gen_logits_mean": -13.200651168823242, "gen_logits_min": -24.57551383972168, "gen_logits_std": 2.4042470455169678, "gen_loss": 0.3007960319519043, "grad_norm": 0.5426283353663193, "learning_rate": 2.883157894736842e-05, "loss": 0.3412, "mean_copy_accuracy": 0.9907580316066742, "mean_gen_accuracy": 0.8565255254507065, "mean_token_accuracy": 0.8886529058218002, "num_tokens": 449642246.0, "sample_num_tokens": 7388.0, "step": 1661, "total_num_tokens": 449671798.0, "z_loss": 0.0010010795667767525 }, { "copy_logits_max": -4.516983985900879, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.1875, "epoch": 0.33944345162113865, "gen_logits_max": 7.234127044677734, "gen_logits_mean": -11.637880325317383, "gen_logits_min": -23.307632446289062, "gen_logits_std": 2.449942111968994, "gen_loss": 0.3179052770137787, "grad_norm": 0.48763907746653395, "learning_rate": 2.8830315789473682e-05, "loss": 0.3248, "mean_copy_accuracy": 0.9905376583337784, "mean_gen_accuracy": 0.8646610826253891, "mean_token_accuracy": 0.8932057768106461, "num_tokens": 449898292.0, "sample_num_tokens": 7704.0, "step": 1662, "total_num_tokens": 449929108.0, "z_loss": 0.0010717692784965038 }, { "copy_logits_max": -4.549973487854004, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.5, "epoch": 0.33964768955833546, "gen_logits_max": 8.05335521697998, "gen_logits_mean": -10.534185409545898, "gen_logits_min": -22.09462547302246, "gen_logits_std": 2.372007369995117, "gen_loss": 0.3468019366264343, "grad_norm": 0.5161456451985554, "learning_rate": 2.8829052631578947e-05, "loss": 0.344, "mean_copy_accuracy": 0.9917260110378265, "mean_gen_accuracy": 0.8522133082151413, "mean_token_accuracy": 0.886155441403389, "num_tokens": 450173868.0, "sample_num_tokens": 8798.0, "step": 1663, "total_num_tokens": 450209060.0, "z_loss": 0.0011057243682444096 }, { "copy_logits_max": -6.125582695007324, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.5625, "epoch": 0.3398519274955323, "gen_logits_max": 7.945181369781494, "gen_logits_mean": -11.52785587310791, "gen_logits_min": -22.462993621826172, "gen_logits_std": 2.4040355682373047, "gen_loss": 0.37836509943008423, "grad_norm": 0.4647340795225865, "learning_rate": 2.8827789473684214e-05, "loss": 0.3524, "mean_copy_accuracy": 0.9915287494659424, "mean_gen_accuracy": 0.8583480417728424, "mean_token_accuracy": 0.8846427351236343, "num_tokens": 450444107.0, "sample_num_tokens": 8429.25, "step": 1664, "total_num_tokens": 450477824.0, "z_loss": 0.001104129245504737 }, { "copy_logits_max": -0.7596722841262817, "copy_logits_min": -687500032.0, "copy_num_tokens": 697.8125, "epoch": 0.34005616543272915, "gen_logits_max": 7.465739727020264, "gen_logits_mean": -10.599943161010742, "gen_logits_min": -22.746383666992188, "gen_logits_std": 2.4942431449890137, "gen_loss": 0.28477099537849426, "grad_norm": 0.5134903837242001, "learning_rate": 2.8826526315789475e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9920259267091751, "mean_gen_accuracy": 0.8603198081254959, "mean_token_accuracy": 0.8977512717247009, "num_tokens": 450725676.0, "sample_num_tokens": 9275.0, "step": 1665, "total_num_tokens": 450762776.0, "z_loss": 0.0013000167673453689 }, { "copy_logits_max": -6.232794284820557, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.5, "epoch": 0.34026040336992597, "gen_logits_max": 7.236058712005615, "gen_logits_mean": -11.505165100097656, "gen_logits_min": -23.311922073364258, "gen_logits_std": 2.4832935333251953, "gen_loss": 0.3080581724643707, "grad_norm": 0.49753577570825086, "learning_rate": 2.882526315789474e-05, "loss": 0.3374, "mean_copy_accuracy": 0.9908712059259415, "mean_gen_accuracy": 0.862262174487114, "mean_token_accuracy": 0.8899014443159103, "num_tokens": 450985850.0, "sample_num_tokens": 10219.0, "step": 1666, "total_num_tokens": 451026726.0, "z_loss": 0.0009951975662261248 }, { "copy_logits_max": -4.321471691131592, "copy_logits_min": -562500032.0, "copy_num_tokens": 359.3125, "epoch": 0.3404646413071228, "gen_logits_max": 7.556775093078613, "gen_logits_mean": -11.730239868164062, "gen_logits_min": -23.361692428588867, "gen_logits_std": 2.4650964736938477, "gen_loss": 0.39291149377822876, "grad_norm": 0.5171752931685162, "learning_rate": 2.8824e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9902940094470978, "mean_gen_accuracy": 0.8561649918556213, "mean_token_accuracy": 0.886196106672287, "num_tokens": 451248039.0, "sample_num_tokens": 7705.75, "step": 1667, "total_num_tokens": 451278862.0, "z_loss": 0.001182570937089622 }, { "copy_logits_max": -7.11527156829834, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.5625, "epoch": 0.34066887924431966, "gen_logits_max": 7.90067195892334, "gen_logits_mean": -11.3607759475708, "gen_logits_min": -23.111915588378906, "gen_logits_std": 2.4681291580200195, "gen_loss": 0.3393673896789551, "grad_norm": 0.4619276066156092, "learning_rate": 2.8822736842105265e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9924197196960449, "mean_gen_accuracy": 0.8703518062829971, "mean_token_accuracy": 0.9024029076099396, "num_tokens": 451536887.0, "sample_num_tokens": 8435.25, "step": 1668, "total_num_tokens": 451570628.0, "z_loss": 0.001052288105711341 }, { "copy_logits_max": -4.397934913635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 679.625, "epoch": 0.34087311718151647, "gen_logits_max": 7.601624488830566, "gen_logits_mean": -10.24095344543457, "gen_logits_min": -21.95599365234375, "gen_logits_std": 2.4564566612243652, "gen_loss": 0.2903293967247009, "grad_norm": 0.5434898858171661, "learning_rate": 2.8821473684210526e-05, "loss": 0.3371, "mean_copy_accuracy": 0.9920570999383926, "mean_gen_accuracy": 0.8546715080738068, "mean_token_accuracy": 0.8906738460063934, "num_tokens": 451804205.0, "sample_num_tokens": 9792.75, "step": 1669, "total_num_tokens": 451843376.0, "z_loss": 0.001262695761397481 }, { "copy_logits_max": -4.356009483337402, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.375, "epoch": 0.3410773551187133, "gen_logits_max": 8.645030975341797, "gen_logits_mean": -10.045780181884766, "gen_logits_min": -21.68382453918457, "gen_logits_std": 2.4313268661499023, "gen_loss": 0.4246717691421509, "grad_norm": 0.5011530676243364, "learning_rate": 2.882021052631579e-05, "loss": 0.3527, "mean_copy_accuracy": 0.9921908974647522, "mean_gen_accuracy": 0.8550092875957489, "mean_token_accuracy": 0.8852114230394363, "num_tokens": 452075155.0, "sample_num_tokens": 7660.25, "step": 1670, "total_num_tokens": 452105796.0, "z_loss": 0.0013170524034649134 }, { "copy_logits_max": -5.935249328613281, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.5625, "epoch": 0.34128159305591016, "gen_logits_max": 7.362052917480469, "gen_logits_mean": -11.369619369506836, "gen_logits_min": -22.846027374267578, "gen_logits_std": 2.4512417316436768, "gen_loss": 0.28262364864349365, "grad_norm": 0.4470706856546976, "learning_rate": 2.881894736842105e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9933608919382095, "mean_gen_accuracy": 0.8636884987354279, "mean_token_accuracy": 0.9040986597537994, "num_tokens": 452371575.0, "sample_num_tokens": 7501.75, "step": 1671, "total_num_tokens": 452401582.0, "z_loss": 0.0011008840519934893 }, { "copy_logits_max": -6.800850868225098, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.625, "epoch": 0.341485830993107, "gen_logits_max": 7.16383171081543, "gen_logits_mean": -11.933534622192383, "gen_logits_min": -23.404083251953125, "gen_logits_std": 2.422133445739746, "gen_loss": 0.2891198992729187, "grad_norm": 0.4679196945667553, "learning_rate": 2.881768421052632e-05, "loss": 0.3174, "mean_copy_accuracy": 0.992252990603447, "mean_gen_accuracy": 0.8648483902215958, "mean_token_accuracy": 0.8952498286962509, "num_tokens": 452632845.0, "sample_num_tokens": 8549.75, "step": 1672, "total_num_tokens": 452667044.0, "z_loss": 0.0009572784183546901 }, { "copy_logits_max": -5.472888946533203, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.25, "epoch": 0.3416900689303038, "gen_logits_max": 7.030146598815918, "gen_logits_mean": -11.471288681030273, "gen_logits_min": -22.943946838378906, "gen_logits_std": 2.434767961502075, "gen_loss": 0.302747905254364, "grad_norm": 0.5214515057612794, "learning_rate": 2.881642105263158e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9917730689048767, "mean_gen_accuracy": 0.8664406538009644, "mean_token_accuracy": 0.8973113596439362, "num_tokens": 452915305.0, "sample_num_tokens": 8889.75, "step": 1673, "total_num_tokens": 452950864.0, "z_loss": 0.0010673528304323554 }, { "copy_logits_max": -7.806108474731445, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.5625, "epoch": 0.34189430686750066, "gen_logits_max": 8.450052261352539, "gen_logits_mean": -10.653448104858398, "gen_logits_min": -21.93606948852539, "gen_logits_std": 2.4151782989501953, "gen_loss": 0.34021466970443726, "grad_norm": 0.4988743400605345, "learning_rate": 2.8815157894736844e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9920065850019455, "mean_gen_accuracy": 0.857305184006691, "mean_token_accuracy": 0.887809082865715, "num_tokens": 453185394.0, "sample_num_tokens": 7895.0, "step": 1674, "total_num_tokens": 453216974.0, "z_loss": 0.001105748931877315 }, { "copy_logits_max": -7.4049201011657715, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.8125, "epoch": 0.3420985448046975, "gen_logits_max": 6.947683334350586, "gen_logits_mean": -11.41147518157959, "gen_logits_min": -22.22439956665039, "gen_logits_std": 2.349045753479004, "gen_loss": 0.3090311288833618, "grad_norm": 0.4848894325945269, "learning_rate": 2.8813894736842105e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9908924400806427, "mean_gen_accuracy": 0.8655388057231903, "mean_token_accuracy": 0.8957892060279846, "num_tokens": 453454596.0, "sample_num_tokens": 7947.0, "step": 1675, "total_num_tokens": 453486384.0, "z_loss": 0.001014698063954711 }, { "copy_logits_max": -3.738015651702881, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.75, "epoch": 0.3423027827418943, "gen_logits_max": 7.946650505065918, "gen_logits_mean": -10.600461959838867, "gen_logits_min": -21.985332489013672, "gen_logits_std": 2.406663179397583, "gen_loss": 0.3592774271965027, "grad_norm": 0.6527523419064782, "learning_rate": 2.881263157894737e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9900457113981247, "mean_gen_accuracy": 0.8586776107549667, "mean_token_accuracy": 0.8902757167816162, "num_tokens": 453722179.0, "sample_num_tokens": 8329.25, "step": 1676, "total_num_tokens": 453755496.0, "z_loss": 0.00118416512850672 }, { "copy_logits_max": -4.7468461990356445, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.1875, "epoch": 0.34250702067909117, "gen_logits_max": 8.563899993896484, "gen_logits_mean": -10.624176025390625, "gen_logits_min": -21.83185577392578, "gen_logits_std": 2.392601251602173, "gen_loss": 0.3414771854877472, "grad_norm": 0.49424501134051935, "learning_rate": 2.881136842105263e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9934472590684891, "mean_gen_accuracy": 0.8600018620491028, "mean_token_accuracy": 0.8945398777723312, "num_tokens": 453979529.0, "sample_num_tokens": 8279.25, "step": 1677, "total_num_tokens": 454012646.0, "z_loss": 0.001244035316631198 }, { "copy_logits_max": -4.199594020843506, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.0625, "epoch": 0.342711258616288, "gen_logits_max": 7.69507360458374, "gen_logits_mean": -11.168903350830078, "gen_logits_min": -22.553287506103516, "gen_logits_std": 2.443054437637329, "gen_loss": 0.31744176149368286, "grad_norm": 0.4869185655370003, "learning_rate": 2.8810105263157894e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9940889030694962, "mean_gen_accuracy": 0.8601996004581451, "mean_token_accuracy": 0.89571613073349, "num_tokens": 454264232.0, "sample_num_tokens": 9698.0, "step": 1678, "total_num_tokens": 454303024.0, "z_loss": 0.0012194991577416658 }, { "copy_logits_max": -2.8016180992126465, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.25, "epoch": 0.3429154965534848, "gen_logits_max": 7.63431453704834, "gen_logits_mean": -11.237470626831055, "gen_logits_min": -22.63504981994629, "gen_logits_std": 2.398113250732422, "gen_loss": 0.2900235950946808, "grad_norm": 0.6982791341823267, "learning_rate": 2.880884210526316e-05, "loss": 0.319, "mean_copy_accuracy": 0.9929011315107346, "mean_gen_accuracy": 0.864034429192543, "mean_token_accuracy": 0.8954989165067673, "num_tokens": 454530555.0, "sample_num_tokens": 8383.25, "step": 1679, "total_num_tokens": 454564088.0, "z_loss": 0.0011063781566917896 }, { "copy_logits_max": -7.070910930633545, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.0, "epoch": 0.34311973449068167, "gen_logits_max": 8.155289649963379, "gen_logits_mean": -11.098851203918457, "gen_logits_min": -22.40403175354004, "gen_logits_std": 2.4040534496307373, "gen_loss": 0.26295173168182373, "grad_norm": 0.4761281069016211, "learning_rate": 2.8807578947368423e-05, "loss": 0.3177, "mean_copy_accuracy": 0.993418276309967, "mean_gen_accuracy": 0.868148073554039, "mean_token_accuracy": 0.8968604505062103, "num_tokens": 454802688.0, "sample_num_tokens": 8274.0, "step": 1680, "total_num_tokens": 454835784.0, "z_loss": 0.001016627880744636 }, { "copy_logits_max": -4.91545295715332, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.5, "epoch": 0.3433239724278785, "gen_logits_max": 7.729623794555664, "gen_logits_mean": -11.902013778686523, "gen_logits_min": -23.181884765625, "gen_logits_std": 2.4063401222229004, "gen_loss": 0.37200701236724854, "grad_norm": 0.5626467064097554, "learning_rate": 2.8806315789473687e-05, "loss": 0.3389, "mean_copy_accuracy": 0.9897422343492508, "mean_gen_accuracy": 0.8588421046733856, "mean_token_accuracy": 0.8890513330698013, "num_tokens": 455073847.0, "sample_num_tokens": 8173.75, "step": 1681, "total_num_tokens": 455106542.0, "z_loss": 0.0011065731523558497 }, { "copy_logits_max": -3.2334413528442383, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.875, "epoch": 0.3435282103650753, "gen_logits_max": 7.806114673614502, "gen_logits_mean": -11.284273147583008, "gen_logits_min": -22.618568420410156, "gen_logits_std": 2.4382123947143555, "gen_loss": 0.36539599299430847, "grad_norm": 0.6140576123867721, "learning_rate": 2.8805052631578948e-05, "loss": 0.3413, "mean_copy_accuracy": 0.9930705279111862, "mean_gen_accuracy": 0.851899191737175, "mean_token_accuracy": 0.8881153911352158, "num_tokens": 455338840.0, "sample_num_tokens": 8748.5, "step": 1682, "total_num_tokens": 455373834.0, "z_loss": 0.0012824618024751544 }, { "copy_logits_max": -3.9879021644592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.3437324483022721, "gen_logits_max": 7.450194835662842, "gen_logits_mean": -12.066461563110352, "gen_logits_min": -23.602201461791992, "gen_logits_std": 2.469029426574707, "gen_loss": 0.32230502367019653, "grad_norm": 0.4967072709470395, "learning_rate": 2.8803789473684213e-05, "loss": 0.3476, "mean_copy_accuracy": 0.9924278110265732, "mean_gen_accuracy": 0.8528517037630081, "mean_token_accuracy": 0.8888479471206665, "num_tokens": 455600286.0, "sample_num_tokens": 7695.5, "step": 1683, "total_num_tokens": 455631068.0, "z_loss": 0.0011812427546828985 }, { "copy_logits_max": -4.1428117752075195, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.75, "epoch": 0.343936686239469, "gen_logits_max": 7.719696998596191, "gen_logits_mean": -11.152751922607422, "gen_logits_min": -22.74747085571289, "gen_logits_std": 2.4540884494781494, "gen_loss": 0.3589620590209961, "grad_norm": 0.5730288152862136, "learning_rate": 2.8802526315789473e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9900717437267303, "mean_gen_accuracy": 0.8603982627391815, "mean_token_accuracy": 0.8902806341648102, "num_tokens": 455858162.0, "sample_num_tokens": 7748.0, "step": 1684, "total_num_tokens": 455889154.0, "z_loss": 0.0011271428084000945 }, { "copy_logits_max": -3.5297274589538574, "copy_logits_min": -687500032.0, "copy_num_tokens": 620.125, "epoch": 0.3441409241766658, "gen_logits_max": 6.527235984802246, "gen_logits_mean": -11.967720985412598, "gen_logits_min": -23.434436798095703, "gen_logits_std": 2.435561180114746, "gen_loss": 0.3060075044631958, "grad_norm": 0.49284882068651487, "learning_rate": 2.8801263157894738e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9915674924850464, "mean_gen_accuracy": 0.8568530976772308, "mean_token_accuracy": 0.8908811658620834, "num_tokens": 456119164.0, "sample_num_tokens": 9680.0, "step": 1685, "total_num_tokens": 456157884.0, "z_loss": 0.0010789668885990977 }, { "copy_logits_max": -6.48886775970459, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.0, "epoch": 0.3443451621138626, "gen_logits_max": 7.144810676574707, "gen_logits_mean": -12.629684448242188, "gen_logits_min": -23.842567443847656, "gen_logits_std": 2.4322915077209473, "gen_loss": 0.3246920108795166, "grad_norm": 0.43954654484972583, "learning_rate": 2.88e-05, "loss": 0.3206, "mean_copy_accuracy": 0.9915266633033752, "mean_gen_accuracy": 0.8630923181772232, "mean_token_accuracy": 0.8923789262771606, "num_tokens": 456390839.0, "sample_num_tokens": 7758.25, "step": 1686, "total_num_tokens": 456421872.0, "z_loss": 0.0010560940718278289 }, { "copy_logits_max": -4.449774742126465, "copy_logits_min": -750000064.0, "copy_num_tokens": 384.1875, "epoch": 0.3445494000510595, "gen_logits_max": 7.6383957862854, "gen_logits_mean": -10.224452018737793, "gen_logits_min": -21.418745040893555, "gen_logits_std": 2.4122772216796875, "gen_loss": 0.330240935087204, "grad_norm": 0.5994265269211084, "learning_rate": 2.8798736842105263e-05, "loss": 0.3454, "mean_copy_accuracy": 0.991709977388382, "mean_gen_accuracy": 0.8558528274297714, "mean_token_accuracy": 0.8888442814350128, "num_tokens": 456654938.0, "sample_num_tokens": 7546.0, "step": 1687, "total_num_tokens": 456685122.0, "z_loss": 0.0012479664292186499 }, { "copy_logits_max": -6.038246154785156, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.1875, "epoch": 0.3447536379882563, "gen_logits_max": 7.568937301635742, "gen_logits_mean": -11.903596878051758, "gen_logits_min": -23.106246948242188, "gen_logits_std": 2.403285026550293, "gen_loss": 0.3475583791732788, "grad_norm": 0.4657944047433495, "learning_rate": 2.8797473684210527e-05, "loss": 0.3311, "mean_copy_accuracy": 0.9925592839717865, "mean_gen_accuracy": 0.8625020235776901, "mean_token_accuracy": 0.8920150995254517, "num_tokens": 456922322.0, "sample_num_tokens": 7988.5, "step": 1688, "total_num_tokens": 456954276.0, "z_loss": 0.001087843906134367 }, { "copy_logits_max": -6.116949081420898, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.0625, "epoch": 0.34495787592545313, "gen_logits_max": 7.889260292053223, "gen_logits_mean": -10.864385604858398, "gen_logits_min": -22.427297592163086, "gen_logits_std": 2.435375690460205, "gen_loss": 0.3377273380756378, "grad_norm": 0.6120687623556723, "learning_rate": 2.879621052631579e-05, "loss": 0.3364, "mean_copy_accuracy": 0.9880986958742142, "mean_gen_accuracy": 0.8627682030200958, "mean_token_accuracy": 0.8913407325744629, "num_tokens": 457187068.0, "sample_num_tokens": 7752.0, "step": 1689, "total_num_tokens": 457218076.0, "z_loss": 0.0010702258441597223 }, { "copy_logits_max": -7.384502410888672, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.75, "epoch": 0.34516211386265, "gen_logits_max": 7.170419692993164, "gen_logits_mean": -11.668889999389648, "gen_logits_min": -22.521961212158203, "gen_logits_std": 2.3299551010131836, "gen_loss": 0.3249122202396393, "grad_norm": 0.41999328094996224, "learning_rate": 2.8794947368421053e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9921323210000992, "mean_gen_accuracy": 0.8678636401891708, "mean_token_accuracy": 0.897349089384079, "num_tokens": 457463476.0, "sample_num_tokens": 8197.0, "step": 1690, "total_num_tokens": 457496264.0, "z_loss": 0.0010415789438411593 }, { "copy_logits_max": -4.568436145782471, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.5, "epoch": 0.3453663517998468, "gen_logits_max": 6.718609809875488, "gen_logits_mean": -12.076433181762695, "gen_logits_min": -23.37632179260254, "gen_logits_std": 2.3753786087036133, "gen_loss": 0.3159777820110321, "grad_norm": 0.5205966016828741, "learning_rate": 2.8793684210526317e-05, "loss": 0.3515, "mean_copy_accuracy": 0.9919420629739761, "mean_gen_accuracy": 0.8578048944473267, "mean_token_accuracy": 0.886698842048645, "num_tokens": 457717738.0, "sample_num_tokens": 9770.0, "step": 1691, "total_num_tokens": 457756818.0, "z_loss": 0.0011428407160565257 }, { "copy_logits_max": -4.114992141723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.3125, "epoch": 0.34557058973704363, "gen_logits_max": 7.314719200134277, "gen_logits_mean": -10.664860725402832, "gen_logits_min": -21.823612213134766, "gen_logits_std": 2.390455961227417, "gen_loss": 0.3905351758003235, "grad_norm": 0.5109119455223601, "learning_rate": 2.879242105263158e-05, "loss": 0.3647, "mean_copy_accuracy": 0.991919070482254, "mean_gen_accuracy": 0.847450390458107, "mean_token_accuracy": 0.8805781751871109, "num_tokens": 457983379.0, "sample_num_tokens": 8465.75, "step": 1692, "total_num_tokens": 458017242.0, "z_loss": 0.0014108887407928705 }, { "copy_logits_max": -4.317937850952148, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.25, "epoch": 0.3457748276742405, "gen_logits_max": 6.957571983337402, "gen_logits_mean": -11.740869522094727, "gen_logits_min": -23.169586181640625, "gen_logits_std": 2.3669214248657227, "gen_loss": 0.31969690322875977, "grad_norm": 0.4979873482230945, "learning_rate": 2.8791157894736842e-05, "loss": 0.343, "mean_copy_accuracy": 0.9922025948762894, "mean_gen_accuracy": 0.8565112501382828, "mean_token_accuracy": 0.8897563070058823, "num_tokens": 458242797.0, "sample_num_tokens": 9458.25, "step": 1693, "total_num_tokens": 458280630.0, "z_loss": 0.0011749870609492064 }, { "copy_logits_max": -6.223237037658691, "copy_logits_min": -750000000.0, "copy_num_tokens": 266.4375, "epoch": 0.3459790656114373, "gen_logits_max": 7.2462615966796875, "gen_logits_mean": -11.726476669311523, "gen_logits_min": -22.6422119140625, "gen_logits_std": 2.344385862350464, "gen_loss": 0.36485135555267334, "grad_norm": 0.615746969849314, "learning_rate": 2.8789894736842106e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9908106178045273, "mean_gen_accuracy": 0.8580866008996964, "mean_token_accuracy": 0.8899373114109039, "num_tokens": 458530232.0, "sample_num_tokens": 7148.0, "step": 1694, "total_num_tokens": 458558824.0, "z_loss": 0.0010918679181486368 }, { "copy_logits_max": -4.599740028381348, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.5, "epoch": 0.34618330354863414, "gen_logits_max": 6.071182727813721, "gen_logits_mean": -11.283946990966797, "gen_logits_min": -22.66446876525879, "gen_logits_std": 2.3885130882263184, "gen_loss": 0.33040064573287964, "grad_norm": 0.5132435827585057, "learning_rate": 2.8788631578947367e-05, "loss": 0.3676, "mean_copy_accuracy": 0.9929088205099106, "mean_gen_accuracy": 0.8434859663248062, "mean_token_accuracy": 0.8806596994400024, "num_tokens": 458791532.0, "sample_num_tokens": 8646.0, "step": 1695, "total_num_tokens": 458826116.0, "z_loss": 0.0011471811449155211 }, { "copy_logits_max": -5.630272388458252, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.25, "epoch": 0.346387541485831, "gen_logits_max": 6.864973545074463, "gen_logits_mean": -10.82402229309082, "gen_logits_min": -22.121963500976562, "gen_logits_std": 2.366969108581543, "gen_loss": 0.3682320713996887, "grad_norm": 0.5366117422547042, "learning_rate": 2.878736842105263e-05, "loss": 0.3425, "mean_copy_accuracy": 0.9927557408809662, "mean_gen_accuracy": 0.850858211517334, "mean_token_accuracy": 0.8882057666778564, "num_tokens": 459043252.0, "sample_num_tokens": 9143.5, "step": 1696, "total_num_tokens": 459079826.0, "z_loss": 0.001088154036551714 }, { "copy_logits_max": -6.617408752441406, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.9375, "epoch": 0.3465917794230278, "gen_logits_max": 6.693604469299316, "gen_logits_mean": -10.98281478881836, "gen_logits_min": -22.129701614379883, "gen_logits_std": 2.3387064933776855, "gen_loss": 0.3483414947986603, "grad_norm": 0.5079950799134058, "learning_rate": 2.8786105263157896e-05, "loss": 0.3634, "mean_copy_accuracy": 0.9924174100160599, "mean_gen_accuracy": 0.8477097749710083, "mean_token_accuracy": 0.8836879283189774, "num_tokens": 459315753.0, "sample_num_tokens": 9599.75, "step": 1697, "total_num_tokens": 459354152.0, "z_loss": 0.001026761019602418 }, { "copy_logits_max": -7.4430341720581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.8125, "epoch": 0.34679601736022464, "gen_logits_max": 6.877197265625, "gen_logits_mean": -10.818323135375977, "gen_logits_min": -21.433185577392578, "gen_logits_std": 2.318049192428589, "gen_loss": 0.32851627469062805, "grad_norm": 0.4686508263304221, "learning_rate": 2.878484210526316e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9931961596012115, "mean_gen_accuracy": 0.856142207980156, "mean_token_accuracy": 0.88690185546875, "num_tokens": 459589876.0, "sample_num_tokens": 7659.5, "step": 1698, "total_num_tokens": 459620514.0, "z_loss": 0.0009609570261090994 }, { "copy_logits_max": -4.335029125213623, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.0625, "epoch": 0.3470002552974215, "gen_logits_max": 6.096188545227051, "gen_logits_mean": -11.91993522644043, "gen_logits_min": -22.840805053710938, "gen_logits_std": 2.3240010738372803, "gen_loss": 0.33501124382019043, "grad_norm": 0.45572130242068065, "learning_rate": 2.878357894736842e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9925855249166489, "mean_gen_accuracy": 0.8599824160337448, "mean_token_accuracy": 0.8943259567022324, "num_tokens": 459861485.0, "sample_num_tokens": 8871.75, "step": 1699, "total_num_tokens": 459896972.0, "z_loss": 0.0010518089402467012 }, { "copy_logits_max": -5.934962749481201, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.6875, "epoch": 0.3472044932346183, "gen_logits_max": 7.067615985870361, "gen_logits_mean": -10.474373817443848, "gen_logits_min": -20.992332458496094, "gen_logits_std": 2.2932958602905273, "gen_loss": 0.346670925617218, "grad_norm": 0.43570546696247925, "learning_rate": 2.8782315789473686e-05, "loss": 0.3432, "mean_copy_accuracy": 0.9935245215892792, "mean_gen_accuracy": 0.8526474386453629, "mean_token_accuracy": 0.8865745961666107, "num_tokens": 460137244.0, "sample_num_tokens": 8745.0, "step": 1700, "total_num_tokens": 460172224.0, "z_loss": 0.0011256226571276784 }, { "copy_logits_max": -7.823790550231934, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.375, "epoch": 0.34740873117181514, "gen_logits_max": 7.208853721618652, "gen_logits_mean": -10.806272506713867, "gen_logits_min": -21.749284744262695, "gen_logits_std": 2.3216300010681152, "gen_loss": 0.31760749220848083, "grad_norm": 0.46511370274016456, "learning_rate": 2.8781052631578946e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9919128865003586, "mean_gen_accuracy": 0.8640872687101364, "mean_token_accuracy": 0.8951893001794815, "num_tokens": 460410033.0, "sample_num_tokens": 7148.25, "step": 1701, "total_num_tokens": 460438626.0, "z_loss": 0.0010045459493994713 }, { "copy_logits_max": -6.769231796264648, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.8125, "epoch": 0.347612969109012, "gen_logits_max": 7.324893951416016, "gen_logits_mean": -10.821390151977539, "gen_logits_min": -22.471660614013672, "gen_logits_std": 2.4155147075653076, "gen_loss": 0.3209017515182495, "grad_norm": 0.4979825271340299, "learning_rate": 2.877978947368421e-05, "loss": 0.3155, "mean_copy_accuracy": 0.991254135966301, "mean_gen_accuracy": 0.8637792617082596, "mean_token_accuracy": 0.8971830755472183, "num_tokens": 460678279.0, "sample_num_tokens": 8032.25, "step": 1702, "total_num_tokens": 460710408.0, "z_loss": 0.0011228976072743535 }, { "copy_logits_max": -5.070677280426025, "copy_logits_min": -687500032.0, "copy_num_tokens": 428.625, "epoch": 0.34781720704620883, "gen_logits_max": 7.194779872894287, "gen_logits_mean": -11.330107688903809, "gen_logits_min": -22.46727752685547, "gen_logits_std": 2.359205722808838, "gen_loss": 0.3628024458885193, "grad_norm": 0.5226452700883749, "learning_rate": 2.877852631578947e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9931023120880127, "mean_gen_accuracy": 0.8602136671543121, "mean_token_accuracy": 0.8942694962024689, "num_tokens": 460956677.0, "sample_num_tokens": 8399.25, "step": 1703, "total_num_tokens": 460990274.0, "z_loss": 0.001205133623443544 }, { "copy_logits_max": -3.1188814640045166, "copy_logits_min": -625000064.0, "copy_num_tokens": 586.875, "epoch": 0.34802144498340565, "gen_logits_max": 5.976482391357422, "gen_logits_mean": -11.886640548706055, "gen_logits_min": -23.5155086517334, "gen_logits_std": 2.430405616760254, "gen_loss": 0.2775299549102783, "grad_norm": 0.4627403825191574, "learning_rate": 2.8777263157894736e-05, "loss": 0.3267, "mean_copy_accuracy": 0.9935162514448166, "mean_gen_accuracy": 0.8587010651826859, "mean_token_accuracy": 0.8951523154973984, "num_tokens": 461252223.0, "sample_num_tokens": 8500.75, "step": 1704, "total_num_tokens": 461286226.0, "z_loss": 0.0012974650599062443 }, { "copy_logits_max": -4.986720085144043, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.375, "epoch": 0.3482256829206025, "gen_logits_max": 6.5445356369018555, "gen_logits_mean": -11.177679061889648, "gen_logits_min": -22.898401260375977, "gen_logits_std": 2.448840856552124, "gen_loss": 0.3202379047870636, "grad_norm": 0.4800206271683528, "learning_rate": 2.8776000000000004e-05, "loss": 0.3373, "mean_copy_accuracy": 0.9921656847000122, "mean_gen_accuracy": 0.8581783175468445, "mean_token_accuracy": 0.8900352269411087, "num_tokens": 461499401.0, "sample_num_tokens": 8148.25, "step": 1705, "total_num_tokens": 461531994.0, "z_loss": 0.0013227021554484963 }, { "copy_logits_max": -4.726304054260254, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.375, "epoch": 0.34842992085779934, "gen_logits_max": 6.454484462738037, "gen_logits_mean": -12.966863632202148, "gen_logits_min": -24.187599182128906, "gen_logits_std": 2.3874568939208984, "gen_loss": 0.34325623512268066, "grad_norm": 0.47963352791512476, "learning_rate": 2.8774736842105265e-05, "loss": 0.341, "mean_copy_accuracy": 0.9928369969129562, "mean_gen_accuracy": 0.8492297232151031, "mean_token_accuracy": 0.8887782394886017, "num_tokens": 461777892.0, "sample_num_tokens": 8228.0, "step": 1706, "total_num_tokens": 461810804.0, "z_loss": 0.0011653739493340254 }, { "copy_logits_max": -6.09229040145874, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.8125, "epoch": 0.34863415879499615, "gen_logits_max": 5.892378807067871, "gen_logits_mean": -12.827530860900879, "gen_logits_min": -24.260589599609375, "gen_logits_std": 2.4066200256347656, "gen_loss": 0.3113000988960266, "grad_norm": 0.4450232894192496, "learning_rate": 2.877347368421053e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9946986138820648, "mean_gen_accuracy": 0.8600295335054398, "mean_token_accuracy": 0.8949286341667175, "num_tokens": 462055331.0, "sample_num_tokens": 8950.75, "step": 1707, "total_num_tokens": 462091134.0, "z_loss": 0.0010517206974327564 }, { "copy_logits_max": -5.251298904418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.8125, "epoch": 0.348838396732193, "gen_logits_max": 7.099647521972656, "gen_logits_mean": -12.109041213989258, "gen_logits_min": -23.353723526000977, "gen_logits_std": 2.4165637493133545, "gen_loss": 0.40520215034484863, "grad_norm": 0.5131739262365138, "learning_rate": 2.877221052631579e-05, "loss": 0.362, "mean_copy_accuracy": 0.9923126250505447, "mean_gen_accuracy": 0.8495802879333496, "mean_token_accuracy": 0.8829680234193802, "num_tokens": 462336422.0, "sample_num_tokens": 7844.5, "step": 1708, "total_num_tokens": 462367800.0, "z_loss": 0.0011610371293500066 }, { "copy_logits_max": -4.739477634429932, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.9375, "epoch": 0.34904263466938984, "gen_logits_max": 7.818714141845703, "gen_logits_mean": -10.437175750732422, "gen_logits_min": -22.31650161743164, "gen_logits_std": 2.4342055320739746, "gen_loss": 0.3696829676628113, "grad_norm": 0.5130944238757762, "learning_rate": 2.8770947368421054e-05, "loss": 0.348, "mean_copy_accuracy": 0.9914083629846573, "mean_gen_accuracy": 0.8539264649152756, "mean_token_accuracy": 0.885690227150917, "num_tokens": 462599040.0, "sample_num_tokens": 8160.5, "step": 1709, "total_num_tokens": 462631682.0, "z_loss": 0.0012309199664741755 }, { "copy_logits_max": -7.319353103637695, "copy_logits_min": -750000064.0, "copy_num_tokens": 486.0625, "epoch": 0.34924687260658666, "gen_logits_max": 6.7294793128967285, "gen_logits_mean": -11.768571853637695, "gen_logits_min": -23.191267013549805, "gen_logits_std": 2.405937910079956, "gen_loss": 0.3340400457382202, "grad_norm": 0.5095984792191063, "learning_rate": 2.8769684210526315e-05, "loss": 0.3566, "mean_copy_accuracy": 0.9917140454053879, "mean_gen_accuracy": 0.8499091118574142, "mean_token_accuracy": 0.8853113949298859, "num_tokens": 462875646.0, "sample_num_tokens": 8588.5, "step": 1710, "total_num_tokens": 462910000.0, "z_loss": 0.0009860943537205458 }, { "copy_logits_max": -6.139743804931641, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.125, "epoch": 0.3494511105437835, "gen_logits_max": 7.507099151611328, "gen_logits_mean": -10.857339859008789, "gen_logits_min": -22.266311645507812, "gen_logits_std": 2.388248920440674, "gen_loss": 0.35523107647895813, "grad_norm": 0.5028465680374576, "learning_rate": 2.876842105263158e-05, "loss": 0.3303, "mean_copy_accuracy": 0.991706982254982, "mean_gen_accuracy": 0.8607629984617233, "mean_token_accuracy": 0.8924380391836166, "num_tokens": 463145571.0, "sample_num_tokens": 8404.25, "step": 1711, "total_num_tokens": 463179188.0, "z_loss": 0.001022788928821683 }, { "copy_logits_max": -7.707651615142822, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.1875, "epoch": 0.34965534848098034, "gen_logits_max": 7.138686656951904, "gen_logits_mean": -10.167752265930176, "gen_logits_min": -22.21932029724121, "gen_logits_std": 2.4686131477355957, "gen_loss": 0.30216309428215027, "grad_norm": 0.5213271613456713, "learning_rate": 2.876715789473684e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9931147992610931, "mean_gen_accuracy": 0.8566573709249496, "mean_token_accuracy": 0.8894293457269669, "num_tokens": 463410126.0, "sample_num_tokens": 9323.5, "step": 1712, "total_num_tokens": 463447420.0, "z_loss": 0.0010075217578560114 }, { "copy_logits_max": -8.354508399963379, "copy_logits_min": -750000064.0, "copy_num_tokens": 459.3125, "epoch": 0.34985958641817716, "gen_logits_max": 7.3535051345825195, "gen_logits_mean": -10.934115409851074, "gen_logits_min": -22.681581497192383, "gen_logits_std": 2.436847686767578, "gen_loss": 0.30129045248031616, "grad_norm": 0.48273163015996934, "learning_rate": 2.8765894736842108e-05, "loss": 0.3477, "mean_copy_accuracy": 0.9920788556337357, "mean_gen_accuracy": 0.8507721871137619, "mean_token_accuracy": 0.8848622143268585, "num_tokens": 463666643.0, "sample_num_tokens": 8034.25, "step": 1713, "total_num_tokens": 463698780.0, "z_loss": 0.0010007366072386503 }, { "copy_logits_max": -5.430475234985352, "copy_logits_min": -750000064.0, "copy_num_tokens": 367.75, "epoch": 0.35006382435537403, "gen_logits_max": 7.098182201385498, "gen_logits_mean": -12.435903549194336, "gen_logits_min": -23.82427406311035, "gen_logits_std": 2.4490318298339844, "gen_loss": 0.32068371772766113, "grad_norm": 0.5683757204263187, "learning_rate": 2.876463157894737e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9900946468114853, "mean_gen_accuracy": 0.8612081706523895, "mean_token_accuracy": 0.8904833048582077, "num_tokens": 463911991.0, "sample_num_tokens": 8070.25, "step": 1714, "total_num_tokens": 463944272.0, "z_loss": 0.0009559595491737127 }, { "copy_logits_max": -4.832647800445557, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.1875, "epoch": 0.35026806229257085, "gen_logits_max": 6.836105823516846, "gen_logits_mean": -11.372357368469238, "gen_logits_min": -23.649667739868164, "gen_logits_std": 2.478639602661133, "gen_loss": 0.3081166744232178, "grad_norm": 0.5041719211288848, "learning_rate": 2.8763368421052633e-05, "loss": 0.318, "mean_copy_accuracy": 0.991586372256279, "mean_gen_accuracy": 0.8594070822000504, "mean_token_accuracy": 0.8958540558815002, "num_tokens": 464179424.0, "sample_num_tokens": 9217.5, "step": 1715, "total_num_tokens": 464216294.0, "z_loss": 0.0010401689214631915 }, { "copy_logits_max": -4.234192371368408, "copy_logits_min": -750000064.0, "copy_num_tokens": 464.0, "epoch": 0.35047230022976766, "gen_logits_max": 6.908745288848877, "gen_logits_mean": -11.88751220703125, "gen_logits_min": -23.936447143554688, "gen_logits_std": 2.4620583057403564, "gen_loss": 0.32170137763023376, "grad_norm": 0.5361934984233907, "learning_rate": 2.8762105263157894e-05, "loss": 0.3431, "mean_copy_accuracy": 0.9909263700246811, "mean_gen_accuracy": 0.8536109775304794, "mean_token_accuracy": 0.8880455195903778, "num_tokens": 464427677.0, "sample_num_tokens": 8185.25, "step": 1716, "total_num_tokens": 464460418.0, "z_loss": 0.0011527573224157095 }, { "copy_logits_max": -3.157017469406128, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.1875, "epoch": 0.35067653816696454, "gen_logits_max": 6.069112300872803, "gen_logits_mean": -13.070425987243652, "gen_logits_min": -25.18524169921875, "gen_logits_std": 2.464031219482422, "gen_loss": 0.30986887216567993, "grad_norm": 0.4924508732727765, "learning_rate": 2.876084210526316e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9926936626434326, "mean_gen_accuracy": 0.8632924556732178, "mean_token_accuracy": 0.8975359052419662, "num_tokens": 464726515.0, "sample_num_tokens": 10163.25, "step": 1717, "total_num_tokens": 464767168.0, "z_loss": 0.00109526002779603 }, { "copy_logits_max": -4.035181999206543, "copy_logits_min": -750000064.0, "copy_num_tokens": 484.9375, "epoch": 0.35088077610416135, "gen_logits_max": 6.910525321960449, "gen_logits_mean": -12.614580154418945, "gen_logits_min": -24.764984130859375, "gen_logits_std": 2.509462833404541, "gen_loss": 0.349155455827713, "grad_norm": 0.5108263937385644, "learning_rate": 2.8759578947368423e-05, "loss": 0.3472, "mean_copy_accuracy": 0.9922246783971786, "mean_gen_accuracy": 0.8496928066015244, "mean_token_accuracy": 0.8871968537569046, "num_tokens": 465008824.0, "sample_num_tokens": 8227.5, "step": 1718, "total_num_tokens": 465041734.0, "z_loss": 0.001223781262524426 }, { "copy_logits_max": -4.613676071166992, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.875, "epoch": 0.35108501404135817, "gen_logits_max": 6.726511001586914, "gen_logits_mean": -12.673924446105957, "gen_logits_min": -24.128307342529297, "gen_logits_std": 2.4006080627441406, "gen_loss": 0.3552412986755371, "grad_norm": 0.742648121437979, "learning_rate": 2.8758315789473684e-05, "loss": 0.3374, "mean_copy_accuracy": 0.9922483414411545, "mean_gen_accuracy": 0.8533039093017578, "mean_token_accuracy": 0.890251025557518, "num_tokens": 465287085.0, "sample_num_tokens": 9116.25, "step": 1719, "total_num_tokens": 465323550.0, "z_loss": 0.0010894900187849998 }, { "copy_logits_max": -4.990778923034668, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.875, "epoch": 0.35128925197855504, "gen_logits_max": 6.78610372543335, "gen_logits_mean": -12.23703384399414, "gen_logits_min": -24.145660400390625, "gen_logits_std": 2.4270973205566406, "gen_loss": 0.3483782112598419, "grad_norm": 0.44179789320894564, "learning_rate": 2.8757052631578948e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9929668754339218, "mean_gen_accuracy": 0.8501892387866974, "mean_token_accuracy": 0.886685848236084, "num_tokens": 465578457.0, "sample_num_tokens": 8017.75, "step": 1720, "total_num_tokens": 465610528.0, "z_loss": 0.00105197518132627 }, { "copy_logits_max": -5.109228610992432, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.25, "epoch": 0.35149348991575186, "gen_logits_max": 7.570886135101318, "gen_logits_mean": -11.140592575073242, "gen_logits_min": -22.970733642578125, "gen_logits_std": 2.445413112640381, "gen_loss": 0.367952823638916, "grad_norm": 0.5494712302059583, "learning_rate": 2.8755789473684212e-05, "loss": 0.3608, "mean_copy_accuracy": 0.9924227446317673, "mean_gen_accuracy": 0.8507394194602966, "mean_token_accuracy": 0.8831735998392105, "num_tokens": 465828255.0, "sample_num_tokens": 6908.75, "step": 1721, "total_num_tokens": 465855890.0, "z_loss": 0.0011507265735417604 }, { "copy_logits_max": -6.4126877784729, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.75, "epoch": 0.35169772785294867, "gen_logits_max": 7.651026725769043, "gen_logits_mean": -10.872821807861328, "gen_logits_min": -22.683486938476562, "gen_logits_std": 2.537862777709961, "gen_loss": 0.3693193793296814, "grad_norm": 0.4410866597005634, "learning_rate": 2.8754526315789477e-05, "loss": 0.3155, "mean_copy_accuracy": 0.993129625916481, "mean_gen_accuracy": 0.860938161611557, "mean_token_accuracy": 0.8960630297660828, "num_tokens": 466138630.0, "sample_num_tokens": 8476.0, "step": 1722, "total_num_tokens": 466172534.0, "z_loss": 0.0011710887774825096 }, { "copy_logits_max": -5.848757743835449, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.3125, "epoch": 0.35190196579014554, "gen_logits_max": 7.59184455871582, "gen_logits_mean": -10.81702995300293, "gen_logits_min": -22.826377868652344, "gen_logits_std": 2.4807276725769043, "gen_loss": 0.3276941180229187, "grad_norm": 0.44836162476159175, "learning_rate": 2.8753263157894738e-05, "loss": 0.3137, "mean_copy_accuracy": 0.9918875098228455, "mean_gen_accuracy": 0.8689293265342712, "mean_token_accuracy": 0.8963978737592697, "num_tokens": 466380807.0, "sample_num_tokens": 8200.75, "step": 1723, "total_num_tokens": 466413610.0, "z_loss": 0.0011613806709647179 }, { "copy_logits_max": -6.809810161590576, "copy_logits_min": -750000064.0, "copy_num_tokens": 421.75, "epoch": 0.35210620372734236, "gen_logits_max": 7.834043502807617, "gen_logits_mean": -12.039663314819336, "gen_logits_min": -23.647911071777344, "gen_logits_std": 2.451700448989868, "gen_loss": 0.3569600582122803, "grad_norm": 0.4452517402966926, "learning_rate": 2.8752000000000002e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9922760426998138, "mean_gen_accuracy": 0.8564166724681854, "mean_token_accuracy": 0.8872346431016922, "num_tokens": 466663491.0, "sample_num_tokens": 8688.75, "step": 1724, "total_num_tokens": 466698246.0, "z_loss": 0.0011105460580438375 }, { "copy_logits_max": -6.109593391418457, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.0625, "epoch": 0.3523104416645392, "gen_logits_max": 7.061639785766602, "gen_logits_mean": -11.397319793701172, "gen_logits_min": -23.05883026123047, "gen_logits_std": 2.4938132762908936, "gen_loss": 0.3038185238838196, "grad_norm": 0.429391221389858, "learning_rate": 2.8750736842105263e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9947702884674072, "mean_gen_accuracy": 0.8626687079668045, "mean_token_accuracy": 0.8959591835737228, "num_tokens": 466933033.0, "sample_num_tokens": 8440.75, "step": 1725, "total_num_tokens": 466966796.0, "z_loss": 0.0010839274618774652 }, { "copy_logits_max": -3.0705084800720215, "copy_logits_min": -687500032.0, "copy_num_tokens": 727.125, "epoch": 0.35251467960173605, "gen_logits_max": 6.927731037139893, "gen_logits_mean": -10.860485076904297, "gen_logits_min": -23.162189483642578, "gen_logits_std": 2.534480094909668, "gen_loss": 0.2848150134086609, "grad_norm": 0.42185071909658706, "learning_rate": 2.8749473684210527e-05, "loss": 0.3069, "mean_copy_accuracy": 0.993446096777916, "mean_gen_accuracy": 0.8613732010126114, "mean_token_accuracy": 0.9001990109682083, "num_tokens": 467209194.0, "sample_num_tokens": 9469.0, "step": 1726, "total_num_tokens": 467247070.0, "z_loss": 0.0011803354136645794 }, { "copy_logits_max": -5.930920600891113, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.0625, "epoch": 0.35271891753893286, "gen_logits_max": 7.183155059814453, "gen_logits_mean": -11.79704475402832, "gen_logits_min": -23.765335083007812, "gen_logits_std": 2.4973626136779785, "gen_loss": 0.3325487971305847, "grad_norm": 0.44078100639631973, "learning_rate": 2.8748210526315788e-05, "loss": 0.347, "mean_copy_accuracy": 0.9940295964479446, "mean_gen_accuracy": 0.8492894321680069, "mean_token_accuracy": 0.8864759802818298, "num_tokens": 467490586.0, "sample_num_tokens": 8454.0, "step": 1727, "total_num_tokens": 467524402.0, "z_loss": 0.0010669706389307976 }, { "copy_logits_max": -6.23789119720459, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.0625, "epoch": 0.3529231554761297, "gen_logits_max": 7.425567150115967, "gen_logits_mean": -12.044500350952148, "gen_logits_min": -23.659395217895508, "gen_logits_std": 2.45536470413208, "gen_loss": 0.34604203701019287, "grad_norm": 0.500019820885903, "learning_rate": 2.8746947368421052e-05, "loss": 0.348, "mean_copy_accuracy": 0.9912880063056946, "mean_gen_accuracy": 0.8535118401050568, "mean_token_accuracy": 0.8870532959699631, "num_tokens": 467770654.0, "sample_num_tokens": 7533.5, "step": 1728, "total_num_tokens": 467800788.0, "z_loss": 0.001008997904136777 }, { "copy_logits_max": -4.883398532867432, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.375, "epoch": 0.35312739341332655, "gen_logits_max": 7.114791393280029, "gen_logits_mean": -11.142349243164062, "gen_logits_min": -23.066509246826172, "gen_logits_std": 2.5344340801239014, "gen_loss": 0.34506097435951233, "grad_norm": 0.4846611011793883, "learning_rate": 2.8745684210526317e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9897421002388, "mean_gen_accuracy": 0.8599925190210342, "mean_token_accuracy": 0.8908956348896027, "num_tokens": 468038614.0, "sample_num_tokens": 8419.0, "step": 1729, "total_num_tokens": 468072290.0, "z_loss": 0.001080543384887278 }, { "copy_logits_max": -5.593874931335449, "copy_logits_min": -625000064.0, "copy_num_tokens": 618.0625, "epoch": 0.35333163135052337, "gen_logits_max": 6.650358200073242, "gen_logits_mean": -11.693157196044922, "gen_logits_min": -23.93834686279297, "gen_logits_std": 2.542682409286499, "gen_loss": 0.3034733235836029, "grad_norm": 0.4462033959893036, "learning_rate": 2.874442105263158e-05, "loss": 0.3086, "mean_copy_accuracy": 0.993780180811882, "mean_gen_accuracy": 0.8614224642515182, "mean_token_accuracy": 0.9005205631256104, "num_tokens": 468328519.0, "sample_num_tokens": 8795.25, "step": 1730, "total_num_tokens": 468363700.0, "z_loss": 0.00108579290099442 }, { "copy_logits_max": -6.328392028808594, "copy_logits_min": -750000064.0, "copy_num_tokens": 461.3125, "epoch": 0.3535358692877202, "gen_logits_max": 6.9544572830200195, "gen_logits_mean": -11.382518768310547, "gen_logits_min": -24.0509090423584, "gen_logits_std": 2.579792022705078, "gen_loss": 0.28597089648246765, "grad_norm": 0.5298511101732315, "learning_rate": 2.8743157894736842e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9936993718147278, "mean_gen_accuracy": 0.8621281534433365, "mean_token_accuracy": 0.8976269513368607, "num_tokens": 468597023.0, "sample_num_tokens": 7740.75, "step": 1731, "total_num_tokens": 468627986.0, "z_loss": 0.0010831973049789667 }, { "copy_logits_max": -7.341113090515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.3125, "epoch": 0.35374010722491706, "gen_logits_max": 6.878673076629639, "gen_logits_mean": -11.291034698486328, "gen_logits_min": -23.02942657470703, "gen_logits_std": 2.4931998252868652, "gen_loss": 0.31521180272102356, "grad_norm": 0.5191864739367898, "learning_rate": 2.8741894736842106e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9927937239408493, "mean_gen_accuracy": 0.8553708642721176, "mean_token_accuracy": 0.8890022933483124, "num_tokens": 468854646.0, "sample_num_tokens": 8984.5, "step": 1732, "total_num_tokens": 468890584.0, "z_loss": 0.0009034466929733753 }, { "copy_logits_max": -5.403409957885742, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.875, "epoch": 0.35394434516211387, "gen_logits_max": 7.202167987823486, "gen_logits_mean": -11.90196418762207, "gen_logits_min": -23.6187744140625, "gen_logits_std": 2.5126047134399414, "gen_loss": 0.36501452326774597, "grad_norm": 0.4885219762364147, "learning_rate": 2.874063157894737e-05, "loss": 0.3448, "mean_copy_accuracy": 0.9913042783737183, "mean_gen_accuracy": 0.8569373190402985, "mean_token_accuracy": 0.8884822428226471, "num_tokens": 469127346.0, "sample_num_tokens": 7562.5, "step": 1733, "total_num_tokens": 469157596.0, "z_loss": 0.001032786793075502 }, { "copy_logits_max": -6.4707794189453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.0, "epoch": 0.3541485830993107, "gen_logits_max": 7.125955581665039, "gen_logits_mean": -11.711380004882812, "gen_logits_min": -23.2122802734375, "gen_logits_std": 2.4767653942108154, "gen_loss": 0.3519885838031769, "grad_norm": 0.5094053296829946, "learning_rate": 2.873936842105263e-05, "loss": 0.3482, "mean_copy_accuracy": 0.9925204217433929, "mean_gen_accuracy": 0.8510091453790665, "mean_token_accuracy": 0.8852063864469528, "num_tokens": 469399499.0, "sample_num_tokens": 8510.25, "step": 1734, "total_num_tokens": 469433540.0, "z_loss": 0.0009192107245326042 }, { "copy_logits_max": -5.827243804931641, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.1875, "epoch": 0.3543528210365075, "gen_logits_max": 6.76271915435791, "gen_logits_mean": -11.854124069213867, "gen_logits_min": -23.970277786254883, "gen_logits_std": 2.5256147384643555, "gen_loss": 0.33265888690948486, "grad_norm": 0.4908113957556437, "learning_rate": 2.8738105263157896e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9929457902908325, "mean_gen_accuracy": 0.8602364212274551, "mean_token_accuracy": 0.8931256681680679, "num_tokens": 469672356.0, "sample_num_tokens": 9044.5, "step": 1735, "total_num_tokens": 469708534.0, "z_loss": 0.0010989608708769083 }, { "copy_logits_max": -6.933485984802246, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.6875, "epoch": 0.3545570589737044, "gen_logits_max": 7.524970054626465, "gen_logits_mean": -12.872001647949219, "gen_logits_min": -23.98507308959961, "gen_logits_std": 2.442065715789795, "gen_loss": 0.3583906292915344, "grad_norm": 0.5041176785849835, "learning_rate": 2.8736842105263157e-05, "loss": 0.3488, "mean_copy_accuracy": 0.9912079870700836, "mean_gen_accuracy": 0.8559361696243286, "mean_token_accuracy": 0.8871710300445557, "num_tokens": 469929374.0, "sample_num_tokens": 7311.5, "step": 1736, "total_num_tokens": 469958620.0, "z_loss": 0.0010234881192445755 }, { "copy_logits_max": -5.707851409912109, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.875, "epoch": 0.3547612969109012, "gen_logits_max": 7.014082908630371, "gen_logits_mean": -11.650372505187988, "gen_logits_min": -23.13610076904297, "gen_logits_std": 2.4748029708862305, "gen_loss": 0.3325735926628113, "grad_norm": 0.4905468442483375, "learning_rate": 2.8735578947368424e-05, "loss": 0.3451, "mean_copy_accuracy": 0.9901913851499557, "mean_gen_accuracy": 0.8592803627252579, "mean_token_accuracy": 0.8860072195529938, "num_tokens": 470187511.0, "sample_num_tokens": 8933.25, "step": 1737, "total_num_tokens": 470223244.0, "z_loss": 0.0009392788633704185 }, { "copy_logits_max": -7.235011100769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.6875, "epoch": 0.354965534848098, "gen_logits_max": 6.932236671447754, "gen_logits_mean": -11.812387466430664, "gen_logits_min": -23.456485748291016, "gen_logits_std": 2.492795705795288, "gen_loss": 0.3110470771789551, "grad_norm": 0.4676719249547458, "learning_rate": 2.8734315789473685e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9934556037187576, "mean_gen_accuracy": 0.8644819259643555, "mean_token_accuracy": 0.8964405059814453, "num_tokens": 470489050.0, "sample_num_tokens": 9446.0, "step": 1738, "total_num_tokens": 470526834.0, "z_loss": 0.0009777785744518042 }, { "copy_logits_max": -6.757645130157471, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.8125, "epoch": 0.3551697727852949, "gen_logits_max": 6.965831756591797, "gen_logits_mean": -13.174156188964844, "gen_logits_min": -25.111772537231445, "gen_logits_std": 2.4770400524139404, "gen_loss": 0.3066653609275818, "grad_norm": 0.48675466592057304, "learning_rate": 2.873305263157895e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9938272833824158, "mean_gen_accuracy": 0.8571433871984482, "mean_token_accuracy": 0.8974105417728424, "num_tokens": 470765427.0, "sample_num_tokens": 6936.25, "step": 1739, "total_num_tokens": 470793172.0, "z_loss": 0.0009576448355801404 }, { "copy_logits_max": -6.958529472351074, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.25, "epoch": 0.3553740107224917, "gen_logits_max": 7.365875244140625, "gen_logits_mean": -11.612808227539062, "gen_logits_min": -23.045997619628906, "gen_logits_std": 2.4416754245758057, "gen_loss": 0.3547561764717102, "grad_norm": 0.476162766005815, "learning_rate": 2.873178947368421e-05, "loss": 0.3414, "mean_copy_accuracy": 0.9919522255659103, "mean_gen_accuracy": 0.8618598580360413, "mean_token_accuracy": 0.8882559686899185, "num_tokens": 471015939.0, "sample_num_tokens": 9082.75, "step": 1740, "total_num_tokens": 471052270.0, "z_loss": 0.0009517017751932144 }, { "copy_logits_max": -5.895714282989502, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.75, "epoch": 0.3555782486596885, "gen_logits_max": 7.504266738891602, "gen_logits_mean": -11.791027069091797, "gen_logits_min": -23.879714965820312, "gen_logits_std": 2.515738010406494, "gen_loss": 0.3724013566970825, "grad_norm": 0.4763135021920562, "learning_rate": 2.8730526315789475e-05, "loss": 0.3437, "mean_copy_accuracy": 0.9931734204292297, "mean_gen_accuracy": 0.8572473376989365, "mean_token_accuracy": 0.8892608880996704, "num_tokens": 471284424.0, "sample_num_tokens": 9254.5, "step": 1741, "total_num_tokens": 471321442.0, "z_loss": 0.0010746350744739175 }, { "copy_logits_max": -5.363339900970459, "copy_logits_min": -687500032.0, "copy_num_tokens": 641.1875, "epoch": 0.3557824865968854, "gen_logits_max": 6.791877746582031, "gen_logits_mean": -11.306236267089844, "gen_logits_min": -23.527462005615234, "gen_logits_std": 2.5199968814849854, "gen_loss": 0.32244759798049927, "grad_norm": 0.4529351684834711, "learning_rate": 2.8729263157894736e-05, "loss": 0.3377, "mean_copy_accuracy": 0.9927470088005066, "mean_gen_accuracy": 0.8555002063512802, "mean_token_accuracy": 0.8893196880817413, "num_tokens": 471563977.0, "sample_num_tokens": 10035.75, "step": 1742, "total_num_tokens": 471604120.0, "z_loss": 0.0009570568799972534 }, { "copy_logits_max": -7.0571393966674805, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.5, "epoch": 0.3559867245340822, "gen_logits_max": 7.315641403198242, "gen_logits_mean": -12.308564186096191, "gen_logits_min": -23.772750854492188, "gen_logits_std": 2.4580161571502686, "gen_loss": 0.3778455853462219, "grad_norm": 0.5133182986049617, "learning_rate": 2.8728e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9912120997905731, "mean_gen_accuracy": 0.8589828312397003, "mean_token_accuracy": 0.8872475624084473, "num_tokens": 471832262.0, "sample_num_tokens": 7119.0, "step": 1743, "total_num_tokens": 471860738.0, "z_loss": 0.000936438562348485 }, { "copy_logits_max": -6.174630641937256, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.8125, "epoch": 0.356190962471279, "gen_logits_max": 6.832952499389648, "gen_logits_mean": -12.42631721496582, "gen_logits_min": -23.965927124023438, "gen_logits_std": 2.496307373046875, "gen_loss": 0.32962021231651306, "grad_norm": 0.4432387016310374, "learning_rate": 2.872673684210526e-05, "loss": 0.341, "mean_copy_accuracy": 0.9920915216207504, "mean_gen_accuracy": 0.8542869538068771, "mean_token_accuracy": 0.889133408665657, "num_tokens": 472108416.0, "sample_num_tokens": 8821.5, "step": 1744, "total_num_tokens": 472143702.0, "z_loss": 0.0009391664643771946 }, { "copy_logits_max": -4.303186893463135, "copy_logits_min": -687500032.0, "copy_num_tokens": 739.4375, "epoch": 0.3563952004084759, "gen_logits_max": 6.095243453979492, "gen_logits_mean": -12.787315368652344, "gen_logits_min": -24.87720489501953, "gen_logits_std": 2.5228209495544434, "gen_loss": 0.2818091809749603, "grad_norm": 0.4446035452371928, "learning_rate": 2.872547368421053e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9941520988941193, "mean_gen_accuracy": 0.8648861199617386, "mean_token_accuracy": 0.8970059305429459, "num_tokens": 472378699.0, "sample_num_tokens": 10220.75, "step": 1745, "total_num_tokens": 472419582.0, "z_loss": 0.0009820809355005622 }, { "copy_logits_max": -5.8024773597717285, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.25, "epoch": 0.3565994383456727, "gen_logits_max": 7.556980609893799, "gen_logits_mean": -12.200759887695312, "gen_logits_min": -23.731019973754883, "gen_logits_std": 2.4668221473693848, "gen_loss": 0.354203462600708, "grad_norm": 0.4851072147568397, "learning_rate": 2.8724210526315793e-05, "loss": 0.3458, "mean_copy_accuracy": 0.9926112443208694, "mean_gen_accuracy": 0.8521962761878967, "mean_token_accuracy": 0.888329803943634, "num_tokens": 472663000.0, "sample_num_tokens": 7836.0, "step": 1746, "total_num_tokens": 472694344.0, "z_loss": 0.0010469101835042238 }, { "copy_logits_max": -4.440786361694336, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.0, "epoch": 0.3568036762828695, "gen_logits_max": 7.535863876342773, "gen_logits_mean": -10.556626319885254, "gen_logits_min": -22.3143310546875, "gen_logits_std": 2.488247871398926, "gen_loss": 0.3065915107727051, "grad_norm": 0.46936530623078765, "learning_rate": 2.8722947368421054e-05, "loss": 0.3377, "mean_copy_accuracy": 0.9915026873350143, "mean_gen_accuracy": 0.8605517446994781, "mean_token_accuracy": 0.8919220715761185, "num_tokens": 472923084.0, "sample_num_tokens": 8147.0, "step": 1747, "total_num_tokens": 472955672.0, "z_loss": 0.0010770949302241206 }, { "copy_logits_max": -7.1282525062561035, "copy_logits_min": -687500032.0, "copy_num_tokens": 600.25, "epoch": 0.3570079142200664, "gen_logits_max": 6.884125709533691, "gen_logits_mean": -11.60173225402832, "gen_logits_min": -23.27680206298828, "gen_logits_std": 2.470421314239502, "gen_loss": 0.3009331226348877, "grad_norm": 0.43835686861551615, "learning_rate": 2.8721684210526318e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9930076599121094, "mean_gen_accuracy": 0.8635510951280594, "mean_token_accuracy": 0.8958741128444672, "num_tokens": 473187669.0, "sample_num_tokens": 10472.25, "step": 1748, "total_num_tokens": 473229558.0, "z_loss": 0.001066114054992795 }, { "copy_logits_max": -5.243481636047363, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.8125, "epoch": 0.3572121521572632, "gen_logits_max": 7.497945308685303, "gen_logits_mean": -12.453490257263184, "gen_logits_min": -23.951051712036133, "gen_logits_std": 2.459506034851074, "gen_loss": 0.3175601065158844, "grad_norm": 0.4712969499116364, "learning_rate": 2.872042105263158e-05, "loss": 0.3456, "mean_copy_accuracy": 0.9930560439825058, "mean_gen_accuracy": 0.8569279313087463, "mean_token_accuracy": 0.885602131485939, "num_tokens": 473439377.0, "sample_num_tokens": 8930.75, "step": 1749, "total_num_tokens": 473475100.0, "z_loss": 0.0010920343920588493 }, { "copy_logits_max": -9.036455154418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.3125, "epoch": 0.35741639009446, "gen_logits_max": 7.271238803863525, "gen_logits_mean": -13.240682601928711, "gen_logits_min": -24.224849700927734, "gen_logits_std": 2.3998613357543945, "gen_loss": 0.3455491065979004, "grad_norm": 0.43668486900257936, "learning_rate": 2.8719157894736843e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9941473603248596, "mean_gen_accuracy": 0.8653990626335144, "mean_token_accuracy": 0.8967683613300323, "num_tokens": 473717609.0, "sample_num_tokens": 7614.75, "step": 1750, "total_num_tokens": 473748068.0, "z_loss": 0.0010186296422034502 }, { "copy_logits_max": -5.194913864135742, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.8125, "epoch": 0.3576206280316569, "gen_logits_max": 7.4141082763671875, "gen_logits_mean": -10.78194808959961, "gen_logits_min": -22.638097763061523, "gen_logits_std": 2.5137600898742676, "gen_loss": 0.32361024618148804, "grad_norm": 0.47415827585042536, "learning_rate": 2.8717894736842104e-05, "loss": 0.3396, "mean_copy_accuracy": 0.9929080754518509, "mean_gen_accuracy": 0.8535409718751907, "mean_token_accuracy": 0.888925701379776, "num_tokens": 473976229.0, "sample_num_tokens": 6935.75, "step": 1751, "total_num_tokens": 474003972.0, "z_loss": 0.0010708300396800041 }, { "copy_logits_max": -7.476758003234863, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.375, "epoch": 0.3578248659688537, "gen_logits_max": 7.525579452514648, "gen_logits_mean": -11.629531860351562, "gen_logits_min": -23.534046173095703, "gen_logits_std": 2.4943947792053223, "gen_loss": 0.2793145179748535, "grad_norm": 0.41164397075281906, "learning_rate": 2.871663157894737e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9941982328891754, "mean_gen_accuracy": 0.8658457100391388, "mean_token_accuracy": 0.8992533832788467, "num_tokens": 474252157.0, "sample_num_tokens": 8760.25, "step": 1752, "total_num_tokens": 474287198.0, "z_loss": 0.0009846321772783995 }, { "copy_logits_max": -5.483712196350098, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.0, "epoch": 0.3580291039060505, "gen_logits_max": 7.491665363311768, "gen_logits_mean": -10.460454940795898, "gen_logits_min": -22.378063201904297, "gen_logits_std": 2.4910073280334473, "gen_loss": 0.32237985730171204, "grad_norm": 0.5144737689162493, "learning_rate": 2.871536842105263e-05, "loss": 0.3353, "mean_copy_accuracy": 0.9908347129821777, "mean_gen_accuracy": 0.8593781292438507, "mean_token_accuracy": 0.8903588056564331, "num_tokens": 474495267.0, "sample_num_tokens": 8747.25, "step": 1753, "total_num_tokens": 474530256.0, "z_loss": 0.0010792070534080267 }, { "copy_logits_max": -6.104058742523193, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.1875, "epoch": 0.3582333418432474, "gen_logits_max": 7.383708953857422, "gen_logits_mean": -11.444629669189453, "gen_logits_min": -23.210176467895508, "gen_logits_std": 2.4641504287719727, "gen_loss": 0.3666139245033264, "grad_norm": 0.4363167988054958, "learning_rate": 2.8714105263157897e-05, "loss": 0.3291, "mean_copy_accuracy": 0.993965670466423, "mean_gen_accuracy": 0.8542201668024063, "mean_token_accuracy": 0.8917927891016006, "num_tokens": 474774996.0, "sample_num_tokens": 8405.5, "step": 1754, "total_num_tokens": 474808618.0, "z_loss": 0.001020120456814766 }, { "copy_logits_max": -6.810420036315918, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.875, "epoch": 0.3584375797804442, "gen_logits_max": 7.748383522033691, "gen_logits_mean": -11.08695125579834, "gen_logits_min": -22.55926513671875, "gen_logits_std": 2.4623727798461914, "gen_loss": 0.3322087526321411, "grad_norm": 0.4676491925074803, "learning_rate": 2.8712842105263158e-05, "loss": 0.3317, "mean_copy_accuracy": 0.9929743111133575, "mean_gen_accuracy": 0.8592714965343475, "mean_token_accuracy": 0.8929435461759567, "num_tokens": 475043484.0, "sample_num_tokens": 8498.5, "step": 1755, "total_num_tokens": 475077478.0, "z_loss": 0.001003964338451624 }, { "copy_logits_max": -5.068769931793213, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.0, "epoch": 0.35864181771764103, "gen_logits_max": 6.886185646057129, "gen_logits_mean": -11.861015319824219, "gen_logits_min": -24.072689056396484, "gen_logits_std": 2.5026662349700928, "gen_loss": 0.3169083893299103, "grad_norm": 0.5226641028422271, "learning_rate": 2.8711578947368423e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9932693988084793, "mean_gen_accuracy": 0.8631512224674225, "mean_token_accuracy": 0.8921570330858231, "num_tokens": 475316361.0, "sample_num_tokens": 10008.75, "step": 1756, "total_num_tokens": 475356396.0, "z_loss": 0.0010063484078273177 }, { "copy_logits_max": -7.240049362182617, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.0625, "epoch": 0.3588460556548379, "gen_logits_max": 7.356286525726318, "gen_logits_mean": -11.258075714111328, "gen_logits_min": -22.888866424560547, "gen_logits_std": 2.4411392211914062, "gen_loss": 0.34717413783073425, "grad_norm": 0.47174812237195746, "learning_rate": 2.8710315789473683e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9930064827203751, "mean_gen_accuracy": 0.8634639531373978, "mean_token_accuracy": 0.8975708037614822, "num_tokens": 475599584.0, "sample_num_tokens": 8137.0, "step": 1757, "total_num_tokens": 475632132.0, "z_loss": 0.0010212266352027655 }, { "copy_logits_max": -7.3537797927856445, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.1875, "epoch": 0.3590502935920347, "gen_logits_max": 7.921569347381592, "gen_logits_mean": -10.561166763305664, "gen_logits_min": -22.709184646606445, "gen_logits_std": 2.503056764602661, "gen_loss": 0.335574209690094, "grad_norm": 0.5036595149640561, "learning_rate": 2.8709052631578948e-05, "loss": 0.3083, "mean_copy_accuracy": 0.9938325136899948, "mean_gen_accuracy": 0.8639858365058899, "mean_token_accuracy": 0.8999763876199722, "num_tokens": 475885465.0, "sample_num_tokens": 7715.25, "step": 1758, "total_num_tokens": 475916326.0, "z_loss": 0.001018585404381156 }, { "copy_logits_max": -5.903400421142578, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5625, "epoch": 0.35925453152923154, "gen_logits_max": 6.9424238204956055, "gen_logits_mean": -11.679203987121582, "gen_logits_min": -23.464969635009766, "gen_logits_std": 2.493746042251587, "gen_loss": 0.3250705599784851, "grad_norm": 0.5555810441419129, "learning_rate": 2.8707789473684212e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9912910759449005, "mean_gen_accuracy": 0.8603078126907349, "mean_token_accuracy": 0.8913190513849258, "num_tokens": 476159825.0, "sample_num_tokens": 9023.75, "step": 1759, "total_num_tokens": 476195920.0, "z_loss": 0.0009775467915460467 }, { "copy_logits_max": -6.33042573928833, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.8125, "epoch": 0.3594587694664284, "gen_logits_max": 6.837347507476807, "gen_logits_mean": -12.690448760986328, "gen_logits_min": -23.92363739013672, "gen_logits_std": 2.4410910606384277, "gen_loss": 0.3200964331626892, "grad_norm": 0.508743968468579, "learning_rate": 2.8706526315789473e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9918656200170517, "mean_gen_accuracy": 0.8629937469959259, "mean_token_accuracy": 0.8909040242433548, "num_tokens": 476418463.0, "sample_num_tokens": 7761.75, "step": 1760, "total_num_tokens": 476449510.0, "z_loss": 0.0009514179546386003 }, { "copy_logits_max": -5.608426570892334, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.375, "epoch": 0.3596630074036252, "gen_logits_max": 7.869795322418213, "gen_logits_mean": -10.301980972290039, "gen_logits_min": -21.810009002685547, "gen_logits_std": 2.452284574508667, "gen_loss": 0.3701969385147095, "grad_norm": 0.5352694692566551, "learning_rate": 2.8705263157894737e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9924898445606232, "mean_gen_accuracy": 0.855159267783165, "mean_token_accuracy": 0.8958161473274231, "num_tokens": 476712322.0, "sample_num_tokens": 10298.0, "step": 1761, "total_num_tokens": 476753514.0, "z_loss": 0.0011029154993593693 }, { "copy_logits_max": -3.844289779663086, "copy_logits_min": -625000064.0, "copy_num_tokens": 463.5625, "epoch": 0.35986724534082204, "gen_logits_max": 7.019959449768066, "gen_logits_mean": -12.002889633178711, "gen_logits_min": -23.51021385192871, "gen_logits_std": 2.4525434970855713, "gen_loss": 0.3257857859134674, "grad_norm": 0.5063181587361866, "learning_rate": 2.8704e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9907243549823761, "mean_gen_accuracy": 0.8570468574762344, "mean_token_accuracy": 0.8882358074188232, "num_tokens": 476990642.0, "sample_num_tokens": 8596.5, "step": 1762, "total_num_tokens": 477025028.0, "z_loss": 0.0010999618098139763 }, { "copy_logits_max": -4.4686384201049805, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.375, "epoch": 0.3600714832780189, "gen_logits_max": 7.491690635681152, "gen_logits_mean": -11.454324722290039, "gen_logits_min": -23.30341911315918, "gen_logits_std": 2.480724573135376, "gen_loss": 0.3655034601688385, "grad_norm": 0.6256617057171688, "learning_rate": 2.8702736842105266e-05, "loss": 0.3428, "mean_copy_accuracy": 0.9922398775815964, "mean_gen_accuracy": 0.8554484695196152, "mean_token_accuracy": 0.8886663168668747, "num_tokens": 477261604.0, "sample_num_tokens": 7479.0, "step": 1763, "total_num_tokens": 477291520.0, "z_loss": 0.0011840126244351268 }, { "copy_logits_max": -5.794881820678711, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.5, "epoch": 0.3602757212152157, "gen_logits_max": 7.657043933868408, "gen_logits_mean": -10.669233322143555, "gen_logits_min": -22.831375122070312, "gen_logits_std": 2.471682071685791, "gen_loss": 0.3313310742378235, "grad_norm": 0.4734333376917876, "learning_rate": 2.8701473684210527e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9937023520469666, "mean_gen_accuracy": 0.8568022102117538, "mean_token_accuracy": 0.8947107642889023, "num_tokens": 477548101.0, "sample_num_tokens": 7628.75, "step": 1764, "total_num_tokens": 477578616.0, "z_loss": 0.0010617026127874851 }, { "copy_logits_max": -7.311025619506836, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.25, "epoch": 0.36047995915241254, "gen_logits_max": 6.896577835083008, "gen_logits_mean": -11.320579528808594, "gen_logits_min": -23.159204483032227, "gen_logits_std": 2.4662766456604004, "gen_loss": 0.2981114685535431, "grad_norm": 0.4475477618646226, "learning_rate": 2.870021052631579e-05, "loss": 0.3278, "mean_copy_accuracy": 0.992345929145813, "mean_gen_accuracy": 0.859588235616684, "mean_token_accuracy": 0.892543613910675, "num_tokens": 477819575.0, "sample_num_tokens": 9809.75, "step": 1765, "total_num_tokens": 477858814.0, "z_loss": 0.0009385935263708234 }, { "copy_logits_max": -5.242635726928711, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.25, "epoch": 0.3606841970896094, "gen_logits_max": 6.828479766845703, "gen_logits_mean": -12.484701156616211, "gen_logits_min": -24.46376609802246, "gen_logits_std": 2.4903602600097656, "gen_loss": 0.32488012313842773, "grad_norm": 0.5419729167337325, "learning_rate": 2.8698947368421052e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9901845455169678, "mean_gen_accuracy": 0.8621774911880493, "mean_token_accuracy": 0.8928132355213165, "num_tokens": 478100524.0, "sample_num_tokens": 8681.0, "step": 1766, "total_num_tokens": 478135248.0, "z_loss": 0.0009551512193866074 }, { "copy_logits_max": -5.224412441253662, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.875, "epoch": 0.36088843502680623, "gen_logits_max": 7.185336112976074, "gen_logits_mean": -12.105979919433594, "gen_logits_min": -24.092050552368164, "gen_logits_std": 2.490142583847046, "gen_loss": 0.346372127532959, "grad_norm": 0.5023192726024176, "learning_rate": 2.8697684210526316e-05, "loss": 0.3325, "mean_copy_accuracy": 0.9913859516382217, "mean_gen_accuracy": 0.8569092899560928, "mean_token_accuracy": 0.8914964348077774, "num_tokens": 478374468.0, "sample_num_tokens": 7584.0, "step": 1767, "total_num_tokens": 478404804.0, "z_loss": 0.001015983521938324 }, { "copy_logits_max": -2.876415252685547, "copy_logits_min": -687500032.0, "copy_num_tokens": 676.9375, "epoch": 0.36109267296400305, "gen_logits_max": 6.339627265930176, "gen_logits_mean": -11.851516723632812, "gen_logits_min": -24.334392547607422, "gen_logits_std": 2.5285909175872803, "gen_loss": 0.31716394424438477, "grad_norm": 0.46869101132080654, "learning_rate": 2.8696421052631577e-05, "loss": 0.3512, "mean_copy_accuracy": 0.9913845807313919, "mean_gen_accuracy": 0.8516967743635178, "mean_token_accuracy": 0.8842630833387375, "num_tokens": 478631925.0, "sample_num_tokens": 9148.75, "step": 1768, "total_num_tokens": 478668520.0, "z_loss": 0.0012210009153932333 }, { "copy_logits_max": -4.241196632385254, "copy_logits_min": -750000000.0, "copy_num_tokens": 690.625, "epoch": 0.3612969109011999, "gen_logits_max": 7.150391578674316, "gen_logits_mean": -11.161922454833984, "gen_logits_min": -23.16135597229004, "gen_logits_std": 2.515856981277466, "gen_loss": 0.254880428314209, "grad_norm": 0.5837359660902266, "learning_rate": 2.869515789473684e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9946284890174866, "mean_gen_accuracy": 0.8696759045124054, "mean_token_accuracy": 0.906883493065834, "num_tokens": 478920073.0, "sample_num_tokens": 9756.25, "step": 1769, "total_num_tokens": 478959098.0, "z_loss": 0.0010298271663486958 }, { "copy_logits_max": -4.403921604156494, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.5625, "epoch": 0.36150114883839674, "gen_logits_max": 6.698973655700684, "gen_logits_mean": -11.865358352661133, "gen_logits_min": -23.974164962768555, "gen_logits_std": 2.4752519130706787, "gen_loss": 0.33732908964157104, "grad_norm": 0.44806030083977644, "learning_rate": 2.8693894736842106e-05, "loss": 0.3365, "mean_copy_accuracy": 0.993167370557785, "mean_gen_accuracy": 0.853825643658638, "mean_token_accuracy": 0.8915615379810333, "num_tokens": 479207140.0, "sample_num_tokens": 8169.5, "step": 1770, "total_num_tokens": 479239818.0, "z_loss": 0.00112160574644804 }, { "copy_logits_max": -3.687253952026367, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.6875, "epoch": 0.36170538677559355, "gen_logits_max": 6.708102703094482, "gen_logits_mean": -11.1752347946167, "gen_logits_min": -23.411413192749023, "gen_logits_std": 2.4867324829101562, "gen_loss": 0.32072845101356506, "grad_norm": 0.46462868543635294, "learning_rate": 2.869263157894737e-05, "loss": 0.3327, "mean_copy_accuracy": 0.9922545850276947, "mean_gen_accuracy": 0.8600186407566071, "mean_token_accuracy": 0.8925204277038574, "num_tokens": 479496882.0, "sample_num_tokens": 9143.5, "step": 1771, "total_num_tokens": 479533456.0, "z_loss": 0.0011801079381257296 }, { "copy_logits_max": -3.249572277069092, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.6875, "epoch": 0.3619096247127904, "gen_logits_max": 7.017086029052734, "gen_logits_mean": -11.211864471435547, "gen_logits_min": -23.516725540161133, "gen_logits_std": 2.5039947032928467, "gen_loss": 0.3322351574897766, "grad_norm": 0.45441683199327354, "learning_rate": 2.8691368421052635e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9936859607696533, "mean_gen_accuracy": 0.8574325144290924, "mean_token_accuracy": 0.8919019252061844, "num_tokens": 479767258.0, "sample_num_tokens": 8648.5, "step": 1772, "total_num_tokens": 479801852.0, "z_loss": 0.0011021520476788282 }, { "copy_logits_max": -5.791210174560547, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.8125, "epoch": 0.36211386264998724, "gen_logits_max": 7.29152774810791, "gen_logits_mean": -11.146949768066406, "gen_logits_min": -23.40420150756836, "gen_logits_std": 2.4832968711853027, "gen_loss": 0.3636521100997925, "grad_norm": 0.4994868985556235, "learning_rate": 2.8690105263157896e-05, "loss": 0.3527, "mean_copy_accuracy": 0.9924258291721344, "mean_gen_accuracy": 0.85472671687603, "mean_token_accuracy": 0.8877823352813721, "num_tokens": 480046448.0, "sample_num_tokens": 8022.0, "step": 1773, "total_num_tokens": 480078536.0, "z_loss": 0.0011065080761909485 }, { "copy_logits_max": -4.694989204406738, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.75, "epoch": 0.36231810058718406, "gen_logits_max": 6.096070766448975, "gen_logits_mean": -11.29764461517334, "gen_logits_min": -23.40264320373535, "gen_logits_std": 2.4767916202545166, "gen_loss": 0.3060062527656555, "grad_norm": 0.4295475586840353, "learning_rate": 2.868884210526316e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9914903938770294, "mean_gen_accuracy": 0.867952436208725, "mean_token_accuracy": 0.8992197662591934, "num_tokens": 480338857.0, "sample_num_tokens": 10229.75, "step": 1774, "total_num_tokens": 480379776.0, "z_loss": 0.0009677603375166655 }, { "copy_logits_max": -4.17215633392334, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.25, "epoch": 0.3625223385243809, "gen_logits_max": 6.00829553604126, "gen_logits_mean": -11.848381042480469, "gen_logits_min": -23.90786361694336, "gen_logits_std": 2.496410608291626, "gen_loss": 0.33656296133995056, "grad_norm": 0.46200580608267644, "learning_rate": 2.868757894736842e-05, "loss": 0.3158, "mean_copy_accuracy": 0.993876576423645, "mean_gen_accuracy": 0.8578349649906158, "mean_token_accuracy": 0.8971389085054398, "num_tokens": 480634035.0, "sample_num_tokens": 8067.75, "step": 1775, "total_num_tokens": 480666306.0, "z_loss": 0.0009831460192799568 }, { "copy_logits_max": -6.25499153137207, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.375, "epoch": 0.36272657646157774, "gen_logits_max": 6.580764293670654, "gen_logits_mean": -12.617738723754883, "gen_logits_min": -24.36096954345703, "gen_logits_std": 2.455291271209717, "gen_loss": 0.30060842633247375, "grad_norm": 0.4323848279377384, "learning_rate": 2.8686315789473685e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9948268085718155, "mean_gen_accuracy": 0.8685361742973328, "mean_token_accuracy": 0.8995157182216644, "num_tokens": 480909233.0, "sample_num_tokens": 7858.75, "step": 1776, "total_num_tokens": 480940668.0, "z_loss": 0.0008997984114103019 }, { "copy_logits_max": -6.175484657287598, "copy_logits_min": -750000000.0, "copy_num_tokens": 279.5, "epoch": 0.36293081439877456, "gen_logits_max": 7.3840837478637695, "gen_logits_mean": -11.534870147705078, "gen_logits_min": -23.535781860351562, "gen_logits_std": 2.481644630432129, "gen_loss": 0.39556294679641724, "grad_norm": 0.5214466558054514, "learning_rate": 2.8685052631578946e-05, "loss": 0.3392, "mean_copy_accuracy": 0.9926621168851852, "mean_gen_accuracy": 0.8557293713092804, "mean_token_accuracy": 0.8887933939695358, "num_tokens": 481171891.0, "sample_num_tokens": 6592.75, "step": 1777, "total_num_tokens": 481198262.0, "z_loss": 0.0010827038204297423 }, { "copy_logits_max": -5.171665191650391, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.375, "epoch": 0.36313505233597143, "gen_logits_max": 6.595641613006592, "gen_logits_mean": -12.520606994628906, "gen_logits_min": -24.206491470336914, "gen_logits_std": 2.4507570266723633, "gen_loss": 0.36873728036880493, "grad_norm": 0.5057806428865949, "learning_rate": 2.8683789473684214e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9919548034667969, "mean_gen_accuracy": 0.8557502031326294, "mean_token_accuracy": 0.8856061547994614, "num_tokens": 481413199.0, "sample_num_tokens": 8195.75, "step": 1778, "total_num_tokens": 481445982.0, "z_loss": 0.0009250955190509558 }, { "copy_logits_max": -6.773197174072266, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.0625, "epoch": 0.36333929027316825, "gen_logits_max": 7.15194034576416, "gen_logits_mean": -10.851005554199219, "gen_logits_min": -22.753833770751953, "gen_logits_std": 2.4747610092163086, "gen_loss": 0.3613247573375702, "grad_norm": 0.6162260967431548, "learning_rate": 2.8682526315789475e-05, "loss": 0.3618, "mean_copy_accuracy": 0.9920366406440735, "mean_gen_accuracy": 0.8531085401773453, "mean_token_accuracy": 0.8830056041479111, "num_tokens": 481683801.0, "sample_num_tokens": 8190.25, "step": 1779, "total_num_tokens": 481716562.0, "z_loss": 0.0010266926838085055 }, { "copy_logits_max": -4.526674747467041, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.9375, "epoch": 0.36354352821036506, "gen_logits_max": 6.66987943649292, "gen_logits_mean": -12.149504661560059, "gen_logits_min": -24.284048080444336, "gen_logits_std": 2.4405574798583984, "gen_loss": 0.35760432481765747, "grad_norm": 0.5362750056297189, "learning_rate": 2.868126315789474e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9927300065755844, "mean_gen_accuracy": 0.8602596521377563, "mean_token_accuracy": 0.8906527012586594, "num_tokens": 481963903.0, "sample_num_tokens": 8260.25, "step": 1780, "total_num_tokens": 481996944.0, "z_loss": 0.0009865534957498312 }, { "copy_logits_max": -4.411931037902832, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.5625, "epoch": 0.36374776614756194, "gen_logits_max": 7.005531311035156, "gen_logits_mean": -11.097813606262207, "gen_logits_min": -22.979904174804688, "gen_logits_std": 2.4909067153930664, "gen_loss": 0.3167029023170471, "grad_norm": 0.4567106471115607, "learning_rate": 2.868e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9928062111139297, "mean_gen_accuracy": 0.8628396093845367, "mean_token_accuracy": 0.8989739418029785, "num_tokens": 482243095.0, "sample_num_tokens": 8804.75, "step": 1781, "total_num_tokens": 482278314.0, "z_loss": 0.001164182205684483 }, { "copy_logits_max": -5.0530619621276855, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.0625, "epoch": 0.36395200408475875, "gen_logits_max": 6.8826093673706055, "gen_logits_mean": -11.698115348815918, "gen_logits_min": -23.72120475769043, "gen_logits_std": 2.49170184135437, "gen_loss": 0.3601406514644623, "grad_norm": 0.47283983539600055, "learning_rate": 2.8678736842105264e-05, "loss": 0.3429, "mean_copy_accuracy": 0.9937494546175003, "mean_gen_accuracy": 0.8495019972324371, "mean_token_accuracy": 0.884476363658905, "num_tokens": 482516481.0, "sample_num_tokens": 8329.75, "step": 1782, "total_num_tokens": 482549800.0, "z_loss": 0.0010768957436084747 }, { "copy_logits_max": -4.7544050216674805, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.75, "epoch": 0.36415624202195557, "gen_logits_max": 6.168469429016113, "gen_logits_mean": -12.176284790039062, "gen_logits_min": -24.17011070251465, "gen_logits_std": 2.4866511821746826, "gen_loss": 0.3012387752532959, "grad_norm": 0.5237144648336687, "learning_rate": 2.8677473684210525e-05, "loss": 0.3363, "mean_copy_accuracy": 0.9901824295520782, "mean_gen_accuracy": 0.8610502630472183, "mean_token_accuracy": 0.8885250836610794, "num_tokens": 482756639.0, "sample_num_tokens": 8476.25, "step": 1783, "total_num_tokens": 482790544.0, "z_loss": 0.0010108971036970615 }, { "copy_logits_max": -4.939764022827148, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.4375, "epoch": 0.36436047995915244, "gen_logits_max": 6.560494422912598, "gen_logits_mean": -12.647697448730469, "gen_logits_min": -23.866788864135742, "gen_logits_std": 2.43497371673584, "gen_loss": 0.35318630933761597, "grad_norm": 0.48255643685674376, "learning_rate": 2.867621052631579e-05, "loss": 0.3158, "mean_copy_accuracy": 0.9943011105060577, "mean_gen_accuracy": 0.8622086048126221, "mean_token_accuracy": 0.8966917097568512, "num_tokens": 483023222.0, "sample_num_tokens": 7553.5, "step": 1784, "total_num_tokens": 483053436.0, "z_loss": 0.0009902464225888252 }, { "copy_logits_max": -4.480919361114502, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.1875, "epoch": 0.36456471789634926, "gen_logits_max": 7.079702854156494, "gen_logits_mean": -11.638306617736816, "gen_logits_min": -23.283397674560547, "gen_logits_std": 2.4027576446533203, "gen_loss": 0.31306833028793335, "grad_norm": 0.48186015492546336, "learning_rate": 2.8674947368421054e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9930288642644882, "mean_gen_accuracy": 0.8604910373687744, "mean_token_accuracy": 0.893071323633194, "num_tokens": 483272778.0, "sample_num_tokens": 9055.5, "step": 1785, "total_num_tokens": 483309000.0, "z_loss": 0.0010246406309306622 }, { "copy_logits_max": -3.0264925956726074, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.3125, "epoch": 0.36476895583354607, "gen_logits_max": 6.7667555809021, "gen_logits_mean": -11.090757369995117, "gen_logits_min": -23.43454360961914, "gen_logits_std": 2.555251121520996, "gen_loss": 0.33153969049453735, "grad_norm": 0.4648670358497721, "learning_rate": 2.8673684210526318e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9930868893861771, "mean_gen_accuracy": 0.8625038862228394, "mean_token_accuracy": 0.8966459631919861, "num_tokens": 483564219.0, "sample_num_tokens": 9280.25, "step": 1786, "total_num_tokens": 483601340.0, "z_loss": 0.0010511480504646897 }, { "copy_logits_max": -6.090517520904541, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.125, "epoch": 0.36497319377074294, "gen_logits_max": 7.575379848480225, "gen_logits_mean": -11.905774116516113, "gen_logits_min": -23.022823333740234, "gen_logits_std": 2.4282636642456055, "gen_loss": 0.3811540901660919, "grad_norm": 0.47599902637542346, "learning_rate": 2.8672421052631582e-05, "loss": 0.3474, "mean_copy_accuracy": 0.9913789182901382, "mean_gen_accuracy": 0.8563120365142822, "mean_token_accuracy": 0.8846699148416519, "num_tokens": 483845016.0, "sample_num_tokens": 7476.5, "step": 1787, "total_num_tokens": 483874922.0, "z_loss": 0.001096346415579319 }, { "copy_logits_max": -2.378901720046997, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.125, "epoch": 0.36517743170793976, "gen_logits_max": 7.708424091339111, "gen_logits_mean": -10.958272933959961, "gen_logits_min": -23.045307159423828, "gen_logits_std": 2.589665412902832, "gen_loss": 0.3202993869781494, "grad_norm": 0.46874300556996723, "learning_rate": 2.8671157894736843e-05, "loss": 0.3392, "mean_copy_accuracy": 0.9912764132022858, "mean_gen_accuracy": 0.8564816862344742, "mean_token_accuracy": 0.8887660056352615, "num_tokens": 484092349.0, "sample_num_tokens": 7567.25, "step": 1788, "total_num_tokens": 484122618.0, "z_loss": 0.0011296711163595319 }, { "copy_logits_max": -3.639204740524292, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.4375, "epoch": 0.3653816696451366, "gen_logits_max": 7.231914043426514, "gen_logits_mean": -11.635517120361328, "gen_logits_min": -23.745784759521484, "gen_logits_std": 2.5479989051818848, "gen_loss": 0.3363618850708008, "grad_norm": 0.512352819901513, "learning_rate": 2.8669894736842108e-05, "loss": 0.3581, "mean_copy_accuracy": 0.9913151562213898, "mean_gen_accuracy": 0.8547049760818481, "mean_token_accuracy": 0.883626326918602, "num_tokens": 484335592.0, "sample_num_tokens": 7253.5, "step": 1789, "total_num_tokens": 484364606.0, "z_loss": 0.0012493932154029608 }, { "copy_logits_max": -4.915469169616699, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.1875, "epoch": 0.3655859075823334, "gen_logits_max": 7.0116119384765625, "gen_logits_mean": -10.943856239318848, "gen_logits_min": -22.609081268310547, "gen_logits_std": 2.508045196533203, "gen_loss": 0.3118554353713989, "grad_norm": 0.45815353098161254, "learning_rate": 2.866863157894737e-05, "loss": 0.342, "mean_copy_accuracy": 0.9926480203866959, "mean_gen_accuracy": 0.8562641739845276, "mean_token_accuracy": 0.890865370631218, "num_tokens": 484590380.0, "sample_num_tokens": 9291.5, "step": 1790, "total_num_tokens": 484627546.0, "z_loss": 0.0010835619177669287 }, { "copy_logits_max": -6.374105453491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.1875, "epoch": 0.36579014551953026, "gen_logits_max": 7.859401702880859, "gen_logits_mean": -11.585816383361816, "gen_logits_min": -23.335308074951172, "gen_logits_std": 2.5054211616516113, "gen_loss": 0.33809158205986023, "grad_norm": 0.4661579569398329, "learning_rate": 2.8667368421052633e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9926308840513229, "mean_gen_accuracy": 0.8574342876672745, "mean_token_accuracy": 0.8940242379903793, "num_tokens": 484900384.0, "sample_num_tokens": 9982.5, "step": 1791, "total_num_tokens": 484940314.0, "z_loss": 0.001048337435349822 }, { "copy_logits_max": -4.893955230712891, "copy_logits_min": -625000000.0, "copy_num_tokens": 446.5, "epoch": 0.3659943834567271, "gen_logits_max": 7.753783226013184, "gen_logits_mean": -10.978801727294922, "gen_logits_min": -22.387203216552734, "gen_logits_std": 2.464322566986084, "gen_loss": 0.3328728675842285, "grad_norm": 0.45783025493421925, "learning_rate": 2.8666105263157894e-05, "loss": 0.315, "mean_copy_accuracy": 0.9935271888971329, "mean_gen_accuracy": 0.8626008629798889, "mean_token_accuracy": 0.8983809351921082, "num_tokens": 485197600.0, "sample_num_tokens": 8624.5, "step": 1792, "total_num_tokens": 485232098.0, "z_loss": 0.0010474445298314095 }, { "copy_logits_max": -5.394833087921143, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.875, "epoch": 0.3661986213939239, "gen_logits_max": 5.660787105560303, "gen_logits_mean": -13.821575164794922, "gen_logits_min": -25.424701690673828, "gen_logits_std": 2.489496946334839, "gen_loss": 0.3050263524055481, "grad_norm": 0.43137041530979303, "learning_rate": 2.8664842105263158e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9919706881046295, "mean_gen_accuracy": 0.861846387386322, "mean_token_accuracy": 0.8962295204401016, "num_tokens": 485481333.0, "sample_num_tokens": 8285.75, "step": 1793, "total_num_tokens": 485514476.0, "z_loss": 0.0008425511769019067 }, { "copy_logits_max": -4.85167932510376, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.125, "epoch": 0.36640285933112077, "gen_logits_max": 6.761599540710449, "gen_logits_mean": -12.812387466430664, "gen_logits_min": -24.823543548583984, "gen_logits_std": 2.5252175331115723, "gen_loss": 0.2895006239414215, "grad_norm": 0.44081384628884784, "learning_rate": 2.8663578947368422e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9945678114891052, "mean_gen_accuracy": 0.8644676506519318, "mean_token_accuracy": 0.8962305784225464, "num_tokens": 485743788.0, "sample_num_tokens": 8823.5, "step": 1794, "total_num_tokens": 485779082.0, "z_loss": 0.0009214571327902377 }, { "copy_logits_max": -5.057042121887207, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.5, "epoch": 0.3666070972683176, "gen_logits_max": 7.607908725738525, "gen_logits_mean": -11.61392593383789, "gen_logits_min": -23.232179641723633, "gen_logits_std": 2.4691030979156494, "gen_loss": 0.37192779779434204, "grad_norm": 0.543410247916968, "learning_rate": 2.8662315789473687e-05, "loss": 0.3356, "mean_copy_accuracy": 0.9915368854999542, "mean_gen_accuracy": 0.8623288422822952, "mean_token_accuracy": 0.8889358937740326, "num_tokens": 486012062.0, "sample_num_tokens": 7619.5, "step": 1795, "total_num_tokens": 486042540.0, "z_loss": 0.000932426075451076 }, { "copy_logits_max": -4.6551008224487305, "copy_logits_min": -750000064.0, "copy_num_tokens": 421.5, "epoch": 0.3668113352055144, "gen_logits_max": 7.262310028076172, "gen_logits_mean": -12.272589683532715, "gen_logits_min": -23.951297760009766, "gen_logits_std": 2.4887876510620117, "gen_loss": 0.3356919288635254, "grad_norm": 0.592448212420573, "learning_rate": 2.8661052631578948e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9906992167234421, "mean_gen_accuracy": 0.8643172681331635, "mean_token_accuracy": 0.8933926075696945, "num_tokens": 486289071.0, "sample_num_tokens": 8179.25, "step": 1796, "total_num_tokens": 486321788.0, "z_loss": 0.0008998406119644642 }, { "copy_logits_max": -5.73463249206543, "copy_logits_min": -750000128.0, "copy_num_tokens": 465.25, "epoch": 0.36701557314271127, "gen_logits_max": 7.308837890625, "gen_logits_mean": -11.412158012390137, "gen_logits_min": -23.121137619018555, "gen_logits_std": 2.474691152572632, "gen_loss": 0.30582720041275024, "grad_norm": 0.548156454406356, "learning_rate": 2.8659789473684212e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9917545914649963, "mean_gen_accuracy": 0.8623240888118744, "mean_token_accuracy": 0.8961774259805679, "num_tokens": 486562325.0, "sample_num_tokens": 8774.25, "step": 1797, "total_num_tokens": 486597422.0, "z_loss": 0.0010035057784989476 }, { "copy_logits_max": -3.6568515300750732, "copy_logits_min": -687500032.0, "copy_num_tokens": 407.875, "epoch": 0.3672198110799081, "gen_logits_max": 7.365605354309082, "gen_logits_mean": -12.22620964050293, "gen_logits_min": -24.240188598632812, "gen_logits_std": 2.555583953857422, "gen_loss": 0.344778448343277, "grad_norm": 0.49772001461353926, "learning_rate": 2.8658526315789473e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9940803349018097, "mean_gen_accuracy": 0.8548087179660797, "mean_token_accuracy": 0.8912153542041779, "num_tokens": 486829693.0, "sample_num_tokens": 7615.25, "step": 1798, "total_num_tokens": 486860154.0, "z_loss": 0.0010983939282596111 }, { "copy_logits_max": -1.513790488243103, "copy_logits_min": -625000064.0, "copy_num_tokens": 621.6875, "epoch": 0.3674240490171049, "gen_logits_max": 7.158689498901367, "gen_logits_mean": -11.063308715820312, "gen_logits_min": -23.593307495117188, "gen_logits_std": 2.5485098361968994, "gen_loss": 0.3420121371746063, "grad_norm": 0.4647998838695434, "learning_rate": 2.8657263157894737e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9927594512701035, "mean_gen_accuracy": 0.8533817231655121, "mean_token_accuracy": 0.8895355314016342, "num_tokens": 487100522.0, "sample_num_tokens": 8892.0, "step": 1799, "total_num_tokens": 487136090.0, "z_loss": 0.0013875606236979365 }, { "copy_logits_max": -4.093874931335449, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.375, "epoch": 0.3676282869543018, "gen_logits_max": 6.727168083190918, "gen_logits_mean": -12.191689491271973, "gen_logits_min": -24.086509704589844, "gen_logits_std": 2.5068037509918213, "gen_loss": 0.3418263792991638, "grad_norm": 0.4863053442155151, "learning_rate": 2.8656e-05, "loss": 0.3373, "mean_copy_accuracy": 0.9918865114450455, "mean_gen_accuracy": 0.857378214597702, "mean_token_accuracy": 0.8913056403398514, "num_tokens": 487374823.0, "sample_num_tokens": 7950.25, "step": 1800, "total_num_tokens": 487406624.0, "z_loss": 0.0012291987659409642 }, { "copy_logits_max": -3.0712900161743164, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.75, "epoch": 0.3678325248914986, "gen_logits_max": 7.041728973388672, "gen_logits_mean": -11.994163513183594, "gen_logits_min": -23.825000762939453, "gen_logits_std": 2.550642967224121, "gen_loss": 0.31309032440185547, "grad_norm": 0.4291270552057982, "learning_rate": 2.8654736842105262e-05, "loss": 0.3264, "mean_copy_accuracy": 0.9930866807699203, "mean_gen_accuracy": 0.8586168736219406, "mean_token_accuracy": 0.892842710018158, "num_tokens": 487643066.0, "sample_num_tokens": 8470.5, "step": 1801, "total_num_tokens": 487676948.0, "z_loss": 0.0012009057682007551 }, { "copy_logits_max": -3.737030506134033, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.6875, "epoch": 0.3680367628286954, "gen_logits_max": 7.074855804443359, "gen_logits_mean": -11.994844436645508, "gen_logits_min": -24.12519645690918, "gen_logits_std": 2.5077531337738037, "gen_loss": 0.33856040239334106, "grad_norm": 0.49976190195350584, "learning_rate": 2.8653473684210527e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9925132989883423, "mean_gen_accuracy": 0.8547492027282715, "mean_token_accuracy": 0.8867047131061554, "num_tokens": 487914910.0, "sample_num_tokens": 9606.0, "step": 1802, "total_num_tokens": 487953334.0, "z_loss": 0.0011840576771646738 }, { "copy_logits_max": -2.645188331604004, "copy_logits_min": -687500032.0, "copy_num_tokens": 488.6875, "epoch": 0.3682410007658923, "gen_logits_max": 6.824234485626221, "gen_logits_mean": -10.784358978271484, "gen_logits_min": -22.770366668701172, "gen_logits_std": 2.5862629413604736, "gen_loss": 0.32320094108581543, "grad_norm": 0.7156839939372323, "learning_rate": 2.865221052631579e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9896510094404221, "mean_gen_accuracy": 0.8587232828140259, "mean_token_accuracy": 0.8906200230121613, "num_tokens": 488174817.0, "sample_num_tokens": 8038.75, "step": 1803, "total_num_tokens": 488206972.0, "z_loss": 0.0011259515304118395 }, { "copy_logits_max": -4.4550933837890625, "copy_logits_min": -687500032.0, "copy_num_tokens": 813.5, "epoch": 0.3684452387030891, "gen_logits_max": 6.544398307800293, "gen_logits_mean": -12.764704704284668, "gen_logits_min": -24.576988220214844, "gen_logits_std": 2.501739978790283, "gen_loss": 0.2944686710834503, "grad_norm": 0.5163922203235436, "learning_rate": 2.8650947368421055e-05, "loss": 0.3164, "mean_copy_accuracy": 0.991335541009903, "mean_gen_accuracy": 0.8706580847501755, "mean_token_accuracy": 0.8982997834682465, "num_tokens": 488463648.0, "sample_num_tokens": 11665.5, "step": 1804, "total_num_tokens": 488510310.0, "z_loss": 0.0010381565662100911 }, { "copy_logits_max": -3.9675979614257812, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.3125, "epoch": 0.3686494766402859, "gen_logits_max": 7.365166664123535, "gen_logits_mean": -12.56100082397461, "gen_logits_min": -23.968374252319336, "gen_logits_std": 2.4818477630615234, "gen_loss": 0.3619735538959503, "grad_norm": 0.504127424534257, "learning_rate": 2.8649684210526316e-05, "loss": 0.3397, "mean_copy_accuracy": 0.9912737160921097, "mean_gen_accuracy": 0.8570446968078613, "mean_token_accuracy": 0.8899595737457275, "num_tokens": 488741143.0, "sample_num_tokens": 7792.75, "step": 1805, "total_num_tokens": 488772314.0, "z_loss": 0.0011220057494938374 }, { "copy_logits_max": -5.712141990661621, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.4375, "epoch": 0.3688537145774828, "gen_logits_max": 7.43790864944458, "gen_logits_mean": -11.538493156433105, "gen_logits_min": -22.67801856994629, "gen_logits_std": 2.4260053634643555, "gen_loss": 0.3401140570640564, "grad_norm": 0.5951383108882381, "learning_rate": 2.864842105263158e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9922746419906616, "mean_gen_accuracy": 0.8588375747203827, "mean_token_accuracy": 0.8946111351251602, "num_tokens": 489039231.0, "sample_num_tokens": 7466.75, "step": 1806, "total_num_tokens": 489069098.0, "z_loss": 0.001041912822984159 }, { "copy_logits_max": -4.96172571182251, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.75, "epoch": 0.3690579525146796, "gen_logits_max": 8.285423278808594, "gen_logits_mean": -11.233687400817871, "gen_logits_min": -22.499168395996094, "gen_logits_std": 2.4691686630249023, "gen_loss": 0.4168180823326111, "grad_norm": 0.4997166091499666, "learning_rate": 2.864715789473684e-05, "loss": 0.3557, "mean_copy_accuracy": 0.9910313636064529, "mean_gen_accuracy": 0.8561552613973618, "mean_token_accuracy": 0.885050818324089, "num_tokens": 489292210.0, "sample_num_tokens": 8575.5, "step": 1807, "total_num_tokens": 489326512.0, "z_loss": 0.0011565728345885873 }, { "copy_logits_max": -3.9161174297332764, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.5, "epoch": 0.3692621904518764, "gen_logits_max": 6.669617652893066, "gen_logits_mean": -13.200966835021973, "gen_logits_min": -24.754451751708984, "gen_logits_std": 2.467484474182129, "gen_loss": 0.33583158254623413, "grad_norm": 0.6118534040570576, "learning_rate": 2.8645894736842106e-05, "loss": 0.3293, "mean_copy_accuracy": 0.989608034491539, "mean_gen_accuracy": 0.8590121269226074, "mean_token_accuracy": 0.8927904814481735, "num_tokens": 489557072.0, "sample_num_tokens": 9555.0, "step": 1808, "total_num_tokens": 489595292.0, "z_loss": 0.0009854332311078906 }, { "copy_logits_max": -6.1816205978393555, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.75, "epoch": 0.3694664283890733, "gen_logits_max": 6.823349475860596, "gen_logits_mean": -13.278772354125977, "gen_logits_min": -24.576459884643555, "gen_logits_std": 2.44895076751709, "gen_loss": 0.3078118562698364, "grad_norm": 0.4839589502121174, "learning_rate": 2.8644631578947367e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9930785894393921, "mean_gen_accuracy": 0.8612823188304901, "mean_token_accuracy": 0.8896737545728683, "num_tokens": 489821711.0, "sample_num_tokens": 9147.25, "step": 1809, "total_num_tokens": 489858300.0, "z_loss": 0.001037808135151863 }, { "copy_logits_max": -2.6185271739959717, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.125, "epoch": 0.3696706663262701, "gen_logits_max": 8.583986282348633, "gen_logits_mean": -11.783016204833984, "gen_logits_min": -23.170793533325195, "gen_logits_std": 2.4788737297058105, "gen_loss": 0.3303051292896271, "grad_norm": 0.5133266754222559, "learning_rate": 2.864336842105263e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9921364635229111, "mean_gen_accuracy": 0.861169159412384, "mean_token_accuracy": 0.8925239741802216, "num_tokens": 490099855.0, "sample_num_tokens": 7679.25, "step": 1810, "total_num_tokens": 490130572.0, "z_loss": 0.0012569739483296871 }, { "copy_logits_max": -6.447248935699463, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.6875, "epoch": 0.3698749042634669, "gen_logits_max": 7.75020694732666, "gen_logits_mean": -10.820779800415039, "gen_logits_min": -22.7957763671875, "gen_logits_std": 2.505666732788086, "gen_loss": 0.3139941692352295, "grad_norm": 0.40600118090265164, "learning_rate": 2.8642105263157895e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9932060092687607, "mean_gen_accuracy": 0.8733426332473755, "mean_token_accuracy": 0.9091023355722427, "num_tokens": 490432097.0, "sample_num_tokens": 10208.25, "step": 1811, "total_num_tokens": 490472930.0, "z_loss": 0.0011002968531101942 }, { "copy_logits_max": -1.6653118133544922, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.9375, "epoch": 0.3700791422006638, "gen_logits_max": 7.373918533325195, "gen_logits_mean": -11.475359916687012, "gen_logits_min": -23.345882415771484, "gen_logits_std": 2.5075082778930664, "gen_loss": 0.37521982192993164, "grad_norm": 0.5190455450232873, "learning_rate": 2.864084210526316e-05, "loss": 0.3335, "mean_copy_accuracy": 0.9919234961271286, "mean_gen_accuracy": 0.8535841107368469, "mean_token_accuracy": 0.8930229544639587, "num_tokens": 490731301.0, "sample_num_tokens": 8374.25, "step": 1812, "total_num_tokens": 490764798.0, "z_loss": 0.0012981618056073785 }, { "copy_logits_max": -4.508122444152832, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.8125, "epoch": 0.3702833801378606, "gen_logits_max": 7.697549343109131, "gen_logits_mean": -11.945819854736328, "gen_logits_min": -23.524578094482422, "gen_logits_std": 2.4844114780426025, "gen_loss": 0.2787322700023651, "grad_norm": 0.46108948169042296, "learning_rate": 2.8639578947368424e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9922254681587219, "mean_gen_accuracy": 0.865684986114502, "mean_token_accuracy": 0.8991645723581314, "num_tokens": 491004337.0, "sample_num_tokens": 8219.25, "step": 1813, "total_num_tokens": 491037214.0, "z_loss": 0.0009350080508738756 }, { "copy_logits_max": -5.583163261413574, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.0625, "epoch": 0.3704876180750574, "gen_logits_max": 6.742066383361816, "gen_logits_mean": -13.252508163452148, "gen_logits_min": -24.84151840209961, "gen_logits_std": 2.488497257232666, "gen_loss": 0.30968350172042847, "grad_norm": 0.48615055072622904, "learning_rate": 2.8638315789473685e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9924252182245255, "mean_gen_accuracy": 0.859487920999527, "mean_token_accuracy": 0.8922741711139679, "num_tokens": 491291574.0, "sample_num_tokens": 8374.5, "step": 1814, "total_num_tokens": 491325072.0, "z_loss": 0.0009241023217327893 }, { "copy_logits_max": -1.2727208137512207, "copy_logits_min": -687500032.0, "copy_num_tokens": 758.875, "epoch": 0.3706918560122543, "gen_logits_max": 6.374077796936035, "gen_logits_mean": -11.994829177856445, "gen_logits_min": -24.006893157958984, "gen_logits_std": 2.5212249755859375, "gen_loss": 0.2965966463088989, "grad_norm": 0.4937518915821822, "learning_rate": 2.863705263157895e-05, "loss": 0.3308, "mean_copy_accuracy": 0.9915442615747452, "mean_gen_accuracy": 0.8595924526453018, "mean_token_accuracy": 0.8930899351835251, "num_tokens": 491544216.0, "sample_num_tokens": 10541.0, "step": 1815, "total_num_tokens": 491586380.0, "z_loss": 0.0010649797040969133 }, { "copy_logits_max": -2.1592750549316406, "copy_logits_min": -750000000.0, "copy_num_tokens": 770.0, "epoch": 0.3708960939494511, "gen_logits_max": 6.0071940422058105, "gen_logits_mean": -11.276756286621094, "gen_logits_min": -23.733291625976562, "gen_logits_std": 2.5731353759765625, "gen_loss": 0.30695176124572754, "grad_norm": 0.47372514384065384, "learning_rate": 2.863578947368421e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9908061623573303, "mean_gen_accuracy": 0.8602383434772491, "mean_token_accuracy": 0.8882130682468414, "num_tokens": 491818741.0, "sample_num_tokens": 10759.25, "step": 1816, "total_num_tokens": 491861778.0, "z_loss": 0.0010674626100808382 }, { "copy_logits_max": -5.277374267578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.0, "epoch": 0.3711003318866479, "gen_logits_max": 6.727764129638672, "gen_logits_mean": -14.022980690002441, "gen_logits_min": -25.525672912597656, "gen_logits_std": 2.451611280441284, "gen_loss": 0.3379769027233124, "grad_norm": 0.4754698941967975, "learning_rate": 2.8634526315789474e-05, "loss": 0.3424, "mean_copy_accuracy": 0.9909886866807938, "mean_gen_accuracy": 0.8577532917261124, "mean_token_accuracy": 0.8872715830802917, "num_tokens": 492110184.0, "sample_num_tokens": 7510.5, "step": 1817, "total_num_tokens": 492140226.0, "z_loss": 0.0008688884554430842 }, { "copy_logits_max": -4.620473861694336, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.0, "epoch": 0.3713045698238448, "gen_logits_max": 7.4190521240234375, "gen_logits_mean": -12.242193222045898, "gen_logits_min": -23.923606872558594, "gen_logits_std": 2.4977657794952393, "gen_loss": 0.3217945694923401, "grad_norm": 0.5123448963757055, "learning_rate": 2.8633263157894735e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9917448163032532, "mean_gen_accuracy": 0.8581462949514389, "mean_token_accuracy": 0.8935935497283936, "num_tokens": 492377630.0, "sample_num_tokens": 8495.5, "step": 1818, "total_num_tokens": 492411612.0, "z_loss": 0.0009075538837350905 }, { "copy_logits_max": -4.126288414001465, "copy_logits_min": -750000064.0, "copy_num_tokens": 665.1875, "epoch": 0.3715088077610416, "gen_logits_max": 6.489161491394043, "gen_logits_mean": -12.2517671585083, "gen_logits_min": -23.996173858642578, "gen_logits_std": 2.5162734985351562, "gen_loss": 0.27908700704574585, "grad_norm": 0.4871916482094595, "learning_rate": 2.8632000000000003e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9941014349460602, "mean_gen_accuracy": 0.8568793386220932, "mean_token_accuracy": 0.8927890509366989, "num_tokens": 492646584.0, "sample_num_tokens": 9376.0, "step": 1819, "total_num_tokens": 492684088.0, "z_loss": 0.000924013031180948 }, { "copy_logits_max": -5.557590484619141, "copy_logits_min": -687500032.0, "copy_num_tokens": 277.75, "epoch": 0.37171304569823843, "gen_logits_max": 6.781176567077637, "gen_logits_mean": -13.251062393188477, "gen_logits_min": -24.63519287109375, "gen_logits_std": 2.452089309692383, "gen_loss": 0.3002570569515228, "grad_norm": 0.511823372932725, "learning_rate": 2.8630736842105264e-05, "loss": 0.3287, "mean_copy_accuracy": 0.9923064708709717, "mean_gen_accuracy": 0.8566940873861313, "mean_token_accuracy": 0.8908075839281082, "num_tokens": 492928605.0, "sample_num_tokens": 6932.25, "step": 1820, "total_num_tokens": 492956334.0, "z_loss": 0.0008646309142932296 }, { "copy_logits_max": -4.501538276672363, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.25, "epoch": 0.3719172836354353, "gen_logits_max": 7.852673053741455, "gen_logits_mean": -11.073685646057129, "gen_logits_min": -22.475357055664062, "gen_logits_std": 2.460761785507202, "gen_loss": 0.34006011486053467, "grad_norm": 0.5636652005375136, "learning_rate": 2.8629473684210528e-05, "loss": 0.3484, "mean_copy_accuracy": 0.9925045669078827, "mean_gen_accuracy": 0.8495383560657501, "mean_token_accuracy": 0.8863863497972488, "num_tokens": 493213443.0, "sample_num_tokens": 8748.75, "step": 1821, "total_num_tokens": 493248438.0, "z_loss": 0.0009246282861568034 }, { "copy_logits_max": -3.9680914878845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.0, "epoch": 0.3721215215726321, "gen_logits_max": 7.2108964920043945, "gen_logits_mean": -12.310006141662598, "gen_logits_min": -23.859272003173828, "gen_logits_std": 2.46781063079834, "gen_loss": 0.351149320602417, "grad_norm": 0.466570466663156, "learning_rate": 2.862821052631579e-05, "loss": 0.3311, "mean_copy_accuracy": 0.9918041378259659, "mean_gen_accuracy": 0.8586940169334412, "mean_token_accuracy": 0.8923742771148682, "num_tokens": 493489859.0, "sample_num_tokens": 7232.25, "step": 1822, "total_num_tokens": 493518788.0, "z_loss": 0.0010554140899330378 }, { "copy_logits_max": -4.490716457366943, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.8125, "epoch": 0.37232575950982894, "gen_logits_max": 7.22512149810791, "gen_logits_mean": -11.27437973022461, "gen_logits_min": -23.043216705322266, "gen_logits_std": 2.415256977081299, "gen_loss": 0.29406818747520447, "grad_norm": 0.4977513221611176, "learning_rate": 2.8626947368421053e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9922575205564499, "mean_gen_accuracy": 0.8662640005350113, "mean_token_accuracy": 0.8967341035604477, "num_tokens": 493740765.0, "sample_num_tokens": 8267.75, "step": 1823, "total_num_tokens": 493773836.0, "z_loss": 0.0010063748341053724 }, { "copy_logits_max": -4.516156196594238, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5625, "epoch": 0.3725299974470258, "gen_logits_max": 6.738983154296875, "gen_logits_mean": -12.202179908752441, "gen_logits_min": -23.657608032226562, "gen_logits_std": 2.444577693939209, "gen_loss": 0.3040412962436676, "grad_norm": 0.4585548662270144, "learning_rate": 2.8625684210526314e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9925731271505356, "mean_gen_accuracy": 0.8643272221088409, "mean_token_accuracy": 0.8971139937639236, "num_tokens": 494030694.0, "sample_num_tokens": 7860.0, "step": 1824, "total_num_tokens": 494062134.0, "z_loss": 0.0010731879156082869 }, { "copy_logits_max": -3.79510498046875, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.125, "epoch": 0.3727342353842226, "gen_logits_max": 6.870058536529541, "gen_logits_mean": -12.321468353271484, "gen_logits_min": -23.9744873046875, "gen_logits_std": 2.491882085800171, "gen_loss": 0.3405410051345825, "grad_norm": 0.5224027954169738, "learning_rate": 2.862442105263158e-05, "loss": 0.328, "mean_copy_accuracy": 0.9925063103437424, "mean_gen_accuracy": 0.8608643412590027, "mean_token_accuracy": 0.8943780660629272, "num_tokens": 494302569.0, "sample_num_tokens": 9982.25, "step": 1825, "total_num_tokens": 494342498.0, "z_loss": 0.0011739605106413364 }, { "copy_logits_max": -3.1359870433807373, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.3125, "epoch": 0.37293847332141944, "gen_logits_max": 7.216498851776123, "gen_logits_mean": -12.046670913696289, "gen_logits_min": -23.70362091064453, "gen_logits_std": 2.4766204357147217, "gen_loss": 0.35236552357673645, "grad_norm": 0.48665867573152166, "learning_rate": 2.8623157894736843e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9928402155637741, "mean_gen_accuracy": 0.8558801114559174, "mean_token_accuracy": 0.8915455043315887, "num_tokens": 494564954.0, "sample_num_tokens": 8555.0, "step": 1826, "total_num_tokens": 494599174.0, "z_loss": 0.0012388816103339195 }, { "copy_logits_max": -3.779597759246826, "copy_logits_min": -750000064.0, "copy_num_tokens": 577.0, "epoch": 0.3731427112586163, "gen_logits_max": 7.216454982757568, "gen_logits_mean": -10.611291885375977, "gen_logits_min": -22.39822769165039, "gen_logits_std": 2.4724345207214355, "gen_loss": 0.3284197449684143, "grad_norm": 0.5191281747912495, "learning_rate": 2.8621894736842107e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9930033087730408, "mean_gen_accuracy": 0.8553394824266434, "mean_token_accuracy": 0.8919981271028519, "num_tokens": 494821116.0, "sample_num_tokens": 8233.5, "step": 1827, "total_num_tokens": 494854050.0, "z_loss": 0.0012337756343185902 }, { "copy_logits_max": -2.8333468437194824, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.5625, "epoch": 0.3733469491958131, "gen_logits_max": 7.030337810516357, "gen_logits_mean": -12.301996231079102, "gen_logits_min": -24.329496383666992, "gen_logits_std": 2.522339344024658, "gen_loss": 0.2962355613708496, "grad_norm": 0.4854457678630383, "learning_rate": 2.862063157894737e-05, "loss": 0.3295, "mean_copy_accuracy": 0.991754025220871, "mean_gen_accuracy": 0.8559872061014175, "mean_token_accuracy": 0.8927827477455139, "num_tokens": 495080960.0, "sample_num_tokens": 7820.5, "step": 1828, "total_num_tokens": 495112242.0, "z_loss": 0.0011485028080642223 }, { "copy_logits_max": -3.734781503677368, "copy_logits_min": -625000000.0, "copy_num_tokens": 606.875, "epoch": 0.37355118713300994, "gen_logits_max": 7.618210315704346, "gen_logits_mean": -11.274115562438965, "gen_logits_min": -23.425609588623047, "gen_logits_std": 2.520358085632324, "gen_loss": 0.30720603466033936, "grad_norm": 0.5556997024912523, "learning_rate": 2.8619368421052633e-05, "loss": 0.3492, "mean_copy_accuracy": 0.9921736121177673, "mean_gen_accuracy": 0.8477922677993774, "mean_token_accuracy": 0.8866478651762009, "num_tokens": 495344015.0, "sample_num_tokens": 8682.25, "step": 1829, "total_num_tokens": 495378744.0, "z_loss": 0.0010717649711295962 }, { "copy_logits_max": -7.467437744140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.8125, "epoch": 0.3737554250702068, "gen_logits_max": 7.3103108406066895, "gen_logits_mean": -12.272024154663086, "gen_logits_min": -23.278608322143555, "gen_logits_std": 2.393113136291504, "gen_loss": 0.36875107884407043, "grad_norm": 0.46299243655933486, "learning_rate": 2.8618105263157897e-05, "loss": 0.3414, "mean_copy_accuracy": 0.9929819852113724, "mean_gen_accuracy": 0.8567566722631454, "mean_token_accuracy": 0.8869033008813858, "num_tokens": 495603543.0, "sample_num_tokens": 8839.25, "step": 1830, "total_num_tokens": 495638900.0, "z_loss": 0.0009739490924403071 }, { "copy_logits_max": -6.890614032745361, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.25, "epoch": 0.37395966300740363, "gen_logits_max": 7.648005485534668, "gen_logits_mean": -11.597881317138672, "gen_logits_min": -23.077024459838867, "gen_logits_std": 2.459578514099121, "gen_loss": 0.4036881923675537, "grad_norm": 0.480311148511309, "learning_rate": 2.8616842105263158e-05, "loss": 0.3441, "mean_copy_accuracy": 0.9923890382051468, "mean_gen_accuracy": 0.8519959598779678, "mean_token_accuracy": 0.8865019232034683, "num_tokens": 495883810.0, "sample_num_tokens": 7905.0, "step": 1831, "total_num_tokens": 495915430.0, "z_loss": 0.0010604490526020527 }, { "copy_logits_max": -5.184202194213867, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.1875, "epoch": 0.37416390094460045, "gen_logits_max": 6.797407150268555, "gen_logits_mean": -12.552692413330078, "gen_logits_min": -23.993358612060547, "gen_logits_std": 2.4658710956573486, "gen_loss": 0.33132821321487427, "grad_norm": 0.5119032829934947, "learning_rate": 2.8615578947368422e-05, "loss": 0.3333, "mean_copy_accuracy": 0.990997388958931, "mean_gen_accuracy": 0.8556424975395203, "mean_token_accuracy": 0.8894926160573959, "num_tokens": 496145837.0, "sample_num_tokens": 8758.75, "step": 1832, "total_num_tokens": 496180872.0, "z_loss": 0.000932830385863781 }, { "copy_logits_max": -3.8614509105682373, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.4375, "epoch": 0.3743681388817973, "gen_logits_max": 7.00596284866333, "gen_logits_mean": -12.457047462463379, "gen_logits_min": -24.422576904296875, "gen_logits_std": 2.5026168823242188, "gen_loss": 0.36673882603645325, "grad_norm": 0.47597112199103514, "learning_rate": 2.8614315789473683e-05, "loss": 0.3323, "mean_copy_accuracy": 0.9915740042924881, "mean_gen_accuracy": 0.8562015444040298, "mean_token_accuracy": 0.8924018889665604, "num_tokens": 496428168.0, "sample_num_tokens": 8924.0, "step": 1833, "total_num_tokens": 496463864.0, "z_loss": 0.001046390738338232 }, { "copy_logits_max": -3.9016170501708984, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.125, "epoch": 0.37457237681899413, "gen_logits_max": 6.306265354156494, "gen_logits_mean": -13.28406047821045, "gen_logits_min": -25.008398056030273, "gen_logits_std": 2.4739060401916504, "gen_loss": 0.3278050422668457, "grad_norm": 0.5484779901327581, "learning_rate": 2.8613052631578947e-05, "loss": 0.3145, "mean_copy_accuracy": 0.9918708950281143, "mean_gen_accuracy": 0.8606695383787155, "mean_token_accuracy": 0.896127462387085, "num_tokens": 496716315.0, "sample_num_tokens": 8633.25, "step": 1834, "total_num_tokens": 496750848.0, "z_loss": 0.0010280476417392492 }, { "copy_logits_max": -3.5583033561706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.125, "epoch": 0.37477661475619095, "gen_logits_max": 6.88579797744751, "gen_logits_mean": -12.316125869750977, "gen_logits_min": -24.3728084564209, "gen_logits_std": 2.510977268218994, "gen_loss": 0.32123351097106934, "grad_norm": 0.4550995550465296, "learning_rate": 2.861178947368421e-05, "loss": 0.3412, "mean_copy_accuracy": 0.9946909546852112, "mean_gen_accuracy": 0.854502335190773, "mean_token_accuracy": 0.8872164338827133, "num_tokens": 496990684.0, "sample_num_tokens": 8161.5, "step": 1835, "total_num_tokens": 497023330.0, "z_loss": 0.0010602257680147886 }, { "copy_logits_max": -4.1432061195373535, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.4375, "epoch": 0.3749808526933878, "gen_logits_max": 6.794678688049316, "gen_logits_mean": -12.501700401306152, "gen_logits_min": -24.447925567626953, "gen_logits_std": 2.468079090118408, "gen_loss": 0.339843213558197, "grad_norm": 0.5072150403231592, "learning_rate": 2.8610526315789476e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9917101114988327, "mean_gen_accuracy": 0.8568244576454163, "mean_token_accuracy": 0.8941435366868973, "num_tokens": 497262193.0, "sample_num_tokens": 8394.75, "step": 1836, "total_num_tokens": 497295772.0, "z_loss": 0.0010017961030825973 }, { "copy_logits_max": -4.2559967041015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.1875, "epoch": 0.37518509063058464, "gen_logits_max": 6.774863243103027, "gen_logits_mean": -12.993677139282227, "gen_logits_min": -24.822172164916992, "gen_logits_std": 2.500535488128662, "gen_loss": 0.3074694871902466, "grad_norm": 0.5505214328392478, "learning_rate": 2.8609263157894737e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9914743453264236, "mean_gen_accuracy": 0.8593990206718445, "mean_token_accuracy": 0.8905239403247833, "num_tokens": 497536407.0, "sample_num_tokens": 10072.25, "step": 1837, "total_num_tokens": 497576696.0, "z_loss": 0.0010331334779039025 }, { "copy_logits_max": -3.4287517070770264, "copy_logits_min": -687500032.0, "copy_num_tokens": 578.8125, "epoch": 0.37538932856778146, "gen_logits_max": 6.6769819259643555, "gen_logits_mean": -12.416963577270508, "gen_logits_min": -24.502002716064453, "gen_logits_std": 2.5079782009124756, "gen_loss": 0.2847751975059509, "grad_norm": 0.45922580987522943, "learning_rate": 2.8608e-05, "loss": 0.3106, "mean_copy_accuracy": 0.993071123957634, "mean_gen_accuracy": 0.863361120223999, "mean_token_accuracy": 0.8981318324804306, "num_tokens": 497813751.0, "sample_num_tokens": 8376.25, "step": 1838, "total_num_tokens": 497847256.0, "z_loss": 0.0010143384570255876 }, { "copy_logits_max": -3.912585496902466, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.75, "epoch": 0.3755935665049783, "gen_logits_max": 7.38192081451416, "gen_logits_mean": -11.621076583862305, "gen_logits_min": -23.327438354492188, "gen_logits_std": 2.445566177368164, "gen_loss": 0.31586360931396484, "grad_norm": 0.43268168865014317, "learning_rate": 2.8606736842105266e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9926785528659821, "mean_gen_accuracy": 0.8608956933021545, "mean_token_accuracy": 0.8958875089883804, "num_tokens": 498109445.0, "sample_num_tokens": 9537.75, "step": 1839, "total_num_tokens": 498147596.0, "z_loss": 0.0011122514260932803 }, { "copy_logits_max": -5.544754981994629, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.4375, "epoch": 0.37579780444217514, "gen_logits_max": 6.718575477600098, "gen_logits_mean": -11.167373657226562, "gen_logits_min": -22.76005744934082, "gen_logits_std": 2.421461582183838, "gen_loss": 0.28694844245910645, "grad_norm": 0.5750505065642988, "learning_rate": 2.8605473684210526e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9920112937688828, "mean_gen_accuracy": 0.8569611757993698, "mean_token_accuracy": 0.888093888759613, "num_tokens": 498379884.0, "sample_num_tokens": 8911.0, "step": 1840, "total_num_tokens": 498415528.0, "z_loss": 0.0009860158897936344 }, { "copy_logits_max": -5.2533159255981445, "copy_logits_min": -750000064.0, "copy_num_tokens": 402.125, "epoch": 0.37600204237937196, "gen_logits_max": 7.436039924621582, "gen_logits_mean": -12.439926147460938, "gen_logits_min": -24.079496383666992, "gen_logits_std": 2.4708709716796875, "gen_loss": 0.2980997562408447, "grad_norm": 0.45992570622056145, "learning_rate": 2.860421052631579e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9927437901496887, "mean_gen_accuracy": 0.8687542825937271, "mean_token_accuracy": 0.8964679092168808, "num_tokens": 498661109.0, "sample_num_tokens": 7861.75, "step": 1841, "total_num_tokens": 498692556.0, "z_loss": 0.0008895228966139257 }, { "copy_logits_max": -5.328669548034668, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.0, "epoch": 0.3762062803165688, "gen_logits_max": 7.524394989013672, "gen_logits_mean": -11.52792739868164, "gen_logits_min": -23.182159423828125, "gen_logits_std": 2.4398927688598633, "gen_loss": 0.3914027214050293, "grad_norm": 0.5468574628218783, "learning_rate": 2.860294736842105e-05, "loss": 0.3509, "mean_copy_accuracy": 0.9911005049943924, "mean_gen_accuracy": 0.8543180823326111, "mean_token_accuracy": 0.8878635615110397, "num_tokens": 498918589.0, "sample_num_tokens": 8388.25, "step": 1842, "total_num_tokens": 498952142.0, "z_loss": 0.001042286166921258 }, { "copy_logits_max": -4.090299606323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.6875, "epoch": 0.37641051825376565, "gen_logits_max": 6.946319580078125, "gen_logits_mean": -12.68260383605957, "gen_logits_min": -24.383697509765625, "gen_logits_std": 2.427238941192627, "gen_loss": 0.32401761412620544, "grad_norm": 0.42924979749477443, "learning_rate": 2.8601684210526316e-05, "loss": 0.3177, "mean_copy_accuracy": 0.9934993535280228, "mean_gen_accuracy": 0.8618858903646469, "mean_token_accuracy": 0.8951873779296875, "num_tokens": 499174533.0, "sample_num_tokens": 8534.75, "step": 1843, "total_num_tokens": 499208672.0, "z_loss": 0.0009734950726851821 }, { "copy_logits_max": -4.4608306884765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.25, "epoch": 0.37661475619096246, "gen_logits_max": 7.3082804679870605, "gen_logits_mean": -12.393177032470703, "gen_logits_min": -23.56204605102539, "gen_logits_std": 2.409355640411377, "gen_loss": 0.3825083076953888, "grad_norm": 0.5806818844699231, "learning_rate": 2.860042105263158e-05, "loss": 0.344, "mean_copy_accuracy": 0.992847815155983, "mean_gen_accuracy": 0.8528697490692139, "mean_token_accuracy": 0.8861222863197327, "num_tokens": 499430476.0, "sample_num_tokens": 8025.5, "step": 1844, "total_num_tokens": 499462578.0, "z_loss": 0.0010551514569669962 }, { "copy_logits_max": -5.742123126983643, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.9375, "epoch": 0.3768189941281593, "gen_logits_max": 7.303638935089111, "gen_logits_mean": -12.000905990600586, "gen_logits_min": -23.592620849609375, "gen_logits_std": 2.467270612716675, "gen_loss": 0.3470272421836853, "grad_norm": 0.5055216407791452, "learning_rate": 2.8599157894736845e-05, "loss": 0.3673, "mean_copy_accuracy": 0.9927185922861099, "mean_gen_accuracy": 0.8477235436439514, "mean_token_accuracy": 0.8814176321029663, "num_tokens": 499690419.0, "sample_num_tokens": 7326.25, "step": 1845, "total_num_tokens": 499719724.0, "z_loss": 0.001028029015287757 }, { "copy_logits_max": -5.7559943199157715, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.25, "epoch": 0.37702323206535615, "gen_logits_max": 6.365466117858887, "gen_logits_mean": -12.723075866699219, "gen_logits_min": -23.8997859954834, "gen_logits_std": 2.3819069862365723, "gen_loss": 0.31746357679367065, "grad_norm": 0.9030908686360761, "learning_rate": 2.8597894736842106e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9921021461486816, "mean_gen_accuracy": 0.8529567420482635, "mean_token_accuracy": 0.8902029991149902, "num_tokens": 499971823.0, "sample_num_tokens": 8271.75, "step": 1846, "total_num_tokens": 500004910.0, "z_loss": 0.0007969563594087958 }, { "copy_logits_max": -4.526803016662598, "copy_logits_min": -687500032.0, "copy_num_tokens": 749.1875, "epoch": 0.37722747000255297, "gen_logits_max": 6.396956443786621, "gen_logits_mean": -11.5653076171875, "gen_logits_min": -22.946468353271484, "gen_logits_std": 2.40077543258667, "gen_loss": 0.32719412446022034, "grad_norm": 0.5516963330774347, "learning_rate": 2.859663157894737e-05, "loss": 0.3487, "mean_copy_accuracy": 0.991600900888443, "mean_gen_accuracy": 0.8516235649585724, "mean_token_accuracy": 0.8907565176486969, "num_tokens": 500247571.0, "sample_num_tokens": 10036.75, "step": 1847, "total_num_tokens": 500287718.0, "z_loss": 0.0009283750550821424 }, { "copy_logits_max": -4.994902610778809, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.8125, "epoch": 0.3774317079397498, "gen_logits_max": 6.76996374130249, "gen_logits_mean": -12.41308879852295, "gen_logits_min": -23.592777252197266, "gen_logits_std": 2.4142866134643555, "gen_loss": 0.38021236658096313, "grad_norm": 0.5588397831818638, "learning_rate": 2.859536842105263e-05, "loss": 0.3506, "mean_copy_accuracy": 0.9923603236675262, "mean_gen_accuracy": 0.8556215018033981, "mean_token_accuracy": 0.88394595682621, "num_tokens": 500517098.0, "sample_num_tokens": 8389.0, "step": 1848, "total_num_tokens": 500550654.0, "z_loss": 0.0009832176147028804 }, { "copy_logits_max": -4.662406921386719, "copy_logits_min": -750000064.0, "copy_num_tokens": 590.75, "epoch": 0.37763594587694665, "gen_logits_max": 6.698897361755371, "gen_logits_mean": -11.872909545898438, "gen_logits_min": -23.540889739990234, "gen_logits_std": 2.466644287109375, "gen_loss": 0.30313003063201904, "grad_norm": 0.46695956873763844, "learning_rate": 2.8594105263157895e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9932720363140106, "mean_gen_accuracy": 0.8633257150650024, "mean_token_accuracy": 0.8967789560556412, "num_tokens": 500792119.0, "sample_num_tokens": 9232.75, "step": 1849, "total_num_tokens": 500829050.0, "z_loss": 0.0009683596435934305 }, { "copy_logits_max": -5.779703140258789, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.6875, "epoch": 0.37784018381414347, "gen_logits_max": 6.381992340087891, "gen_logits_mean": -12.245213508605957, "gen_logits_min": -24.035493850708008, "gen_logits_std": 2.4305241107940674, "gen_loss": 0.27455100417137146, "grad_norm": 0.4363037864153543, "learning_rate": 2.8592842105263156e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9928719699382782, "mean_gen_accuracy": 0.8705084770917892, "mean_token_accuracy": 0.8999813050031662, "num_tokens": 501069975.0, "sample_num_tokens": 8632.75, "step": 1850, "total_num_tokens": 501104506.0, "z_loss": 0.0008323777001351118 }, { "copy_logits_max": -3.3262181282043457, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.125, "epoch": 0.3780444217513403, "gen_logits_max": 5.599022388458252, "gen_logits_mean": -13.05996322631836, "gen_logits_min": -24.847536087036133, "gen_logits_std": 2.4563441276550293, "gen_loss": 0.28811579942703247, "grad_norm": 0.468107025635839, "learning_rate": 2.859157894736842e-05, "loss": 0.3068, "mean_copy_accuracy": 0.9924326688051224, "mean_gen_accuracy": 0.8640920370817184, "mean_token_accuracy": 0.898862361907959, "num_tokens": 501339039.0, "sample_num_tokens": 9158.75, "step": 1851, "total_num_tokens": 501375674.0, "z_loss": 0.0009290879825130105 }, { "copy_logits_max": -7.731833457946777, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.1875, "epoch": 0.37824865968853716, "gen_logits_max": 7.39993143081665, "gen_logits_mean": -11.858031272888184, "gen_logits_min": -23.090059280395508, "gen_logits_std": 2.398547887802124, "gen_loss": 0.32786130905151367, "grad_norm": 0.4821993311590263, "learning_rate": 2.8590315789473685e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9940444529056549, "mean_gen_accuracy": 0.8621058166027069, "mean_token_accuracy": 0.8961944878101349, "num_tokens": 501625266.0, "sample_num_tokens": 8027.0, "step": 1852, "total_num_tokens": 501657374.0, "z_loss": 0.0009178921463899314 }, { "copy_logits_max": -6.6516642570495605, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.875, "epoch": 0.378452897625734, "gen_logits_max": 6.8236613273620605, "gen_logits_mean": -10.26188850402832, "gen_logits_min": -21.270566940307617, "gen_logits_std": 2.3679285049438477, "gen_loss": 0.33003729581832886, "grad_norm": 0.4855622294389344, "learning_rate": 2.858905263157895e-05, "loss": 0.3505, "mean_copy_accuracy": 0.9920101314783096, "mean_gen_accuracy": 0.8571046739816666, "mean_token_accuracy": 0.8864619433879852, "num_tokens": 501882762.0, "sample_num_tokens": 10315.0, "step": 1853, "total_num_tokens": 501924022.0, "z_loss": 0.0009491806267760694 }, { "copy_logits_max": -5.653936386108398, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.0625, "epoch": 0.3786571355629308, "gen_logits_max": 6.20786190032959, "gen_logits_mean": -12.275106430053711, "gen_logits_min": -23.681737899780273, "gen_logits_std": 2.4027414321899414, "gen_loss": 0.3356356918811798, "grad_norm": 0.5563168886832309, "learning_rate": 2.8587789473684213e-05, "loss": 0.3385, "mean_copy_accuracy": 0.989237442612648, "mean_gen_accuracy": 0.8582421392202377, "mean_token_accuracy": 0.8893150091171265, "num_tokens": 502168388.0, "sample_num_tokens": 9894.0, "step": 1854, "total_num_tokens": 502207964.0, "z_loss": 0.0009347136947326362 }, { "copy_logits_max": -4.742562294006348, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.5625, "epoch": 0.37886137350012766, "gen_logits_max": 7.392351150512695, "gen_logits_mean": -10.855430603027344, "gen_logits_min": -22.605567932128906, "gen_logits_std": 2.4635114669799805, "gen_loss": 0.34200090169906616, "grad_norm": 0.4636323250351214, "learning_rate": 2.8586526315789474e-05, "loss": 0.3352, "mean_copy_accuracy": 0.994333028793335, "mean_gen_accuracy": 0.8516258746385574, "mean_token_accuracy": 0.8905180841684341, "num_tokens": 502457629.0, "sample_num_tokens": 8682.25, "step": 1855, "total_num_tokens": 502492358.0, "z_loss": 0.0011276102159172297 }, { "copy_logits_max": -5.075592041015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.25, "epoch": 0.3790656114373245, "gen_logits_max": 6.619386672973633, "gen_logits_mean": -12.09242057800293, "gen_logits_min": -23.64501953125, "gen_logits_std": 2.455268383026123, "gen_loss": 0.31561923027038574, "grad_norm": 0.4922002976320408, "learning_rate": 2.858526315789474e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9932000637054443, "mean_gen_accuracy": 0.8561514317989349, "mean_token_accuracy": 0.8912536352872849, "num_tokens": 502728914.0, "sample_num_tokens": 7615.5, "step": 1856, "total_num_tokens": 502759376.0, "z_loss": 0.0010029401164501905 }, { "copy_logits_max": -4.153458595275879, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.75, "epoch": 0.3792698493745213, "gen_logits_max": 7.567812442779541, "gen_logits_mean": -10.607349395751953, "gen_logits_min": -21.720470428466797, "gen_logits_std": 2.361205577850342, "gen_loss": 0.34901997447013855, "grad_norm": 0.44791866674048203, "learning_rate": 2.8584e-05, "loss": 0.3357, "mean_copy_accuracy": 0.993616595864296, "mean_gen_accuracy": 0.8611333221197128, "mean_token_accuracy": 0.889521136879921, "num_tokens": 502990879.0, "sample_num_tokens": 8709.75, "step": 1857, "total_num_tokens": 503025718.0, "z_loss": 0.0010175877250730991 }, { "copy_logits_max": -7.576348304748535, "copy_logits_min": -687500032.0, "copy_num_tokens": 350.4375, "epoch": 0.37947408731171817, "gen_logits_max": 7.810313701629639, "gen_logits_mean": -11.815193176269531, "gen_logits_min": -22.711660385131836, "gen_logits_std": 2.3727145195007324, "gen_loss": 0.3522525131702423, "grad_norm": 0.4955541777593908, "learning_rate": 2.8582736842105264e-05, "loss": 0.333, "mean_copy_accuracy": 0.9925040304660797, "mean_gen_accuracy": 0.8558603972196579, "mean_token_accuracy": 0.8920394480228424, "num_tokens": 503272956.0, "sample_num_tokens": 8872.0, "step": 1858, "total_num_tokens": 503308444.0, "z_loss": 0.0008930191397666931 }, { "copy_logits_max": -5.4516921043396, "copy_logits_min": -687500032.0, "copy_num_tokens": 709.375, "epoch": 0.379678325248915, "gen_logits_max": 6.4549970626831055, "gen_logits_mean": -10.773780822753906, "gen_logits_min": -22.552804946899414, "gen_logits_std": 2.4502453804016113, "gen_loss": 0.2945518493652344, "grad_norm": 0.4587976585189839, "learning_rate": 2.8581473684210525e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9935938268899918, "mean_gen_accuracy": 0.8575014770030975, "mean_token_accuracy": 0.8949781209230423, "num_tokens": 503556659.0, "sample_num_tokens": 9234.75, "step": 1859, "total_num_tokens": 503593598.0, "z_loss": 0.0010621144901961088 }, { "copy_logits_max": -4.9935197830200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.6875, "epoch": 0.3798825631861118, "gen_logits_max": 7.192455291748047, "gen_logits_mean": -11.065644264221191, "gen_logits_min": -21.965551376342773, "gen_logits_std": 2.385258913040161, "gen_loss": 0.35342109203338623, "grad_norm": 0.4679110185205675, "learning_rate": 2.8580210526315792e-05, "loss": 0.3405, "mean_copy_accuracy": 0.9934971183538437, "mean_gen_accuracy": 0.8557976484298706, "mean_token_accuracy": 0.8888998925685883, "num_tokens": 503837158.0, "sample_num_tokens": 9073.5, "step": 1860, "total_num_tokens": 503873452.0, "z_loss": 0.0011113532818853855 }, { "copy_logits_max": -5.769631385803223, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.0625, "epoch": 0.38008680112330867, "gen_logits_max": 6.8606390953063965, "gen_logits_mean": -12.784555435180664, "gen_logits_min": -24.073698043823242, "gen_logits_std": 2.390432596206665, "gen_loss": 0.31598734855651855, "grad_norm": 0.42956708058170756, "learning_rate": 2.8578947368421053e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9928866475820541, "mean_gen_accuracy": 0.8703537881374359, "mean_token_accuracy": 0.9025664776563644, "num_tokens": 504111183.0, "sample_num_tokens": 7054.25, "step": 1861, "total_num_tokens": 504139400.0, "z_loss": 0.0009219241328537464 }, { "copy_logits_max": -5.20937442779541, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.1875, "epoch": 0.3802910390605055, "gen_logits_max": 6.619223117828369, "gen_logits_mean": -12.536114692687988, "gen_logits_min": -23.77316665649414, "gen_logits_std": 2.4029781818389893, "gen_loss": 0.2992638945579529, "grad_norm": 0.5055900978610542, "learning_rate": 2.8577684210526318e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9911035448312759, "mean_gen_accuracy": 0.8606430143117905, "mean_token_accuracy": 0.8893884718418121, "num_tokens": 504379291.0, "sample_num_tokens": 8511.25, "step": 1862, "total_num_tokens": 504413336.0, "z_loss": 0.0008998060948215425 }, { "copy_logits_max": -4.096567153930664, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.3125, "epoch": 0.3804952769977023, "gen_logits_max": 6.261844635009766, "gen_logits_mean": -11.775823593139648, "gen_logits_min": -22.822446823120117, "gen_logits_std": 2.328695774078369, "gen_loss": 0.2999851405620575, "grad_norm": 0.41763845612064976, "learning_rate": 2.857642105263158e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9940854012966156, "mean_gen_accuracy": 0.8618573546409607, "mean_token_accuracy": 0.8984323740005493, "num_tokens": 504655028.0, "sample_num_tokens": 8043.0, "step": 1863, "total_num_tokens": 504687200.0, "z_loss": 0.0009754134807735682 }, { "copy_logits_max": -4.696699142456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.5625, "epoch": 0.3806995149348992, "gen_logits_max": 6.3330078125, "gen_logits_mean": -12.827985763549805, "gen_logits_min": -24.2534236907959, "gen_logits_std": 2.460658550262451, "gen_loss": 0.3197983503341675, "grad_norm": 0.6107420717111384, "learning_rate": 2.8575157894736843e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9944234937429428, "mean_gen_accuracy": 0.8594285547733307, "mean_token_accuracy": 0.8955088555812836, "num_tokens": 504947188.0, "sample_num_tokens": 8613.5, "step": 1864, "total_num_tokens": 504981642.0, "z_loss": 0.0010652386117726564 }, { "copy_logits_max": -7.2640700340271, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.4375, "epoch": 0.380903752872096, "gen_logits_max": 7.15697717666626, "gen_logits_mean": -11.989255905151367, "gen_logits_min": -22.77288818359375, "gen_logits_std": 2.3744137287139893, "gen_loss": 0.33677998185157776, "grad_norm": 0.4301869116491057, "learning_rate": 2.8573894736842104e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9943026304244995, "mean_gen_accuracy": 0.8602806925773621, "mean_token_accuracy": 0.8927523344755173, "num_tokens": 505227936.0, "sample_num_tokens": 8758.0, "step": 1865, "total_num_tokens": 505262968.0, "z_loss": 0.0010172028560191393 }, { "copy_logits_max": -5.92422342300415, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.75, "epoch": 0.3811079908092928, "gen_logits_max": 6.427194595336914, "gen_logits_mean": -11.611597061157227, "gen_logits_min": -22.935697555541992, "gen_logits_std": 2.4296507835388184, "gen_loss": 0.29765716195106506, "grad_norm": 0.5932860666818515, "learning_rate": 2.8572631578947368e-05, "loss": 0.3274, "mean_copy_accuracy": 0.9934646487236023, "mean_gen_accuracy": 0.8561451584100723, "mean_token_accuracy": 0.8928549289703369, "num_tokens": 505524658.0, "sample_num_tokens": 9319.0, "step": 1866, "total_num_tokens": 505561934.0, "z_loss": 0.000921744154766202 }, { "copy_logits_max": -5.460824012756348, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.375, "epoch": 0.3813122287464897, "gen_logits_max": 6.25183629989624, "gen_logits_mean": -12.980937957763672, "gen_logits_min": -24.948471069335938, "gen_logits_std": 2.4405014514923096, "gen_loss": 0.2865144610404968, "grad_norm": 0.48284809543034823, "learning_rate": 2.8571368421052632e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9930404126644135, "mean_gen_accuracy": 0.8464730381965637, "mean_token_accuracy": 0.8883218318223953, "num_tokens": 505809684.0, "sample_num_tokens": 8830.5, "step": 1867, "total_num_tokens": 505845006.0, "z_loss": 0.0009200879139825702 }, { "copy_logits_max": -6.649250030517578, "copy_logits_min": -750000000.0, "copy_num_tokens": 261.3125, "epoch": 0.3815164666836865, "gen_logits_max": 7.191577911376953, "gen_logits_mean": -11.77513313293457, "gen_logits_min": -23.497905731201172, "gen_logits_std": 2.416456699371338, "gen_loss": 0.3486381769180298, "grad_norm": 0.9458992268476991, "learning_rate": 2.8570105263157897e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9922835528850555, "mean_gen_accuracy": 0.8708217889070511, "mean_token_accuracy": 0.8983522802591324, "num_tokens": 506085665.0, "sample_num_tokens": 6646.25, "step": 1868, "total_num_tokens": 506112250.0, "z_loss": 0.0009783024434000254 }, { "copy_logits_max": -4.778363227844238, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.3125, "epoch": 0.3817207046208833, "gen_logits_max": 6.62626838684082, "gen_logits_mean": -11.95108413696289, "gen_logits_min": -23.379192352294922, "gen_logits_std": 2.366973400115967, "gen_loss": 0.34253454208374023, "grad_norm": 0.5087099297090988, "learning_rate": 2.856884210526316e-05, "loss": 0.316, "mean_copy_accuracy": 0.9938548803329468, "mean_gen_accuracy": 0.8623960167169571, "mean_token_accuracy": 0.8961962461471558, "num_tokens": 506346373.0, "sample_num_tokens": 8804.25, "step": 1869, "total_num_tokens": 506381590.0, "z_loss": 0.000978452037088573 }, { "copy_logits_max": -5.724615573883057, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.375, "epoch": 0.3819249425580802, "gen_logits_max": 6.761338233947754, "gen_logits_mean": -12.440465927124023, "gen_logits_min": -23.566112518310547, "gen_logits_std": 2.3988265991210938, "gen_loss": 0.3391357660293579, "grad_norm": 0.45946344751221896, "learning_rate": 2.8567578947368422e-05, "loss": 0.3332, "mean_copy_accuracy": 0.993075355887413, "mean_gen_accuracy": 0.8569687008857727, "mean_token_accuracy": 0.8918654322624207, "num_tokens": 506628632.0, "sample_num_tokens": 8239.0, "step": 1870, "total_num_tokens": 506661588.0, "z_loss": 0.0008921456173993647 }, { "copy_logits_max": -4.480107307434082, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.75, "epoch": 0.382129180495277, "gen_logits_max": 7.420446872711182, "gen_logits_mean": -10.37856674194336, "gen_logits_min": -21.759166717529297, "gen_logits_std": 2.4606776237487793, "gen_loss": 0.28787174820899963, "grad_norm": 0.4440764190051736, "learning_rate": 2.8566315789473686e-05, "loss": 0.325, "mean_copy_accuracy": 0.9937659204006195, "mean_gen_accuracy": 0.8586984723806381, "mean_token_accuracy": 0.8938841372728348, "num_tokens": 506916971.0, "sample_num_tokens": 8325.75, "step": 1871, "total_num_tokens": 506950274.0, "z_loss": 0.0008227330399677157 }, { "copy_logits_max": -4.410571575164795, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.875, "epoch": 0.3823334184324738, "gen_logits_max": 6.59881591796875, "gen_logits_mean": -11.585643768310547, "gen_logits_min": -23.23744010925293, "gen_logits_std": 2.462326765060425, "gen_loss": 0.32489657402038574, "grad_norm": 0.4178258173650095, "learning_rate": 2.8565052631578947e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9931967258453369, "mean_gen_accuracy": 0.8607937842607498, "mean_token_accuracy": 0.8946100920438766, "num_tokens": 507200865.0, "sample_num_tokens": 8252.75, "step": 1872, "total_num_tokens": 507233876.0, "z_loss": 0.0010075794998556376 }, { "copy_logits_max": -2.7224063873291016, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.0625, "epoch": 0.3825376563696707, "gen_logits_max": 7.11051082611084, "gen_logits_mean": -10.954105377197266, "gen_logits_min": -22.68585777282715, "gen_logits_std": 2.446842670440674, "gen_loss": 0.3577881455421448, "grad_norm": 0.4169933523684696, "learning_rate": 2.856378947368421e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9940399080514908, "mean_gen_accuracy": 0.8503158390522003, "mean_token_accuracy": 0.8917157351970673, "num_tokens": 507491850.0, "sample_num_tokens": 9241.0, "step": 1873, "total_num_tokens": 507528814.0, "z_loss": 0.0011222460307180882 }, { "copy_logits_max": -4.199873924255371, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.25, "epoch": 0.3827418943068675, "gen_logits_max": 6.235044479370117, "gen_logits_mean": -11.847126960754395, "gen_logits_min": -23.03057289123535, "gen_logits_std": 2.3958287239074707, "gen_loss": 0.35975927114486694, "grad_norm": 0.4440723740432541, "learning_rate": 2.8562526315789472e-05, "loss": 0.3062, "mean_copy_accuracy": 0.9945833384990692, "mean_gen_accuracy": 0.8653712272644043, "mean_token_accuracy": 0.899404987692833, "num_tokens": 507773812.0, "sample_num_tokens": 8871.5, "step": 1874, "total_num_tokens": 507809298.0, "z_loss": 0.0011035691713914275 }, { "copy_logits_max": -4.908720970153809, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.6875, "epoch": 0.3829461322440643, "gen_logits_max": 6.445479393005371, "gen_logits_mean": -12.781439781188965, "gen_logits_min": -23.95943832397461, "gen_logits_std": 2.415360450744629, "gen_loss": 0.31548142433166504, "grad_norm": 0.45956720054646555, "learning_rate": 2.8561263157894737e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9916540831327438, "mean_gen_accuracy": 0.8699322938919067, "mean_token_accuracy": 0.9006199240684509, "num_tokens": 508065540.0, "sample_num_tokens": 8413.5, "step": 1875, "total_num_tokens": 508099194.0, "z_loss": 0.0010139268124476075 }, { "copy_logits_max": -3.9815382957458496, "copy_logits_min": -625000064.0, "copy_num_tokens": 514.6875, "epoch": 0.3831503701812612, "gen_logits_max": 7.5802106857299805, "gen_logits_mean": -9.715970993041992, "gen_logits_min": -21.038917541503906, "gen_logits_std": 2.39321231842041, "gen_loss": 0.3133559823036194, "grad_norm": 0.45641907041187185, "learning_rate": 2.856e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9927598237991333, "mean_gen_accuracy": 0.862927719950676, "mean_token_accuracy": 0.8940009027719498, "num_tokens": 508342858.0, "sample_num_tokens": 9130.5, "step": 1876, "total_num_tokens": 508379380.0, "z_loss": 0.0010100710205733776 }, { "copy_logits_max": -5.242764949798584, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.8125, "epoch": 0.383354608118458, "gen_logits_max": 7.324099063873291, "gen_logits_mean": -11.232867240905762, "gen_logits_min": -22.061988830566406, "gen_logits_std": 2.361229419708252, "gen_loss": 0.35212242603302, "grad_norm": 0.47219366220982095, "learning_rate": 2.8558736842105265e-05, "loss": 0.3403, "mean_copy_accuracy": 0.9938159584999084, "mean_gen_accuracy": 0.8549326509237289, "mean_token_accuracy": 0.8879055231809616, "num_tokens": 508606922.0, "sample_num_tokens": 8372.5, "step": 1877, "total_num_tokens": 508640412.0, "z_loss": 0.0009944409830495715 }, { "copy_logits_max": -3.827092170715332, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.1875, "epoch": 0.3835588460556548, "gen_logits_max": 6.778621673583984, "gen_logits_mean": -11.910825729370117, "gen_logits_min": -23.044830322265625, "gen_logits_std": 2.3979947566986084, "gen_loss": 0.34262654185295105, "grad_norm": 0.4564076373977241, "learning_rate": 2.8557473684210526e-05, "loss": 0.3177, "mean_copy_accuracy": 0.9925035238265991, "mean_gen_accuracy": 0.8618093878030777, "mean_token_accuracy": 0.8981058448553085, "num_tokens": 508886131.0, "sample_num_tokens": 7890.25, "step": 1878, "total_num_tokens": 508917692.0, "z_loss": 0.0009363125427626073 }, { "copy_logits_max": -6.092974662780762, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.6875, "epoch": 0.3837630839928517, "gen_logits_max": 7.627920150756836, "gen_logits_mean": -12.101900100708008, "gen_logits_min": -23.56914710998535, "gen_logits_std": 2.4172909259796143, "gen_loss": 0.3602379262447357, "grad_norm": 0.4585617553177124, "learning_rate": 2.855621052631579e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9924057126045227, "mean_gen_accuracy": 0.8598564863204956, "mean_token_accuracy": 0.8908242285251617, "num_tokens": 509161555.0, "sample_num_tokens": 7083.25, "step": 1879, "total_num_tokens": 509189888.0, "z_loss": 0.0010130899026989937 }, { "copy_logits_max": -2.8440260887145996, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.0, "epoch": 0.3839673219300485, "gen_logits_max": 6.653417110443115, "gen_logits_mean": -13.397660255432129, "gen_logits_min": -25.189441680908203, "gen_logits_std": 2.471391201019287, "gen_loss": 0.3648049533367157, "grad_norm": 0.5002445493975539, "learning_rate": 2.8554947368421055e-05, "loss": 0.335, "mean_copy_accuracy": 0.9917320013046265, "mean_gen_accuracy": 0.8610980808734894, "mean_token_accuracy": 0.8887306600809097, "num_tokens": 509417071.0, "sample_num_tokens": 6826.25, "step": 1880, "total_num_tokens": 509444376.0, "z_loss": 0.001103079179301858 }, { "copy_logits_max": -3.7176527976989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.5, "epoch": 0.3841715598672453, "gen_logits_max": 7.122146129608154, "gen_logits_mean": -11.661855697631836, "gen_logits_min": -23.53064727783203, "gen_logits_std": 2.4856605529785156, "gen_loss": 0.3280479311943054, "grad_norm": 0.4365974754221505, "learning_rate": 2.8553684210526316e-05, "loss": 0.3208, "mean_copy_accuracy": 0.9941773265600204, "mean_gen_accuracy": 0.861221045255661, "mean_token_accuracy": 0.893761396408081, "num_tokens": 509683420.0, "sample_num_tokens": 8428.0, "step": 1881, "total_num_tokens": 509717132.0, "z_loss": 0.0012024028692394495 }, { "copy_logits_max": -5.362071990966797, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.4375, "epoch": 0.3843757978044422, "gen_logits_max": 7.205477714538574, "gen_logits_mean": -11.849245071411133, "gen_logits_min": -23.02754020690918, "gen_logits_std": 2.4145851135253906, "gen_loss": 0.35771429538726807, "grad_norm": 0.4687011418180201, "learning_rate": 2.855242105263158e-05, "loss": 0.3407, "mean_copy_accuracy": 0.9929690361022949, "mean_gen_accuracy": 0.8592777699232101, "mean_token_accuracy": 0.8883463442325592, "num_tokens": 509938158.0, "sample_num_tokens": 7163.0, "step": 1882, "total_num_tokens": 509966810.0, "z_loss": 0.0010553633328527212 }, { "copy_logits_max": -5.393221855163574, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.3125, "epoch": 0.384580035741639, "gen_logits_max": 6.421531677246094, "gen_logits_mean": -13.408432006835938, "gen_logits_min": -25.064659118652344, "gen_logits_std": 2.435213088989258, "gen_loss": 0.270575612783432, "grad_norm": 0.4604305378852017, "learning_rate": 2.855115789473684e-05, "loss": 0.3197, "mean_copy_accuracy": 0.990724042057991, "mean_gen_accuracy": 0.8702306896448135, "mean_token_accuracy": 0.8963450193405151, "num_tokens": 510201453.0, "sample_num_tokens": 8058.25, "step": 1883, "total_num_tokens": 510233686.0, "z_loss": 0.0008755790768191218 }, { "copy_logits_max": -4.005979537963867, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.125, "epoch": 0.38478427367883583, "gen_logits_max": 7.000513076782227, "gen_logits_mean": -12.057317733764648, "gen_logits_min": -23.302091598510742, "gen_logits_std": 2.42881441116333, "gen_loss": 0.3574475347995758, "grad_norm": 0.48101063251791637, "learning_rate": 2.854989473684211e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9926467686891556, "mean_gen_accuracy": 0.8544785678386688, "mean_token_accuracy": 0.8900184333324432, "num_tokens": 510461203.0, "sample_num_tokens": 8328.75, "step": 1884, "total_num_tokens": 510494518.0, "z_loss": 0.0010582203976809978 }, { "copy_logits_max": -4.565869331359863, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.0, "epoch": 0.3849885116160327, "gen_logits_max": 7.566336154937744, "gen_logits_mean": -10.702987670898438, "gen_logits_min": -21.729028701782227, "gen_logits_std": 2.364232063293457, "gen_loss": 0.3592882752418518, "grad_norm": 0.49369046805895833, "learning_rate": 2.854863157894737e-05, "loss": 0.3422, "mean_copy_accuracy": 0.9929599463939667, "mean_gen_accuracy": 0.8564630001783371, "mean_token_accuracy": 0.8884769529104233, "num_tokens": 510711653.0, "sample_num_tokens": 8082.25, "step": 1885, "total_num_tokens": 510743982.0, "z_loss": 0.0010913183214142919 }, { "copy_logits_max": -3.8469979763031006, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.125, "epoch": 0.3851927495532295, "gen_logits_max": 7.416738510131836, "gen_logits_mean": -12.19293212890625, "gen_logits_min": -23.639318466186523, "gen_logits_std": 2.4477415084838867, "gen_loss": 0.3338920772075653, "grad_norm": 0.47638056882470997, "learning_rate": 2.8547368421052634e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9926723092794418, "mean_gen_accuracy": 0.8656880855560303, "mean_token_accuracy": 0.8977545946836472, "num_tokens": 510995215.0, "sample_num_tokens": 6846.75, "step": 1886, "total_num_tokens": 511022602.0, "z_loss": 0.0010034677106887102 }, { "copy_logits_max": -4.1607160568237305, "copy_logits_min": -750000000.0, "copy_num_tokens": 864.25, "epoch": 0.38539698749042633, "gen_logits_max": 6.176815986633301, "gen_logits_mean": -12.399827003479004, "gen_logits_min": -24.117223739624023, "gen_logits_std": 2.461768627166748, "gen_loss": 0.2833339273929596, "grad_norm": 0.4404224702537562, "learning_rate": 2.8546105263157895e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9945035874843597, "mean_gen_accuracy": 0.8611189126968384, "mean_token_accuracy": 0.8956682980060577, "num_tokens": 511260230.0, "sample_num_tokens": 11020.0, "step": 1887, "total_num_tokens": 511304310.0, "z_loss": 0.0008851594175212085 }, { "copy_logits_max": -4.6619110107421875, "copy_logits_min": -687500032.0, "copy_num_tokens": 313.0625, "epoch": 0.3856012254276232, "gen_logits_max": 7.439730167388916, "gen_logits_mean": -11.109795570373535, "gen_logits_min": -22.39553451538086, "gen_logits_std": 2.4041695594787598, "gen_loss": 0.4098869264125824, "grad_norm": 0.5403980623882684, "learning_rate": 2.854484210526316e-05, "loss": 0.3781, "mean_copy_accuracy": 0.9910420626401901, "mean_gen_accuracy": 0.8442872166633606, "mean_token_accuracy": 0.8766760677099228, "num_tokens": 511516532.0, "sample_num_tokens": 6522.0, "step": 1888, "total_num_tokens": 511542620.0, "z_loss": 0.001027577556669712 }, { "copy_logits_max": -3.5784764289855957, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.5, "epoch": 0.38580546336482, "gen_logits_max": 6.547173500061035, "gen_logits_mean": -11.70368766784668, "gen_logits_min": -23.370807647705078, "gen_logits_std": 2.458620548248291, "gen_loss": 0.3146213889122009, "grad_norm": 0.4740402154052406, "learning_rate": 2.854357894736842e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9943123906850815, "mean_gen_accuracy": 0.8612408638000488, "mean_token_accuracy": 0.8933796882629395, "num_tokens": 511776541.0, "sample_num_tokens": 7315.25, "step": 1889, "total_num_tokens": 511805802.0, "z_loss": 0.0009141528280451894 }, { "copy_logits_max": -4.154232978820801, "copy_logits_min": -750000000.0, "copy_num_tokens": 625.375, "epoch": 0.38600970130201684, "gen_logits_max": 7.198460102081299, "gen_logits_mean": -10.96097183227539, "gen_logits_min": -22.739910125732422, "gen_logits_std": 2.481720447540283, "gen_loss": 0.32430094480514526, "grad_norm": 0.5685127698651405, "learning_rate": 2.8542315789473684e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9941819161176682, "mean_gen_accuracy": 0.8542413115501404, "mean_token_accuracy": 0.8912579417228699, "num_tokens": 512054243.0, "sample_num_tokens": 9310.75, "step": 1890, "total_num_tokens": 512091486.0, "z_loss": 0.000891725649125874 }, { "copy_logits_max": -7.579464912414551, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.6875, "epoch": 0.3862139392392137, "gen_logits_max": 7.07514762878418, "gen_logits_mean": -11.313169479370117, "gen_logits_min": -22.512887954711914, "gen_logits_std": 2.3866803646087646, "gen_loss": 0.3003276586532593, "grad_norm": 0.47491413957043715, "learning_rate": 2.8541052631578945e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9929796904325485, "mean_gen_accuracy": 0.8679738491773605, "mean_token_accuracy": 0.8952894806861877, "num_tokens": 512327972.0, "sample_num_tokens": 7403.0, "step": 1891, "total_num_tokens": 512357584.0, "z_loss": 0.0007850077236071229 }, { "copy_logits_max": -2.8047609329223633, "copy_logits_min": -750000000.0, "copy_num_tokens": 797.875, "epoch": 0.3864181771764105, "gen_logits_max": 5.373035430908203, "gen_logits_mean": -12.161361694335938, "gen_logits_min": -23.996570587158203, "gen_logits_std": 2.483659505844116, "gen_loss": 0.24001890420913696, "grad_norm": 0.46792312994766316, "learning_rate": 2.853978947368421e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9935517758131027, "mean_gen_accuracy": 0.8661912530660629, "mean_token_accuracy": 0.9019775092601776, "num_tokens": 512611941.0, "sample_num_tokens": 9763.25, "step": 1892, "total_num_tokens": 512650994.0, "z_loss": 0.0008200338925234973 }, { "copy_logits_max": -5.326909065246582, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5, "epoch": 0.38662241511360734, "gen_logits_max": 6.285970211029053, "gen_logits_mean": -11.147329330444336, "gen_logits_min": -22.139429092407227, "gen_logits_std": 2.371674060821533, "gen_loss": 0.3617493510246277, "grad_norm": 0.49414935551980405, "learning_rate": 2.8538526315789477e-05, "loss": 0.3468, "mean_copy_accuracy": 0.993018388748169, "mean_gen_accuracy": 0.8564396500587463, "mean_token_accuracy": 0.889202281832695, "num_tokens": 512874789.0, "sample_num_tokens": 8259.25, "step": 1893, "total_num_tokens": 512907826.0, "z_loss": 0.0008884875569492579 }, { "copy_logits_max": -4.533153533935547, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.5, "epoch": 0.3868266530508042, "gen_logits_max": 6.603913307189941, "gen_logits_mean": -12.908681869506836, "gen_logits_min": -24.120725631713867, "gen_logits_std": 2.443382978439331, "gen_loss": 0.39696359634399414, "grad_norm": 0.5093477000053878, "learning_rate": 2.8537263157894738e-05, "loss": 0.334, "mean_copy_accuracy": 0.9936379790306091, "mean_gen_accuracy": 0.8527518510818481, "mean_token_accuracy": 0.8912048786878586, "num_tokens": 513166259.0, "sample_num_tokens": 7642.75, "step": 1894, "total_num_tokens": 513196830.0, "z_loss": 0.0009660972282290459 }, { "copy_logits_max": -5.393795013427734, "copy_logits_min": -750000000.0, "copy_num_tokens": 267.6875, "epoch": 0.38703089098800103, "gen_logits_max": 7.022221565246582, "gen_logits_mean": -11.73698616027832, "gen_logits_min": -22.91481590270996, "gen_logits_std": 2.3977856636047363, "gen_loss": 0.34262311458587646, "grad_norm": 0.4899166628318979, "learning_rate": 2.8536000000000003e-05, "loss": 0.3396, "mean_copy_accuracy": 0.9902550280094147, "mean_gen_accuracy": 0.8576153963804245, "mean_token_accuracy": 0.8870096653699875, "num_tokens": 513407116.0, "sample_num_tokens": 6376.5, "step": 1895, "total_num_tokens": 513432622.0, "z_loss": 0.0008719167090021074 }, { "copy_logits_max": -4.687185764312744, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.8125, "epoch": 0.38723512892519785, "gen_logits_max": 5.892009735107422, "gen_logits_mean": -12.86575698852539, "gen_logits_min": -24.375568389892578, "gen_logits_std": 2.4681828022003174, "gen_loss": 0.2975119352340698, "grad_norm": 0.4268032072404203, "learning_rate": 2.8534736842105264e-05, "loss": 0.3106, "mean_copy_accuracy": 0.9937410056591034, "mean_gen_accuracy": 0.8665911108255386, "mean_token_accuracy": 0.8971676528453827, "num_tokens": 513670663.0, "sample_num_tokens": 9413.25, "step": 1896, "total_num_tokens": 513708316.0, "z_loss": 0.000925083237234503 }, { "copy_logits_max": -2.9138169288635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.375, "epoch": 0.38743936686239466, "gen_logits_max": 6.9453935623168945, "gen_logits_mean": -11.834403991699219, "gen_logits_min": -23.882713317871094, "gen_logits_std": 2.4422860145568848, "gen_loss": 0.3594818711280823, "grad_norm": 0.42696302263608776, "learning_rate": 2.8533473684210528e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9936344921588898, "mean_gen_accuracy": 0.8583096712827682, "mean_token_accuracy": 0.8883452862501144, "num_tokens": 513938075.0, "sample_num_tokens": 8789.75, "step": 1897, "total_num_tokens": 513973234.0, "z_loss": 0.0009593564318493009 }, { "copy_logits_max": -5.134379863739014, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.75, "epoch": 0.38764360479959153, "gen_logits_max": 8.102568626403809, "gen_logits_mean": -10.371849060058594, "gen_logits_min": -21.733102798461914, "gen_logits_std": 2.360852003097534, "gen_loss": 0.3237486481666565, "grad_norm": 0.5007323972708771, "learning_rate": 2.853221052631579e-05, "loss": 0.32, "mean_copy_accuracy": 0.9916240572929382, "mean_gen_accuracy": 0.8625858575105667, "mean_token_accuracy": 0.8965572416782379, "num_tokens": 514219222.0, "sample_num_tokens": 8467.0, "step": 1898, "total_num_tokens": 514253090.0, "z_loss": 0.000900836254004389 }, { "copy_logits_max": -3.9065675735473633, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.3125, "epoch": 0.38784784273678835, "gen_logits_max": 7.076066970825195, "gen_logits_mean": -12.15520191192627, "gen_logits_min": -23.309864044189453, "gen_logits_std": 2.3880696296691895, "gen_loss": 0.3496779501438141, "grad_norm": 0.4360177347453738, "learning_rate": 2.8530947368421053e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9924522042274475, "mean_gen_accuracy": 0.8605197966098785, "mean_token_accuracy": 0.890273317694664, "num_tokens": 514492693.0, "sample_num_tokens": 8242.25, "step": 1899, "total_num_tokens": 514525662.0, "z_loss": 0.00107965141069144 }, { "copy_logits_max": -3.701338052749634, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.1875, "epoch": 0.38805208067398517, "gen_logits_max": 7.563286304473877, "gen_logits_mean": -11.2425537109375, "gen_logits_min": -22.712146759033203, "gen_logits_std": 2.429229736328125, "gen_loss": 0.3496159315109253, "grad_norm": 0.5001834913556317, "learning_rate": 2.8529684210526314e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9920244663953781, "mean_gen_accuracy": 0.8610119968652725, "mean_token_accuracy": 0.8962032794952393, "num_tokens": 514762359.0, "sample_num_tokens": 7411.75, "step": 1900, "total_num_tokens": 514792006.0, "z_loss": 0.0011462080292403698 }, { "copy_logits_max": -3.350010871887207, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.6875, "epoch": 0.38825631861118204, "gen_logits_max": 6.736652374267578, "gen_logits_mean": -11.448955535888672, "gen_logits_min": -23.155075073242188, "gen_logits_std": 2.4460604190826416, "gen_loss": 0.32418787479400635, "grad_norm": 0.40292201427834423, "learning_rate": 2.852842105263158e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9944747388362885, "mean_gen_accuracy": 0.8631377071142197, "mean_token_accuracy": 0.8968532383441925, "num_tokens": 515036220.0, "sample_num_tokens": 8405.0, "step": 1901, "total_num_tokens": 515069840.0, "z_loss": 0.0010863119969144464 }, { "copy_logits_max": -6.204719543457031, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.3125, "epoch": 0.38846055654837885, "gen_logits_max": 7.674803733825684, "gen_logits_mean": -11.542740821838379, "gen_logits_min": -22.592391967773438, "gen_logits_std": 2.3450052738189697, "gen_loss": 0.3646656572818756, "grad_norm": 0.4668248569619764, "learning_rate": 2.8527157894736843e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9911856055259705, "mean_gen_accuracy": 0.8617281764745712, "mean_token_accuracy": 0.892397329211235, "num_tokens": 515313519.0, "sample_num_tokens": 7200.75, "step": 1902, "total_num_tokens": 515342322.0, "z_loss": 0.0010179139208048582 }, { "copy_logits_max": -6.92580509185791, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.4375, "epoch": 0.38866479448557567, "gen_logits_max": 7.546952247619629, "gen_logits_mean": -10.70671558380127, "gen_logits_min": -21.832473754882812, "gen_logits_std": 2.408571720123291, "gen_loss": 0.30541032552719116, "grad_norm": 0.43371663363835955, "learning_rate": 2.8525894736842107e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9927304834127426, "mean_gen_accuracy": 0.8576741814613342, "mean_token_accuracy": 0.8930750787258148, "num_tokens": 515576183.0, "sample_num_tokens": 9586.75, "step": 1903, "total_num_tokens": 515614530.0, "z_loss": 0.0008848676225170493 }, { "copy_logits_max": -5.089534759521484, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.75, "epoch": 0.38886903242277254, "gen_logits_max": 7.686686992645264, "gen_logits_mean": -11.280823707580566, "gen_logits_min": -22.927520751953125, "gen_logits_std": 2.4927737712860107, "gen_loss": 0.2876656651496887, "grad_norm": 0.43773032139938345, "learning_rate": 2.8524631578947368e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9932323694229126, "mean_gen_accuracy": 0.8656798154115677, "mean_token_accuracy": 0.8911445438861847, "num_tokens": 515827390.0, "sample_num_tokens": 8223.5, "step": 1904, "total_num_tokens": 515860284.0, "z_loss": 0.0009447386837564409 }, { "copy_logits_max": -5.351619720458984, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.375, "epoch": 0.38907327035996936, "gen_logits_max": 7.3594865798950195, "gen_logits_mean": -12.304737091064453, "gen_logits_min": -23.55632972717285, "gen_logits_std": 2.4118094444274902, "gen_loss": 0.37089937925338745, "grad_norm": 0.46087346760485287, "learning_rate": 2.8523368421052632e-05, "loss": 0.3305, "mean_copy_accuracy": 0.9932627528905869, "mean_gen_accuracy": 0.8618158847093582, "mean_token_accuracy": 0.8914207369089127, "num_tokens": 516109733.0, "sample_num_tokens": 6863.25, "step": 1905, "total_num_tokens": 516137186.0, "z_loss": 0.0010502669028937817 }, { "copy_logits_max": -3.957573413848877, "copy_logits_min": -750000000.0, "copy_num_tokens": 744.9375, "epoch": 0.3892775082971662, "gen_logits_max": 6.898040771484375, "gen_logits_mean": -10.805402755737305, "gen_logits_min": -22.652202606201172, "gen_logits_std": 2.460594654083252, "gen_loss": 0.3457815647125244, "grad_norm": 0.4747312245007722, "learning_rate": 2.8522105263157893e-05, "loss": 0.3242, "mean_copy_accuracy": 0.993226170539856, "mean_gen_accuracy": 0.856520801782608, "mean_token_accuracy": 0.8937151432037354, "num_tokens": 516388213.0, "sample_num_tokens": 10759.75, "step": 1906, "total_num_tokens": 516431252.0, "z_loss": 0.0010691196657717228 }, { "copy_logits_max": -4.555578231811523, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.0, "epoch": 0.38948174623436305, "gen_logits_max": 6.959194660186768, "gen_logits_mean": -12.272153854370117, "gen_logits_min": -24.056058883666992, "gen_logits_std": 2.4759879112243652, "gen_loss": 0.3349454402923584, "grad_norm": 0.4458231575724977, "learning_rate": 2.8520842105263157e-05, "loss": 0.3177, "mean_copy_accuracy": 0.9940542727708817, "mean_gen_accuracy": 0.8634222000837326, "mean_token_accuracy": 0.8967682421207428, "num_tokens": 516659620.0, "sample_num_tokens": 8490.5, "step": 1907, "total_num_tokens": 516693582.0, "z_loss": 0.000987075618468225 }, { "copy_logits_max": -4.97230339050293, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.875, "epoch": 0.38968598417155986, "gen_logits_max": 7.863671779632568, "gen_logits_mean": -11.519561767578125, "gen_logits_min": -23.198711395263672, "gen_logits_std": 2.4762415885925293, "gen_loss": 0.386760950088501, "grad_norm": 0.5138121879040812, "learning_rate": 2.851957894736842e-05, "loss": 0.3422, "mean_copy_accuracy": 0.9934747964143753, "mean_gen_accuracy": 0.8547162860631943, "mean_token_accuracy": 0.8892636597156525, "num_tokens": 516936552.0, "sample_num_tokens": 7052.5, "step": 1908, "total_num_tokens": 516964762.0, "z_loss": 0.001004769466817379 }, { "copy_logits_max": -4.756585121154785, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.1875, "epoch": 0.3898902221087567, "gen_logits_max": 6.9301557540893555, "gen_logits_mean": -11.114419937133789, "gen_logits_min": -23.000186920166016, "gen_logits_std": 2.462005615234375, "gen_loss": 0.28188788890838623, "grad_norm": 0.45100721234397473, "learning_rate": 2.8518315789473686e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9931259453296661, "mean_gen_accuracy": 0.8708713948726654, "mean_token_accuracy": 0.9048880487680435, "num_tokens": 517224093.0, "sample_num_tokens": 8520.25, "step": 1909, "total_num_tokens": 517258174.0, "z_loss": 0.0008302165661007166 }, { "copy_logits_max": -5.237147331237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.4375, "epoch": 0.39009446004595355, "gen_logits_max": 7.575350284576416, "gen_logits_mean": -10.953872680664062, "gen_logits_min": -22.149717330932617, "gen_logits_std": 2.4877431392669678, "gen_loss": 0.3401218056678772, "grad_norm": 0.487719887219057, "learning_rate": 2.851705263157895e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9930137544870377, "mean_gen_accuracy": 0.8565273731946945, "mean_token_accuracy": 0.8908980935811996, "num_tokens": 517507844.0, "sample_num_tokens": 7574.0, "step": 1910, "total_num_tokens": 517538140.0, "z_loss": 0.0009238188504241407 }, { "copy_logits_max": -3.55106520652771, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.25, "epoch": 0.39029869798315037, "gen_logits_max": 6.564386367797852, "gen_logits_mean": -12.80950927734375, "gen_logits_min": -24.502696990966797, "gen_logits_std": 2.488133430480957, "gen_loss": 0.30823004245758057, "grad_norm": 0.46688002549327295, "learning_rate": 2.851578947368421e-05, "loss": 0.33, "mean_copy_accuracy": 0.9944068342447281, "mean_gen_accuracy": 0.8605444133281708, "mean_token_accuracy": 0.894287496805191, "num_tokens": 517779566.0, "sample_num_tokens": 8597.0, "step": 1911, "total_num_tokens": 517813954.0, "z_loss": 0.001009804313071072 }, { "copy_logits_max": -5.8199968338012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.375, "epoch": 0.3905029359203472, "gen_logits_max": 7.879220485687256, "gen_logits_mean": -11.059627532958984, "gen_logits_min": -22.214391708374023, "gen_logits_std": 2.4452743530273438, "gen_loss": 0.3574019968509674, "grad_norm": 0.4541646896456216, "learning_rate": 2.8514526315789476e-05, "loss": 0.3294, "mean_copy_accuracy": 0.992944061756134, "mean_gen_accuracy": 0.8563947230577469, "mean_token_accuracy": 0.8898878395557404, "num_tokens": 518047567.0, "sample_num_tokens": 7252.75, "step": 1912, "total_num_tokens": 518076578.0, "z_loss": 0.000971539702732116 }, { "copy_logits_max": -5.110633373260498, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.3125, "epoch": 0.39070717385754405, "gen_logits_max": 6.775893211364746, "gen_logits_mean": -11.76905632019043, "gen_logits_min": -23.21359634399414, "gen_logits_std": 2.505077600479126, "gen_loss": 0.32047557830810547, "grad_norm": 0.46284134312427483, "learning_rate": 2.8513263157894736e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9945782423019409, "mean_gen_accuracy": 0.8600448369979858, "mean_token_accuracy": 0.8934950679540634, "num_tokens": 518292894.0, "sample_num_tokens": 7681.0, "step": 1913, "total_num_tokens": 518323618.0, "z_loss": 0.000951821100898087 }, { "copy_logits_max": -6.276740074157715, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.3125, "epoch": 0.39091141179474087, "gen_logits_max": 7.51838493347168, "gen_logits_mean": -10.801615715026855, "gen_logits_min": -22.266921997070312, "gen_logits_std": 2.471040725708008, "gen_loss": 0.2986600995063782, "grad_norm": 0.46371560231488984, "learning_rate": 2.8512e-05, "loss": 0.3374, "mean_copy_accuracy": 0.9951885491609573, "mean_gen_accuracy": 0.8562779277563095, "mean_token_accuracy": 0.8907789140939713, "num_tokens": 518563791.0, "sample_num_tokens": 8761.25, "step": 1914, "total_num_tokens": 518598836.0, "z_loss": 0.0009462112793698907 }, { "copy_logits_max": -5.426719665527344, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.8125, "epoch": 0.3911156497319377, "gen_logits_max": 7.074716567993164, "gen_logits_mean": -13.215991973876953, "gen_logits_min": -24.350025177001953, "gen_logits_std": 2.4360294342041016, "gen_loss": 0.36741340160369873, "grad_norm": 0.46701962599249497, "learning_rate": 2.8510736842105262e-05, "loss": 0.3367, "mean_copy_accuracy": 0.992946520447731, "mean_gen_accuracy": 0.8560306131839752, "mean_token_accuracy": 0.8882825672626495, "num_tokens": 518829070.0, "sample_num_tokens": 7952.0, "step": 1915, "total_num_tokens": 518860878.0, "z_loss": 0.0009423076407983899 }, { "copy_logits_max": -4.238212585449219, "copy_logits_min": -750000064.0, "copy_num_tokens": 451.625, "epoch": 0.39131988766913456, "gen_logits_max": 6.693687438964844, "gen_logits_mean": -11.97683334350586, "gen_logits_min": -23.681285858154297, "gen_logits_std": 2.4794325828552246, "gen_loss": 0.29673200845718384, "grad_norm": 0.5197861527463605, "learning_rate": 2.8509473684210526e-05, "loss": 0.339, "mean_copy_accuracy": 0.9917654544115067, "mean_gen_accuracy": 0.8570277541875839, "mean_token_accuracy": 0.8881931155920029, "num_tokens": 519088689.0, "sample_num_tokens": 7970.25, "step": 1916, "total_num_tokens": 519120570.0, "z_loss": 0.0009121980983763933 }, { "copy_logits_max": -5.720192909240723, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.3125, "epoch": 0.3915241256063314, "gen_logits_max": 6.644331932067871, "gen_logits_mean": -12.376099586486816, "gen_logits_min": -23.64723014831543, "gen_logits_std": 2.42862606048584, "gen_loss": 0.3533632159233093, "grad_norm": 0.4615642552048041, "learning_rate": 2.850821052631579e-05, "loss": 0.3301, "mean_copy_accuracy": 0.9930653721094131, "mean_gen_accuracy": 0.8602319359779358, "mean_token_accuracy": 0.8936644643545151, "num_tokens": 519379437.0, "sample_num_tokens": 7474.25, "step": 1917, "total_num_tokens": 519409334.0, "z_loss": 0.0008792550070211291 }, { "copy_logits_max": -5.489712238311768, "copy_logits_min": -687500032.0, "copy_num_tokens": 485.1875, "epoch": 0.3917283635435282, "gen_logits_max": 6.908580780029297, "gen_logits_mean": -11.835810661315918, "gen_logits_min": -23.137645721435547, "gen_logits_std": 2.4460134506225586, "gen_loss": 0.3286011815071106, "grad_norm": 0.4975903114184847, "learning_rate": 2.8506947368421055e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9939453452825546, "mean_gen_accuracy": 0.8581707328557968, "mean_token_accuracy": 0.8920880407094955, "num_tokens": 519647232.0, "sample_num_tokens": 8757.5, "step": 1918, "total_num_tokens": 519682262.0, "z_loss": 0.0008707050001248717 }, { "copy_logits_max": -7.403278350830078, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.8125, "epoch": 0.39193260148072506, "gen_logits_max": 7.611433982849121, "gen_logits_mean": -11.237997055053711, "gen_logits_min": -22.480920791625977, "gen_logits_std": 2.4382641315460205, "gen_loss": 0.3385677635669708, "grad_norm": 0.4682813116191506, "learning_rate": 2.8505684210526316e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9934270232915878, "mean_gen_accuracy": 0.8547582924365997, "mean_token_accuracy": 0.8873421847820282, "num_tokens": 519922918.0, "sample_num_tokens": 8062.0, "step": 1919, "total_num_tokens": 519955166.0, "z_loss": 0.0008542643627151847 }, { "copy_logits_max": -4.878808975219727, "copy_logits_min": -687500032.0, "copy_num_tokens": 354.625, "epoch": 0.3921368394179219, "gen_logits_max": 7.30909538269043, "gen_logits_mean": -12.266219139099121, "gen_logits_min": -23.555116653442383, "gen_logits_std": 2.452317714691162, "gen_loss": 0.36938780546188354, "grad_norm": 0.45834434788806905, "learning_rate": 2.850442105263158e-05, "loss": 0.3452, "mean_copy_accuracy": 0.9927694499492645, "mean_gen_accuracy": 0.854745477437973, "mean_token_accuracy": 0.8848804980516434, "num_tokens": 520182387.0, "sample_num_tokens": 7771.75, "step": 1920, "total_num_tokens": 520213474.0, "z_loss": 0.0008932072087191045 }, { "copy_logits_max": -5.973567962646484, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.5625, "epoch": 0.3923410773551187, "gen_logits_max": 6.432492256164551, "gen_logits_mean": -11.873956680297852, "gen_logits_min": -22.6575984954834, "gen_logits_std": 2.388017177581787, "gen_loss": 0.3372185230255127, "grad_norm": 0.43805414633337136, "learning_rate": 2.8503157894736844e-05, "loss": 0.3165, "mean_copy_accuracy": 0.992389976978302, "mean_gen_accuracy": 0.8679624199867249, "mean_token_accuracy": 0.8964543044567108, "num_tokens": 520445848.0, "sample_num_tokens": 8091.0, "step": 1921, "total_num_tokens": 520478212.0, "z_loss": 0.0008530571358278394 }, { "copy_logits_max": -5.462929725646973, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.625, "epoch": 0.39254531529231557, "gen_logits_max": 6.597050666809082, "gen_logits_mean": -11.969505310058594, "gen_logits_min": -23.7337646484375, "gen_logits_std": 2.498833656311035, "gen_loss": 0.31399625539779663, "grad_norm": 0.4706126920541109, "learning_rate": 2.8501894736842105e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9935438483953476, "mean_gen_accuracy": 0.8617426007986069, "mean_token_accuracy": 0.8949041962623596, "num_tokens": 520718312.0, "sample_num_tokens": 10094.0, "step": 1922, "total_num_tokens": 520758688.0, "z_loss": 0.0009182321955449879 }, { "copy_logits_max": -5.099356174468994, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.125, "epoch": 0.3927495532295124, "gen_logits_max": 7.240394115447998, "gen_logits_mean": -11.293760299682617, "gen_logits_min": -22.63739585876465, "gen_logits_std": 2.4410247802734375, "gen_loss": 0.33957386016845703, "grad_norm": 0.4764311724101893, "learning_rate": 2.850063157894737e-05, "loss": 0.3375, "mean_copy_accuracy": 0.989577904343605, "mean_gen_accuracy": 0.8635239452123642, "mean_token_accuracy": 0.890871986746788, "num_tokens": 520987822.0, "sample_num_tokens": 9339.5, "step": 1923, "total_num_tokens": 521025180.0, "z_loss": 0.0008955199155025184 }, { "copy_logits_max": -3.1020448207855225, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.0, "epoch": 0.3929537911667092, "gen_logits_max": 6.791629314422607, "gen_logits_mean": -11.694778442382812, "gen_logits_min": -23.224529266357422, "gen_logits_std": 2.490248203277588, "gen_loss": 0.3428836166858673, "grad_norm": 0.47559984243544484, "learning_rate": 2.849936842105263e-05, "loss": 0.3534, "mean_copy_accuracy": 0.9911567717790604, "mean_gen_accuracy": 0.8571850508451462, "mean_token_accuracy": 0.8849934190511703, "num_tokens": 521237460.0, "sample_num_tokens": 9373.0, "step": 1924, "total_num_tokens": 521274952.0, "z_loss": 0.0010499188210815191 }, { "copy_logits_max": -5.606210708618164, "copy_logits_min": -750000000.0, "copy_num_tokens": 273.5, "epoch": 0.39315802910390607, "gen_logits_max": 7.531173229217529, "gen_logits_mean": -11.776066780090332, "gen_logits_min": -23.29828643798828, "gen_logits_std": 2.482365846633911, "gen_loss": 0.3220730423927307, "grad_norm": 0.45965687629099455, "learning_rate": 2.8498105263157898e-05, "loss": 0.3297, "mean_copy_accuracy": 0.9917054623365402, "mean_gen_accuracy": 0.8605124354362488, "mean_token_accuracy": 0.8928559571504593, "num_tokens": 521522566.0, "sample_num_tokens": 7014.0, "step": 1925, "total_num_tokens": 521550622.0, "z_loss": 0.0008845495758578181 }, { "copy_logits_max": -3.9198806285858154, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.6875, "epoch": 0.3933622670411029, "gen_logits_max": 6.810311317443848, "gen_logits_mean": -13.07901668548584, "gen_logits_min": -24.559476852416992, "gen_logits_std": 2.4687790870666504, "gen_loss": 0.3289458751678467, "grad_norm": 0.43248546797180076, "learning_rate": 2.849684210526316e-05, "loss": 0.3187, "mean_copy_accuracy": 0.9932987987995148, "mean_gen_accuracy": 0.8626707345247269, "mean_token_accuracy": 0.8963913172483444, "num_tokens": 521779314.0, "sample_num_tokens": 8687.5, "step": 1926, "total_num_tokens": 521814064.0, "z_loss": 0.0010406064102426171 }, { "copy_logits_max": -2.8116254806518555, "copy_logits_min": -625000064.0, "copy_num_tokens": 554.0625, "epoch": 0.3935665049782997, "gen_logits_max": 6.948726654052734, "gen_logits_mean": -11.995485305786133, "gen_logits_min": -23.88698959350586, "gen_logits_std": 2.5232882499694824, "gen_loss": 0.3167770206928253, "grad_norm": 0.4975570647306538, "learning_rate": 2.8495578947368423e-05, "loss": 0.3335, "mean_copy_accuracy": 0.9927104711532593, "mean_gen_accuracy": 0.8559970557689667, "mean_token_accuracy": 0.8913341760635376, "num_tokens": 522062602.0, "sample_num_tokens": 8787.0, "step": 1927, "total_num_tokens": 522097750.0, "z_loss": 0.001025712350383401 }, { "copy_logits_max": -2.2130205631256104, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.5625, "epoch": 0.3937707429154966, "gen_logits_max": 7.011693000793457, "gen_logits_mean": -11.852336883544922, "gen_logits_min": -23.826351165771484, "gen_logits_std": 2.513225555419922, "gen_loss": 0.32442575693130493, "grad_norm": 0.4727341079068393, "learning_rate": 2.8494315789473684e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9915720969438553, "mean_gen_accuracy": 0.8589123040437698, "mean_token_accuracy": 0.8918759226799011, "num_tokens": 522325373.0, "sample_num_tokens": 8971.75, "step": 1928, "total_num_tokens": 522361260.0, "z_loss": 0.0010079891653731465 }, { "copy_logits_max": -3.7825145721435547, "copy_logits_min": -687500032.0, "copy_num_tokens": 412.375, "epoch": 0.3939749808526934, "gen_logits_max": 7.619880676269531, "gen_logits_mean": -11.842317581176758, "gen_logits_min": -23.738143920898438, "gen_logits_std": 2.475996494293213, "gen_loss": 0.2994363307952881, "grad_norm": 0.4523858759446761, "learning_rate": 2.849305263157895e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9917664527893066, "mean_gen_accuracy": 0.8663311153650284, "mean_token_accuracy": 0.8954254686832428, "num_tokens": 522599410.0, "sample_num_tokens": 8807.5, "step": 1929, "total_num_tokens": 522634640.0, "z_loss": 0.0009546087239868939 }, { "copy_logits_max": -4.523049354553223, "copy_logits_min": -687500032.0, "copy_num_tokens": 342.1875, "epoch": 0.3941792187898902, "gen_logits_max": 8.1914701461792, "gen_logits_mean": -11.663496017456055, "gen_logits_min": -23.291248321533203, "gen_logits_std": 2.4671032428741455, "gen_loss": 0.3398689031600952, "grad_norm": 0.5086420611799729, "learning_rate": 2.849178947368421e-05, "loss": 0.3474, "mean_copy_accuracy": 0.9917824268341064, "mean_gen_accuracy": 0.8590197861194611, "mean_token_accuracy": 0.8854863345623016, "num_tokens": 522853015.0, "sample_num_tokens": 7818.25, "step": 1930, "total_num_tokens": 522884288.0, "z_loss": 0.0009217076585628092 }, { "copy_logits_max": -3.8724207878112793, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.375, "epoch": 0.3943834567270871, "gen_logits_max": 6.607526779174805, "gen_logits_mean": -12.356416702270508, "gen_logits_min": -23.888687133789062, "gen_logits_std": 2.471508502960205, "gen_loss": 0.3191874325275421, "grad_norm": 0.5152064919976567, "learning_rate": 2.8490526315789474e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9937535673379898, "mean_gen_accuracy": 0.8505192995071411, "mean_token_accuracy": 0.8863868862390518, "num_tokens": 523113804.0, "sample_num_tokens": 8198.5, "step": 1931, "total_num_tokens": 523146598.0, "z_loss": 0.0009541231556795537 }, { "copy_logits_max": -1.8838074207305908, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.25, "epoch": 0.3945876946642839, "gen_logits_max": 6.815756797790527, "gen_logits_mean": -11.210406303405762, "gen_logits_min": -23.137004852294922, "gen_logits_std": 2.523733139038086, "gen_loss": 0.31675195693969727, "grad_norm": 0.5289487309577207, "learning_rate": 2.8489263157894735e-05, "loss": 0.3257, "mean_copy_accuracy": 0.9926569163799286, "mean_gen_accuracy": 0.8601289540529251, "mean_token_accuracy": 0.8943494260311127, "num_tokens": 523380559.0, "sample_num_tokens": 8532.25, "step": 1932, "total_num_tokens": 523414688.0, "z_loss": 0.0008679426973685622 }, { "copy_logits_max": -3.9008615016937256, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.0625, "epoch": 0.3947919326014807, "gen_logits_max": 8.465283393859863, "gen_logits_mean": -11.396775245666504, "gen_logits_min": -22.671768188476562, "gen_logits_std": 2.496504783630371, "gen_loss": 0.3616481125354767, "grad_norm": 0.4476513201429479, "learning_rate": 2.8488000000000002e-05, "loss": 0.3107, "mean_copy_accuracy": 0.9937315434217453, "mean_gen_accuracy": 0.8627807050943375, "mean_token_accuracy": 0.8992796987295151, "num_tokens": 523656281.0, "sample_num_tokens": 7880.25, "step": 1933, "total_num_tokens": 523687802.0, "z_loss": 0.0008808442507870495 }, { "copy_logits_max": -1.7113568782806396, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.375, "epoch": 0.3949961705386776, "gen_logits_max": 6.87705135345459, "gen_logits_mean": -12.362096786499023, "gen_logits_min": -23.869583129882812, "gen_logits_std": 2.5269432067871094, "gen_loss": 0.2890012860298157, "grad_norm": 0.4299775628156191, "learning_rate": 2.8486736842105267e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9924186766147614, "mean_gen_accuracy": 0.8637868314981461, "mean_token_accuracy": 0.8933122456073761, "num_tokens": 523918254.0, "sample_num_tokens": 10198.5, "step": 1934, "total_num_tokens": 523959048.0, "z_loss": 0.0008183649042621255 }, { "copy_logits_max": -2.2670605182647705, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.4375, "epoch": 0.3952004084758744, "gen_logits_max": 7.533854007720947, "gen_logits_mean": -12.605216979980469, "gen_logits_min": -24.59101104736328, "gen_logits_std": 2.518059015274048, "gen_loss": 0.35892295837402344, "grad_norm": 0.4755447527457851, "learning_rate": 2.8485473684210528e-05, "loss": 0.3271, "mean_copy_accuracy": 0.9936158508062363, "mean_gen_accuracy": 0.8596975058317184, "mean_token_accuracy": 0.8926579505205154, "num_tokens": 524184161.0, "sample_num_tokens": 8197.75, "step": 1935, "total_num_tokens": 524216952.0, "z_loss": 0.0009090877138078213 }, { "copy_logits_max": -3.6718645095825195, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.1875, "epoch": 0.3954046464130712, "gen_logits_max": 7.884433746337891, "gen_logits_mean": -12.44318675994873, "gen_logits_min": -24.007282257080078, "gen_logits_std": 2.493774175643921, "gen_loss": 0.2865908741950989, "grad_norm": 0.4469554687981461, "learning_rate": 2.8484210526315792e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9935547113418579, "mean_gen_accuracy": 0.8594139665365219, "mean_token_accuracy": 0.8911851048469543, "num_tokens": 524451648.0, "sample_num_tokens": 8840.0, "step": 1936, "total_num_tokens": 524487008.0, "z_loss": 0.0008143699378706515 }, { "copy_logits_max": -1.510791540145874, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.3125, "epoch": 0.3956088843502681, "gen_logits_max": 7.595470905303955, "gen_logits_mean": -11.777195930480957, "gen_logits_min": -23.899124145507812, "gen_logits_std": 2.5572333335876465, "gen_loss": 0.328336238861084, "grad_norm": 0.4237796788507826, "learning_rate": 2.8482947368421053e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9941845387220383, "mean_gen_accuracy": 0.8634916841983795, "mean_token_accuracy": 0.8978915512561798, "num_tokens": 524725593.0, "sample_num_tokens": 9660.25, "step": 1937, "total_num_tokens": 524764234.0, "z_loss": 0.0009350677719339728 }, { "copy_logits_max": -2.7146899700164795, "copy_logits_min": -750000064.0, "copy_num_tokens": 345.5625, "epoch": 0.3958131222874649, "gen_logits_max": 7.706332206726074, "gen_logits_mean": -12.694642066955566, "gen_logits_min": -24.4984188079834, "gen_logits_std": 2.5143535137176514, "gen_loss": 0.338668555021286, "grad_norm": 0.4759307893457568, "learning_rate": 2.8481684210526317e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9916173815727234, "mean_gen_accuracy": 0.861866220831871, "mean_token_accuracy": 0.8904800266027451, "num_tokens": 525002954.0, "sample_num_tokens": 7365.0, "step": 1938, "total_num_tokens": 525032414.0, "z_loss": 0.0009010903886519372 }, { "copy_logits_max": -2.3834736347198486, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.125, "epoch": 0.3960173602246617, "gen_logits_max": 7.814738750457764, "gen_logits_mean": -11.889165878295898, "gen_logits_min": -23.56950569152832, "gen_logits_std": 2.500868320465088, "gen_loss": 0.3471655249595642, "grad_norm": 0.4478237953200675, "learning_rate": 2.8480421052631578e-05, "loss": 0.326, "mean_copy_accuracy": 0.9934440553188324, "mean_gen_accuracy": 0.8604812026023865, "mean_token_accuracy": 0.8912979364395142, "num_tokens": 525259779.0, "sample_num_tokens": 9009.75, "step": 1939, "total_num_tokens": 525295818.0, "z_loss": 0.0008761586504988372 }, { "copy_logits_max": -2.8741917610168457, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.8125, "epoch": 0.3962215981618586, "gen_logits_max": 7.6767168045043945, "gen_logits_mean": -12.498608589172363, "gen_logits_min": -24.324615478515625, "gen_logits_std": 2.505699396133423, "gen_loss": 0.3654264807701111, "grad_norm": 0.5012253231504199, "learning_rate": 2.8479157894736842e-05, "loss": 0.3521, "mean_copy_accuracy": 0.9947791695594788, "mean_gen_accuracy": 0.8456165194511414, "mean_token_accuracy": 0.8838742822408676, "num_tokens": 525522772.0, "sample_num_tokens": 7844.5, "step": 1940, "total_num_tokens": 525554150.0, "z_loss": 0.0010185050778090954 }, { "copy_logits_max": -4.769108772277832, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.1875, "epoch": 0.3964258360990554, "gen_logits_max": 6.97501802444458, "gen_logits_mean": -14.083822250366211, "gen_logits_min": -25.27792739868164, "gen_logits_std": 2.4433767795562744, "gen_loss": 0.346346914768219, "grad_norm": 0.4809966372132951, "learning_rate": 2.8477894736842103e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9938127398490906, "mean_gen_accuracy": 0.853157177567482, "mean_token_accuracy": 0.88652104139328, "num_tokens": 525789006.0, "sample_num_tokens": 7930.5, "step": 1941, "total_num_tokens": 525820728.0, "z_loss": 0.0008810039144009352 }, { "copy_logits_max": -1.4610830545425415, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.875, "epoch": 0.3966300740362522, "gen_logits_max": 7.2857666015625, "gen_logits_mean": -12.222259521484375, "gen_logits_min": -23.810630798339844, "gen_logits_std": 2.539254665374756, "gen_loss": 0.3256120979785919, "grad_norm": 0.5346493528449952, "learning_rate": 2.847663157894737e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9942493438720703, "mean_gen_accuracy": 0.8595871031284332, "mean_token_accuracy": 0.8951475024223328, "num_tokens": 526056791.0, "sample_num_tokens": 8700.75, "step": 1942, "total_num_tokens": 526091594.0, "z_loss": 0.0009446043986827135 }, { "copy_logits_max": -3.872729778289795, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.8125, "epoch": 0.3968343119734491, "gen_logits_max": 8.606521606445312, "gen_logits_mean": -11.129152297973633, "gen_logits_min": -23.244808197021484, "gen_logits_std": 2.5638136863708496, "gen_loss": 0.2824070453643799, "grad_norm": 0.4776648432475248, "learning_rate": 2.8475368421052632e-05, "loss": 0.3099, "mean_copy_accuracy": 0.993874192237854, "mean_gen_accuracy": 0.86589215695858, "mean_token_accuracy": 0.8997996002435684, "num_tokens": 526343192.0, "sample_num_tokens": 9073.5, "step": 1943, "total_num_tokens": 526379486.0, "z_loss": 0.0008854459156282246 }, { "copy_logits_max": -3.3707218170166016, "copy_logits_min": -750000064.0, "copy_num_tokens": 388.4375, "epoch": 0.3970385499106459, "gen_logits_max": 8.184993743896484, "gen_logits_mean": -11.711725234985352, "gen_logits_min": -23.649433135986328, "gen_logits_std": 2.499920129776001, "gen_loss": 0.3136754631996155, "grad_norm": 0.462749118178725, "learning_rate": 2.8474105263157896e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9932598173618317, "mean_gen_accuracy": 0.8653034865856171, "mean_token_accuracy": 0.897176206111908, "num_tokens": 526610777.0, "sample_num_tokens": 8467.25, "step": 1944, "total_num_tokens": 526644646.0, "z_loss": 0.0009153772844001651 }, { "copy_logits_max": -4.432560920715332, "copy_logits_min": -687499968.0, "copy_num_tokens": 406.5, "epoch": 0.3972427878478427, "gen_logits_max": 8.869413375854492, "gen_logits_mean": -11.054768562316895, "gen_logits_min": -23.056591033935547, "gen_logits_std": 2.543370485305786, "gen_loss": 0.34472817182540894, "grad_norm": 0.4653253647345022, "learning_rate": 2.8472842105263157e-05, "loss": 0.3433, "mean_copy_accuracy": 0.9934723377227783, "mean_gen_accuracy": 0.8597367107868195, "mean_token_accuracy": 0.8889665603637695, "num_tokens": 526878332.0, "sample_num_tokens": 8034.0, "step": 1945, "total_num_tokens": 526910468.0, "z_loss": 0.0009275514166802168 }, { "copy_logits_max": -2.5700788497924805, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.4375, "epoch": 0.3974470257850396, "gen_logits_max": 7.839385032653809, "gen_logits_mean": -11.754518508911133, "gen_logits_min": -23.992576599121094, "gen_logits_std": 2.5796449184417725, "gen_loss": 0.3082703948020935, "grad_norm": 0.444825670722254, "learning_rate": 2.847157894736842e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9944546222686768, "mean_gen_accuracy": 0.8561126440763474, "mean_token_accuracy": 0.8890715390443802, "num_tokens": 527143747.0, "sample_num_tokens": 7793.25, "step": 1946, "total_num_tokens": 527174920.0, "z_loss": 0.000960320292506367 }, { "copy_logits_max": -2.841465950012207, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.3125, "epoch": 0.3976512637222364, "gen_logits_max": 7.645505905151367, "gen_logits_mean": -12.19955062866211, "gen_logits_min": -24.375944137573242, "gen_logits_std": 2.517533540725708, "gen_loss": 0.3721870183944702, "grad_norm": 0.47615081682712307, "learning_rate": 2.8470315789473686e-05, "loss": 0.3492, "mean_copy_accuracy": 0.9939673691987991, "mean_gen_accuracy": 0.8476333767175674, "mean_token_accuracy": 0.8896265625953674, "num_tokens": 527412887.0, "sample_num_tokens": 8810.75, "step": 1947, "total_num_tokens": 527448130.0, "z_loss": 0.0010329654905945063 }, { "copy_logits_max": -4.637465000152588, "copy_logits_min": -750000064.0, "copy_num_tokens": 436.8125, "epoch": 0.39785550165943323, "gen_logits_max": 8.20500373840332, "gen_logits_mean": -11.893135070800781, "gen_logits_min": -24.00017738342285, "gen_logits_std": 2.5275940895080566, "gen_loss": 0.2938067317008972, "grad_norm": 0.5081164916225169, "learning_rate": 2.8469052631578947e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9929003268480301, "mean_gen_accuracy": 0.8648210614919662, "mean_token_accuracy": 0.8930269479751587, "num_tokens": 527671264.0, "sample_num_tokens": 8007.5, "step": 1948, "total_num_tokens": 527703294.0, "z_loss": 0.0008611181983724236 }, { "copy_logits_max": -4.814521312713623, "copy_logits_min": -687500032.0, "copy_num_tokens": 592.1875, "epoch": 0.39805973959663005, "gen_logits_max": 7.305307388305664, "gen_logits_mean": -11.70948600769043, "gen_logits_min": -23.561298370361328, "gen_logits_std": 2.551006317138672, "gen_loss": 0.31090107560157776, "grad_norm": 0.46134450458422116, "learning_rate": 2.846778947368421e-05, "loss": 0.3133, "mean_copy_accuracy": 0.9940425306558609, "mean_gen_accuracy": 0.8610424846410751, "mean_token_accuracy": 0.8968018293380737, "num_tokens": 527944157.0, "sample_num_tokens": 9537.25, "step": 1949, "total_num_tokens": 527982306.0, "z_loss": 0.0008837043424136937 }, { "copy_logits_max": -4.998384475708008, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.125, "epoch": 0.3982639775338269, "gen_logits_max": 7.76530647277832, "gen_logits_mean": -12.017324447631836, "gen_logits_min": -23.39077377319336, "gen_logits_std": 2.4621763229370117, "gen_loss": 0.3499285876750946, "grad_norm": 0.49295377028532344, "learning_rate": 2.8466526315789475e-05, "loss": 0.3521, "mean_copy_accuracy": 0.9924859404563904, "mean_gen_accuracy": 0.8556610196828842, "mean_token_accuracy": 0.8860794305801392, "num_tokens": 528206828.0, "sample_num_tokens": 7735.0, "step": 1950, "total_num_tokens": 528237768.0, "z_loss": 0.0009715836495161057 }, { "copy_logits_max": -3.9224348068237305, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.25, "epoch": 0.39846821547102373, "gen_logits_max": 7.34868860244751, "gen_logits_mean": -11.461451530456543, "gen_logits_min": -23.04230499267578, "gen_logits_std": 2.4740538597106934, "gen_loss": 0.3121810257434845, "grad_norm": 0.616732630150002, "learning_rate": 2.846526315789474e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9930328130722046, "mean_gen_accuracy": 0.8616685718297958, "mean_token_accuracy": 0.8904326260089874, "num_tokens": 528445765.0, "sample_num_tokens": 8562.25, "step": 1951, "total_num_tokens": 528480014.0, "z_loss": 0.0009209217969328165 }, { "copy_logits_max": -5.492447376251221, "copy_logits_min": -750000000.0, "copy_num_tokens": 254.5625, "epoch": 0.39867245340822055, "gen_logits_max": 8.630800247192383, "gen_logits_mean": -11.26441764831543, "gen_logits_min": -22.729721069335938, "gen_logits_std": 2.4829742908477783, "gen_loss": 0.3253520727157593, "grad_norm": 0.47171710378087967, "learning_rate": 2.8464e-05, "loss": 0.3434, "mean_copy_accuracy": 0.9930177181959152, "mean_gen_accuracy": 0.8547006249427795, "mean_token_accuracy": 0.8876498490571976, "num_tokens": 528707868.0, "sample_num_tokens": 6913.0, "step": 1952, "total_num_tokens": 528735520.0, "z_loss": 0.001013497356325388 }, { "copy_logits_max": -2.619157314300537, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.25, "epoch": 0.3988766913454174, "gen_logits_max": 7.51062536239624, "gen_logits_mean": -11.85023307800293, "gen_logits_min": -24.12999725341797, "gen_logits_std": 2.546915292739868, "gen_loss": 0.32250887155532837, "grad_norm": 0.4737733841092187, "learning_rate": 2.8462736842105265e-05, "loss": 0.3468, "mean_copy_accuracy": 0.9925760328769684, "mean_gen_accuracy": 0.856332927942276, "mean_token_accuracy": 0.8856188952922821, "num_tokens": 528979110.0, "sample_num_tokens": 8233.5, "step": 1953, "total_num_tokens": 529012044.0, "z_loss": 0.0009497987339273095 }, { "copy_logits_max": -3.223097801208496, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.9375, "epoch": 0.39908092928261424, "gen_logits_max": 7.195185661315918, "gen_logits_mean": -12.920400619506836, "gen_logits_min": -24.73880386352539, "gen_logits_std": 2.510896682739258, "gen_loss": 0.29547831416130066, "grad_norm": 0.4943756767976531, "learning_rate": 2.8461473684210526e-05, "loss": 0.321, "mean_copy_accuracy": 0.9925170838832855, "mean_gen_accuracy": 0.8620570302009583, "mean_token_accuracy": 0.8948979675769806, "num_tokens": 529270903.0, "sample_num_tokens": 8091.75, "step": 1954, "total_num_tokens": 529303270.0, "z_loss": 0.0009557507000863552 }, { "copy_logits_max": -3.623079299926758, "copy_logits_min": -750000064.0, "copy_num_tokens": 265.3125, "epoch": 0.39928516721981105, "gen_logits_max": 8.645828247070312, "gen_logits_mean": -11.262767791748047, "gen_logits_min": -23.108821868896484, "gen_logits_std": 2.508589506149292, "gen_loss": 0.3672059178352356, "grad_norm": 0.45805495356350534, "learning_rate": 2.846021052631579e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9917318969964981, "mean_gen_accuracy": 0.8561352044343948, "mean_token_accuracy": 0.886869952082634, "num_tokens": 529532254.0, "sample_num_tokens": 7541.5, "step": 1955, "total_num_tokens": 529562420.0, "z_loss": 0.0010156750213354826 }, { "copy_logits_max": -3.8079094886779785, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.5625, "epoch": 0.3994894051570079, "gen_logits_max": 8.196816444396973, "gen_logits_mean": -12.00859260559082, "gen_logits_min": -24.129741668701172, "gen_logits_std": 2.5350122451782227, "gen_loss": 0.2973913848400116, "grad_norm": 0.4577648229360791, "learning_rate": 2.845894736842105e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9931331127882004, "mean_gen_accuracy": 0.8682723939418793, "mean_token_accuracy": 0.8988672494888306, "num_tokens": 529787731.0, "sample_num_tokens": 8612.25, "step": 1956, "total_num_tokens": 529822180.0, "z_loss": 0.0010441362392157316 }, { "copy_logits_max": -1.125760793685913, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.375, "epoch": 0.39969364309420474, "gen_logits_max": 7.185785293579102, "gen_logits_mean": -12.188749313354492, "gen_logits_min": -24.28934097290039, "gen_logits_std": 2.536287784576416, "gen_loss": 0.32024306058883667, "grad_norm": 0.4514572722502492, "learning_rate": 2.8457684210526315e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9943061023950577, "mean_gen_accuracy": 0.8647079616785049, "mean_token_accuracy": 0.8972823619842529, "num_tokens": 530062345.0, "sample_num_tokens": 8012.25, "step": 1957, "total_num_tokens": 530094394.0, "z_loss": 0.0010430535767227411 }, { "copy_logits_max": -6.925814151763916, "copy_logits_min": -750000064.0, "copy_num_tokens": 519.125, "epoch": 0.39989788103140156, "gen_logits_max": 7.609209060668945, "gen_logits_mean": -11.775382995605469, "gen_logits_min": -23.141490936279297, "gen_logits_std": 2.4759316444396973, "gen_loss": 0.29440563917160034, "grad_norm": 0.4708826396991308, "learning_rate": 2.845642105263158e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9936589598655701, "mean_gen_accuracy": 0.8651278614997864, "mean_token_accuracy": 0.8999881148338318, "num_tokens": 530337532.0, "sample_num_tokens": 9170.0, "step": 1958, "total_num_tokens": 530374212.0, "z_loss": 0.0009260295773856342 }, { "copy_logits_max": -4.608003616333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.375, "epoch": 0.40010211896859843, "gen_logits_max": 6.825530529022217, "gen_logits_mean": -13.191656112670898, "gen_logits_min": -24.708181381225586, "gen_logits_std": 2.4646859169006348, "gen_loss": 0.31288206577301025, "grad_norm": 0.44536777485120754, "learning_rate": 2.8455157894736844e-05, "loss": 0.312, "mean_copy_accuracy": 0.9945585131645203, "mean_gen_accuracy": 0.8638720512390137, "mean_token_accuracy": 0.8972280323505402, "num_tokens": 530615296.0, "sample_num_tokens": 7750.5, "step": 1959, "total_num_tokens": 530646298.0, "z_loss": 0.0009686558041721582 }, { "copy_logits_max": -3.5691118240356445, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.3125, "epoch": 0.40030635690579525, "gen_logits_max": 8.326629638671875, "gen_logits_mean": -11.441883087158203, "gen_logits_min": -23.087156295776367, "gen_logits_std": 2.5174877643585205, "gen_loss": 0.33896735310554504, "grad_norm": 0.45266181208370987, "learning_rate": 2.8453894736842108e-05, "loss": 0.3474, "mean_copy_accuracy": 0.99290731549263, "mean_gen_accuracy": 0.8536843508481979, "mean_token_accuracy": 0.8876215517520905, "num_tokens": 530884469.0, "sample_num_tokens": 8045.75, "step": 1960, "total_num_tokens": 530916652.0, "z_loss": 0.0009437762200832367 }, { "copy_logits_max": -3.9165894985198975, "copy_logits_min": -750000064.0, "copy_num_tokens": 312.625, "epoch": 0.40051059484299206, "gen_logits_max": 8.369446754455566, "gen_logits_mean": -12.122857093811035, "gen_logits_min": -24.1837158203125, "gen_logits_std": 2.5321478843688965, "gen_loss": 0.3462734520435333, "grad_norm": 0.4638630314693213, "learning_rate": 2.845263157894737e-05, "loss": 0.334, "mean_copy_accuracy": 0.993901327252388, "mean_gen_accuracy": 0.8565285950899124, "mean_token_accuracy": 0.8910714983940125, "num_tokens": 531158430.0, "sample_num_tokens": 6566.5, "step": 1961, "total_num_tokens": 531184696.0, "z_loss": 0.0009935824200510979 }, { "copy_logits_max": -4.284049987792969, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.9375, "epoch": 0.40071483278018893, "gen_logits_max": 7.786654472351074, "gen_logits_mean": -12.209890365600586, "gen_logits_min": -23.92167854309082, "gen_logits_std": 2.5105695724487305, "gen_loss": 0.339824914932251, "grad_norm": 0.4662839689133598, "learning_rate": 2.8451368421052634e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9921900033950806, "mean_gen_accuracy": 0.8589613288640976, "mean_token_accuracy": 0.89162577688694, "num_tokens": 531417277.0, "sample_num_tokens": 8283.75, "step": 1962, "total_num_tokens": 531450412.0, "z_loss": 0.0009510252857580781 }, { "copy_logits_max": -3.292980432510376, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.375, "epoch": 0.40091907071738575, "gen_logits_max": 6.859416484832764, "gen_logits_mean": -12.777849197387695, "gen_logits_min": -24.683317184448242, "gen_logits_std": 2.5245561599731445, "gen_loss": 0.34826087951660156, "grad_norm": 0.5802558352278211, "learning_rate": 2.8450105263157894e-05, "loss": 0.3429, "mean_copy_accuracy": 0.9917576462030411, "mean_gen_accuracy": 0.8532564043998718, "mean_token_accuracy": 0.8885203450918198, "num_tokens": 531683665.0, "sample_num_tokens": 8325.75, "step": 1963, "total_num_tokens": 531716968.0, "z_loss": 0.0009028985514305532 }, { "copy_logits_max": -2.399567127227783, "copy_logits_min": -750000000.0, "copy_num_tokens": 690.0625, "epoch": 0.40112330865458257, "gen_logits_max": 8.282536506652832, "gen_logits_mean": -10.860454559326172, "gen_logits_min": -23.170454025268555, "gen_logits_std": 2.5638341903686523, "gen_loss": 0.3096015155315399, "grad_norm": 0.479818342678352, "learning_rate": 2.844884210526316e-05, "loss": 0.329, "mean_copy_accuracy": 0.992547869682312, "mean_gen_accuracy": 0.8599725365638733, "mean_token_accuracy": 0.8910769373178482, "num_tokens": 531952114.0, "sample_num_tokens": 10852.0, "step": 1964, "total_num_tokens": 531995522.0, "z_loss": 0.0009703547693789005 }, { "copy_logits_max": -3.149960517883301, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.625, "epoch": 0.40132754659177944, "gen_logits_max": 6.494607448577881, "gen_logits_mean": -13.083391189575195, "gen_logits_min": -24.85314178466797, "gen_logits_std": 2.50158953666687, "gen_loss": 0.3179931640625, "grad_norm": 0.4834766215044777, "learning_rate": 2.844757894736842e-05, "loss": 0.335, "mean_copy_accuracy": 0.9922560900449753, "mean_gen_accuracy": 0.8582242727279663, "mean_token_accuracy": 0.8923901319503784, "num_tokens": 532260141.0, "sample_num_tokens": 10811.75, "step": 1965, "total_num_tokens": 532303388.0, "z_loss": 0.000899015401955694 }, { "copy_logits_max": -2.3497114181518555, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.8125, "epoch": 0.40153178452897625, "gen_logits_max": 7.902994155883789, "gen_logits_mean": -11.582396507263184, "gen_logits_min": -23.655656814575195, "gen_logits_std": 2.5674476623535156, "gen_loss": 0.32000935077667236, "grad_norm": 0.4325914530563609, "learning_rate": 2.8446315789473687e-05, "loss": 0.3462, "mean_copy_accuracy": 0.9946841597557068, "mean_gen_accuracy": 0.8505087494850159, "mean_token_accuracy": 0.8877397775650024, "num_tokens": 532563710.0, "sample_num_tokens": 9268.0, "step": 1966, "total_num_tokens": 532600782.0, "z_loss": 0.001074210274964571 }, { "copy_logits_max": -1.902171015739441, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.9375, "epoch": 0.40173602246617307, "gen_logits_max": 6.570603847503662, "gen_logits_mean": -13.221662521362305, "gen_logits_min": -25.506412506103516, "gen_logits_std": 2.5451419353485107, "gen_loss": 0.315348744392395, "grad_norm": 0.48448673787328916, "learning_rate": 2.8445052631578948e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9930113554000854, "mean_gen_accuracy": 0.8556868135929108, "mean_token_accuracy": 0.89295893907547, "num_tokens": 532845408.0, "sample_num_tokens": 8875.5, "step": 1967, "total_num_tokens": 532880910.0, "z_loss": 0.0009881183505058289 }, { "copy_logits_max": -1.8211472034454346, "copy_logits_min": -687500032.0, "copy_num_tokens": 588.375, "epoch": 0.40194026040336994, "gen_logits_max": 7.559038162231445, "gen_logits_mean": -12.03542709350586, "gen_logits_min": -24.18471908569336, "gen_logits_std": 2.5822806358337402, "gen_loss": 0.27357518672943115, "grad_norm": 0.5379694247591212, "learning_rate": 2.8443789473684213e-05, "loss": 0.3227, "mean_copy_accuracy": 0.992078498005867, "mean_gen_accuracy": 0.8609976768493652, "mean_token_accuracy": 0.8944940119981766, "num_tokens": 533115107.0, "sample_num_tokens": 8957.25, "step": 1968, "total_num_tokens": 533150936.0, "z_loss": 0.0009476299746893346 }, { "copy_logits_max": 0.37975990772247314, "copy_logits_min": -750000000.0, "copy_num_tokens": 684.6875, "epoch": 0.40214449834056676, "gen_logits_max": 6.903573036193848, "gen_logits_mean": -12.624765396118164, "gen_logits_min": -25.17205047607422, "gen_logits_std": 2.5671005249023438, "gen_loss": 0.2947574853897095, "grad_norm": 0.42902083888611225, "learning_rate": 2.8442526315789474e-05, "loss": 0.307, "mean_copy_accuracy": 0.9924245029687881, "mean_gen_accuracy": 0.8675412088632584, "mean_token_accuracy": 0.8994351774454117, "num_tokens": 533415163.0, "sample_num_tokens": 10056.25, "step": 1969, "total_num_tokens": 533455388.0, "z_loss": 0.0010580859379842877 }, { "copy_logits_max": -3.4046339988708496, "copy_logits_min": -750000000.0, "copy_num_tokens": 685.75, "epoch": 0.4023487362777636, "gen_logits_max": 7.752995491027832, "gen_logits_mean": -11.431450843811035, "gen_logits_min": -23.456832885742188, "gen_logits_std": 2.5351669788360596, "gen_loss": 0.2816659212112427, "grad_norm": 0.4076414587289885, "learning_rate": 2.8441263157894738e-05, "loss": 0.3296, "mean_copy_accuracy": 0.9947418868541718, "mean_gen_accuracy": 0.8613310307264328, "mean_token_accuracy": 0.8933289349079132, "num_tokens": 533704378.0, "sample_num_tokens": 10073.0, "step": 1970, "total_num_tokens": 533744670.0, "z_loss": 0.000887510715983808 }, { "copy_logits_max": -3.1281726360321045, "copy_logits_min": -750000000.0, "copy_num_tokens": 680.375, "epoch": 0.40255297421496045, "gen_logits_max": 5.845576286315918, "gen_logits_mean": -13.79862117767334, "gen_logits_min": -25.49868392944336, "gen_logits_std": 2.508455753326416, "gen_loss": 0.33061593770980835, "grad_norm": 0.4191603409729441, "learning_rate": 2.844e-05, "loss": 0.3414, "mean_copy_accuracy": 0.9937401413917542, "mean_gen_accuracy": 0.8509610891342163, "mean_token_accuracy": 0.8872459530830383, "num_tokens": 533990405.0, "sample_num_tokens": 9930.75, "step": 1971, "total_num_tokens": 534030128.0, "z_loss": 0.0009288669680245221 }, { "copy_logits_max": -4.191405296325684, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.6875, "epoch": 0.40275721215215726, "gen_logits_max": 7.189355850219727, "gen_logits_mean": -11.949771881103516, "gen_logits_min": -23.769495010375977, "gen_logits_std": 2.515897274017334, "gen_loss": 0.3155164122581482, "grad_norm": 0.5147327498918927, "learning_rate": 2.8438736842105263e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9927958995103836, "mean_gen_accuracy": 0.8652399331331253, "mean_token_accuracy": 0.8989370465278625, "num_tokens": 534271183.0, "sample_num_tokens": 8342.75, "step": 1972, "total_num_tokens": 534304554.0, "z_loss": 0.0008631529053673148 }, { "copy_logits_max": -2.661271572113037, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.875, "epoch": 0.4029614500893541, "gen_logits_max": 7.539363384246826, "gen_logits_mean": -12.630395889282227, "gen_logits_min": -24.53902816772461, "gen_logits_std": 2.5558183193206787, "gen_loss": 0.33157211542129517, "grad_norm": 0.454922166533465, "learning_rate": 2.8437473684210524e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9924237877130508, "mean_gen_accuracy": 0.868021160364151, "mean_token_accuracy": 0.8994301557540894, "num_tokens": 534547934.0, "sample_num_tokens": 7742.5, "step": 1973, "total_num_tokens": 534578904.0, "z_loss": 0.0009125836077146232 }, { "copy_logits_max": -3.437826156616211, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.9375, "epoch": 0.40316568802655095, "gen_logits_max": 8.307323455810547, "gen_logits_mean": -11.778377532958984, "gen_logits_min": -23.490741729736328, "gen_logits_std": 2.5387468338012695, "gen_loss": 0.38021770119667053, "grad_norm": 0.4806360666084361, "learning_rate": 2.843621052631579e-05, "loss": 0.3524, "mean_copy_accuracy": 0.9928112626075745, "mean_gen_accuracy": 0.8552554845809937, "mean_token_accuracy": 0.8864669948816299, "num_tokens": 534810960.0, "sample_num_tokens": 6948.0, "step": 1974, "total_num_tokens": 534838752.0, "z_loss": 0.0010309047065675259 }, { "copy_logits_max": -3.5625081062316895, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.75, "epoch": 0.40336992596374777, "gen_logits_max": 7.5735015869140625, "gen_logits_mean": -11.89410400390625, "gen_logits_min": -23.5565128326416, "gen_logits_std": 2.4829506874084473, "gen_loss": 0.31203487515449524, "grad_norm": 0.4584010371437224, "learning_rate": 2.8434947368421056e-05, "loss": 0.3135, "mean_copy_accuracy": 0.993267372250557, "mean_gen_accuracy": 0.8676893413066864, "mean_token_accuracy": 0.8976302593946457, "num_tokens": 535075507.0, "sample_num_tokens": 8780.75, "step": 1975, "total_num_tokens": 535110630.0, "z_loss": 0.0009221099317073822 }, { "copy_logits_max": -2.408627986907959, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.125, "epoch": 0.4035741639009446, "gen_logits_max": 8.459360122680664, "gen_logits_mean": -10.555368423461914, "gen_logits_min": -22.76549530029297, "gen_logits_std": 2.556445360183716, "gen_loss": 0.341457724571228, "grad_norm": 0.47350023218786663, "learning_rate": 2.8433684210526317e-05, "loss": 0.344, "mean_copy_accuracy": 0.993475928902626, "mean_gen_accuracy": 0.8536779284477234, "mean_token_accuracy": 0.8872631788253784, "num_tokens": 535353523.0, "sample_num_tokens": 8124.75, "step": 1976, "total_num_tokens": 535386022.0, "z_loss": 0.0010370187228545547 }, { "copy_logits_max": -3.942255735397339, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5625, "epoch": 0.40377840183814145, "gen_logits_max": 7.327670097351074, "gen_logits_mean": -12.009450912475586, "gen_logits_min": -23.414222717285156, "gen_logits_std": 2.496499538421631, "gen_loss": 0.3013495206832886, "grad_norm": 0.45428896184624384, "learning_rate": 2.843242105263158e-05, "loss": 0.3068, "mean_copy_accuracy": 0.9929451197385788, "mean_gen_accuracy": 0.8697341233491898, "mean_token_accuracy": 0.8981854915618896, "num_tokens": 535621146.0, "sample_num_tokens": 8484.5, "step": 1977, "total_num_tokens": 535655084.0, "z_loss": 0.0009128013043664396 }, { "copy_logits_max": -2.3561394214630127, "copy_logits_min": -750000064.0, "copy_num_tokens": 650.5625, "epoch": 0.40398263977533827, "gen_logits_max": 8.01254940032959, "gen_logits_mean": -11.143714904785156, "gen_logits_min": -23.201791763305664, "gen_logits_std": 2.5652108192443848, "gen_loss": 0.2733790874481201, "grad_norm": 0.4353042417950329, "learning_rate": 2.8431157894736842e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9939036965370178, "mean_gen_accuracy": 0.861019641160965, "mean_token_accuracy": 0.8983736336231232, "num_tokens": 535890330.0, "sample_num_tokens": 8976.0, "step": 1978, "total_num_tokens": 535926234.0, "z_loss": 0.0008063674904406071 }, { "copy_logits_max": -3.5538675785064697, "copy_logits_min": -750000064.0, "copy_num_tokens": 543.75, "epoch": 0.4041868777125351, "gen_logits_max": 7.241196632385254, "gen_logits_mean": -12.614797592163086, "gen_logits_min": -24.393878936767578, "gen_logits_std": 2.5362203121185303, "gen_loss": 0.31552013754844666, "grad_norm": 0.5092599313578945, "learning_rate": 2.8429894736842106e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9920885264873505, "mean_gen_accuracy": 0.858165830373764, "mean_token_accuracy": 0.8933429569005966, "num_tokens": 536142109.0, "sample_num_tokens": 8325.75, "step": 1979, "total_num_tokens": 536175412.0, "z_loss": 0.0007962946547195315 }, { "copy_logits_max": -1.6924694776535034, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.125, "epoch": 0.40439111564973196, "gen_logits_max": 6.754833221435547, "gen_logits_mean": -12.897756576538086, "gen_logits_min": -24.678651809692383, "gen_logits_std": 2.5521836280822754, "gen_loss": 0.3280344009399414, "grad_norm": 0.49032325690970585, "learning_rate": 2.8428631578947367e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9927215129137039, "mean_gen_accuracy": 0.858588308095932, "mean_token_accuracy": 0.8921495974063873, "num_tokens": 536433747.0, "sample_num_tokens": 8308.25, "step": 1980, "total_num_tokens": 536466980.0, "z_loss": 0.0008450985769741237 }, { "copy_logits_max": -3.363152503967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.8125, "epoch": 0.4045953535869288, "gen_logits_max": 7.197071075439453, "gen_logits_mean": -12.570657730102539, "gen_logits_min": -24.225961685180664, "gen_logits_std": 2.527468681335449, "gen_loss": 0.3348419666290283, "grad_norm": 0.475627731924845, "learning_rate": 2.8427368421052632e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9943274110555649, "mean_gen_accuracy": 0.8661687672138214, "mean_token_accuracy": 0.8987322151660919, "num_tokens": 536704019.0, "sample_num_tokens": 8574.25, "step": 1981, "total_num_tokens": 536738316.0, "z_loss": 0.0009454935207031667 }, { "copy_logits_max": -4.9619975090026855, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5625, "epoch": 0.4047995915241256, "gen_logits_max": 7.1907548904418945, "gen_logits_mean": -12.842167854309082, "gen_logits_min": -24.396137237548828, "gen_logits_std": 2.5142948627471924, "gen_loss": 0.3452521860599518, "grad_norm": 0.4621991383966487, "learning_rate": 2.8426105263157896e-05, "loss": 0.308, "mean_copy_accuracy": 0.9935010820627213, "mean_gen_accuracy": 0.8618195056915283, "mean_token_accuracy": 0.898496687412262, "num_tokens": 536974749.0, "sample_num_tokens": 8274.75, "step": 1982, "total_num_tokens": 537007848.0, "z_loss": 0.0008600021246820688 }, { "copy_logits_max": -5.272248268127441, "copy_logits_min": -687500032.0, "copy_num_tokens": 307.6875, "epoch": 0.40500382946132246, "gen_logits_max": 7.596368789672852, "gen_logits_mean": -12.023885726928711, "gen_logits_min": -23.59725570678711, "gen_logits_std": 2.5238943099975586, "gen_loss": 0.33473581075668335, "grad_norm": 0.4855648176946203, "learning_rate": 2.842484210526316e-05, "loss": 0.3429, "mean_copy_accuracy": 0.9905810356140137, "mean_gen_accuracy": 0.8599923849105835, "mean_token_accuracy": 0.8881812244653702, "num_tokens": 537251378.0, "sample_num_tokens": 7342.0, "step": 1983, "total_num_tokens": 537280746.0, "z_loss": 0.0008337951730936766 }, { "copy_logits_max": -4.191167831420898, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.375, "epoch": 0.4052080673985193, "gen_logits_max": 6.771531105041504, "gen_logits_mean": -12.648744583129883, "gen_logits_min": -24.453832626342773, "gen_logits_std": 2.5627102851867676, "gen_loss": 0.26117032766342163, "grad_norm": 0.47065482829528965, "learning_rate": 2.842357894736842e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9930431395769119, "mean_gen_accuracy": 0.8663117736577988, "mean_token_accuracy": 0.8987066149711609, "num_tokens": 537527037.0, "sample_num_tokens": 8814.75, "step": 1984, "total_num_tokens": 537562296.0, "z_loss": 0.0008048338931985199 }, { "copy_logits_max": -3.9447860717773438, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.0625, "epoch": 0.4054123053357161, "gen_logits_max": 7.265570640563965, "gen_logits_mean": -12.295353889465332, "gen_logits_min": -24.112810134887695, "gen_logits_std": 2.4907219409942627, "gen_loss": 0.4084347188472748, "grad_norm": 0.47747662345339426, "learning_rate": 2.8422315789473686e-05, "loss": 0.34, "mean_copy_accuracy": 0.9933096319437027, "mean_gen_accuracy": 0.8546047359704971, "mean_token_accuracy": 0.8919604420661926, "num_tokens": 537818973.0, "sample_num_tokens": 7977.75, "step": 1985, "total_num_tokens": 537850884.0, "z_loss": 0.0010270102648064494 }, { "copy_logits_max": -4.584087371826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 245.0625, "epoch": 0.40561654327291297, "gen_logits_max": 8.225421905517578, "gen_logits_mean": -12.282140731811523, "gen_logits_min": -23.555419921875, "gen_logits_std": 2.4600155353546143, "gen_loss": 0.3415414094924927, "grad_norm": 0.5269251402522112, "learning_rate": 2.8421052631578946e-05, "loss": 0.3385, "mean_copy_accuracy": 0.9921612292528152, "mean_gen_accuracy": 0.860519751906395, "mean_token_accuracy": 0.8876719623804092, "num_tokens": 538057557.0, "sample_num_tokens": 7678.25, "step": 1986, "total_num_tokens": 538088270.0, "z_loss": 0.0009209716226905584 }, { "copy_logits_max": -3.418853521347046, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.1875, "epoch": 0.4058207812101098, "gen_logits_max": 7.516186714172363, "gen_logits_mean": -11.690467834472656, "gen_logits_min": -23.716459274291992, "gen_logits_std": 2.546808958053589, "gen_loss": 0.3362073302268982, "grad_norm": 0.5347931456959408, "learning_rate": 2.841978947368421e-05, "loss": 0.3202, "mean_copy_accuracy": 0.991179034113884, "mean_gen_accuracy": 0.8658091574907303, "mean_token_accuracy": 0.895869717001915, "num_tokens": 538333575.0, "sample_num_tokens": 7754.25, "step": 1987, "total_num_tokens": 538364592.0, "z_loss": 0.0009724939009174705 }, { "copy_logits_max": -3.142890453338623, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.625, "epoch": 0.4060250191473066, "gen_logits_max": 6.526116847991943, "gen_logits_mean": -12.933856964111328, "gen_logits_min": -24.715866088867188, "gen_logits_std": 2.524148464202881, "gen_loss": 0.28988027572631836, "grad_norm": 0.5618113978505235, "learning_rate": 2.8418526315789475e-05, "loss": 0.337, "mean_copy_accuracy": 0.9934615343809128, "mean_gen_accuracy": 0.8600494116544724, "mean_token_accuracy": 0.8885137885808945, "num_tokens": 538602655.0, "sample_num_tokens": 8179.75, "step": 1988, "total_num_tokens": 538635374.0, "z_loss": 0.0009542378829792142 }, { "copy_logits_max": -1.2687381505966187, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.6875, "epoch": 0.40622925708450347, "gen_logits_max": 7.346501350402832, "gen_logits_mean": -12.141963005065918, "gen_logits_min": -23.910179138183594, "gen_logits_std": 2.528329372406006, "gen_loss": 0.3318854570388794, "grad_norm": 0.43786092450697817, "learning_rate": 2.8417263157894736e-05, "loss": 0.3249, "mean_copy_accuracy": 0.9927369505167007, "mean_gen_accuracy": 0.8655161559581757, "mean_token_accuracy": 0.8948240578174591, "num_tokens": 538876646.0, "sample_num_tokens": 8150.0, "step": 1989, "total_num_tokens": 538909246.0, "z_loss": 0.0011076244991272688 }, { "copy_logits_max": -1.4510200023651123, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.8125, "epoch": 0.4064334950217003, "gen_logits_max": 8.000064849853516, "gen_logits_mean": -11.109048843383789, "gen_logits_min": -22.948013305664062, "gen_logits_std": 2.5474891662597656, "gen_loss": 0.32842057943344116, "grad_norm": 0.5059438487994831, "learning_rate": 2.8416e-05, "loss": 0.3444, "mean_copy_accuracy": 0.9918521046638489, "mean_gen_accuracy": 0.8561733514070511, "mean_token_accuracy": 0.886098101735115, "num_tokens": 539136321.0, "sample_num_tokens": 8571.25, "step": 1990, "total_num_tokens": 539170606.0, "z_loss": 0.0011090970365330577 }, { "copy_logits_max": -1.8105509281158447, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.75, "epoch": 0.4066377329588971, "gen_logits_max": 7.39082145690918, "gen_logits_mean": -11.565363883972168, "gen_logits_min": -23.4962158203125, "gen_logits_std": 2.509559392929077, "gen_loss": 0.2953528165817261, "grad_norm": 0.5927126870669904, "learning_rate": 2.8414736842105265e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9910973906517029, "mean_gen_accuracy": 0.8668556958436966, "mean_token_accuracy": 0.8947304934263229, "num_tokens": 539403901.0, "sample_num_tokens": 9206.25, "step": 1991, "total_num_tokens": 539440726.0, "z_loss": 0.0009907407220453024 }, { "copy_logits_max": -1.7913427352905273, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.0625, "epoch": 0.406841970896094, "gen_logits_max": 7.235744953155518, "gen_logits_mean": -11.557271957397461, "gen_logits_min": -23.290794372558594, "gen_logits_std": 2.5344505310058594, "gen_loss": 0.3258287012577057, "grad_norm": 0.46999450471277654, "learning_rate": 2.841347368421053e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9936847239732742, "mean_gen_accuracy": 0.8564557582139969, "mean_token_accuracy": 0.8933800160884857, "num_tokens": 539680997.0, "sample_num_tokens": 9000.25, "step": 1992, "total_num_tokens": 539716998.0, "z_loss": 0.0011092389468103647 }, { "copy_logits_max": -3.3684473037719727, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.125, "epoch": 0.4070462088332908, "gen_logits_max": 7.6679534912109375, "gen_logits_mean": -10.97693920135498, "gen_logits_min": -22.25875473022461, "gen_logits_std": 2.5012872219085693, "gen_loss": 0.3561393916606903, "grad_norm": 0.4927452850377208, "learning_rate": 2.841221052631579e-05, "loss": 0.3359, "mean_copy_accuracy": 0.9904283881187439, "mean_gen_accuracy": 0.8600380569696426, "mean_token_accuracy": 0.8888803124427795, "num_tokens": 539947313.0, "sample_num_tokens": 8856.25, "step": 1993, "total_num_tokens": 539982738.0, "z_loss": 0.0010134091135114431 }, { "copy_logits_max": -4.359236240386963, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.4375, "epoch": 0.4072504467704876, "gen_logits_max": 7.292037010192871, "gen_logits_mean": -11.969897270202637, "gen_logits_min": -23.392555236816406, "gen_logits_std": 2.511715888977051, "gen_loss": 0.36008739471435547, "grad_norm": 0.4741767021371476, "learning_rate": 2.8410947368421054e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9936490654945374, "mean_gen_accuracy": 0.8599593192338943, "mean_token_accuracy": 0.8896984606981277, "num_tokens": 540196921.0, "sample_num_tokens": 7224.25, "step": 1994, "total_num_tokens": 540225818.0, "z_loss": 0.0009266821434721351 }, { "copy_logits_max": -2.594151735305786, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.5625, "epoch": 0.4074546847076845, "gen_logits_max": 6.954596042633057, "gen_logits_mean": -11.919866561889648, "gen_logits_min": -23.341724395751953, "gen_logits_std": 2.5110487937927246, "gen_loss": 0.2954622209072113, "grad_norm": 0.4395400725768644, "learning_rate": 2.8409684210526315e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9942457973957062, "mean_gen_accuracy": 0.854515552520752, "mean_token_accuracy": 0.8944777101278305, "num_tokens": 540482586.0, "sample_num_tokens": 8849.5, "step": 1995, "total_num_tokens": 540517984.0, "z_loss": 0.0008353578159585595 }, { "copy_logits_max": -2.66560697555542, "copy_logits_min": -687500032.0, "copy_num_tokens": 509.25, "epoch": 0.4076589226448813, "gen_logits_max": 7.104032039642334, "gen_logits_mean": -12.142155647277832, "gen_logits_min": -23.805036544799805, "gen_logits_std": 2.513046979904175, "gen_loss": 0.34846389293670654, "grad_norm": 0.4801739144063659, "learning_rate": 2.840842105263158e-05, "loss": 0.3516, "mean_copy_accuracy": 0.9928607493638992, "mean_gen_accuracy": 0.853015199303627, "mean_token_accuracy": 0.8855129331350327, "num_tokens": 540739099.0, "sample_num_tokens": 8641.25, "step": 1996, "total_num_tokens": 540773664.0, "z_loss": 0.0009167660609818995 }, { "copy_logits_max": -2.776411533355713, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.8125, "epoch": 0.4078631605820781, "gen_logits_max": 6.401571750640869, "gen_logits_mean": -12.490089416503906, "gen_logits_min": -24.109725952148438, "gen_logits_std": 2.5296764373779297, "gen_loss": 0.30723240971565247, "grad_norm": 0.4598418202449416, "learning_rate": 2.840715789473684e-05, "loss": 0.3384, "mean_copy_accuracy": 0.99356509745121, "mean_gen_accuracy": 0.8586743623018265, "mean_token_accuracy": 0.8909237086772919, "num_tokens": 541015982.0, "sample_num_tokens": 8098.0, "step": 1997, "total_num_tokens": 541048374.0, "z_loss": 0.0008581378497183323 }, { "copy_logits_max": -2.4258460998535156, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.4375, "epoch": 0.408067398519275, "gen_logits_max": 6.6481852531433105, "gen_logits_mean": -12.59255599975586, "gen_logits_min": -23.941619873046875, "gen_logits_std": 2.4842920303344727, "gen_loss": 0.3578040599822998, "grad_norm": 0.4799831343842558, "learning_rate": 2.8405894736842105e-05, "loss": 0.331, "mean_copy_accuracy": 0.9934814870357513, "mean_gen_accuracy": 0.8539564460515976, "mean_token_accuracy": 0.8916315585374832, "num_tokens": 541284745.0, "sample_num_tokens": 7296.25, "step": 1998, "total_num_tokens": 541313930.0, "z_loss": 0.0009361280826851726 }, { "copy_logits_max": -4.922436237335205, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.75, "epoch": 0.4082716364564718, "gen_logits_max": 6.692164897918701, "gen_logits_mean": -13.313766479492188, "gen_logits_min": -24.49101448059082, "gen_logits_std": 2.464844226837158, "gen_loss": 0.2982211410999298, "grad_norm": 0.45843538145215096, "learning_rate": 2.840463157894737e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9932542443275452, "mean_gen_accuracy": 0.867623820900917, "mean_token_accuracy": 0.8983316868543625, "num_tokens": 541578692.0, "sample_num_tokens": 8667.0, "step": 1999, "total_num_tokens": 541613360.0, "z_loss": 0.0008111692150123417 }, { "epoch": 0.4084758743936686, "grad_norm": 1.5254911656257617, "learning_rate": 2.8403368421052633e-05, "loss": 0.3261, "step": 2000 }, { "epoch": 0.4084758743936686, "eval_copy_logits_max": -6.0081562995910645, "eval_copy_logits_min": -72.29054260253906, "eval_gen_logits_max": 5.4648590087890625, "eval_gen_logits_mean": -16.747310638427734, "eval_gen_logits_min": -27.69155502319336, "eval_gen_logits_std": 2.4370288848876953, "eval_gen_loss": 0.37458372116088867, "eval_loss": 0.3550443649291992, "eval_mean_copy_accuracy": 0.9913989007472992, "eval_mean_gen_accuracy": 0.8648545145988464, "eval_mean_token_accuracy": 0.8809989094734192, "eval_num_tokens": 541903866.0, "eval_runtime": 0.7693, "eval_samples_per_second": 10.399, "eval_steps_per_second": 2.6, "eval_total_num_tokens": 541903866.0, "eval_z_loss": 0.0008193538524210453, "step": 2000 }, { "copy_logits_max": -2.6299781799316406, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.8125, "epoch": 0.4086801123308655, "gen_logits_max": 6.955486297607422, "gen_logits_mean": -12.607623100280762, "gen_logits_min": -24.1651668548584, "gen_logits_std": 2.5086007118225098, "gen_loss": 0.31813937425613403, "grad_norm": 0.5476650185169969, "learning_rate": 2.8402105263157898e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9932427704334259, "mean_gen_accuracy": 0.8649722561240196, "mean_token_accuracy": 0.8982793167233467, "num_tokens": 542160278.0, "sample_num_tokens": 8841.0, "step": 2001, "total_num_tokens": 542195642.0, "z_loss": 0.0008246395736932755 }, { "copy_logits_max": -2.811552047729492, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.6875, "epoch": 0.4088843502680623, "gen_logits_max": 7.425764083862305, "gen_logits_mean": -11.867521286010742, "gen_logits_min": -23.748369216918945, "gen_logits_std": 2.4999303817749023, "gen_loss": 0.3417550027370453, "grad_norm": 0.4744906362224558, "learning_rate": 2.840084210526316e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9908565878868103, "mean_gen_accuracy": 0.8555640876293182, "mean_token_accuracy": 0.8861196041107178, "num_tokens": 542412773.0, "sample_num_tokens": 8018.25, "step": 2002, "total_num_tokens": 542444846.0, "z_loss": 0.0009257762576453388 }, { "copy_logits_max": -2.2480926513671875, "copy_logits_min": -687500032.0, "copy_num_tokens": 433.8125, "epoch": 0.4090885882052591, "gen_logits_max": 7.504279613494873, "gen_logits_mean": -11.530939102172852, "gen_logits_min": -23.584415435791016, "gen_logits_std": 2.538789749145508, "gen_loss": 0.33822038769721985, "grad_norm": 0.48405164302436704, "learning_rate": 2.8399578947368423e-05, "loss": 0.3298, "mean_copy_accuracy": 0.9946705996990204, "mean_gen_accuracy": 0.8584372103214264, "mean_token_accuracy": 0.8937063962221146, "num_tokens": 542689447.0, "sample_num_tokens": 8278.75, "step": 2003, "total_num_tokens": 542722562.0, "z_loss": 0.0009998828172683716 }, { "copy_logits_max": -2.8401389122009277, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.1875, "epoch": 0.40929282614245593, "gen_logits_max": 6.497032165527344, "gen_logits_mean": -12.394128799438477, "gen_logits_min": -24.177762985229492, "gen_logits_std": 2.498199939727783, "gen_loss": 0.3136720359325409, "grad_norm": 0.47336003534981075, "learning_rate": 2.8398315789473684e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9920035302639008, "mean_gen_accuracy": 0.8610251098871231, "mean_token_accuracy": 0.893977552652359, "num_tokens": 542955452.0, "sample_num_tokens": 8361.5, "step": 2004, "total_num_tokens": 542988898.0, "z_loss": 0.0009628889383748174 }, { "copy_logits_max": -1.6585527658462524, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.6875, "epoch": 0.4094970640796528, "gen_logits_max": 7.3030829429626465, "gen_logits_mean": -11.652653694152832, "gen_logits_min": -23.610227584838867, "gen_logits_std": 2.527505874633789, "gen_loss": 0.3476868271827698, "grad_norm": 0.46842621005669677, "learning_rate": 2.8397052631578948e-05, "loss": 0.3275, "mean_copy_accuracy": 0.9918208718299866, "mean_gen_accuracy": 0.8657587766647339, "mean_token_accuracy": 0.8927771896123886, "num_tokens": 543216055.0, "sample_num_tokens": 8305.25, "step": 2005, "total_num_tokens": 543249276.0, "z_loss": 0.0009702688548713923 }, { "copy_logits_max": -4.086714744567871, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.5625, "epoch": 0.4097013020168496, "gen_logits_max": 6.416430473327637, "gen_logits_mean": -13.381044387817383, "gen_logits_min": -25.257549285888672, "gen_logits_std": 2.480323314666748, "gen_loss": 0.32236260175704956, "grad_norm": 0.4504468891046937, "learning_rate": 2.839578947368421e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9945798218250275, "mean_gen_accuracy": 0.8501654267311096, "mean_token_accuracy": 0.8892736881971359, "num_tokens": 543499239.0, "sample_num_tokens": 10351.25, "step": 2006, "total_num_tokens": 543540644.0, "z_loss": 0.0009167458629235625 }, { "copy_logits_max": -2.7556917667388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 675.0625, "epoch": 0.40990553995404644, "gen_logits_max": 7.115735054016113, "gen_logits_mean": -12.336200714111328, "gen_logits_min": -24.035472869873047, "gen_logits_std": 2.531385660171509, "gen_loss": 0.27683597803115845, "grad_norm": 0.5339646215060266, "learning_rate": 2.8394526315789477e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9946101903915405, "mean_gen_accuracy": 0.8679662048816681, "mean_token_accuracy": 0.9055151492357254, "num_tokens": 543784723.0, "sample_num_tokens": 9796.25, "step": 2007, "total_num_tokens": 543823908.0, "z_loss": 0.0008520872797816992 }, { "copy_logits_max": -4.31615686416626, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.125, "epoch": 0.4101097778912433, "gen_logits_max": 7.2037577629089355, "gen_logits_mean": -12.540456771850586, "gen_logits_min": -24.346574783325195, "gen_logits_std": 2.535428047180176, "gen_loss": 0.2843747138977051, "grad_norm": 0.44432776846849503, "learning_rate": 2.8393263157894738e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9916015863418579, "mean_gen_accuracy": 0.8674808442592621, "mean_token_accuracy": 0.8933383077383041, "num_tokens": 544053066.0, "sample_num_tokens": 10028.5, "step": 2008, "total_num_tokens": 544093180.0, "z_loss": 0.0008502009441144764 }, { "copy_logits_max": -3.6650257110595703, "copy_logits_min": -750000000.0, "copy_num_tokens": 255.4375, "epoch": 0.4103140158284401, "gen_logits_max": 7.530414581298828, "gen_logits_mean": -12.937753677368164, "gen_logits_min": -24.438940048217773, "gen_logits_std": 2.5031065940856934, "gen_loss": 0.39538896083831787, "grad_norm": 0.4454942386044599, "learning_rate": 2.8392000000000002e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9928666949272156, "mean_gen_accuracy": 0.8581148982048035, "mean_token_accuracy": 0.8916449695825577, "num_tokens": 544335102.0, "sample_num_tokens": 7058.5, "step": 2009, "total_num_tokens": 544363336.0, "z_loss": 0.0010256222449243069 }, { "copy_logits_max": -3.252213716506958, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.5625, "epoch": 0.41051825376563694, "gen_logits_max": 6.2494683265686035, "gen_logits_mean": -12.926277160644531, "gen_logits_min": -24.487932205200195, "gen_logits_std": 2.52591609954834, "gen_loss": 0.3540647327899933, "grad_norm": 0.4195845729614323, "learning_rate": 2.8390736842105263e-05, "loss": 0.338, "mean_copy_accuracy": 0.9955635070800781, "mean_gen_accuracy": 0.8556804656982422, "mean_token_accuracy": 0.8903197646141052, "num_tokens": 544618989.0, "sample_num_tokens": 7642.25, "step": 2010, "total_num_tokens": 544649558.0, "z_loss": 0.0009291859460063279 }, { "copy_logits_max": -4.767446517944336, "copy_logits_min": -750000000.0, "copy_num_tokens": 176.5, "epoch": 0.4107224917028338, "gen_logits_max": 8.621330261230469, "gen_logits_mean": -11.989555358886719, "gen_logits_min": -23.836702346801758, "gen_logits_std": 2.4768199920654297, "gen_loss": 0.4300679564476013, "grad_norm": 0.4983829625515034, "learning_rate": 2.8389473684210527e-05, "loss": 0.3477, "mean_copy_accuracy": 0.9896617382764816, "mean_gen_accuracy": 0.8598441332578659, "mean_token_accuracy": 0.8866524547338486, "num_tokens": 544856169.0, "sample_num_tokens": 6673.75, "step": 2011, "total_num_tokens": 544882864.0, "z_loss": 0.0011127257021144032 }, { "copy_logits_max": -3.8897757530212402, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.0625, "epoch": 0.41092672964003063, "gen_logits_max": 7.258072376251221, "gen_logits_mean": -12.794489860534668, "gen_logits_min": -24.59762191772461, "gen_logits_std": 2.519749164581299, "gen_loss": 0.32803013920783997, "grad_norm": 0.4209456148650942, "learning_rate": 2.8388210526315788e-05, "loss": 0.315, "mean_copy_accuracy": 0.9925662130117416, "mean_gen_accuracy": 0.8660159111022949, "mean_token_accuracy": 0.8958332687616348, "num_tokens": 545124311.0, "sample_num_tokens": 7950.75, "step": 2012, "total_num_tokens": 545156114.0, "z_loss": 0.0009583113715052605 }, { "copy_logits_max": -5.636399269104004, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.75, "epoch": 0.41113096757722745, "gen_logits_max": 7.555519104003906, "gen_logits_mean": -12.868017196655273, "gen_logits_min": -24.37472152709961, "gen_logits_std": 2.434749126434326, "gen_loss": 0.34853148460388184, "grad_norm": 0.4371767049415005, "learning_rate": 2.8386947368421052e-05, "loss": 0.327, "mean_copy_accuracy": 0.9922099858522415, "mean_gen_accuracy": 0.8634320348501205, "mean_token_accuracy": 0.8938609957695007, "num_tokens": 545396117.0, "sample_num_tokens": 8775.25, "step": 2013, "total_num_tokens": 545431218.0, "z_loss": 0.0009256724733859301 }, { "copy_logits_max": -3.7859091758728027, "copy_logits_min": -750000064.0, "copy_num_tokens": 389.5, "epoch": 0.4113352055144243, "gen_logits_max": 7.467845916748047, "gen_logits_mean": -12.88686466217041, "gen_logits_min": -24.944753646850586, "gen_logits_std": 2.5291035175323486, "gen_loss": 0.3221361041069031, "grad_norm": 0.5074298048250662, "learning_rate": 2.8385684210526317e-05, "loss": 0.3281, "mean_copy_accuracy": 0.991829589009285, "mean_gen_accuracy": 0.8594860434532166, "mean_token_accuracy": 0.8920945823192596, "num_tokens": 545647095.0, "sample_num_tokens": 7530.25, "step": 2014, "total_num_tokens": 545677216.0, "z_loss": 0.0009316658833995461 }, { "copy_logits_max": -2.9196619987487793, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.625, "epoch": 0.41153944345162113, "gen_logits_max": 7.6165771484375, "gen_logits_mean": -12.308700561523438, "gen_logits_min": -24.54587173461914, "gen_logits_std": 2.5203137397766113, "gen_loss": 0.3330245614051819, "grad_norm": 0.4909482910565409, "learning_rate": 2.838442105263158e-05, "loss": 0.3416, "mean_copy_accuracy": 0.9929175972938538, "mean_gen_accuracy": 0.8533588200807571, "mean_token_accuracy": 0.8877269178628922, "num_tokens": 545912601.0, "sample_num_tokens": 8837.75, "step": 2015, "total_num_tokens": 545947952.0, "z_loss": 0.0009204314555972815 }, { "copy_logits_max": -4.563555717468262, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.9375, "epoch": 0.41174368138881795, "gen_logits_max": 5.8899641036987305, "gen_logits_mean": -13.790162086486816, "gen_logits_min": -25.511058807373047, "gen_logits_std": 2.46248197555542, "gen_loss": 0.32633376121520996, "grad_norm": 0.4217098736180872, "learning_rate": 2.8383157894736845e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9938765466213226, "mean_gen_accuracy": 0.8677076399326324, "mean_token_accuracy": 0.8966282308101654, "num_tokens": 546174107.0, "sample_num_tokens": 9440.25, "step": 2016, "total_num_tokens": 546211868.0, "z_loss": 0.000814361497759819 }, { "copy_logits_max": -5.07614803314209, "copy_logits_min": -750000064.0, "copy_num_tokens": 431.4375, "epoch": 0.4119479193260148, "gen_logits_max": 6.723620891571045, "gen_logits_mean": -13.101508140563965, "gen_logits_min": -25.095870971679688, "gen_logits_std": 2.482740640640259, "gen_loss": 0.3263106942176819, "grad_norm": 0.460671190552866, "learning_rate": 2.8381894736842106e-05, "loss": 0.327, "mean_copy_accuracy": 0.9930925369262695, "mean_gen_accuracy": 0.8564462214708328, "mean_token_accuracy": 0.8934132605791092, "num_tokens": 546471270.0, "sample_num_tokens": 7887.0, "step": 2017, "total_num_tokens": 546502818.0, "z_loss": 0.0008582529844716191 }, { "copy_logits_max": -3.668186664581299, "copy_logits_min": -750000064.0, "copy_num_tokens": 415.0625, "epoch": 0.41215215726321164, "gen_logits_max": 7.710700511932373, "gen_logits_mean": -12.065671920776367, "gen_logits_min": -24.549861907958984, "gen_logits_std": 2.5041749477386475, "gen_loss": 0.3303760290145874, "grad_norm": 0.44479102300194323, "learning_rate": 2.838063157894737e-05, "loss": 0.3345, "mean_copy_accuracy": 0.9941914677619934, "mean_gen_accuracy": 0.8585937321186066, "mean_token_accuracy": 0.890802726149559, "num_tokens": 546741458.0, "sample_num_tokens": 7297.0, "step": 2018, "total_num_tokens": 546770646.0, "z_loss": 0.0009483715984970331 }, { "copy_logits_max": -5.100641250610352, "copy_logits_min": -687500032.0, "copy_num_tokens": 368.6875, "epoch": 0.41235639520040845, "gen_logits_max": 8.152127265930176, "gen_logits_mean": -12.686223030090332, "gen_logits_min": -24.47211456298828, "gen_logits_std": 2.4412193298339844, "gen_loss": 0.3192228674888611, "grad_norm": 0.4359055608416246, "learning_rate": 2.837936842105263e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9928111135959625, "mean_gen_accuracy": 0.8621214777231216, "mean_token_accuracy": 0.8915734440088272, "num_tokens": 547004229.0, "sample_num_tokens": 7737.25, "step": 2019, "total_num_tokens": 547035178.0, "z_loss": 0.0008849318255670369 }, { "copy_logits_max": -5.263550758361816, "copy_logits_min": -687500032.0, "copy_num_tokens": 343.625, "epoch": 0.4125606331376053, "gen_logits_max": 7.116159439086914, "gen_logits_mean": -13.327880859375, "gen_logits_min": -24.72879409790039, "gen_logits_std": 2.4699833393096924, "gen_loss": 0.35846754908561707, "grad_norm": 0.5104409845477041, "learning_rate": 2.8378105263157896e-05, "loss": 0.365, "mean_copy_accuracy": 0.9921102672815323, "mean_gen_accuracy": 0.853552058339119, "mean_token_accuracy": 0.8812201023101807, "num_tokens": 547248797.0, "sample_num_tokens": 7349.25, "step": 2020, "total_num_tokens": 547278194.0, "z_loss": 0.0009124291827902198 }, { "copy_logits_max": -3.9196741580963135, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.625, "epoch": 0.41276487107480214, "gen_logits_max": 6.589871406555176, "gen_logits_mean": -13.437994956970215, "gen_logits_min": -25.123554229736328, "gen_logits_std": 2.486921548843384, "gen_loss": 0.3138965368270874, "grad_norm": 0.5026053499365085, "learning_rate": 2.8376842105263157e-05, "loss": 0.3518, "mean_copy_accuracy": 0.9932508617639542, "mean_gen_accuracy": 0.8526133745908737, "mean_token_accuracy": 0.8856538832187653, "num_tokens": 547518047.0, "sample_num_tokens": 8036.25, "step": 2021, "total_num_tokens": 547550192.0, "z_loss": 0.0009145664516836405 }, { "copy_logits_max": -5.067710876464844, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.5625, "epoch": 0.41296910901199896, "gen_logits_max": 8.561275482177734, "gen_logits_mean": -11.441629409790039, "gen_logits_min": -22.950214385986328, "gen_logits_std": 2.4904165267944336, "gen_loss": 0.32770317792892456, "grad_norm": 0.46989875445637563, "learning_rate": 2.837557894736842e-05, "loss": 0.3222, "mean_copy_accuracy": 0.9932613521814346, "mean_gen_accuracy": 0.8638359308242798, "mean_token_accuracy": 0.8947720229625702, "num_tokens": 547800730.0, "sample_num_tokens": 7548.0, "step": 2022, "total_num_tokens": 547830922.0, "z_loss": 0.0009298166260123253 }, { "copy_logits_max": -4.9761552810668945, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.1875, "epoch": 0.41317334694919583, "gen_logits_max": 8.341486930847168, "gen_logits_mean": -11.660356521606445, "gen_logits_min": -23.20851707458496, "gen_logits_std": 2.5189976692199707, "gen_loss": 0.3174811601638794, "grad_norm": 0.4811239799140214, "learning_rate": 2.8374315789473685e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9926515817642212, "mean_gen_accuracy": 0.8631761223077774, "mean_token_accuracy": 0.8908998668193817, "num_tokens": 548065137.0, "sample_num_tokens": 6930.25, "step": 2023, "total_num_tokens": 548092858.0, "z_loss": 0.0008194213733077049 }, { "copy_logits_max": -5.604121208190918, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.75, "epoch": 0.41337758488639265, "gen_logits_max": 7.244333267211914, "gen_logits_mean": -12.847343444824219, "gen_logits_min": -24.819171905517578, "gen_logits_std": 2.526407480239868, "gen_loss": 0.2905597686767578, "grad_norm": 0.4404003735799416, "learning_rate": 2.837305263157895e-05, "loss": 0.2938, "mean_copy_accuracy": 0.993643507361412, "mean_gen_accuracy": 0.870649442076683, "mean_token_accuracy": 0.902646467089653, "num_tokens": 548340344.0, "sample_num_tokens": 8618.0, "step": 2024, "total_num_tokens": 548374816.0, "z_loss": 0.0007678530528210104 }, { "copy_logits_max": -5.1424055099487305, "copy_logits_min": -687500032.0, "copy_num_tokens": 459.1875, "epoch": 0.41358182282358946, "gen_logits_max": 7.936273097991943, "gen_logits_mean": -12.113798141479492, "gen_logits_min": -23.906635284423828, "gen_logits_std": 2.450437545776367, "gen_loss": 0.3229796886444092, "grad_norm": 0.46846825279818194, "learning_rate": 2.837178947368421e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9930755794048309, "mean_gen_accuracy": 0.8547051697969437, "mean_token_accuracy": 0.8900978863239288, "num_tokens": 548610710.0, "sample_num_tokens": 7803.0, "step": 2025, "total_num_tokens": 548641922.0, "z_loss": 0.0008377756457775831 }, { "copy_logits_max": -4.450592041015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.0625, "epoch": 0.41378606076078633, "gen_logits_max": 6.975932598114014, "gen_logits_mean": -12.291685104370117, "gen_logits_min": -24.05775260925293, "gen_logits_std": 2.5091633796691895, "gen_loss": 0.3376973867416382, "grad_norm": 0.5606704034241696, "learning_rate": 2.8370526315789475e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9919025599956512, "mean_gen_accuracy": 0.853552371263504, "mean_token_accuracy": 0.8859245330095291, "num_tokens": 548867577.0, "sample_num_tokens": 9386.25, "step": 2026, "total_num_tokens": 548905122.0, "z_loss": 0.0008935579098761082 }, { "copy_logits_max": -5.744235992431641, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0, "epoch": 0.41399029869798315, "gen_logits_max": 8.031172752380371, "gen_logits_mean": -12.209125518798828, "gen_logits_min": -23.607406616210938, "gen_logits_std": 2.433338165283203, "gen_loss": 0.3187686800956726, "grad_norm": 0.4585572509055041, "learning_rate": 2.8369263157894736e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9933809190988541, "mean_gen_accuracy": 0.8706143498420715, "mean_token_accuracy": 0.8961604982614517, "num_tokens": 549142775.0, "sample_num_tokens": 8570.25, "step": 2027, "total_num_tokens": 549177056.0, "z_loss": 0.0008218472357839346 }, { "copy_logits_max": -3.7345738410949707, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.625, "epoch": 0.41419453663517997, "gen_logits_max": 7.325540542602539, "gen_logits_mean": -12.996835708618164, "gen_logits_min": -24.595382690429688, "gen_logits_std": 2.4788432121276855, "gen_loss": 0.36211109161376953, "grad_norm": 0.5063400925332986, "learning_rate": 2.8368e-05, "loss": 0.3184, "mean_copy_accuracy": 0.993795782327652, "mean_gen_accuracy": 0.8632218986749649, "mean_token_accuracy": 0.8964150547981262, "num_tokens": 549421024.0, "sample_num_tokens": 8418.5, "step": 2028, "total_num_tokens": 549454698.0, "z_loss": 0.0010244299191981554 }, { "copy_logits_max": -3.911055564880371, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.0, "epoch": 0.41439877457237684, "gen_logits_max": 8.549360275268555, "gen_logits_mean": -11.452622413635254, "gen_logits_min": -22.888755798339844, "gen_logits_std": 2.4597249031066895, "gen_loss": 0.3699702024459839, "grad_norm": 0.5033147305416348, "learning_rate": 2.8366736842105264e-05, "loss": 0.3486, "mean_copy_accuracy": 0.9942323863506317, "mean_gen_accuracy": 0.8502299338579178, "mean_token_accuracy": 0.8864644169807434, "num_tokens": 549691126.0, "sample_num_tokens": 9030.5, "step": 2029, "total_num_tokens": 549727248.0, "z_loss": 0.001049917540512979 }, { "copy_logits_max": -4.288185119628906, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.0625, "epoch": 0.41460301250957365, "gen_logits_max": 8.413911819458008, "gen_logits_mean": -11.938791275024414, "gen_logits_min": -23.65048599243164, "gen_logits_std": 2.512700080871582, "gen_loss": 0.35109272599220276, "grad_norm": 0.5087678249129479, "learning_rate": 2.8365473684210525e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9921038299798965, "mean_gen_accuracy": 0.8563501387834549, "mean_token_accuracy": 0.89201420545578, "num_tokens": 549971578.0, "sample_num_tokens": 8619.0, "step": 2030, "total_num_tokens": 550006054.0, "z_loss": 0.0009769564494490623 }, { "copy_logits_max": -3.8870880603790283, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.0, "epoch": 0.41480725044677047, "gen_logits_max": 8.185394287109375, "gen_logits_mean": -12.2216796875, "gen_logits_min": -23.990734100341797, "gen_logits_std": 2.512238025665283, "gen_loss": 0.33072131872177124, "grad_norm": 0.6037009605208008, "learning_rate": 2.8364210526315793e-05, "loss": 0.3231, "mean_copy_accuracy": 0.990970715880394, "mean_gen_accuracy": 0.8639800548553467, "mean_token_accuracy": 0.8937128037214279, "num_tokens": 550226339.0, "sample_num_tokens": 7765.25, "step": 2031, "total_num_tokens": 550257400.0, "z_loss": 0.0009544010390527546 }, { "copy_logits_max": -4.738436698913574, "copy_logits_min": -750000000.0, "copy_num_tokens": 260.3125, "epoch": 0.41501148838396734, "gen_logits_max": 8.09061336517334, "gen_logits_mean": -12.068832397460938, "gen_logits_min": -23.144075393676758, "gen_logits_std": 2.3830325603485107, "gen_loss": 0.3531991243362427, "grad_norm": 0.49053147974202443, "learning_rate": 2.8362947368421054e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9924566894769669, "mean_gen_accuracy": 0.8621363341808319, "mean_token_accuracy": 0.8935289233922958, "num_tokens": 550484106.0, "sample_num_tokens": 6932.5, "step": 2032, "total_num_tokens": 550511836.0, "z_loss": 0.0009929570369422436 }, { "copy_logits_max": -3.7555747032165527, "copy_logits_min": -687500032.0, "copy_num_tokens": 368.9375, "epoch": 0.41521572632116416, "gen_logits_max": 7.044161796569824, "gen_logits_mean": -13.357699394226074, "gen_logits_min": -24.91831398010254, "gen_logits_std": 2.4879202842712402, "gen_loss": 0.3846043050289154, "grad_norm": 0.4854086615427964, "learning_rate": 2.8361684210526318e-05, "loss": 0.332, "mean_copy_accuracy": 0.9931049346923828, "mean_gen_accuracy": 0.8601899147033691, "mean_token_accuracy": 0.8929128348827362, "num_tokens": 550765873.0, "sample_num_tokens": 8608.25, "step": 2033, "total_num_tokens": 550800306.0, "z_loss": 0.0010179372038692236 }, { "copy_logits_max": -5.162875175476074, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.375, "epoch": 0.415419964258361, "gen_logits_max": 6.967194557189941, "gen_logits_mean": -13.421418190002441, "gen_logits_min": -24.628768920898438, "gen_logits_std": 2.4285635948181152, "gen_loss": 0.3099895417690277, "grad_norm": 0.4523009011927848, "learning_rate": 2.836042105263158e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9943128228187561, "mean_gen_accuracy": 0.8624785393476486, "mean_token_accuracy": 0.8892444968223572, "num_tokens": 551025563.0, "sample_num_tokens": 7748.25, "step": 2034, "total_num_tokens": 551056556.0, "z_loss": 0.0008822080562822521 }, { "copy_logits_max": -4.475859642028809, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 0.41562420219555785, "gen_logits_max": 7.872012615203857, "gen_logits_mean": -12.649646759033203, "gen_logits_min": -24.130565643310547, "gen_logits_std": 2.446187973022461, "gen_loss": 0.32807794213294983, "grad_norm": 0.4385673513369235, "learning_rate": 2.8359157894736844e-05, "loss": 0.324, "mean_copy_accuracy": 0.9924358427524567, "mean_gen_accuracy": 0.8630614876747131, "mean_token_accuracy": 0.8927788734436035, "num_tokens": 551310956.0, "sample_num_tokens": 9522.0, "step": 2035, "total_num_tokens": 551349044.0, "z_loss": 0.0009054036345332861 }, { "copy_logits_max": -2.4015631675720215, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.5625, "epoch": 0.41582844013275466, "gen_logits_max": 6.535674095153809, "gen_logits_mean": -13.77432632446289, "gen_logits_min": -25.621623992919922, "gen_logits_std": 2.5161736011505127, "gen_loss": 0.35866403579711914, "grad_norm": 0.4688122100600322, "learning_rate": 2.8357894736842104e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9945345222949982, "mean_gen_accuracy": 0.8570548892021179, "mean_token_accuracy": 0.8944075256586075, "num_tokens": 551577259.0, "sample_num_tokens": 8216.75, "step": 2036, "total_num_tokens": 551610126.0, "z_loss": 0.0010385280475020409 }, { "copy_logits_max": -5.72083044052124, "copy_logits_min": -687500032.0, "copy_num_tokens": 415.375, "epoch": 0.4160326780699515, "gen_logits_max": 7.528792381286621, "gen_logits_mean": -12.403194427490234, "gen_logits_min": -23.984519958496094, "gen_logits_std": 2.4731943607330322, "gen_loss": 0.32404252886772156, "grad_norm": 0.4568884219207713, "learning_rate": 2.835663157894737e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9940714985132217, "mean_gen_accuracy": 0.8607858270406723, "mean_token_accuracy": 0.8950459063053131, "num_tokens": 551844995.0, "sample_num_tokens": 8141.25, "step": 2037, "total_num_tokens": 551877560.0, "z_loss": 0.0008977376855909824 }, { "copy_logits_max": -4.802507400512695, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.9375, "epoch": 0.41623691600714835, "gen_logits_max": 7.091967582702637, "gen_logits_mean": -11.855207443237305, "gen_logits_min": -23.721511840820312, "gen_logits_std": 2.4809560775756836, "gen_loss": 0.33480656147003174, "grad_norm": 0.44900199767697335, "learning_rate": 2.835536842105263e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9935048222541809, "mean_gen_accuracy": 0.8672366887331009, "mean_token_accuracy": 0.8973134607076645, "num_tokens": 552099853.0, "sample_num_tokens": 8210.25, "step": 2038, "total_num_tokens": 552132694.0, "z_loss": 0.0008764205267652869 }, { "copy_logits_max": -3.8829143047332764, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.375, "epoch": 0.41644115394434517, "gen_logits_max": 7.352603912353516, "gen_logits_mean": -11.284271240234375, "gen_logits_min": -23.583057403564453, "gen_logits_std": 2.51884388923645, "gen_loss": 0.34638988971710205, "grad_norm": 0.5304723536616711, "learning_rate": 2.8354105263157894e-05, "loss": 0.3441, "mean_copy_accuracy": 0.9927244931459427, "mean_gen_accuracy": 0.8594774752855301, "mean_token_accuracy": 0.8889014720916748, "num_tokens": 552359117.0, "sample_num_tokens": 8740.75, "step": 2039, "total_num_tokens": 552394080.0, "z_loss": 0.0009738166118040681 }, { "copy_logits_max": -3.900136709213257, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.125, "epoch": 0.416645391881542, "gen_logits_max": 6.59638786315918, "gen_logits_mean": -12.431114196777344, "gen_logits_min": -23.894119262695312, "gen_logits_std": 2.4859495162963867, "gen_loss": 0.3112524747848511, "grad_norm": 0.4808558452977943, "learning_rate": 2.8352842105263158e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9943077564239502, "mean_gen_accuracy": 0.8629496693611145, "mean_token_accuracy": 0.8953402936458588, "num_tokens": 552645555.0, "sample_num_tokens": 8299.25, "step": 2040, "total_num_tokens": 552678752.0, "z_loss": 0.0008559409761801362 }, { "copy_logits_max": -3.639126777648926, "copy_logits_min": -750000000.0, "copy_num_tokens": 238.0, "epoch": 0.41684962981873885, "gen_logits_max": 7.019472122192383, "gen_logits_mean": -12.609430313110352, "gen_logits_min": -24.476194381713867, "gen_logits_std": 2.5135040283203125, "gen_loss": 0.340806782245636, "grad_norm": 0.549817617603566, "learning_rate": 2.8351578947368423e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9929951876401901, "mean_gen_accuracy": 0.8577304780483246, "mean_token_accuracy": 0.8911102563142776, "num_tokens": 552926259.0, "sample_num_tokens": 5817.25, "step": 2041, "total_num_tokens": 552949528.0, "z_loss": 0.0009235336910933256 }, { "copy_logits_max": -4.53893518447876, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.9375, "epoch": 0.41705386775593567, "gen_logits_max": 5.632407188415527, "gen_logits_mean": -13.456432342529297, "gen_logits_min": -24.942699432373047, "gen_logits_std": 2.4449357986450195, "gen_loss": 0.3511964678764343, "grad_norm": 1.3496773457753188, "learning_rate": 2.8350315789473687e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9931484013795853, "mean_gen_accuracy": 0.864391878247261, "mean_token_accuracy": 0.8957500010728836, "num_tokens": 553194243.0, "sample_num_tokens": 10676.25, "step": 2042, "total_num_tokens": 553236948.0, "z_loss": 0.0009005924803204834 }, { "copy_logits_max": -3.9585609436035156, "copy_logits_min": -687500032.0, "copy_num_tokens": 470.0, "epoch": 0.4172581056931325, "gen_logits_max": 7.234559535980225, "gen_logits_mean": -12.00065803527832, "gen_logits_min": -23.90028190612793, "gen_logits_std": 2.485948085784912, "gen_loss": 0.3708861470222473, "grad_norm": 0.5888912671577218, "learning_rate": 2.8349052631578948e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9924414455890656, "mean_gen_accuracy": 0.8588274717330933, "mean_token_accuracy": 0.8915500044822693, "num_tokens": 553493505.0, "sample_num_tokens": 9198.25, "step": 2043, "total_num_tokens": 553530298.0, "z_loss": 0.0009964790660887957 }, { "copy_logits_max": -4.174893379211426, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.5, "epoch": 0.41746234363032936, "gen_logits_max": 6.276691913604736, "gen_logits_mean": -12.150975227355957, "gen_logits_min": -24.204082489013672, "gen_logits_std": 2.502448558807373, "gen_loss": 0.2983265817165375, "grad_norm": 0.5366233750204497, "learning_rate": 2.8347789473684212e-05, "loss": 0.3444, "mean_copy_accuracy": 0.9926466643810272, "mean_gen_accuracy": 0.8519484847784042, "mean_token_accuracy": 0.8878913074731827, "num_tokens": 553761325.0, "sample_num_tokens": 8156.75, "step": 2044, "total_num_tokens": 553793952.0, "z_loss": 0.0008550439961254597 }, { "copy_logits_max": -3.2251124382019043, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.6875, "epoch": 0.4176665815675262, "gen_logits_max": 6.912422180175781, "gen_logits_mean": -11.604795455932617, "gen_logits_min": -23.71426010131836, "gen_logits_std": 2.449946880340576, "gen_loss": 0.33314207196235657, "grad_norm": 0.46828519312658895, "learning_rate": 2.8346526315789473e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9948356598615646, "mean_gen_accuracy": 0.8525886982679367, "mean_token_accuracy": 0.8898379504680634, "num_tokens": 554029122.0, "sample_num_tokens": 8635.5, "step": 2045, "total_num_tokens": 554063664.0, "z_loss": 0.0009143742499873042 }, { "copy_logits_max": -4.142275810241699, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.625, "epoch": 0.417870819504723, "gen_logits_max": 6.144533157348633, "gen_logits_mean": -12.115306854248047, "gen_logits_min": -24.277488708496094, "gen_logits_std": 2.477342128753662, "gen_loss": 0.31254029273986816, "grad_norm": 0.44033759200574707, "learning_rate": 2.8345263157894737e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9944019913673401, "mean_gen_accuracy": 0.8598082065582275, "mean_token_accuracy": 0.8920207470655441, "num_tokens": 554308433.0, "sample_num_tokens": 9138.25, "step": 2046, "total_num_tokens": 554344986.0, "z_loss": 0.0008259537862613797 }, { "copy_logits_max": -4.879061222076416, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.125, "epoch": 0.41807505744191986, "gen_logits_max": 6.040254592895508, "gen_logits_mean": -13.380270004272461, "gen_logits_min": -24.995697021484375, "gen_logits_std": 2.490025043487549, "gen_loss": 0.3077620565891266, "grad_norm": 0.4599533492572632, "learning_rate": 2.8344e-05, "loss": 0.336, "mean_copy_accuracy": 0.9925858229398727, "mean_gen_accuracy": 0.8614581376314163, "mean_token_accuracy": 0.8888365626335144, "num_tokens": 554564863.0, "sample_num_tokens": 9190.25, "step": 2047, "total_num_tokens": 554601624.0, "z_loss": 0.0007955645560286939 }, { "copy_logits_max": -3.417328357696533, "copy_logits_min": -687500032.0, "copy_num_tokens": 314.5, "epoch": 0.4182792953791167, "gen_logits_max": 6.786924839019775, "gen_logits_mean": -12.904073715209961, "gen_logits_min": -24.975860595703125, "gen_logits_std": 2.5290255546569824, "gen_loss": 0.36683106422424316, "grad_norm": 0.5087800346786508, "learning_rate": 2.8342736842105266e-05, "loss": 0.3468, "mean_copy_accuracy": 0.9919406920671463, "mean_gen_accuracy": 0.8575374186038971, "mean_token_accuracy": 0.8868982940912247, "num_tokens": 554836771.0, "sample_num_tokens": 7225.75, "step": 2048, "total_num_tokens": 554865674.0, "z_loss": 0.0009095435962080956 }, { "copy_logits_max": -6.051539421081543, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.4375, "epoch": 0.4184835333163135, "gen_logits_max": 7.323622703552246, "gen_logits_mean": -12.079778671264648, "gen_logits_min": -24.206457138061523, "gen_logits_std": 2.4747395515441895, "gen_loss": 0.3636564612388611, "grad_norm": 0.4717174374875119, "learning_rate": 2.8341473684210527e-05, "loss": 0.3369, "mean_copy_accuracy": 0.9938652664422989, "mean_gen_accuracy": 0.856199637055397, "mean_token_accuracy": 0.892042338848114, "num_tokens": 555101736.0, "sample_num_tokens": 7634.5, "step": 2049, "total_num_tokens": 555132274.0, "z_loss": 0.0008525034645572305 }, { "copy_logits_max": -3.832636594772339, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.3125, "epoch": 0.41868777125351037, "gen_logits_max": 6.9280104637146, "gen_logits_mean": -13.424354553222656, "gen_logits_min": -25.255558013916016, "gen_logits_std": 2.5346672534942627, "gen_loss": 0.29893070459365845, "grad_norm": 0.4510542594188259, "learning_rate": 2.834021052631579e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9939359277486801, "mean_gen_accuracy": 0.8671379089355469, "mean_token_accuracy": 0.8952153623104095, "num_tokens": 555348356.0, "sample_num_tokens": 8622.0, "step": 2050, "total_num_tokens": 555382844.0, "z_loss": 0.0008738433243706822 }, { "copy_logits_max": -4.486843109130859, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.5625, "epoch": 0.4188920091907072, "gen_logits_max": 6.304624557495117, "gen_logits_mean": -12.988083839416504, "gen_logits_min": -24.990657806396484, "gen_logits_std": 2.5005745887756348, "gen_loss": 0.3172593116760254, "grad_norm": 0.47937592938244344, "learning_rate": 2.8338947368421052e-05, "loss": 0.3367, "mean_copy_accuracy": 0.99239082634449, "mean_gen_accuracy": 0.857177659869194, "mean_token_accuracy": 0.8887890875339508, "num_tokens": 555620745.0, "sample_num_tokens": 9389.25, "step": 2051, "total_num_tokens": 555658302.0, "z_loss": 0.0009103637421503663 }, { "copy_logits_max": -3.4620819091796875, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.375, "epoch": 0.419096247127904, "gen_logits_max": 7.235050201416016, "gen_logits_mean": -12.118255615234375, "gen_logits_min": -24.07807159423828, "gen_logits_std": 2.48453950881958, "gen_loss": 0.3428555727005005, "grad_norm": 0.4689565910620565, "learning_rate": 2.8337684210526316e-05, "loss": 0.3345, "mean_copy_accuracy": 0.9940678924322128, "mean_gen_accuracy": 0.8564954996109009, "mean_token_accuracy": 0.8903033286333084, "num_tokens": 555902295.0, "sample_num_tokens": 7321.75, "step": 2052, "total_num_tokens": 555931582.0, "z_loss": 0.0010294762905687094 }, { "copy_logits_max": -4.963695049285889, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.0625, "epoch": 0.41930048506510087, "gen_logits_max": 7.032279968261719, "gen_logits_mean": -12.651150703430176, "gen_logits_min": -24.196008682250977, "gen_logits_std": 2.474820613861084, "gen_loss": 0.3069520890712738, "grad_norm": 0.4643619414663004, "learning_rate": 2.8336421052631577e-05, "loss": 0.3466, "mean_copy_accuracy": 0.9922099113464355, "mean_gen_accuracy": 0.8563815355300903, "mean_token_accuracy": 0.8848899602890015, "num_tokens": 556147388.0, "sample_num_tokens": 7700.0, "step": 2053, "total_num_tokens": 556178188.0, "z_loss": 0.0008888365118764341 }, { "copy_logits_max": -2.5279831886291504, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.5, "epoch": 0.4195047230022977, "gen_logits_max": 6.2875494956970215, "gen_logits_mean": -12.497735977172852, "gen_logits_min": -24.589269638061523, "gen_logits_std": 2.555392026901245, "gen_loss": 0.3136805593967438, "grad_norm": 0.4662142868372351, "learning_rate": 2.8335157894736842e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9920369535684586, "mean_gen_accuracy": 0.8575941771268845, "mean_token_accuracy": 0.8924608826637268, "num_tokens": 556426565.0, "sample_num_tokens": 9181.75, "step": 2054, "total_num_tokens": 556463292.0, "z_loss": 0.0009369422914460301 }, { "copy_logits_max": -4.524992942810059, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.3125, "epoch": 0.4197089609394945, "gen_logits_max": 7.066660404205322, "gen_logits_mean": -12.728357315063477, "gen_logits_min": -25.167850494384766, "gen_logits_std": 2.5895261764526367, "gen_loss": 0.26760613918304443, "grad_norm": 0.4532706495651559, "learning_rate": 2.8333894736842106e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9929209351539612, "mean_gen_accuracy": 0.8647248893976212, "mean_token_accuracy": 0.8951261192560196, "num_tokens": 556698153.0, "sample_num_tokens": 9492.75, "step": 2055, "total_num_tokens": 556736124.0, "z_loss": 0.0009154892759397626 }, { "copy_logits_max": -4.068020820617676, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.375, "epoch": 0.4199131988766913, "gen_logits_max": 7.214460372924805, "gen_logits_mean": -12.471053123474121, "gen_logits_min": -24.537609100341797, "gen_logits_std": 2.5516245365142822, "gen_loss": 0.3103156089782715, "grad_norm": 0.44356725483112297, "learning_rate": 2.833263157894737e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9940126538276672, "mean_gen_accuracy": 0.864508643746376, "mean_token_accuracy": 0.8959366232156754, "num_tokens": 556974485.0, "sample_num_tokens": 8717.25, "step": 2056, "total_num_tokens": 557009354.0, "z_loss": 0.0009741403628140688 }, { "copy_logits_max": -2.7276973724365234, "copy_logits_min": -687500032.0, "copy_num_tokens": 617.125, "epoch": 0.4201174368138882, "gen_logits_max": 6.997339248657227, "gen_logits_mean": -12.432194709777832, "gen_logits_min": -24.947952270507812, "gen_logits_std": 2.5533995628356934, "gen_loss": 0.3192845582962036, "grad_norm": 0.5097417449134533, "learning_rate": 2.8331368421052635e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9931624382734299, "mean_gen_accuracy": 0.8512883931398392, "mean_token_accuracy": 0.8881706446409225, "num_tokens": 557246858.0, "sample_num_tokens": 9465.5, "step": 2057, "total_num_tokens": 557284720.0, "z_loss": 0.0011033687042072415 }, { "copy_logits_max": -2.545266628265381, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.1875, "epoch": 0.420321674751085, "gen_logits_max": 6.636353492736816, "gen_logits_mean": -13.010757446289062, "gen_logits_min": -25.677501678466797, "gen_logits_std": 2.571303129196167, "gen_loss": 0.32493120431900024, "grad_norm": 0.39954965755720767, "learning_rate": 2.8330105263157896e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9935393035411835, "mean_gen_accuracy": 0.8609311878681183, "mean_token_accuracy": 0.8958089798688889, "num_tokens": 557560943.0, "sample_num_tokens": 9017.75, "step": 2058, "total_num_tokens": 557597014.0, "z_loss": 0.0009845261229202151 }, { "copy_logits_max": -4.752993583679199, "copy_logits_min": -625000064.0, "copy_num_tokens": 450.8125, "epoch": 0.4205259126882818, "gen_logits_max": 7.5423126220703125, "gen_logits_mean": -12.001357078552246, "gen_logits_min": -24.211366653442383, "gen_logits_std": 2.6086981296539307, "gen_loss": 0.33559101819992065, "grad_norm": 0.4282541874711174, "learning_rate": 2.832884210526316e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9926487505435944, "mean_gen_accuracy": 0.8599393516778946, "mean_token_accuracy": 0.8881220072507858, "num_tokens": 557832757.0, "sample_num_tokens": 8640.75, "step": 2059, "total_num_tokens": 557867320.0, "z_loss": 0.0009597113821655512 }, { "copy_logits_max": -2.617478370666504, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.375, "epoch": 0.4207301506254787, "gen_logits_max": 6.156300067901611, "gen_logits_mean": -12.417695999145508, "gen_logits_min": -24.540508270263672, "gen_logits_std": 2.57785701751709, "gen_loss": 0.2830272912979126, "grad_norm": 0.4443792706013316, "learning_rate": 2.832757894736842e-05, "loss": 0.32, "mean_copy_accuracy": 0.9940311908721924, "mean_gen_accuracy": 0.8621988892555237, "mean_token_accuracy": 0.8944966942071915, "num_tokens": 558095503.0, "sample_num_tokens": 8779.75, "step": 2060, "total_num_tokens": 558130622.0, "z_loss": 0.00077671033795923 }, { "copy_logits_max": -3.2946221828460693, "copy_logits_min": -687500032.0, "copy_num_tokens": 815.375, "epoch": 0.4209343885626755, "gen_logits_max": 5.3892927169799805, "gen_logits_mean": -13.771157264709473, "gen_logits_min": -25.876644134521484, "gen_logits_std": 2.556936502456665, "gen_loss": 0.3083135485649109, "grad_norm": 0.46003190931637356, "learning_rate": 2.8326315789473685e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9930661767721176, "mean_gen_accuracy": 0.8600066602230072, "mean_token_accuracy": 0.8946882039308548, "num_tokens": 558362351.0, "sample_num_tokens": 10842.25, "step": 2061, "total_num_tokens": 558405720.0, "z_loss": 0.0008615384576842189 }, { "copy_logits_max": -3.0028791427612305, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.4375, "epoch": 0.4211386264998723, "gen_logits_max": 6.530024528503418, "gen_logits_mean": -12.804189682006836, "gen_logits_min": -24.60921859741211, "gen_logits_std": 2.5486037731170654, "gen_loss": 0.3536306321620941, "grad_norm": 0.5145842547389061, "learning_rate": 2.8325052631578946e-05, "loss": 0.3225, "mean_copy_accuracy": 0.992934063076973, "mean_gen_accuracy": 0.8588292598724365, "mean_token_accuracy": 0.8931627869606018, "num_tokens": 558637215.0, "sample_num_tokens": 8590.25, "step": 2062, "total_num_tokens": 558671576.0, "z_loss": 0.0009484110632911325 }, { "copy_logits_max": -2.6261777877807617, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.1875, "epoch": 0.4213428644370692, "gen_logits_max": 6.412771224975586, "gen_logits_mean": -12.57795238494873, "gen_logits_min": -24.455780029296875, "gen_logits_std": 2.542450428009033, "gen_loss": 0.36137616634368896, "grad_norm": 0.42571144206482825, "learning_rate": 2.832378947368421e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9955656975507736, "mean_gen_accuracy": 0.8557127416133881, "mean_token_accuracy": 0.8950542956590652, "num_tokens": 558921513.0, "sample_num_tokens": 9630.25, "step": 2063, "total_num_tokens": 558960034.0, "z_loss": 0.00100037083029747 }, { "copy_logits_max": -4.212831020355225, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.25, "epoch": 0.421547102374266, "gen_logits_max": 7.000819683074951, "gen_logits_mean": -12.008064270019531, "gen_logits_min": -23.49675941467285, "gen_logits_std": 2.5411813259124756, "gen_loss": 0.31018197536468506, "grad_norm": 0.491202244741374, "learning_rate": 2.8322526315789475e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9950125813484192, "mean_gen_accuracy": 0.8578693866729736, "mean_token_accuracy": 0.8915226459503174, "num_tokens": 559180750.0, "sample_num_tokens": 7571.0, "step": 2064, "total_num_tokens": 559211034.0, "z_loss": 0.0008885469287633896 }, { "copy_logits_max": -4.354020118713379, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.0, "epoch": 0.42175134031146283, "gen_logits_max": 6.5609259605407715, "gen_logits_mean": -12.944072723388672, "gen_logits_min": -24.565282821655273, "gen_logits_std": 2.5433149337768555, "gen_loss": 0.31833747029304504, "grad_norm": 0.4202383417819552, "learning_rate": 2.832126315789474e-05, "loss": 0.3166, "mean_copy_accuracy": 0.993229866027832, "mean_gen_accuracy": 0.8628709018230438, "mean_token_accuracy": 0.895657405257225, "num_tokens": 559449039.0, "sample_num_tokens": 9954.25, "step": 2065, "total_num_tokens": 559488856.0, "z_loss": 0.0008682641200721264 }, { "copy_logits_max": -5.580500602722168, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.6875, "epoch": 0.4219555782486597, "gen_logits_max": 6.791378021240234, "gen_logits_mean": -12.651451110839844, "gen_logits_min": -24.290618896484375, "gen_logits_std": 2.5508384704589844, "gen_loss": 0.30235034227371216, "grad_norm": 0.4463532729783548, "learning_rate": 2.832e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9946078658103943, "mean_gen_accuracy": 0.8648030161857605, "mean_token_accuracy": 0.8965430408716202, "num_tokens": 559723413.0, "sample_num_tokens": 7745.25, "step": 2066, "total_num_tokens": 559754394.0, "z_loss": 0.0008363388478755951 }, { "copy_logits_max": -3.450599193572998, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.1875, "epoch": 0.4221598161858565, "gen_logits_max": 6.956608772277832, "gen_logits_mean": -12.379681587219238, "gen_logits_min": -24.547061920166016, "gen_logits_std": 2.558685302734375, "gen_loss": 0.31514132022857666, "grad_norm": 0.509333652418899, "learning_rate": 2.8318736842105264e-05, "loss": 0.3133, "mean_copy_accuracy": 0.9948171675205231, "mean_gen_accuracy": 0.8599178045988083, "mean_token_accuracy": 0.896556556224823, "num_tokens": 560006943.0, "sample_num_tokens": 9939.75, "step": 2067, "total_num_tokens": 560046702.0, "z_loss": 0.0009172485442832112 }, { "copy_logits_max": -4.121589183807373, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.9375, "epoch": 0.42236405412305333, "gen_logits_max": 6.184415340423584, "gen_logits_mean": -12.913074493408203, "gen_logits_min": -24.454500198364258, "gen_logits_std": 2.5436482429504395, "gen_loss": 0.2997696101665497, "grad_norm": 0.42561568666828525, "learning_rate": 2.831747368421053e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9937974959611893, "mean_gen_accuracy": 0.8642339259386063, "mean_token_accuracy": 0.8971473276615143, "num_tokens": 560318049.0, "sample_num_tokens": 8386.25, "step": 2068, "total_num_tokens": 560351594.0, "z_loss": 0.0008628946961835027 }, { "copy_logits_max": -3.4256601333618164, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.0625, "epoch": 0.4225682920602502, "gen_logits_max": 6.762310028076172, "gen_logits_mean": -12.949301719665527, "gen_logits_min": -24.48650360107422, "gen_logits_std": 2.5439248085021973, "gen_loss": 0.3393586575984955, "grad_norm": 0.45568801766665173, "learning_rate": 2.831621052631579e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9933471828699112, "mean_gen_accuracy": 0.8634275943040848, "mean_token_accuracy": 0.8936365246772766, "num_tokens": 560590307.0, "sample_num_tokens": 8028.75, "step": 2069, "total_num_tokens": 560622422.0, "z_loss": 0.0009145973017439246 }, { "copy_logits_max": -3.748203992843628, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.375, "epoch": 0.422772529997447, "gen_logits_max": 6.977170467376709, "gen_logits_mean": -12.005838394165039, "gen_logits_min": -24.07086181640625, "gen_logits_std": 2.5694360733032227, "gen_loss": 0.3495672345161438, "grad_norm": 0.45571973780803315, "learning_rate": 2.8314947368421054e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9945004284381866, "mean_gen_accuracy": 0.8557435125112534, "mean_token_accuracy": 0.8872750699520111, "num_tokens": 560835467.0, "sample_num_tokens": 8447.75, "step": 2070, "total_num_tokens": 560869258.0, "z_loss": 0.0009471353841945529 }, { "copy_logits_max": -2.1348013877868652, "copy_logits_min": -687500032.0, "copy_num_tokens": 672.6875, "epoch": 0.42297676793464384, "gen_logits_max": 6.746525287628174, "gen_logits_mean": -9.982290267944336, "gen_logits_min": -22.334014892578125, "gen_logits_std": 2.602084159851074, "gen_loss": 0.293473482131958, "grad_norm": 0.46342889778493934, "learning_rate": 2.8313684210526315e-05, "loss": 0.32, "mean_copy_accuracy": 0.9930053651332855, "mean_gen_accuracy": 0.8654008060693741, "mean_token_accuracy": 0.8949344158172607, "num_tokens": 561100360.0, "sample_num_tokens": 9627.0, "step": 2071, "total_num_tokens": 561138868.0, "z_loss": 0.0009044177131727338 }, { "copy_logits_max": -3.7190327644348145, "copy_logits_min": -750000000.0, "copy_num_tokens": 282.25, "epoch": 0.4231810058718407, "gen_logits_max": 6.810192108154297, "gen_logits_mean": -13.381560325622559, "gen_logits_min": -24.84477996826172, "gen_logits_std": 2.530996799468994, "gen_loss": 0.34435373544692993, "grad_norm": 0.47903165862397346, "learning_rate": 2.8312421052631582e-05, "loss": 0.3279, "mean_copy_accuracy": 0.9937855750322342, "mean_gen_accuracy": 0.8602360486984253, "mean_token_accuracy": 0.8913720548152924, "num_tokens": 561364232.0, "sample_num_tokens": 6534.5, "step": 2072, "total_num_tokens": 561390370.0, "z_loss": 0.000879783823620528 }, { "copy_logits_max": -3.126242160797119, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.6875, "epoch": 0.4233852438090375, "gen_logits_max": 6.096020698547363, "gen_logits_mean": -12.177774429321289, "gen_logits_min": -24.07185173034668, "gen_logits_std": 2.5651192665100098, "gen_loss": 0.30494022369384766, "grad_norm": 0.41072965440077186, "learning_rate": 2.8311157894736843e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9937932342290878, "mean_gen_accuracy": 0.8636724650859833, "mean_token_accuracy": 0.897262766957283, "num_tokens": 561635728.0, "sample_num_tokens": 8711.5, "step": 2073, "total_num_tokens": 561670574.0, "z_loss": 0.000924320484045893 }, { "copy_logits_max": -4.668196201324463, "copy_logits_min": -750000064.0, "copy_num_tokens": 314.8125, "epoch": 0.42358948174623434, "gen_logits_max": 7.194711208343506, "gen_logits_mean": -12.26759147644043, "gen_logits_min": -23.97164535522461, "gen_logits_std": 2.527540922164917, "gen_loss": 0.32447904348373413, "grad_norm": 0.451408048290287, "learning_rate": 2.8309894736842108e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9935389012098312, "mean_gen_accuracy": 0.8675801604986191, "mean_token_accuracy": 0.8975777924060822, "num_tokens": 561907376.0, "sample_num_tokens": 7152.0, "step": 2074, "total_num_tokens": 561935984.0, "z_loss": 0.0009084452176466584 }, { "copy_logits_max": -4.9628119468688965, "copy_logits_min": -625000000.0, "copy_num_tokens": 317.0625, "epoch": 0.4237937196834312, "gen_logits_max": 6.722720623016357, "gen_logits_mean": -12.167608261108398, "gen_logits_min": -23.56261444091797, "gen_logits_std": 2.528198719024658, "gen_loss": 0.32442235946655273, "grad_norm": 0.46918080814557556, "learning_rate": 2.830863157894737e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9930302500724792, "mean_gen_accuracy": 0.866462454199791, "mean_token_accuracy": 0.8951372802257538, "num_tokens": 562168423.0, "sample_num_tokens": 7193.25, "step": 2075, "total_num_tokens": 562197196.0, "z_loss": 0.000874206714797765 }, { "copy_logits_max": -1.1513447761535645, "copy_logits_min": -687500032.0, "copy_num_tokens": 714.375, "epoch": 0.42399795762062803, "gen_logits_max": 6.007123947143555, "gen_logits_mean": -11.960659980773926, "gen_logits_min": -24.071748733520508, "gen_logits_std": 2.572352409362793, "gen_loss": 0.32250314950942993, "grad_norm": 0.4587265554797171, "learning_rate": 2.8307368421052633e-05, "loss": 0.324, "mean_copy_accuracy": 0.9950743168592453, "mean_gen_accuracy": 0.8534434884786606, "mean_token_accuracy": 0.8951299041509628, "num_tokens": 562438815.0, "sample_num_tokens": 10123.75, "step": 2076, "total_num_tokens": 562479310.0, "z_loss": 0.0009560710750520229 }, { "copy_logits_max": -2.1031994819641113, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.5625, "epoch": 0.42420219555782485, "gen_logits_max": 6.969257354736328, "gen_logits_mean": -11.76553726196289, "gen_logits_min": -23.656620025634766, "gen_logits_std": 2.598501443862915, "gen_loss": 0.3108861744403839, "grad_norm": 0.5112397153298385, "learning_rate": 2.8306105263157894e-05, "loss": 0.308, "mean_copy_accuracy": 0.9949808418750763, "mean_gen_accuracy": 0.8612932711839676, "mean_token_accuracy": 0.8977632969617844, "num_tokens": 562722021.0, "sample_num_tokens": 9938.75, "step": 2077, "total_num_tokens": 562761776.0, "z_loss": 0.0010867221280932426 }, { "copy_logits_max": -4.906010627746582, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.8125, "epoch": 0.4244064334950217, "gen_logits_max": 5.661505699157715, "gen_logits_mean": -14.162454605102539, "gen_logits_min": -25.812541961669922, "gen_logits_std": 2.5372095108032227, "gen_loss": 0.2889392375946045, "grad_norm": 0.49450050304657106, "learning_rate": 2.8304842105263158e-05, "loss": 0.3363, "mean_copy_accuracy": 0.9934498071670532, "mean_gen_accuracy": 0.8567007631063461, "mean_token_accuracy": 0.8890395760536194, "num_tokens": 562996157.0, "sample_num_tokens": 8713.25, "step": 2078, "total_num_tokens": 563031010.0, "z_loss": 0.0008618679130449891 }, { "copy_logits_max": -4.723907470703125, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.75, "epoch": 0.42461067143221853, "gen_logits_max": 7.496990203857422, "gen_logits_mean": -11.69565200805664, "gen_logits_min": -23.46833610534668, "gen_logits_std": 2.5648488998413086, "gen_loss": 0.322476863861084, "grad_norm": 0.5173893277865099, "learning_rate": 2.830357894736842e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9936519414186478, "mean_gen_accuracy": 0.8677948117256165, "mean_token_accuracy": 0.8969947248697281, "num_tokens": 563277230.0, "sample_num_tokens": 7582.5, "step": 2079, "total_num_tokens": 563307560.0, "z_loss": 0.0008159779827110469 }, { "copy_logits_max": -3.5955283641815186, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.25, "epoch": 0.42481490936941535, "gen_logits_max": 6.589334487915039, "gen_logits_mean": -11.769832611083984, "gen_logits_min": -23.559589385986328, "gen_logits_std": 2.5600690841674805, "gen_loss": 0.36436372995376587, "grad_norm": 0.45980352918277523, "learning_rate": 2.8302315789473687e-05, "loss": 0.3415, "mean_copy_accuracy": 0.9933588802814484, "mean_gen_accuracy": 0.856067568063736, "mean_token_accuracy": 0.8878456056118011, "num_tokens": 563553765.0, "sample_num_tokens": 8204.75, "step": 2080, "total_num_tokens": 563586584.0, "z_loss": 0.000924851163290441 }, { "copy_logits_max": -2.1033787727355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 728.8125, "epoch": 0.4250191473066122, "gen_logits_max": 5.984195709228516, "gen_logits_mean": -11.560403823852539, "gen_logits_min": -23.624252319335938, "gen_logits_std": 2.599764108657837, "gen_loss": 0.2925565838813782, "grad_norm": 0.46506077933125456, "learning_rate": 2.830105263157895e-05, "loss": 0.336, "mean_copy_accuracy": 0.995165154337883, "mean_gen_accuracy": 0.8505893051624298, "mean_token_accuracy": 0.8908513486385345, "num_tokens": 563853140.0, "sample_num_tokens": 10018.5, "step": 2081, "total_num_tokens": 563893214.0, "z_loss": 0.0009158041793853045 }, { "copy_logits_max": -1.9474427700042725, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.375, "epoch": 0.42522338524380904, "gen_logits_max": 6.933340072631836, "gen_logits_mean": -12.493013381958008, "gen_logits_min": -24.29496955871582, "gen_logits_std": 2.5515029430389404, "gen_loss": 0.34135377407073975, "grad_norm": 0.5125162174680071, "learning_rate": 2.8299789473684212e-05, "loss": 0.3396, "mean_copy_accuracy": 0.9910382926464081, "mean_gen_accuracy": 0.8587363958358765, "mean_token_accuracy": 0.887459933757782, "num_tokens": 564109964.0, "sample_num_tokens": 7671.5, "step": 2082, "total_num_tokens": 564140650.0, "z_loss": 0.0009084523189812899 }, { "copy_logits_max": -3.9578967094421387, "copy_logits_min": -687500032.0, "copy_num_tokens": 438.0, "epoch": 0.42542762318100585, "gen_logits_max": 6.472675323486328, "gen_logits_mean": -12.781770706176758, "gen_logits_min": -24.476016998291016, "gen_logits_std": 2.5286383628845215, "gen_loss": 0.3471524715423584, "grad_norm": 0.4834239765592409, "learning_rate": 2.8298526315789476e-05, "loss": 0.3264, "mean_copy_accuracy": 0.9936893284320831, "mean_gen_accuracy": 0.8581233471632004, "mean_token_accuracy": 0.8950561732053757, "num_tokens": 564383599.0, "sample_num_tokens": 8460.25, "step": 2083, "total_num_tokens": 564417440.0, "z_loss": 0.0009513571858406067 }, { "copy_logits_max": -2.5182933807373047, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.1875, "epoch": 0.4256318611182027, "gen_logits_max": 5.986402988433838, "gen_logits_mean": -11.345112800598145, "gen_logits_min": -23.776235580444336, "gen_logits_std": 2.5821633338928223, "gen_loss": 0.3239940404891968, "grad_norm": 0.45986837983624707, "learning_rate": 2.8297263157894737e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9931518286466599, "mean_gen_accuracy": 0.8541936874389648, "mean_token_accuracy": 0.893144428730011, "num_tokens": 564662638.0, "sample_num_tokens": 9077.5, "step": 2084, "total_num_tokens": 564698948.0, "z_loss": 0.0009819997940212488 }, { "copy_logits_max": -3.9324886798858643, "copy_logits_min": -750000064.0, "copy_num_tokens": 416.8125, "epoch": 0.42583609905539954, "gen_logits_max": 7.104003429412842, "gen_logits_mean": -11.938188552856445, "gen_logits_min": -23.96377944946289, "gen_logits_std": 2.5289909839630127, "gen_loss": 0.3603309988975525, "grad_norm": 0.5034595524957673, "learning_rate": 2.8296e-05, "loss": 0.3391, "mean_copy_accuracy": 0.99365234375, "mean_gen_accuracy": 0.8488286137580872, "mean_token_accuracy": 0.8882075101137161, "num_tokens": 564945717.0, "sample_num_tokens": 7781.75, "step": 2085, "total_num_tokens": 564976844.0, "z_loss": 0.0009953747503459454 }, { "copy_logits_max": -5.096858978271484, "copy_logits_min": -687500032.0, "copy_num_tokens": 299.4375, "epoch": 0.42604033699259636, "gen_logits_max": 6.259685516357422, "gen_logits_mean": -13.70935344696045, "gen_logits_min": -25.03807258605957, "gen_logits_std": 2.517167568206787, "gen_loss": 0.3545658588409424, "grad_norm": 0.4605144481906672, "learning_rate": 2.8294736842105262e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9921551197767258, "mean_gen_accuracy": 0.8611202090978622, "mean_token_accuracy": 0.8882122039794922, "num_tokens": 565198251.0, "sample_num_tokens": 7405.25, "step": 2086, "total_num_tokens": 565227872.0, "z_loss": 0.0008882998954504728 }, { "copy_logits_max": -3.605172634124756, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.4375, "epoch": 0.42624457492979323, "gen_logits_max": 6.646052837371826, "gen_logits_mean": -11.82907485961914, "gen_logits_min": -23.52132797241211, "gen_logits_std": 2.5540671348571777, "gen_loss": 0.37036147713661194, "grad_norm": 0.529091447187387, "learning_rate": 2.8293473684210527e-05, "loss": 0.3536, "mean_copy_accuracy": 0.9932206720113754, "mean_gen_accuracy": 0.8529344797134399, "mean_token_accuracy": 0.8842236548662186, "num_tokens": 565454852.0, "sample_num_tokens": 7690.0, "step": 2087, "total_num_tokens": 565485612.0, "z_loss": 0.0009546354995109141 }, { "copy_logits_max": -3.5624725818634033, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.875, "epoch": 0.42644881286699005, "gen_logits_max": 6.046178817749023, "gen_logits_mean": -13.089639663696289, "gen_logits_min": -24.809783935546875, "gen_logits_std": 2.5703072547912598, "gen_loss": 0.3120378255844116, "grad_norm": 0.4863210089285702, "learning_rate": 2.8292210526315788e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9935112148523331, "mean_gen_accuracy": 0.8612861186265945, "mean_token_accuracy": 0.895595595240593, "num_tokens": 565711111.0, "sample_num_tokens": 7518.25, "step": 2088, "total_num_tokens": 565741184.0, "z_loss": 0.0008467216975986958 }, { "copy_logits_max": -6.180889129638672, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.0625, "epoch": 0.42665305080418686, "gen_logits_max": 6.350532531738281, "gen_logits_mean": -13.448221206665039, "gen_logits_min": -24.77223777770996, "gen_logits_std": 2.4846343994140625, "gen_loss": 0.31145286560058594, "grad_norm": 0.567137488636665, "learning_rate": 2.8290947368421055e-05, "loss": 0.307, "mean_copy_accuracy": 0.9921042025089264, "mean_gen_accuracy": 0.8677986115217209, "mean_token_accuracy": 0.8989854156970978, "num_tokens": 565975545.0, "sample_num_tokens": 7415.75, "step": 2089, "total_num_tokens": 566005208.0, "z_loss": 0.0008244373602792621 }, { "copy_logits_max": -7.2804131507873535, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0625, "epoch": 0.42685728874138373, "gen_logits_max": 6.18475341796875, "gen_logits_mean": -14.305841445922852, "gen_logits_min": -25.394550323486328, "gen_logits_std": 2.474526882171631, "gen_loss": 0.34187012910842896, "grad_norm": 0.46528976454227666, "learning_rate": 2.8289684210526316e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9933304041624069, "mean_gen_accuracy": 0.8621548265218735, "mean_token_accuracy": 0.8924232423305511, "num_tokens": 566255753.0, "sample_num_tokens": 8649.25, "step": 2090, "total_num_tokens": 566290350.0, "z_loss": 0.0008341408683918417 }, { "copy_logits_max": -4.217652320861816, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.375, "epoch": 0.42706152667858055, "gen_logits_max": 6.607397556304932, "gen_logits_mean": -12.509639739990234, "gen_logits_min": -24.223590850830078, "gen_logits_std": 2.533998489379883, "gen_loss": 0.33841776847839355, "grad_norm": 0.5817878776303154, "learning_rate": 2.828842105263158e-05, "loss": 0.341, "mean_copy_accuracy": 0.9919860064983368, "mean_gen_accuracy": 0.8522826880216599, "mean_token_accuracy": 0.8887220174074173, "num_tokens": 566550327.0, "sample_num_tokens": 9095.75, "step": 2091, "total_num_tokens": 566586710.0, "z_loss": 0.0008850820595398545 }, { "copy_logits_max": -5.175392150878906, "copy_logits_min": -687500032.0, "copy_num_tokens": 375.1875, "epoch": 0.42726576461577737, "gen_logits_max": 6.905803680419922, "gen_logits_mean": -11.942349433898926, "gen_logits_min": -23.978092193603516, "gen_logits_std": 2.575138568878174, "gen_loss": 0.34409403800964355, "grad_norm": 0.5312951784337471, "learning_rate": 2.828715789473684e-05, "loss": 0.3475, "mean_copy_accuracy": 0.9923190921545029, "mean_gen_accuracy": 0.8592937886714935, "mean_token_accuracy": 0.8872935920953751, "num_tokens": 566805718.0, "sample_num_tokens": 7080.5, "step": 2092, "total_num_tokens": 566834040.0, "z_loss": 0.0009178915061056614 }, { "copy_logits_max": -3.2893810272216797, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.0625, "epoch": 0.42747000255297424, "gen_logits_max": 6.71079158782959, "gen_logits_mean": -11.534629821777344, "gen_logits_min": -23.895912170410156, "gen_logits_std": 2.607003688812256, "gen_loss": 0.3263833820819855, "grad_norm": 0.6518145893030534, "learning_rate": 2.8285894736842106e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9901827871799469, "mean_gen_accuracy": 0.8575829863548279, "mean_token_accuracy": 0.8919506669044495, "num_tokens": 567066675.0, "sample_num_tokens": 8530.75, "step": 2093, "total_num_tokens": 567100798.0, "z_loss": 0.0009434602688997984 }, { "copy_logits_max": -4.504322052001953, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.0, "epoch": 0.42767424049017105, "gen_logits_max": 6.1368088722229, "gen_logits_mean": -12.382545471191406, "gen_logits_min": -24.0975341796875, "gen_logits_std": 2.563426971435547, "gen_loss": 0.31009379029273987, "grad_norm": 0.501707260785694, "learning_rate": 2.8284631578947367e-05, "loss": 0.3082, "mean_copy_accuracy": 0.9928955733776093, "mean_gen_accuracy": 0.8624861836433411, "mean_token_accuracy": 0.8973493725061417, "num_tokens": 567351307.0, "sample_num_tokens": 8456.25, "step": 2094, "total_num_tokens": 567385132.0, "z_loss": 0.0007828067755326629 }, { "copy_logits_max": -3.7966833114624023, "copy_logits_min": -687500032.0, "copy_num_tokens": 500.75, "epoch": 0.42787847842736787, "gen_logits_max": 6.001276016235352, "gen_logits_mean": -12.853039741516113, "gen_logits_min": -25.11112403869629, "gen_logits_std": 2.628577709197998, "gen_loss": 0.333862841129303, "grad_norm": 0.6470815191674144, "learning_rate": 2.828336842105263e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9879824072122574, "mean_gen_accuracy": 0.8647433519363403, "mean_token_accuracy": 0.8916701078414917, "num_tokens": 567603971.0, "sample_num_tokens": 8419.75, "step": 2095, "total_num_tokens": 567637650.0, "z_loss": 0.0009284657426178455 }, { "copy_logits_max": -6.41557502746582, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 0.42808271636456474, "gen_logits_max": 7.460612773895264, "gen_logits_mean": -12.793474197387695, "gen_logits_min": -24.416627883911133, "gen_logits_std": 2.5590648651123047, "gen_loss": 0.33340486884117126, "grad_norm": 0.6527873141862264, "learning_rate": 2.8282105263157895e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9921431392431259, "mean_gen_accuracy": 0.8601273447275162, "mean_token_accuracy": 0.8914689421653748, "num_tokens": 567879296.0, "sample_num_tokens": 9009.5, "step": 2096, "total_num_tokens": 567915334.0, "z_loss": 0.0008007270516827703 }, { "copy_logits_max": -6.226936340332031, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.625, "epoch": 0.42828695430176156, "gen_logits_max": 6.9306488037109375, "gen_logits_mean": -12.375720977783203, "gen_logits_min": -24.443084716796875, "gen_logits_std": 2.5941410064697266, "gen_loss": 0.31601378321647644, "grad_norm": 0.5170544522938576, "learning_rate": 2.828084210526316e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9914877563714981, "mean_gen_accuracy": 0.8602785915136337, "mean_token_accuracy": 0.8977093249559402, "num_tokens": 568166857.0, "sample_num_tokens": 7825.75, "step": 2097, "total_num_tokens": 568198160.0, "z_loss": 0.0008195044938474894 }, { "copy_logits_max": -6.051177978515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.6875, "epoch": 0.4284911922389584, "gen_logits_max": 6.29683780670166, "gen_logits_mean": -12.662046432495117, "gen_logits_min": -24.912155151367188, "gen_logits_std": 2.5868937969207764, "gen_loss": 0.3337668776512146, "grad_norm": 0.5270544645961421, "learning_rate": 2.8279578947368424e-05, "loss": 0.3333, "mean_copy_accuracy": 0.9910412430763245, "mean_gen_accuracy": 0.8588885217905045, "mean_token_accuracy": 0.8920726031064987, "num_tokens": 568425863.0, "sample_num_tokens": 8606.75, "step": 2098, "total_num_tokens": 568460290.0, "z_loss": 0.0008181689772754908 }, { "copy_logits_max": -5.885898113250732, "copy_logits_min": -750000000.0, "copy_num_tokens": 693.875, "epoch": 0.42869543017615525, "gen_logits_max": 6.669076919555664, "gen_logits_mean": -13.174297332763672, "gen_logits_min": -25.060325622558594, "gen_logits_std": 2.578209161758423, "gen_loss": 0.30619651079177856, "grad_norm": 0.47434039639108366, "learning_rate": 2.8278315789473685e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9917693436145782, "mean_gen_accuracy": 0.8675024658441544, "mean_token_accuracy": 0.8963457196950912, "num_tokens": 568696896.0, "sample_num_tokens": 11417.5, "step": 2099, "total_num_tokens": 568742566.0, "z_loss": 0.0008185476763173938 }, { "copy_logits_max": -4.299001693725586, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.5, "epoch": 0.42889966811335206, "gen_logits_max": 5.934417724609375, "gen_logits_mean": -12.7471923828125, "gen_logits_min": -24.543066024780273, "gen_logits_std": 2.541187047958374, "gen_loss": 0.3502357602119446, "grad_norm": 0.41137879538843336, "learning_rate": 2.827705263157895e-05, "loss": 0.35, "mean_copy_accuracy": 0.9936712235212326, "mean_gen_accuracy": 0.8545988351106644, "mean_token_accuracy": 0.8864958137273788, "num_tokens": 568989942.0, "sample_num_tokens": 9780.5, "step": 2100, "total_num_tokens": 569029064.0, "z_loss": 0.0009394140797667205 }, { "copy_logits_max": -3.6743574142456055, "copy_logits_min": -625000064.0, "copy_num_tokens": 707.4375, "epoch": 0.4291039060505489, "gen_logits_max": 5.815457344055176, "gen_logits_mean": -13.851036071777344, "gen_logits_min": -25.956809997558594, "gen_logits_std": 2.6018598079681396, "gen_loss": 0.2970448136329651, "grad_norm": 0.49885545800302655, "learning_rate": 2.827578947368421e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9925185441970825, "mean_gen_accuracy": 0.8590265512466431, "mean_token_accuracy": 0.891880989074707, "num_tokens": 569256211.0, "sample_num_tokens": 9301.25, "step": 2101, "total_num_tokens": 569293416.0, "z_loss": 0.001005789963528514 }, { "copy_logits_max": -5.0769548416137695, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.8125, "epoch": 0.42930814398774575, "gen_logits_max": 6.593652248382568, "gen_logits_mean": -13.374223709106445, "gen_logits_min": -24.895177841186523, "gen_logits_std": 2.51375675201416, "gen_loss": 0.363476037979126, "grad_norm": 0.42732258486495817, "learning_rate": 2.8274526315789474e-05, "loss": 0.3096, "mean_copy_accuracy": 0.9938341677188873, "mean_gen_accuracy": 0.8695600777864456, "mean_token_accuracy": 0.89711032807827, "num_tokens": 569527416.0, "sample_num_tokens": 8240.5, "step": 2102, "total_num_tokens": 569560378.0, "z_loss": 0.0010420596227049828 }, { "copy_logits_max": -6.055700302124023, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.125, "epoch": 0.42951238192494257, "gen_logits_max": 6.478271484375, "gen_logits_mean": -13.893263816833496, "gen_logits_min": -25.099933624267578, "gen_logits_std": 2.5133795738220215, "gen_loss": 0.320764422416687, "grad_norm": 0.4318492431722055, "learning_rate": 2.8273263157894735e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9936098754405975, "mean_gen_accuracy": 0.8741984516382217, "mean_token_accuracy": 0.9046887904405594, "num_tokens": 569810907.0, "sample_num_tokens": 8769.25, "step": 2103, "total_num_tokens": 569845984.0, "z_loss": 0.0008692911942489445 }, { "copy_logits_max": -6.839720249176025, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.875, "epoch": 0.4297166198621394, "gen_logits_max": 6.751317977905273, "gen_logits_mean": -12.785246849060059, "gen_logits_min": -24.48981475830078, "gen_logits_std": 2.5359151363372803, "gen_loss": 0.3141047954559326, "grad_norm": 0.5088325967941777, "learning_rate": 2.8272e-05, "loss": 0.306, "mean_copy_accuracy": 0.9927912652492523, "mean_gen_accuracy": 0.858849436044693, "mean_token_accuracy": 0.8982804417610168, "num_tokens": 570090380.0, "sample_num_tokens": 8553.5, "step": 2104, "total_num_tokens": 570124594.0, "z_loss": 0.000896013923920691 }, { "copy_logits_max": -4.927101135253906, "copy_logits_min": -625000064.0, "copy_num_tokens": 676.875, "epoch": 0.42992085779933625, "gen_logits_max": 5.5904364585876465, "gen_logits_mean": -12.857555389404297, "gen_logits_min": -25.023475646972656, "gen_logits_std": 2.59796142578125, "gen_loss": 0.2955935001373291, "grad_norm": 0.6658073142868346, "learning_rate": 2.8270736842105264e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9903176426887512, "mean_gen_accuracy": 0.864043340086937, "mean_token_accuracy": 0.8958952575922012, "num_tokens": 570357654.0, "sample_num_tokens": 10352.5, "step": 2105, "total_num_tokens": 570399064.0, "z_loss": 0.0009197907638736069 }, { "copy_logits_max": -3.1811866760253906, "copy_logits_min": -750000000.0, "copy_num_tokens": 764.4375, "epoch": 0.43012509573653307, "gen_logits_max": 6.793007850646973, "gen_logits_mean": -11.214487075805664, "gen_logits_min": -23.001873016357422, "gen_logits_std": 2.555079936981201, "gen_loss": 0.28124165534973145, "grad_norm": 0.4574158571535523, "learning_rate": 2.826947368421053e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9941692501306534, "mean_gen_accuracy": 0.8635028004646301, "mean_token_accuracy": 0.9038238823413849, "num_tokens": 570671972.0, "sample_num_tokens": 9933.0, "step": 2106, "total_num_tokens": 570711704.0, "z_loss": 0.0012172161368653178 }, { "copy_logits_max": -5.142471790313721, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.375, "epoch": 0.4303293336737299, "gen_logits_max": 6.330787658691406, "gen_logits_mean": -13.610564231872559, "gen_logits_min": -25.06962013244629, "gen_logits_std": 2.521369695663452, "gen_loss": 0.3333076536655426, "grad_norm": 0.4657525937319859, "learning_rate": 2.826821052631579e-05, "loss": 0.3298, "mean_copy_accuracy": 0.991837739944458, "mean_gen_accuracy": 0.8610066026449203, "mean_token_accuracy": 0.8938162326812744, "num_tokens": 570951730.0, "sample_num_tokens": 9999.0, "step": 2107, "total_num_tokens": 570991726.0, "z_loss": 0.0010695401579141617 }, { "copy_logits_max": -5.596795082092285, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.0, "epoch": 0.4305335716109267, "gen_logits_max": 7.02400016784668, "gen_logits_mean": -12.878351211547852, "gen_logits_min": -24.542776107788086, "gen_logits_std": 2.559460163116455, "gen_loss": 0.3317255973815918, "grad_norm": 0.4637510705200175, "learning_rate": 2.8266947368421054e-05, "loss": 0.3093, "mean_copy_accuracy": 0.9918012320995331, "mean_gen_accuracy": 0.8676400631666183, "mean_token_accuracy": 0.8972777277231216, "num_tokens": 571217401.0, "sample_num_tokens": 8619.25, "step": 2108, "total_num_tokens": 571251878.0, "z_loss": 0.0010333253303542733 }, { "copy_logits_max": -4.607841491699219, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.1875, "epoch": 0.4307378095481236, "gen_logits_max": 7.765235900878906, "gen_logits_mean": -11.278539657592773, "gen_logits_min": -23.228376388549805, "gen_logits_std": 2.5261752605438232, "gen_loss": 0.330096960067749, "grad_norm": 0.5153191155229321, "learning_rate": 2.8265684210526318e-05, "loss": 0.3244, "mean_copy_accuracy": 0.9932160675525665, "mean_gen_accuracy": 0.858701080083847, "mean_token_accuracy": 0.8939918428659439, "num_tokens": 571491707.0, "sample_num_tokens": 8255.75, "step": 2109, "total_num_tokens": 571524730.0, "z_loss": 0.0011051660403609276 }, { "copy_logits_max": -6.594701766967773, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.5625, "epoch": 0.4309420474853204, "gen_logits_max": 6.889279842376709, "gen_logits_mean": -12.198455810546875, "gen_logits_min": -23.430950164794922, "gen_logits_std": 2.500729560852051, "gen_loss": 0.35623541474342346, "grad_norm": 0.41000552407350455, "learning_rate": 2.826442105263158e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9930656999349594, "mean_gen_accuracy": 0.8665895611047745, "mean_token_accuracy": 0.897275984287262, "num_tokens": 571789268.0, "sample_num_tokens": 8864.5, "step": 2110, "total_num_tokens": 571824726.0, "z_loss": 0.0009605390951037407 }, { "copy_logits_max": -5.354109764099121, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.1875, "epoch": 0.4311462854225172, "gen_logits_max": 7.710204124450684, "gen_logits_mean": -12.691625595092773, "gen_logits_min": -24.17843246459961, "gen_logits_std": 2.5047590732574463, "gen_loss": 0.37127071619033813, "grad_norm": 0.5775378613481474, "learning_rate": 2.8263157894736843e-05, "loss": 0.3709, "mean_copy_accuracy": 0.9881003499031067, "mean_gen_accuracy": 0.8527882248163223, "mean_token_accuracy": 0.8800707161426544, "num_tokens": 572044812.0, "sample_num_tokens": 7499.5, "step": 2111, "total_num_tokens": 572074810.0, "z_loss": 0.0009570311522111297 }, { "copy_logits_max": -5.040024280548096, "copy_logits_min": -687500032.0, "copy_num_tokens": 380.8125, "epoch": 0.4313505233597141, "gen_logits_max": 5.988076210021973, "gen_logits_mean": -13.379698753356934, "gen_logits_min": -25.14942169189453, "gen_logits_std": 2.525913715362549, "gen_loss": 0.3084564805030823, "grad_norm": 0.5275591627054055, "learning_rate": 2.8261894736842104e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9924890398979187, "mean_gen_accuracy": 0.8654374182224274, "mean_token_accuracy": 0.8987411856651306, "num_tokens": 572305972.0, "sample_num_tokens": 7529.0, "step": 2112, "total_num_tokens": 572336088.0, "z_loss": 0.0007878175820223987 }, { "copy_logits_max": -4.828805923461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.4375, "epoch": 0.4315547612969109, "gen_logits_max": 6.316405296325684, "gen_logits_mean": -11.897859573364258, "gen_logits_min": -23.171443939208984, "gen_logits_std": 2.474133253097534, "gen_loss": 0.3340843915939331, "grad_norm": 0.5269809604514542, "learning_rate": 2.8260631578947372e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9927836209535599, "mean_gen_accuracy": 0.8626580685377121, "mean_token_accuracy": 0.8965213745832443, "num_tokens": 572566005.0, "sample_num_tokens": 7480.75, "step": 2113, "total_num_tokens": 572595928.0, "z_loss": 0.0009116322034969926 }, { "copy_logits_max": -4.797797203063965, "copy_logits_min": -687500032.0, "copy_num_tokens": 566.8125, "epoch": 0.4317589992341077, "gen_logits_max": 5.903829574584961, "gen_logits_mean": -12.657917022705078, "gen_logits_min": -24.306812286376953, "gen_logits_std": 2.5220789909362793, "gen_loss": 0.3312535881996155, "grad_norm": 0.4902319268106389, "learning_rate": 2.8259368421052633e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9913345575332642, "mean_gen_accuracy": 0.8642874360084534, "mean_token_accuracy": 0.8953031450510025, "num_tokens": 572819481.0, "sample_num_tokens": 8992.75, "step": 2114, "total_num_tokens": 572855452.0, "z_loss": 0.0009123000781983137 }, { "copy_logits_max": -3.9906857013702393, "copy_logits_min": -750000000.0, "copy_num_tokens": 673.5, "epoch": 0.4319632371713046, "gen_logits_max": 6.217616081237793, "gen_logits_mean": -11.366016387939453, "gen_logits_min": -23.14997673034668, "gen_logits_std": 2.504040241241455, "gen_loss": 0.26952558755874634, "grad_norm": 0.5238941585620382, "learning_rate": 2.8258105263157897e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9924566745758057, "mean_gen_accuracy": 0.8667708337306976, "mean_token_accuracy": 0.8995651751756668, "num_tokens": 573096956.0, "sample_num_tokens": 9224.0, "step": 2115, "total_num_tokens": 573133852.0, "z_loss": 0.0008489310275763273 }, { "copy_logits_max": -4.760794639587402, "copy_logits_min": -687500032.0, "copy_num_tokens": 529.6875, "epoch": 0.4321674751085014, "gen_logits_max": 5.902407646179199, "gen_logits_mean": -14.319986343383789, "gen_logits_min": -25.792720794677734, "gen_logits_std": 2.5062780380249023, "gen_loss": 0.31164348125457764, "grad_norm": 0.474054812247237, "learning_rate": 2.8256842105263158e-05, "loss": 0.3434, "mean_copy_accuracy": 0.9946805834770203, "mean_gen_accuracy": 0.8569403439760208, "mean_token_accuracy": 0.8906470686197281, "num_tokens": 573365814.0, "sample_num_tokens": 8690.5, "step": 2116, "total_num_tokens": 573400576.0, "z_loss": 0.0008591902442276478 }, { "copy_logits_max": -1.8895597457885742, "copy_logits_min": -750000000.0, "copy_num_tokens": 826.1875, "epoch": 0.4323717130456982, "gen_logits_max": 5.83502197265625, "gen_logits_mean": -12.94515609741211, "gen_logits_min": -25.288076400756836, "gen_logits_std": 2.591378927230835, "gen_loss": 0.30034297704696655, "grad_norm": 0.49525419189431025, "learning_rate": 2.8255578947368422e-05, "loss": 0.3028, "mean_copy_accuracy": 0.9951558411121368, "mean_gen_accuracy": 0.8633718639612198, "mean_token_accuracy": 0.9011908173561096, "num_tokens": 573651420.0, "sample_num_tokens": 9978.5, "step": 2117, "total_num_tokens": 573691334.0, "z_loss": 0.0009725691052153707 }, { "copy_logits_max": -5.757153034210205, "copy_logits_min": -750000000.0, "copy_num_tokens": 701.4375, "epoch": 0.4325759509828951, "gen_logits_max": 5.915446758270264, "gen_logits_mean": -11.950571060180664, "gen_logits_min": -23.81965446472168, "gen_logits_std": 2.498065233230591, "gen_loss": 0.2546060383319855, "grad_norm": 0.4808441433574754, "learning_rate": 2.8254315789473683e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9940843135118484, "mean_gen_accuracy": 0.8615708649158478, "mean_token_accuracy": 0.9009920209646225, "num_tokens": 573933358.0, "sample_num_tokens": 9797.5, "step": 2118, "total_num_tokens": 573972548.0, "z_loss": 0.0007444546790793538 }, { "copy_logits_max": -4.864377021789551, "copy_logits_min": -750000128.0, "copy_num_tokens": 568.4375, "epoch": 0.4327801889200919, "gen_logits_max": 5.827878952026367, "gen_logits_mean": -11.97020435333252, "gen_logits_min": -23.771495819091797, "gen_logits_std": 2.518439769744873, "gen_loss": 0.31646841764450073, "grad_norm": 0.4908233779479005, "learning_rate": 2.8253052631578947e-05, "loss": 0.3274, "mean_copy_accuracy": 0.9917192608118057, "mean_gen_accuracy": 0.8628125935792923, "mean_token_accuracy": 0.8929037153720856, "num_tokens": 574204588.0, "sample_num_tokens": 9000.5, "step": 2119, "total_num_tokens": 574240590.0, "z_loss": 0.0008957824902608991 }, { "copy_logits_max": -5.285627365112305, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.625, "epoch": 0.4329844268572887, "gen_logits_max": 6.267055988311768, "gen_logits_mean": -12.339688301086426, "gen_logits_min": -23.71002960205078, "gen_logits_std": 2.46840763092041, "gen_loss": 0.34824034571647644, "grad_norm": 0.44718257353162855, "learning_rate": 2.825178947368421e-05, "loss": 0.3294, "mean_copy_accuracy": 0.994228720664978, "mean_gen_accuracy": 0.8609293699264526, "mean_token_accuracy": 0.8930440545082092, "num_tokens": 574487696.0, "sample_num_tokens": 8004.5, "step": 2120, "total_num_tokens": 574519714.0, "z_loss": 0.0009586435044184327 }, { "copy_logits_max": -4.430210113525391, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.1875, "epoch": 0.4331886647944856, "gen_logits_max": 6.292388916015625, "gen_logits_mean": -12.04377555847168, "gen_logits_min": -23.878738403320312, "gen_logits_std": 2.5394468307495117, "gen_loss": 0.298079252243042, "grad_norm": 0.4877073941255004, "learning_rate": 2.8250526315789476e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9946593344211578, "mean_gen_accuracy": 0.8622726798057556, "mean_token_accuracy": 0.8959188461303711, "num_tokens": 574751755.0, "sample_num_tokens": 9399.75, "step": 2121, "total_num_tokens": 574789354.0, "z_loss": 0.001074089203029871 }, { "copy_logits_max": -5.032709121704102, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.9375, "epoch": 0.4333929027316824, "gen_logits_max": 6.835875034332275, "gen_logits_mean": -12.583439826965332, "gen_logits_min": -23.83251190185547, "gen_logits_std": 2.490591526031494, "gen_loss": 0.39983752369880676, "grad_norm": 0.526452068151193, "learning_rate": 2.824926315789474e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9923206269741058, "mean_gen_accuracy": 0.8557837605476379, "mean_token_accuracy": 0.8918972611427307, "num_tokens": 575039450.0, "sample_num_tokens": 8237.5, "step": 2122, "total_num_tokens": 575072400.0, "z_loss": 0.001139113330282271 }, { "copy_logits_max": -6.644509315490723, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.8125, "epoch": 0.4335971406688792, "gen_logits_max": 6.13539981842041, "gen_logits_mean": -13.727433204650879, "gen_logits_min": -24.93671226501465, "gen_logits_std": 2.4691426753997803, "gen_loss": 0.32332879304885864, "grad_norm": 0.5153377376238649, "learning_rate": 2.8248e-05, "loss": 0.3508, "mean_copy_accuracy": 0.9920846223831177, "mean_gen_accuracy": 0.8548775464296341, "mean_token_accuracy": 0.885608047246933, "num_tokens": 575309522.0, "sample_num_tokens": 8421.5, "step": 2123, "total_num_tokens": 575343208.0, "z_loss": 0.000921769707929343 }, { "copy_logits_max": -3.776069402694702, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.3125, "epoch": 0.4338013786060761, "gen_logits_max": 6.848067760467529, "gen_logits_mean": -11.672393798828125, "gen_logits_min": -23.77799415588379, "gen_logits_std": 2.6204826831817627, "gen_loss": 0.346874475479126, "grad_norm": 0.4782347553093233, "learning_rate": 2.8246736842105266e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9934980273246765, "mean_gen_accuracy": 0.8609181344509125, "mean_token_accuracy": 0.897489070892334, "num_tokens": 575573964.0, "sample_num_tokens": 8122.5, "step": 2124, "total_num_tokens": 575606454.0, "z_loss": 0.0010536671616137028 }, { "copy_logits_max": -6.148318767547607, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 0.4340056165432729, "gen_logits_max": 6.929277420043945, "gen_logits_mean": -12.938997268676758, "gen_logits_min": -24.06844711303711, "gen_logits_std": 2.4651615619659424, "gen_loss": 0.335746169090271, "grad_norm": 0.43844313248417277, "learning_rate": 2.8245473684210527e-05, "loss": 0.3051, "mean_copy_accuracy": 0.994916707277298, "mean_gen_accuracy": 0.8653600960969925, "mean_token_accuracy": 0.8984766155481339, "num_tokens": 575860875.0, "sample_num_tokens": 7707.75, "step": 2125, "total_num_tokens": 575891706.0, "z_loss": 0.0008978762198239565 }, { "copy_logits_max": -4.663671970367432, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.5625, "epoch": 0.4342098544804697, "gen_logits_max": 5.861031532287598, "gen_logits_mean": -13.554678916931152, "gen_logits_min": -24.938182830810547, "gen_logits_std": 2.4864773750305176, "gen_loss": 0.34872838854789734, "grad_norm": 0.489865530854632, "learning_rate": 2.824421052631579e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9930909872055054, "mean_gen_accuracy": 0.855618953704834, "mean_token_accuracy": 0.8898891061544418, "num_tokens": 576139505.0, "sample_num_tokens": 8569.75, "step": 2126, "total_num_tokens": 576173784.0, "z_loss": 0.0009251799201592803 }, { "copy_logits_max": -7.336149215698242, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.5625, "epoch": 0.4344140924176666, "gen_logits_max": 6.410898685455322, "gen_logits_mean": -12.97014045715332, "gen_logits_min": -24.273488998413086, "gen_logits_std": 2.5203776359558105, "gen_loss": 0.3254050016403198, "grad_norm": 0.4948139386762566, "learning_rate": 2.8242947368421052e-05, "loss": 0.3501, "mean_copy_accuracy": 0.9923598170280457, "mean_gen_accuracy": 0.8548345565795898, "mean_token_accuracy": 0.8852126896381378, "num_tokens": 576400983.0, "sample_num_tokens": 7487.25, "step": 2127, "total_num_tokens": 576430932.0, "z_loss": 0.000791695958469063 }, { "copy_logits_max": -8.17662239074707, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.5625, "epoch": 0.4346183303548634, "gen_logits_max": 6.77283239364624, "gen_logits_mean": -12.182657241821289, "gen_logits_min": -23.76813507080078, "gen_logits_std": 2.518097400665283, "gen_loss": 0.3104841113090515, "grad_norm": 0.4821377475697321, "learning_rate": 2.8241684210526316e-05, "loss": 0.328, "mean_copy_accuracy": 0.9931164532899857, "mean_gen_accuracy": 0.8646085411310196, "mean_token_accuracy": 0.8925090134143829, "num_tokens": 576679202.0, "sample_num_tokens": 9554.0, "step": 2128, "total_num_tokens": 576717418.0, "z_loss": 0.0008545927121303976 }, { "copy_logits_max": -4.522300720214844, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.3125, "epoch": 0.43482256829206023, "gen_logits_max": 5.512676239013672, "gen_logits_mean": -13.829156875610352, "gen_logits_min": -25.55717658996582, "gen_logits_std": 2.5056629180908203, "gen_loss": 0.34512388706207275, "grad_norm": 0.4615575269175454, "learning_rate": 2.824042105263158e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9937928169965744, "mean_gen_accuracy": 0.8648420423269272, "mean_token_accuracy": 0.8972886353731155, "num_tokens": 576942090.0, "sample_num_tokens": 8322.5, "step": 2129, "total_num_tokens": 576975380.0, "z_loss": 0.0009141540504060686 }, { "copy_logits_max": -6.250344276428223, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.75, "epoch": 0.4350268062292571, "gen_logits_max": 6.3150129318237305, "gen_logits_mean": -11.919967651367188, "gen_logits_min": -23.66756248474121, "gen_logits_std": 2.5266520977020264, "gen_loss": 0.27609825134277344, "grad_norm": 0.5007277341979353, "learning_rate": 2.8239157894736845e-05, "loss": 0.3108, "mean_copy_accuracy": 0.993947833776474, "mean_gen_accuracy": 0.8618133962154388, "mean_token_accuracy": 0.897487074136734, "num_tokens": 577215066.0, "sample_num_tokens": 8927.0, "step": 2130, "total_num_tokens": 577250774.0, "z_loss": 0.0009389902697876096 }, { "copy_logits_max": -5.865338325500488, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.375, "epoch": 0.4352310441664539, "gen_logits_max": 6.071467876434326, "gen_logits_mean": -12.721440315246582, "gen_logits_min": -24.351099014282227, "gen_logits_std": 2.483776092529297, "gen_loss": 0.30354729294776917, "grad_norm": 0.4587012212217974, "learning_rate": 2.8237894736842106e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9929103702306747, "mean_gen_accuracy": 0.8584953546524048, "mean_token_accuracy": 0.8946099430322647, "num_tokens": 577495304.0, "sample_num_tokens": 7533.5, "step": 2131, "total_num_tokens": 577525438.0, "z_loss": 0.0009114562999457121 }, { "copy_logits_max": -5.159710884094238, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.0, "epoch": 0.43543528210365073, "gen_logits_max": 5.983750343322754, "gen_logits_mean": -13.902351379394531, "gen_logits_min": -25.944561004638672, "gen_logits_std": 2.515517473220825, "gen_loss": 0.3311786651611328, "grad_norm": 0.5477594402189273, "learning_rate": 2.823663157894737e-05, "loss": 0.3348, "mean_copy_accuracy": 0.9907631278038025, "mean_gen_accuracy": 0.8618951439857483, "mean_token_accuracy": 0.8889734297990799, "num_tokens": 577741037.0, "sample_num_tokens": 7293.75, "step": 2132, "total_num_tokens": 577770212.0, "z_loss": 0.0008649826049804688 }, { "copy_logits_max": -5.588817119598389, "copy_logits_min": -750000064.0, "copy_num_tokens": 659.4375, "epoch": 0.4356395200408476, "gen_logits_max": 5.4603447914123535, "gen_logits_mean": -12.888893127441406, "gen_logits_min": -24.628711700439453, "gen_logits_std": 2.5469138622283936, "gen_loss": 0.3199464976787567, "grad_norm": 0.46858840844972727, "learning_rate": 2.823536842105263e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9940037727355957, "mean_gen_accuracy": 0.8621084094047546, "mean_token_accuracy": 0.9020336121320724, "num_tokens": 578034064.0, "sample_num_tokens": 9573.0, "step": 2133, "total_num_tokens": 578072356.0, "z_loss": 0.0008718433091416955 }, { "copy_logits_max": -5.718605041503906, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.5, "epoch": 0.4358437579780444, "gen_logits_max": 5.864865303039551, "gen_logits_mean": -13.767328262329102, "gen_logits_min": -25.500316619873047, "gen_logits_std": 2.4953556060791016, "gen_loss": 0.3206034302711487, "grad_norm": 0.5086918858762814, "learning_rate": 2.8234105263157895e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9938828647136688, "mean_gen_accuracy": 0.8648372739553452, "mean_token_accuracy": 0.894500270485878, "num_tokens": 578312059.0, "sample_num_tokens": 8486.25, "step": 2134, "total_num_tokens": 578346004.0, "z_loss": 0.0009027391788549721 }, { "copy_logits_max": -5.32507848739624, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.9375, "epoch": 0.43604799591524124, "gen_logits_max": 6.324707508087158, "gen_logits_mean": -12.01496410369873, "gen_logits_min": -23.984020233154297, "gen_logits_std": 2.5637831687927246, "gen_loss": 0.2901299297809601, "grad_norm": 0.48810684861950876, "learning_rate": 2.823284210526316e-05, "loss": 0.3271, "mean_copy_accuracy": 0.9933798313140869, "mean_gen_accuracy": 0.8542758971452713, "mean_token_accuracy": 0.891340047121048, "num_tokens": 578584927.0, "sample_num_tokens": 8443.25, "step": 2135, "total_num_tokens": 578618700.0, "z_loss": 0.0008878476801328361 }, { "copy_logits_max": -3.680069923400879, "copy_logits_min": -750000000.0, "copy_num_tokens": 706.75, "epoch": 0.4362522338524381, "gen_logits_max": 5.340064525604248, "gen_logits_mean": -13.05925178527832, "gen_logits_min": -25.81064224243164, "gen_logits_std": 2.639772653579712, "gen_loss": 0.27830156683921814, "grad_norm": 0.540003058998883, "learning_rate": 2.823157894736842e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9922214597463608, "mean_gen_accuracy": 0.871093288064003, "mean_token_accuracy": 0.9014348536729813, "num_tokens": 578852831.0, "sample_num_tokens": 9578.25, "step": 2136, "total_num_tokens": 578891144.0, "z_loss": 0.0008082438725978136 }, { "copy_logits_max": -5.463726043701172, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.0, "epoch": 0.4364564717896349, "gen_logits_max": 6.192460060119629, "gen_logits_mean": -12.507415771484375, "gen_logits_min": -24.629901885986328, "gen_logits_std": 2.544191837310791, "gen_loss": 0.27654966711997986, "grad_norm": 0.47102572836496176, "learning_rate": 2.8230315789473685e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9925179183483124, "mean_gen_accuracy": 0.8635072857141495, "mean_token_accuracy": 0.8891565948724747, "num_tokens": 579112039.0, "sample_num_tokens": 9797.75, "step": 2137, "total_num_tokens": 579151230.0, "z_loss": 0.000854218378663063 }, { "copy_logits_max": -3.5867738723754883, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.75, "epoch": 0.43666070972683174, "gen_logits_max": 5.915037631988525, "gen_logits_mean": -13.01361083984375, "gen_logits_min": -25.229406356811523, "gen_logits_std": 2.537830352783203, "gen_loss": 0.3373660147190094, "grad_norm": 0.47464249511018625, "learning_rate": 2.822905263157895e-05, "loss": 0.3414, "mean_copy_accuracy": 0.9941916316747665, "mean_gen_accuracy": 0.8564208149909973, "mean_token_accuracy": 0.8897498399019241, "num_tokens": 579394626.0, "sample_num_tokens": 8091.0, "step": 2138, "total_num_tokens": 579426990.0, "z_loss": 0.001081128604710102 }, { "copy_logits_max": -5.460527420043945, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.6875, "epoch": 0.4368649476640286, "gen_logits_max": 6.825333595275879, "gen_logits_mean": -11.330205917358398, "gen_logits_min": -23.436676025390625, "gen_logits_std": 2.6076738834381104, "gen_loss": 0.29683372378349304, "grad_norm": 0.5023295121675463, "learning_rate": 2.8227789473684213e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9932012557983398, "mean_gen_accuracy": 0.8637454509735107, "mean_token_accuracy": 0.8940785974264145, "num_tokens": 579654771.0, "sample_num_tokens": 8577.25, "step": 2139, "total_num_tokens": 579689080.0, "z_loss": 0.0010791324311867356 }, { "copy_logits_max": -4.663906097412109, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.5625, "epoch": 0.43706918560122543, "gen_logits_max": 5.504607677459717, "gen_logits_mean": -13.195857048034668, "gen_logits_min": -24.88260841369629, "gen_logits_std": 2.5274417400360107, "gen_loss": 0.32492515444755554, "grad_norm": 0.40505851968231754, "learning_rate": 2.8226526315789474e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9954755306243896, "mean_gen_accuracy": 0.8663197308778763, "mean_token_accuracy": 0.9013915210962296, "num_tokens": 579934218.0, "sample_num_tokens": 9149.0, "step": 2140, "total_num_tokens": 579970814.0, "z_loss": 0.0009582813363522291 }, { "copy_logits_max": -5.634736061096191, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.625, "epoch": 0.43727342353842225, "gen_logits_max": 6.430408477783203, "gen_logits_mean": -12.416524887084961, "gen_logits_min": -24.432466506958008, "gen_logits_std": 2.573063850402832, "gen_loss": 0.33805251121520996, "grad_norm": 0.5026341278301498, "learning_rate": 2.822526315789474e-05, "loss": 0.3362, "mean_copy_accuracy": 0.991323783993721, "mean_gen_accuracy": 0.8566684722900391, "mean_token_accuracy": 0.88820381462574, "num_tokens": 580184866.0, "sample_num_tokens": 8458.0, "step": 2141, "total_num_tokens": 580218698.0, "z_loss": 0.000906106666661799 }, { "copy_logits_max": -5.773643493652344, "copy_logits_min": -687499968.0, "copy_num_tokens": 499.4375, "epoch": 0.4374776614756191, "gen_logits_max": 6.0979323387146, "gen_logits_mean": -12.584281921386719, "gen_logits_min": -24.54638671875, "gen_logits_std": 2.5468239784240723, "gen_loss": 0.32453489303588867, "grad_norm": 0.47105308604221063, "learning_rate": 2.8224e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9918401837348938, "mean_gen_accuracy": 0.8575011193752289, "mean_token_accuracy": 0.8902354091405869, "num_tokens": 580461310.0, "sample_num_tokens": 9047.5, "step": 2142, "total_num_tokens": 580497500.0, "z_loss": 0.0008905548602342606 }, { "copy_logits_max": -6.033568382263184, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.625, "epoch": 0.43768189941281593, "gen_logits_max": 6.334022521972656, "gen_logits_mean": -12.639917373657227, "gen_logits_min": -24.361202239990234, "gen_logits_std": 2.531489372253418, "gen_loss": 0.32810261845588684, "grad_norm": 0.4718073524642876, "learning_rate": 2.8222736842105264e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9939779192209244, "mean_gen_accuracy": 0.8630401343107224, "mean_token_accuracy": 0.8987715095281601, "num_tokens": 580749533.0, "sample_num_tokens": 9415.25, "step": 2143, "total_num_tokens": 580787194.0, "z_loss": 0.000910339062102139 }, { "copy_logits_max": -3.301239490509033, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.4375, "epoch": 0.43788613735001275, "gen_logits_max": 6.4735918045043945, "gen_logits_mean": -11.787778854370117, "gen_logits_min": -24.01177215576172, "gen_logits_std": 2.5834298133850098, "gen_loss": 0.361211895942688, "grad_norm": 0.4659011328832531, "learning_rate": 2.8221473684210525e-05, "loss": 0.3392, "mean_copy_accuracy": 0.9931536465883255, "mean_gen_accuracy": 0.8570704162120819, "mean_token_accuracy": 0.8886619508266449, "num_tokens": 581018099.0, "sample_num_tokens": 7099.75, "step": 2144, "total_num_tokens": 581046498.0, "z_loss": 0.0010351540986448526 }, { "copy_logits_max": -5.039831161499023, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.3125, "epoch": 0.4380903752872096, "gen_logits_max": 6.569267272949219, "gen_logits_mean": -13.146568298339844, "gen_logits_min": -24.961301803588867, "gen_logits_std": 2.5625150203704834, "gen_loss": 0.31191930174827576, "grad_norm": 0.44449890970917133, "learning_rate": 2.822021052631579e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9941676259040833, "mean_gen_accuracy": 0.8562895357608795, "mean_token_accuracy": 0.8919739723205566, "num_tokens": 581291053.0, "sample_num_tokens": 8500.25, "step": 2145, "total_num_tokens": 581325054.0, "z_loss": 0.0009199987980537117 }, { "copy_logits_max": -5.783138275146484, "copy_logits_min": -750000000.0, "copy_num_tokens": 695.125, "epoch": 0.43829461322440644, "gen_logits_max": 5.292394161224365, "gen_logits_mean": -12.880102157592773, "gen_logits_min": -24.919525146484375, "gen_logits_std": 2.5701637268066406, "gen_loss": 0.2691000699996948, "grad_norm": 0.47397865383009874, "learning_rate": 2.8218947368421053e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9913489520549774, "mean_gen_accuracy": 0.8727603107690811, "mean_token_accuracy": 0.9020335674285889, "num_tokens": 581564535.0, "sample_num_tokens": 10190.75, "step": 2146, "total_num_tokens": 581605298.0, "z_loss": 0.0007727136835455894 }, { "copy_logits_max": -4.723607540130615, "copy_logits_min": -750000064.0, "copy_num_tokens": 547.375, "epoch": 0.43849885116160325, "gen_logits_max": 6.228353023529053, "gen_logits_mean": -12.79315185546875, "gen_logits_min": -24.925445556640625, "gen_logits_std": 2.515774726867676, "gen_loss": 0.31820762157440186, "grad_norm": 0.43158706340277175, "learning_rate": 2.8217684210526318e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9938217103481293, "mean_gen_accuracy": 0.8609302490949631, "mean_token_accuracy": 0.8920951932668686, "num_tokens": 581848359.0, "sample_num_tokens": 9585.75, "step": 2147, "total_num_tokens": 581886702.0, "z_loss": 0.0009219232015311718 }, { "copy_logits_max": -4.866580009460449, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.4375, "epoch": 0.4387030890988001, "gen_logits_max": 5.760246276855469, "gen_logits_mean": -13.02743148803711, "gen_logits_min": -24.95758056640625, "gen_logits_std": 2.5936689376831055, "gen_loss": 0.31812405586242676, "grad_norm": 0.4479531908079082, "learning_rate": 2.821642105263158e-05, "loss": 0.3231, "mean_copy_accuracy": 0.9934645295143127, "mean_gen_accuracy": 0.859070435166359, "mean_token_accuracy": 0.8925181478261948, "num_tokens": 582121411.0, "sample_num_tokens": 9311.75, "step": 2148, "total_num_tokens": 582158658.0, "z_loss": 0.0008960465784184635 }, { "copy_logits_max": -5.603084564208984, "copy_logits_min": -750000064.0, "copy_num_tokens": 576.625, "epoch": 0.43890732703599694, "gen_logits_max": 6.000746726989746, "gen_logits_mean": -11.919442176818848, "gen_logits_min": -24.19529151916504, "gen_logits_std": 2.5536136627197266, "gen_loss": 0.2511877119541168, "grad_norm": 0.4356536785015068, "learning_rate": 2.8215157894736843e-05, "loss": 0.3064, "mean_copy_accuracy": 0.993901401758194, "mean_gen_accuracy": 0.8688759058713913, "mean_token_accuracy": 0.8990003615617752, "num_tokens": 582413283.0, "sample_num_tokens": 9222.75, "step": 2149, "total_num_tokens": 582450174.0, "z_loss": 0.0008047400042414665 }, { "copy_logits_max": -5.693264961242676, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.0, "epoch": 0.43911156497319376, "gen_logits_max": 6.531453609466553, "gen_logits_mean": -13.295465469360352, "gen_logits_min": -25.064926147460938, "gen_logits_std": 2.532278060913086, "gen_loss": 0.36331814527511597, "grad_norm": 0.46282641001538244, "learning_rate": 2.8213894736842107e-05, "loss": 0.3396, "mean_copy_accuracy": 0.9938379526138306, "mean_gen_accuracy": 0.8555584400892258, "mean_token_accuracy": 0.8899911642074585, "num_tokens": 582714806.0, "sample_num_tokens": 7416.0, "step": 2150, "total_num_tokens": 582744470.0, "z_loss": 0.0009493422694504261 }, { "copy_logits_max": -6.134317398071289, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.1875, "epoch": 0.43931580291039063, "gen_logits_max": 6.285305976867676, "gen_logits_mean": -11.939803123474121, "gen_logits_min": -23.968276977539062, "gen_logits_std": 2.5482964515686035, "gen_loss": 0.3299967348575592, "grad_norm": 0.46496064783431235, "learning_rate": 2.8212631578947368e-05, "loss": 0.305, "mean_copy_accuracy": 0.9923355579376221, "mean_gen_accuracy": 0.8660150021314621, "mean_token_accuracy": 0.8984332382678986, "num_tokens": 582986352.0, "sample_num_tokens": 7803.0, "step": 2151, "total_num_tokens": 583017564.0, "z_loss": 0.0008562915027141571 }, { "copy_logits_max": -6.361807346343994, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.375, "epoch": 0.43952004084758745, "gen_logits_max": 6.855663299560547, "gen_logits_mean": -12.223808288574219, "gen_logits_min": -24.391082763671875, "gen_logits_std": 2.533466100692749, "gen_loss": 0.3165636658668518, "grad_norm": 0.4914417885010575, "learning_rate": 2.8211368421052632e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9933922737836838, "mean_gen_accuracy": 0.8661759048700333, "mean_token_accuracy": 0.8975357860326767, "num_tokens": 583243086.0, "sample_num_tokens": 8390.0, "step": 2152, "total_num_tokens": 583276646.0, "z_loss": 0.0008495966903865337 }, { "copy_logits_max": -6.756568908691406, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.875, "epoch": 0.43972427878478426, "gen_logits_max": 7.114224433898926, "gen_logits_mean": -11.474678039550781, "gen_logits_min": -24.139446258544922, "gen_logits_std": 2.558516502380371, "gen_loss": 0.3662852346897125, "grad_norm": 0.5306454881372166, "learning_rate": 2.8210105263157893e-05, "loss": 0.3408, "mean_copy_accuracy": 0.9938858151435852, "mean_gen_accuracy": 0.85507932305336, "mean_token_accuracy": 0.8881472647190094, "num_tokens": 583506110.0, "sample_num_tokens": 7621.5, "step": 2153, "total_num_tokens": 583536596.0, "z_loss": 0.0009265317930839956 }, { "copy_logits_max": -7.332944869995117, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.3125, "epoch": 0.43992851672198113, "gen_logits_max": 6.1667070388793945, "gen_logits_mean": -13.679643630981445, "gen_logits_min": -25.315263748168945, "gen_logits_std": 2.5036613941192627, "gen_loss": 0.3145865201950073, "grad_norm": 0.5006372187206661, "learning_rate": 2.820884210526316e-05, "loss": 0.32, "mean_copy_accuracy": 0.9911444634199142, "mean_gen_accuracy": 0.8652576804161072, "mean_token_accuracy": 0.8924610465764999, "num_tokens": 583769678.0, "sample_num_tokens": 8027.0, "step": 2154, "total_num_tokens": 583801786.0, "z_loss": 0.0007876785239204764 }, { "copy_logits_max": -4.816464424133301, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.875, "epoch": 0.44013275465917795, "gen_logits_max": 5.034928321838379, "gen_logits_mean": -13.905993461608887, "gen_logits_min": -26.25137710571289, "gen_logits_std": 2.518674373626709, "gen_loss": 0.2855255603790283, "grad_norm": 0.4515901791628992, "learning_rate": 2.8207578947368422e-05, "loss": 0.305, "mean_copy_accuracy": 0.9925730228424072, "mean_gen_accuracy": 0.8663960248231888, "mean_token_accuracy": 0.8999550193548203, "num_tokens": 584043193.0, "sample_num_tokens": 8517.25, "step": 2155, "total_num_tokens": 584077262.0, "z_loss": 0.0008182664168998599 }, { "copy_logits_max": -5.701230525970459, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.0, "epoch": 0.44033699259637477, "gen_logits_max": 6.446564674377441, "gen_logits_mean": -12.592619895935059, "gen_logits_min": -24.861888885498047, "gen_logits_std": 2.544818639755249, "gen_loss": 0.3417384624481201, "grad_norm": 0.5086146039965227, "learning_rate": 2.8206315789473686e-05, "loss": 0.3308, "mean_copy_accuracy": 0.9933797717094421, "mean_gen_accuracy": 0.8596670627593994, "mean_token_accuracy": 0.893502414226532, "num_tokens": 584304028.0, "sample_num_tokens": 7761.5, "step": 2156, "total_num_tokens": 584335074.0, "z_loss": 0.0009725550189614296 }, { "copy_logits_max": -9.180206298828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 241.625, "epoch": 0.44054123053357164, "gen_logits_max": 7.090814590454102, "gen_logits_mean": -13.361713409423828, "gen_logits_min": -25.51927375793457, "gen_logits_std": 2.51835298538208, "gen_loss": 0.33650293946266174, "grad_norm": 0.49025969400504693, "learning_rate": 2.8205052631578947e-05, "loss": 0.3414, "mean_copy_accuracy": 0.9929789304733276, "mean_gen_accuracy": 0.8598971366882324, "mean_token_accuracy": 0.887113019824028, "num_tokens": 584563088.0, "sample_num_tokens": 6551.0, "step": 2157, "total_num_tokens": 584589292.0, "z_loss": 0.0008595680119469762 }, { "copy_logits_max": -8.238906860351562, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.125, "epoch": 0.44074546847076845, "gen_logits_max": 6.661757469177246, "gen_logits_mean": -12.531339645385742, "gen_logits_min": -25.047962188720703, "gen_logits_std": 2.5748543739318848, "gen_loss": 0.30192768573760986, "grad_norm": 0.5157247509625131, "learning_rate": 2.820378947368421e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9916297197341919, "mean_gen_accuracy": 0.8597198724746704, "mean_token_accuracy": 0.890604242682457, "num_tokens": 584824392.0, "sample_num_tokens": 9357.5, "step": 2158, "total_num_tokens": 584861822.0, "z_loss": 0.0008142943843267858 }, { "copy_logits_max": -5.455503463745117, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.0625, "epoch": 0.44094970640796527, "gen_logits_max": 6.494688987731934, "gen_logits_mean": -12.62459659576416, "gen_logits_min": -24.734493255615234, "gen_logits_std": 2.5222935676574707, "gen_loss": 0.36285901069641113, "grad_norm": 0.4491786539172267, "learning_rate": 2.8202526315789472e-05, "loss": 0.3161, "mean_copy_accuracy": 0.992807149887085, "mean_gen_accuracy": 0.866115003824234, "mean_token_accuracy": 0.8964444100856781, "num_tokens": 585084835.0, "sample_num_tokens": 7505.75, "step": 2159, "total_num_tokens": 585114858.0, "z_loss": 0.0009499094448983669 }, { "copy_logits_max": -5.5052032470703125, "copy_logits_min": -687500032.0, "copy_num_tokens": 787.125, "epoch": 0.44115394434516214, "gen_logits_max": 6.224420547485352, "gen_logits_mean": -12.764681816101074, "gen_logits_min": -25.62562370300293, "gen_logits_std": 2.5778393745422363, "gen_loss": 0.28646209836006165, "grad_norm": 0.45832324147240944, "learning_rate": 2.8201263157894737e-05, "loss": 0.3142, "mean_copy_accuracy": 0.9927846044301987, "mean_gen_accuracy": 0.863829493522644, "mean_token_accuracy": 0.8965979069471359, "num_tokens": 585371792.0, "sample_num_tokens": 11541.5, "step": 2160, "total_num_tokens": 585417958.0, "z_loss": 0.0009501774329692125 }, { "copy_logits_max": -3.6625447273254395, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.9375, "epoch": 0.44135818228235896, "gen_logits_max": 4.760852336883545, "gen_logits_mean": -14.786352157592773, "gen_logits_min": -26.84275245666504, "gen_logits_std": 2.519930601119995, "gen_loss": 0.3020167946815491, "grad_norm": 0.45525146919914106, "learning_rate": 2.8199999999999998e-05, "loss": 0.302, "mean_copy_accuracy": 0.9944344907999039, "mean_gen_accuracy": 0.8633150011301041, "mean_token_accuracy": 0.9011102318763733, "num_tokens": 585656356.0, "sample_num_tokens": 8796.0, "step": 2161, "total_num_tokens": 585691540.0, "z_loss": 0.0008967830799520016 }, { "copy_logits_max": -8.986708641052246, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.8125, "epoch": 0.4415624202195558, "gen_logits_max": 7.0349860191345215, "gen_logits_mean": -13.042181015014648, "gen_logits_min": -25.073596954345703, "gen_logits_std": 2.5298848152160645, "gen_loss": 0.31202465295791626, "grad_norm": 0.4690180651289127, "learning_rate": 2.8198736842105265e-05, "loss": 0.3416, "mean_copy_accuracy": 0.9917884320020676, "mean_gen_accuracy": 0.8582681119441986, "mean_token_accuracy": 0.8869737684726715, "num_tokens": 585915637.0, "sample_num_tokens": 9428.25, "step": 2162, "total_num_tokens": 585953350.0, "z_loss": 0.0007740347064100206 }, { "copy_logits_max": -7.348196029663086, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.0, "epoch": 0.4417666581567526, "gen_logits_max": 7.5835981369018555, "gen_logits_mean": -12.613443374633789, "gen_logits_min": -24.925708770751953, "gen_logits_std": 2.550292491912842, "gen_loss": 0.3509015142917633, "grad_norm": 0.5115006367582802, "learning_rate": 2.819747368421053e-05, "loss": 0.3413, "mean_copy_accuracy": 0.9925608485937119, "mean_gen_accuracy": 0.8542091995477676, "mean_token_accuracy": 0.8918412029743195, "num_tokens": 586196913.0, "sample_num_tokens": 7559.25, "step": 2163, "total_num_tokens": 586227150.0, "z_loss": 0.0008714316645637155 }, { "copy_logits_max": -7.787479400634766, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.5625, "epoch": 0.44197089609394946, "gen_logits_max": 6.222652435302734, "gen_logits_mean": -12.879853248596191, "gen_logits_min": -25.147193908691406, "gen_logits_std": 2.587289571762085, "gen_loss": 0.2797348201274872, "grad_norm": 0.4363978471516168, "learning_rate": 2.819621052631579e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9930994063615799, "mean_gen_accuracy": 0.8693403899669647, "mean_token_accuracy": 0.9000222384929657, "num_tokens": 586465776.0, "sample_num_tokens": 8343.0, "step": 2164, "total_num_tokens": 586499148.0, "z_loss": 0.0007457051542587578 }, { "copy_logits_max": -5.47841739654541, "copy_logits_min": -750000128.0, "copy_num_tokens": 401.0625, "epoch": 0.4421751340311463, "gen_logits_max": 6.1117024421691895, "gen_logits_mean": -13.061382293701172, "gen_logits_min": -25.003555297851562, "gen_logits_std": 2.55477237701416, "gen_loss": 0.2905644178390503, "grad_norm": 0.5086563494016269, "learning_rate": 2.8194947368421055e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9939064681529999, "mean_gen_accuracy": 0.8674755394458771, "mean_token_accuracy": 0.8995421826839447, "num_tokens": 586730333.0, "sample_num_tokens": 7245.75, "step": 2165, "total_num_tokens": 586759316.0, "z_loss": 0.0007619895040988922 }, { "copy_logits_max": -5.739832401275635, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.9375, "epoch": 0.4423793719683431, "gen_logits_max": 5.811267852783203, "gen_logits_mean": -13.176025390625, "gen_logits_min": -26.42023468017578, "gen_logits_std": 2.596627712249756, "gen_loss": 0.2896023988723755, "grad_norm": 0.5180360691569501, "learning_rate": 2.8193684210526316e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9939065873622894, "mean_gen_accuracy": 0.8693532347679138, "mean_token_accuracy": 0.9005338549613953, "num_tokens": 587009638.0, "sample_num_tokens": 8225.0, "step": 2166, "total_num_tokens": 587042538.0, "z_loss": 0.0007957057096064091 }, { "copy_logits_max": -5.935192108154297, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.625, "epoch": 0.44258360990553997, "gen_logits_max": 5.54300594329834, "gen_logits_mean": -13.143719673156738, "gen_logits_min": -24.918487548828125, "gen_logits_std": 2.5676238536834717, "gen_loss": 0.30807310342788696, "grad_norm": 0.513465799157663, "learning_rate": 2.819242105263158e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9932677745819092, "mean_gen_accuracy": 0.8641270697116852, "mean_token_accuracy": 0.8964590281248093, "num_tokens": 587285461.0, "sample_num_tokens": 9002.25, "step": 2167, "total_num_tokens": 587321470.0, "z_loss": 0.0007175274076871574 }, { "copy_logits_max": -4.957484722137451, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.375, "epoch": 0.4427878478427368, "gen_logits_max": 5.864624977111816, "gen_logits_mean": -13.512802124023438, "gen_logits_min": -26.00225067138672, "gen_logits_std": 2.5525107383728027, "gen_loss": 0.3495211899280548, "grad_norm": 0.4799625223633645, "learning_rate": 2.819115789473684e-05, "loss": 0.3283, "mean_copy_accuracy": 0.9933106154203415, "mean_gen_accuracy": 0.8580682873725891, "mean_token_accuracy": 0.8938227891921997, "num_tokens": 587559390.0, "sample_num_tokens": 8988.5, "step": 2168, "total_num_tokens": 587595344.0, "z_loss": 0.0008322763023898005 }, { "copy_logits_max": -6.617605686187744, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.5625, "epoch": 0.4429920857799336, "gen_logits_max": 6.7020263671875, "gen_logits_mean": -13.427091598510742, "gen_logits_min": -25.80366325378418, "gen_logits_std": 2.543438196182251, "gen_loss": 0.35465967655181885, "grad_norm": 0.5052941439094891, "learning_rate": 2.8189894736842105e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9936323761940002, "mean_gen_accuracy": 0.8610375672578812, "mean_token_accuracy": 0.8928798735141754, "num_tokens": 587828981.0, "sample_num_tokens": 7378.25, "step": 2169, "total_num_tokens": 587858494.0, "z_loss": 0.0009412033250555396 }, { "copy_logits_max": -6.0404181480407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.25, "epoch": 0.44319632371713047, "gen_logits_max": 6.245673179626465, "gen_logits_mean": -13.775800704956055, "gen_logits_min": -26.415878295898438, "gen_logits_std": 2.5476365089416504, "gen_loss": 0.29490286111831665, "grad_norm": 0.42567208293362147, "learning_rate": 2.818863157894737e-05, "loss": 0.3046, "mean_copy_accuracy": 0.9943595379590988, "mean_gen_accuracy": 0.8673179596662521, "mean_token_accuracy": 0.9013719856739044, "num_tokens": 588095911.0, "sample_num_tokens": 8164.75, "step": 2170, "total_num_tokens": 588128570.0, "z_loss": 0.0007993524777702987 }, { "copy_logits_max": -5.440561294555664, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.0, "epoch": 0.4434005616543273, "gen_logits_max": 6.122859954833984, "gen_logits_mean": -13.136486053466797, "gen_logits_min": -25.870216369628906, "gen_logits_std": 2.562584638595581, "gen_loss": 0.3420538306236267, "grad_norm": 0.48485870220394284, "learning_rate": 2.8187368421052634e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9930980205535889, "mean_gen_accuracy": 0.8552141636610031, "mean_token_accuracy": 0.8900995403528214, "num_tokens": 588377797.0, "sample_num_tokens": 8440.75, "step": 2171, "total_num_tokens": 588411560.0, "z_loss": 0.0008251264225691557 }, { "copy_logits_max": -7.011421203613281, "copy_logits_min": -750000000.0, "copy_num_tokens": 254.0625, "epoch": 0.4436047995915241, "gen_logits_max": 7.622888565063477, "gen_logits_mean": -13.126486778259277, "gen_logits_min": -25.328468322753906, "gen_logits_std": 2.5705623626708984, "gen_loss": 0.3197159171104431, "grad_norm": 0.4835110634775965, "learning_rate": 2.8186105263157895e-05, "loss": 0.3363, "mean_copy_accuracy": 0.9923354387283325, "mean_gen_accuracy": 0.8620841056108475, "mean_token_accuracy": 0.8871620297431946, "num_tokens": 588611278.0, "sample_num_tokens": 6791.5, "step": 2172, "total_num_tokens": 588638444.0, "z_loss": 0.0008632721146568656 }, { "copy_logits_max": -4.6630401611328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.5, "epoch": 0.443809037528721, "gen_logits_max": 7.546213150024414, "gen_logits_mean": -11.811002731323242, "gen_logits_min": -23.973527908325195, "gen_logits_std": 2.5326364040374756, "gen_loss": 0.39222580194473267, "grad_norm": 0.41949148169843675, "learning_rate": 2.818484210526316e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9935662299394608, "mean_gen_accuracy": 0.8564026951789856, "mean_token_accuracy": 0.8897151798009872, "num_tokens": 588883377.0, "sample_num_tokens": 8308.25, "step": 2173, "total_num_tokens": 588916610.0, "z_loss": 0.0009179719490930438 }, { "copy_logits_max": -6.183077335357666, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.3125, "epoch": 0.4440132754659178, "gen_logits_max": 6.119380950927734, "gen_logits_mean": -14.017900466918945, "gen_logits_min": -25.798809051513672, "gen_logits_std": 2.520261287689209, "gen_loss": 0.35444176197052, "grad_norm": 0.5857657019357957, "learning_rate": 2.818357894736842e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9941347390413284, "mean_gen_accuracy": 0.8638876080513, "mean_token_accuracy": 0.8953716456890106, "num_tokens": 589160159.0, "sample_num_tokens": 7563.75, "step": 2174, "total_num_tokens": 589190414.0, "z_loss": 0.0008286192314699292 }, { "copy_logits_max": -6.601649284362793, "copy_logits_min": -687500032.0, "copy_num_tokens": 673.25, "epoch": 0.4442175134031146, "gen_logits_max": 5.917463779449463, "gen_logits_mean": -12.902166366577148, "gen_logits_min": -26.071731567382812, "gen_logits_std": 2.604721784591675, "gen_loss": 0.3016359806060791, "grad_norm": 0.4266162634477666, "learning_rate": 2.8182315789473684e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9954966902732849, "mean_gen_accuracy": 0.862404853105545, "mean_token_accuracy": 0.8966509401798248, "num_tokens": 589428607.0, "sample_num_tokens": 10188.25, "step": 2175, "total_num_tokens": 589469360.0, "z_loss": 0.000748427351936698 }, { "copy_logits_max": -7.387948036193848, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.625, "epoch": 0.4444217513403115, "gen_logits_max": 6.729100227355957, "gen_logits_mean": -12.816987037658691, "gen_logits_min": -25.722009658813477, "gen_logits_std": 2.5985922813415527, "gen_loss": 0.29772740602493286, "grad_norm": 0.4367202742048547, "learning_rate": 2.818105263157895e-05, "loss": 0.313, "mean_copy_accuracy": 0.9933404624462128, "mean_gen_accuracy": 0.8649446964263916, "mean_token_accuracy": 0.8969035446643829, "num_tokens": 589707409.0, "sample_num_tokens": 8330.75, "step": 2176, "total_num_tokens": 589740732.0, "z_loss": 0.0007571184542030096 }, { "copy_logits_max": -6.269471168518066, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.1875, "epoch": 0.4446259892775083, "gen_logits_max": 7.4483208656311035, "gen_logits_mean": -12.146677017211914, "gen_logits_min": -24.968463897705078, "gen_logits_std": 2.5792922973632812, "gen_loss": 0.3369189500808716, "grad_norm": 0.48135003858441305, "learning_rate": 2.817978947368421e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9929983764886856, "mean_gen_accuracy": 0.8598615676164627, "mean_token_accuracy": 0.8948144763708115, "num_tokens": 589973517.0, "sample_num_tokens": 8445.25, "step": 2177, "total_num_tokens": 590007298.0, "z_loss": 0.0008832982275635004 }, { "copy_logits_max": -4.8505096435546875, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.875, "epoch": 0.4448302272147051, "gen_logits_max": 6.783134937286377, "gen_logits_mean": -11.957237243652344, "gen_logits_min": -24.8865966796875, "gen_logits_std": 2.5861053466796875, "gen_loss": 0.31404587626457214, "grad_norm": 0.5257443306698452, "learning_rate": 2.8178526315789474e-05, "loss": 0.3444, "mean_copy_accuracy": 0.991436168551445, "mean_gen_accuracy": 0.8563854247331619, "mean_token_accuracy": 0.8855575621128082, "num_tokens": 590224539.0, "sample_num_tokens": 8754.75, "step": 2178, "total_num_tokens": 590259558.0, "z_loss": 0.0009650495485402644 }, { "copy_logits_max": -5.338481903076172, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.5625, "epoch": 0.445034465151902, "gen_logits_max": 6.873995780944824, "gen_logits_mean": -12.877130508422852, "gen_logits_min": -26.125646591186523, "gen_logits_std": 2.609166145324707, "gen_loss": 0.3137694299221039, "grad_norm": 1.7233060241586449, "learning_rate": 2.817726315789474e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9937550723552704, "mean_gen_accuracy": 0.8659840524196625, "mean_token_accuracy": 0.8960042148828506, "num_tokens": 590482330.0, "sample_num_tokens": 6338.0, "step": 2179, "total_num_tokens": 590507682.0, "z_loss": 0.0009696550550870597 }, { "copy_logits_max": -6.859132766723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 687.625, "epoch": 0.4452387030890988, "gen_logits_max": 7.007730484008789, "gen_logits_mean": -11.758134841918945, "gen_logits_min": -24.368377685546875, "gen_logits_std": 2.5617218017578125, "gen_loss": 0.3096659481525421, "grad_norm": 0.5042155848410589, "learning_rate": 2.8176000000000003e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9941435754299164, "mean_gen_accuracy": 0.8691696971654892, "mean_token_accuracy": 0.905238151550293, "num_tokens": 590764938.0, "sample_num_tokens": 9950.0, "step": 2180, "total_num_tokens": 590804738.0, "z_loss": 0.0008893199265003204 }, { "copy_logits_max": -7.043009281158447, "copy_logits_min": -750000064.0, "copy_num_tokens": 498.0625, "epoch": 0.4454429410262956, "gen_logits_max": 6.3869452476501465, "gen_logits_mean": -13.175931930541992, "gen_logits_min": -26.000993728637695, "gen_logits_std": 2.5875844955444336, "gen_loss": 0.29246556758880615, "grad_norm": 0.44016644884459827, "learning_rate": 2.8174736842105264e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9936774373054504, "mean_gen_accuracy": 0.8637227565050125, "mean_token_accuracy": 0.8980784714221954, "num_tokens": 591028413.0, "sample_num_tokens": 8207.75, "step": 2181, "total_num_tokens": 591061244.0, "z_loss": 0.0008601470617577434 }, { "copy_logits_max": -8.04587173461914, "copy_logits_min": -750000064.0, "copy_num_tokens": 378.375, "epoch": 0.4456471789634925, "gen_logits_max": 6.895986557006836, "gen_logits_mean": -14.321569442749023, "gen_logits_min": -26.18459129333496, "gen_logits_std": 2.5346574783325195, "gen_loss": 0.3416416645050049, "grad_norm": 0.4835929801469315, "learning_rate": 2.8173473684210528e-05, "loss": 0.3365, "mean_copy_accuracy": 0.9935525953769684, "mean_gen_accuracy": 0.8556123375892639, "mean_token_accuracy": 0.8899344652891159, "num_tokens": 591316218.0, "sample_num_tokens": 8309.5, "step": 2182, "total_num_tokens": 591349456.0, "z_loss": 0.0008904972346499562 }, { "copy_logits_max": -7.01529598236084, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.9375, "epoch": 0.4458514169006893, "gen_logits_max": 6.726095199584961, "gen_logits_mean": -13.23892593383789, "gen_logits_min": -25.33592414855957, "gen_logits_std": 2.5373013019561768, "gen_loss": 0.3104487657546997, "grad_norm": 0.5064983024614809, "learning_rate": 2.817221052631579e-05, "loss": 0.3334, "mean_copy_accuracy": 0.990730494260788, "mean_gen_accuracy": 0.8609810918569565, "mean_token_accuracy": 0.8912535756826401, "num_tokens": 591579791.0, "sample_num_tokens": 9583.25, "step": 2183, "total_num_tokens": 591618124.0, "z_loss": 0.0008418296347372234 }, { "copy_logits_max": -6.5608625411987305, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0625, "epoch": 0.4460556548378861, "gen_logits_max": 6.219023704528809, "gen_logits_mean": -13.987255096435547, "gen_logits_min": -25.904178619384766, "gen_logits_std": 2.5311203002929688, "gen_loss": 0.3316524624824524, "grad_norm": 0.4809394649927597, "learning_rate": 2.8170947368421053e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9939890950918198, "mean_gen_accuracy": 0.8560707718133926, "mean_token_accuracy": 0.8913499116897583, "num_tokens": 591855019.0, "sample_num_tokens": 7921.25, "step": 2184, "total_num_tokens": 591886704.0, "z_loss": 0.000821646477561444 }, { "copy_logits_max": -8.41214656829834, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.4375, "epoch": 0.446259892775083, "gen_logits_max": 7.474740982055664, "gen_logits_mean": -12.29255485534668, "gen_logits_min": -24.70413589477539, "gen_logits_std": 2.5595414638519287, "gen_loss": 0.3013673424720764, "grad_norm": 0.5644907939820192, "learning_rate": 2.8169684210526314e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9936833679676056, "mean_gen_accuracy": 0.8636486381292343, "mean_token_accuracy": 0.8957590460777283, "num_tokens": 592121727.0, "sample_num_tokens": 9320.25, "step": 2185, "total_num_tokens": 592159008.0, "z_loss": 0.0007988325669430196 }, { "copy_logits_max": -8.043654441833496, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.625, "epoch": 0.4464641307122798, "gen_logits_max": 7.334997177124023, "gen_logits_mean": -12.754981994628906, "gen_logits_min": -24.760631561279297, "gen_logits_std": 2.5459728240966797, "gen_loss": 0.29988130927085876, "grad_norm": 0.4369150588519981, "learning_rate": 2.816842105263158e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9924336373806, "mean_gen_accuracy": 0.8670462518930435, "mean_token_accuracy": 0.898162916302681, "num_tokens": 592394430.0, "sample_num_tokens": 8312.5, "step": 2186, "total_num_tokens": 592427680.0, "z_loss": 0.0007705283351242542 }, { "copy_logits_max": -5.81455659866333, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5625, "epoch": 0.4466683686494766, "gen_logits_max": 5.745542049407959, "gen_logits_mean": -14.319381713867188, "gen_logits_min": -25.94843101501465, "gen_logits_std": 2.498831272125244, "gen_loss": 0.3100733458995819, "grad_norm": 0.5027058895480152, "learning_rate": 2.8167157894736843e-05, "loss": 0.323, "mean_copy_accuracy": 0.993355467915535, "mean_gen_accuracy": 0.8566843420267105, "mean_token_accuracy": 0.8932516723871231, "num_tokens": 592665975.0, "sample_num_tokens": 8170.25, "step": 2187, "total_num_tokens": 592698656.0, "z_loss": 0.0008190093794837594 }, { "copy_logits_max": -6.772399425506592, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.3125, "epoch": 0.4468726065866735, "gen_logits_max": 7.247531890869141, "gen_logits_mean": -14.083212852478027, "gen_logits_min": -25.450340270996094, "gen_logits_std": 2.469489574432373, "gen_loss": 0.30204272270202637, "grad_norm": 0.4146677921247891, "learning_rate": 2.8165894736842107e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9935044199228287, "mean_gen_accuracy": 0.870201051235199, "mean_token_accuracy": 0.8971021771430969, "num_tokens": 592935639.0, "sample_num_tokens": 7885.75, "step": 2188, "total_num_tokens": 592967182.0, "z_loss": 0.0007852568523958325 }, { "copy_logits_max": -5.809186935424805, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.5625, "epoch": 0.4470768445238703, "gen_logits_max": 6.548802852630615, "gen_logits_mean": -13.159370422363281, "gen_logits_min": -24.749004364013672, "gen_logits_std": 2.5456295013427734, "gen_loss": 0.2840704321861267, "grad_norm": 0.48001281635384013, "learning_rate": 2.816463157894737e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9935100376605988, "mean_gen_accuracy": 0.8612921684980392, "mean_token_accuracy": 0.898206815123558, "num_tokens": 593205373.0, "sample_num_tokens": 8866.75, "step": 2189, "total_num_tokens": 593240840.0, "z_loss": 0.0007378771551884711 }, { "copy_logits_max": -3.7043542861938477, "copy_logits_min": -750000064.0, "copy_num_tokens": 524.5, "epoch": 0.4472810824610671, "gen_logits_max": 8.315906524658203, "gen_logits_mean": -11.53542709350586, "gen_logits_min": -23.630962371826172, "gen_logits_std": 2.522587776184082, "gen_loss": 0.35180380940437317, "grad_norm": 0.4821587938969655, "learning_rate": 2.8163368421052632e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9929264336824417, "mean_gen_accuracy": 0.8633901625871658, "mean_token_accuracy": 0.8953838050365448, "num_tokens": 593469264.0, "sample_num_tokens": 9050.5, "step": 2190, "total_num_tokens": 593505466.0, "z_loss": 0.0009037330164574087 }, { "copy_logits_max": -7.205419063568115, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.6875, "epoch": 0.447485320398264, "gen_logits_max": 6.541715621948242, "gen_logits_mean": -14.023293495178223, "gen_logits_min": -25.467079162597656, "gen_logits_std": 2.496103286743164, "gen_loss": 0.30730897188186646, "grad_norm": 0.479637016525561, "learning_rate": 2.8162105263157897e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9933264255523682, "mean_gen_accuracy": 0.8718447536230087, "mean_token_accuracy": 0.8964664787054062, "num_tokens": 593741702.0, "sample_num_tokens": 7445.5, "step": 2191, "total_num_tokens": 593771484.0, "z_loss": 0.0008462491678074002 }, { "copy_logits_max": -6.376668930053711, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.9375, "epoch": 0.4476895583354608, "gen_logits_max": 7.373632431030273, "gen_logits_mean": -11.875080108642578, "gen_logits_min": -23.276809692382812, "gen_logits_std": 2.4594383239746094, "gen_loss": 0.3651166558265686, "grad_norm": 0.45843613258352944, "learning_rate": 2.8160842105263157e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9945429712533951, "mean_gen_accuracy": 0.8600400388240814, "mean_token_accuracy": 0.8935590833425522, "num_tokens": 594024183.0, "sample_num_tokens": 8519.75, "step": 2192, "total_num_tokens": 594058262.0, "z_loss": 0.0009953966364264488 }, { "copy_logits_max": -5.095216274261475, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.4375, "epoch": 0.44789379627265763, "gen_logits_max": 7.940944194793701, "gen_logits_mean": -10.987844467163086, "gen_logits_min": -22.66427993774414, "gen_logits_std": 2.523569345474243, "gen_loss": 0.3907742500305176, "grad_norm": 0.5531530806782173, "learning_rate": 2.8159578947368422e-05, "loss": 0.3538, "mean_copy_accuracy": 0.9921649396419525, "mean_gen_accuracy": 0.8465804308652878, "mean_token_accuracy": 0.8856855630874634, "num_tokens": 594307430.0, "sample_num_tokens": 7308.0, "step": 2193, "total_num_tokens": 594336662.0, "z_loss": 0.001133852987550199 }, { "copy_logits_max": -5.716377258300781, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.8125, "epoch": 0.4480980342098545, "gen_logits_max": 5.897101879119873, "gen_logits_mean": -13.504877090454102, "gen_logits_min": -25.41912841796875, "gen_logits_std": 2.556072235107422, "gen_loss": 0.27127593755722046, "grad_norm": 0.44948963732027614, "learning_rate": 2.8158315789473683e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9940423220396042, "mean_gen_accuracy": 0.8662363737821579, "mean_token_accuracy": 0.8975169956684113, "num_tokens": 594573547.0, "sample_num_tokens": 9208.25, "step": 2194, "total_num_tokens": 594610380.0, "z_loss": 0.0008386122062802315 }, { "copy_logits_max": -3.944901943206787, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.5, "epoch": 0.4483022721470513, "gen_logits_max": 6.100698471069336, "gen_logits_mean": -11.868890762329102, "gen_logits_min": -23.5676212310791, "gen_logits_std": 2.5564794540405273, "gen_loss": 0.29216402769088745, "grad_norm": 0.5244831238831641, "learning_rate": 2.815705263157895e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9918776750564575, "mean_gen_accuracy": 0.8614023178815842, "mean_token_accuracy": 0.8921572417020798, "num_tokens": 594825824.0, "sample_num_tokens": 9001.5, "step": 2195, "total_num_tokens": 594861830.0, "z_loss": 0.0008889532764442265 }, { "copy_logits_max": -3.9568533897399902, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.8125, "epoch": 0.44850651008424813, "gen_logits_max": 6.11434268951416, "gen_logits_mean": -13.18283748626709, "gen_logits_min": -25.414968490600586, "gen_logits_std": 2.5986008644104004, "gen_loss": 0.3230011463165283, "grad_norm": 0.44714842851835107, "learning_rate": 2.815578947368421e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9929004162549973, "mean_gen_accuracy": 0.8614023923873901, "mean_token_accuracy": 0.8934755921363831, "num_tokens": 595093448.0, "sample_num_tokens": 7769.0, "step": 2196, "total_num_tokens": 595124524.0, "z_loss": 0.0009043177124112844 }, { "copy_logits_max": -6.973803520202637, "copy_logits_min": -750000064.0, "copy_num_tokens": 340.875, "epoch": 0.448710748021445, "gen_logits_max": 7.06282377243042, "gen_logits_mean": -13.071575164794922, "gen_logits_min": -25.061681747436523, "gen_logits_std": 2.533761501312256, "gen_loss": 0.32678160071372986, "grad_norm": 0.5434325834448365, "learning_rate": 2.8154526315789476e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9925184547901154, "mean_gen_accuracy": 0.8582797050476074, "mean_token_accuracy": 0.8909252732992172, "num_tokens": 595348570.0, "sample_num_tokens": 7567.0, "step": 2197, "total_num_tokens": 595378838.0, "z_loss": 0.0008261112961918116 }, { "copy_logits_max": -7.08903169631958, "copy_logits_min": -750000064.0, "copy_num_tokens": 501.5625, "epoch": 0.4489149859586418, "gen_logits_max": 6.471753120422363, "gen_logits_mean": -13.190913200378418, "gen_logits_min": -24.723054885864258, "gen_logits_std": 2.517855167388916, "gen_loss": 0.36703282594680786, "grad_norm": 0.4594397372995301, "learning_rate": 2.8153263157894737e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9946193993091583, "mean_gen_accuracy": 0.8633540272712708, "mean_token_accuracy": 0.8978982120752335, "num_tokens": 595630895.0, "sample_num_tokens": 9048.75, "step": 2198, "total_num_tokens": 595667090.0, "z_loss": 0.0009007417829707265 }, { "copy_logits_max": -6.328577995300293, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.625, "epoch": 0.44911922389583864, "gen_logits_max": 7.111399173736572, "gen_logits_mean": -12.75704574584961, "gen_logits_min": -24.382659912109375, "gen_logits_std": 2.5182714462280273, "gen_loss": 0.34758222103118896, "grad_norm": 0.46020701821548565, "learning_rate": 2.8152e-05, "loss": 0.3259, "mean_copy_accuracy": 0.992926150560379, "mean_gen_accuracy": 0.8596805781126022, "mean_token_accuracy": 0.8917873799800873, "num_tokens": 595906663.0, "sample_num_tokens": 8539.75, "step": 2199, "total_num_tokens": 595940822.0, "z_loss": 0.0009591617854312062 }, { "copy_logits_max": -3.573322057723999, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.75, "epoch": 0.4493234618330355, "gen_logits_max": 4.923095226287842, "gen_logits_mean": -14.696430206298828, "gen_logits_min": -26.549266815185547, "gen_logits_std": 2.561771869659424, "gen_loss": 0.2904529571533203, "grad_norm": 0.43061655187063863, "learning_rate": 2.8150736842105262e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9945210665464401, "mean_gen_accuracy": 0.858683779835701, "mean_token_accuracy": 0.8939165771007538, "num_tokens": 596167604.0, "sample_num_tokens": 8409.0, "step": 2200, "total_num_tokens": 596201240.0, "z_loss": 0.0008606712217442691 }, { "copy_logits_max": -5.847559452056885, "copy_logits_min": -625000064.0, "copy_num_tokens": 596.0625, "epoch": 0.4495276997702323, "gen_logits_max": 6.700486660003662, "gen_logits_mean": -12.140600204467773, "gen_logits_min": -24.55352783203125, "gen_logits_std": 2.6421446800231934, "gen_loss": 0.31919533014297485, "grad_norm": 0.9083464683313922, "learning_rate": 2.8149473684210526e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9935660511255264, "mean_gen_accuracy": 0.8634890764951706, "mean_token_accuracy": 0.8991410881280899, "num_tokens": 596472958.0, "sample_num_tokens": 9527.5, "step": 2201, "total_num_tokens": 596511068.0, "z_loss": 0.0010613389313220978 }, { "copy_logits_max": -5.655061721801758, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.75, "epoch": 0.44973193770742914, "gen_logits_max": 6.431324481964111, "gen_logits_mean": -12.98916244506836, "gen_logits_min": -25.107295989990234, "gen_logits_std": 2.6138503551483154, "gen_loss": 0.3137269616127014, "grad_norm": 0.46298528289792323, "learning_rate": 2.814821052631579e-05, "loss": 0.3283, "mean_copy_accuracy": 0.9924947321414948, "mean_gen_accuracy": 0.8614831566810608, "mean_token_accuracy": 0.8928678929805756, "num_tokens": 596749253.0, "sample_num_tokens": 9542.25, "step": 2202, "total_num_tokens": 596787422.0, "z_loss": 0.0010736348340287805 }, { "copy_logits_max": -4.202517032623291, "copy_logits_min": -625000064.0, "copy_num_tokens": 614.5, "epoch": 0.449936175644626, "gen_logits_max": 5.029716491699219, "gen_logits_mean": -14.249444007873535, "gen_logits_min": -26.340085983276367, "gen_logits_std": 2.566586971282959, "gen_loss": 0.31761160492897034, "grad_norm": 0.5271737587528598, "learning_rate": 2.8146947368421055e-05, "loss": 0.3071, "mean_copy_accuracy": 0.994618684053421, "mean_gen_accuracy": 0.863018199801445, "mean_token_accuracy": 0.8991985470056534, "num_tokens": 597040018.0, "sample_num_tokens": 9323.5, "step": 2203, "total_num_tokens": 597077312.0, "z_loss": 0.0010378229198977351 }, { "copy_logits_max": -7.403049468994141, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.375, "epoch": 0.45014041358182283, "gen_logits_max": 6.683708667755127, "gen_logits_mean": -13.064111709594727, "gen_logits_min": -24.709522247314453, "gen_logits_std": 2.5561201572418213, "gen_loss": 0.36412501335144043, "grad_norm": 0.4448097108429741, "learning_rate": 2.814568421052632e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9931103587150574, "mean_gen_accuracy": 0.8651215881109238, "mean_token_accuracy": 0.8981070518493652, "num_tokens": 597320737.0, "sample_num_tokens": 8337.25, "step": 2204, "total_num_tokens": 597354086.0, "z_loss": 0.0009532740223221481 }, { "copy_logits_max": -7.330137252807617, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.125, "epoch": 0.45034465151901965, "gen_logits_max": 6.297430038452148, "gen_logits_mean": -12.567590713500977, "gen_logits_min": -24.392812728881836, "gen_logits_std": 2.5847136974334717, "gen_loss": 0.3284916877746582, "grad_norm": 0.46931158113720783, "learning_rate": 2.814442105263158e-05, "loss": 0.349, "mean_copy_accuracy": 0.993591845035553, "mean_gen_accuracy": 0.8480197936296463, "mean_token_accuracy": 0.8872046619653702, "num_tokens": 597594387.0, "sample_num_tokens": 9445.25, "step": 2205, "total_num_tokens": 597632168.0, "z_loss": 0.0008888209122233093 }, { "copy_logits_max": -9.675471305847168, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.25, "epoch": 0.4505488894562165, "gen_logits_max": 6.360954761505127, "gen_logits_mean": -14.151175498962402, "gen_logits_min": -25.616025924682617, "gen_logits_std": 2.5310988426208496, "gen_loss": 0.2941466271877289, "grad_norm": 0.4018993306580652, "learning_rate": 2.8143157894736844e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9937532693147659, "mean_gen_accuracy": 0.8768017590045929, "mean_token_accuracy": 0.9052838981151581, "num_tokens": 597880046.0, "sample_num_tokens": 7702.0, "step": 2206, "total_num_tokens": 597910854.0, "z_loss": 0.000731874315533787 }, { "copy_logits_max": -9.07208251953125, "copy_logits_min": -750000000.0, "copy_num_tokens": 235.375, "epoch": 0.45075312739341333, "gen_logits_max": 6.300193786621094, "gen_logits_mean": -14.241121292114258, "gen_logits_min": -25.490161895751953, "gen_logits_std": 2.518409490585327, "gen_loss": 0.308834046125412, "grad_norm": 0.5099216220417787, "learning_rate": 2.8141894736842105e-05, "loss": 0.3332, "mean_copy_accuracy": 0.9937191009521484, "mean_gen_accuracy": 0.8622523099184036, "mean_token_accuracy": 0.8895858377218246, "num_tokens": 598145556.0, "sample_num_tokens": 6959.5, "step": 2207, "total_num_tokens": 598173394.0, "z_loss": 0.0007331318920478225 }, { "copy_logits_max": -8.872995376586914, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.625, "epoch": 0.45095736533061015, "gen_logits_max": 6.422222137451172, "gen_logits_mean": -13.183996200561523, "gen_logits_min": -24.89422607421875, "gen_logits_std": 2.5832552909851074, "gen_loss": 0.3421669006347656, "grad_norm": 0.47666318162903915, "learning_rate": 2.814063157894737e-05, "loss": 0.333, "mean_copy_accuracy": 0.9927505105733871, "mean_gen_accuracy": 0.8559319227933884, "mean_token_accuracy": 0.8911292403936386, "num_tokens": 598406567.0, "sample_num_tokens": 7484.75, "step": 2208, "total_num_tokens": 598436506.0, "z_loss": 0.0008234845008701086 }, { "copy_logits_max": -6.9418792724609375, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.6875, "epoch": 0.451161603267807, "gen_logits_max": 6.560729026794434, "gen_logits_mean": -13.156319618225098, "gen_logits_min": -25.204116821289062, "gen_logits_std": 2.594221353530884, "gen_loss": 0.29876086115837097, "grad_norm": 0.45280547361553164, "learning_rate": 2.813936842105263e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9920317381620407, "mean_gen_accuracy": 0.8649227619171143, "mean_token_accuracy": 0.8956422507762909, "num_tokens": 598669114.0, "sample_num_tokens": 9123.5, "step": 2209, "total_num_tokens": 598705608.0, "z_loss": 0.0007803812623023987 }, { "copy_logits_max": -6.964382648468018, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.5625, "epoch": 0.45136584120500384, "gen_logits_max": 6.264519691467285, "gen_logits_mean": -14.426351547241211, "gen_logits_min": -25.8758487701416, "gen_logits_std": 2.496999979019165, "gen_loss": 0.34178459644317627, "grad_norm": 0.46103722278695364, "learning_rate": 2.8138105263157895e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9927607774734497, "mean_gen_accuracy": 0.8608562499284744, "mean_token_accuracy": 0.8935060203075409, "num_tokens": 598916927.0, "sample_num_tokens": 8404.75, "step": 2210, "total_num_tokens": 598950546.0, "z_loss": 0.00089022028259933 }, { "copy_logits_max": -4.06549072265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 796.5, "epoch": 0.45157007914220065, "gen_logits_max": 6.129669666290283, "gen_logits_mean": -12.723600387573242, "gen_logits_min": -25.466238021850586, "gen_logits_std": 2.6368567943573, "gen_loss": 0.2840126156806946, "grad_norm": 0.46307277337668007, "learning_rate": 2.813684210526316e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9921756833791733, "mean_gen_accuracy": 0.8626080304384232, "mean_token_accuracy": 0.8955419361591339, "num_tokens": 599172603.0, "sample_num_tokens": 9635.25, "step": 2211, "total_num_tokens": 599211144.0, "z_loss": 0.0009179237531498075 }, { "copy_logits_max": -6.633123874664307, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.125, "epoch": 0.4517743170793975, "gen_logits_max": 8.45964241027832, "gen_logits_mean": -10.79935073852539, "gen_logits_min": -23.073068618774414, "gen_logits_std": 2.5984549522399902, "gen_loss": 0.40279853343963623, "grad_norm": 0.4731159636452208, "learning_rate": 2.8135578947368423e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9934205859899521, "mean_gen_accuracy": 0.8549835830926895, "mean_token_accuracy": 0.8918554037809372, "num_tokens": 599476876.0, "sample_num_tokens": 8189.0, "step": 2212, "total_num_tokens": 599509632.0, "z_loss": 0.0011690877145156264 }, { "copy_logits_max": -6.3552656173706055, "copy_logits_min": -750000128.0, "copy_num_tokens": 597.8125, "epoch": 0.45197855501659434, "gen_logits_max": 7.385588645935059, "gen_logits_mean": -11.585227966308594, "gen_logits_min": -23.541465759277344, "gen_logits_std": 2.602860450744629, "gen_loss": 0.30082929134368896, "grad_norm": 0.49463123308475615, "learning_rate": 2.8134315789473684e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9944457560777664, "mean_gen_accuracy": 0.8670498877763748, "mean_token_accuracy": 0.9005632698535919, "num_tokens": 599753987.0, "sample_num_tokens": 9883.25, "step": 2213, "total_num_tokens": 599793520.0, "z_loss": 0.0009464622708037496 }, { "copy_logits_max": -6.5893425941467285, "copy_logits_min": -687500032.0, "copy_num_tokens": 507.0, "epoch": 0.45218279295379116, "gen_logits_max": 6.567779064178467, "gen_logits_mean": -12.769699096679688, "gen_logits_min": -25.501550674438477, "gen_logits_std": 2.602473258972168, "gen_loss": 0.3076893091201782, "grad_norm": 0.4616282207617182, "learning_rate": 2.813305263157895e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9925853461027145, "mean_gen_accuracy": 0.8527974188327789, "mean_token_accuracy": 0.8881545960903168, "num_tokens": 600029963.0, "sample_num_tokens": 8185.75, "step": 2214, "total_num_tokens": 600062706.0, "z_loss": 0.0009385676239617169 }, { "copy_logits_max": -6.541077136993408, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.6875, "epoch": 0.452387030890988, "gen_logits_max": 7.292049407958984, "gen_logits_mean": -13.00043773651123, "gen_logits_min": -24.76138687133789, "gen_logits_std": 2.5644750595092773, "gen_loss": 0.30276110768318176, "grad_norm": 0.4658626933922638, "learning_rate": 2.813178947368421e-05, "loss": 0.3259, "mean_copy_accuracy": 0.9932944476604462, "mean_gen_accuracy": 0.8624309301376343, "mean_token_accuracy": 0.895080029964447, "num_tokens": 600299473.0, "sample_num_tokens": 8717.75, "step": 2215, "total_num_tokens": 600334344.0, "z_loss": 0.0008075740770436823 }, { "copy_logits_max": -5.76363468170166, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.875, "epoch": 0.45259126882818485, "gen_logits_max": 6.454517364501953, "gen_logits_mean": -12.873610496520996, "gen_logits_min": -24.73929214477539, "gen_logits_std": 2.5901389122009277, "gen_loss": 0.3873101472854614, "grad_norm": 0.4106344490866021, "learning_rate": 2.8130526315789474e-05, "loss": 0.3262, "mean_copy_accuracy": 0.993240088224411, "mean_gen_accuracy": 0.8586247712373734, "mean_token_accuracy": 0.8911838084459305, "num_tokens": 600586605.0, "sample_num_tokens": 8795.25, "step": 2216, "total_num_tokens": 600621786.0, "z_loss": 0.0009184944210574031 }, { "copy_logits_max": -7.466094017028809, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.125, "epoch": 0.45279550676538166, "gen_logits_max": 5.822756290435791, "gen_logits_mean": -14.072942733764648, "gen_logits_min": -25.75448226928711, "gen_logits_std": 2.5912609100341797, "gen_loss": 0.3221116364002228, "grad_norm": 0.5181000039963399, "learning_rate": 2.8129263157894738e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9950265288352966, "mean_gen_accuracy": 0.8603228777647018, "mean_token_accuracy": 0.891865462064743, "num_tokens": 600840603.0, "sample_num_tokens": 8870.25, "step": 2217, "total_num_tokens": 600876084.0, "z_loss": 0.000806683034170419 }, { "copy_logits_max": -9.454826354980469, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.6875, "epoch": 0.4529997447025785, "gen_logits_max": 6.675029277801514, "gen_logits_mean": -13.557952880859375, "gen_logits_min": -25.285995483398438, "gen_logits_std": 2.580230236053467, "gen_loss": 0.3259298801422119, "grad_norm": 0.5085411034725061, "learning_rate": 2.8128e-05, "loss": 0.3295, "mean_copy_accuracy": 0.9914993643760681, "mean_gen_accuracy": 0.8649693429470062, "mean_token_accuracy": 0.8904644399881363, "num_tokens": 601091014.0, "sample_num_tokens": 7807.0, "step": 2218, "total_num_tokens": 601122242.0, "z_loss": 0.0007305443286895752 }, { "copy_logits_max": -7.7969183921813965, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.0625, "epoch": 0.45320398263977535, "gen_logits_max": 5.706029891967773, "gen_logits_mean": -14.024556159973145, "gen_logits_min": -26.091041564941406, "gen_logits_std": 2.59395170211792, "gen_loss": 0.30736249685287476, "grad_norm": 0.5497529302237982, "learning_rate": 2.8126736842105267e-05, "loss": 0.343, "mean_copy_accuracy": 0.991229459643364, "mean_gen_accuracy": 0.8561804890632629, "mean_token_accuracy": 0.8885046988725662, "num_tokens": 601366289.0, "sample_num_tokens": 8845.75, "step": 2219, "total_num_tokens": 601401672.0, "z_loss": 0.0007101488299667835 }, { "copy_logits_max": -5.943948745727539, "copy_logits_min": -687500032.0, "copy_num_tokens": 664.625, "epoch": 0.45340822057697217, "gen_logits_max": 5.964808940887451, "gen_logits_mean": -13.363697052001953, "gen_logits_min": -25.633821487426758, "gen_logits_std": 2.630720615386963, "gen_loss": 0.3093045949935913, "grad_norm": 0.4851999352801913, "learning_rate": 2.8125473684210528e-05, "loss": 0.32, "mean_copy_accuracy": 0.9944590926170349, "mean_gen_accuracy": 0.8584716022014618, "mean_token_accuracy": 0.8956421166658401, "num_tokens": 601633972.0, "sample_num_tokens": 9014.5, "step": 2220, "total_num_tokens": 601670030.0, "z_loss": 0.0008569313213229179 }, { "copy_logits_max": -8.892663955688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.8125, "epoch": 0.453612458514169, "gen_logits_max": 7.069513320922852, "gen_logits_mean": -12.305801391601562, "gen_logits_min": -24.139835357666016, "gen_logits_std": 2.6477456092834473, "gen_loss": 0.3595064878463745, "grad_norm": 0.5023388306773513, "learning_rate": 2.8124210526315792e-05, "loss": 0.3299, "mean_copy_accuracy": 0.9933600574731827, "mean_gen_accuracy": 0.8594779670238495, "mean_token_accuracy": 0.8922619074583054, "num_tokens": 601904699.0, "sample_num_tokens": 8020.25, "step": 2221, "total_num_tokens": 601936780.0, "z_loss": 0.0009468544158153236 }, { "copy_logits_max": -5.3105010986328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.375, "epoch": 0.45381669645136585, "gen_logits_max": 6.589303493499756, "gen_logits_mean": -13.43199348449707, "gen_logits_min": -25.232662200927734, "gen_logits_std": 2.5850348472595215, "gen_loss": 0.3404550552368164, "grad_norm": 0.4396465407639076, "learning_rate": 2.8122947368421053e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9939016103744507, "mean_gen_accuracy": 0.8696164041757584, "mean_token_accuracy": 0.8994741886854172, "num_tokens": 602177130.0, "sample_num_tokens": 9307.0, "step": 2222, "total_num_tokens": 602214358.0, "z_loss": 0.0009333903435617685 }, { "copy_logits_max": -7.117138385772705, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.1875, "epoch": 0.45402093438856267, "gen_logits_max": 5.465838432312012, "gen_logits_mean": -14.821287155151367, "gen_logits_min": -26.414226531982422, "gen_logits_std": 2.550436496734619, "gen_loss": 0.2941092848777771, "grad_norm": 0.46203616909720197, "learning_rate": 2.8121684210526317e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9924243837594986, "mean_gen_accuracy": 0.8623288720846176, "mean_token_accuracy": 0.8937737941741943, "num_tokens": 602448571.0, "sample_num_tokens": 8225.75, "step": 2223, "total_num_tokens": 602481474.0, "z_loss": 0.0007503622910007834 }, { "copy_logits_max": -6.813174247741699, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.0625, "epoch": 0.4542251723257595, "gen_logits_max": 5.827444076538086, "gen_logits_mean": -12.388399124145508, "gen_logits_min": -24.459428787231445, "gen_logits_std": 2.58976411819458, "gen_loss": 0.32873594760894775, "grad_norm": 0.44221724026913356, "learning_rate": 2.8120421052631578e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9921866655349731, "mean_gen_accuracy": 0.8566958755254745, "mean_token_accuracy": 0.886903390288353, "num_tokens": 602727844.0, "sample_num_tokens": 10477.5, "step": 2224, "total_num_tokens": 602769754.0, "z_loss": 0.0009154313011094928 }, { "copy_logits_max": -7.348022937774658, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.75, "epoch": 0.45442941026295636, "gen_logits_max": 5.886354446411133, "gen_logits_mean": -12.78300666809082, "gen_logits_min": -25.105663299560547, "gen_logits_std": 2.6197669506073, "gen_loss": 0.2727508842945099, "grad_norm": 0.46320781174842335, "learning_rate": 2.8119157894736842e-05, "loss": 0.326, "mean_copy_accuracy": 0.9944941848516464, "mean_gen_accuracy": 0.8632723689079285, "mean_token_accuracy": 0.8914360404014587, "num_tokens": 602993064.0, "sample_num_tokens": 9082.0, "step": 2225, "total_num_tokens": 603029392.0, "z_loss": 0.0008168204221874475 }, { "copy_logits_max": -5.790799140930176, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.6875, "epoch": 0.4546336482001532, "gen_logits_max": 5.421672344207764, "gen_logits_mean": -13.68554973602295, "gen_logits_min": -25.772836685180664, "gen_logits_std": 2.5929322242736816, "gen_loss": 0.3308255076408386, "grad_norm": 0.45381412208975624, "learning_rate": 2.8117894736842103e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9941722303628922, "mean_gen_accuracy": 0.8598220944404602, "mean_token_accuracy": 0.8955053240060806, "num_tokens": 603269628.0, "sample_num_tokens": 9377.5, "step": 2226, "total_num_tokens": 603307138.0, "z_loss": 0.0009393123909831047 }, { "copy_logits_max": -5.508257865905762, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.5, "epoch": 0.45483788613735, "gen_logits_max": 4.870550632476807, "gen_logits_mean": -14.6136474609375, "gen_logits_min": -26.638267517089844, "gen_logits_std": 2.561114549636841, "gen_loss": 0.3059174716472626, "grad_norm": 0.4508524092102898, "learning_rate": 2.8116631578947368e-05, "loss": 0.3484, "mean_copy_accuracy": 0.9924454391002655, "mean_gen_accuracy": 0.8514262139797211, "mean_token_accuracy": 0.8845840990543365, "num_tokens": 603533066.0, "sample_num_tokens": 9104.5, "step": 2227, "total_num_tokens": 603569484.0, "z_loss": 0.0008351546130143106 }, { "copy_logits_max": -7.291312217712402, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.3125, "epoch": 0.45504212407454686, "gen_logits_max": 5.718372344970703, "gen_logits_mean": -13.91330337524414, "gen_logits_min": -25.81153106689453, "gen_logits_std": 2.5656895637512207, "gen_loss": 0.3068101406097412, "grad_norm": 0.4666636117764713, "learning_rate": 2.8115368421052632e-05, "loss": 0.3392, "mean_copy_accuracy": 0.9919280856847763, "mean_gen_accuracy": 0.8576487004756927, "mean_token_accuracy": 0.8875669986009598, "num_tokens": 603787663.0, "sample_num_tokens": 8858.25, "step": 2228, "total_num_tokens": 603823096.0, "z_loss": 0.0008177352137863636 }, { "copy_logits_max": -8.376680374145508, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.125, "epoch": 0.4552463620117437, "gen_logits_max": 7.012685775756836, "gen_logits_mean": -11.774175643920898, "gen_logits_min": -23.91133689880371, "gen_logits_std": 2.607344388961792, "gen_loss": 0.35163938999176025, "grad_norm": 0.4836174647626349, "learning_rate": 2.8114105263157896e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9930237233638763, "mean_gen_accuracy": 0.854962483048439, "mean_token_accuracy": 0.8900102376937866, "num_tokens": 604055508.0, "sample_num_tokens": 9678.0, "step": 2229, "total_num_tokens": 604094220.0, "z_loss": 0.0008869538432918489 }, { "copy_logits_max": -8.756196022033691, "copy_logits_min": -750000064.0, "copy_num_tokens": 461.25, "epoch": 0.4554505999489405, "gen_logits_max": 6.280755996704102, "gen_logits_mean": -12.414565086364746, "gen_logits_min": -23.788928985595703, "gen_logits_std": 2.5244669914245605, "gen_loss": 0.32475632429122925, "grad_norm": 0.40056420172280954, "learning_rate": 2.811284210526316e-05, "loss": 0.3026, "mean_copy_accuracy": 0.995033398270607, "mean_gen_accuracy": 0.8682871162891388, "mean_token_accuracy": 0.8985068500041962, "num_tokens": 604349186.0, "sample_num_tokens": 9075.5, "step": 2230, "total_num_tokens": 604385488.0, "z_loss": 0.0008573409868404269 }, { "copy_logits_max": -6.180695533752441, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.25, "epoch": 0.45565483788613736, "gen_logits_max": 6.190587043762207, "gen_logits_mean": -12.975120544433594, "gen_logits_min": -25.460716247558594, "gen_logits_std": 2.624380111694336, "gen_loss": 0.34253787994384766, "grad_norm": 0.524482702636982, "learning_rate": 2.811157894736842e-05, "loss": 0.3457, "mean_copy_accuracy": 0.992803081870079, "mean_gen_accuracy": 0.8538960069417953, "mean_token_accuracy": 0.8860170096158981, "num_tokens": 604605002.0, "sample_num_tokens": 9397.5, "step": 2231, "total_num_tokens": 604642592.0, "z_loss": 0.0009257273050025105 }, { "copy_logits_max": -6.9504852294921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.375, "epoch": 0.4558590758233342, "gen_logits_max": 5.826779842376709, "gen_logits_mean": -13.792373657226562, "gen_logits_min": -25.8884334564209, "gen_logits_std": 2.6028122901916504, "gen_loss": 0.2949371933937073, "grad_norm": 0.4514866451409276, "learning_rate": 2.8110315789473686e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9924490600824356, "mean_gen_accuracy": 0.8674535304307938, "mean_token_accuracy": 0.8990626335144043, "num_tokens": 604870932.0, "sample_num_tokens": 8543.5, "step": 2232, "total_num_tokens": 604905106.0, "z_loss": 0.0008015486528165638 }, { "copy_logits_max": -9.853218078613281, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.0625, "epoch": 0.456063313760531, "gen_logits_max": 7.537469387054443, "gen_logits_mean": -11.323705673217773, "gen_logits_min": -23.673383712768555, "gen_logits_std": 2.6307387351989746, "gen_loss": 0.35382747650146484, "grad_norm": 0.48808612473180707, "learning_rate": 2.8109052631578947e-05, "loss": 0.3384, "mean_copy_accuracy": 0.9921657592058182, "mean_gen_accuracy": 0.8543100357055664, "mean_token_accuracy": 0.8900018483400345, "num_tokens": 605153670.0, "sample_num_tokens": 8242.5, "step": 2233, "total_num_tokens": 605186640.0, "z_loss": 0.0009846845641732216 }, { "copy_logits_max": -6.651758193969727, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.375, "epoch": 0.45626755169772787, "gen_logits_max": 5.951103687286377, "gen_logits_mean": -12.919214248657227, "gen_logits_min": -25.213151931762695, "gen_logits_std": 2.6119885444641113, "gen_loss": 0.28478044271469116, "grad_norm": 0.4550147388132208, "learning_rate": 2.810778947368421e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9929887652397156, "mean_gen_accuracy": 0.8620277047157288, "mean_token_accuracy": 0.893158569931984, "num_tokens": 605424260.0, "sample_num_tokens": 6833.0, "step": 2234, "total_num_tokens": 605451592.0, "z_loss": 0.0008026042487472296 }, { "copy_logits_max": -5.176832675933838, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.4375, "epoch": 0.4564717896349247, "gen_logits_max": 5.968632698059082, "gen_logits_mean": -12.859203338623047, "gen_logits_min": -25.264686584472656, "gen_logits_std": 2.6060643196105957, "gen_loss": 0.3414621949195862, "grad_norm": 0.4525552150045389, "learning_rate": 2.8106526315789472e-05, "loss": 0.3198, "mean_copy_accuracy": 0.9933869987726212, "mean_gen_accuracy": 0.8605232685804367, "mean_token_accuracy": 0.8970421850681305, "num_tokens": 605692171.0, "sample_num_tokens": 8155.25, "step": 2235, "total_num_tokens": 605724792.0, "z_loss": 0.0009665905963629484 }, { "copy_logits_max": -7.7870635986328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.9375, "epoch": 0.4566760275721215, "gen_logits_max": 6.371618270874023, "gen_logits_mean": -12.621261596679688, "gen_logits_min": -24.499202728271484, "gen_logits_std": 2.5174560546875, "gen_loss": 0.3322019577026367, "grad_norm": 0.4322097351959392, "learning_rate": 2.810526315789474e-05, "loss": 0.3314, "mean_copy_accuracy": 0.9935867339372635, "mean_gen_accuracy": 0.8583994805812836, "mean_token_accuracy": 0.8925888985395432, "num_tokens": 605972928.0, "sample_num_tokens": 7316.5, "step": 2236, "total_num_tokens": 606002194.0, "z_loss": 0.0008632567478343844 }, { "copy_logits_max": -7.780709266662598, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.8125, "epoch": 0.4568802655093184, "gen_logits_max": 6.590615749359131, "gen_logits_mean": -12.786556243896484, "gen_logits_min": -25.046884536743164, "gen_logits_std": 2.539285659790039, "gen_loss": 0.32498592138290405, "grad_norm": 0.4729258113717343, "learning_rate": 2.8104e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9936301410198212, "mean_gen_accuracy": 0.8661486953496933, "mean_token_accuracy": 0.9016002863645554, "num_tokens": 606265303.0, "sample_num_tokens": 9468.25, "step": 2237, "total_num_tokens": 606303176.0, "z_loss": 0.0009098441223613918 }, { "copy_logits_max": -8.388811111450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.125, "epoch": 0.4570845034465152, "gen_logits_max": 6.178161144256592, "gen_logits_mean": -13.71454906463623, "gen_logits_min": -25.686674118041992, "gen_logits_std": 2.58902645111084, "gen_loss": 0.2886856198310852, "grad_norm": 0.5215423186877192, "learning_rate": 2.8102736842105265e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9927548915147781, "mean_gen_accuracy": 0.851763591170311, "mean_token_accuracy": 0.8865351378917694, "num_tokens": 606527486.0, "sample_num_tokens": 8982.5, "step": 2238, "total_num_tokens": 606563416.0, "z_loss": 0.0007831817492842674 }, { "copy_logits_max": -7.123011589050293, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.4375, "epoch": 0.457288741383712, "gen_logits_max": 6.5755486488342285, "gen_logits_mean": -13.601279258728027, "gen_logits_min": -25.332901000976562, "gen_logits_std": 2.54379940032959, "gen_loss": 0.3348073661327362, "grad_norm": 0.5147580586341683, "learning_rate": 2.8101473684210526e-05, "loss": 0.3456, "mean_copy_accuracy": 0.9929171204566956, "mean_gen_accuracy": 0.8542054146528244, "mean_token_accuracy": 0.8866084218025208, "num_tokens": 606807655.0, "sample_num_tokens": 9955.25, "step": 2239, "total_num_tokens": 606847476.0, "z_loss": 0.0008393358439207077 }, { "copy_logits_max": -7.734449863433838, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.1875, "epoch": 0.4574929793209089, "gen_logits_max": 5.289878845214844, "gen_logits_mean": -14.188743591308594, "gen_logits_min": -26.41122817993164, "gen_logits_std": 2.5629100799560547, "gen_loss": 0.26247137784957886, "grad_norm": 0.4683755222502734, "learning_rate": 2.810021052631579e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9930105060338974, "mean_gen_accuracy": 0.8681102991104126, "mean_token_accuracy": 0.8983948230743408, "num_tokens": 607079339.0, "sample_num_tokens": 7659.25, "step": 2240, "total_num_tokens": 607109976.0, "z_loss": 0.0007232232019305229 }, { "copy_logits_max": -7.90491247177124, "copy_logits_min": -750000128.0, "copy_num_tokens": 463.5625, "epoch": 0.4576972172581057, "gen_logits_max": 6.735232830047607, "gen_logits_mean": -11.941532135009766, "gen_logits_min": -24.26392364501953, "gen_logits_std": 2.595916986465454, "gen_loss": 0.30698180198669434, "grad_norm": 0.5030812756526152, "learning_rate": 2.809894736842105e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9917428642511368, "mean_gen_accuracy": 0.8645258098840714, "mean_token_accuracy": 0.8986664265394211, "num_tokens": 607343937.0, "sample_num_tokens": 7906.75, "step": 2241, "total_num_tokens": 607375564.0, "z_loss": 0.0009487041970714927 }, { "copy_logits_max": -5.61818790435791, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.3125, "epoch": 0.4579014551953025, "gen_logits_max": 6.470975875854492, "gen_logits_mean": -13.333663940429688, "gen_logits_min": -24.91152572631836, "gen_logits_std": 2.535446882247925, "gen_loss": 0.33302074670791626, "grad_norm": 0.4262464472364914, "learning_rate": 2.8097684210526315e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9942175447940826, "mean_gen_accuracy": 0.8685666620731354, "mean_token_accuracy": 0.8995791226625443, "num_tokens": 607624532.0, "sample_num_tokens": 9097.5, "step": 2242, "total_num_tokens": 607660922.0, "z_loss": 0.00099386484362185 }, { "copy_logits_max": -3.2138895988464355, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.0, "epoch": 0.4581056931324994, "gen_logits_max": 5.661065578460693, "gen_logits_mean": -13.93535041809082, "gen_logits_min": -26.148099899291992, "gen_logits_std": 2.565426826477051, "gen_loss": 0.3318052589893341, "grad_norm": 0.5683170591115891, "learning_rate": 2.809642105263158e-05, "loss": 0.3398, "mean_copy_accuracy": 0.9929658025503159, "mean_gen_accuracy": 0.8550048619508743, "mean_token_accuracy": 0.888420358300209, "num_tokens": 607906233.0, "sample_num_tokens": 7659.25, "step": 2243, "total_num_tokens": 607936870.0, "z_loss": 0.0010365931084379554 }, { "copy_logits_max": -2.979233503341675, "copy_logits_min": -687500032.0, "copy_num_tokens": 718.5, "epoch": 0.4583099310696962, "gen_logits_max": 6.306644916534424, "gen_logits_mean": -11.960517883300781, "gen_logits_min": -24.155590057373047, "gen_logits_std": 2.575241804122925, "gen_loss": 0.2926471531391144, "grad_norm": 0.43682208319301513, "learning_rate": 2.8095157894736844e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9952142536640167, "mean_gen_accuracy": 0.8589898943901062, "mean_token_accuracy": 0.8991921246051788, "num_tokens": 608170216.0, "sample_num_tokens": 9473.5, "step": 2244, "total_num_tokens": 608208110.0, "z_loss": 0.0010109671857208014 }, { "copy_logits_max": -5.803869724273682, "copy_logits_min": -687500032.0, "copy_num_tokens": 572.625, "epoch": 0.458514169006893, "gen_logits_max": 5.9543609619140625, "gen_logits_mean": -12.046834945678711, "gen_logits_min": -24.059650421142578, "gen_logits_std": 2.561431884765625, "gen_loss": 0.29807591438293457, "grad_norm": 0.5382422117984461, "learning_rate": 2.809389473684211e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9926327764987946, "mean_gen_accuracy": 0.8539136499166489, "mean_token_accuracy": 0.8915761560201645, "num_tokens": 608445806.0, "sample_num_tokens": 8730.0, "step": 2245, "total_num_tokens": 608480726.0, "z_loss": 0.0007910996209830046 }, { "copy_logits_max": -6.363739967346191, "copy_logits_min": -687500032.0, "copy_num_tokens": 257.8125, "epoch": 0.4587184069440899, "gen_logits_max": 6.338104724884033, "gen_logits_mean": -13.803712844848633, "gen_logits_min": -25.477996826171875, "gen_logits_std": 2.512887477874756, "gen_loss": 0.3789430558681488, "grad_norm": 0.4733828705096496, "learning_rate": 2.809263157894737e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9928349852561951, "mean_gen_accuracy": 0.8546089977025986, "mean_token_accuracy": 0.8878779113292694, "num_tokens": 608716599.0, "sample_num_tokens": 6997.25, "step": 2246, "total_num_tokens": 608744588.0, "z_loss": 0.0008950004703365266 }, { "copy_logits_max": -6.087163925170898, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.75, "epoch": 0.4589226448812867, "gen_logits_max": 6.843605041503906, "gen_logits_mean": -12.302726745605469, "gen_logits_min": -24.6065731048584, "gen_logits_std": 2.6262941360473633, "gen_loss": 0.30346280336380005, "grad_norm": 0.4867428378424362, "learning_rate": 2.8091368421052634e-05, "loss": 0.3236, "mean_copy_accuracy": 0.995047926902771, "mean_gen_accuracy": 0.8575877845287323, "mean_token_accuracy": 0.8935992121696472, "num_tokens": 608987162.0, "sample_num_tokens": 8732.5, "step": 2247, "total_num_tokens": 609022092.0, "z_loss": 0.0009100477327592671 }, { "copy_logits_max": -7.930437088012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.125, "epoch": 0.4591268828184835, "gen_logits_max": 5.53164529800415, "gen_logits_mean": -14.130256652832031, "gen_logits_min": -25.342750549316406, "gen_logits_std": 2.4737162590026855, "gen_loss": 0.311095654964447, "grad_norm": 0.5056981520079246, "learning_rate": 2.8090105263157894e-05, "loss": 0.3336, "mean_copy_accuracy": 0.9912659078836441, "mean_gen_accuracy": 0.8581190556287766, "mean_token_accuracy": 0.8900356888771057, "num_tokens": 609248825.0, "sample_num_tokens": 10454.25, "step": 2248, "total_num_tokens": 609290642.0, "z_loss": 0.000752714928239584 }, { "copy_logits_max": -7.566904067993164, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.75, "epoch": 0.4593311207556804, "gen_logits_max": 6.169281005859375, "gen_logits_mean": -13.674140930175781, "gen_logits_min": -25.04155731201172, "gen_logits_std": 2.499434232711792, "gen_loss": 0.3067591190338135, "grad_norm": 0.5040199912743363, "learning_rate": 2.808884210526316e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9923680275678635, "mean_gen_accuracy": 0.8606177419424057, "mean_token_accuracy": 0.8916338682174683, "num_tokens": 609507692.0, "sample_num_tokens": 8202.5, "step": 2249, "total_num_tokens": 609540502.0, "z_loss": 0.0007558772340416908 }, { "copy_logits_max": -4.826191425323486, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.75, "epoch": 0.4595353586928772, "gen_logits_max": 5.512533187866211, "gen_logits_mean": -13.795019149780273, "gen_logits_min": -25.796478271484375, "gen_logits_std": 2.5417275428771973, "gen_loss": 0.3181644082069397, "grad_norm": 0.44997228035813963, "learning_rate": 2.808757894736842e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9929886907339096, "mean_gen_accuracy": 0.8639705032110214, "mean_token_accuracy": 0.9002385586500168, "num_tokens": 609801159.0, "sample_num_tokens": 8017.75, "step": 2250, "total_num_tokens": 609833230.0, "z_loss": 0.0008433896582573652 }, { "copy_logits_max": -8.01390552520752, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.3125, "epoch": 0.459739596630074, "gen_logits_max": 7.29952335357666, "gen_logits_mean": -11.671297073364258, "gen_logits_min": -23.922801971435547, "gen_logits_std": 2.4977333545684814, "gen_loss": 0.32594114542007446, "grad_norm": 0.4747778661822966, "learning_rate": 2.8086315789473684e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9946594089269638, "mean_gen_accuracy": 0.857336550951004, "mean_token_accuracy": 0.8934545069932938, "num_tokens": 610064771.0, "sample_num_tokens": 8958.25, "step": 2251, "total_num_tokens": 610100604.0, "z_loss": 0.0009580468176864088 }, { "copy_logits_max": -5.497115135192871, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.4375, "epoch": 0.4599438345672709, "gen_logits_max": 5.373659133911133, "gen_logits_mean": -13.816059112548828, "gen_logits_min": -26.085922241210938, "gen_logits_std": 2.555196762084961, "gen_loss": 0.29594025015830994, "grad_norm": 0.4345104368218407, "learning_rate": 2.808505263157895e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9941829890012741, "mean_gen_accuracy": 0.8645817786455154, "mean_token_accuracy": 0.8968469798564911, "num_tokens": 610353528.0, "sample_num_tokens": 8791.5, "step": 2252, "total_num_tokens": 610388694.0, "z_loss": 0.0008614999242126942 }, { "copy_logits_max": -9.779556274414062, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.375, "epoch": 0.4601480725044677, "gen_logits_max": 6.662796497344971, "gen_logits_mean": -12.466573715209961, "gen_logits_min": -24.144920349121094, "gen_logits_std": 2.5151844024658203, "gen_loss": 0.3316285014152527, "grad_norm": 0.4866483591023366, "learning_rate": 2.8083789473684213e-05, "loss": 0.3371, "mean_copy_accuracy": 0.9928993284702301, "mean_gen_accuracy": 0.8582520037889481, "mean_token_accuracy": 0.8893005400896072, "num_tokens": 610624960.0, "sample_num_tokens": 8692.0, "step": 2253, "total_num_tokens": 610659728.0, "z_loss": 0.0008252529660239816 }, { "copy_logits_max": -4.989708423614502, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.0625, "epoch": 0.4603523104416645, "gen_logits_max": 5.6591796875, "gen_logits_mean": -13.489559173583984, "gen_logits_min": -25.545747756958008, "gen_logits_std": 2.5551037788391113, "gen_loss": 0.34308671951293945, "grad_norm": 0.46581371765135143, "learning_rate": 2.8082526315789474e-05, "loss": 0.339, "mean_copy_accuracy": 0.9927249550819397, "mean_gen_accuracy": 0.8576549887657166, "mean_token_accuracy": 0.8905248939990997, "num_tokens": 610895784.0, "sample_num_tokens": 8481.5, "step": 2254, "total_num_tokens": 610929710.0, "z_loss": 0.0008552368963137269 }, { "copy_logits_max": -6.917564868927002, "copy_logits_min": -750000000.0, "copy_num_tokens": 189.3125, "epoch": 0.4605565483788614, "gen_logits_max": 7.039107799530029, "gen_logits_mean": -13.484579086303711, "gen_logits_min": -25.582881927490234, "gen_logits_std": 2.5233497619628906, "gen_loss": 0.33754825592041016, "grad_norm": 0.5165562331968997, "learning_rate": 2.8081263157894738e-05, "loss": 0.3514, "mean_copy_accuracy": 0.9910554140806198, "mean_gen_accuracy": 0.856392115354538, "mean_token_accuracy": 0.8836271017789841, "num_tokens": 611151143.0, "sample_num_tokens": 6330.75, "step": 2255, "total_num_tokens": 611176466.0, "z_loss": 0.0008422276005148888 }, { "copy_logits_max": -5.333897590637207, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.25, "epoch": 0.4607607863160582, "gen_logits_max": 5.509334564208984, "gen_logits_mean": -12.925750732421875, "gen_logits_min": -26.187564849853516, "gen_logits_std": 2.6155357360839844, "gen_loss": 0.28082844614982605, "grad_norm": 0.4185240358996128, "learning_rate": 2.8080000000000002e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9941079020500183, "mean_gen_accuracy": 0.8672962933778763, "mean_token_accuracy": 0.8967038094997406, "num_tokens": 611444256.0, "sample_num_tokens": 8909.5, "step": 2256, "total_num_tokens": 611479894.0, "z_loss": 0.0008951312629505992 }, { "copy_logits_max": -5.292874336242676, "copy_logits_min": -750000000.0, "copy_num_tokens": 654.0, "epoch": 0.46096502425325503, "gen_logits_max": 5.952733993530273, "gen_logits_mean": -12.405881881713867, "gen_logits_min": -25.084918975830078, "gen_logits_std": 2.5936739444732666, "gen_loss": 0.30593234300613403, "grad_norm": 0.42315314304710494, "learning_rate": 2.8078736842105263e-05, "loss": 0.3056, "mean_copy_accuracy": 0.9935692548751831, "mean_gen_accuracy": 0.8649903535842896, "mean_token_accuracy": 0.9003265351057053, "num_tokens": 611733655.0, "sample_num_tokens": 9966.75, "step": 2257, "total_num_tokens": 611773522.0, "z_loss": 0.0009545650100335479 }, { "copy_logits_max": -6.875283241271973, "copy_logits_min": -750000000.0, "copy_num_tokens": 674.5625, "epoch": 0.4611692621904519, "gen_logits_max": 5.103567600250244, "gen_logits_mean": -13.263422966003418, "gen_logits_min": -25.74142837524414, "gen_logits_std": 2.560187578201294, "gen_loss": 0.25633883476257324, "grad_norm": 0.41433411904592427, "learning_rate": 2.8077473684210527e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9945004880428314, "mean_gen_accuracy": 0.861112430691719, "mean_token_accuracy": 0.8942301720380783, "num_tokens": 612013445.0, "sample_num_tokens": 10793.75, "step": 2258, "total_num_tokens": 612056620.0, "z_loss": 0.0007852166309021413 }, { "copy_logits_max": -6.666221618652344, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.75, "epoch": 0.4613735001276487, "gen_logits_max": 5.508945465087891, "gen_logits_mean": -13.870354652404785, "gen_logits_min": -25.797313690185547, "gen_logits_std": 2.5550408363342285, "gen_loss": 0.31058841943740845, "grad_norm": 0.44951198169085754, "learning_rate": 2.807621052631579e-05, "loss": 0.31, "mean_copy_accuracy": 0.9940098375082016, "mean_gen_accuracy": 0.8624422997236252, "mean_token_accuracy": 0.8975353688001633, "num_tokens": 612309008.0, "sample_num_tokens": 8293.0, "step": 2259, "total_num_tokens": 612342180.0, "z_loss": 0.0007836198201403022 }, { "copy_logits_max": -5.972795486450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.625, "epoch": 0.46157773806484553, "gen_logits_max": 5.96917200088501, "gen_logits_mean": -13.5592679977417, "gen_logits_min": -25.868972778320312, "gen_logits_std": 2.6107077598571777, "gen_loss": 0.3170539140701294, "grad_norm": 0.5105249447781862, "learning_rate": 2.8074947368421056e-05, "loss": 0.3226, "mean_copy_accuracy": 0.994382232427597, "mean_gen_accuracy": 0.8579456210136414, "mean_token_accuracy": 0.8941687345504761, "num_tokens": 612580674.0, "sample_num_tokens": 8744.0, "step": 2260, "total_num_tokens": 612615650.0, "z_loss": 0.0008883802220225334 }, { "copy_logits_max": -5.771807670593262, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.875, "epoch": 0.4617819760020424, "gen_logits_max": 6.213567733764648, "gen_logits_mean": -13.318628311157227, "gen_logits_min": -25.29208755493164, "gen_logits_std": 2.5962212085723877, "gen_loss": 0.32121384143829346, "grad_norm": 0.452898135274155, "learning_rate": 2.8073684210526317e-05, "loss": 0.3403, "mean_copy_accuracy": 0.992661252617836, "mean_gen_accuracy": 0.8578755110502243, "mean_token_accuracy": 0.8875149339437485, "num_tokens": 612845301.0, "sample_num_tokens": 7648.75, "step": 2261, "total_num_tokens": 612875896.0, "z_loss": 0.0009298642398789525 }, { "copy_logits_max": -5.279048919677734, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.5625, "epoch": 0.4619862139392392, "gen_logits_max": 5.558517932891846, "gen_logits_mean": -13.581100463867188, "gen_logits_min": -25.51641082763672, "gen_logits_std": 2.5373847484588623, "gen_loss": 0.27264639735221863, "grad_norm": 0.45619629101949094, "learning_rate": 2.807242105263158e-05, "loss": 0.3081, "mean_copy_accuracy": 0.9947816729545593, "mean_gen_accuracy": 0.8643922209739685, "mean_token_accuracy": 0.8975386023521423, "num_tokens": 613107462.0, "sample_num_tokens": 9062.0, "step": 2262, "total_num_tokens": 613143710.0, "z_loss": 0.0009200723143294454 }, { "copy_logits_max": -6.261736869812012, "copy_logits_min": -687500032.0, "copy_num_tokens": 493.5625, "epoch": 0.46219045187643604, "gen_logits_max": 5.5371832847595215, "gen_logits_mean": -13.562447547912598, "gen_logits_min": -25.563072204589844, "gen_logits_std": 2.570087432861328, "gen_loss": 0.333914577960968, "grad_norm": 0.45873836366995624, "learning_rate": 2.8071157894736842e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9931769669055939, "mean_gen_accuracy": 0.8562934100627899, "mean_token_accuracy": 0.8921771049499512, "num_tokens": 613393729.0, "sample_num_tokens": 8431.75, "step": 2263, "total_num_tokens": 613427456.0, "z_loss": 0.0008701813640072942 }, { "copy_logits_max": -4.695390701293945, "copy_logits_min": -687500032.0, "copy_num_tokens": 574.1875, "epoch": 0.4623946898136329, "gen_logits_max": 5.936257362365723, "gen_logits_mean": -12.994536399841309, "gen_logits_min": -25.21381187438965, "gen_logits_std": 2.6080808639526367, "gen_loss": 0.38324272632598877, "grad_norm": 0.4581889363614191, "learning_rate": 2.8069894736842107e-05, "loss": 0.3711, "mean_copy_accuracy": 0.9920587241649628, "mean_gen_accuracy": 0.8473175019025803, "mean_token_accuracy": 0.8802085071802139, "num_tokens": 613669770.0, "sample_num_tokens": 8981.5, "step": 2264, "total_num_tokens": 613705696.0, "z_loss": 0.0009237620397470891 }, { "copy_logits_max": -6.436758518218994, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.875, "epoch": 0.4625989277508297, "gen_logits_max": 6.405422210693359, "gen_logits_mean": -13.393332481384277, "gen_logits_min": -25.81483268737793, "gen_logits_std": 2.564296245574951, "gen_loss": 0.3526666760444641, "grad_norm": 0.4689519928554217, "learning_rate": 2.8068631578947367e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9933715909719467, "mean_gen_accuracy": 0.8565504401922226, "mean_token_accuracy": 0.88774473965168, "num_tokens": 613909221.0, "sample_num_tokens": 7516.25, "step": 2265, "total_num_tokens": 613939286.0, "z_loss": 0.0008486589067615569 }, { "copy_logits_max": -9.123385429382324, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.0625, "epoch": 0.46280316568802654, "gen_logits_max": 6.406249046325684, "gen_logits_mean": -13.07684326171875, "gen_logits_min": -25.222087860107422, "gen_logits_std": 2.513702392578125, "gen_loss": 0.29565709829330444, "grad_norm": 0.4061397467374252, "learning_rate": 2.8067368421052632e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9931475669145584, "mean_gen_accuracy": 0.8686099201440811, "mean_token_accuracy": 0.8956916183233261, "num_tokens": 614177159.0, "sample_num_tokens": 7695.75, "step": 2266, "total_num_tokens": 614207942.0, "z_loss": 0.0007273188093677163 }, { "copy_logits_max": -5.6444244384765625, "copy_logits_min": -687500032.0, "copy_num_tokens": 341.375, "epoch": 0.4630074036252234, "gen_logits_max": 6.002419471740723, "gen_logits_mean": -13.942056655883789, "gen_logits_min": -25.121543884277344, "gen_logits_std": 2.4990973472595215, "gen_loss": 0.37504059076309204, "grad_norm": 0.4334151349721953, "learning_rate": 2.8066105263157893e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9937848895788193, "mean_gen_accuracy": 0.859873041510582, "mean_token_accuracy": 0.8926582783460617, "num_tokens": 614445967.0, "sample_num_tokens": 7480.75, "step": 2267, "total_num_tokens": 614475890.0, "z_loss": 0.0008311715209856629 }, { "copy_logits_max": -4.859748840332031, "copy_logits_min": -750000064.0, "copy_num_tokens": 664.5, "epoch": 0.46321164156242023, "gen_logits_max": 5.723029613494873, "gen_logits_mean": -12.671483993530273, "gen_logits_min": -24.4853515625, "gen_logits_std": 2.5151214599609375, "gen_loss": 0.3042677342891693, "grad_norm": 0.40598376264990776, "learning_rate": 2.806484210526316e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9942696541547775, "mean_gen_accuracy": 0.8622417598962784, "mean_token_accuracy": 0.8978498876094818, "num_tokens": 614727506.0, "sample_num_tokens": 9911.0, "step": 2268, "total_num_tokens": 614767150.0, "z_loss": 0.0008151831571012735 }, { "copy_logits_max": -5.272971153259277, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.25, "epoch": 0.46341587949961704, "gen_logits_max": 6.408215045928955, "gen_logits_mean": -12.918763160705566, "gen_logits_min": -24.49282455444336, "gen_logits_std": 2.5125856399536133, "gen_loss": 0.35188716650009155, "grad_norm": 0.44123447273556415, "learning_rate": 2.806357894736842e-05, "loss": 0.3496, "mean_copy_accuracy": 0.993999794125557, "mean_gen_accuracy": 0.8525312840938568, "mean_token_accuracy": 0.8869926035404205, "num_tokens": 615003212.0, "sample_num_tokens": 7934.0, "step": 2269, "total_num_tokens": 615034948.0, "z_loss": 0.0009967979276552796 }, { "copy_logits_max": -4.3166046142578125, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.625, "epoch": 0.46362011743681386, "gen_logits_max": 5.633923530578613, "gen_logits_mean": -12.927440643310547, "gen_logits_min": -24.78780746459961, "gen_logits_std": 2.5522665977478027, "gen_loss": 0.30150043964385986, "grad_norm": 0.5026153303376097, "learning_rate": 2.8062315789473686e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9937378466129303, "mean_gen_accuracy": 0.8584885597229004, "mean_token_accuracy": 0.8898896425962448, "num_tokens": 615259822.0, "sample_num_tokens": 7336.0, "step": 2270, "total_num_tokens": 615289166.0, "z_loss": 0.0008785112877376378 }, { "copy_logits_max": -4.54768180847168, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.8125, "epoch": 0.46382435537401073, "gen_logits_max": 5.2066802978515625, "gen_logits_mean": -13.471172332763672, "gen_logits_min": -25.763700485229492, "gen_logits_std": 2.560235023498535, "gen_loss": 0.27787071466445923, "grad_norm": 0.45667518676208985, "learning_rate": 2.806105263157895e-05, "loss": 0.3208, "mean_copy_accuracy": 0.9935475289821625, "mean_gen_accuracy": 0.8563315123319626, "mean_token_accuracy": 0.8971104770898819, "num_tokens": 615526993.0, "sample_num_tokens": 9381.75, "step": 2271, "total_num_tokens": 615564520.0, "z_loss": 0.0008008108125068247 }, { "copy_logits_max": -5.078679084777832, "copy_logits_min": -750000064.0, "copy_num_tokens": 528.75, "epoch": 0.46402859331120755, "gen_logits_max": 5.28472375869751, "gen_logits_mean": -13.36238956451416, "gen_logits_min": -25.066566467285156, "gen_logits_std": 2.512218952178955, "gen_loss": 0.2931073307991028, "grad_norm": 0.4155514940017102, "learning_rate": 2.805978947368421e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9940876364707947, "mean_gen_accuracy": 0.8603816628456116, "mean_token_accuracy": 0.8960102796554565, "num_tokens": 615806876.0, "sample_num_tokens": 8753.5, "step": 2272, "total_num_tokens": 615841890.0, "z_loss": 0.0008537301910109818 }, { "copy_logits_max": -3.4503791332244873, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.8125, "epoch": 0.46423283124840437, "gen_logits_max": 6.133163928985596, "gen_logits_mean": -13.210619926452637, "gen_logits_min": -25.45861053466797, "gen_logits_std": 2.566531181335449, "gen_loss": 0.31844741106033325, "grad_norm": 0.39834263345653853, "learning_rate": 2.8058526315789475e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9942605048418045, "mean_gen_accuracy": 0.8591254502534866, "mean_token_accuracy": 0.8909755945205688, "num_tokens": 616080498.0, "sample_num_tokens": 8070.0, "step": 2273, "total_num_tokens": 616112778.0, "z_loss": 0.0009360901894979179 }, { "copy_logits_max": -5.904166221618652, "copy_logits_min": -750000064.0, "copy_num_tokens": 331.6875, "epoch": 0.46443706918560124, "gen_logits_max": 5.7046709060668945, "gen_logits_mean": -13.524438858032227, "gen_logits_min": -25.01922607421875, "gen_logits_std": 2.5020031929016113, "gen_loss": 0.29434189200401306, "grad_norm": 0.41665894947644366, "learning_rate": 2.8057263157894736e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9947217851877213, "mean_gen_accuracy": 0.8723457902669907, "mean_token_accuracy": 0.8995548337697983, "num_tokens": 616348519.0, "sample_num_tokens": 6835.25, "step": 2274, "total_num_tokens": 616375860.0, "z_loss": 0.0009104362688958645 }, { "copy_logits_max": -5.885583400726318, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.0625, "epoch": 0.46464130712279805, "gen_logits_max": 5.4145989418029785, "gen_logits_mean": -12.50346565246582, "gen_logits_min": -24.330581665039062, "gen_logits_std": 2.5442309379577637, "gen_loss": 0.2752950191497803, "grad_norm": 0.44101639800261394, "learning_rate": 2.8056e-05, "loss": 0.3092, "mean_copy_accuracy": 0.992448553442955, "mean_gen_accuracy": 0.8655174523591995, "mean_token_accuracy": 0.8992491215467453, "num_tokens": 616617803.0, "sample_num_tokens": 8397.25, "step": 2275, "total_num_tokens": 616651392.0, "z_loss": 0.0007512819138355553 }, { "copy_logits_max": -7.628931045532227, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.3125, "epoch": 0.46484554505999487, "gen_logits_max": 5.528745174407959, "gen_logits_mean": -13.031425476074219, "gen_logits_min": -24.82837677001953, "gen_logits_std": 2.5288503170013428, "gen_loss": 0.27128157019615173, "grad_norm": 0.40901759733402004, "learning_rate": 2.8054736842105265e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9924812614917755, "mean_gen_accuracy": 0.8686086237430573, "mean_token_accuracy": 0.9002719968557358, "num_tokens": 616911931.0, "sample_num_tokens": 9093.75, "step": 2276, "total_num_tokens": 616948306.0, "z_loss": 0.0007503089727833867 }, { "copy_logits_max": -5.223926544189453, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.0625, "epoch": 0.46504978299719174, "gen_logits_max": 5.644416809082031, "gen_logits_mean": -12.934755325317383, "gen_logits_min": -25.323436737060547, "gen_logits_std": 2.565829277038574, "gen_loss": 0.3132084310054779, "grad_norm": 0.4526454614514277, "learning_rate": 2.805347368421053e-05, "loss": 0.3006, "mean_copy_accuracy": 0.9937804341316223, "mean_gen_accuracy": 0.8616192489862442, "mean_token_accuracy": 0.9003998786211014, "num_tokens": 617197085.0, "sample_num_tokens": 9354.75, "step": 2277, "total_num_tokens": 617234504.0, "z_loss": 0.0008738077012822032 }, { "copy_logits_max": -6.873031139373779, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.625, "epoch": 0.46525402093438856, "gen_logits_max": 5.678934097290039, "gen_logits_mean": -12.874430656433105, "gen_logits_min": -25.055959701538086, "gen_logits_std": 2.5521912574768066, "gen_loss": 0.3314421474933624, "grad_norm": 0.4317200831340257, "learning_rate": 2.805221052631579e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9945571273565292, "mean_gen_accuracy": 0.8647499829530716, "mean_token_accuracy": 0.8957224488258362, "num_tokens": 617470613.0, "sample_num_tokens": 7987.75, "step": 2278, "total_num_tokens": 617502564.0, "z_loss": 0.0007905921083875 }, { "copy_logits_max": -4.504918098449707, "copy_logits_min": -687500032.0, "copy_num_tokens": 576.9375, "epoch": 0.4654582588715854, "gen_logits_max": 5.189676284790039, "gen_logits_mean": -13.854202270507812, "gen_logits_min": -25.954669952392578, "gen_logits_std": 2.5762362480163574, "gen_loss": 0.29971230030059814, "grad_norm": 0.49948277476112213, "learning_rate": 2.8050947368421054e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9934321939945221, "mean_gen_accuracy": 0.8622136265039444, "mean_token_accuracy": 0.8975137174129486, "num_tokens": 617751201.0, "sample_num_tokens": 9007.25, "step": 2279, "total_num_tokens": 617787230.0, "z_loss": 0.0008020584937185049 }, { "copy_logits_max": -8.524826049804688, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.75, "epoch": 0.46566249680878224, "gen_logits_max": 7.416325569152832, "gen_logits_mean": -11.452112197875977, "gen_logits_min": -22.697368621826172, "gen_logits_std": 2.4856014251708984, "gen_loss": 0.36284369230270386, "grad_norm": 0.44315065131255915, "learning_rate": 2.8049684210526315e-05, "loss": 0.3336, "mean_copy_accuracy": 0.9943365603685379, "mean_gen_accuracy": 0.8597583621740341, "mean_token_accuracy": 0.8900173604488373, "num_tokens": 618028065.0, "sample_num_tokens": 7528.25, "step": 2280, "total_num_tokens": 618058178.0, "z_loss": 0.000887277303263545 }, { "copy_logits_max": -6.049986839294434, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.8125, "epoch": 0.46586673474597906, "gen_logits_max": 5.251309871673584, "gen_logits_mean": -13.57447338104248, "gen_logits_min": -25.411048889160156, "gen_logits_std": 2.5463814735412598, "gen_loss": 0.3347632586956024, "grad_norm": 0.5257813687883836, "learning_rate": 2.804842105263158e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9934113174676895, "mean_gen_accuracy": 0.8554808050394058, "mean_token_accuracy": 0.8849713057279587, "num_tokens": 618273187.0, "sample_num_tokens": 8584.25, "step": 2281, "total_num_tokens": 618307524.0, "z_loss": 0.0007711077341809869 }, { "copy_logits_max": -6.929502010345459, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.875, "epoch": 0.4660709726831759, "gen_logits_max": 5.486660003662109, "gen_logits_mean": -13.609315872192383, "gen_logits_min": -25.026639938354492, "gen_logits_std": 2.480757236480713, "gen_loss": 0.29382678866386414, "grad_norm": 0.48023466239327645, "learning_rate": 2.804715789473684e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9925859719514847, "mean_gen_accuracy": 0.8614774495363235, "mean_token_accuracy": 0.8898919969797134, "num_tokens": 618525467.0, "sample_num_tokens": 8922.25, "step": 2282, "total_num_tokens": 618561156.0, "z_loss": 0.0006867045303806663 }, { "copy_logits_max": -7.182646751403809, "copy_logits_min": -750000000.0, "copy_num_tokens": 686.0, "epoch": 0.46627521062037275, "gen_logits_max": 5.792053699493408, "gen_logits_mean": -12.123934745788574, "gen_logits_min": -24.27663230895996, "gen_logits_std": 2.5161218643188477, "gen_loss": 0.2924610674381256, "grad_norm": 0.43923056970619717, "learning_rate": 2.8045894736842105e-05, "loss": 0.3417, "mean_copy_accuracy": 0.9925338476896286, "mean_gen_accuracy": 0.8567681908607483, "mean_token_accuracy": 0.8875617980957031, "num_tokens": 618795441.0, "sample_num_tokens": 10433.75, "step": 2283, "total_num_tokens": 618837176.0, "z_loss": 0.0007785387570038438 }, { "copy_logits_max": -3.7648637294769287, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.3125, "epoch": 0.46647944855756956, "gen_logits_max": 4.6993842124938965, "gen_logits_mean": -13.53481674194336, "gen_logits_min": -25.507369995117188, "gen_logits_std": 2.5243124961853027, "gen_loss": 0.29416754841804504, "grad_norm": 0.5019485600793497, "learning_rate": 2.804463157894737e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9929590672254562, "mean_gen_accuracy": 0.8612330406904221, "mean_token_accuracy": 0.8926018327474594, "num_tokens": 619055005.0, "sample_num_tokens": 7667.75, "step": 2284, "total_num_tokens": 619085676.0, "z_loss": 0.0007843726198188961 }, { "copy_logits_max": -3.974431037902832, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.9375, "epoch": 0.4666836864947664, "gen_logits_max": 4.768507480621338, "gen_logits_mean": -14.060659408569336, "gen_logits_min": -25.736312866210938, "gen_logits_std": 2.5129318237304688, "gen_loss": 0.321555495262146, "grad_norm": 0.46595570753632287, "learning_rate": 2.8043368421052633e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9941054731607437, "mean_gen_accuracy": 0.8549512475728989, "mean_token_accuracy": 0.888953223824501, "num_tokens": 619321850.0, "sample_num_tokens": 8022.5, "step": 2285, "total_num_tokens": 619353940.0, "z_loss": 0.0008404530817642808 }, { "copy_logits_max": -5.418332099914551, "copy_logits_min": -687500032.0, "copy_num_tokens": 601.4375, "epoch": 0.46688792443196325, "gen_logits_max": 5.748147010803223, "gen_logits_mean": -12.557995796203613, "gen_logits_min": -24.428560256958008, "gen_logits_std": 2.5042550563812256, "gen_loss": 0.2823156714439392, "grad_norm": 0.4918886222684931, "learning_rate": 2.8042105263157898e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9935875535011292, "mean_gen_accuracy": 0.8639463037252426, "mean_token_accuracy": 0.8981643468141556, "num_tokens": 619599201.0, "sample_num_tokens": 9467.75, "step": 2286, "total_num_tokens": 619637072.0, "z_loss": 0.0007885665399953723 }, { "copy_logits_max": -4.962687969207764, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.0625, "epoch": 0.46709216236916007, "gen_logits_max": 5.522587776184082, "gen_logits_mean": -13.260797500610352, "gen_logits_min": -25.22127914428711, "gen_logits_std": 2.5074031352996826, "gen_loss": 0.3297848105430603, "grad_norm": 0.49244815442553624, "learning_rate": 2.804084210526316e-05, "loss": 0.3248, "mean_copy_accuracy": 0.9926251173019409, "mean_gen_accuracy": 0.8611228913068771, "mean_token_accuracy": 0.8949380367994308, "num_tokens": 619875289.0, "sample_num_tokens": 9624.75, "step": 2287, "total_num_tokens": 619913788.0, "z_loss": 0.0008416245691478252 }, { "copy_logits_max": -7.271345138549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.3125, "epoch": 0.4672964003063569, "gen_logits_max": 5.371269226074219, "gen_logits_mean": -14.383373260498047, "gen_logits_min": -25.9361629486084, "gen_logits_std": 2.498204231262207, "gen_loss": 0.2764666676521301, "grad_norm": 0.40687436004643196, "learning_rate": 2.8039578947368423e-05, "loss": 0.309, "mean_copy_accuracy": 0.9930392205715179, "mean_gen_accuracy": 0.8675116896629333, "mean_token_accuracy": 0.896814227104187, "num_tokens": 620150007.0, "sample_num_tokens": 8627.75, "step": 2288, "total_num_tokens": 620184518.0, "z_loss": 0.0006803776486776769 }, { "copy_logits_max": -6.239455223083496, "copy_logits_min": -687500032.0, "copy_num_tokens": 361.125, "epoch": 0.46750063824355376, "gen_logits_max": 6.367169380187988, "gen_logits_mean": -13.455048561096191, "gen_logits_min": -24.91205406188965, "gen_logits_std": 2.4701151847839355, "gen_loss": 0.3828124403953552, "grad_norm": 0.5220559580592334, "learning_rate": 2.8038315789473684e-05, "loss": 0.3379, "mean_copy_accuracy": 0.9935616850852966, "mean_gen_accuracy": 0.8575473427772522, "mean_token_accuracy": 0.8891167789697647, "num_tokens": 620407574.0, "sample_num_tokens": 8577.5, "step": 2289, "total_num_tokens": 620441884.0, "z_loss": 0.0008901210967451334 }, { "copy_logits_max": -8.87346076965332, "copy_logits_min": -750000000.0, "copy_num_tokens": 282.5625, "epoch": 0.4677048761807506, "gen_logits_max": 6.025639057159424, "gen_logits_mean": -12.790096282958984, "gen_logits_min": -24.014049530029297, "gen_logits_std": 2.421116828918457, "gen_loss": 0.36266660690307617, "grad_norm": 0.44172354144412207, "learning_rate": 2.8037052631578948e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9940131455659866, "mean_gen_accuracy": 0.8686108738183975, "mean_token_accuracy": 0.8984495848417282, "num_tokens": 620671796.0, "sample_num_tokens": 6888.5, "step": 2290, "total_num_tokens": 620699350.0, "z_loss": 0.0007913310546427965 }, { "copy_logits_max": -6.8807525634765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.25, "epoch": 0.4679091141179474, "gen_logits_max": 4.855628967285156, "gen_logits_mean": -13.902176856994629, "gen_logits_min": -25.468860626220703, "gen_logits_std": 2.4889001846313477, "gen_loss": 0.2982105612754822, "grad_norm": 0.5053371892073562, "learning_rate": 2.803578947368421e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9934808760881424, "mean_gen_accuracy": 0.8608804792165756, "mean_token_accuracy": 0.89549520611763, "num_tokens": 620953189.0, "sample_num_tokens": 8626.25, "step": 2291, "total_num_tokens": 620987694.0, "z_loss": 0.0006912926328368485 }, { "copy_logits_max": -4.887402057647705, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.4375, "epoch": 0.46811335205514426, "gen_logits_max": 5.3795671463012695, "gen_logits_mean": -12.941812515258789, "gen_logits_min": -24.421680450439453, "gen_logits_std": 2.4651830196380615, "gen_loss": 0.33129286766052246, "grad_norm": 3.3137129903149667, "learning_rate": 2.8034526315789473e-05, "loss": 0.3206, "mean_copy_accuracy": 0.9932249635457993, "mean_gen_accuracy": 0.8636399656534195, "mean_token_accuracy": 0.894835352897644, "num_tokens": 621216036.0, "sample_num_tokens": 8064.5, "step": 2292, "total_num_tokens": 621248294.0, "z_loss": 0.0007911419961601496 }, { "copy_logits_max": -5.967134475708008, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.625, "epoch": 0.4683175899923411, "gen_logits_max": 5.444798946380615, "gen_logits_mean": -13.474201202392578, "gen_logits_min": -24.947338104248047, "gen_logits_std": 2.510725736618042, "gen_loss": 0.32446831464767456, "grad_norm": 0.4572621775081352, "learning_rate": 2.8033263157894738e-05, "loss": 0.3384, "mean_copy_accuracy": 0.9953445196151733, "mean_gen_accuracy": 0.8537840843200684, "mean_token_accuracy": 0.8891997337341309, "num_tokens": 621490461.0, "sample_num_tokens": 7023.75, "step": 2293, "total_num_tokens": 621518556.0, "z_loss": 0.0008171157096512616 }, { "copy_logits_max": -5.336715221405029, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.9375, "epoch": 0.4685218279295379, "gen_logits_max": 4.971342086791992, "gen_logits_mean": -14.418207168579102, "gen_logits_min": -25.962478637695312, "gen_logits_std": 2.520413398742676, "gen_loss": 0.33657118678092957, "grad_norm": 0.4626814982437853, "learning_rate": 2.8032000000000002e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9936694800853729, "mean_gen_accuracy": 0.8711225092411041, "mean_token_accuracy": 0.9008134752511978, "num_tokens": 621784677.0, "sample_num_tokens": 8698.75, "step": 2294, "total_num_tokens": 621819472.0, "z_loss": 0.0008774393936619163 }, { "copy_logits_max": -4.669856548309326, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.0, "epoch": 0.46872606586673476, "gen_logits_max": 5.339482307434082, "gen_logits_mean": -13.61029052734375, "gen_logits_min": -25.558637619018555, "gen_logits_std": 2.528383255004883, "gen_loss": 0.31890296936035156, "grad_norm": 0.4684006612341981, "learning_rate": 2.8030736842105263e-05, "loss": 0.3191, "mean_copy_accuracy": 0.994526669383049, "mean_gen_accuracy": 0.8664741963148117, "mean_token_accuracy": 0.8943505585193634, "num_tokens": 622050282.0, "sample_num_tokens": 8253.5, "step": 2295, "total_num_tokens": 622083296.0, "z_loss": 0.0008348166011273861 }, { "copy_logits_max": -5.3150553703308105, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.0, "epoch": 0.4689303038039316, "gen_logits_max": 5.147237300872803, "gen_logits_mean": -14.498651504516602, "gen_logits_min": -25.831371307373047, "gen_logits_std": 2.4827988147735596, "gen_loss": 0.3513544499874115, "grad_norm": 0.7467984131123321, "learning_rate": 2.8029473684210527e-05, "loss": 0.3516, "mean_copy_accuracy": 0.9910088628530502, "mean_gen_accuracy": 0.8550137728452682, "mean_token_accuracy": 0.8841965645551682, "num_tokens": 622304041.0, "sample_num_tokens": 8708.25, "step": 2296, "total_num_tokens": 622338874.0, "z_loss": 0.0008569799829274416 }, { "copy_logits_max": -2.849392890930176, "copy_logits_min": -750000000.0, "copy_num_tokens": 606.25, "epoch": 0.4691345417411284, "gen_logits_max": 5.333010196685791, "gen_logits_mean": -13.344169616699219, "gen_logits_min": -25.15085220336914, "gen_logits_std": 2.541414976119995, "gen_loss": 0.3581238090991974, "grad_norm": 0.49537696320885566, "learning_rate": 2.802821052631579e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9943537414073944, "mean_gen_accuracy": 0.8568919152021408, "mean_token_accuracy": 0.8933814018964767, "num_tokens": 622579715.0, "sample_num_tokens": 9244.75, "step": 2297, "total_num_tokens": 622616694.0, "z_loss": 0.0009537859004922211 }, { "copy_logits_max": -3.807102680206299, "copy_logits_min": -687500032.0, "copy_num_tokens": 740.5, "epoch": 0.46933877967832527, "gen_logits_max": 4.006526470184326, "gen_logits_mean": -13.861396789550781, "gen_logits_min": -26.106891632080078, "gen_logits_std": 2.636353015899658, "gen_loss": 0.24085935950279236, "grad_norm": 0.5012918060951188, "learning_rate": 2.8026947368421052e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9939149022102356, "mean_gen_accuracy": 0.860653281211853, "mean_token_accuracy": 0.8954396843910217, "num_tokens": 622855547.0, "sample_num_tokens": 9087.25, "step": 2298, "total_num_tokens": 622891896.0, "z_loss": 0.0007641050033271313 }, { "copy_logits_max": -3.8712549209594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 663.0625, "epoch": 0.4695430176155221, "gen_logits_max": 4.967002868652344, "gen_logits_mean": -13.648914337158203, "gen_logits_min": -25.655378341674805, "gen_logits_std": 2.635162115097046, "gen_loss": 0.3253937363624573, "grad_norm": 0.5100149650757895, "learning_rate": 2.8025684210526317e-05, "loss": 0.315, "mean_copy_accuracy": 0.9952214509248734, "mean_gen_accuracy": 0.8604805916547775, "mean_token_accuracy": 0.8969437927007675, "num_tokens": 623127060.0, "sample_num_tokens": 9210.5, "step": 2299, "total_num_tokens": 623163902.0, "z_loss": 0.0009533400880172849 }, { "copy_logits_max": -4.84397554397583, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.4375, "epoch": 0.4697472555527189, "gen_logits_max": 5.469274520874023, "gen_logits_mean": -13.940868377685547, "gen_logits_min": -25.621463775634766, "gen_logits_std": 2.573111057281494, "gen_loss": 0.3078203797340393, "grad_norm": 0.44033776725034257, "learning_rate": 2.8024421052631578e-05, "loss": 0.3003, "mean_copy_accuracy": 0.99366295337677, "mean_gen_accuracy": 0.8656461834907532, "mean_token_accuracy": 0.8984234929084778, "num_tokens": 623408448.0, "sample_num_tokens": 7878.5, "step": 2300, "total_num_tokens": 623439962.0, "z_loss": 0.0008662568288855255 }, { "copy_logits_max": -5.677891731262207, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5, "epoch": 0.4699514934899158, "gen_logits_max": 5.168246269226074, "gen_logits_mean": -14.209951400756836, "gen_logits_min": -26.475383758544922, "gen_logits_std": 2.5667858123779297, "gen_loss": 0.33948037028312683, "grad_norm": 0.5368015728392589, "learning_rate": 2.8023157894736845e-05, "loss": 0.333, "mean_copy_accuracy": 0.9905551075935364, "mean_gen_accuracy": 0.8650851100683212, "mean_token_accuracy": 0.8909866958856583, "num_tokens": 623656803.0, "sample_num_tokens": 8562.25, "step": 2301, "total_num_tokens": 623691052.0, "z_loss": 0.0008615462575107813 }, { "copy_logits_max": -5.611259460449219, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.375, "epoch": 0.4701557314271126, "gen_logits_max": 4.899982929229736, "gen_logits_mean": -13.627793312072754, "gen_logits_min": -25.047534942626953, "gen_logits_std": 2.517693519592285, "gen_loss": 0.3239145874977112, "grad_norm": 0.45001225040031645, "learning_rate": 2.8021894736842106e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9939512610435486, "mean_gen_accuracy": 0.8610616624355316, "mean_token_accuracy": 0.8943527936935425, "num_tokens": 623921672.0, "sample_num_tokens": 8994.5, "step": 2302, "total_num_tokens": 623957650.0, "z_loss": 0.0008237936999648809 }, { "copy_logits_max": -7.931743144989014, "copy_logits_min": -750000064.0, "copy_num_tokens": 441.25, "epoch": 0.4703599693643094, "gen_logits_max": 5.22093391418457, "gen_logits_mean": -13.751119613647461, "gen_logits_min": -25.618297576904297, "gen_logits_std": 2.5810067653656006, "gen_loss": 0.27120116353034973, "grad_norm": 0.4426131672254673, "learning_rate": 2.802063157894737e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9944626092910767, "mean_gen_accuracy": 0.8607817441225052, "mean_token_accuracy": 0.8991882652044296, "num_tokens": 624204012.0, "sample_num_tokens": 8713.5, "step": 2303, "total_num_tokens": 624238866.0, "z_loss": 0.0007830268586985767 }, { "copy_logits_max": -5.929294586181641, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.25, "epoch": 0.4705642073015063, "gen_logits_max": 5.349763870239258, "gen_logits_mean": -14.460736274719238, "gen_logits_min": -25.7288761138916, "gen_logits_std": 2.495051860809326, "gen_loss": 0.35145533084869385, "grad_norm": 0.47572084445157964, "learning_rate": 2.801936842105263e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9934376031160355, "mean_gen_accuracy": 0.8581734299659729, "mean_token_accuracy": 0.8906431347131729, "num_tokens": 624473756.0, "sample_num_tokens": 7311.5, "step": 2304, "total_num_tokens": 624503002.0, "z_loss": 0.0008654841221868992 }, { "copy_logits_max": -5.919783115386963, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.8125, "epoch": 0.4707684452387031, "gen_logits_max": 5.193416595458984, "gen_logits_mean": -13.43997573852539, "gen_logits_min": -25.20993423461914, "gen_logits_std": 2.5218536853790283, "gen_loss": 0.35018226504325867, "grad_norm": 0.4935833497434978, "learning_rate": 2.8018105263157896e-05, "loss": 0.3594, "mean_copy_accuracy": 0.9915059059858322, "mean_gen_accuracy": 0.8516533970832825, "mean_token_accuracy": 0.8826083987951279, "num_tokens": 624723396.0, "sample_num_tokens": 7171.5, "step": 2305, "total_num_tokens": 624752082.0, "z_loss": 0.0008926019072532654 }, { "copy_logits_max": -4.713295936584473, "copy_logits_min": -687500032.0, "copy_num_tokens": 407.1875, "epoch": 0.4709726831758999, "gen_logits_max": 4.90228796005249, "gen_logits_mean": -14.073302268981934, "gen_logits_min": -26.071487426757812, "gen_logits_std": 2.5552523136138916, "gen_loss": 0.31494319438934326, "grad_norm": 0.4721797436671849, "learning_rate": 2.8016842105263157e-05, "loss": 0.3447, "mean_copy_accuracy": 0.9921465516090393, "mean_gen_accuracy": 0.8534782379865646, "mean_token_accuracy": 0.886639341711998, "num_tokens": 624982730.0, "sample_num_tokens": 7864.0, "step": 2306, "total_num_tokens": 625014186.0, "z_loss": 0.0008024905691854656 }, { "copy_logits_max": -3.0605969429016113, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.1875, "epoch": 0.4711769211130968, "gen_logits_max": 4.9042439460754395, "gen_logits_mean": -13.587971687316895, "gen_logits_min": -24.834285736083984, "gen_logits_std": 2.513152837753296, "gen_loss": 0.29765352606773376, "grad_norm": 0.43390736164615235, "learning_rate": 2.801557894736842e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9927484542131424, "mean_gen_accuracy": 0.8644102960824966, "mean_token_accuracy": 0.8977129310369492, "num_tokens": 625249829.0, "sample_num_tokens": 8764.75, "step": 2307, "total_num_tokens": 625284888.0, "z_loss": 0.0007967517012730241 }, { "copy_logits_max": -3.7525887489318848, "copy_logits_min": -687499968.0, "copy_num_tokens": 848.0625, "epoch": 0.4713811590502936, "gen_logits_max": 4.75740909576416, "gen_logits_mean": -13.141127586364746, "gen_logits_min": -25.43793487548828, "gen_logits_std": 2.6107959747314453, "gen_loss": 0.25666147470474243, "grad_norm": 0.47444366064478527, "learning_rate": 2.8014315789473682e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9936421662569046, "mean_gen_accuracy": 0.861720085144043, "mean_token_accuracy": 0.8981808125972748, "num_tokens": 625539541.0, "sample_num_tokens": 10917.75, "step": 2308, "total_num_tokens": 625583212.0, "z_loss": 0.0007340505253523588 }, { "copy_logits_max": -1.423179268836975, "copy_logits_min": -750000064.0, "copy_num_tokens": 514.25, "epoch": 0.4715853969874904, "gen_logits_max": 5.708256244659424, "gen_logits_mean": -12.893070220947266, "gen_logits_min": -24.561887741088867, "gen_logits_std": 2.5647153854370117, "gen_loss": 0.34316813945770264, "grad_norm": 0.46868640032909237, "learning_rate": 2.801305263157895e-05, "loss": 0.334, "mean_copy_accuracy": 0.9933266192674637, "mean_gen_accuracy": 0.8572467714548111, "mean_token_accuracy": 0.889566957950592, "num_tokens": 625817240.0, "sample_num_tokens": 9072.5, "step": 2309, "total_num_tokens": 625853530.0, "z_loss": 0.0009215391473844647 }, { "copy_logits_max": -4.6666789054870605, "copy_logits_min": -750000128.0, "copy_num_tokens": 436.25, "epoch": 0.4717896349246873, "gen_logits_max": 5.40968132019043, "gen_logits_mean": -13.703634262084961, "gen_logits_min": -25.280941009521484, "gen_logits_std": 2.526106357574463, "gen_loss": 0.3590514063835144, "grad_norm": 0.5560995171978079, "learning_rate": 2.8011789473684214e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9933225363492966, "mean_gen_accuracy": 0.8624174296855927, "mean_token_accuracy": 0.8935700505971909, "num_tokens": 626081723.0, "sample_num_tokens": 8440.75, "step": 2310, "total_num_tokens": 626115486.0, "z_loss": 0.0007929679122753441 }, { "copy_logits_max": -4.089184761047363, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.8125, "epoch": 0.4719938728618841, "gen_logits_max": 5.057668685913086, "gen_logits_mean": -14.268478393554688, "gen_logits_min": -25.995620727539062, "gen_logits_std": 2.528048515319824, "gen_loss": 0.3158259987831116, "grad_norm": 0.4766914929779968, "learning_rate": 2.8010526315789475e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9929798245429993, "mean_gen_accuracy": 0.8706907778978348, "mean_token_accuracy": 0.8986352235078812, "num_tokens": 626338073.0, "sample_num_tokens": 8512.75, "step": 2311, "total_num_tokens": 626372124.0, "z_loss": 0.0008416667697019875 }, { "copy_logits_max": -5.458538055419922, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.25, "epoch": 0.4721981107990809, "gen_logits_max": 5.341549873352051, "gen_logits_mean": -14.107356071472168, "gen_logits_min": -25.680837631225586, "gen_logits_std": 2.5306286811828613, "gen_loss": 0.35243159532546997, "grad_norm": 0.47020077052797865, "learning_rate": 2.800926315789474e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9937882572412491, "mean_gen_accuracy": 0.864081934094429, "mean_token_accuracy": 0.895812526345253, "num_tokens": 626599328.0, "sample_num_tokens": 7834.5, "step": 2312, "total_num_tokens": 626630666.0, "z_loss": 0.0008791660657152534 }, { "copy_logits_max": -4.160204887390137, "copy_logits_min": -687500032.0, "copy_num_tokens": 462.6875, "epoch": 0.4724023487362778, "gen_logits_max": 5.077064037322998, "gen_logits_mean": -14.352324485778809, "gen_logits_min": -25.95433807373047, "gen_logits_std": 2.5597715377807617, "gen_loss": 0.2912430167198181, "grad_norm": 0.48694507681011767, "learning_rate": 2.8008e-05, "loss": 0.3396, "mean_copy_accuracy": 0.9910280406475067, "mean_gen_accuracy": 0.8582442402839661, "mean_token_accuracy": 0.8870285302400589, "num_tokens": 626852466.0, "sample_num_tokens": 8059.0, "step": 2313, "total_num_tokens": 626884702.0, "z_loss": 0.0008692400879226625 }, { "copy_logits_max": -4.499876022338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.0, "epoch": 0.4726065866734746, "gen_logits_max": 6.338122367858887, "gen_logits_mean": -13.369400024414062, "gen_logits_min": -24.594680786132812, "gen_logits_std": 2.5143392086029053, "gen_loss": 0.36010169982910156, "grad_norm": 0.5335182872879219, "learning_rate": 2.8006736842105264e-05, "loss": 0.3364, "mean_copy_accuracy": 0.9914667010307312, "mean_gen_accuracy": 0.8600536286830902, "mean_token_accuracy": 0.8875051140785217, "num_tokens": 627110173.0, "sample_num_tokens": 7790.25, "step": 2314, "total_num_tokens": 627141334.0, "z_loss": 0.0009456408442929387 }, { "copy_logits_max": -4.566630840301514, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.6875, "epoch": 0.4728108246106714, "gen_logits_max": 6.808142185211182, "gen_logits_mean": -12.489065170288086, "gen_logits_min": -24.51126480102539, "gen_logits_std": 2.6804654598236084, "gen_loss": 0.28518250584602356, "grad_norm": 0.5326816372561405, "learning_rate": 2.8005473684210525e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9943699836730957, "mean_gen_accuracy": 0.8589587658643723, "mean_token_accuracy": 0.8921704739332199, "num_tokens": 627356360.0, "sample_num_tokens": 8359.5, "step": 2315, "total_num_tokens": 627389798.0, "z_loss": 0.0010625023860484362 }, { "copy_logits_max": -3.718064069747925, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.1875, "epoch": 0.4730150625478683, "gen_logits_max": 5.676125526428223, "gen_logits_mean": -13.59311294555664, "gen_logits_min": -25.351669311523438, "gen_logits_std": 2.577364206314087, "gen_loss": 0.2957495450973511, "grad_norm": 0.4222516316106051, "learning_rate": 2.800421052631579e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9949638098478317, "mean_gen_accuracy": 0.8574042767286301, "mean_token_accuracy": 0.8935087323188782, "num_tokens": 627619697.0, "sample_num_tokens": 8543.75, "step": 2316, "total_num_tokens": 627653872.0, "z_loss": 0.0009278638754040003 }, { "copy_logits_max": -3.4906413555145264, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.1875, "epoch": 0.4732193004850651, "gen_logits_max": 6.060332298278809, "gen_logits_mean": -12.881874084472656, "gen_logits_min": -24.525371551513672, "gen_logits_std": 2.5353286266326904, "gen_loss": 0.3678237795829773, "grad_norm": 0.46920864124012046, "learning_rate": 2.8002947368421054e-05, "loss": 0.3235, "mean_copy_accuracy": 0.9918047934770584, "mean_gen_accuracy": 0.8644747734069824, "mean_token_accuracy": 0.893547311425209, "num_tokens": 627895890.0, "sample_num_tokens": 8106.5, "step": 2317, "total_num_tokens": 627928316.0, "z_loss": 0.0009995067957788706 }, { "copy_logits_max": -8.102334976196289, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.0625, "epoch": 0.4734235384222619, "gen_logits_max": 6.325942516326904, "gen_logits_mean": -13.605328559875488, "gen_logits_min": -24.408916473388672, "gen_logits_std": 2.5026650428771973, "gen_loss": 0.3218275010585785, "grad_norm": 0.44597387036306774, "learning_rate": 2.800168421052632e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9940058439970016, "mean_gen_accuracy": 0.8627004623413086, "mean_token_accuracy": 0.8948851525783539, "num_tokens": 628168666.0, "sample_num_tokens": 8587.0, "step": 2318, "total_num_tokens": 628203014.0, "z_loss": 0.0008718662429600954 }, { "copy_logits_max": -2.576667070388794, "copy_logits_min": -750000000.0, "copy_num_tokens": 704.625, "epoch": 0.4736277763594588, "gen_logits_max": 5.48785400390625, "gen_logits_mean": -13.013647079467773, "gen_logits_min": -24.83847999572754, "gen_logits_std": 2.577378273010254, "gen_loss": 0.2939908504486084, "grad_norm": 0.4473673148280201, "learning_rate": 2.800042105263158e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9939699321985245, "mean_gen_accuracy": 0.8596646338701248, "mean_token_accuracy": 0.8981034308671951, "num_tokens": 628456756.0, "sample_num_tokens": 9222.0, "step": 2319, "total_num_tokens": 628493644.0, "z_loss": 0.0010117028141394258 }, { "copy_logits_max": -8.873262405395508, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.4375, "epoch": 0.4738320142966556, "gen_logits_max": 6.6443963050842285, "gen_logits_mean": -13.62398910522461, "gen_logits_min": -24.195281982421875, "gen_logits_std": 2.468492031097412, "gen_loss": 0.36374637484550476, "grad_norm": 0.4301417373396337, "learning_rate": 2.7999157894736844e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9927306622266769, "mean_gen_accuracy": 0.8629872053861618, "mean_token_accuracy": 0.8923763632774353, "num_tokens": 628746896.0, "sample_num_tokens": 8390.0, "step": 2320, "total_num_tokens": 628780456.0, "z_loss": 0.0008873207261785865 }, { "copy_logits_max": -6.692464828491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.0, "epoch": 0.47403625223385243, "gen_logits_max": 5.786657810211182, "gen_logits_mean": -13.294628143310547, "gen_logits_min": -24.682886123657227, "gen_logits_std": 2.503791332244873, "gen_loss": 0.3535805940628052, "grad_norm": 0.5244560011925417, "learning_rate": 2.7997894736842105e-05, "loss": 0.3413, "mean_copy_accuracy": 0.9937642514705658, "mean_gen_accuracy": 0.8516437113285065, "mean_token_accuracy": 0.8865367621183395, "num_tokens": 629051611.0, "sample_num_tokens": 9181.25, "step": 2321, "total_num_tokens": 629088336.0, "z_loss": 0.0009090455714613199 }, { "copy_logits_max": -2.665097236633301, "copy_logits_min": -625000064.0, "copy_num_tokens": 625.5625, "epoch": 0.47424049017104924, "gen_logits_max": 5.1842474937438965, "gen_logits_mean": -13.004281997680664, "gen_logits_min": -25.08160972595215, "gen_logits_std": 2.565324306488037, "gen_loss": 0.3813394606113434, "grad_norm": 0.44153216883973617, "learning_rate": 2.799663157894737e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9938179403543472, "mean_gen_accuracy": 0.8528546392917633, "mean_token_accuracy": 0.8936062157154083, "num_tokens": 629358889.0, "sample_num_tokens": 9497.25, "step": 2322, "total_num_tokens": 629396878.0, "z_loss": 0.0010729983914643526 }, { "copy_logits_max": -3.178986072540283, "copy_logits_min": -750000128.0, "copy_num_tokens": 482.4375, "epoch": 0.4744447281082461, "gen_logits_max": 5.742859840393066, "gen_logits_mean": -13.15872573852539, "gen_logits_min": -25.00635528564453, "gen_logits_std": 2.603972911834717, "gen_loss": 0.34169715642929077, "grad_norm": 0.4814330386380518, "learning_rate": 2.7995368421052633e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9932351410388947, "mean_gen_accuracy": 0.8575404137372971, "mean_token_accuracy": 0.8911748826503754, "num_tokens": 629627449.0, "sample_num_tokens": 8080.25, "step": 2323, "total_num_tokens": 629659770.0, "z_loss": 0.0010192908812314272 }, { "copy_logits_max": -4.4829912185668945, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.125, "epoch": 0.47464896604544293, "gen_logits_max": 6.420271873474121, "gen_logits_mean": -12.609310150146484, "gen_logits_min": -25.063385009765625, "gen_logits_std": 2.641387939453125, "gen_loss": 0.31428441405296326, "grad_norm": 0.4803089224999165, "learning_rate": 2.7994105263157894e-05, "loss": 0.3195, "mean_copy_accuracy": 0.992793932557106, "mean_gen_accuracy": 0.8609827011823654, "mean_token_accuracy": 0.8949132710695267, "num_tokens": 629910473.0, "sample_num_tokens": 8468.75, "step": 2324, "total_num_tokens": 629944348.0, "z_loss": 0.000935082440264523 }, { "copy_logits_max": -5.257751941680908, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.3125, "epoch": 0.47485320398263975, "gen_logits_max": 5.642373561859131, "gen_logits_mean": -13.249868392944336, "gen_logits_min": -25.26149559020996, "gen_logits_std": 2.622715473175049, "gen_loss": 0.34187841415405273, "grad_norm": 0.44521885589188487, "learning_rate": 2.799284210526316e-05, "loss": 0.3263, "mean_copy_accuracy": 0.9935189336538315, "mean_gen_accuracy": 0.8594023734331131, "mean_token_accuracy": 0.8920821398496628, "num_tokens": 630181133.0, "sample_num_tokens": 7994.25, "step": 2325, "total_num_tokens": 630213110.0, "z_loss": 0.0009548037778586149 }, { "copy_logits_max": -6.8094377517700195, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.8125, "epoch": 0.4750574419198366, "gen_logits_max": 5.554218769073486, "gen_logits_mean": -13.742926597595215, "gen_logits_min": -25.32441520690918, "gen_logits_std": 2.5651822090148926, "gen_loss": 0.3572078347206116, "grad_norm": 0.4899564664342433, "learning_rate": 2.7991578947368423e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9923731982707977, "mean_gen_accuracy": 0.8591212183237076, "mean_token_accuracy": 0.8885543942451477, "num_tokens": 630444817.0, "sample_num_tokens": 8218.75, "step": 2326, "total_num_tokens": 630477692.0, "z_loss": 0.0009000140707939863 }, { "copy_logits_max": -5.6562066078186035, "copy_logits_min": -687500032.0, "copy_num_tokens": 397.9375, "epoch": 0.47526167985703344, "gen_logits_max": 5.174721717834473, "gen_logits_mean": -13.890362739562988, "gen_logits_min": -25.644859313964844, "gen_logits_std": 2.541901111602783, "gen_loss": 0.331329882144928, "grad_norm": 0.491697920878686, "learning_rate": 2.7990315789473687e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9929755181074142, "mean_gen_accuracy": 0.8655671030282974, "mean_token_accuracy": 0.8934575468301773, "num_tokens": 630702821.0, "sample_num_tokens": 7312.75, "step": 2327, "total_num_tokens": 630732072.0, "z_loss": 0.0009273053146898746 }, { "copy_logits_max": -5.754627227783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.8125, "epoch": 0.47546591779423025, "gen_logits_max": 5.388681411743164, "gen_logits_mean": -12.691813468933105, "gen_logits_min": -24.517269134521484, "gen_logits_std": 2.5611352920532227, "gen_loss": 0.24971745908260345, "grad_norm": 0.46376031359450826, "learning_rate": 2.7989052631578948e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9931658655405045, "mean_gen_accuracy": 0.8614965677261353, "mean_token_accuracy": 0.8938350230455399, "num_tokens": 630963889.0, "sample_num_tokens": 7742.75, "step": 2328, "total_num_tokens": 630994860.0, "z_loss": 0.0008083935244940221 }, { "copy_logits_max": -7.103542327880859, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.5625, "epoch": 0.4756701557314271, "gen_logits_max": 5.954614639282227, "gen_logits_mean": -13.717557907104492, "gen_logits_min": -25.209426879882812, "gen_logits_std": 2.523386240005493, "gen_loss": 0.3160814046859741, "grad_norm": 0.4432995340781374, "learning_rate": 2.7987789473684212e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9927184581756592, "mean_gen_accuracy": 0.858610674738884, "mean_token_accuracy": 0.893123522400856, "num_tokens": 631237555.0, "sample_num_tokens": 9495.75, "step": 2329, "total_num_tokens": 631275538.0, "z_loss": 0.000791082507930696 }, { "copy_logits_max": -5.519326686859131, "copy_logits_min": -687500032.0, "copy_num_tokens": 473.75, "epoch": 0.47587439366862394, "gen_logits_max": 5.42276668548584, "gen_logits_mean": -14.397512435913086, "gen_logits_min": -26.013980865478516, "gen_logits_std": 2.5475199222564697, "gen_loss": 0.28683942556381226, "grad_norm": 0.4851357477477481, "learning_rate": 2.7986526315789473e-05, "loss": 0.3143, "mean_copy_accuracy": 0.992446780204773, "mean_gen_accuracy": 0.8636001795530319, "mean_token_accuracy": 0.8960396200418472, "num_tokens": 631515320.0, "sample_num_tokens": 8700.0, "step": 2330, "total_num_tokens": 631550120.0, "z_loss": 0.000755818618927151 }, { "copy_logits_max": -5.354894638061523, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.0, "epoch": 0.47607863160582076, "gen_logits_max": 5.753134727478027, "gen_logits_mean": -13.85580825805664, "gen_logits_min": -25.61513328552246, "gen_logits_std": 2.576115131378174, "gen_loss": 0.33360135555267334, "grad_norm": 0.4686474178339993, "learning_rate": 2.7985263157894737e-05, "loss": 0.3489, "mean_copy_accuracy": 0.9937141984701157, "mean_gen_accuracy": 0.8514148592948914, "mean_token_accuracy": 0.8873473554849625, "num_tokens": 631787249.0, "sample_num_tokens": 8422.25, "step": 2331, "total_num_tokens": 631820938.0, "z_loss": 0.000920698163099587 }, { "copy_logits_max": -6.9166059494018555, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.1875, "epoch": 0.47628286954301763, "gen_logits_max": 5.542747497558594, "gen_logits_mean": -13.566495895385742, "gen_logits_min": -25.192485809326172, "gen_logits_std": 2.543901205062866, "gen_loss": 0.36432310938835144, "grad_norm": 0.5417929424157041, "learning_rate": 2.7984e-05, "loss": 0.346, "mean_copy_accuracy": 0.9910762459039688, "mean_gen_accuracy": 0.8615230023860931, "mean_token_accuracy": 0.8880539238452911, "num_tokens": 632060458.0, "sample_num_tokens": 9162.5, "step": 2332, "total_num_tokens": 632097108.0, "z_loss": 0.0008562718285247684 }, { "copy_logits_max": -4.196736812591553, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.4375, "epoch": 0.47648710748021444, "gen_logits_max": 5.173386573791504, "gen_logits_mean": -14.025339126586914, "gen_logits_min": -25.708824157714844, "gen_logits_std": 2.548828601837158, "gen_loss": 0.3125759959220886, "grad_norm": 0.4173222723097527, "learning_rate": 2.7982736842105263e-05, "loss": 0.3, "mean_copy_accuracy": 0.9933653771877289, "mean_gen_accuracy": 0.867311030626297, "mean_token_accuracy": 0.9008030593395233, "num_tokens": 632347502.0, "sample_num_tokens": 8828.5, "step": 2333, "total_num_tokens": 632382816.0, "z_loss": 0.0009186454117298126 }, { "copy_logits_max": -3.330047369003296, "copy_logits_min": -625000000.0, "copy_num_tokens": 633.8125, "epoch": 0.47669134541741126, "gen_logits_max": 4.872177600860596, "gen_logits_mean": -13.992805480957031, "gen_logits_min": -25.88705825805664, "gen_logits_std": 2.5869879722595215, "gen_loss": 0.32727348804473877, "grad_norm": 0.45214101601356094, "learning_rate": 2.7981473684210527e-05, "loss": 0.3084, "mean_copy_accuracy": 0.992882251739502, "mean_gen_accuracy": 0.865037351846695, "mean_token_accuracy": 0.8978990465402603, "num_tokens": 632608356.0, "sample_num_tokens": 9517.5, "step": 2334, "total_num_tokens": 632646426.0, "z_loss": 0.00091121019795537 }, { "copy_logits_max": -5.844712257385254, "copy_logits_min": -687500032.0, "copy_num_tokens": 605.125, "epoch": 0.47689558335460813, "gen_logits_max": 4.984523296356201, "gen_logits_mean": -13.670855522155762, "gen_logits_min": -25.593402862548828, "gen_logits_std": 2.589578628540039, "gen_loss": 0.27905261516571045, "grad_norm": 0.43171868921061496, "learning_rate": 2.798021052631579e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9948292225599289, "mean_gen_accuracy": 0.8606165796518326, "mean_token_accuracy": 0.8951396495103836, "num_tokens": 632896412.0, "sample_num_tokens": 9408.5, "step": 2335, "total_num_tokens": 632934046.0, "z_loss": 0.0007956624031066895 }, { "copy_logits_max": -3.8791627883911133, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.875, "epoch": 0.47709982129180495, "gen_logits_max": 4.776424884796143, "gen_logits_mean": -13.939037322998047, "gen_logits_min": -25.860422134399414, "gen_logits_std": 2.5686581134796143, "gen_loss": 0.28413280844688416, "grad_norm": 0.4515207157641473, "learning_rate": 2.7978947368421052e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9904212206602097, "mean_gen_accuracy": 0.8691146969795227, "mean_token_accuracy": 0.8951850682497025, "num_tokens": 633178931.0, "sample_num_tokens": 8808.75, "step": 2336, "total_num_tokens": 633214166.0, "z_loss": 0.0007798816077411175 }, { "copy_logits_max": -5.3245697021484375, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.8125, "epoch": 0.47730405922900176, "gen_logits_max": 5.543231010437012, "gen_logits_mean": -13.353364944458008, "gen_logits_min": -25.15162467956543, "gen_logits_std": 2.557138442993164, "gen_loss": 0.2933323085308075, "grad_norm": 0.4187705053450249, "learning_rate": 2.7977684210526317e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9941263645887375, "mean_gen_accuracy": 0.8735979795455933, "mean_token_accuracy": 0.9075900018215179, "num_tokens": 633489392.0, "sample_num_tokens": 9329.5, "step": 2337, "total_num_tokens": 633526710.0, "z_loss": 0.0007818408776074648 }, { "copy_logits_max": -3.0450973510742188, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.0625, "epoch": 0.47750829716619864, "gen_logits_max": 5.770376205444336, "gen_logits_mean": -13.313472747802734, "gen_logits_min": -25.405441284179688, "gen_logits_std": 2.5545644760131836, "gen_loss": 0.3225226402282715, "grad_norm": 0.5654849936222998, "learning_rate": 2.797642105263158e-05, "loss": 0.3484, "mean_copy_accuracy": 0.9920880198478699, "mean_gen_accuracy": 0.8490310311317444, "mean_token_accuracy": 0.8863824903964996, "num_tokens": 633736283.0, "sample_num_tokens": 8517.75, "step": 2338, "total_num_tokens": 633770354.0, "z_loss": 0.0009284531697630882 }, { "copy_logits_max": -5.542882919311523, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.5625, "epoch": 0.47771253510339545, "gen_logits_max": 5.436684608459473, "gen_logits_mean": -13.292335510253906, "gen_logits_min": -25.65081214904785, "gen_logits_std": 2.6298766136169434, "gen_loss": 0.31428495049476624, "grad_norm": 0.4638245065675272, "learning_rate": 2.7975157894736842e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9939414858818054, "mean_gen_accuracy": 0.8602182120084763, "mean_token_accuracy": 0.892424538731575, "num_tokens": 633999259.0, "sample_num_tokens": 8323.25, "step": 2339, "total_num_tokens": 634032552.0, "z_loss": 0.000783413415774703 }, { "copy_logits_max": -4.548739433288574, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.6875, "epoch": 0.47791677304059227, "gen_logits_max": 5.0103559494018555, "gen_logits_mean": -13.397756576538086, "gen_logits_min": -25.395008087158203, "gen_logits_std": 2.5492117404937744, "gen_loss": 0.2978714108467102, "grad_norm": 0.5713011119262431, "learning_rate": 2.7973894736842106e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9928889721632004, "mean_gen_accuracy": 0.8641401380300522, "mean_token_accuracy": 0.8992667347192764, "num_tokens": 634273378.0, "sample_num_tokens": 7656.0, "step": 2340, "total_num_tokens": 634304002.0, "z_loss": 0.0007706410251557827 }, { "copy_logits_max": -4.346497058868408, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 0.47812101097778914, "gen_logits_max": 4.974740505218506, "gen_logits_mean": -14.102956771850586, "gen_logits_min": -25.769161224365234, "gen_logits_std": 2.544133424758911, "gen_loss": 0.3591877818107605, "grad_norm": 0.47982823623652676, "learning_rate": 2.7972631578947367e-05, "loss": 0.3544, "mean_copy_accuracy": 0.9940669536590576, "mean_gen_accuracy": 0.8519495576620102, "mean_token_accuracy": 0.8873003125190735, "num_tokens": 634526482.0, "sample_num_tokens": 8144.0, "step": 2341, "total_num_tokens": 634559058.0, "z_loss": 0.0008616620907559991 }, { "copy_logits_max": -5.358860969543457, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.125, "epoch": 0.47832524891498596, "gen_logits_max": 5.463386535644531, "gen_logits_mean": -12.958646774291992, "gen_logits_min": -25.20549774169922, "gen_logits_std": 2.567923069000244, "gen_loss": 0.28543519973754883, "grad_norm": 0.44819182931765006, "learning_rate": 2.7971368421052635e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9922927618026733, "mean_gen_accuracy": 0.8728090524673462, "mean_token_accuracy": 0.8986081182956696, "num_tokens": 634776114.0, "sample_num_tokens": 7888.5, "step": 2342, "total_num_tokens": 634807668.0, "z_loss": 0.0008070063777267933 }, { "copy_logits_max": -5.016395092010498, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.3125, "epoch": 0.4785294868521828, "gen_logits_max": 4.879705429077148, "gen_logits_mean": -13.418940544128418, "gen_logits_min": -25.01245880126953, "gen_logits_std": 2.541471242904663, "gen_loss": 0.30877557396888733, "grad_norm": 0.43669987137544963, "learning_rate": 2.7970105263157896e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9943840652704239, "mean_gen_accuracy": 0.869955450296402, "mean_token_accuracy": 0.8991183340549469, "num_tokens": 635035732.0, "sample_num_tokens": 8462.0, "step": 2343, "total_num_tokens": 635069580.0, "z_loss": 0.0007930495776236057 }, { "copy_logits_max": -3.988314390182495, "copy_logits_min": -687500032.0, "copy_num_tokens": 906.375, "epoch": 0.47873372478937964, "gen_logits_max": 4.160911560058594, "gen_logits_mean": -13.773218154907227, "gen_logits_min": -25.925682067871094, "gen_logits_std": 2.5624868869781494, "gen_loss": 0.298237144947052, "grad_norm": 0.47089448130759637, "learning_rate": 2.796884210526316e-05, "loss": 0.3441, "mean_copy_accuracy": 0.9921375960111618, "mean_gen_accuracy": 0.8525564521551132, "mean_token_accuracy": 0.8883196115493774, "num_tokens": 635305664.0, "sample_num_tokens": 10975.0, "step": 2344, "total_num_tokens": 635349564.0, "z_loss": 0.0008433113689534366 }, { "copy_logits_max": -5.135714530944824, "copy_logits_min": -687500032.0, "copy_num_tokens": 526.4375, "epoch": 0.47893796272657646, "gen_logits_max": 4.610714912414551, "gen_logits_mean": -13.435541152954102, "gen_logits_min": -25.511348724365234, "gen_logits_std": 2.5509958267211914, "gen_loss": 0.36043059825897217, "grad_norm": 0.47700967256352333, "learning_rate": 2.796757894736842e-05, "loss": 0.3591, "mean_copy_accuracy": 0.9947570711374283, "mean_gen_accuracy": 0.8450099527835846, "mean_token_accuracy": 0.8852006942033768, "num_tokens": 635578404.0, "sample_num_tokens": 7571.5, "step": 2345, "total_num_tokens": 635608690.0, "z_loss": 0.0008959182305261493 }, { "copy_logits_max": -6.249972343444824, "copy_logits_min": -750000000.0, "copy_num_tokens": 245.9375, "epoch": 0.4791422006637733, "gen_logits_max": 6.712113857269287, "gen_logits_mean": -13.1527738571167, "gen_logits_min": -24.54781150817871, "gen_logits_std": 2.521902561187744, "gen_loss": 0.4152413606643677, "grad_norm": 0.47169313055344153, "learning_rate": 2.7966315789473685e-05, "loss": 0.3542, "mean_copy_accuracy": 0.9941563010215759, "mean_gen_accuracy": 0.8519870042800903, "mean_token_accuracy": 0.8824126124382019, "num_tokens": 635826260.0, "sample_num_tokens": 7105.0, "step": 2346, "total_num_tokens": 635854680.0, "z_loss": 0.0009497705614194274 }, { "copy_logits_max": -4.984566688537598, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.4375, "epoch": 0.47934643860097015, "gen_logits_max": 4.966085433959961, "gen_logits_mean": -14.018102645874023, "gen_logits_min": -25.842208862304688, "gen_logits_std": 2.5422492027282715, "gen_loss": 0.3149765133857727, "grad_norm": 0.49260512633197084, "learning_rate": 2.7965052631578946e-05, "loss": 0.3479, "mean_copy_accuracy": 0.9932415932416916, "mean_gen_accuracy": 0.8556499034166336, "mean_token_accuracy": 0.8850178867578506, "num_tokens": 636063403.0, "sample_num_tokens": 8847.25, "step": 2347, "total_num_tokens": 636098792.0, "z_loss": 0.0009001502767205238 }, { "copy_logits_max": -3.9556708335876465, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.3125, "epoch": 0.47955067653816696, "gen_logits_max": 5.161402702331543, "gen_logits_mean": -13.306896209716797, "gen_logits_min": -25.262004852294922, "gen_logits_std": 2.5948615074157715, "gen_loss": 0.27935129404067993, "grad_norm": 0.38761789103489397, "learning_rate": 2.796378947368421e-05, "loss": 0.3016, "mean_copy_accuracy": 0.993440181016922, "mean_gen_accuracy": 0.8707725405693054, "mean_token_accuracy": 0.8977800905704498, "num_tokens": 636338422.0, "sample_num_tokens": 8982.0, "step": 2348, "total_num_tokens": 636374350.0, "z_loss": 0.000910822069272399 }, { "copy_logits_max": -4.527479648590088, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.375, "epoch": 0.4797549144753638, "gen_logits_max": 4.813511371612549, "gen_logits_mean": -14.161616325378418, "gen_logits_min": -25.85407257080078, "gen_logits_std": 2.561491012573242, "gen_loss": 0.31657999753952026, "grad_norm": 0.45770745458187134, "learning_rate": 2.796252631578947e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9934996217489243, "mean_gen_accuracy": 0.8646424114704132, "mean_token_accuracy": 0.898279681801796, "num_tokens": 636628436.0, "sample_num_tokens": 9841.5, "step": 2349, "total_num_tokens": 636667802.0, "z_loss": 0.0007917594630271196 }, { "copy_logits_max": -6.002982139587402, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.375, "epoch": 0.47995915241256065, "gen_logits_max": 5.736095428466797, "gen_logits_mean": -13.405784606933594, "gen_logits_min": -25.27806854248047, "gen_logits_std": 2.582150459289551, "gen_loss": 0.3351156413555145, "grad_norm": 0.5284454098144925, "learning_rate": 2.796126315789474e-05, "loss": 0.3313, "mean_copy_accuracy": 0.991404265165329, "mean_gen_accuracy": 0.8644520342350006, "mean_token_accuracy": 0.8935503214597702, "num_tokens": 636894947.0, "sample_num_tokens": 7616.75, "step": 2350, "total_num_tokens": 636925414.0, "z_loss": 0.000849192205350846 }, { "copy_logits_max": -2.8707122802734375, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.625, "epoch": 0.48016339034975747, "gen_logits_max": 4.468337059020996, "gen_logits_mean": -14.189364433288574, "gen_logits_min": -26.576644897460938, "gen_logits_std": 2.6308460235595703, "gen_loss": 0.2865452468395233, "grad_norm": 0.47485928976417435, "learning_rate": 2.7960000000000003e-05, "loss": 0.3253, "mean_copy_accuracy": 0.9950391799211502, "mean_gen_accuracy": 0.8606373965740204, "mean_token_accuracy": 0.8950261175632477, "num_tokens": 637162473.0, "sample_num_tokens": 7367.25, "step": 2351, "total_num_tokens": 637191942.0, "z_loss": 0.0008944084402173758 }, { "copy_logits_max": -4.6741180419921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.3125, "epoch": 0.4803676282869543, "gen_logits_max": 5.35614013671875, "gen_logits_mean": -14.382760047912598, "gen_logits_min": -25.928327560424805, "gen_logits_std": 2.557384729385376, "gen_loss": 0.35183224081993103, "grad_norm": 0.48630446262809096, "learning_rate": 2.7958736842105264e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9937750548124313, "mean_gen_accuracy": 0.8685146123170853, "mean_token_accuracy": 0.8947687596082687, "num_tokens": 637408666.0, "sample_num_tokens": 8185.0, "step": 2352, "total_num_tokens": 637441406.0, "z_loss": 0.0008633157704025507 }, { "copy_logits_max": -3.528402328491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0, "epoch": 0.48057186622415116, "gen_logits_max": 5.464098930358887, "gen_logits_mean": -12.76448917388916, "gen_logits_min": -24.43234634399414, "gen_logits_std": 2.581407070159912, "gen_loss": 0.3012838661670685, "grad_norm": 0.44013718270704744, "learning_rate": 2.795747368421053e-05, "loss": 0.3062, "mean_copy_accuracy": 0.993892028927803, "mean_gen_accuracy": 0.8616929799318314, "mean_token_accuracy": 0.8991910368204117, "num_tokens": 637676507.0, "sample_num_tokens": 7562.25, "step": 2353, "total_num_tokens": 637706756.0, "z_loss": 0.0007368875667452812 }, { "copy_logits_max": -5.8330488204956055, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.625, "epoch": 0.480776104161348, "gen_logits_max": 5.394408702850342, "gen_logits_mean": -13.80646800994873, "gen_logits_min": -25.846115112304688, "gen_logits_std": 2.602565288543701, "gen_loss": 0.3632526695728302, "grad_norm": 0.5066182481421837, "learning_rate": 2.795621052631579e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9911265522241592, "mean_gen_accuracy": 0.8617410063743591, "mean_token_accuracy": 0.8938263803720474, "num_tokens": 637933570.0, "sample_num_tokens": 8791.5, "step": 2354, "total_num_tokens": 637968736.0, "z_loss": 0.0007745110196992755 }, { "copy_logits_max": -5.5819501876831055, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.3125, "epoch": 0.4809803420985448, "gen_logits_max": 5.389707088470459, "gen_logits_mean": -13.371955871582031, "gen_logits_min": -25.488502502441406, "gen_logits_std": 2.626594305038452, "gen_loss": 0.2911640405654907, "grad_norm": 0.43536148242923695, "learning_rate": 2.7954947368421054e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9943120777606964, "mean_gen_accuracy": 0.8600883930921555, "mean_token_accuracy": 0.8955186009407043, "num_tokens": 638207604.0, "sample_num_tokens": 8316.0, "step": 2355, "total_num_tokens": 638240868.0, "z_loss": 0.0007385211065411568 }, { "copy_logits_max": -3.99175763130188, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.125, "epoch": 0.48118458003574166, "gen_logits_max": 5.462235450744629, "gen_logits_mean": -13.243867874145508, "gen_logits_min": -25.208209991455078, "gen_logits_std": 2.596456289291382, "gen_loss": 0.34508851170539856, "grad_norm": 0.6844018091501333, "learning_rate": 2.7953684210526315e-05, "loss": 0.3452, "mean_copy_accuracy": 0.9931758046150208, "mean_gen_accuracy": 0.8517270982265472, "mean_token_accuracy": 0.8853740245103836, "num_tokens": 638456867.0, "sample_num_tokens": 7968.75, "step": 2356, "total_num_tokens": 638488742.0, "z_loss": 0.0008800768991932273 }, { "copy_logits_max": -3.3349621295928955, "copy_logits_min": -750000064.0, "copy_num_tokens": 677.3125, "epoch": 0.4813888179729385, "gen_logits_max": 4.348630905151367, "gen_logits_mean": -13.658158302307129, "gen_logits_min": -25.678020477294922, "gen_logits_std": 2.6017215251922607, "gen_loss": 0.27998876571655273, "grad_norm": 0.47553444056468747, "learning_rate": 2.795242105263158e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9927274584770203, "mean_gen_accuracy": 0.8656462281942368, "mean_token_accuracy": 0.8955850601196289, "num_tokens": 638728685.0, "sample_num_tokens": 9703.75, "step": 2357, "total_num_tokens": 638767500.0, "z_loss": 0.0007677957764826715 }, { "copy_logits_max": -5.230252742767334, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.875, "epoch": 0.4815930559101353, "gen_logits_max": 4.905587673187256, "gen_logits_mean": -14.214015007019043, "gen_logits_min": -26.19169044494629, "gen_logits_std": 2.6051807403564453, "gen_loss": 0.30329689383506775, "grad_norm": 0.4117298758688105, "learning_rate": 2.7951157894736843e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9932283759117126, "mean_gen_accuracy": 0.8658865392208099, "mean_token_accuracy": 0.8958081603050232, "num_tokens": 638993916.0, "sample_num_tokens": 9407.5, "step": 2358, "total_num_tokens": 639031546.0, "z_loss": 0.0007367286598309875 }, { "copy_logits_max": -2.737994909286499, "copy_logits_min": -750000000.0, "copy_num_tokens": 773.9375, "epoch": 0.48179729384733216, "gen_logits_max": 5.840726852416992, "gen_logits_mean": -12.80030345916748, "gen_logits_min": -25.71977996826172, "gen_logits_std": 2.7040135860443115, "gen_loss": 0.2731873095035553, "grad_norm": 0.5149410110795308, "learning_rate": 2.7949894736842108e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9929300844669342, "mean_gen_accuracy": 0.8650729805231094, "mean_token_accuracy": 0.8989448994398117, "num_tokens": 639257504.0, "sample_num_tokens": 10194.5, "step": 2359, "total_num_tokens": 639298282.0, "z_loss": 0.0006919706938788295 }, { "copy_logits_max": -5.026115417480469, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.375, "epoch": 0.482001531784529, "gen_logits_max": 6.300192832946777, "gen_logits_mean": -12.77197265625, "gen_logits_min": -25.348491668701172, "gen_logits_std": 2.6465001106262207, "gen_loss": 0.3338854908943176, "grad_norm": 0.5024429265657767, "learning_rate": 2.794863157894737e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9924549758434296, "mean_gen_accuracy": 0.8621077686548233, "mean_token_accuracy": 0.8953287601470947, "num_tokens": 639543598.0, "sample_num_tokens": 9220.5, "step": 2360, "total_num_tokens": 639580480.0, "z_loss": 0.0008858363144099712 }, { "copy_logits_max": -6.320267677307129, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.875, "epoch": 0.4822057697217258, "gen_logits_max": 5.508680820465088, "gen_logits_mean": -14.693008422851562, "gen_logits_min": -26.270938873291016, "gen_logits_std": 2.563171863555908, "gen_loss": 0.3091868758201599, "grad_norm": 0.494051264683445, "learning_rate": 2.7947368421052633e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9929994940757751, "mean_gen_accuracy": 0.8670963943004608, "mean_token_accuracy": 0.895477369427681, "num_tokens": 639826126.0, "sample_num_tokens": 6781.0, "step": 2361, "total_num_tokens": 639853250.0, "z_loss": 0.0008155849063768983 }, { "copy_logits_max": -4.480463027954102, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.8125, "epoch": 0.48241000765892267, "gen_logits_max": 5.208069801330566, "gen_logits_mean": -13.652889251708984, "gen_logits_min": -26.178998947143555, "gen_logits_std": 2.640516519546509, "gen_loss": 0.3204054832458496, "grad_norm": 0.4584076512693478, "learning_rate": 2.7946105263157894e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9943758398294449, "mean_gen_accuracy": 0.8651322424411774, "mean_token_accuracy": 0.8974649459123611, "num_tokens": 640090035.0, "sample_num_tokens": 8486.25, "step": 2362, "total_num_tokens": 640123980.0, "z_loss": 0.0009381028939969838 }, { "copy_logits_max": -4.396963119506836, "copy_logits_min": -687500032.0, "copy_num_tokens": 345.625, "epoch": 0.4826142455961195, "gen_logits_max": 6.203978538513184, "gen_logits_mean": -12.8146333694458, "gen_logits_min": -24.801952362060547, "gen_logits_std": 2.5886499881744385, "gen_loss": 0.38096487522125244, "grad_norm": 0.48830761374491555, "learning_rate": 2.7944842105263158e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9927215725183487, "mean_gen_accuracy": 0.8647138625383377, "mean_token_accuracy": 0.8922233283519745, "num_tokens": 640334872.0, "sample_num_tokens": 7997.0, "step": 2363, "total_num_tokens": 640366860.0, "z_loss": 0.0010263925651088357 }, { "copy_logits_max": -3.550400733947754, "copy_logits_min": -687500032.0, "copy_num_tokens": 386.875, "epoch": 0.4828184835333163, "gen_logits_max": 5.20997428894043, "gen_logits_mean": -14.284283638000488, "gen_logits_min": -26.37753677368164, "gen_logits_std": 2.578150510787964, "gen_loss": 0.359256386756897, "grad_norm": 0.48639719238685497, "learning_rate": 2.7943578947368422e-05, "loss": 0.3266, "mean_copy_accuracy": 0.9932959526777267, "mean_gen_accuracy": 0.8591000884771347, "mean_token_accuracy": 0.8918597251176834, "num_tokens": 640590964.0, "sample_num_tokens": 7551.5, "step": 2364, "total_num_tokens": 640621170.0, "z_loss": 0.0009217462502419949 }, { "copy_logits_max": -3.495203733444214, "copy_logits_min": -625000000.0, "copy_num_tokens": 752.375, "epoch": 0.48302272147051317, "gen_logits_max": 3.9238743782043457, "gen_logits_mean": -14.274517059326172, "gen_logits_min": -27.55253028869629, "gen_logits_std": 2.6673731803894043, "gen_loss": 0.26203155517578125, "grad_norm": 0.44800827844400615, "learning_rate": 2.7942315789473683e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9944968670606613, "mean_gen_accuracy": 0.8626721501350403, "mean_token_accuracy": 0.8975624740123749, "num_tokens": 640868121.0, "sample_num_tokens": 9893.25, "step": 2365, "total_num_tokens": 640907694.0, "z_loss": 0.000741467229090631 }, { "copy_logits_max": -6.445427894592285, "copy_logits_min": -687500096.0, "copy_num_tokens": 442.3125, "epoch": 0.48322695940771, "gen_logits_max": 5.42322301864624, "gen_logits_mean": -13.661336898803711, "gen_logits_min": -25.761966705322266, "gen_logits_std": 2.605461597442627, "gen_loss": 0.3160492777824402, "grad_norm": 0.4868647818912773, "learning_rate": 2.794105263157895e-05, "loss": 0.329, "mean_copy_accuracy": 0.9949295520782471, "mean_gen_accuracy": 0.8581288903951645, "mean_token_accuracy": 0.8922659456729889, "num_tokens": 641155663.0, "sample_num_tokens": 8053.75, "step": 2366, "total_num_tokens": 641187878.0, "z_loss": 0.0008583737071603537 }, { "copy_logits_max": -4.682188987731934, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 0.4834311973449068, "gen_logits_max": 4.983997821807861, "gen_logits_mean": -13.706645011901855, "gen_logits_min": -26.016145706176758, "gen_logits_std": 2.611726760864258, "gen_loss": 0.32361772656440735, "grad_norm": 0.4673396883258232, "learning_rate": 2.7939789473684212e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9921734929084778, "mean_gen_accuracy": 0.8614264130592346, "mean_token_accuracy": 0.8898438960313797, "num_tokens": 641404029.0, "sample_num_tokens": 7965.75, "step": 2367, "total_num_tokens": 641435892.0, "z_loss": 0.0008239148883149028 }, { "copy_logits_max": -3.9839351177215576, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.3125, "epoch": 0.4836354352821037, "gen_logits_max": 5.644763469696045, "gen_logits_mean": -13.138752937316895, "gen_logits_min": -26.08860206604004, "gen_logits_std": 2.6485202312469482, "gen_loss": 0.35646140575408936, "grad_norm": 0.48549149573776706, "learning_rate": 2.7938526315789476e-05, "loss": 0.3363, "mean_copy_accuracy": 0.9921139478683472, "mean_gen_accuracy": 0.8569226264953613, "mean_token_accuracy": 0.8887373954057693, "num_tokens": 641662216.0, "sample_num_tokens": 8138.5, "step": 2368, "total_num_tokens": 641694770.0, "z_loss": 0.0009505222551524639 }, { "copy_logits_max": -7.343366622924805, "copy_logits_min": -687500032.0, "copy_num_tokens": 263.75, "epoch": 0.4838396732193005, "gen_logits_max": 6.2998247146606445, "gen_logits_mean": -14.061653137207031, "gen_logits_min": -26.2493896484375, "gen_logits_std": 2.6054720878601074, "gen_loss": 0.34030675888061523, "grad_norm": 0.5250544497109746, "learning_rate": 2.7937263157894737e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9930847436189651, "mean_gen_accuracy": 0.8650591969490051, "mean_token_accuracy": 0.8930933326482773, "num_tokens": 641927480.0, "sample_num_tokens": 6950.5, "step": 2369, "total_num_tokens": 641955282.0, "z_loss": 0.0008425418636761606 }, { "copy_logits_max": -6.417210578918457, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.3125, "epoch": 0.4840439111564973, "gen_logits_max": 5.481690406799316, "gen_logits_mean": -13.361794471740723, "gen_logits_min": -25.633869171142578, "gen_logits_std": 2.6286420822143555, "gen_loss": 0.32054269313812256, "grad_norm": 0.49016939440667096, "learning_rate": 2.7936e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9933559149503708, "mean_gen_accuracy": 0.864311620593071, "mean_token_accuracy": 0.892156109213829, "num_tokens": 642177797.0, "sample_num_tokens": 8940.25, "step": 2370, "total_num_tokens": 642213558.0, "z_loss": 0.000795988948084414 }, { "copy_logits_max": -4.728822708129883, "copy_logits_min": -750000064.0, "copy_num_tokens": 575.625, "epoch": 0.4842481490936942, "gen_logits_max": 4.828775882720947, "gen_logits_mean": -13.04986572265625, "gen_logits_min": -25.9323787689209, "gen_logits_std": 2.677506446838379, "gen_loss": 0.28268861770629883, "grad_norm": 0.4230149018954367, "learning_rate": 2.7934736842105262e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9931166023015976, "mean_gen_accuracy": 0.8634433448314667, "mean_token_accuracy": 0.8930773884057999, "num_tokens": 642454017.0, "sample_num_tokens": 8841.25, "step": 2371, "total_num_tokens": 642489382.0, "z_loss": 0.0007920657517388463 }, { "copy_logits_max": -6.75875186920166, "copy_logits_min": -625000064.0, "copy_num_tokens": 396.625, "epoch": 0.484452387030891, "gen_logits_max": 5.598094940185547, "gen_logits_mean": -13.89195442199707, "gen_logits_min": -25.88401985168457, "gen_logits_std": 2.620995044708252, "gen_loss": 0.306978702545166, "grad_norm": 0.5323235648207362, "learning_rate": 2.7933473684210527e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9929348975419998, "mean_gen_accuracy": 0.8772900253534317, "mean_token_accuracy": 0.901863157749176, "num_tokens": 642727795.0, "sample_num_tokens": 8581.75, "step": 2372, "total_num_tokens": 642762122.0, "z_loss": 0.0007304380415007472 }, { "copy_logits_max": -3.786118984222412, "copy_logits_min": -750000000.0, "copy_num_tokens": 735.125, "epoch": 0.4846566249680878, "gen_logits_max": 4.9984354972839355, "gen_logits_mean": -12.21603775024414, "gen_logits_min": -24.917089462280273, "gen_logits_std": 2.7058632373809814, "gen_loss": 0.2816093862056732, "grad_norm": 0.41541554231397126, "learning_rate": 2.7932210526315788e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9938230663537979, "mean_gen_accuracy": 0.8640138059854507, "mean_token_accuracy": 0.8980693072080612, "num_tokens": 643005109.0, "sample_num_tokens": 9624.25, "step": 2373, "total_num_tokens": 643043606.0, "z_loss": 0.0008086266461759806 }, { "copy_logits_max": -6.007165908813477, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.125, "epoch": 0.4848608629052847, "gen_logits_max": 5.126761436462402, "gen_logits_mean": -12.594137191772461, "gen_logits_min": -25.16322135925293, "gen_logits_std": 2.604525327682495, "gen_loss": 0.31879591941833496, "grad_norm": 0.4549529432690715, "learning_rate": 2.7930947368421052e-05, "loss": 0.3376, "mean_copy_accuracy": 0.9940626919269562, "mean_gen_accuracy": 0.854682445526123, "mean_token_accuracy": 0.8893968015909195, "num_tokens": 643261731.0, "sample_num_tokens": 8260.75, "step": 2374, "total_num_tokens": 643294774.0, "z_loss": 0.0008225744240917265 }, { "copy_logits_max": -4.918575286865234, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.3125, "epoch": 0.4850651008424815, "gen_logits_max": 5.858366966247559, "gen_logits_mean": -12.688127517700195, "gen_logits_min": -24.821491241455078, "gen_logits_std": 2.6421866416931152, "gen_loss": 0.33822500705718994, "grad_norm": 0.45288108780230396, "learning_rate": 2.7929684210526316e-05, "loss": 0.3232, "mean_copy_accuracy": 0.9925990104675293, "mean_gen_accuracy": 0.8570635169744492, "mean_token_accuracy": 0.8937206417322159, "num_tokens": 643545959.0, "sample_num_tokens": 8060.75, "step": 2375, "total_num_tokens": 643578202.0, "z_loss": 0.0008440183009952307 }, { "copy_logits_max": -4.752755641937256, "copy_logits_min": -750000000.0, "copy_num_tokens": 664.0625, "epoch": 0.4852693387796783, "gen_logits_max": 6.097291469573975, "gen_logits_mean": -13.119122505187988, "gen_logits_min": -25.735702514648438, "gen_logits_std": 2.649993419647217, "gen_loss": 0.3204880356788635, "grad_norm": 0.45942670834762345, "learning_rate": 2.792842105263158e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9937751740217209, "mean_gen_accuracy": 0.8599593341350555, "mean_token_accuracy": 0.9005574434995651, "num_tokens": 643841001.0, "sample_num_tokens": 9377.25, "step": 2376, "total_num_tokens": 643878510.0, "z_loss": 0.0009468137286603451 }, { "copy_logits_max": -4.20062255859375, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.6875, "epoch": 0.48547357671687513, "gen_logits_max": 5.644989967346191, "gen_logits_mean": -14.157758712768555, "gen_logits_min": -26.288665771484375, "gen_logits_std": 2.654052257537842, "gen_loss": 0.3227095901966095, "grad_norm": 0.4534055416514405, "learning_rate": 2.7927157894736845e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9940095990896225, "mean_gen_accuracy": 0.8581893295049667, "mean_token_accuracy": 0.8919835239648819, "num_tokens": 644121120.0, "sample_num_tokens": 8102.5, "step": 2377, "total_num_tokens": 644153530.0, "z_loss": 0.000929998408537358 }, { "copy_logits_max": -5.4236345291137695, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.5625, "epoch": 0.485677814654072, "gen_logits_max": 5.407891273498535, "gen_logits_mean": -14.305669784545898, "gen_logits_min": -26.253591537475586, "gen_logits_std": 2.5909900665283203, "gen_loss": 0.2957371771335602, "grad_norm": 0.4487458108954088, "learning_rate": 2.7925894736842106e-05, "loss": 0.306, "mean_copy_accuracy": 0.9939329773187637, "mean_gen_accuracy": 0.8610019981861115, "mean_token_accuracy": 0.8986760079860687, "num_tokens": 644394740.0, "sample_num_tokens": 8239.5, "step": 2378, "total_num_tokens": 644427698.0, "z_loss": 0.0008078742539510131 }, { "copy_logits_max": -6.296566486358643, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.9375, "epoch": 0.4858820525912688, "gen_logits_max": 7.209874153137207, "gen_logits_mean": -13.481525421142578, "gen_logits_min": -25.981483459472656, "gen_logits_std": 2.6841392517089844, "gen_loss": 0.31351712346076965, "grad_norm": 0.4728708480897217, "learning_rate": 2.792463157894737e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9914091527462006, "mean_gen_accuracy": 0.8646703660488129, "mean_token_accuracy": 0.8930392414331436, "num_tokens": 644666663.0, "sample_num_tokens": 7974.75, "step": 2379, "total_num_tokens": 644698562.0, "z_loss": 0.0008339519263245165 }, { "copy_logits_max": -3.592410087585449, "copy_logits_min": -625000064.0, "copy_num_tokens": 774.5, "epoch": 0.48608629052846564, "gen_logits_max": 4.861393928527832, "gen_logits_mean": -13.430364608764648, "gen_logits_min": -25.664392471313477, "gen_logits_std": 2.652890920639038, "gen_loss": 0.30524590611457825, "grad_norm": 0.7178165316806694, "learning_rate": 2.792336842105263e-05, "loss": 0.3338, "mean_copy_accuracy": 0.992679551243782, "mean_gen_accuracy": 0.8571353107690811, "mean_token_accuracy": 0.8909798115491867, "num_tokens": 644925572.0, "sample_num_tokens": 10276.5, "step": 2380, "total_num_tokens": 644966678.0, "z_loss": 0.0009036905248649418 }, { "copy_logits_max": -5.766441345214844, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.125, "epoch": 0.4862905284656625, "gen_logits_max": 6.159413814544678, "gen_logits_mean": -13.462297439575195, "gen_logits_min": -25.420495986938477, "gen_logits_std": 2.6470487117767334, "gen_loss": 0.2934097647666931, "grad_norm": 0.43186165019647405, "learning_rate": 2.7922105263157895e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9955532848834991, "mean_gen_accuracy": 0.8683736771345139, "mean_token_accuracy": 0.9009067267179489, "num_tokens": 645224311.0, "sample_num_tokens": 10332.25, "step": 2381, "total_num_tokens": 645265640.0, "z_loss": 0.0009122738847509027 }, { "copy_logits_max": -5.84537410736084, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.1875, "epoch": 0.4864947664028593, "gen_logits_max": 6.778275012969971, "gen_logits_mean": -13.67831039428711, "gen_logits_min": -26.199533462524414, "gen_logits_std": 2.637927770614624, "gen_loss": 0.30812767148017883, "grad_norm": 0.5131639991818667, "learning_rate": 2.7920842105263156e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9945637434720993, "mean_gen_accuracy": 0.8623412847518921, "mean_token_accuracy": 0.8946541845798492, "num_tokens": 645493236.0, "sample_num_tokens": 8559.5, "step": 2382, "total_num_tokens": 645527474.0, "z_loss": 0.0009514288976788521 }, { "copy_logits_max": -5.723090171813965, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.0, "epoch": 0.48669900434005614, "gen_logits_max": 7.065947532653809, "gen_logits_mean": -13.114194869995117, "gen_logits_min": -25.34226417541504, "gen_logits_std": 2.6374928951263428, "gen_loss": 0.3331122100353241, "grad_norm": 0.5738630970061098, "learning_rate": 2.7919578947368424e-05, "loss": 0.3401, "mean_copy_accuracy": 0.9911030828952789, "mean_gen_accuracy": 0.8614757061004639, "mean_token_accuracy": 0.8885444402694702, "num_tokens": 645754162.0, "sample_num_tokens": 8458.5, "step": 2383, "total_num_tokens": 645787996.0, "z_loss": 0.0008943213615566492 }, { "copy_logits_max": -4.237250328063965, "copy_logits_min": -687500096.0, "copy_num_tokens": 752.0, "epoch": 0.486903242277253, "gen_logits_max": 6.398332595825195, "gen_logits_mean": -12.099594116210938, "gen_logits_min": -25.53386688232422, "gen_logits_std": 2.697103977203369, "gen_loss": 0.2619084119796753, "grad_norm": 0.43919078487502594, "learning_rate": 2.7918315789473685e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9939769357442856, "mean_gen_accuracy": 0.8694491386413574, "mean_token_accuracy": 0.9013537764549255, "num_tokens": 646026486.0, "sample_num_tokens": 9923.5, "step": 2384, "total_num_tokens": 646066180.0, "z_loss": 0.0008324614027515054 }, { "copy_logits_max": -5.30588436126709, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.0625, "epoch": 0.48710748021444983, "gen_logits_max": 6.800446033477783, "gen_logits_mean": -13.558637619018555, "gen_logits_min": -25.930368423461914, "gen_logits_std": 2.6483030319213867, "gen_loss": 0.36425715684890747, "grad_norm": 0.5186687897460536, "learning_rate": 2.791705263157895e-05, "loss": 0.3429, "mean_copy_accuracy": 0.9935565739870071, "mean_gen_accuracy": 0.8559652715921402, "mean_token_accuracy": 0.8886211216449738, "num_tokens": 646294488.0, "sample_num_tokens": 8695.0, "step": 2385, "total_num_tokens": 646329268.0, "z_loss": 0.000909847323782742 }, { "copy_logits_max": -3.884396553039551, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.5625, "epoch": 0.48731171815164664, "gen_logits_max": 5.490509510040283, "gen_logits_mean": -13.44150161743164, "gen_logits_min": -26.285118103027344, "gen_logits_std": 2.6456871032714844, "gen_loss": 0.31909728050231934, "grad_norm": 0.5014548535367591, "learning_rate": 2.791578947368421e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9934939444065094, "mean_gen_accuracy": 0.8620738387107849, "mean_token_accuracy": 0.8972792327404022, "num_tokens": 646565200.0, "sample_num_tokens": 8570.5, "step": 2386, "total_num_tokens": 646599482.0, "z_loss": 0.0008470959728583694 }, { "copy_logits_max": -4.685436248779297, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.375, "epoch": 0.4875159560888435, "gen_logits_max": 6.710922718048096, "gen_logits_mean": -13.411031723022461, "gen_logits_min": -25.703500747680664, "gen_logits_std": 2.6361923217773438, "gen_loss": 0.3620970845222473, "grad_norm": 0.44381686549186034, "learning_rate": 2.7914526315789475e-05, "loss": 0.3426, "mean_copy_accuracy": 0.9934379458427429, "mean_gen_accuracy": 0.8535089641809464, "mean_token_accuracy": 0.8860084265470505, "num_tokens": 646831459.0, "sample_num_tokens": 8134.25, "step": 2387, "total_num_tokens": 646863996.0, "z_loss": 0.000937208766117692 }, { "copy_logits_max": -5.2279767990112305, "copy_logits_min": -687500032.0, "copy_num_tokens": 408.8125, "epoch": 0.48772019402604033, "gen_logits_max": 5.0190277099609375, "gen_logits_mean": -14.115745544433594, "gen_logits_min": -25.663803100585938, "gen_logits_std": 2.5387582778930664, "gen_loss": 0.3122483193874359, "grad_norm": 0.4812289096952412, "learning_rate": 2.7913263157894735e-05, "loss": 0.3483, "mean_copy_accuracy": 0.9908382296562195, "mean_gen_accuracy": 0.8599195778369904, "mean_token_accuracy": 0.8838615715503693, "num_tokens": 647074767.0, "sample_num_tokens": 7733.25, "step": 2388, "total_num_tokens": 647105700.0, "z_loss": 0.0007613091729581356 }, { "copy_logits_max": -5.09525203704834, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.125, "epoch": 0.48792443196323715, "gen_logits_max": 5.576549053192139, "gen_logits_mean": -14.03337574005127, "gen_logits_min": -25.591659545898438, "gen_logits_std": 2.5864851474761963, "gen_loss": 0.29479509592056274, "grad_norm": 0.5304036428879529, "learning_rate": 2.7912e-05, "loss": 0.3145, "mean_copy_accuracy": 0.9937439858913422, "mean_gen_accuracy": 0.8653481900691986, "mean_token_accuracy": 0.8962709307670593, "num_tokens": 647351377.0, "sample_num_tokens": 8411.25, "step": 2389, "total_num_tokens": 647385022.0, "z_loss": 0.0007729587377980351 }, { "copy_logits_max": -3.3234004974365234, "copy_logits_min": -750000064.0, "copy_num_tokens": 554.5, "epoch": 0.488128669900434, "gen_logits_max": 5.329425811767578, "gen_logits_mean": -13.336983680725098, "gen_logits_min": -25.28155517578125, "gen_logits_std": 2.574798107147217, "gen_loss": 0.3032343089580536, "grad_norm": 0.45545625600276396, "learning_rate": 2.791073684210526e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9950404763221741, "mean_gen_accuracy": 0.8549825549125671, "mean_token_accuracy": 0.8954121619462967, "num_tokens": 647643511.0, "sample_num_tokens": 8246.25, "step": 2390, "total_num_tokens": 647676496.0, "z_loss": 0.0008643885958008468 }, { "copy_logits_max": -8.17957878112793, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.6875, "epoch": 0.48833290783763084, "gen_logits_max": 8.296276092529297, "gen_logits_mean": -13.193050384521484, "gen_logits_min": -24.648225784301758, "gen_logits_std": 2.590144634246826, "gen_loss": 0.3050810694694519, "grad_norm": 0.48015684021826727, "learning_rate": 2.790947368421053e-05, "loss": 0.3189, "mean_copy_accuracy": 0.9924686104059219, "mean_gen_accuracy": 0.865330845117569, "mean_token_accuracy": 0.8939642757177353, "num_tokens": 647906507.0, "sample_num_tokens": 7882.75, "step": 2391, "total_num_tokens": 647938038.0, "z_loss": 0.0007909827982075512 }, { "copy_logits_max": -6.485845565795898, "copy_logits_min": -687500032.0, "copy_num_tokens": 693.625, "epoch": 0.48853714577482765, "gen_logits_max": 6.053036689758301, "gen_logits_mean": -14.215705871582031, "gen_logits_min": -26.318893432617188, "gen_logits_std": 2.6512656211853027, "gen_loss": 0.3250846266746521, "grad_norm": 0.9721279838765547, "learning_rate": 2.7908210526315793e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9947668164968491, "mean_gen_accuracy": 0.8569762706756592, "mean_token_accuracy": 0.8972198963165283, "num_tokens": 648188970.0, "sample_num_tokens": 10162.5, "step": 2392, "total_num_tokens": 648229620.0, "z_loss": 0.0008540033013559878 }, { "copy_logits_max": -5.5359954833984375, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.125, "epoch": 0.4887413837120245, "gen_logits_max": 7.319303512573242, "gen_logits_mean": -12.992486000061035, "gen_logits_min": -24.629552841186523, "gen_logits_std": 2.5686864852905273, "gen_loss": 0.34823885560035706, "grad_norm": 0.553197670479847, "learning_rate": 2.7906947368421054e-05, "loss": 0.3323, "mean_copy_accuracy": 0.9912361949682236, "mean_gen_accuracy": 0.8587649166584015, "mean_token_accuracy": 0.8913567215204239, "num_tokens": 648445912.0, "sample_num_tokens": 8780.0, "step": 2393, "total_num_tokens": 648481032.0, "z_loss": 0.0008360280771739781 }, { "copy_logits_max": -7.581136703491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 211.625, "epoch": 0.48894562164922134, "gen_logits_max": 6.325392723083496, "gen_logits_mean": -14.535913467407227, "gen_logits_min": -25.729084014892578, "gen_logits_std": 2.530348062515259, "gen_loss": 0.3314506411552429, "grad_norm": 0.4561311253362509, "learning_rate": 2.7905684210526318e-05, "loss": 0.3387, "mean_copy_accuracy": 0.9928591251373291, "mean_gen_accuracy": 0.8588996380567551, "mean_token_accuracy": 0.8893640786409378, "num_tokens": 648723591.0, "sample_num_tokens": 6849.25, "step": 2394, "total_num_tokens": 648750988.0, "z_loss": 0.000838039442896843 }, { "copy_logits_max": -4.262419700622559, "copy_logits_min": -687500032.0, "copy_num_tokens": 630.9375, "epoch": 0.48914985958641816, "gen_logits_max": 5.775615215301514, "gen_logits_mean": -13.193265914916992, "gen_logits_min": -25.148574829101562, "gen_logits_std": 2.6331067085266113, "gen_loss": 0.3136537969112396, "grad_norm": 0.43710074740401217, "learning_rate": 2.790442105263158e-05, "loss": 0.318, "mean_copy_accuracy": 0.9937377274036407, "mean_gen_accuracy": 0.8613183349370956, "mean_token_accuracy": 0.8974102735519409, "num_tokens": 648998170.0, "sample_num_tokens": 9816.0, "step": 2395, "total_num_tokens": 649037434.0, "z_loss": 0.000874777790158987 }, { "copy_logits_max": -5.255507469177246, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.25, "epoch": 0.48935409752361503, "gen_logits_max": 6.120915412902832, "gen_logits_mean": -14.768630027770996, "gen_logits_min": -26.188987731933594, "gen_logits_std": 2.534297466278076, "gen_loss": 0.3618171811103821, "grad_norm": 0.4444028712480175, "learning_rate": 2.7903157894736843e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9953088462352753, "mean_gen_accuracy": 0.8560776263475418, "mean_token_accuracy": 0.8910654485225677, "num_tokens": 649279549.0, "sample_num_tokens": 6328.75, "step": 2396, "total_num_tokens": 649304864.0, "z_loss": 0.0009005771717056632 }, { "copy_logits_max": -5.69854736328125, "copy_logits_min": -750000064.0, "copy_num_tokens": 368.8125, "epoch": 0.48955833546081184, "gen_logits_max": 6.2336835861206055, "gen_logits_mean": -13.260794639587402, "gen_logits_min": -24.801532745361328, "gen_logits_std": 2.6126012802124023, "gen_loss": 0.339751273393631, "grad_norm": 0.47260021266523844, "learning_rate": 2.7901894736842104e-05, "loss": 0.3056, "mean_copy_accuracy": 0.9947915971279144, "mean_gen_accuracy": 0.8642144352197647, "mean_token_accuracy": 0.8996293693780899, "num_tokens": 649546951.0, "sample_num_tokens": 7799.75, "step": 2397, "total_num_tokens": 649578150.0, "z_loss": 0.0008604573085904121 }, { "copy_logits_max": -6.295807361602783, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.375, "epoch": 0.48976257339800866, "gen_logits_max": 5.641003131866455, "gen_logits_mean": -13.567557334899902, "gen_logits_min": -25.077068328857422, "gen_logits_std": 2.5529870986938477, "gen_loss": 0.2880810499191284, "grad_norm": 0.454583158169722, "learning_rate": 2.790063157894737e-05, "loss": 0.3158, "mean_copy_accuracy": 0.9929544627666473, "mean_gen_accuracy": 0.8650971353054047, "mean_token_accuracy": 0.8972376883029938, "num_tokens": 649832041.0, "sample_num_tokens": 9157.75, "step": 2398, "total_num_tokens": 649868672.0, "z_loss": 0.0007075945613905787 }, { "copy_logits_max": -5.692096710205078, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.9375, "epoch": 0.48996681133520553, "gen_logits_max": 4.693446636199951, "gen_logits_mean": -14.083924293518066, "gen_logits_min": -25.619945526123047, "gen_logits_std": 2.51353120803833, "gen_loss": 0.3021259307861328, "grad_norm": 0.49370246088973213, "learning_rate": 2.7899368421052633e-05, "loss": 0.3366, "mean_copy_accuracy": 0.9921397864818573, "mean_gen_accuracy": 0.8597422242164612, "mean_token_accuracy": 0.8873285949230194, "num_tokens": 650087277.0, "sample_num_tokens": 8549.25, "step": 2399, "total_num_tokens": 650121474.0, "z_loss": 0.0006872392259538174 }, { "copy_logits_max": -4.2123188972473145, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.1875, "epoch": 0.49017104927240235, "gen_logits_max": 5.597916603088379, "gen_logits_mean": -13.235477447509766, "gen_logits_min": -25.054519653320312, "gen_logits_std": 2.5630605220794678, "gen_loss": 0.35763946175575256, "grad_norm": 0.4565248099768197, "learning_rate": 2.7898105263157897e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9940793663263321, "mean_gen_accuracy": 0.8542165160179138, "mean_token_accuracy": 0.8908701986074448, "num_tokens": 650349790.0, "sample_num_tokens": 7025.0, "step": 2400, "total_num_tokens": 650377890.0, "z_loss": 0.0008138107368722558 }, { "copy_logits_max": -7.680120944976807, "copy_logits_min": -750000128.0, "copy_num_tokens": 389.375, "epoch": 0.49037528720959916, "gen_logits_max": 5.255930423736572, "gen_logits_mean": -13.93453311920166, "gen_logits_min": -25.419069290161133, "gen_logits_std": 2.551427125930786, "gen_loss": 0.28077971935272217, "grad_norm": 0.4964731965858913, "learning_rate": 2.7896842105263158e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9925970584154129, "mean_gen_accuracy": 0.865314707159996, "mean_token_accuracy": 0.8981980234384537, "num_tokens": 650618901.0, "sample_num_tokens": 8508.25, "step": 2401, "total_num_tokens": 650652934.0, "z_loss": 0.0006943541811779141 }, { "copy_logits_max": -5.414329528808594, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.6875, "epoch": 0.49057952514679604, "gen_logits_max": 5.789671897888184, "gen_logits_mean": -13.284460067749023, "gen_logits_min": -24.761507034301758, "gen_logits_std": 2.5523183345794678, "gen_loss": 0.37434324622154236, "grad_norm": 0.6087591997166059, "learning_rate": 2.7895578947368422e-05, "loss": 0.341, "mean_copy_accuracy": 0.9930891394615173, "mean_gen_accuracy": 0.8544185608625412, "mean_token_accuracy": 0.8893239498138428, "num_tokens": 650891772.0, "sample_num_tokens": 8140.0, "step": 2402, "total_num_tokens": 650924332.0, "z_loss": 0.00085505994502455 }, { "copy_logits_max": -8.226079940795898, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.5, "epoch": 0.49078376308399285, "gen_logits_max": 5.304258346557617, "gen_logits_mean": -14.308794021606445, "gen_logits_min": -25.77585220336914, "gen_logits_std": 2.569171905517578, "gen_loss": 0.31664103269577026, "grad_norm": 0.4792848342501199, "learning_rate": 2.7894315789473683e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9938649386167526, "mean_gen_accuracy": 0.8624178916215897, "mean_token_accuracy": 0.8962999433279037, "num_tokens": 651148422.0, "sample_num_tokens": 8110.0, "step": 2403, "total_num_tokens": 651180862.0, "z_loss": 0.0007456688908860087 }, { "copy_logits_max": -5.49518346786499, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.3125, "epoch": 0.49098800102118967, "gen_logits_max": 5.23062801361084, "gen_logits_mean": -13.513955116271973, "gen_logits_min": -25.413379669189453, "gen_logits_std": 2.5902822017669678, "gen_loss": 0.33727124333381653, "grad_norm": 0.5296345360343255, "learning_rate": 2.7893052631578947e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9917012602090836, "mean_gen_accuracy": 0.8606021851301193, "mean_token_accuracy": 0.896055206656456, "num_tokens": 651419857.0, "sample_num_tokens": 7815.75, "step": 2404, "total_num_tokens": 651451120.0, "z_loss": 0.0008460868266411126 }, { "copy_logits_max": -4.661018371582031, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 0.49119223895838654, "gen_logits_max": 6.302724838256836, "gen_logits_mean": -12.211750984191895, "gen_logits_min": -23.764978408813477, "gen_logits_std": 2.5134243965148926, "gen_loss": 0.34696781635284424, "grad_norm": 0.5153592325431979, "learning_rate": 2.7891789473684212e-05, "loss": 0.3471, "mean_copy_accuracy": 0.9918007850646973, "mean_gen_accuracy": 0.8565957099199295, "mean_token_accuracy": 0.8865714967250824, "num_tokens": 651688784.0, "sample_num_tokens": 7496.5, "step": 2405, "total_num_tokens": 651718770.0, "z_loss": 0.0009775090729817748 }, { "copy_logits_max": -4.0660319328308105, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.0625, "epoch": 0.49139647689558336, "gen_logits_max": 5.255874156951904, "gen_logits_mean": -13.346969604492188, "gen_logits_min": -24.93352508544922, "gen_logits_std": 2.5514304637908936, "gen_loss": 0.32236558198928833, "grad_norm": 0.5072207057759445, "learning_rate": 2.7890526315789473e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9932654350996017, "mean_gen_accuracy": 0.8670177608728409, "mean_token_accuracy": 0.8984950184822083, "num_tokens": 651944222.0, "sample_num_tokens": 8873.5, "step": 2406, "total_num_tokens": 651979716.0, "z_loss": 0.000951515743508935 }, { "copy_logits_max": -5.589253902435303, "copy_logits_min": -687500032.0, "copy_num_tokens": 819.4375, "epoch": 0.4916007148327802, "gen_logits_max": 5.2159576416015625, "gen_logits_mean": -13.71039867401123, "gen_logits_min": -25.440027236938477, "gen_logits_std": 2.568596839904785, "gen_loss": 0.29587551951408386, "grad_norm": 0.4771390280920287, "learning_rate": 2.788926315789474e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9940523356199265, "mean_gen_accuracy": 0.8622086048126221, "mean_token_accuracy": 0.8966861218214035, "num_tokens": 652237326.0, "sample_num_tokens": 11481.5, "step": 2407, "total_num_tokens": 652283252.0, "z_loss": 0.0008757121977396309 }, { "copy_logits_max": -4.831879615783691, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.375, "epoch": 0.49180495276997704, "gen_logits_max": 6.713422775268555, "gen_logits_mean": -11.696584701538086, "gen_logits_min": -23.696252822875977, "gen_logits_std": 2.599062442779541, "gen_loss": 0.34990131855010986, "grad_norm": 0.559544715493529, "learning_rate": 2.7888e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9909283518791199, "mean_gen_accuracy": 0.8615744709968567, "mean_token_accuracy": 0.895235225558281, "num_tokens": 652534553.0, "sample_num_tokens": 8188.25, "step": 2408, "total_num_tokens": 652567306.0, "z_loss": 0.0010329001815989614 }, { "copy_logits_max": -3.993015766143799, "copy_logits_min": -750000064.0, "copy_num_tokens": 549.5, "epoch": 0.49200919070717386, "gen_logits_max": 5.7416672706604, "gen_logits_mean": -12.72197151184082, "gen_logits_min": -24.520267486572266, "gen_logits_std": 2.5703413486480713, "gen_loss": 0.31373876333236694, "grad_norm": 0.466594898059968, "learning_rate": 2.7886736842105266e-05, "loss": 0.3243, "mean_copy_accuracy": 0.9940458089113235, "mean_gen_accuracy": 0.8597131818532944, "mean_token_accuracy": 0.8948690444231033, "num_tokens": 652801938.0, "sample_num_tokens": 8966.0, "step": 2409, "total_num_tokens": 652837802.0, "z_loss": 0.0009840377606451511 }, { "copy_logits_max": -3.641608238220215, "copy_logits_min": -625000064.0, "copy_num_tokens": 617.6875, "epoch": 0.4922134286443707, "gen_logits_max": 5.935389041900635, "gen_logits_mean": -12.400856018066406, "gen_logits_min": -24.10854148864746, "gen_logits_std": 2.5465826988220215, "gen_loss": 0.2959672212600708, "grad_norm": 0.47663037224261745, "learning_rate": 2.7885473684210527e-05, "loss": 0.3268, "mean_copy_accuracy": 0.9936528950929642, "mean_gen_accuracy": 0.8609453588724136, "mean_token_accuracy": 0.892492026090622, "num_tokens": 653066696.0, "sample_num_tokens": 9406.5, "step": 2410, "total_num_tokens": 653104322.0, "z_loss": 0.0009961266769096255 }, { "copy_logits_max": -4.7129034996032715, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.875, "epoch": 0.49241766658156755, "gen_logits_max": 5.102982521057129, "gen_logits_mean": -13.216146469116211, "gen_logits_min": -24.993566513061523, "gen_logits_std": 2.563666343688965, "gen_loss": 0.27643901109695435, "grad_norm": 0.4624214923216961, "learning_rate": 2.788421052631579e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9944064021110535, "mean_gen_accuracy": 0.8552199304103851, "mean_token_accuracy": 0.8899580389261246, "num_tokens": 653339524.0, "sample_num_tokens": 9427.5, "step": 2411, "total_num_tokens": 653377234.0, "z_loss": 0.0007624451536685228 }, { "copy_logits_max": -6.462423324584961, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.1875, "epoch": 0.49262190451876436, "gen_logits_max": 5.312601566314697, "gen_logits_mean": -13.651569366455078, "gen_logits_min": -25.57500457763672, "gen_logits_std": 2.555870294570923, "gen_loss": 0.30914008617401123, "grad_norm": 0.5729858235616939, "learning_rate": 2.7882947368421052e-05, "loss": 0.3298, "mean_copy_accuracy": 0.9914543777704239, "mean_gen_accuracy": 0.8570109903812408, "mean_token_accuracy": 0.8904401510953903, "num_tokens": 653617337.0, "sample_num_tokens": 9227.75, "step": 2412, "total_num_tokens": 653654248.0, "z_loss": 0.0007385293720290065 }, { "copy_logits_max": -6.66011905670166, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.3125, "epoch": 0.4928261424559612, "gen_logits_max": 6.151857376098633, "gen_logits_mean": -11.410197257995605, "gen_logits_min": -23.081649780273438, "gen_logits_std": 2.5251011848449707, "gen_loss": 0.31575876474380493, "grad_norm": 0.4750897402465067, "learning_rate": 2.7881684210526316e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9923324584960938, "mean_gen_accuracy": 0.8680649846792221, "mean_token_accuracy": 0.8968252390623093, "num_tokens": 653906059.0, "sample_num_tokens": 9675.75, "step": 2413, "total_num_tokens": 653944762.0, "z_loss": 0.0007437808671966195 }, { "copy_logits_max": -5.780637264251709, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.875, "epoch": 0.49303038039315805, "gen_logits_max": 5.421076774597168, "gen_logits_mean": -14.255535125732422, "gen_logits_min": -25.815874099731445, "gen_logits_std": 2.5203542709350586, "gen_loss": 0.29986703395843506, "grad_norm": 0.4301025423068204, "learning_rate": 2.7880421052631577e-05, "loss": 0.3055, "mean_copy_accuracy": 0.9931048303842545, "mean_gen_accuracy": 0.871612623333931, "mean_token_accuracy": 0.8990786522626877, "num_tokens": 654185101.0, "sample_num_tokens": 8531.25, "step": 2414, "total_num_tokens": 654219226.0, "z_loss": 0.0007615276845172048 }, { "copy_logits_max": -4.0028300285339355, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.125, "epoch": 0.49323461833035487, "gen_logits_max": 5.325403213500977, "gen_logits_mean": -13.188030242919922, "gen_logits_min": -24.58236312866211, "gen_logits_std": 2.47989821434021, "gen_loss": 0.37136217951774597, "grad_norm": 0.4923412943429324, "learning_rate": 2.7879157894736845e-05, "loss": 0.3636, "mean_copy_accuracy": 0.993159830570221, "mean_gen_accuracy": 0.8467066884040833, "mean_token_accuracy": 0.8813717365264893, "num_tokens": 654442418.0, "sample_num_tokens": 7928.5, "step": 2415, "total_num_tokens": 654474132.0, "z_loss": 0.0008595485705882311 }, { "copy_logits_max": -4.375912666320801, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.375, "epoch": 0.4934388562675517, "gen_logits_max": 5.297682285308838, "gen_logits_mean": -12.067752838134766, "gen_logits_min": -24.37088394165039, "gen_logits_std": 2.5655181407928467, "gen_loss": 0.26886701583862305, "grad_norm": 0.5059497697870993, "learning_rate": 2.7877894736842106e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9925593882799149, "mean_gen_accuracy": 0.8691805601119995, "mean_token_accuracy": 0.8965318351984024, "num_tokens": 654690966.0, "sample_num_tokens": 9664.0, "step": 2416, "total_num_tokens": 654729622.0, "z_loss": 0.0007880337652750313 }, { "copy_logits_max": -2.2343387603759766, "copy_logits_min": -750000000.0, "copy_num_tokens": 761.75, "epoch": 0.49364309420474856, "gen_logits_max": 4.786940574645996, "gen_logits_mean": -12.120800018310547, "gen_logits_min": -24.593013763427734, "gen_logits_std": 2.574655771255493, "gen_loss": 0.29163312911987305, "grad_norm": 0.5265624822786974, "learning_rate": 2.787663157894737e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9940081238746643, "mean_gen_accuracy": 0.858862042427063, "mean_token_accuracy": 0.8965612500905991, "num_tokens": 654966880.0, "sample_num_tokens": 9289.5, "step": 2417, "total_num_tokens": 655004038.0, "z_loss": 0.0009327419684268534 }, { "copy_logits_max": -3.4342284202575684, "copy_logits_min": -625000064.0, "copy_num_tokens": 519.625, "epoch": 0.49384733214194537, "gen_logits_max": 5.051065444946289, "gen_logits_mean": -14.156976699829102, "gen_logits_min": -26.08245086669922, "gen_logits_std": 2.5867857933044434, "gen_loss": 0.2849963903427124, "grad_norm": 0.49854173448081995, "learning_rate": 2.7875368421052634e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9934129416942596, "mean_gen_accuracy": 0.8697621822357178, "mean_token_accuracy": 0.9037512987852097, "num_tokens": 655280325.0, "sample_num_tokens": 8827.25, "step": 2418, "total_num_tokens": 655315634.0, "z_loss": 0.0008427794673480093 }, { "copy_logits_max": -4.707254409790039, "copy_logits_min": -687500032.0, "copy_num_tokens": 574.625, "epoch": 0.4940515700791422, "gen_logits_max": 5.5626115798950195, "gen_logits_mean": -12.601387023925781, "gen_logits_min": -24.510711669921875, "gen_logits_std": 2.547244071960449, "gen_loss": 0.32222628593444824, "grad_norm": 0.48031729189569494, "learning_rate": 2.7874105263157895e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9925019890069962, "mean_gen_accuracy": 0.8521585613489151, "mean_token_accuracy": 0.8910799622535706, "num_tokens": 655569883.0, "sample_num_tokens": 9102.25, "step": 2419, "total_num_tokens": 655606292.0, "z_loss": 0.0007840674370527267 }, { "copy_logits_max": -5.931041240692139, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.25, "epoch": 0.49425580801633906, "gen_logits_max": 6.406098365783691, "gen_logits_mean": -13.63589859008789, "gen_logits_min": -25.005226135253906, "gen_logits_std": 2.522646903991699, "gen_loss": 0.34826788306236267, "grad_norm": 0.4908726308618517, "learning_rate": 2.787284210526316e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9916049987077713, "mean_gen_accuracy": 0.8691523373126984, "mean_token_accuracy": 0.8959167003631592, "num_tokens": 655847198.0, "sample_num_tokens": 7769.5, "step": 2420, "total_num_tokens": 655878276.0, "z_loss": 0.0008414897019974887 }, { "copy_logits_max": -5.4178667068481445, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.875, "epoch": 0.4944600459535359, "gen_logits_max": 5.380260467529297, "gen_logits_mean": -12.383816719055176, "gen_logits_min": -24.247840881347656, "gen_logits_std": 2.5287132263183594, "gen_loss": 0.29713964462280273, "grad_norm": 0.47261961202425945, "learning_rate": 2.787157894736842e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9935929179191589, "mean_gen_accuracy": 0.8637707382440567, "mean_token_accuracy": 0.8936881870031357, "num_tokens": 656110524.0, "sample_num_tokens": 7905.5, "step": 2421, "total_num_tokens": 656142146.0, "z_loss": 0.0008029852760955691 }, { "copy_logits_max": -2.208979368209839, "copy_logits_min": -625000064.0, "copy_num_tokens": 593.25, "epoch": 0.4946642838907327, "gen_logits_max": 5.419285774230957, "gen_logits_mean": -12.057547569274902, "gen_logits_min": -23.275238037109375, "gen_logits_std": 2.467179775238037, "gen_loss": 0.3281710147857666, "grad_norm": 0.472988257241511, "learning_rate": 2.7870315789473685e-05, "loss": 0.3242, "mean_copy_accuracy": 0.992814764380455, "mean_gen_accuracy": 0.8605499416589737, "mean_token_accuracy": 0.8931479901075363, "num_tokens": 656396537.0, "sample_num_tokens": 9236.25, "step": 2422, "total_num_tokens": 656433482.0, "z_loss": 0.0009085620986297727 }, { "copy_logits_max": -6.006036758422852, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.6875, "epoch": 0.49486852182792956, "gen_logits_max": 6.342430591583252, "gen_logits_mean": -12.40488052368164, "gen_logits_min": -24.37470245361328, "gen_logits_std": 2.542865037918091, "gen_loss": 0.3287146985530853, "grad_norm": 0.44386116532670356, "learning_rate": 2.7869052631578946e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9922329783439636, "mean_gen_accuracy": 0.8642405867576599, "mean_token_accuracy": 0.8955370783805847, "num_tokens": 656671874.0, "sample_num_tokens": 7959.5, "step": 2423, "total_num_tokens": 656703712.0, "z_loss": 0.0008580756257288158 }, { "copy_logits_max": -5.609702110290527, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.75, "epoch": 0.4950727597651264, "gen_logits_max": 5.298728942871094, "gen_logits_mean": -12.927085876464844, "gen_logits_min": -24.088134765625, "gen_logits_std": 2.4579572677612305, "gen_loss": 0.3018409013748169, "grad_norm": 0.6106006430031932, "learning_rate": 2.7867789473684213e-05, "loss": 0.3228, "mean_copy_accuracy": 0.9918487817049026, "mean_gen_accuracy": 0.8595166802406311, "mean_token_accuracy": 0.8924203366041183, "num_tokens": 656920209.0, "sample_num_tokens": 9550.25, "step": 2424, "total_num_tokens": 656958410.0, "z_loss": 0.0007485163514502347 }, { "copy_logits_max": -4.392014503479004, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.0625, "epoch": 0.4952769977023232, "gen_logits_max": 5.247442245483398, "gen_logits_mean": -13.773962020874023, "gen_logits_min": -25.434776306152344, "gen_logits_std": 2.5404739379882812, "gen_loss": 0.31754201650619507, "grad_norm": 0.4595318765081504, "learning_rate": 2.7866526315789474e-05, "loss": 0.3426, "mean_copy_accuracy": 0.9934678971767426, "mean_gen_accuracy": 0.8533560335636139, "mean_token_accuracy": 0.888562873005867, "num_tokens": 657186686.0, "sample_num_tokens": 7593.5, "step": 2425, "total_num_tokens": 657217060.0, "z_loss": 0.0008701534825377166 }, { "copy_logits_max": -3.624229907989502, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.5625, "epoch": 0.49548123563952007, "gen_logits_max": 4.831395149230957, "gen_logits_mean": -12.83157730102539, "gen_logits_min": -24.749176025390625, "gen_logits_std": 2.526287317276001, "gen_loss": 0.33255285024642944, "grad_norm": 0.4155493563444963, "learning_rate": 2.786526315789474e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9939403235912323, "mean_gen_accuracy": 0.8627328425645828, "mean_token_accuracy": 0.8974149525165558, "num_tokens": 657472896.0, "sample_num_tokens": 8514.5, "step": 2426, "total_num_tokens": 657506954.0, "z_loss": 0.0009259788203053176 }, { "copy_logits_max": -5.878634452819824, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.125, "epoch": 0.4956854735767169, "gen_logits_max": 6.054654121398926, "gen_logits_mean": -11.923334121704102, "gen_logits_min": -23.064849853515625, "gen_logits_std": 2.4544966220855713, "gen_loss": 0.33584898710250854, "grad_norm": 0.4664670894791724, "learning_rate": 2.7864e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9957572221755981, "mean_gen_accuracy": 0.8585246950387955, "mean_token_accuracy": 0.8923630118370056, "num_tokens": 657748610.0, "sample_num_tokens": 7576.0, "step": 2427, "total_num_tokens": 657778914.0, "z_loss": 0.0009086000500246882 }, { "copy_logits_max": -5.08305549621582, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.0, "epoch": 0.4958897115139137, "gen_logits_max": 5.587145805358887, "gen_logits_mean": -12.791807174682617, "gen_logits_min": -24.03288459777832, "gen_logits_std": 2.4467225074768066, "gen_loss": 0.3090135455131531, "grad_norm": 0.4176456201157209, "learning_rate": 2.7862736842105264e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9934541881084442, "mean_gen_accuracy": 0.8622017800807953, "mean_token_accuracy": 0.8973845690488815, "num_tokens": 658016836.0, "sample_num_tokens": 7813.5, "step": 2428, "total_num_tokens": 658048090.0, "z_loss": 0.0008661900646984577 }, { "copy_logits_max": -5.21397590637207, "copy_logits_min": -750000000.0, "copy_num_tokens": 616.25, "epoch": 0.4960939494511105, "gen_logits_max": 5.802270889282227, "gen_logits_mean": -12.845367431640625, "gen_logits_min": -24.120018005371094, "gen_logits_std": 2.4965062141418457, "gen_loss": 0.3060225248336792, "grad_norm": 0.4795974341897042, "learning_rate": 2.7861473684210525e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9919461309909821, "mean_gen_accuracy": 0.8651039302349091, "mean_token_accuracy": 0.8986336439847946, "num_tokens": 658308025.0, "sample_num_tokens": 10124.25, "step": 2429, "total_num_tokens": 658348522.0, "z_loss": 0.0009269908186979592 }, { "copy_logits_max": -5.140878200531006, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.0625, "epoch": 0.4962981873883074, "gen_logits_max": 5.491453170776367, "gen_logits_mean": -13.802082061767578, "gen_logits_min": -25.104331970214844, "gen_logits_std": 2.5130841732025146, "gen_loss": 0.2968756854534149, "grad_norm": 0.4460845151627045, "learning_rate": 2.786021052631579e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9935626089572906, "mean_gen_accuracy": 0.8756901770830154, "mean_token_accuracy": 0.9004850387573242, "num_tokens": 658570828.0, "sample_num_tokens": 7829.5, "step": 2430, "total_num_tokens": 658602146.0, "z_loss": 0.0008399750804528594 }, { "copy_logits_max": -3.893228530883789, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.0, "epoch": 0.4965024253255042, "gen_logits_max": 5.498262405395508, "gen_logits_mean": -11.807998657226562, "gen_logits_min": -23.41909408569336, "gen_logits_std": 2.470794677734375, "gen_loss": 0.3109956979751587, "grad_norm": 0.435852402079418, "learning_rate": 2.7858947368421053e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9952729493379593, "mean_gen_accuracy": 0.8685133308172226, "mean_token_accuracy": 0.903274193406105, "num_tokens": 658835848.0, "sample_num_tokens": 9439.0, "step": 2431, "total_num_tokens": 658873604.0, "z_loss": 0.0009785511065274477 }, { "copy_logits_max": -7.234477996826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.1875, "epoch": 0.496706663262701, "gen_logits_max": 5.53673791885376, "gen_logits_mean": -12.837808609008789, "gen_logits_min": -24.309656143188477, "gen_logits_std": 2.5005416870117188, "gen_loss": 0.29263556003570557, "grad_norm": 0.43112295699225345, "learning_rate": 2.7857684210526318e-05, "loss": 0.3047, "mean_copy_accuracy": 0.9943777322769165, "mean_gen_accuracy": 0.8675611019134521, "mean_token_accuracy": 0.8999671339988708, "num_tokens": 659145987.0, "sample_num_tokens": 10200.25, "step": 2432, "total_num_tokens": 659186788.0, "z_loss": 0.0008074625511653721 }, { "copy_logits_max": -5.436544418334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.1875, "epoch": 0.4969109011998979, "gen_logits_max": 6.020018100738525, "gen_logits_mean": -11.600430488586426, "gen_logits_min": -22.973373413085938, "gen_logits_std": 2.4809818267822266, "gen_loss": 0.3074329197406769, "grad_norm": 0.48701539818359174, "learning_rate": 2.7856421052631582e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9923193752765656, "mean_gen_accuracy": 0.8653667867183685, "mean_token_accuracy": 0.8940293043851852, "num_tokens": 659411369.0, "sample_num_tokens": 7458.75, "step": 2433, "total_num_tokens": 659441204.0, "z_loss": 0.0008979797130450606 }, { "copy_logits_max": -3.62160587310791, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.3125, "epoch": 0.4971151391370947, "gen_logits_max": 5.10334587097168, "gen_logits_mean": -12.30068588256836, "gen_logits_min": -23.631961822509766, "gen_logits_std": 2.4335811138153076, "gen_loss": 0.3347024917602539, "grad_norm": 0.43168518101266456, "learning_rate": 2.7855157894736843e-05, "loss": 0.3263, "mean_copy_accuracy": 0.9932636469602585, "mean_gen_accuracy": 0.8625912070274353, "mean_token_accuracy": 0.8944379091262817, "num_tokens": 659687873.0, "sample_num_tokens": 9157.25, "step": 2434, "total_num_tokens": 659724502.0, "z_loss": 0.0008987125474959612 }, { "copy_logits_max": -5.567971229553223, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.9375, "epoch": 0.4973193770742915, "gen_logits_max": 5.3352437019348145, "gen_logits_mean": -14.107524871826172, "gen_logits_min": -25.33978271484375, "gen_logits_std": 2.4476537704467773, "gen_loss": 0.30212539434432983, "grad_norm": 0.4600001387857078, "learning_rate": 2.7853894736842107e-05, "loss": 0.2972, "mean_copy_accuracy": 0.9926488548517227, "mean_gen_accuracy": 0.8708081692457199, "mean_token_accuracy": 0.9010825306177139, "num_tokens": 659980984.0, "sample_num_tokens": 9539.5, "step": 2435, "total_num_tokens": 660019142.0, "z_loss": 0.000805521965958178 }, { "copy_logits_max": -4.090087413787842, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.1875, "epoch": 0.4975236150114884, "gen_logits_max": 5.6094512939453125, "gen_logits_mean": -13.94965934753418, "gen_logits_min": -25.18047332763672, "gen_logits_std": 2.4805166721343994, "gen_loss": 0.372728168964386, "grad_norm": 0.4446693608877752, "learning_rate": 2.7852631578947368e-05, "loss": 0.325, "mean_copy_accuracy": 0.9941516369581223, "mean_gen_accuracy": 0.857845664024353, "mean_token_accuracy": 0.8927429914474487, "num_tokens": 660267696.0, "sample_num_tokens": 7534.0, "step": 2436, "total_num_tokens": 660297832.0, "z_loss": 0.0009310031309723854 }, { "copy_logits_max": -2.9299817085266113, "copy_logits_min": -687500032.0, "copy_num_tokens": 675.625, "epoch": 0.4977278529486852, "gen_logits_max": 5.394002914428711, "gen_logits_mean": -12.602827072143555, "gen_logits_min": -24.51097869873047, "gen_logits_std": 2.52628755569458, "gen_loss": 0.3163660168647766, "grad_norm": 0.5346847969680473, "learning_rate": 2.7851368421052632e-05, "loss": 0.3182, "mean_copy_accuracy": 0.994172677397728, "mean_gen_accuracy": 0.8622127026319504, "mean_token_accuracy": 0.8958079963922501, "num_tokens": 660536171.0, "sample_num_tokens": 9466.25, "step": 2437, "total_num_tokens": 660574036.0, "z_loss": 0.0008527957834303379 }, { "copy_logits_max": -6.216856479644775, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.125, "epoch": 0.49793209088588203, "gen_logits_max": 6.158562660217285, "gen_logits_mean": -13.148846626281738, "gen_logits_min": -24.919416427612305, "gen_logits_std": 2.530163049697876, "gen_loss": 0.3108198046684265, "grad_norm": 0.45915816783104096, "learning_rate": 2.7850105263157893e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9930855333805084, "mean_gen_accuracy": 0.8650542795658112, "mean_token_accuracy": 0.8943367451429367, "num_tokens": 660791416.0, "sample_num_tokens": 8488.5, "step": 2438, "total_num_tokens": 660825370.0, "z_loss": 0.0008175986586138606 }, { "copy_logits_max": -4.839881420135498, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.9375, "epoch": 0.4981363288230789, "gen_logits_max": 5.462355613708496, "gen_logits_mean": -12.762739181518555, "gen_logits_min": -24.108116149902344, "gen_logits_std": 2.477257490158081, "gen_loss": 0.3127027153968811, "grad_norm": 0.47901119679562293, "learning_rate": 2.7848842105263158e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9918550103902817, "mean_gen_accuracy": 0.8711179047822952, "mean_token_accuracy": 0.8993102312088013, "num_tokens": 661045875.0, "sample_num_tokens": 9780.25, "step": 2439, "total_num_tokens": 661084996.0, "z_loss": 0.0008132217917591333 }, { "copy_logits_max": -5.843810081481934, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.0, "epoch": 0.4983405667602757, "gen_logits_max": 6.6584296226501465, "gen_logits_mean": -12.500587463378906, "gen_logits_min": -24.059680938720703, "gen_logits_std": 2.5226361751556396, "gen_loss": 0.31808871030807495, "grad_norm": 0.43463482040540835, "learning_rate": 2.7847578947368422e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9928574562072754, "mean_gen_accuracy": 0.8648199737071991, "mean_token_accuracy": 0.8952002972364426, "num_tokens": 661325527.0, "sample_num_tokens": 8812.75, "step": 2440, "total_num_tokens": 661360778.0, "z_loss": 0.0007423011120408773 }, { "copy_logits_max": -5.021642208099365, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.625, "epoch": 0.49854480469747253, "gen_logits_max": 5.067112922668457, "gen_logits_mean": -13.166128158569336, "gen_logits_min": -25.09758186340332, "gen_logits_std": 2.5283055305480957, "gen_loss": 0.3269926607608795, "grad_norm": 0.5307025360369882, "learning_rate": 2.7846315789473686e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9907456338405609, "mean_gen_accuracy": 0.8604319542646408, "mean_token_accuracy": 0.8925700187683105, "num_tokens": 661588776.0, "sample_num_tokens": 8412.0, "step": 2441, "total_num_tokens": 661622424.0, "z_loss": 0.0007689401973038912 }, { "copy_logits_max": -5.627669334411621, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.875, "epoch": 0.4987490426346694, "gen_logits_max": 5.715821743011475, "gen_logits_mean": -13.361889839172363, "gen_logits_min": -25.118160247802734, "gen_logits_std": 2.5502655506134033, "gen_loss": 0.26957571506500244, "grad_norm": 0.46300864543442244, "learning_rate": 2.7845052631578947e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9928568601608276, "mean_gen_accuracy": 0.8671037256717682, "mean_token_accuracy": 0.8937508910894394, "num_tokens": 661842284.0, "sample_num_tokens": 8026.0, "step": 2442, "total_num_tokens": 661874388.0, "z_loss": 0.0006979256868362427 }, { "copy_logits_max": -5.369768142700195, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.1875, "epoch": 0.4989532805718662, "gen_logits_max": 6.756263256072998, "gen_logits_mean": -13.640715599060059, "gen_logits_min": -25.274553298950195, "gen_logits_std": 2.5012426376342773, "gen_loss": 0.32579612731933594, "grad_norm": 0.4956388282157504, "learning_rate": 2.784378947368421e-05, "loss": 0.3311, "mean_copy_accuracy": 0.9937330931425095, "mean_gen_accuracy": 0.863946869969368, "mean_token_accuracy": 0.8941561430692673, "num_tokens": 662100967.0, "sample_num_tokens": 7482.25, "step": 2443, "total_num_tokens": 662130896.0, "z_loss": 0.0007189037860371172 }, { "copy_logits_max": -5.555000305175781, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.6875, "epoch": 0.49915751850906304, "gen_logits_max": 5.746582508087158, "gen_logits_mean": -13.124375343322754, "gen_logits_min": -24.598495483398438, "gen_logits_std": 2.5027005672454834, "gen_loss": 0.2992359399795532, "grad_norm": 0.43993703363869135, "learning_rate": 2.7842526315789476e-05, "loss": 0.3092, "mean_copy_accuracy": 0.992946982383728, "mean_gen_accuracy": 0.8679507225751877, "mean_token_accuracy": 0.898158997297287, "num_tokens": 662369633.0, "sample_num_tokens": 8871.75, "step": 2444, "total_num_tokens": 662405120.0, "z_loss": 0.0006838389090262353 }, { "copy_logits_max": -4.887293815612793, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.0, "epoch": 0.4993617564462599, "gen_logits_max": 5.8320770263671875, "gen_logits_mean": -13.24795150756836, "gen_logits_min": -24.720617294311523, "gen_logits_std": 2.480011224746704, "gen_loss": 0.34243494272232056, "grad_norm": 0.44393086152529754, "learning_rate": 2.7841263157894737e-05, "loss": 0.3502, "mean_copy_accuracy": 0.9939358830451965, "mean_gen_accuracy": 0.8508968651294708, "mean_token_accuracy": 0.8856618404388428, "num_tokens": 662647090.0, "sample_num_tokens": 9023.5, "step": 2445, "total_num_tokens": 662683184.0, "z_loss": 0.0007555976044386625 }, { "copy_logits_max": -1.4331564903259277, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.3125, "epoch": 0.4995659943834567, "gen_logits_max": 6.464682102203369, "gen_logits_mean": -13.6063232421875, "gen_logits_min": -25.483715057373047, "gen_logits_std": 2.5717179775238037, "gen_loss": 0.30796197056770325, "grad_norm": 0.4725026102505978, "learning_rate": 2.784e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9927703589200974, "mean_gen_accuracy": 0.8693430423736572, "mean_token_accuracy": 0.9012558907270432, "num_tokens": 662905972.0, "sample_num_tokens": 7087.0, "step": 2446, "total_num_tokens": 662934320.0, "z_loss": 0.0007666329038329422 }, { "copy_logits_max": -4.979663848876953, "copy_logits_min": -687500032.0, "copy_num_tokens": 271.5, "epoch": 0.49977023232065354, "gen_logits_max": 6.096824645996094, "gen_logits_mean": -13.409078598022461, "gen_logits_min": -24.45357894897461, "gen_logits_std": 2.4666738510131836, "gen_loss": 0.3293909430503845, "grad_norm": 0.4772539279136602, "learning_rate": 2.7838736842105262e-05, "loss": 0.322, "mean_copy_accuracy": 0.9926946014165878, "mean_gen_accuracy": 0.8650480806827545, "mean_token_accuracy": 0.8931227177381516, "num_tokens": 663160682.0, "sample_num_tokens": 6673.5, "step": 2447, "total_num_tokens": 663187376.0, "z_loss": 0.0008253215928561985 }, { "copy_logits_max": -2.1568970680236816, "copy_logits_min": -625000064.0, "copy_num_tokens": 674.75, "epoch": 0.4999744702578504, "gen_logits_max": 5.426912307739258, "gen_logits_mean": -13.364087104797363, "gen_logits_min": -24.961355209350586, "gen_logits_std": 2.5220465660095215, "gen_loss": 0.30665427446365356, "grad_norm": 0.433538015279514, "learning_rate": 2.783747368421053e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9938173294067383, "mean_gen_accuracy": 0.861554354429245, "mean_token_accuracy": 0.8896666169166565, "num_tokens": 663402429.0, "sample_num_tokens": 10143.75, "step": 2448, "total_num_tokens": 663443004.0, "z_loss": 0.0008858561632223427 }, { "copy_logits_max": -6.340149879455566, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.4375, "epoch": 0.5001787081950473, "gen_logits_max": 5.483790397644043, "gen_logits_mean": -14.310245513916016, "gen_logits_min": -25.197694778442383, "gen_logits_std": 2.429105281829834, "gen_loss": 0.3169878125190735, "grad_norm": 0.4769002345991878, "learning_rate": 2.783621052631579e-05, "loss": 0.345, "mean_copy_accuracy": 0.9925181865692139, "mean_gen_accuracy": 0.8576645851135254, "mean_token_accuracy": 0.8880662769079208, "num_tokens": 663676527.0, "sample_num_tokens": 8179.25, "step": 2449, "total_num_tokens": 663709244.0, "z_loss": 0.0007494314340874553 }, { "copy_logits_max": -5.0250630378723145, "copy_logits_min": -687500032.0, "copy_num_tokens": 538.75, "epoch": 0.5003829461322441, "gen_logits_max": 6.291226387023926, "gen_logits_mean": -13.275906562805176, "gen_logits_min": -24.823240280151367, "gen_logits_std": 2.514803409576416, "gen_loss": 0.27087506651878357, "grad_norm": 0.8299295149526276, "learning_rate": 2.7834947368421055e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9937775731086731, "mean_gen_accuracy": 0.86197429895401, "mean_token_accuracy": 0.8966791182756424, "num_tokens": 663962420.0, "sample_num_tokens": 8755.0, "step": 2450, "total_num_tokens": 663997440.0, "z_loss": 0.0007536851335316896 }, { "copy_logits_max": -2.393833637237549, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.375, "epoch": 0.5005871840694409, "gen_logits_max": 4.8494553565979, "gen_logits_mean": -14.14129638671875, "gen_logits_min": -26.116928100585938, "gen_logits_std": 2.5473122596740723, "gen_loss": 0.3054211139678955, "grad_norm": 0.4624984378840601, "learning_rate": 2.7833684210526316e-05, "loss": 0.3273, "mean_copy_accuracy": 0.9936877638101578, "mean_gen_accuracy": 0.8615586906671524, "mean_token_accuracy": 0.8926785588264465, "num_tokens": 664231367.0, "sample_num_tokens": 7564.25, "step": 2451, "total_num_tokens": 664261624.0, "z_loss": 0.0008594801765866578 }, { "copy_logits_max": -4.962014675140381, "copy_logits_min": -750000064.0, "copy_num_tokens": 412.125, "epoch": 0.5007914220066377, "gen_logits_max": 6.478939533233643, "gen_logits_mean": -13.641327857971191, "gen_logits_min": -25.014583587646484, "gen_logits_std": 2.51603102684021, "gen_loss": 0.35352468490600586, "grad_norm": 0.44865465201128113, "learning_rate": 2.783242105263158e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9933740943670273, "mean_gen_accuracy": 0.8684585392475128, "mean_token_accuracy": 0.9002309888601303, "num_tokens": 664512023.0, "sample_num_tokens": 7836.75, "step": 2452, "total_num_tokens": 664543370.0, "z_loss": 0.0008873544866219163 }, { "copy_logits_max": -5.220211029052734, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.3125, "epoch": 0.5009956599438345, "gen_logits_max": 5.881960868835449, "gen_logits_mean": -13.016687393188477, "gen_logits_min": -24.304800033569336, "gen_logits_std": 2.490055799484253, "gen_loss": 0.3578278422355652, "grad_norm": 0.6791421589073473, "learning_rate": 2.783115789473684e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9926303327083588, "mean_gen_accuracy": 0.8538884371519089, "mean_token_accuracy": 0.894084170460701, "num_tokens": 664782973.0, "sample_num_tokens": 7450.75, "step": 2453, "total_num_tokens": 664812776.0, "z_loss": 0.0008045362774282694 }, { "copy_logits_max": -3.221968412399292, "copy_logits_min": -750000000.0, "copy_num_tokens": 611.3125, "epoch": 0.5011998978810314, "gen_logits_max": 5.1828460693359375, "gen_logits_mean": -12.272462844848633, "gen_logits_min": -24.045604705810547, "gen_logits_std": 2.513822555541992, "gen_loss": 0.2992088496685028, "grad_norm": 0.43060681764858655, "learning_rate": 2.7829894736842105e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9942411631345749, "mean_gen_accuracy": 0.8608412891626358, "mean_token_accuracy": 0.8919951915740967, "num_tokens": 665044508.0, "sample_num_tokens": 9035.0, "step": 2454, "total_num_tokens": 665080648.0, "z_loss": 0.0009812931530177593 }, { "copy_logits_max": -3.5551958084106445, "copy_logits_min": -750000128.0, "copy_num_tokens": 377.125, "epoch": 0.5014041358182282, "gen_logits_max": 6.082136154174805, "gen_logits_mean": -12.795377731323242, "gen_logits_min": -24.3978271484375, "gen_logits_std": 2.4696459770202637, "gen_loss": 0.34157249331474304, "grad_norm": 0.5245052001825419, "learning_rate": 2.7828631578947366e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9920555502176285, "mean_gen_accuracy": 0.8606217950582504, "mean_token_accuracy": 0.893402099609375, "num_tokens": 665327156.0, "sample_num_tokens": 8263.0, "step": 2455, "total_num_tokens": 665360208.0, "z_loss": 0.0010859081521630287 }, { "copy_logits_max": -1.467268943786621, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.5625, "epoch": 0.5016083737554251, "gen_logits_max": 7.421014785766602, "gen_logits_mean": -11.219141006469727, "gen_logits_min": -23.03424835205078, "gen_logits_std": 2.5412864685058594, "gen_loss": 0.390683650970459, "grad_norm": 0.47419841199551693, "learning_rate": 2.7827368421052634e-05, "loss": 0.3515, "mean_copy_accuracy": 0.9928404539823532, "mean_gen_accuracy": 0.8528696298599243, "mean_token_accuracy": 0.8856041878461838, "num_tokens": 665581782.0, "sample_num_tokens": 8234.0, "step": 2456, "total_num_tokens": 665614718.0, "z_loss": 0.00156414567027241 }, { "copy_logits_max": -3.3989109992980957, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.125, "epoch": 0.5018126116926219, "gen_logits_max": 6.1943182945251465, "gen_logits_mean": -13.33508586883545, "gen_logits_min": -25.050640106201172, "gen_logits_std": 2.508509635925293, "gen_loss": 0.31640535593032837, "grad_norm": 0.6821220487613249, "learning_rate": 2.7826105263157895e-05, "loss": 0.3382, "mean_copy_accuracy": 0.9931178539991379, "mean_gen_accuracy": 0.8572421818971634, "mean_token_accuracy": 0.8912783712148666, "num_tokens": 665854378.0, "sample_num_tokens": 8100.0, "step": 2457, "total_num_tokens": 665886778.0, "z_loss": 0.0012485480401664972 }, { "copy_logits_max": -1.5816596746444702, "copy_logits_min": -687500032.0, "copy_num_tokens": 553.0625, "epoch": 0.5020168496298187, "gen_logits_max": 6.485368728637695, "gen_logits_mean": -12.929350852966309, "gen_logits_min": -24.217504501342773, "gen_logits_std": 2.5029842853546143, "gen_loss": 0.29300570487976074, "grad_norm": 0.7477774819851697, "learning_rate": 2.782484210526316e-05, "loss": 0.3311, "mean_copy_accuracy": 0.9931895285844803, "mean_gen_accuracy": 0.8624358773231506, "mean_token_accuracy": 0.8943201154470444, "num_tokens": 666123740.0, "sample_num_tokens": 8679.0, "step": 2458, "total_num_tokens": 666158456.0, "z_loss": 0.0013066595420241356 }, { "copy_logits_max": -2.139849901199341, "copy_logits_min": -687500032.0, "copy_num_tokens": 335.75, "epoch": 0.5022210875670156, "gen_logits_max": 6.337846279144287, "gen_logits_mean": -13.131312370300293, "gen_logits_min": -24.59071159362793, "gen_logits_std": 2.529445171356201, "gen_loss": 0.3275114893913269, "grad_norm": 0.4817203330241526, "learning_rate": 2.7823578947368424e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9927504807710648, "mean_gen_accuracy": 0.864492878317833, "mean_token_accuracy": 0.8943342715501785, "num_tokens": 666409539.0, "sample_num_tokens": 7570.25, "step": 2459, "total_num_tokens": 666439820.0, "z_loss": 0.0010985906701534986 }, { "copy_logits_max": -1.695114016532898, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.375, "epoch": 0.5024253255042124, "gen_logits_max": 5.885343074798584, "gen_logits_mean": -14.051642417907715, "gen_logits_min": -25.79772186279297, "gen_logits_std": 2.565326690673828, "gen_loss": 0.31780242919921875, "grad_norm": 0.5103255896282407, "learning_rate": 2.7822315789473685e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9924765378236771, "mean_gen_accuracy": 0.8647449463605881, "mean_token_accuracy": 0.8961830884218216, "num_tokens": 666664432.0, "sample_num_tokens": 7643.0, "step": 2460, "total_num_tokens": 666695004.0, "z_loss": 0.0010220310650765896 }, { "copy_logits_max": -1.5543817281723022, "copy_logits_min": -625000064.0, "copy_num_tokens": 436.5, "epoch": 0.5026295634414092, "gen_logits_max": 6.015528678894043, "gen_logits_mean": -13.322487831115723, "gen_logits_min": -24.821151733398438, "gen_logits_std": 2.4962313175201416, "gen_loss": 0.3191210925579071, "grad_norm": 0.4796587524744043, "learning_rate": 2.782105263157895e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9934016913175583, "mean_gen_accuracy": 0.8659142106771469, "mean_token_accuracy": 0.893630638718605, "num_tokens": 666922609.0, "sample_num_tokens": 7855.75, "step": 2461, "total_num_tokens": 666954032.0, "z_loss": 0.0009336314396932721 }, { "copy_logits_max": -3.757364511489868, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.5625, "epoch": 0.5028338013786061, "gen_logits_max": 6.286755084991455, "gen_logits_mean": -12.53372573852539, "gen_logits_min": -24.405494689941406, "gen_logits_std": 2.5141611099243164, "gen_loss": 0.31405699253082275, "grad_norm": 0.43726166066421196, "learning_rate": 2.781978947368421e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9933752119541168, "mean_gen_accuracy": 0.8686324656009674, "mean_token_accuracy": 0.900736927986145, "num_tokens": 667201536.0, "sample_num_tokens": 8070.5, "step": 2462, "total_num_tokens": 667233818.0, "z_loss": 0.0008172388770617545 }, { "copy_logits_max": -4.616742134094238, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.625, "epoch": 0.5030380393158029, "gen_logits_max": 6.221920967102051, "gen_logits_mean": -14.516597747802734, "gen_logits_min": -25.691762924194336, "gen_logits_std": 2.509693145751953, "gen_loss": 0.358330100774765, "grad_norm": 0.588436184833032, "learning_rate": 2.7818526315789474e-05, "loss": 0.3493, "mean_copy_accuracy": 0.9930056929588318, "mean_gen_accuracy": 0.8525140136480331, "mean_token_accuracy": 0.8849648237228394, "num_tokens": 667456867.0, "sample_num_tokens": 8938.75, "step": 2463, "total_num_tokens": 667492622.0, "z_loss": 0.0008311823476105928 }, { "copy_logits_max": -0.5687854290008545, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.875, "epoch": 0.5032422772529997, "gen_logits_max": 6.213340759277344, "gen_logits_mean": -13.424150466918945, "gen_logits_min": -25.1229248046875, "gen_logits_std": 2.5374228954315186, "gen_loss": 0.3683255910873413, "grad_norm": 0.47065904474758635, "learning_rate": 2.781726315789474e-05, "loss": 0.3719, "mean_copy_accuracy": 0.9929706901311874, "mean_gen_accuracy": 0.8430699110031128, "mean_token_accuracy": 0.8793460130691528, "num_tokens": 667721918.0, "sample_num_tokens": 8007.0, "step": 2464, "total_num_tokens": 667753946.0, "z_loss": 0.0010951617732644081 }, { "copy_logits_max": -0.7268075346946716, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.5, "epoch": 0.5034465151901966, "gen_logits_max": 6.080605983734131, "gen_logits_mean": -13.908430099487305, "gen_logits_min": -25.593900680541992, "gen_logits_std": 2.552448272705078, "gen_loss": 0.34734809398651123, "grad_norm": 0.51968518690618, "learning_rate": 2.7816000000000003e-05, "loss": 0.315, "mean_copy_accuracy": 0.9937509894371033, "mean_gen_accuracy": 0.8600543737411499, "mean_token_accuracy": 0.8948742002248764, "num_tokens": 668002735.0, "sample_num_tokens": 8795.25, "step": 2465, "total_num_tokens": 668037916.0, "z_loss": 0.0012442402075976133 }, { "copy_logits_max": -3.2880122661590576, "copy_logits_min": -750000064.0, "copy_num_tokens": 377.6875, "epoch": 0.5036507531273934, "gen_logits_max": 6.947237968444824, "gen_logits_mean": -13.202192306518555, "gen_logits_min": -24.724632263183594, "gen_logits_std": 2.529404878616333, "gen_loss": 0.3226149380207062, "grad_norm": 0.5111898200262859, "learning_rate": 2.7814736842105264e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9913457781076431, "mean_gen_accuracy": 0.8697107434272766, "mean_token_accuracy": 0.8970170766115189, "num_tokens": 668279182.0, "sample_num_tokens": 7647.5, "step": 2466, "total_num_tokens": 668309772.0, "z_loss": 0.0011212567333132029 }, { "copy_logits_max": -1.9708077907562256, "copy_logits_min": -687500032.0, "copy_num_tokens": 422.5, "epoch": 0.5038549910645902, "gen_logits_max": 5.785274505615234, "gen_logits_mean": -13.399468421936035, "gen_logits_min": -24.742664337158203, "gen_logits_std": 2.543578863143921, "gen_loss": 0.3130803406238556, "grad_norm": 0.5376148638395206, "learning_rate": 2.7813473684210528e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9906229376792908, "mean_gen_accuracy": 0.8625095635652542, "mean_token_accuracy": 0.8922361135482788, "num_tokens": 668546214.0, "sample_num_tokens": 8105.0, "step": 2467, "total_num_tokens": 668578634.0, "z_loss": 0.001071011065505445 }, { "copy_logits_max": -3.925053119659424, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.625, "epoch": 0.5040592290017871, "gen_logits_max": 5.823909282684326, "gen_logits_mean": -13.58078384399414, "gen_logits_min": -24.971168518066406, "gen_logits_std": 2.548211097717285, "gen_loss": 0.32817041873931885, "grad_norm": 0.4913682334347569, "learning_rate": 2.781221052631579e-05, "loss": 0.3391, "mean_copy_accuracy": 0.9930209517478943, "mean_gen_accuracy": 0.8589407652616501, "mean_token_accuracy": 0.8896873742341995, "num_tokens": 668822897.0, "sample_num_tokens": 9885.25, "step": 2468, "total_num_tokens": 668862438.0, "z_loss": 0.0010277056135237217 }, { "copy_logits_max": -3.455824613571167, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.625, "epoch": 0.5042634669389839, "gen_logits_max": 5.882482051849365, "gen_logits_mean": -13.853919982910156, "gen_logits_min": -24.983537673950195, "gen_logits_std": 2.479797124862671, "gen_loss": 0.3451056480407715, "grad_norm": 0.491827421972933, "learning_rate": 2.7810947368421053e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9923492819070816, "mean_gen_accuracy": 0.8597291111946106, "mean_token_accuracy": 0.8889976739883423, "num_tokens": 669092153.0, "sample_num_tokens": 9030.75, "step": 2469, "total_num_tokens": 669128276.0, "z_loss": 0.0011464017443358898 }, { "copy_logits_max": -2.2275962829589844, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.75, "epoch": 0.5044677048761808, "gen_logits_max": 6.443752288818359, "gen_logits_mean": -11.74736213684082, "gen_logits_min": -23.535175323486328, "gen_logits_std": 2.513218879699707, "gen_loss": 0.3318507671356201, "grad_norm": 0.46547948243383414, "learning_rate": 2.7809684210526314e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9923127144575119, "mean_gen_accuracy": 0.8673792034387589, "mean_token_accuracy": 0.895449236035347, "num_tokens": 669349947.0, "sample_num_tokens": 8707.25, "step": 2470, "total_num_tokens": 669384776.0, "z_loss": 0.001100803492590785 }, { "copy_logits_max": -4.548893928527832, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.0625, "epoch": 0.5046719428133776, "gen_logits_max": 5.4301042556762695, "gen_logits_mean": -12.796897888183594, "gen_logits_min": -24.286888122558594, "gen_logits_std": 2.550527572631836, "gen_loss": 0.3154626488685608, "grad_norm": 0.5322275287065399, "learning_rate": 2.780842105263158e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9919845759868622, "mean_gen_accuracy": 0.8574669659137726, "mean_token_accuracy": 0.891181007027626, "num_tokens": 669615475.0, "sample_num_tokens": 9223.25, "step": 2471, "total_num_tokens": 669652368.0, "z_loss": 0.000907497014850378 }, { "copy_logits_max": -5.569117546081543, "copy_logits_min": -750000000.0, "copy_num_tokens": 280.9375, "epoch": 0.5048761807505744, "gen_logits_max": 5.8720550537109375, "gen_logits_mean": -14.066410064697266, "gen_logits_min": -25.468250274658203, "gen_logits_std": 2.5607612133026123, "gen_loss": 0.2977457642555237, "grad_norm": 0.44236584054763084, "learning_rate": 2.7807157894736843e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9933983087539673, "mean_gen_accuracy": 0.8712626844644547, "mean_token_accuracy": 0.8979824632406235, "num_tokens": 669863893.0, "sample_num_tokens": 6737.25, "step": 2472, "total_num_tokens": 669890842.0, "z_loss": 0.0007855696603655815 }, { "copy_logits_max": -1.968588948249817, "copy_logits_min": -687500032.0, "copy_num_tokens": 652.75, "epoch": 0.5050804186877712, "gen_logits_max": 4.906239986419678, "gen_logits_mean": -13.976624488830566, "gen_logits_min": -25.77065658569336, "gen_logits_std": 2.59446120262146, "gen_loss": 0.31214630603790283, "grad_norm": 0.44097014715853483, "learning_rate": 2.7805894736842107e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9938594102859497, "mean_gen_accuracy": 0.8651453703641891, "mean_token_accuracy": 0.9015840142965317, "num_tokens": 670145227.0, "sample_num_tokens": 9709.25, "step": 2473, "total_num_tokens": 670184064.0, "z_loss": 0.0009312194306403399 }, { "copy_logits_max": -4.511202812194824, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.8125, "epoch": 0.5052846566249681, "gen_logits_max": 5.140240669250488, "gen_logits_mean": -14.258731842041016, "gen_logits_min": -25.54647445678711, "gen_logits_std": 2.5107600688934326, "gen_loss": 0.331369012594223, "grad_norm": 0.4330327860039817, "learning_rate": 2.780463157894737e-05, "loss": 0.3431, "mean_copy_accuracy": 0.9928922355175018, "mean_gen_accuracy": 0.8565927892923355, "mean_token_accuracy": 0.8841248601675034, "num_tokens": 670413230.0, "sample_num_tokens": 7874.5, "step": 2474, "total_num_tokens": 670444728.0, "z_loss": 0.0008638792205601931 }, { "copy_logits_max": -2.5638880729675293, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.4375, "epoch": 0.505488894562165, "gen_logits_max": 5.275078773498535, "gen_logits_mean": -13.483026504516602, "gen_logits_min": -24.97921371459961, "gen_logits_std": 2.55562162399292, "gen_loss": 0.3233523964881897, "grad_norm": 0.4493128602307682, "learning_rate": 2.7803368421052632e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9943642020225525, "mean_gen_accuracy": 0.8640523105859756, "mean_token_accuracy": 0.8964793086051941, "num_tokens": 670679740.0, "sample_num_tokens": 8404.0, "step": 2475, "total_num_tokens": 670713356.0, "z_loss": 0.0008565432508476079 }, { "copy_logits_max": -4.017678737640381, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.625, "epoch": 0.5056931324993618, "gen_logits_max": 6.405041694641113, "gen_logits_mean": -12.663656234741211, "gen_logits_min": -24.327159881591797, "gen_logits_std": 2.587067127227783, "gen_loss": 0.3133838474750519, "grad_norm": 0.46916585272336825, "learning_rate": 2.7802105263157897e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9946721494197845, "mean_gen_accuracy": 0.8663390725851059, "mean_token_accuracy": 0.8975987583398819, "num_tokens": 670953862.0, "sample_num_tokens": 6920.0, "step": 2476, "total_num_tokens": 670981542.0, "z_loss": 0.0008464689599350095 }, { "copy_logits_max": -4.821304798126221, "copy_logits_min": -625000064.0, "copy_num_tokens": 406.4375, "epoch": 0.5058973704365586, "gen_logits_max": 6.1026411056518555, "gen_logits_mean": -13.375975608825684, "gen_logits_min": -24.78704261779785, "gen_logits_std": 2.5224153995513916, "gen_loss": 0.3790796399116516, "grad_norm": 0.46565746147410236, "learning_rate": 2.7800842105263157e-05, "loss": 0.3347, "mean_copy_accuracy": 0.992892250418663, "mean_gen_accuracy": 0.8592970669269562, "mean_token_accuracy": 0.8891354501247406, "num_tokens": 671205435.0, "sample_num_tokens": 8112.25, "step": 2477, "total_num_tokens": 671237884.0, "z_loss": 0.0010077499318867922 }, { "copy_logits_max": -4.705720901489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.3125, "epoch": 0.5061016083737554, "gen_logits_max": 5.755183219909668, "gen_logits_mean": -13.178020477294922, "gen_logits_min": -24.902732849121094, "gen_logits_std": 2.5379488468170166, "gen_loss": 0.3265722990036011, "grad_norm": 0.4705763919199431, "learning_rate": 2.7799578947368422e-05, "loss": 0.33, "mean_copy_accuracy": 0.9937190860509872, "mean_gen_accuracy": 0.855973094701767, "mean_token_accuracy": 0.8933348804712296, "num_tokens": 671488396.0, "sample_num_tokens": 7585.0, "step": 2478, "total_num_tokens": 671518736.0, "z_loss": 0.0009250080911442637 }, { "copy_logits_max": -5.956257343292236, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.9375, "epoch": 0.5063058463109522, "gen_logits_max": 4.965612411499023, "gen_logits_mean": -13.909351348876953, "gen_logits_min": -24.84001922607422, "gen_logits_std": 2.471904754638672, "gen_loss": 0.3076655864715576, "grad_norm": 0.4142319532058643, "learning_rate": 2.7798315789473683e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9940617829561234, "mean_gen_accuracy": 0.8668213784694672, "mean_token_accuracy": 0.8957155346870422, "num_tokens": 671759005.0, "sample_num_tokens": 9092.75, "step": 2479, "total_num_tokens": 671795376.0, "z_loss": 0.0008248151862062514 }, { "copy_logits_max": -5.970587253570557, "copy_logits_min": -687500032.0, "copy_num_tokens": 247.875, "epoch": 0.5065100842481491, "gen_logits_max": 6.700368404388428, "gen_logits_mean": -13.352828979492188, "gen_logits_min": -24.11334991455078, "gen_logits_std": 2.4500980377197266, "gen_loss": 0.3576849102973938, "grad_norm": 0.4222669989176643, "learning_rate": 2.7797052631578947e-05, "loss": 0.3374, "mean_copy_accuracy": 0.9931887835264206, "mean_gen_accuracy": 0.8601909279823303, "mean_token_accuracy": 0.8891152441501617, "num_tokens": 672052580.0, "sample_num_tokens": 6654.5, "step": 2480, "total_num_tokens": 672079198.0, "z_loss": 0.0008766407263465226 }, { "copy_logits_max": -3.0833230018615723, "copy_logits_min": -750000064.0, "copy_num_tokens": 603.5, "epoch": 0.506714322185346, "gen_logits_max": 5.053952693939209, "gen_logits_mean": -13.319086074829102, "gen_logits_min": -25.41948127746582, "gen_logits_std": 2.617231845855713, "gen_loss": 0.3030962646007538, "grad_norm": 0.4763437662843272, "learning_rate": 2.779578947368421e-05, "loss": 0.3364, "mean_copy_accuracy": 0.9938073754310608, "mean_gen_accuracy": 0.8567693084478378, "mean_token_accuracy": 0.8911695778369904, "num_tokens": 672332151.0, "sample_num_tokens": 9286.25, "step": 2481, "total_num_tokens": 672369296.0, "z_loss": 0.0007788506336510181 }, { "copy_logits_max": -6.283076763153076, "copy_logits_min": -750000064.0, "copy_num_tokens": 551.4375, "epoch": 0.5069185601225428, "gen_logits_max": 5.898537635803223, "gen_logits_mean": -13.607810020446777, "gen_logits_min": -24.97350311279297, "gen_logits_std": 2.5068721771240234, "gen_loss": 0.3116316795349121, "grad_norm": 0.4166805051831945, "learning_rate": 2.7794526315789476e-05, "loss": 0.3287, "mean_copy_accuracy": 0.9934035539627075, "mean_gen_accuracy": 0.8591022044420242, "mean_token_accuracy": 0.8898665010929108, "num_tokens": 672598753.0, "sample_num_tokens": 9983.25, "step": 2482, "total_num_tokens": 672638686.0, "z_loss": 0.0007414636202156544 }, { "copy_logits_max": -5.1649861335754395, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.125, "epoch": 0.5071227980597396, "gen_logits_max": 5.448962688446045, "gen_logits_mean": -13.197179794311523, "gen_logits_min": -24.663803100585938, "gen_logits_std": 2.5453317165374756, "gen_loss": 0.30806416273117065, "grad_norm": 0.4563630657052077, "learning_rate": 2.7793263157894737e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9932795017957687, "mean_gen_accuracy": 0.8685125261545181, "mean_token_accuracy": 0.9001532942056656, "num_tokens": 672887446.0, "sample_num_tokens": 8248.0, "step": 2483, "total_num_tokens": 672920438.0, "z_loss": 0.0008391766459681094 }, { "copy_logits_max": -3.838096857070923, "copy_logits_min": -750000000.0, "copy_num_tokens": 633.5625, "epoch": 0.5073270359969364, "gen_logits_max": 6.1505632400512695, "gen_logits_mean": -12.631183624267578, "gen_logits_min": -24.40005874633789, "gen_logits_std": 2.550386667251587, "gen_loss": 0.33318382501602173, "grad_norm": 0.4727464107548549, "learning_rate": 2.7792e-05, "loss": 0.3422, "mean_copy_accuracy": 0.9928935170173645, "mean_gen_accuracy": 0.8529959470033646, "mean_token_accuracy": 0.8871632665395737, "num_tokens": 673143946.0, "sample_num_tokens": 9367.0, "step": 2484, "total_num_tokens": 673181414.0, "z_loss": 0.0008319609332829714 }, { "copy_logits_max": -3.391490936279297, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.875, "epoch": 0.5075312739341332, "gen_logits_max": 5.916771411895752, "gen_logits_mean": -13.037071228027344, "gen_logits_min": -24.419769287109375, "gen_logits_std": 2.4742379188537598, "gen_loss": 0.34768927097320557, "grad_norm": 0.4592476254146224, "learning_rate": 2.7790736842105265e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9931605756282806, "mean_gen_accuracy": 0.8601832240819931, "mean_token_accuracy": 0.8974495530128479, "num_tokens": 673442612.0, "sample_num_tokens": 8155.0, "step": 2485, "total_num_tokens": 673475232.0, "z_loss": 0.000902000698260963 }, { "copy_logits_max": -6.147587776184082, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.75, "epoch": 0.5077355118713301, "gen_logits_max": 5.74923038482666, "gen_logits_mean": -14.30024528503418, "gen_logits_min": -25.00048065185547, "gen_logits_std": 2.4498658180236816, "gen_loss": 0.3795332908630371, "grad_norm": 0.46564285311882364, "learning_rate": 2.7789473684210526e-05, "loss": 0.3449, "mean_copy_accuracy": 0.9941854178905487, "mean_gen_accuracy": 0.8542175143957138, "mean_token_accuracy": 0.8872897326946259, "num_tokens": 673715861.0, "sample_num_tokens": 9364.25, "step": 2486, "total_num_tokens": 673753318.0, "z_loss": 0.0009319078526459634 }, { "copy_logits_max": -3.548814296722412, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5, "epoch": 0.507939749808527, "gen_logits_max": 7.075430393218994, "gen_logits_mean": -12.750425338745117, "gen_logits_min": -24.227636337280273, "gen_logits_std": 2.491875648498535, "gen_loss": 0.38746169209480286, "grad_norm": 0.5614121545827133, "learning_rate": 2.778821052631579e-05, "loss": 0.3337, "mean_copy_accuracy": 0.9936744868755341, "mean_gen_accuracy": 0.8573140799999237, "mean_token_accuracy": 0.8912385702133179, "num_tokens": 673977428.0, "sample_num_tokens": 7837.5, "step": 2487, "total_num_tokens": 674008778.0, "z_loss": 0.0009138505556620657 }, { "copy_logits_max": -5.438510894775391, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.75, "epoch": 0.5081439877457238, "gen_logits_max": 5.058572769165039, "gen_logits_mean": -14.346345901489258, "gen_logits_min": -25.726604461669922, "gen_logits_std": 2.493884563446045, "gen_loss": 0.3236331045627594, "grad_norm": 0.4145460269817261, "learning_rate": 2.778694736842105e-05, "loss": 0.309, "mean_copy_accuracy": 0.9931771606206894, "mean_gen_accuracy": 0.8716046214103699, "mean_token_accuracy": 0.8981068283319473, "num_tokens": 674239723.0, "sample_num_tokens": 8679.25, "step": 2488, "total_num_tokens": 674274440.0, "z_loss": 0.0008168019121512771 }, { "copy_logits_max": -4.699368476867676, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.9375, "epoch": 0.5083482256829206, "gen_logits_max": 5.071403503417969, "gen_logits_mean": -13.442644119262695, "gen_logits_min": -25.471887588500977, "gen_logits_std": 2.626608371734619, "gen_loss": 0.3275725245475769, "grad_norm": 0.5588408910219271, "learning_rate": 2.778568421052632e-05, "loss": 0.3231, "mean_copy_accuracy": 0.9907765686511993, "mean_gen_accuracy": 0.8633725494146347, "mean_token_accuracy": 0.8925909548997879, "num_tokens": 674504046.0, "sample_num_tokens": 8210.5, "step": 2489, "total_num_tokens": 674536888.0, "z_loss": 0.0007052203873172402 }, { "copy_logits_max": -6.379781246185303, "copy_logits_min": -750000064.0, "copy_num_tokens": 447.0625, "epoch": 0.5085524636201174, "gen_logits_max": 6.636501312255859, "gen_logits_mean": -13.35336971282959, "gen_logits_min": -25.48067283630371, "gen_logits_std": 2.5445592403411865, "gen_loss": 0.3044568598270416, "grad_norm": 0.4831789202174549, "learning_rate": 2.778442105263158e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9945815950632095, "mean_gen_accuracy": 0.8657981008291245, "mean_token_accuracy": 0.897039070725441, "num_tokens": 674780848.0, "sample_num_tokens": 9238.0, "step": 2490, "total_num_tokens": 674817800.0, "z_loss": 0.0007085842080414295 }, { "copy_logits_max": -2.2455878257751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 668.625, "epoch": 0.5087567015573142, "gen_logits_max": 5.111909866333008, "gen_logits_mean": -12.261883735656738, "gen_logits_min": -24.608867645263672, "gen_logits_std": 2.552863597869873, "gen_loss": 0.3058338463306427, "grad_norm": 0.4377135896331331, "learning_rate": 2.7783157894736844e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9950372576713562, "mean_gen_accuracy": 0.8552278578281403, "mean_token_accuracy": 0.8931590169668198, "num_tokens": 675065978.0, "sample_num_tokens": 9544.0, "step": 2491, "total_num_tokens": 675104154.0, "z_loss": 0.0008978747064247727 }, { "copy_logits_max": -1.429467797279358, "copy_logits_min": -625000064.0, "copy_num_tokens": 641.5625, "epoch": 0.5089609394945112, "gen_logits_max": 4.515103816986084, "gen_logits_mean": -14.213666915893555, "gen_logits_min": -26.29429817199707, "gen_logits_std": 2.5792248249053955, "gen_loss": 0.3149493336677551, "grad_norm": 0.4809137340066028, "learning_rate": 2.7781894736842105e-05, "loss": 0.313, "mean_copy_accuracy": 0.9933894723653793, "mean_gen_accuracy": 0.8662062436342239, "mean_token_accuracy": 0.8962951749563217, "num_tokens": 675349740.0, "sample_num_tokens": 9767.0, "step": 2492, "total_num_tokens": 675388808.0, "z_loss": 0.0009015330579131842 }, { "copy_logits_max": -4.747363567352295, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.0625, "epoch": 0.509165177431708, "gen_logits_max": 6.535300254821777, "gen_logits_mean": -13.45453929901123, "gen_logits_min": -24.693675994873047, "gen_logits_std": 2.5157313346862793, "gen_loss": 0.33005058765411377, "grad_norm": 0.4508390512944985, "learning_rate": 2.778063157894737e-05, "loss": 0.3203, "mean_copy_accuracy": 0.9927018731832504, "mean_gen_accuracy": 0.8683135360479355, "mean_token_accuracy": 0.8951500207185745, "num_tokens": 675612143.0, "sample_num_tokens": 7201.75, "step": 2493, "total_num_tokens": 675640950.0, "z_loss": 0.0009164660586975515 }, { "copy_logits_max": -5.998177528381348, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.625, "epoch": 0.5093694153689048, "gen_logits_max": 5.920507431030273, "gen_logits_mean": -13.907844543457031, "gen_logits_min": -25.03085708618164, "gen_logits_std": 2.4979453086853027, "gen_loss": 0.27094390988349915, "grad_norm": 0.489275131715529, "learning_rate": 2.777936842105263e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9917494803667068, "mean_gen_accuracy": 0.866627886891365, "mean_token_accuracy": 0.8988598585128784, "num_tokens": 675882676.0, "sample_num_tokens": 7762.5, "step": 2494, "total_num_tokens": 675913726.0, "z_loss": 0.0007205589790828526 }, { "copy_logits_max": -3.1305956840515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.0, "epoch": 0.5095736533061016, "gen_logits_max": 5.291938304901123, "gen_logits_mean": -12.56326961517334, "gen_logits_min": -24.74654197692871, "gen_logits_std": 2.601886034011841, "gen_loss": 0.30252283811569214, "grad_norm": 0.4908687449030752, "learning_rate": 2.7778105263157895e-05, "loss": 0.3371, "mean_copy_accuracy": 0.9932451993227005, "mean_gen_accuracy": 0.8565904051065445, "mean_token_accuracy": 0.8890355676412582, "num_tokens": 676142566.0, "sample_num_tokens": 8910.5, "step": 2495, "total_num_tokens": 676178208.0, "z_loss": 0.0008796264883130789 }, { "copy_logits_max": -2.5599427223205566, "copy_logits_min": -750000064.0, "copy_num_tokens": 431.1875, "epoch": 0.5097778912432984, "gen_logits_max": 4.856993198394775, "gen_logits_mean": -13.82529067993164, "gen_logits_min": -24.887460708618164, "gen_logits_std": 2.48979115486145, "gen_loss": 0.3242611885070801, "grad_norm": 0.46813280769718785, "learning_rate": 2.7776842105263156e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9928414076566696, "mean_gen_accuracy": 0.8610367178916931, "mean_token_accuracy": 0.8916146010160446, "num_tokens": 676411669.0, "sample_num_tokens": 8576.75, "step": 2496, "total_num_tokens": 676445976.0, "z_loss": 0.0009019451681524515 }, { "copy_logits_max": -2.610811233520508, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.5, "epoch": 0.5099821291804952, "gen_logits_max": 7.290064811706543, "gen_logits_mean": -10.984415054321289, "gen_logits_min": -22.994140625, "gen_logits_std": 2.6448588371276855, "gen_loss": 0.42481619119644165, "grad_norm": 0.5090662623395648, "learning_rate": 2.7775578947368423e-05, "loss": 0.3595, "mean_copy_accuracy": 0.9936930686235428, "mean_gen_accuracy": 0.8500122576951981, "mean_token_accuracy": 0.8817417174577713, "num_tokens": 676674287.0, "sample_num_tokens": 8099.75, "step": 2497, "total_num_tokens": 676706686.0, "z_loss": 0.001141228131018579 }, { "copy_logits_max": -4.309294700622559, "copy_logits_min": -687500032.0, "copy_num_tokens": 633.8125, "epoch": 0.5101863671176922, "gen_logits_max": 5.741805076599121, "gen_logits_mean": -12.150003433227539, "gen_logits_min": -23.740463256835938, "gen_logits_std": 2.5176138877868652, "gen_loss": 0.30776068568229675, "grad_norm": 0.4153370354927443, "learning_rate": 2.7774315789473688e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9938530325889587, "mean_gen_accuracy": 0.8609377443790436, "mean_token_accuracy": 0.8902336210012436, "num_tokens": 676958197.0, "sample_num_tokens": 10614.75, "step": 2498, "total_num_tokens": 677000656.0, "z_loss": 0.0008540734997950494 }, { "copy_logits_max": -2.439847469329834, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.4375, "epoch": 0.510390605054889, "gen_logits_max": 4.938221454620361, "gen_logits_mean": -14.538516998291016, "gen_logits_min": -26.16962432861328, "gen_logits_std": 2.5231738090515137, "gen_loss": 0.3002655506134033, "grad_norm": 0.5028371598515217, "learning_rate": 2.777305263157895e-05, "loss": 0.3116, "mean_copy_accuracy": 0.9932949244976044, "mean_gen_accuracy": 0.8632043153047562, "mean_token_accuracy": 0.8959959745407104, "num_tokens": 677251447.0, "sample_num_tokens": 9255.25, "step": 2499, "total_num_tokens": 677288468.0, "z_loss": 0.0008081487030722201 }, { "epoch": 0.5105948429920858, "grad_norm": 0.5310942301375494, "learning_rate": 2.7771789473684213e-05, "loss": 0.3227, "step": 2500 }, { "epoch": 0.5105948429920858, "eval_copy_logits_max": -6.433323383331299, "eval_copy_logits_min": -72.87133026123047, "eval_gen_logits_max": 5.138840675354004, "eval_gen_logits_mean": -17.313993453979492, "eval_gen_logits_min": -28.017078399658203, "eval_gen_logits_std": 2.4692797660827637, "eval_gen_loss": 0.37091004848480225, "eval_loss": 0.33870798349380493, "eval_mean_copy_accuracy": 0.9894402921199799, "eval_mean_gen_accuracy": 0.8708377778530121, "eval_mean_token_accuracy": 0.8861902058124542, "eval_num_tokens": 677562208.0, "eval_runtime": 0.7642, "eval_samples_per_second": 10.468, "eval_steps_per_second": 2.617, "eval_total_num_tokens": 677562208.0, "eval_z_loss": 0.0007600006065331399, "step": 2500 }, { "copy_logits_max": -3.4804725646972656, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.1875, "epoch": 0.5107990809292826, "gen_logits_max": 4.100648880004883, "gen_logits_mean": -14.600394248962402, "gen_logits_min": -26.477252960205078, "gen_logits_std": 2.514507293701172, "gen_loss": 0.26323646306991577, "grad_norm": 0.43773556681337816, "learning_rate": 2.7770526315789474e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9922624677419662, "mean_gen_accuracy": 0.8597350567579269, "mean_token_accuracy": 0.8930867314338684, "num_tokens": 677808142.0, "sample_num_tokens": 8588.0, "step": 2501, "total_num_tokens": 677842494.0, "z_loss": 0.0006997886230237782 }, { "copy_logits_max": -4.393248081207275, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.125, "epoch": 0.5110033188664794, "gen_logits_max": 5.4386820793151855, "gen_logits_mean": -13.832908630371094, "gen_logits_min": -25.25265121459961, "gen_logits_std": 2.5165019035339355, "gen_loss": 0.31906211376190186, "grad_norm": 0.42729279857238084, "learning_rate": 2.7769263157894738e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9939824342727661, "mean_gen_accuracy": 0.8628138899803162, "mean_token_accuracy": 0.8976670354604721, "num_tokens": 678087598.0, "sample_num_tokens": 9208.0, "step": 2502, "total_num_tokens": 678124430.0, "z_loss": 0.0007656774250790477 }, { "copy_logits_max": -1.5387874841690063, "copy_logits_min": -562500096.0, "copy_num_tokens": 587.3125, "epoch": 0.5112075568036762, "gen_logits_max": 4.576723098754883, "gen_logits_mean": -13.346529006958008, "gen_logits_min": -25.258819580078125, "gen_logits_std": 2.5455126762390137, "gen_loss": 0.30882489681243896, "grad_norm": 0.5529672071043376, "learning_rate": 2.7768e-05, "loss": 0.3061, "mean_copy_accuracy": 0.99370276927948, "mean_gen_accuracy": 0.8618006259202957, "mean_token_accuracy": 0.8995730876922607, "num_tokens": 678396380.0, "sample_num_tokens": 8589.5, "step": 2503, "total_num_tokens": 678430738.0, "z_loss": 0.0007973741739988327 }, { "copy_logits_max": -5.054361820220947, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.3125, "epoch": 0.5114117947408732, "gen_logits_max": 4.9775590896606445, "gen_logits_mean": -13.889558792114258, "gen_logits_min": -25.553604125976562, "gen_logits_std": 2.5274546146392822, "gen_loss": 0.2923799753189087, "grad_norm": 0.47997027753318905, "learning_rate": 2.7766736842105263e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9958750605583191, "mean_gen_accuracy": 0.8619853109121323, "mean_token_accuracy": 0.9016018807888031, "num_tokens": 678695225.0, "sample_num_tokens": 10178.25, "step": 2504, "total_num_tokens": 678735938.0, "z_loss": 0.0007761328597553074 }, { "copy_logits_max": -6.5598955154418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 271.375, "epoch": 0.51161603267807, "gen_logits_max": 6.539560794830322, "gen_logits_mean": -12.600366592407227, "gen_logits_min": -23.735158920288086, "gen_logits_std": 2.477200746536255, "gen_loss": 0.40597623586654663, "grad_norm": 0.5044394937841212, "learning_rate": 2.7765473684210528e-05, "loss": 0.3477, "mean_copy_accuracy": 0.9912789016962051, "mean_gen_accuracy": 0.8551261276006699, "mean_token_accuracy": 0.8851077109575272, "num_tokens": 678973641.0, "sample_num_tokens": 8404.25, "step": 2505, "total_num_tokens": 679007258.0, "z_loss": 0.000909975846298039 }, { "copy_logits_max": -6.930161476135254, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.5, "epoch": 0.5118202706152668, "gen_logits_max": 5.691185474395752, "gen_logits_mean": -13.394857406616211, "gen_logits_min": -25.04743003845215, "gen_logits_std": 2.5371599197387695, "gen_loss": 0.303717702627182, "grad_norm": 0.9155567889968853, "learning_rate": 2.7764210526315792e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9896081835031509, "mean_gen_accuracy": 0.8587048649787903, "mean_token_accuracy": 0.8937946259975433, "num_tokens": 679251202.0, "sample_num_tokens": 8465.0, "step": 2506, "total_num_tokens": 679285062.0, "z_loss": 0.0007431592093780637 }, { "copy_logits_max": -0.49978142976760864, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.5, "epoch": 0.5120245085524636, "gen_logits_max": 7.2878923416137695, "gen_logits_mean": -11.686901092529297, "gen_logits_min": -23.883323669433594, "gen_logits_std": 2.612501621246338, "gen_loss": 0.3321875333786011, "grad_norm": 0.536502487132663, "learning_rate": 2.7762947368421053e-05, "loss": 0.356, "mean_copy_accuracy": 0.9921578466892242, "mean_gen_accuracy": 0.8545346707105637, "mean_token_accuracy": 0.883808821439743, "num_tokens": 679501535.0, "sample_num_tokens": 8072.25, "step": 2507, "total_num_tokens": 679533824.0, "z_loss": 0.0011373483575880527 }, { "copy_logits_max": 2.075944423675537, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.25, "epoch": 0.5122287464896604, "gen_logits_max": 5.649029731750488, "gen_logits_mean": -13.27894401550293, "gen_logits_min": -25.30962371826172, "gen_logits_std": 2.5222506523132324, "gen_loss": 0.2981639802455902, "grad_norm": 0.6037425623664306, "learning_rate": 2.7761684210526317e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9933587312698364, "mean_gen_accuracy": 0.869351863861084, "mean_token_accuracy": 0.9030988365411758, "num_tokens": 679795060.0, "sample_num_tokens": 8700.0, "step": 2508, "total_num_tokens": 679829860.0, "z_loss": 0.0017470611492171884 }, { "copy_logits_max": 5.405869483947754, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.5, "epoch": 0.5124329844268573, "gen_logits_max": 6.051063537597656, "gen_logits_mean": -11.724214553833008, "gen_logits_min": -23.404279708862305, "gen_logits_std": 2.50858998298645, "gen_loss": 0.3802342712879181, "grad_norm": 0.4755449958097535, "learning_rate": 2.7760421052631578e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9942077845335007, "mean_gen_accuracy": 0.8595169931650162, "mean_token_accuracy": 0.898729532957077, "num_tokens": 680065354.0, "sample_num_tokens": 8636.5, "step": 2509, "total_num_tokens": 680099900.0, "z_loss": 0.0021390998736023903 }, { "copy_logits_max": 3.534461259841919, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.8125, "epoch": 0.5126372223640541, "gen_logits_max": 5.730933666229248, "gen_logits_mean": -12.990558624267578, "gen_logits_min": -25.011402130126953, "gen_logits_std": 2.55997371673584, "gen_loss": 0.29537028074264526, "grad_norm": 0.4201363166122918, "learning_rate": 2.7759157894736842e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9920647740364075, "mean_gen_accuracy": 0.872501477599144, "mean_token_accuracy": 0.9053269922733307, "num_tokens": 680380310.0, "sample_num_tokens": 8565.5, "step": 2510, "total_num_tokens": 680414572.0, "z_loss": 0.0016344661125913262 }, { "copy_logits_max": 2.433642625808716, "copy_logits_min": -625000064.0, "copy_num_tokens": 551.5, "epoch": 0.512841460301251, "gen_logits_max": 4.771284103393555, "gen_logits_mean": -13.375459671020508, "gen_logits_min": -24.69668197631836, "gen_logits_std": 2.481266498565674, "gen_loss": 0.3466508090496063, "grad_norm": 0.5794448638250772, "learning_rate": 2.7757894736842103e-05, "loss": 0.3619, "mean_copy_accuracy": 0.9860590994358063, "mean_gen_accuracy": 0.8515074253082275, "mean_token_accuracy": 0.8815819621086121, "num_tokens": 680634477.0, "sample_num_tokens": 8837.25, "step": 2511, "total_num_tokens": 680669826.0, "z_loss": 0.001452410127967596 }, { "copy_logits_max": -0.10159637033939362, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.3125, "epoch": 0.5130456982384478, "gen_logits_max": 4.886600494384766, "gen_logits_mean": -13.993330001831055, "gen_logits_min": -25.544078826904297, "gen_logits_std": 2.464838743209839, "gen_loss": 0.30970698595046997, "grad_norm": 0.4247399580900409, "learning_rate": 2.7756631578947368e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9941760003566742, "mean_gen_accuracy": 0.8611855059862137, "mean_token_accuracy": 0.8949203789234161, "num_tokens": 680918142.0, "sample_num_tokens": 8399.0, "step": 2512, "total_num_tokens": 680951738.0, "z_loss": 0.0011177954729646444 }, { "copy_logits_max": 0.5951973795890808, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.3125, "epoch": 0.5132499361756446, "gen_logits_max": 4.6575775146484375, "gen_logits_mean": -14.54720687866211, "gen_logits_min": -25.911495208740234, "gen_logits_std": 2.5054211616516113, "gen_loss": 0.3511046767234802, "grad_norm": 0.4874905014504703, "learning_rate": 2.7755368421052632e-05, "loss": 0.336, "mean_copy_accuracy": 0.9924227595329285, "mean_gen_accuracy": 0.8572443276643753, "mean_token_accuracy": 0.8900354504585266, "num_tokens": 681183181.0, "sample_num_tokens": 7564.75, "step": 2513, "total_num_tokens": 681213440.0, "z_loss": 0.0011569196358323097 }, { "copy_logits_max": -1.5602134466171265, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.3125, "epoch": 0.5134541741128414, "gen_logits_max": 4.896407127380371, "gen_logits_mean": -13.082107543945312, "gen_logits_min": -24.262557983398438, "gen_logits_std": 2.4571917057037354, "gen_loss": 0.3655681014060974, "grad_norm": 0.4925906969514176, "learning_rate": 2.7754105263157896e-05, "loss": 0.3369, "mean_copy_accuracy": 0.9935568571090698, "mean_gen_accuracy": 0.8564704209566116, "mean_token_accuracy": 0.8874474167823792, "num_tokens": 681440984.0, "sample_num_tokens": 8017.0, "step": 2514, "total_num_tokens": 681473052.0, "z_loss": 0.000992890796624124 }, { "copy_logits_max": -2.9319608211517334, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.875, "epoch": 0.5136584120500383, "gen_logits_max": 4.520427703857422, "gen_logits_mean": -14.272823333740234, "gen_logits_min": -25.064138412475586, "gen_logits_std": 2.4110352993011475, "gen_loss": 0.34420907497406006, "grad_norm": 0.4674872969865514, "learning_rate": 2.775284210526316e-05, "loss": 0.322, "mean_copy_accuracy": 0.993865355849266, "mean_gen_accuracy": 0.8621678650379181, "mean_token_accuracy": 0.8928350955247879, "num_tokens": 681703798.0, "sample_num_tokens": 8484.5, "step": 2515, "total_num_tokens": 681737736.0, "z_loss": 0.0008481342811137438 }, { "copy_logits_max": -2.5325100421905518, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.75, "epoch": 0.5138626499872351, "gen_logits_max": 5.562850475311279, "gen_logits_mean": -13.371685028076172, "gen_logits_min": -24.51150894165039, "gen_logits_std": 2.5049333572387695, "gen_loss": 0.3653683364391327, "grad_norm": 0.5139776022526765, "learning_rate": 2.775157894736842e-05, "loss": 0.3312, "mean_copy_accuracy": 0.991028219461441, "mean_gen_accuracy": 0.8617916256189346, "mean_token_accuracy": 0.8908382058143616, "num_tokens": 681955074.0, "sample_num_tokens": 6573.5, "step": 2516, "total_num_tokens": 681981368.0, "z_loss": 0.0007557300850749016 }, { "copy_logits_max": -1.2797120809555054, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.9375, "epoch": 0.514066887924432, "gen_logits_max": 4.349729537963867, "gen_logits_mean": -13.513273239135742, "gen_logits_min": -24.973251342773438, "gen_logits_std": 2.4605233669281006, "gen_loss": 0.357688844203949, "grad_norm": 0.5970945904654971, "learning_rate": 2.7750315789473686e-05, "loss": 0.3468, "mean_copy_accuracy": 0.99066461622715, "mean_gen_accuracy": 0.8528126180171967, "mean_token_accuracy": 0.8855386972427368, "num_tokens": 682206238.0, "sample_num_tokens": 7808.5, "step": 2517, "total_num_tokens": 682237472.0, "z_loss": 0.0008165898616425693 }, { "copy_logits_max": -0.6575101613998413, "copy_logits_min": -687500032.0, "copy_num_tokens": 485.3125, "epoch": 0.5142711258616288, "gen_logits_max": 4.655025959014893, "gen_logits_mean": -13.58188247680664, "gen_logits_min": -25.58362579345703, "gen_logits_std": 2.5356991291046143, "gen_loss": 0.3346821367740631, "grad_norm": 0.4127766089322957, "learning_rate": 2.7749052631578947e-05, "loss": 0.2999, "mean_copy_accuracy": 0.993778795003891, "mean_gen_accuracy": 0.8658969700336456, "mean_token_accuracy": 0.9004712551832199, "num_tokens": 682483251.0, "sample_num_tokens": 8137.25, "step": 2518, "total_num_tokens": 682515800.0, "z_loss": 0.0009030879591591656 }, { "copy_logits_max": -3.401419162750244, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.0625, "epoch": 0.5144753637988256, "gen_logits_max": 5.065838813781738, "gen_logits_mean": -13.195703506469727, "gen_logits_min": -24.68292236328125, "gen_logits_std": 2.4422316551208496, "gen_loss": 0.32159876823425293, "grad_norm": 0.5191326492908506, "learning_rate": 2.774778947368421e-05, "loss": 0.3061, "mean_copy_accuracy": 0.9946204870939255, "mean_gen_accuracy": 0.8668600469827652, "mean_token_accuracy": 0.8994239568710327, "num_tokens": 682765957.0, "sample_num_tokens": 7446.25, "step": 2519, "total_num_tokens": 682795742.0, "z_loss": 0.0009536425350233912 }, { "copy_logits_max": -2.5265328884124756, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.375, "epoch": 0.5146796017360225, "gen_logits_max": 4.870781898498535, "gen_logits_mean": -13.58502197265625, "gen_logits_min": -24.79727554321289, "gen_logits_std": 2.464388370513916, "gen_loss": 0.3546609878540039, "grad_norm": 0.47079360840024437, "learning_rate": 2.7746526315789472e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9925500899553299, "mean_gen_accuracy": 0.8602083772420883, "mean_token_accuracy": 0.8892567306756973, "num_tokens": 683012672.0, "sample_num_tokens": 8058.0, "step": 2520, "total_num_tokens": 683044904.0, "z_loss": 0.0009532416006550193 }, { "copy_logits_max": -3.68471097946167, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.75, "epoch": 0.5148838396732193, "gen_logits_max": 5.600433349609375, "gen_logits_mean": -12.970592498779297, "gen_logits_min": -23.944774627685547, "gen_logits_std": 2.4322896003723145, "gen_loss": 0.2759801745414734, "grad_norm": 0.42507145064515295, "learning_rate": 2.7745263157894736e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9926174581050873, "mean_gen_accuracy": 0.8644134104251862, "mean_token_accuracy": 0.8977879285812378, "num_tokens": 683285312.0, "sample_num_tokens": 8201.5, "step": 2521, "total_num_tokens": 683318118.0, "z_loss": 0.0008151549845933914 }, { "copy_logits_max": -4.387611389160156, "copy_logits_min": -750000000.0, "copy_num_tokens": 683.8125, "epoch": 0.5150880776104161, "gen_logits_max": 4.397541522979736, "gen_logits_mean": -12.28931999206543, "gen_logits_min": -23.499588012695312, "gen_logits_std": 2.403078556060791, "gen_loss": 0.29642730951309204, "grad_norm": 0.48434420711912113, "learning_rate": 2.7744e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9918512105941772, "mean_gen_accuracy": 0.8628108650445938, "mean_token_accuracy": 0.891689196228981, "num_tokens": 683537548.0, "sample_num_tokens": 10756.5, "step": 2522, "total_num_tokens": 683580574.0, "z_loss": 0.0007900676573626697 }, { "copy_logits_max": -1.4778220653533936, "copy_logits_min": -750000128.0, "copy_num_tokens": 639.6875, "epoch": 0.515292315547613, "gen_logits_max": 4.654412746429443, "gen_logits_mean": -13.617721557617188, "gen_logits_min": -25.394832611083984, "gen_logits_std": 2.5422656536102295, "gen_loss": 0.3167131543159485, "grad_norm": 0.46671998551327576, "learning_rate": 2.7742736842105265e-05, "loss": 0.3351, "mean_copy_accuracy": 0.9924834817647934, "mean_gen_accuracy": 0.8574192374944687, "mean_token_accuracy": 0.8914918154478073, "num_tokens": 683819719.0, "sample_num_tokens": 9787.75, "step": 2523, "total_num_tokens": 683858870.0, "z_loss": 0.0008648154907859862 }, { "copy_logits_max": -2.906230926513672, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.5625, "epoch": 0.5154965534848098, "gen_logits_max": 5.070687770843506, "gen_logits_mean": -13.138525009155273, "gen_logits_min": -24.82964324951172, "gen_logits_std": 2.447154998779297, "gen_loss": 0.34432747960090637, "grad_norm": 0.4950373605161763, "learning_rate": 2.7741473684210526e-05, "loss": 0.3464, "mean_copy_accuracy": 0.9926861971616745, "mean_gen_accuracy": 0.8494895547628403, "mean_token_accuracy": 0.8868519514799118, "num_tokens": 684084051.0, "sample_num_tokens": 6848.25, "step": 2524, "total_num_tokens": 684111444.0, "z_loss": 0.000941705540753901 }, { "copy_logits_max": -2.4430062770843506, "copy_logits_min": -687500032.0, "copy_num_tokens": 569.9375, "epoch": 0.5157007914220066, "gen_logits_max": 4.889003753662109, "gen_logits_mean": -13.583456039428711, "gen_logits_min": -25.103343963623047, "gen_logits_std": 2.47403883934021, "gen_loss": 0.3176168203353882, "grad_norm": 0.4700885652406548, "learning_rate": 2.774021052631579e-05, "loss": 0.324, "mean_copy_accuracy": 0.9915018677711487, "mean_gen_accuracy": 0.86219522356987, "mean_token_accuracy": 0.8917715847492218, "num_tokens": 684344954.0, "sample_num_tokens": 9353.5, "step": 2525, "total_num_tokens": 684382368.0, "z_loss": 0.000880181381944567 }, { "copy_logits_max": -3.9930574893951416, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.1875, "epoch": 0.5159050293592035, "gen_logits_max": 4.46766996383667, "gen_logits_mean": -14.180257797241211, "gen_logits_min": -25.719579696655273, "gen_logits_std": 2.4920437335968018, "gen_loss": 0.30336230993270874, "grad_norm": 0.43759906624596145, "learning_rate": 2.7738947368421055e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9938158541917801, "mean_gen_accuracy": 0.8672340363264084, "mean_token_accuracy": 0.8976913094520569, "num_tokens": 684620118.0, "sample_num_tokens": 8791.5, "step": 2526, "total_num_tokens": 684655284.0, "z_loss": 0.0008353762677870691 }, { "copy_logits_max": -2.9034695625305176, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5625, "epoch": 0.5161092672964003, "gen_logits_max": 4.74968147277832, "gen_logits_mean": -12.743799209594727, "gen_logits_min": -24.29083824157715, "gen_logits_std": 2.4288177490234375, "gen_loss": 0.3281487226486206, "grad_norm": 0.4585842449507953, "learning_rate": 2.7737684210526315e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9939704686403275, "mean_gen_accuracy": 0.8642264902591705, "mean_token_accuracy": 0.8991122841835022, "num_tokens": 684902125.0, "sample_num_tokens": 8463.75, "step": 2527, "total_num_tokens": 684935980.0, "z_loss": 0.0008333021542057395 }, { "copy_logits_max": -3.4230432510375977, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.5625, "epoch": 0.5163135052335971, "gen_logits_max": 5.642061710357666, "gen_logits_mean": -12.21908187866211, "gen_logits_min": -23.922367095947266, "gen_logits_std": 2.4775145053863525, "gen_loss": 0.329865962266922, "grad_norm": 0.4933308424292736, "learning_rate": 2.773642105263158e-05, "loss": 0.3364, "mean_copy_accuracy": 0.9923542737960815, "mean_gen_accuracy": 0.8524572104215622, "mean_token_accuracy": 0.8887182623147964, "num_tokens": 685176489.0, "sample_num_tokens": 9800.75, "step": 2528, "total_num_tokens": 685215692.0, "z_loss": 0.0008101642597466707 }, { "copy_logits_max": -2.3339147567749023, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.4375, "epoch": 0.516517743170794, "gen_logits_max": 5.313662528991699, "gen_logits_mean": -13.857232093811035, "gen_logits_min": -25.883195877075195, "gen_logits_std": 2.533052444458008, "gen_loss": 0.3384735584259033, "grad_norm": 0.43422170517933145, "learning_rate": 2.773515789473684e-05, "loss": 0.3089, "mean_copy_accuracy": 0.992756724357605, "mean_gen_accuracy": 0.8648847192525864, "mean_token_accuracy": 0.8963306099176407, "num_tokens": 685444580.0, "sample_num_tokens": 8737.5, "step": 2529, "total_num_tokens": 685479530.0, "z_loss": 0.0009025918552652001 }, { "copy_logits_max": -2.8463661670684814, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.0, "epoch": 0.5167219811079908, "gen_logits_max": 5.71177864074707, "gen_logits_mean": -12.756144523620605, "gen_logits_min": -24.406295776367188, "gen_logits_std": 2.439565658569336, "gen_loss": 0.3475021421909332, "grad_norm": 0.48242417352719164, "learning_rate": 2.773389473684211e-05, "loss": 0.335, "mean_copy_accuracy": 0.9924831539392471, "mean_gen_accuracy": 0.8615248054265976, "mean_token_accuracy": 0.8891830742359161, "num_tokens": 685699280.0, "sample_num_tokens": 7901.0, "step": 2530, "total_num_tokens": 685730884.0, "z_loss": 0.0009343215497210622 }, { "copy_logits_max": -3.1384129524230957, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.0625, "epoch": 0.5169262190451877, "gen_logits_max": 6.1277618408203125, "gen_logits_mean": -13.560744285583496, "gen_logits_min": -24.909481048583984, "gen_logits_std": 2.4670495986938477, "gen_loss": 0.354714572429657, "grad_norm": 0.4755984384137862, "learning_rate": 2.773263157894737e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9947802275419235, "mean_gen_accuracy": 0.8626207113265991, "mean_token_accuracy": 0.8910622149705887, "num_tokens": 685975072.0, "sample_num_tokens": 7661.5, "step": 2531, "total_num_tokens": 686005718.0, "z_loss": 0.000917916651815176 }, { "copy_logits_max": -3.319000244140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.6875, "epoch": 0.5171304569823845, "gen_logits_max": 5.350563049316406, "gen_logits_mean": -13.658297538757324, "gen_logits_min": -24.661537170410156, "gen_logits_std": 2.4575552940368652, "gen_loss": 0.31461966037750244, "grad_norm": 0.42815477673381724, "learning_rate": 2.7731368421052634e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9914201349020004, "mean_gen_accuracy": 0.8717210590839386, "mean_token_accuracy": 0.8949101567268372, "num_tokens": 686238943.0, "sample_num_tokens": 7304.75, "step": 2532, "total_num_tokens": 686268162.0, "z_loss": 0.0008483199635520577 }, { "copy_logits_max": -2.618804693222046, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.625, "epoch": 0.5173346949195813, "gen_logits_max": 4.555673599243164, "gen_logits_mean": -14.673702239990234, "gen_logits_min": -26.309886932373047, "gen_logits_std": 2.533698558807373, "gen_loss": 0.3238031566143036, "grad_norm": 0.49127385135394375, "learning_rate": 2.7730105263157895e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9943446964025497, "mean_gen_accuracy": 0.859393298625946, "mean_token_accuracy": 0.8967809528112411, "num_tokens": 686515455.0, "sample_num_tokens": 9193.75, "step": 2533, "total_num_tokens": 686552230.0, "z_loss": 0.0009533989359624684 }, { "copy_logits_max": -4.257071495056152, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.4375, "epoch": 0.5175389328567781, "gen_logits_max": 5.788990020751953, "gen_logits_mean": -11.9777250289917, "gen_logits_min": -23.14895248413086, "gen_logits_std": 2.4453680515289307, "gen_loss": 0.3407805860042572, "grad_norm": 0.46159125783626287, "learning_rate": 2.772884210526316e-05, "loss": 0.3244, "mean_copy_accuracy": 0.993200957775116, "mean_gen_accuracy": 0.8587814420461655, "mean_token_accuracy": 0.8913673311471939, "num_tokens": 686796649.0, "sample_num_tokens": 8212.25, "step": 2534, "total_num_tokens": 686829498.0, "z_loss": 0.0008491078624501824 }, { "copy_logits_max": -1.5349565744400024, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.125, "epoch": 0.517743170793975, "gen_logits_max": 5.438467979431152, "gen_logits_mean": -11.990581512451172, "gen_logits_min": -23.28882598876953, "gen_logits_std": 2.518585205078125, "gen_loss": 0.3075793981552124, "grad_norm": 0.4543098206624341, "learning_rate": 2.772757894736842e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9938715696334839, "mean_gen_accuracy": 0.8605736941099167, "mean_token_accuracy": 0.8962605446577072, "num_tokens": 687069417.0, "sample_num_tokens": 9779.25, "step": 2535, "total_num_tokens": 687108534.0, "z_loss": 0.0009371633059345186 }, { "copy_logits_max": -3.9437177181243896, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.3125, "epoch": 0.5179474087311718, "gen_logits_max": 5.879135608673096, "gen_logits_mean": -12.09315013885498, "gen_logits_min": -23.149755477905273, "gen_logits_std": 2.407921075820923, "gen_loss": 0.33996981382369995, "grad_norm": 0.5301966118233177, "learning_rate": 2.7726315789473684e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9933529794216156, "mean_gen_accuracy": 0.8659576922655106, "mean_token_accuracy": 0.8965688198804855, "num_tokens": 687354512.0, "sample_num_tokens": 6831.0, "step": 2536, "total_num_tokens": 687381836.0, "z_loss": 0.0007725601317360997 }, { "copy_logits_max": -4.848458290100098, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.8125, "epoch": 0.5181516466683687, "gen_logits_max": 5.034834384918213, "gen_logits_mean": -12.755325317382812, "gen_logits_min": -23.949970245361328, "gen_logits_std": 2.5128135681152344, "gen_loss": 0.2865588665008545, "grad_norm": 0.4136006518167939, "learning_rate": 2.7725052631578945e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9935893714427948, "mean_gen_accuracy": 0.8723951131105423, "mean_token_accuracy": 0.9006824195384979, "num_tokens": 687624339.0, "sample_num_tokens": 9256.75, "step": 2537, "total_num_tokens": 687661366.0, "z_loss": 0.0007728705531917512 }, { "copy_logits_max": -3.0873379707336426, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.3125, "epoch": 0.5183558846055655, "gen_logits_max": 4.584418296813965, "gen_logits_mean": -12.579292297363281, "gen_logits_min": -24.32772445678711, "gen_logits_std": 2.4961040019989014, "gen_loss": 0.2709070146083832, "grad_norm": 0.4128108231166635, "learning_rate": 2.7723789473684213e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9941292256116867, "mean_gen_accuracy": 0.8684933036565781, "mean_token_accuracy": 0.9020015597343445, "num_tokens": 687893404.0, "sample_num_tokens": 9382.0, "step": 2538, "total_num_tokens": 687930932.0, "z_loss": 0.0006761759286746383 }, { "copy_logits_max": -4.0868330001831055, "copy_logits_min": -625000000.0, "copy_num_tokens": 410.0625, "epoch": 0.5185601225427623, "gen_logits_max": 4.978790283203125, "gen_logits_mean": -13.401716232299805, "gen_logits_min": -24.720481872558594, "gen_logits_std": 2.471989631652832, "gen_loss": 0.3448268473148346, "grad_norm": 0.458269041527128, "learning_rate": 2.7722526315789477e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9934916794300079, "mean_gen_accuracy": 0.8664308786392212, "mean_token_accuracy": 0.8967930227518082, "num_tokens": 688154188.0, "sample_num_tokens": 7580.0, "step": 2539, "total_num_tokens": 688184508.0, "z_loss": 0.0008104299195110798 }, { "copy_logits_max": -2.763984203338623, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.875, "epoch": 0.5187643604799591, "gen_logits_max": 4.885237216949463, "gen_logits_mean": -13.630868911743164, "gen_logits_min": -25.20754051208496, "gen_logits_std": 2.5417652130126953, "gen_loss": 0.306365430355072, "grad_norm": 0.4507300170166356, "learning_rate": 2.7721263157894738e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9943294078111649, "mean_gen_accuracy": 0.8645380437374115, "mean_token_accuracy": 0.8987817615270615, "num_tokens": 688414474.0, "sample_num_tokens": 8439.0, "step": 2540, "total_num_tokens": 688448230.0, "z_loss": 0.0007963122334331274 }, { "copy_logits_max": -4.540583610534668, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.75, "epoch": 0.518968598417156, "gen_logits_max": 4.48655891418457, "gen_logits_mean": -14.243821144104004, "gen_logits_min": -25.825958251953125, "gen_logits_std": 2.5443620681762695, "gen_loss": 0.333187460899353, "grad_norm": 0.4539312619303075, "learning_rate": 2.7720000000000002e-05, "loss": 0.3296, "mean_copy_accuracy": 0.9932652413845062, "mean_gen_accuracy": 0.8589579910039902, "mean_token_accuracy": 0.8923215270042419, "num_tokens": 688668482.0, "sample_num_tokens": 9245.0, "step": 2541, "total_num_tokens": 688705462.0, "z_loss": 0.0007897461182437837 }, { "copy_logits_max": -3.273622512817383, "copy_logits_min": -750000064.0, "copy_num_tokens": 549.375, "epoch": 0.5191728363543529, "gen_logits_max": 4.951150894165039, "gen_logits_mean": -12.845465660095215, "gen_logits_min": -24.344648361206055, "gen_logits_std": 2.476809501647949, "gen_loss": 0.34054145216941833, "grad_norm": 0.4395896277938379, "learning_rate": 2.7718736842105263e-05, "loss": 0.3388, "mean_copy_accuracy": 0.9938708543777466, "mean_gen_accuracy": 0.8523479551076889, "mean_token_accuracy": 0.8923665434122086, "num_tokens": 688966069.0, "sample_num_tokens": 8349.75, "step": 2542, "total_num_tokens": 688999468.0, "z_loss": 0.0008110449998639524 }, { "copy_logits_max": -7.042571067810059, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.3125, "epoch": 0.5193770742915497, "gen_logits_max": 5.463890075683594, "gen_logits_mean": -13.260034561157227, "gen_logits_min": -24.44853973388672, "gen_logits_std": 2.4741506576538086, "gen_loss": 0.3059687912464142, "grad_norm": 0.3758597872834746, "learning_rate": 2.7717473684210527e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9940761774778366, "mean_gen_accuracy": 0.8752012997865677, "mean_token_accuracy": 0.9040797352790833, "num_tokens": 689248770.0, "sample_num_tokens": 7525.5, "step": 2543, "total_num_tokens": 689278872.0, "z_loss": 0.0007761964807286859 }, { "copy_logits_max": -5.522324085235596, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.3125, "epoch": 0.5195813122287465, "gen_logits_max": 4.284728050231934, "gen_logits_mean": -13.946490287780762, "gen_logits_min": -25.631610870361328, "gen_logits_std": 2.4893360137939453, "gen_loss": 0.2874460220336914, "grad_norm": 0.40797902603327774, "learning_rate": 2.771621052631579e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9943631589412689, "mean_gen_accuracy": 0.8607970923185349, "mean_token_accuracy": 0.8940436840057373, "num_tokens": 689516138.0, "sample_num_tokens": 8183.5, "step": 2544, "total_num_tokens": 689548872.0, "z_loss": 0.0007581774843856692 }, { "copy_logits_max": -6.185140132904053, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.8125, "epoch": 0.5197855501659433, "gen_logits_max": 4.18848991394043, "gen_logits_mean": -14.074197769165039, "gen_logits_min": -25.093448638916016, "gen_logits_std": 2.493494749069214, "gen_loss": 0.31029245257377625, "grad_norm": 0.43442456615522695, "learning_rate": 2.7714947368421053e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9938867539167404, "mean_gen_accuracy": 0.8593405485153198, "mean_token_accuracy": 0.8902904391288757, "num_tokens": 689776226.0, "sample_num_tokens": 8359.0, "step": 2545, "total_num_tokens": 689809662.0, "z_loss": 0.0007284533348865807 }, { "copy_logits_max": -4.3266448974609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.9375, "epoch": 0.5199897881031401, "gen_logits_max": 4.551131248474121, "gen_logits_mean": -14.09422779083252, "gen_logits_min": -25.56504249572754, "gen_logits_std": 2.5040030479431152, "gen_loss": 0.2998296022415161, "grad_norm": 0.5203824617285778, "learning_rate": 2.7713684210526317e-05, "loss": 0.3217, "mean_copy_accuracy": 0.994264617562294, "mean_gen_accuracy": 0.8633151203393936, "mean_token_accuracy": 0.8949863761663437, "num_tokens": 690050670.0, "sample_num_tokens": 9243.0, "step": 2546, "total_num_tokens": 690087642.0, "z_loss": 0.0007624151185154915 }, { "copy_logits_max": -4.381267547607422, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.875, "epoch": 0.520194026040337, "gen_logits_max": 5.36082124710083, "gen_logits_mean": -13.514937400817871, "gen_logits_min": -24.714567184448242, "gen_logits_std": 2.4828944206237793, "gen_loss": 0.35127103328704834, "grad_norm": 0.4459584189331429, "learning_rate": 2.771242105263158e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9941397607326508, "mean_gen_accuracy": 0.8602309376001358, "mean_token_accuracy": 0.8911633938550949, "num_tokens": 690293175.0, "sample_num_tokens": 7883.25, "step": 2547, "total_num_tokens": 690324708.0, "z_loss": 0.0009489801013842225 }, { "copy_logits_max": -4.52742862701416, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.0625, "epoch": 0.5203982639775339, "gen_logits_max": 4.750019073486328, "gen_logits_mean": -14.210803031921387, "gen_logits_min": -25.758319854736328, "gen_logits_std": 2.575317144393921, "gen_loss": 0.31159019470214844, "grad_norm": 0.41047012662039767, "learning_rate": 2.7711157894736842e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9947439879179001, "mean_gen_accuracy": 0.8660716712474823, "mean_token_accuracy": 0.8976276367902756, "num_tokens": 690560087.0, "sample_num_tokens": 8293.25, "step": 2548, "total_num_tokens": 690593260.0, "z_loss": 0.000835077662486583 }, { "copy_logits_max": -3.4732658863067627, "copy_logits_min": -750000000.0, "copy_num_tokens": 661.8125, "epoch": 0.5206025019147307, "gen_logits_max": 4.07929801940918, "gen_logits_mean": -13.75571060180664, "gen_logits_min": -25.586822509765625, "gen_logits_std": 2.5265307426452637, "gen_loss": 0.26941734552383423, "grad_norm": 0.5132255854451463, "learning_rate": 2.7709894736842107e-05, "loss": 0.322, "mean_copy_accuracy": 0.993067130446434, "mean_gen_accuracy": 0.8591904789209366, "mean_token_accuracy": 0.8936799615621567, "num_tokens": 690825326.0, "sample_num_tokens": 9465.0, "step": 2549, "total_num_tokens": 690863186.0, "z_loss": 0.0007686138851568103 }, { "copy_logits_max": -5.024747848510742, "copy_logits_min": -750000064.0, "copy_num_tokens": 405.0, "epoch": 0.5208067398519275, "gen_logits_max": 5.160481929779053, "gen_logits_mean": -14.74271297454834, "gen_logits_min": -25.71468734741211, "gen_logits_std": 2.4737443923950195, "gen_loss": 0.3343399167060852, "grad_norm": 0.4617035377212, "learning_rate": 2.7708631578947368e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9941549450159073, "mean_gen_accuracy": 0.8600039333105087, "mean_token_accuracy": 0.8944508135318756, "num_tokens": 691090670.0, "sample_num_tokens": 8353.5, "step": 2550, "total_num_tokens": 691124084.0, "z_loss": 0.0008491795160807669 }, { "copy_logits_max": -4.047613620758057, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0, "epoch": 0.5210109777891243, "gen_logits_max": 4.943389892578125, "gen_logits_mean": -14.164240837097168, "gen_logits_min": -25.29363441467285, "gen_logits_std": 2.469390869140625, "gen_loss": 0.32976850867271423, "grad_norm": 0.4334093481345514, "learning_rate": 2.7707368421052632e-05, "loss": 0.3248, "mean_copy_accuracy": 0.9924654811620712, "mean_gen_accuracy": 0.8650394678115845, "mean_token_accuracy": 0.8914297819137573, "num_tokens": 691351430.0, "sample_num_tokens": 8383.0, "step": 2551, "total_num_tokens": 691384962.0, "z_loss": 0.0008652880205772817 }, { "copy_logits_max": -3.1802544593811035, "copy_logits_min": -687500032.0, "copy_num_tokens": 368.8125, "epoch": 0.5212152157263211, "gen_logits_max": 5.644573211669922, "gen_logits_mean": -14.252395629882812, "gen_logits_min": -25.682369232177734, "gen_logits_std": 2.510061502456665, "gen_loss": 0.38039010763168335, "grad_norm": 0.4446241306257942, "learning_rate": 2.7706105263157896e-05, "loss": 0.3358, "mean_copy_accuracy": 0.9930597096681595, "mean_gen_accuracy": 0.8599894195795059, "mean_token_accuracy": 0.8899794965982437, "num_tokens": 691612792.0, "sample_num_tokens": 8215.0, "step": 2552, "total_num_tokens": 691645652.0, "z_loss": 0.0010468255495652556 }, { "copy_logits_max": -3.3362512588500977, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.25, "epoch": 0.521419453663518, "gen_logits_max": 4.446658611297607, "gen_logits_mean": -13.632955551147461, "gen_logits_min": -24.44639015197754, "gen_logits_std": 2.4536678791046143, "gen_loss": 0.31167954206466675, "grad_norm": 0.4244686014152387, "learning_rate": 2.7704842105263157e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9945169240236282, "mean_gen_accuracy": 0.869876429438591, "mean_token_accuracy": 0.9009996950626373, "num_tokens": 691896177.0, "sample_num_tokens": 8424.75, "step": 2553, "total_num_tokens": 691929876.0, "z_loss": 0.0008778463234193623 }, { "copy_logits_max": -2.9484715461730957, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.8125, "epoch": 0.5216236916007149, "gen_logits_max": 4.828881740570068, "gen_logits_mean": -12.903904914855957, "gen_logits_min": -24.51618194580078, "gen_logits_std": 2.533895969390869, "gen_loss": 0.35468152165412903, "grad_norm": 0.4292772074350703, "learning_rate": 2.7703578947368425e-05, "loss": 0.316, "mean_copy_accuracy": 0.9940660893917084, "mean_gen_accuracy": 0.8581137359142303, "mean_token_accuracy": 0.8965651243925095, "num_tokens": 692181831.0, "sample_num_tokens": 8955.25, "step": 2554, "total_num_tokens": 692217652.0, "z_loss": 0.000939259072765708 }, { "copy_logits_max": -4.375243186950684, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.8125, "epoch": 0.5218279295379117, "gen_logits_max": 4.884361743927002, "gen_logits_mean": -14.472444534301758, "gen_logits_min": -25.619354248046875, "gen_logits_std": 2.4847402572631836, "gen_loss": 0.3706818222999573, "grad_norm": 0.4574756704830488, "learning_rate": 2.7702315789473686e-05, "loss": 0.3273, "mean_copy_accuracy": 0.993307888507843, "mean_gen_accuracy": 0.8604226261377335, "mean_token_accuracy": 0.8922672867774963, "num_tokens": 692450481.0, "sample_num_tokens": 8936.75, "step": 2555, "total_num_tokens": 692486228.0, "z_loss": 0.000847645103931427 }, { "copy_logits_max": -4.3952531814575195, "copy_logits_min": -750000128.0, "copy_num_tokens": 603.25, "epoch": 0.5220321674751085, "gen_logits_max": 4.340641975402832, "gen_logits_mean": -15.013318061828613, "gen_logits_min": -26.623733520507812, "gen_logits_std": 2.5622310638427734, "gen_loss": 0.3082110285758972, "grad_norm": 0.47176042460428447, "learning_rate": 2.770105263157895e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9934256821870804, "mean_gen_accuracy": 0.860671728849411, "mean_token_accuracy": 0.8958054184913635, "num_tokens": 692724928.0, "sample_num_tokens": 9719.0, "step": 2556, "total_num_tokens": 692763804.0, "z_loss": 0.0007395847933366895 }, { "copy_logits_max": -3.55489182472229, "copy_logits_min": -750000064.0, "copy_num_tokens": 488.75, "epoch": 0.5222364054123053, "gen_logits_max": 5.042880058288574, "gen_logits_mean": -13.544843673706055, "gen_logits_min": -25.13821029663086, "gen_logits_std": 2.5495071411132812, "gen_loss": 0.30292683839797974, "grad_norm": 0.43102497949648194, "learning_rate": 2.769978947368421e-05, "loss": 0.308, "mean_copy_accuracy": 0.9947627037763596, "mean_gen_accuracy": 0.8634325563907623, "mean_token_accuracy": 0.896992415189743, "num_tokens": 693018546.0, "sample_num_tokens": 7987.5, "step": 2557, "total_num_tokens": 693050496.0, "z_loss": 0.0007881688070483506 }, { "copy_logits_max": -6.408760070800781, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.5, "epoch": 0.5224406433495021, "gen_logits_max": 5.010360240936279, "gen_logits_mean": -13.378271102905273, "gen_logits_min": -24.235376358032227, "gen_logits_std": 2.474440574645996, "gen_loss": 0.35575494170188904, "grad_norm": 0.4887937064290117, "learning_rate": 2.7698526315789475e-05, "loss": 0.3359, "mean_copy_accuracy": 0.9908605217933655, "mean_gen_accuracy": 0.8629539310932159, "mean_token_accuracy": 0.8873644024133682, "num_tokens": 693265382.0, "sample_num_tokens": 7783.5, "step": 2558, "total_num_tokens": 693296516.0, "z_loss": 0.0007926651742309332 }, { "copy_logits_max": -7.138774871826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 296.8125, "epoch": 0.5226448812866991, "gen_logits_max": 5.771245956420898, "gen_logits_mean": -13.86772346496582, "gen_logits_min": -24.638427734375, "gen_logits_std": 2.447139263153076, "gen_loss": 0.3491755723953247, "grad_norm": 0.4259287673033838, "learning_rate": 2.7697263157894736e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9947527647018433, "mean_gen_accuracy": 0.8589995205402374, "mean_token_accuracy": 0.8885167390108109, "num_tokens": 693524769.0, "sample_num_tokens": 8672.75, "step": 2559, "total_num_tokens": 693559460.0, "z_loss": 0.0008145651081576943 }, { "copy_logits_max": -6.313897132873535, "copy_logits_min": -687500032.0, "copy_num_tokens": 242.625, "epoch": 0.5228491192238959, "gen_logits_max": 5.876337051391602, "gen_logits_mean": -13.609302520751953, "gen_logits_min": -24.380916595458984, "gen_logits_std": 2.4494097232818604, "gen_loss": 0.34923696517944336, "grad_norm": 0.4543881691327576, "learning_rate": 2.7696e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9924282729625702, "mean_gen_accuracy": 0.8678565621376038, "mean_token_accuracy": 0.8946542590856552, "num_tokens": 693791203.0, "sample_num_tokens": 7668.25, "step": 2560, "total_num_tokens": 693821876.0, "z_loss": 0.0008465509163215756 }, { "copy_logits_max": -6.637901782989502, "copy_logits_min": -750000128.0, "copy_num_tokens": 358.4375, "epoch": 0.5230533571610927, "gen_logits_max": 4.9499125480651855, "gen_logits_mean": -13.583248138427734, "gen_logits_min": -24.59588623046875, "gen_logits_std": 2.4713776111602783, "gen_loss": 0.3212675452232361, "grad_norm": 0.4550941774808478, "learning_rate": 2.769473684210526e-05, "loss": 0.3305, "mean_copy_accuracy": 0.9941717982292175, "mean_gen_accuracy": 0.8608502596616745, "mean_token_accuracy": 0.8920716047286987, "num_tokens": 694035922.0, "sample_num_tokens": 7311.5, "step": 2561, "total_num_tokens": 694065168.0, "z_loss": 0.0007537354249507189 }, { "copy_logits_max": -3.7594399452209473, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.1875, "epoch": 0.5232575950982895, "gen_logits_max": 5.010962009429932, "gen_logits_mean": -13.808591842651367, "gen_logits_min": -25.15678596496582, "gen_logits_std": 2.5120625495910645, "gen_loss": 0.36443185806274414, "grad_norm": 0.4144240767646562, "learning_rate": 2.769347368421053e-05, "loss": 0.3244, "mean_copy_accuracy": 0.9932261407375336, "mean_gen_accuracy": 0.8604082316160202, "mean_token_accuracy": 0.8926285803318024, "num_tokens": 694312785.0, "sample_num_tokens": 10125.75, "step": 2562, "total_num_tokens": 694353288.0, "z_loss": 0.0009207292459905148 }, { "copy_logits_max": -5.218727111816406, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.5625, "epoch": 0.5234618330354863, "gen_logits_max": 5.143263816833496, "gen_logits_mean": -12.202093124389648, "gen_logits_min": -23.681594848632812, "gen_logits_std": 2.49777889251709, "gen_loss": 0.3355804681777954, "grad_norm": 0.4537430554739836, "learning_rate": 2.769221052631579e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9942275136709213, "mean_gen_accuracy": 0.8555821180343628, "mean_token_accuracy": 0.8924808949232101, "num_tokens": 694576901.0, "sample_num_tokens": 8240.25, "step": 2563, "total_num_tokens": 694609862.0, "z_loss": 0.0008911863551475108 }, { "copy_logits_max": -3.060455322265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.8125, "epoch": 0.5236660709726831, "gen_logits_max": 4.993078231811523, "gen_logits_mean": -12.568623542785645, "gen_logits_min": -24.33417510986328, "gen_logits_std": 2.548297882080078, "gen_loss": 0.2648610472679138, "grad_norm": 0.4402631016708205, "learning_rate": 2.7690947368421054e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9945282936096191, "mean_gen_accuracy": 0.8624064028263092, "mean_token_accuracy": 0.8969161808490753, "num_tokens": 694854584.0, "sample_num_tokens": 7783.0, "step": 2564, "total_num_tokens": 694885716.0, "z_loss": 0.000787381490226835 }, { "copy_logits_max": -4.0933380126953125, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.125, "epoch": 0.52387030890988, "gen_logits_max": 4.827322006225586, "gen_logits_mean": -12.976595878601074, "gen_logits_min": -24.576416015625, "gen_logits_std": 2.577211856842041, "gen_loss": 0.3340783715248108, "grad_norm": 0.4318093529085861, "learning_rate": 2.768968421052632e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9948424994945526, "mean_gen_accuracy": 0.8606384694576263, "mean_token_accuracy": 0.8961503803730011, "num_tokens": 695134903.0, "sample_num_tokens": 8736.25, "step": 2565, "total_num_tokens": 695169848.0, "z_loss": 0.0008901576511561871 }, { "copy_logits_max": -5.434122085571289, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.4375, "epoch": 0.5240745468470769, "gen_logits_max": 4.222352981567383, "gen_logits_mean": -13.07199478149414, "gen_logits_min": -24.25768280029297, "gen_logits_std": 2.4738917350769043, "gen_loss": 0.27139824628829956, "grad_norm": 0.47192973733427046, "learning_rate": 2.768842105263158e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9927074164152145, "mean_gen_accuracy": 0.8529019206762314, "mean_token_accuracy": 0.8910896629095078, "num_tokens": 695416374.0, "sample_num_tokens": 9801.0, "step": 2566, "total_num_tokens": 695455578.0, "z_loss": 0.0007171678589656949 }, { "copy_logits_max": -3.1716692447662354, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.3125, "epoch": 0.5242787847842737, "gen_logits_max": 5.392064571380615, "gen_logits_mean": -12.129600524902344, "gen_logits_min": -23.1397705078125, "gen_logits_std": 2.5046539306640625, "gen_loss": 0.3517240583896637, "grad_norm": 0.4574333097846962, "learning_rate": 2.7687157894736844e-05, "loss": 0.3495, "mean_copy_accuracy": 0.993466779589653, "mean_gen_accuracy": 0.8544900119304657, "mean_token_accuracy": 0.884665846824646, "num_tokens": 695680049.0, "sample_num_tokens": 8299.25, "step": 2567, "total_num_tokens": 695713246.0, "z_loss": 0.0009261791710741818 }, { "copy_logits_max": -4.147943496704102, "copy_logits_min": -687500032.0, "copy_num_tokens": 353.0625, "epoch": 0.5244830227214705, "gen_logits_max": 4.871781826019287, "gen_logits_mean": -14.78847599029541, "gen_logits_min": -26.131521224975586, "gen_logits_std": 2.5372507572174072, "gen_loss": 0.30202537775039673, "grad_norm": 0.5210705231027611, "learning_rate": 2.7685894736842105e-05, "loss": 0.3188, "mean_copy_accuracy": 0.9939281046390533, "mean_gen_accuracy": 0.8597324639558792, "mean_token_accuracy": 0.8957053273916245, "num_tokens": 695955957.0, "sample_num_tokens": 7813.25, "step": 2568, "total_num_tokens": 695987210.0, "z_loss": 0.0008587075863033533 }, { "copy_logits_max": -2.540489673614502, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.1875, "epoch": 0.5246872606586673, "gen_logits_max": 5.770677089691162, "gen_logits_mean": -13.402948379516602, "gen_logits_min": -24.70855712890625, "gen_logits_std": 2.538749933242798, "gen_loss": 0.35184144973754883, "grad_norm": 0.4664615364114023, "learning_rate": 2.768463157894737e-05, "loss": 0.3418, "mean_copy_accuracy": 0.9945701956748962, "mean_gen_accuracy": 0.8576654195785522, "mean_token_accuracy": 0.88861183822155, "num_tokens": 696216189.0, "sample_num_tokens": 8199.25, "step": 2569, "total_num_tokens": 696248986.0, "z_loss": 0.0009320159442722797 }, { "copy_logits_max": -5.429660320281982, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.375, "epoch": 0.5248914985958641, "gen_logits_max": 5.414605140686035, "gen_logits_mean": -13.60612678527832, "gen_logits_min": -24.937288284301758, "gen_logits_std": 2.5269393920898438, "gen_loss": 0.37160271406173706, "grad_norm": 0.4647455104481007, "learning_rate": 2.768336842105263e-05, "loss": 0.3508, "mean_copy_accuracy": 0.992845892906189, "mean_gen_accuracy": 0.8515928983688354, "mean_token_accuracy": 0.8855358958244324, "num_tokens": 696471150.0, "sample_num_tokens": 6761.5, "step": 2570, "total_num_tokens": 696498196.0, "z_loss": 0.0008976981043815613 }, { "copy_logits_max": -5.0402631759643555, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.625, "epoch": 0.525095736533061, "gen_logits_max": 4.8604536056518555, "gen_logits_mean": -14.996223449707031, "gen_logits_min": -26.115299224853516, "gen_logits_std": 2.536898136138916, "gen_loss": 0.31649476289749146, "grad_norm": 0.4402531761752499, "learning_rate": 2.7682105263157898e-05, "loss": 0.306, "mean_copy_accuracy": 0.9933136850595474, "mean_gen_accuracy": 0.8679345697164536, "mean_token_accuracy": 0.8971824049949646, "num_tokens": 696722657.0, "sample_num_tokens": 8250.25, "step": 2571, "total_num_tokens": 696755658.0, "z_loss": 0.0007773413090035319 }, { "copy_logits_max": -2.0255346298217773, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 0.5252999744702579, "gen_logits_max": 5.67891263961792, "gen_logits_mean": -13.06488037109375, "gen_logits_min": -24.797733306884766, "gen_logits_std": 2.5721821784973145, "gen_loss": 0.33740803599357605, "grad_norm": 0.44134779217382114, "learning_rate": 2.768084210526316e-05, "loss": 0.336, "mean_copy_accuracy": 0.9954636991024017, "mean_gen_accuracy": 0.8542990833520889, "mean_token_accuracy": 0.887506827712059, "num_tokens": 696993104.0, "sample_num_tokens": 7846.0, "step": 2572, "total_num_tokens": 697024488.0, "z_loss": 0.0008710051188245416 }, { "copy_logits_max": -2.7541046142578125, "copy_logits_min": -750000064.0, "copy_num_tokens": 476.75, "epoch": 0.5255042124074547, "gen_logits_max": 5.344205379486084, "gen_logits_mean": -13.3740873336792, "gen_logits_min": -24.753934860229492, "gen_logits_std": 2.533397674560547, "gen_loss": 0.2964206337928772, "grad_norm": 0.44190910184070287, "learning_rate": 2.7679578947368423e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9952888190746307, "mean_gen_accuracy": 0.8624295294284821, "mean_token_accuracy": 0.8961777985095978, "num_tokens": 697270706.0, "sample_num_tokens": 8218.5, "step": 2573, "total_num_tokens": 697303580.0, "z_loss": 0.0008462572586722672 }, { "copy_logits_max": -2.5773189067840576, "copy_logits_min": -687500032.0, "copy_num_tokens": 655.3125, "epoch": 0.5257084503446515, "gen_logits_max": 5.065452575683594, "gen_logits_mean": -13.230443954467773, "gen_logits_min": -24.9239559173584, "gen_logits_std": 2.573244571685791, "gen_loss": 0.3556908965110779, "grad_norm": 0.47783786491110863, "learning_rate": 2.7678315789473684e-05, "loss": 0.3226, "mean_copy_accuracy": 0.9942090213298798, "mean_gen_accuracy": 0.8593256324529648, "mean_token_accuracy": 0.8947524726390839, "num_tokens": 697542426.0, "sample_num_tokens": 9554.0, "step": 2574, "total_num_tokens": 697580642.0, "z_loss": 0.0008989071939140558 }, { "copy_logits_max": -4.263287544250488, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.25, "epoch": 0.5259126882818483, "gen_logits_max": 4.207346439361572, "gen_logits_mean": -14.210659980773926, "gen_logits_min": -26.238903045654297, "gen_logits_std": 2.5804052352905273, "gen_loss": 0.28838402032852173, "grad_norm": 0.4848035183694721, "learning_rate": 2.7677052631578948e-05, "loss": 0.3436, "mean_copy_accuracy": 0.9913288056850433, "mean_gen_accuracy": 0.8571905195713043, "mean_token_accuracy": 0.889327809214592, "num_tokens": 697807605.0, "sample_num_tokens": 9597.75, "step": 2575, "total_num_tokens": 697845996.0, "z_loss": 0.0007111832965165377 }, { "copy_logits_max": -3.136585235595703, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.8125, "epoch": 0.5261169262190452, "gen_logits_max": 5.187225341796875, "gen_logits_mean": -13.690567016601562, "gen_logits_min": -25.155128479003906, "gen_logits_std": 2.544374942779541, "gen_loss": 0.3414285182952881, "grad_norm": 0.48321487947750047, "learning_rate": 2.767578947368421e-05, "loss": 0.3345, "mean_copy_accuracy": 0.9933003187179565, "mean_gen_accuracy": 0.8562329113483429, "mean_token_accuracy": 0.8871556669473648, "num_tokens": 698076141.0, "sample_num_tokens": 9429.75, "step": 2576, "total_num_tokens": 698113860.0, "z_loss": 0.0008395960321649909 }, { "copy_logits_max": -3.5799520015716553, "copy_logits_min": -750000064.0, "copy_num_tokens": 375.0625, "epoch": 0.526321164156242, "gen_logits_max": 5.412758827209473, "gen_logits_mean": -14.048089981079102, "gen_logits_min": -25.38353729248047, "gen_logits_std": 2.5969433784484863, "gen_loss": 0.2961035966873169, "grad_norm": 0.43245611481642404, "learning_rate": 2.7674526315789473e-05, "loss": 0.32, "mean_copy_accuracy": 0.9931428581476212, "mean_gen_accuracy": 0.866388127207756, "mean_token_accuracy": 0.893555223941803, "num_tokens": 698336784.0, "sample_num_tokens": 8015.0, "step": 2577, "total_num_tokens": 698368844.0, "z_loss": 0.0006671941373497248 }, { "copy_logits_max": -3.3586950302124023, "copy_logits_min": -687500032.0, "copy_num_tokens": 489.5625, "epoch": 0.5265254020934389, "gen_logits_max": 5.76174783706665, "gen_logits_mean": -13.206148147583008, "gen_logits_min": -24.99287223815918, "gen_logits_std": 2.5857930183410645, "gen_loss": 0.3399978280067444, "grad_norm": 0.4737663220759521, "learning_rate": 2.7673263157894734e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9945966005325317, "mean_gen_accuracy": 0.8611156642436981, "mean_token_accuracy": 0.893834725022316, "num_tokens": 698604212.0, "sample_num_tokens": 9115.0, "step": 2578, "total_num_tokens": 698640672.0, "z_loss": 0.0007970191654749215 }, { "copy_logits_max": -4.3754682540893555, "copy_logits_min": -625000064.0, "copy_num_tokens": 463.4375, "epoch": 0.5267296400306357, "gen_logits_max": 5.23103141784668, "gen_logits_mean": -13.413025856018066, "gen_logits_min": -25.002174377441406, "gen_logits_std": 2.554182529449463, "gen_loss": 0.3280148506164551, "grad_norm": 0.5096829178459544, "learning_rate": 2.7672000000000002e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9935718476772308, "mean_gen_accuracy": 0.868335634469986, "mean_token_accuracy": 0.8932523727416992, "num_tokens": 698854512.0, "sample_num_tokens": 8572.0, "step": 2579, "total_num_tokens": 698888800.0, "z_loss": 0.0007287738844752312 }, { "copy_logits_max": -4.552431106567383, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.125, "epoch": 0.5269338779678325, "gen_logits_max": 5.103109359741211, "gen_logits_mean": -14.6561279296875, "gen_logits_min": -25.847721099853516, "gen_logits_std": 2.538578987121582, "gen_loss": 0.3475211560726166, "grad_norm": 0.4580834066008014, "learning_rate": 2.7670736842105266e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9931197911500931, "mean_gen_accuracy": 0.8629539459943771, "mean_token_accuracy": 0.890555128455162, "num_tokens": 699107026.0, "sample_num_tokens": 7657.0, "step": 2580, "total_num_tokens": 699137654.0, "z_loss": 0.0007783955661579967 }, { "copy_logits_max": -3.2248926162719727, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.0, "epoch": 0.5271381159050293, "gen_logits_max": 5.648193359375, "gen_logits_mean": -13.501035690307617, "gen_logits_min": -25.03688621520996, "gen_logits_std": 2.475762367248535, "gen_loss": 0.30253538489341736, "grad_norm": 0.42635932765706147, "learning_rate": 2.7669473684210527e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9947759062051773, "mean_gen_accuracy": 0.8624488711357117, "mean_token_accuracy": 0.897421583533287, "num_tokens": 699370167.0, "sample_num_tokens": 8331.75, "step": 2581, "total_num_tokens": 699403494.0, "z_loss": 0.0007546941051259637 }, { "copy_logits_max": -2.9636526107788086, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.1875, "epoch": 0.5273423538422262, "gen_logits_max": 5.446639060974121, "gen_logits_mean": -14.190838813781738, "gen_logits_min": -25.14435386657715, "gen_logits_std": 2.502655506134033, "gen_loss": 0.35102713108062744, "grad_norm": 0.480391280074649, "learning_rate": 2.766821052631579e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9934068471193314, "mean_gen_accuracy": 0.8653401434421539, "mean_token_accuracy": 0.8943989276885986, "num_tokens": 699629433.0, "sample_num_tokens": 8035.75, "step": 2582, "total_num_tokens": 699661576.0, "z_loss": 0.0007873671129345894 }, { "copy_logits_max": -3.3686137199401855, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.875, "epoch": 0.527546591779423, "gen_logits_max": 5.333774566650391, "gen_logits_mean": -13.557268142700195, "gen_logits_min": -25.037675857543945, "gen_logits_std": 2.5968856811523438, "gen_loss": 0.3013579249382019, "grad_norm": 0.4352394226848435, "learning_rate": 2.7666947368421053e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9944885969161987, "mean_gen_accuracy": 0.8650203794240952, "mean_token_accuracy": 0.8955489099025726, "num_tokens": 699914312.0, "sample_num_tokens": 8619.5, "step": 2583, "total_num_tokens": 699948790.0, "z_loss": 0.0008018587250262499 }, { "copy_logits_max": -1.5299816131591797, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.875, "epoch": 0.5277508297166199, "gen_logits_max": 5.594050407409668, "gen_logits_mean": -13.069878578186035, "gen_logits_min": -24.610759735107422, "gen_logits_std": 2.5650596618652344, "gen_loss": 0.3633677661418915, "grad_norm": 0.45581437716427897, "learning_rate": 2.7665684210526317e-05, "loss": 0.334, "mean_copy_accuracy": 0.9950401186943054, "mean_gen_accuracy": 0.856298103928566, "mean_token_accuracy": 0.8896882832050323, "num_tokens": 700187430.0, "sample_num_tokens": 8547.5, "step": 2584, "total_num_tokens": 700221620.0, "z_loss": 0.0009592106216587126 }, { "copy_logits_max": -3.2977399826049805, "copy_logits_min": -750000064.0, "copy_num_tokens": 547.1875, "epoch": 0.5279550676538167, "gen_logits_max": 4.642622947692871, "gen_logits_mean": -13.926032066345215, "gen_logits_min": -25.1046142578125, "gen_logits_std": 2.523139476776123, "gen_loss": 0.2863505780696869, "grad_norm": 0.4014499527641548, "learning_rate": 2.7664421052631578e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9942201673984528, "mean_gen_accuracy": 0.8663471192121506, "mean_token_accuracy": 0.9044059962034225, "num_tokens": 700478043.0, "sample_num_tokens": 8452.75, "step": 2585, "total_num_tokens": 700511854.0, "z_loss": 0.0008044978603720665 }, { "copy_logits_max": -1.7881715297698975, "copy_logits_min": -625000064.0, "copy_num_tokens": 731.875, "epoch": 0.5281593055910135, "gen_logits_max": 4.928978443145752, "gen_logits_mean": -13.638833999633789, "gen_logits_min": -25.31447982788086, "gen_logits_std": 2.550433397293091, "gen_loss": 0.29992589354515076, "grad_norm": 0.4231376462815922, "learning_rate": 2.7663157894736842e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9947906732559204, "mean_gen_accuracy": 0.8658614307641983, "mean_token_accuracy": 0.8990118652582169, "num_tokens": 700767533.0, "sample_num_tokens": 9957.25, "step": 2586, "total_num_tokens": 700807362.0, "z_loss": 0.000808530836366117 }, { "copy_logits_max": -3.0285074710845947, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.0625, "epoch": 0.5283635435282104, "gen_logits_max": 4.676506042480469, "gen_logits_mean": -13.933538436889648, "gen_logits_min": -25.669540405273438, "gen_logits_std": 2.6096129417419434, "gen_loss": 0.32018065452575684, "grad_norm": 0.6418618229028321, "learning_rate": 2.7661894736842106e-05, "loss": 0.31, "mean_copy_accuracy": 0.9936757832765579, "mean_gen_accuracy": 0.863015279173851, "mean_token_accuracy": 0.8975345194339752, "num_tokens": 701065551.0, "sample_num_tokens": 8604.75, "step": 2587, "total_num_tokens": 701099970.0, "z_loss": 0.0007332757813856006 }, { "copy_logits_max": -1.027219533920288, "copy_logits_min": -750000064.0, "copy_num_tokens": 797.0625, "epoch": 0.5285677814654072, "gen_logits_max": 4.258844375610352, "gen_logits_mean": -13.291730880737305, "gen_logits_min": -24.913618087768555, "gen_logits_std": 2.5527706146240234, "gen_loss": 0.30357837677001953, "grad_norm": 0.4339635413244413, "learning_rate": 2.766063157894737e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9945170432329178, "mean_gen_accuracy": 0.8584611713886261, "mean_token_accuracy": 0.8989281952381134, "num_tokens": 701359077.0, "sample_num_tokens": 10095.75, "step": 2588, "total_num_tokens": 701399460.0, "z_loss": 0.0008273147395811975 }, { "copy_logits_max": -1.5854952335357666, "copy_logits_min": -750000128.0, "copy_num_tokens": 531.75, "epoch": 0.528772019402604, "gen_logits_max": 4.085125923156738, "gen_logits_mean": -13.745559692382812, "gen_logits_min": -25.53193473815918, "gen_logits_std": 2.6251230239868164, "gen_loss": 0.29244959354400635, "grad_norm": 0.44874192775702854, "learning_rate": 2.765936842105263e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9945896416902542, "mean_gen_accuracy": 0.8627168089151382, "mean_token_accuracy": 0.899946317076683, "num_tokens": 701666746.0, "sample_num_tokens": 8011.5, "step": 2589, "total_num_tokens": 701698792.0, "z_loss": 0.0007153376354835927 }, { "copy_logits_max": -2.8391222953796387, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.9375, "epoch": 0.5289762573398009, "gen_logits_max": 3.608396291732788, "gen_logits_mean": -14.448708534240723, "gen_logits_min": -26.155485153198242, "gen_logits_std": 2.5660324096679688, "gen_loss": 0.3061193823814392, "grad_norm": 0.4247896846074365, "learning_rate": 2.7658105263157896e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9953523576259613, "mean_gen_accuracy": 0.8689799606800079, "mean_token_accuracy": 0.9025870859622955, "num_tokens": 701946464.0, "sample_num_tokens": 9202.5, "step": 2590, "total_num_tokens": 701983274.0, "z_loss": 0.0007120153168216348 }, { "copy_logits_max": -0.7250156402587891, "copy_logits_min": -625000064.0, "copy_num_tokens": 666.3125, "epoch": 0.5291804952769977, "gen_logits_max": 5.1278181076049805, "gen_logits_mean": -12.578205108642578, "gen_logits_min": -24.438678741455078, "gen_logits_std": 2.6516757011413574, "gen_loss": 0.3190103769302368, "grad_norm": 0.5217621622213009, "learning_rate": 2.7656842105263157e-05, "loss": 0.3081, "mean_copy_accuracy": 0.9944610446691513, "mean_gen_accuracy": 0.8552477061748505, "mean_token_accuracy": 0.899386078119278, "num_tokens": 702241000.0, "sample_num_tokens": 9553.5, "step": 2591, "total_num_tokens": 702279214.0, "z_loss": 0.0007679174887016416 }, { "copy_logits_max": -1.057289958000183, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.6875, "epoch": 0.5293847332141945, "gen_logits_max": 6.005008220672607, "gen_logits_mean": -12.846480369567871, "gen_logits_min": -24.337839126586914, "gen_logits_std": 2.5944275856018066, "gen_loss": 0.3445618748664856, "grad_norm": 0.45798385694950133, "learning_rate": 2.765557894736842e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9934672713279724, "mean_gen_accuracy": 0.8591519296169281, "mean_token_accuracy": 0.8929914385080338, "num_tokens": 702507083.0, "sample_num_tokens": 7540.75, "step": 2592, "total_num_tokens": 702537246.0, "z_loss": 0.0007997732609510422 }, { "copy_logits_max": -2.6062755584716797, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.375, "epoch": 0.5295889711513914, "gen_logits_max": 4.8174920082092285, "gen_logits_mean": -14.402576446533203, "gen_logits_min": -25.956336975097656, "gen_logits_std": 2.5489015579223633, "gen_loss": 0.3403615951538086, "grad_norm": 0.46405638381090125, "learning_rate": 2.7654315789473685e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9947100281715393, "mean_gen_accuracy": 0.8640625327825546, "mean_token_accuracy": 0.89505535364151, "num_tokens": 702780798.0, "sample_num_tokens": 9703.0, "step": 2593, "total_num_tokens": 702819610.0, "z_loss": 0.000826445349957794 }, { "copy_logits_max": -5.129530906677246, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.8125, "epoch": 0.5297932090885882, "gen_logits_max": 4.86798095703125, "gen_logits_mean": -13.680902481079102, "gen_logits_min": -24.871829986572266, "gen_logits_std": 2.5449657440185547, "gen_loss": 0.3384839594364166, "grad_norm": 0.4039726279454486, "learning_rate": 2.7653052631578946e-05, "loss": 0.3094, "mean_copy_accuracy": 0.9951801747083664, "mean_gen_accuracy": 0.8676835298538208, "mean_token_accuracy": 0.8976685851812363, "num_tokens": 703065765.0, "sample_num_tokens": 8527.25, "step": 2594, "total_num_tokens": 703099874.0, "z_loss": 0.0007752685924060643 }, { "copy_logits_max": -2.084160804748535, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.375, "epoch": 0.529997447025785, "gen_logits_max": 4.531470775604248, "gen_logits_mean": -13.17664623260498, "gen_logits_min": -24.84059715270996, "gen_logits_std": 2.57066011428833, "gen_loss": 0.29876258969306946, "grad_norm": 0.4696866875893108, "learning_rate": 2.7651789473684214e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9935748726129532, "mean_gen_accuracy": 0.863427922129631, "mean_token_accuracy": 0.8991064429283142, "num_tokens": 703339396.0, "sample_num_tokens": 8696.0, "step": 2595, "total_num_tokens": 703374180.0, "z_loss": 0.0008166242623701692 }, { "copy_logits_max": -3.9148218631744385, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.5625, "epoch": 0.5302016849629819, "gen_logits_max": 6.1952314376831055, "gen_logits_mean": -12.492384910583496, "gen_logits_min": -23.98579978942871, "gen_logits_std": 2.534085273742676, "gen_loss": 0.3183148503303528, "grad_norm": 0.4512735428152353, "learning_rate": 2.7650526315789475e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9916855841875076, "mean_gen_accuracy": 0.8702610731124878, "mean_token_accuracy": 0.8998152911663055, "num_tokens": 703629706.0, "sample_num_tokens": 7762.5, "step": 2596, "total_num_tokens": 703660756.0, "z_loss": 0.0008521032286807895 }, { "copy_logits_max": -1.6207411289215088, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.4375, "epoch": 0.5304059229001787, "gen_logits_max": 4.6155266761779785, "gen_logits_mean": -13.16425609588623, "gen_logits_min": -24.82598304748535, "gen_logits_std": 2.5596940517425537, "gen_loss": 0.2979470491409302, "grad_norm": 0.4658199201262189, "learning_rate": 2.764926315789474e-05, "loss": 0.3212, "mean_copy_accuracy": 0.992655947804451, "mean_gen_accuracy": 0.8659283071756363, "mean_token_accuracy": 0.8947361260652542, "num_tokens": 703909065.0, "sample_num_tokens": 8603.75, "step": 2597, "total_num_tokens": 703943480.0, "z_loss": 0.0009112627012655139 }, { "copy_logits_max": -0.527703046798706, "copy_logits_min": -687500032.0, "copy_num_tokens": 526.5, "epoch": 0.5306101608373756, "gen_logits_max": 5.501031875610352, "gen_logits_mean": -12.430488586425781, "gen_logits_min": -24.27605628967285, "gen_logits_std": 2.6043519973754883, "gen_loss": 0.33183664083480835, "grad_norm": 0.4628155226966951, "learning_rate": 2.7648e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9943456798791885, "mean_gen_accuracy": 0.8560205549001694, "mean_token_accuracy": 0.894787073135376, "num_tokens": 704194991.0, "sample_num_tokens": 8652.25, "step": 2598, "total_num_tokens": 704229600.0, "z_loss": 0.0009710289305076003 }, { "copy_logits_max": -2.0646443367004395, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.75, "epoch": 0.5308143987745724, "gen_logits_max": 5.443987846374512, "gen_logits_mean": -12.552947998046875, "gen_logits_min": -24.22724151611328, "gen_logits_std": 2.5616652965545654, "gen_loss": 0.359358012676239, "grad_norm": 0.5576716285472516, "learning_rate": 2.7646736842105265e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9935116171836853, "mean_gen_accuracy": 0.8579599559307098, "mean_token_accuracy": 0.8945286124944687, "num_tokens": 704477794.0, "sample_num_tokens": 7257.5, "step": 2599, "total_num_tokens": 704506824.0, "z_loss": 0.0008605023031122983 }, { "copy_logits_max": -3.4640748500823975, "copy_logits_min": -687500032.0, "copy_num_tokens": 627.75, "epoch": 0.5310186367117692, "gen_logits_max": 4.308769702911377, "gen_logits_mean": -14.068246841430664, "gen_logits_min": -25.714872360229492, "gen_logits_std": 2.561290979385376, "gen_loss": 0.29887330532073975, "grad_norm": 0.4035929311880443, "learning_rate": 2.7645473684210525e-05, "loss": 0.3274, "mean_copy_accuracy": 0.9940639287233353, "mean_gen_accuracy": 0.8580953776836395, "mean_token_accuracy": 0.8924838453531265, "num_tokens": 704749989.0, "sample_num_tokens": 9554.25, "step": 2600, "total_num_tokens": 704788206.0, "z_loss": 0.0008071298943832517 }, { "copy_logits_max": -3.1391842365264893, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.0625, "epoch": 0.531222874648966, "gen_logits_max": 4.454522132873535, "gen_logits_mean": -13.663154602050781, "gen_logits_min": -25.440380096435547, "gen_logits_std": 2.584160804748535, "gen_loss": 0.26591432094573975, "grad_norm": 0.4300044801602912, "learning_rate": 2.764421052631579e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9939902424812317, "mean_gen_accuracy": 0.8751360923051834, "mean_token_accuracy": 0.9031065851449966, "num_tokens": 705033474.0, "sample_num_tokens": 8889.5, "step": 2601, "total_num_tokens": 705069032.0, "z_loss": 0.0007740940200164914 }, { "copy_logits_max": -5.383798599243164, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.125, "epoch": 0.5314271125861629, "gen_logits_max": 4.8135457038879395, "gen_logits_mean": -14.0499267578125, "gen_logits_min": -25.090736389160156, "gen_logits_std": 2.515361785888672, "gen_loss": 0.3217507004737854, "grad_norm": 0.4725701117773497, "learning_rate": 2.764294736842105e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9952264875173569, "mean_gen_accuracy": 0.8631875514984131, "mean_token_accuracy": 0.8951616436243057, "num_tokens": 705293642.0, "sample_num_tokens": 8067.5, "step": 2602, "total_num_tokens": 705325912.0, "z_loss": 0.0007594688795506954 }, { "copy_logits_max": -1.984354019165039, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.625, "epoch": 0.5316313505233597, "gen_logits_max": 5.7897162437438965, "gen_logits_mean": -12.25366497039795, "gen_logits_min": -24.032093048095703, "gen_logits_std": 2.5653011798858643, "gen_loss": 0.34556394815444946, "grad_norm": 0.43143838295637227, "learning_rate": 2.764168421052632e-05, "loss": 0.3426, "mean_copy_accuracy": 0.9941388815641403, "mean_gen_accuracy": 0.8523851335048676, "mean_token_accuracy": 0.8859264403581619, "num_tokens": 705548205.0, "sample_num_tokens": 8107.25, "step": 2603, "total_num_tokens": 705580634.0, "z_loss": 0.0008332048309966922 }, { "copy_logits_max": -3.257568359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.1875, "epoch": 0.5318355884605566, "gen_logits_max": 5.448932647705078, "gen_logits_mean": -12.690258979797363, "gen_logits_min": -24.217021942138672, "gen_logits_std": 2.5305092334747314, "gen_loss": 0.3463669419288635, "grad_norm": 0.4717053084271623, "learning_rate": 2.764042105263158e-05, "loss": 0.323, "mean_copy_accuracy": 0.9930711984634399, "mean_gen_accuracy": 0.8615512996912003, "mean_token_accuracy": 0.8929606974124908, "num_tokens": 705811835.0, "sample_num_tokens": 7403.25, "step": 2604, "total_num_tokens": 705841448.0, "z_loss": 0.0008292276179417968 }, { "copy_logits_max": -3.216660499572754, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.8125, "epoch": 0.5320398263977534, "gen_logits_max": 4.971027374267578, "gen_logits_mean": -13.098602294921875, "gen_logits_min": -24.882312774658203, "gen_logits_std": 2.539248466491699, "gen_loss": 0.32108256220817566, "grad_norm": 0.42791730361104063, "learning_rate": 2.7639157894736844e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9956239312887192, "mean_gen_accuracy": 0.8644275963306427, "mean_token_accuracy": 0.902938649058342, "num_tokens": 706113970.0, "sample_num_tokens": 8313.0, "step": 2605, "total_num_tokens": 706147222.0, "z_loss": 0.0007643492426723242 }, { "copy_logits_max": -4.108465671539307, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.5, "epoch": 0.5322440643349502, "gen_logits_max": 5.276098251342773, "gen_logits_mean": -12.093589782714844, "gen_logits_min": -23.651378631591797, "gen_logits_std": 2.516064167022705, "gen_loss": 0.3145752549171448, "grad_norm": 0.5152066589634163, "learning_rate": 2.7637894736842108e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9946992695331573, "mean_gen_accuracy": 0.8651034832000732, "mean_token_accuracy": 0.8980220556259155, "num_tokens": 706381559.0, "sample_num_tokens": 7796.75, "step": 2606, "total_num_tokens": 706412746.0, "z_loss": 0.0008061362896114588 }, { "copy_logits_max": -3.9745774269104004, "copy_logits_min": -687500032.0, "copy_num_tokens": 592.875, "epoch": 0.532448302272147, "gen_logits_max": 5.190505027770996, "gen_logits_mean": -13.379521369934082, "gen_logits_min": -24.984020233154297, "gen_logits_std": 2.5358190536499023, "gen_loss": 0.3661561608314514, "grad_norm": 0.45884840550557143, "learning_rate": 2.763663157894737e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9952410608530045, "mean_gen_accuracy": 0.862859770655632, "mean_token_accuracy": 0.8967425674200058, "num_tokens": 706671216.0, "sample_num_tokens": 10200.5, "step": 2607, "total_num_tokens": 706712018.0, "z_loss": 0.0008691522525623441 }, { "copy_logits_max": -4.080792427062988, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.8125, "epoch": 0.5326525402093439, "gen_logits_max": 5.048596382141113, "gen_logits_mean": -14.054550170898438, "gen_logits_min": -25.242664337158203, "gen_logits_std": 2.5120034217834473, "gen_loss": 0.31370967626571655, "grad_norm": 0.4437838078313038, "learning_rate": 2.7635368421052633e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9936054646968842, "mean_gen_accuracy": 0.8628537803888321, "mean_token_accuracy": 0.8920040130615234, "num_tokens": 706936014.0, "sample_num_tokens": 7053.0, "step": 2608, "total_num_tokens": 706964226.0, "z_loss": 0.0007138949586078525 }, { "copy_logits_max": -5.544512748718262, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.0, "epoch": 0.5328567781465408, "gen_logits_max": 4.573974132537842, "gen_logits_mean": -13.889612197875977, "gen_logits_min": -25.201234817504883, "gen_logits_std": 2.5390961170196533, "gen_loss": 0.2959928512573242, "grad_norm": 0.43201205347815963, "learning_rate": 2.7634105263157894e-05, "loss": 0.2952, "mean_copy_accuracy": 0.995199128985405, "mean_gen_accuracy": 0.8643204271793365, "mean_token_accuracy": 0.9023706763982773, "num_tokens": 707209151.0, "sample_num_tokens": 8852.75, "step": 2609, "total_num_tokens": 707244562.0, "z_loss": 0.0006718338700011373 }, { "copy_logits_max": -6.257178783416748, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.8125, "epoch": 0.5330610160837376, "gen_logits_max": 4.953376770019531, "gen_logits_mean": -13.293790817260742, "gen_logits_min": -24.002906799316406, "gen_logits_std": 2.4228103160858154, "gen_loss": 0.2903677523136139, "grad_norm": 0.4763644964093802, "learning_rate": 2.763284210526316e-05, "loss": 0.3418, "mean_copy_accuracy": 0.994149386882782, "mean_gen_accuracy": 0.8573079854249954, "mean_token_accuracy": 0.8870870023965836, "num_tokens": 707471196.0, "sample_num_tokens": 8882.0, "step": 2610, "total_num_tokens": 707506724.0, "z_loss": 0.0006320491665974259 }, { "copy_logits_max": -3.3763303756713867, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.5625, "epoch": 0.5332652540209344, "gen_logits_max": 5.28993558883667, "gen_logits_mean": -13.134857177734375, "gen_logits_min": -24.747201919555664, "gen_logits_std": 2.5436506271362305, "gen_loss": 0.33748742938041687, "grad_norm": 0.6069928486296323, "learning_rate": 2.7631578947368423e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9923475682735443, "mean_gen_accuracy": 0.8685212135314941, "mean_token_accuracy": 0.8965014815330505, "num_tokens": 707730576.0, "sample_num_tokens": 7264.5, "step": 2611, "total_num_tokens": 707759634.0, "z_loss": 0.0007807413348928094 }, { "copy_logits_max": -1.4339916706085205, "copy_logits_min": -750000000.0, "copy_num_tokens": 651.875, "epoch": 0.5334694919581312, "gen_logits_max": 4.585789203643799, "gen_logits_mean": -13.443860054016113, "gen_logits_min": -25.757078170776367, "gen_logits_std": 2.545191764831543, "gen_loss": 0.3316100537776947, "grad_norm": 0.45934270467697025, "learning_rate": 2.7630315789473687e-05, "loss": 0.3252, "mean_copy_accuracy": 0.9948116540908813, "mean_gen_accuracy": 0.8631021976470947, "mean_token_accuracy": 0.8952008932828903, "num_tokens": 708004111.0, "sample_num_tokens": 10364.75, "step": 2612, "total_num_tokens": 708045570.0, "z_loss": 0.0008071279735304415 }, { "copy_logits_max": -3.8102355003356934, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.1875, "epoch": 0.533673729895328, "gen_logits_max": 5.6561198234558105, "gen_logits_mean": -12.562904357910156, "gen_logits_min": -24.06519317626953, "gen_logits_std": 2.4833924770355225, "gen_loss": 0.33752578496932983, "grad_norm": 0.4185295961211138, "learning_rate": 2.7629052631578948e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9948456883430481, "mean_gen_accuracy": 0.8740669935941696, "mean_token_accuracy": 0.9004935771226883, "num_tokens": 708273470.0, "sample_num_tokens": 8645.5, "step": 2613, "total_num_tokens": 708308052.0, "z_loss": 0.0007793692639097571 }, { "copy_logits_max": -3.323087692260742, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.6875, "epoch": 0.533877967832525, "gen_logits_max": 4.971646308898926, "gen_logits_mean": -13.19727611541748, "gen_logits_min": -25.461162567138672, "gen_logits_std": 2.5292258262634277, "gen_loss": 0.29024988412857056, "grad_norm": 0.4393677452408679, "learning_rate": 2.7627789473684212e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9937325716018677, "mean_gen_accuracy": 0.8570516854524612, "mean_token_accuracy": 0.8898820877075195, "num_tokens": 708531878.0, "sample_num_tokens": 8203.0, "step": 2614, "total_num_tokens": 708564690.0, "z_loss": 0.0008035814971663058 }, { "copy_logits_max": -2.1190545558929443, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.625, "epoch": 0.5340822057697218, "gen_logits_max": 4.872520446777344, "gen_logits_mean": -13.951035499572754, "gen_logits_min": -26.07175064086914, "gen_logits_std": 2.5335373878479004, "gen_loss": 0.35754328966140747, "grad_norm": 0.44843436691613836, "learning_rate": 2.7626526315789473e-05, "loss": 0.3369, "mean_copy_accuracy": 0.9949977248907089, "mean_gen_accuracy": 0.8543050140142441, "mean_token_accuracy": 0.8899343609809875, "num_tokens": 708804661.0, "sample_num_tokens": 7768.25, "step": 2615, "total_num_tokens": 708835734.0, "z_loss": 0.0008658713195472956 }, { "copy_logits_max": -4.550144195556641, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.3125, "epoch": 0.5342864437069186, "gen_logits_max": 5.188321590423584, "gen_logits_mean": -13.50216293334961, "gen_logits_min": -24.538711547851562, "gen_logits_std": 2.472980499267578, "gen_loss": 0.32652896642684937, "grad_norm": 0.44853645536832615, "learning_rate": 2.7625263157894738e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9938211292028427, "mean_gen_accuracy": 0.8644428849220276, "mean_token_accuracy": 0.8949789106845856, "num_tokens": 709062701.0, "sample_num_tokens": 7553.25, "step": 2616, "total_num_tokens": 709092914.0, "z_loss": 0.0007599623640999198 }, { "copy_logits_max": -5.04837703704834, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.6875, "epoch": 0.5344906816441154, "gen_logits_max": 4.411945343017578, "gen_logits_mean": -15.18068790435791, "gen_logits_min": -26.626054763793945, "gen_logits_std": 2.5398454666137695, "gen_loss": 0.3333425223827362, "grad_norm": 0.4123461517365413, "learning_rate": 2.7624e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9943906664848328, "mean_gen_accuracy": 0.8665237873792648, "mean_token_accuracy": 0.8928933590650558, "num_tokens": 709324400.0, "sample_num_tokens": 8081.0, "step": 2617, "total_num_tokens": 709356724.0, "z_loss": 0.0006992387352511287 }, { "copy_logits_max": -5.68748664855957, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.25, "epoch": 0.5346949195813122, "gen_logits_max": 4.763426780700684, "gen_logits_mean": -13.486512184143066, "gen_logits_min": -24.926300048828125, "gen_logits_std": 2.460662364959717, "gen_loss": 0.32077544927597046, "grad_norm": 0.46610135690931964, "learning_rate": 2.7622736842105263e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9951232373714447, "mean_gen_accuracy": 0.8636294603347778, "mean_token_accuracy": 0.8953963071107864, "num_tokens": 709575500.0, "sample_num_tokens": 7934.5, "step": 2618, "total_num_tokens": 709607238.0, "z_loss": 0.0007544982945546508 }, { "copy_logits_max": -4.174299240112305, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.4375, "epoch": 0.534899157518509, "gen_logits_max": 4.930728912353516, "gen_logits_mean": -13.515671730041504, "gen_logits_min": -24.926015853881836, "gen_logits_std": 2.5251224040985107, "gen_loss": 0.3019446134567261, "grad_norm": 0.4104280078494259, "learning_rate": 2.7621473684210527e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9937642961740494, "mean_gen_accuracy": 0.8649076372385025, "mean_token_accuracy": 0.8985816985368729, "num_tokens": 709868423.0, "sample_num_tokens": 6614.25, "step": 2619, "total_num_tokens": 709894880.0, "z_loss": 0.0007886816747486591 }, { "copy_logits_max": -5.175705909729004, "copy_logits_min": -687500032.0, "copy_num_tokens": 204.375, "epoch": 0.5351033954557058, "gen_logits_max": 5.676405429840088, "gen_logits_mean": -13.107061386108398, "gen_logits_min": -24.34503936767578, "gen_logits_std": 2.467770576477051, "gen_loss": 0.2985864579677582, "grad_norm": 0.44312380453277406, "learning_rate": 2.762021052631579e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9936380237340927, "mean_gen_accuracy": 0.8657345771789551, "mean_token_accuracy": 0.8924272209405899, "num_tokens": 710117615.0, "sample_num_tokens": 7107.75, "step": 2620, "total_num_tokens": 710146046.0, "z_loss": 0.0007939045317471027 }, { "copy_logits_max": -4.002493858337402, "copy_logits_min": -750000000.0, "copy_num_tokens": 719.125, "epoch": 0.5353076333929028, "gen_logits_max": 4.308646202087402, "gen_logits_mean": -13.392753601074219, "gen_logits_min": -24.910226821899414, "gen_logits_std": 2.514707565307617, "gen_loss": 0.2572248578071594, "grad_norm": 0.4320415345336371, "learning_rate": 2.7618947368421056e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9938418716192245, "mean_gen_accuracy": 0.8670198619365692, "mean_token_accuracy": 0.8953652530908585, "num_tokens": 710383952.0, "sample_num_tokens": 10496.5, "step": 2621, "total_num_tokens": 710425938.0, "z_loss": 0.000885755056515336 }, { "copy_logits_max": -2.580385684967041, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.9375, "epoch": 0.5355118713300996, "gen_logits_max": 4.31614875793457, "gen_logits_mean": -12.805795669555664, "gen_logits_min": -23.754329681396484, "gen_logits_std": 2.466783046722412, "gen_loss": 0.3212183713912964, "grad_norm": 0.4509162991749292, "learning_rate": 2.7617684210526317e-05, "loss": 0.3436, "mean_copy_accuracy": 0.9931717365980148, "mean_gen_accuracy": 0.8560929298400879, "mean_token_accuracy": 0.8889510333538055, "num_tokens": 710646836.0, "sample_num_tokens": 8443.5, "step": 2622, "total_num_tokens": 710680610.0, "z_loss": 0.0009602836798876524 }, { "copy_logits_max": -2.2431538105010986, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.5625, "epoch": 0.5357161092672964, "gen_logits_max": 4.627092361450195, "gen_logits_mean": -13.977264404296875, "gen_logits_min": -25.995126724243164, "gen_logits_std": 2.562997817993164, "gen_loss": 0.2742688059806824, "grad_norm": 0.39965832045112815, "learning_rate": 2.761642105263158e-05, "loss": 0.298, "mean_copy_accuracy": 0.9941229522228241, "mean_gen_accuracy": 0.8677687644958496, "mean_token_accuracy": 0.9013969898223877, "num_tokens": 710922215.0, "sample_num_tokens": 9334.25, "step": 2623, "total_num_tokens": 710959552.0, "z_loss": 0.000878841383382678 }, { "copy_logits_max": -1.878371238708496, "copy_logits_min": -750000064.0, "copy_num_tokens": 558.125, "epoch": 0.5359203472044932, "gen_logits_max": 3.9856224060058594, "gen_logits_mean": -14.555455207824707, "gen_logits_min": -26.022945404052734, "gen_logits_std": 2.5318474769592285, "gen_loss": 0.28700438141822815, "grad_norm": 0.44505080243961725, "learning_rate": 2.7615157894736842e-05, "loss": 0.3239, "mean_copy_accuracy": 0.9936007261276245, "mean_gen_accuracy": 0.8595404922962189, "mean_token_accuracy": 0.8916953355073929, "num_tokens": 711203272.0, "sample_num_tokens": 9390.0, "step": 2624, "total_num_tokens": 711240832.0, "z_loss": 0.0008830031147226691 }, { "copy_logits_max": -4.116552352905273, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.3125, "epoch": 0.53612458514169, "gen_logits_max": 4.86773681640625, "gen_logits_mean": -12.89006519317627, "gen_logits_min": -24.274703979492188, "gen_logits_std": 2.47346830368042, "gen_loss": 0.3575689196586609, "grad_norm": 0.43863834420993036, "learning_rate": 2.7613894736842106e-05, "loss": 0.3222, "mean_copy_accuracy": 0.993860200047493, "mean_gen_accuracy": 0.8641572147607803, "mean_token_accuracy": 0.8969233483076096, "num_tokens": 711478755.0, "sample_num_tokens": 7775.75, "step": 2625, "total_num_tokens": 711509858.0, "z_loss": 0.0008598928106948733 }, { "copy_logits_max": -2.372633457183838, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.125, "epoch": 0.5363288230788869, "gen_logits_max": 4.6414995193481445, "gen_logits_mean": -13.893216133117676, "gen_logits_min": -25.66318702697754, "gen_logits_std": 2.5383191108703613, "gen_loss": 0.3583354353904724, "grad_norm": 0.439740915111941, "learning_rate": 2.7612631578947367e-05, "loss": 0.3255, "mean_copy_accuracy": 0.9936366975307465, "mean_gen_accuracy": 0.8581807762384415, "mean_token_accuracy": 0.8931994885206223, "num_tokens": 711749052.0, "sample_num_tokens": 8591.0, "step": 2626, "total_num_tokens": 711783416.0, "z_loss": 0.0008197256829589605 }, { "copy_logits_max": -3.5097384452819824, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.0, "epoch": 0.5365330610160838, "gen_logits_max": 4.4021806716918945, "gen_logits_mean": -13.690359115600586, "gen_logits_min": -25.096328735351562, "gen_logits_std": 2.5103893280029297, "gen_loss": 0.3124827742576599, "grad_norm": 0.47323358819203326, "learning_rate": 2.761136842105263e-05, "loss": 0.3455, "mean_copy_accuracy": 0.9936425536870956, "mean_gen_accuracy": 0.8481390029191971, "mean_token_accuracy": 0.8852765411138535, "num_tokens": 712014309.0, "sample_num_tokens": 8790.25, "step": 2627, "total_num_tokens": 712049470.0, "z_loss": 0.0007780144223943353 }, { "copy_logits_max": -6.387277126312256, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.25, "epoch": 0.5367372989532806, "gen_logits_max": 4.619162559509277, "gen_logits_mean": -14.925846099853516, "gen_logits_min": -26.298006057739258, "gen_logits_std": 2.509495973587036, "gen_loss": 0.32997551560401917, "grad_norm": 0.5281576711882352, "learning_rate": 2.7610105263157896e-05, "loss": 0.3343, "mean_copy_accuracy": 0.9930640757083893, "mean_gen_accuracy": 0.8589638024568558, "mean_token_accuracy": 0.8912435621023178, "num_tokens": 712302292.0, "sample_num_tokens": 8036.0, "step": 2628, "total_num_tokens": 712334436.0, "z_loss": 0.0007100962684489787 }, { "copy_logits_max": -3.837876796722412, "copy_logits_min": -687500032.0, "copy_num_tokens": 561.5, "epoch": 0.5369415368904774, "gen_logits_max": 3.563632011413574, "gen_logits_mean": -15.504389762878418, "gen_logits_min": -26.624576568603516, "gen_logits_std": 2.498100996017456, "gen_loss": 0.2892833650112152, "grad_norm": 0.4557330608638041, "learning_rate": 2.760884210526316e-05, "loss": 0.3006, "mean_copy_accuracy": 0.994050532579422, "mean_gen_accuracy": 0.8706880211830139, "mean_token_accuracy": 0.9002823531627655, "num_tokens": 712562792.0, "sample_num_tokens": 8880.5, "step": 2629, "total_num_tokens": 712598314.0, "z_loss": 0.0006943610496819019 }, { "copy_logits_max": -3.8626549243927, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.6875, "epoch": 0.5371457748276742, "gen_logits_max": 3.849106550216675, "gen_logits_mean": -14.064523696899414, "gen_logits_min": -26.004730224609375, "gen_logits_std": 2.5224931240081787, "gen_loss": 0.2802531123161316, "grad_norm": 0.44945879335927624, "learning_rate": 2.760757894736842e-05, "loss": 0.3222, "mean_copy_accuracy": 0.9945450872182846, "mean_gen_accuracy": 0.8660727590322495, "mean_token_accuracy": 0.8930987417697906, "num_tokens": 712838067.0, "sample_num_tokens": 9859.75, "step": 2630, "total_num_tokens": 712877506.0, "z_loss": 0.0006904728361405432 }, { "copy_logits_max": -4.588080406188965, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.5, "epoch": 0.537350012764871, "gen_logits_max": 5.075745582580566, "gen_logits_mean": -13.778888702392578, "gen_logits_min": -25.106857299804688, "gen_logits_std": 2.4976165294647217, "gen_loss": 0.3639630377292633, "grad_norm": 0.42260270623724167, "learning_rate": 2.7606315789473685e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9948169440031052, "mean_gen_accuracy": 0.8643274754285812, "mean_token_accuracy": 0.8948225826025009, "num_tokens": 713105492.0, "sample_num_tokens": 7896.0, "step": 2631, "total_num_tokens": 713137076.0, "z_loss": 0.0008002695976756513 }, { "copy_logits_max": -5.016467094421387, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.4375, "epoch": 0.5375542507020679, "gen_logits_max": 5.317319869995117, "gen_logits_mean": -12.638559341430664, "gen_logits_min": -24.890357971191406, "gen_logits_std": 2.4938344955444336, "gen_loss": 0.3517734408378601, "grad_norm": 0.5584439596249634, "learning_rate": 2.7605052631578946e-05, "loss": 0.35, "mean_copy_accuracy": 0.9917935878038406, "mean_gen_accuracy": 0.8541398346424103, "mean_token_accuracy": 0.8848390728235245, "num_tokens": 713368807.0, "sample_num_tokens": 7239.25, "step": 2632, "total_num_tokens": 713397764.0, "z_loss": 0.0007186056463979185 }, { "copy_logits_max": -1.5924527645111084, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.75, "epoch": 0.5377584886392648, "gen_logits_max": 4.266103744506836, "gen_logits_mean": -13.670938491821289, "gen_logits_min": -25.66010284423828, "gen_logits_std": 2.58750581741333, "gen_loss": 0.3193471133708954, "grad_norm": 0.4609890490654574, "learning_rate": 2.760378947368421e-05, "loss": 0.3549, "mean_copy_accuracy": 0.9935801476240158, "mean_gen_accuracy": 0.853301391005516, "mean_token_accuracy": 0.8845421373844147, "num_tokens": 713617265.0, "sample_num_tokens": 7979.25, "step": 2633, "total_num_tokens": 713649182.0, "z_loss": 0.000744320685043931 }, { "copy_logits_max": -2.0635063648223877, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.5, "epoch": 0.5379627265764616, "gen_logits_max": 5.400033950805664, "gen_logits_mean": -12.463004112243652, "gen_logits_min": -24.33325958251953, "gen_logits_std": 2.5662643909454346, "gen_loss": 0.2983022630214691, "grad_norm": 0.45423707192921225, "learning_rate": 2.7602526315789475e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9930223971605301, "mean_gen_accuracy": 0.8637319207191467, "mean_token_accuracy": 0.8940462023019791, "num_tokens": 713874270.0, "sample_num_tokens": 8378.0, "step": 2634, "total_num_tokens": 713907782.0, "z_loss": 0.0008064925204962492 }, { "copy_logits_max": -2.011249542236328, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.625, "epoch": 0.5381669645136584, "gen_logits_max": 4.319560527801514, "gen_logits_mean": -13.665342330932617, "gen_logits_min": -25.34435272216797, "gen_logits_std": 2.5189435482025146, "gen_loss": 0.2699146270751953, "grad_norm": 0.7988260226808566, "learning_rate": 2.7601263157894736e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9957364350557327, "mean_gen_accuracy": 0.8631164282560349, "mean_token_accuracy": 0.9041258841753006, "num_tokens": 714165507.0, "sample_num_tokens": 8036.25, "step": 2635, "total_num_tokens": 714197652.0, "z_loss": 0.0007708844495937228 }, { "copy_logits_max": -1.2270711660385132, "copy_logits_min": -750000064.0, "copy_num_tokens": 519.3125, "epoch": 0.5383712024508552, "gen_logits_max": 4.309948921203613, "gen_logits_mean": -14.302074432373047, "gen_logits_min": -26.63558006286621, "gen_logits_std": 2.5473787784576416, "gen_loss": 0.30714911222457886, "grad_norm": 0.3950054069373574, "learning_rate": 2.7600000000000003e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9948171526193619, "mean_gen_accuracy": 0.8640255630016327, "mean_token_accuracy": 0.8983563631772995, "num_tokens": 714437580.0, "sample_num_tokens": 8315.5, "step": 2636, "total_num_tokens": 714470842.0, "z_loss": 0.0008553696097806096 }, { "copy_logits_max": -0.6518120765686035, "copy_logits_min": -750000000.0, "copy_num_tokens": 719.125, "epoch": 0.538575440388052, "gen_logits_max": 5.232664585113525, "gen_logits_mean": -11.553651809692383, "gen_logits_min": -24.371231079101562, "gen_logits_std": 2.5444746017456055, "gen_loss": 0.30672913789749146, "grad_norm": 0.41618879189344826, "learning_rate": 2.7598736842105264e-05, "loss": 0.3355, "mean_copy_accuracy": 0.9919788092374802, "mean_gen_accuracy": 0.8597564399242401, "mean_token_accuracy": 0.8889579623937607, "num_tokens": 714711199.0, "sample_num_tokens": 10600.25, "step": 2637, "total_num_tokens": 714753600.0, "z_loss": 0.0009350499603897333 }, { "copy_logits_max": -2.5884480476379395, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.0, "epoch": 0.5387796783252489, "gen_logits_max": 4.806802749633789, "gen_logits_mean": -14.112308502197266, "gen_logits_min": -26.171142578125, "gen_logits_std": 2.539745807647705, "gen_loss": 0.3019307851791382, "grad_norm": 0.4192343720501331, "learning_rate": 2.759747368421053e-05, "loss": 0.3019, "mean_copy_accuracy": 0.994164451956749, "mean_gen_accuracy": 0.8660228848457336, "mean_token_accuracy": 0.8992699533700943, "num_tokens": 714990490.0, "sample_num_tokens": 8791.0, "step": 2638, "total_num_tokens": 715025654.0, "z_loss": 0.0008492124616168439 }, { "copy_logits_max": -2.755542278289795, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.1875, "epoch": 0.5389839162624458, "gen_logits_max": 6.841944694519043, "gen_logits_mean": -12.682413101196289, "gen_logits_min": -25.413063049316406, "gen_logits_std": 2.5868661403656006, "gen_loss": 0.3845604658126831, "grad_norm": 0.4681654755833121, "learning_rate": 2.759621052631579e-05, "loss": 0.333, "mean_copy_accuracy": 0.9939086437225342, "mean_gen_accuracy": 0.8574840426445007, "mean_token_accuracy": 0.8920162916183472, "num_tokens": 715267560.0, "sample_num_tokens": 7615.5, "step": 2639, "total_num_tokens": 715298022.0, "z_loss": 0.0010592112084850669 }, { "copy_logits_max": -2.214224338531494, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.8125, "epoch": 0.5391881541996426, "gen_logits_max": 5.414619445800781, "gen_logits_mean": -13.083159446716309, "gen_logits_min": -25.190500259399414, "gen_logits_std": 2.545304775238037, "gen_loss": 0.3113101124763489, "grad_norm": 0.47564428808499054, "learning_rate": 2.7594947368421054e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9955039918422699, "mean_gen_accuracy": 0.8602242022752762, "mean_token_accuracy": 0.8963483572006226, "num_tokens": 715537757.0, "sample_num_tokens": 9063.75, "step": 2640, "total_num_tokens": 715574012.0, "z_loss": 0.0008717447053641081 }, { "copy_logits_max": -4.852889060974121, "copy_logits_min": -750000064.0, "copy_num_tokens": 398.9375, "epoch": 0.5393923921368394, "gen_logits_max": 5.113438606262207, "gen_logits_mean": -13.773042678833008, "gen_logits_min": -25.598731994628906, "gen_logits_std": 2.553262710571289, "gen_loss": 0.3055194914340973, "grad_norm": 0.49201913724012564, "learning_rate": 2.7593684210526315e-05, "loss": 0.3255, "mean_copy_accuracy": 0.9936144649982452, "mean_gen_accuracy": 0.862797424197197, "mean_token_accuracy": 0.8927467465400696, "num_tokens": 715813059.0, "sample_num_tokens": 7976.25, "step": 2641, "total_num_tokens": 715844964.0, "z_loss": 0.0007425091462209821 }, { "copy_logits_max": -3.249800682067871, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.5, "epoch": 0.5395966300740362, "gen_logits_max": 5.542827606201172, "gen_logits_mean": -12.960159301757812, "gen_logits_min": -25.341388702392578, "gen_logits_std": 2.542653799057007, "gen_loss": 0.36776003241539, "grad_norm": 0.38031339513382956, "learning_rate": 2.759242105263158e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9949949085712433, "mean_gen_accuracy": 0.8690800815820694, "mean_token_accuracy": 0.9038920849561691, "num_tokens": 716102844.0, "sample_num_tokens": 9534.0, "step": 2642, "total_num_tokens": 716140980.0, "z_loss": 0.0009097873698920012 }, { "copy_logits_max": -2.1306400299072266, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.875, "epoch": 0.5398008680112331, "gen_logits_max": 5.336906433105469, "gen_logits_mean": -12.818737030029297, "gen_logits_min": -25.3564453125, "gen_logits_std": 2.5290515422821045, "gen_loss": 0.34143149852752686, "grad_norm": 0.44356658356446177, "learning_rate": 2.759115789473684e-05, "loss": 0.3256, "mean_copy_accuracy": 0.9958901852369308, "mean_gen_accuracy": 0.8566096425056458, "mean_token_accuracy": 0.8929642587900162, "num_tokens": 716388357.0, "sample_num_tokens": 7743.25, "step": 2643, "total_num_tokens": 716419330.0, "z_loss": 0.0008582444279454648 }, { "copy_logits_max": -3.966092109680176, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.375, "epoch": 0.5400051059484299, "gen_logits_max": 5.914316177368164, "gen_logits_mean": -13.15894889831543, "gen_logits_min": -25.379886627197266, "gen_logits_std": 2.5138673782348633, "gen_loss": 0.3675965368747711, "grad_norm": 0.457747089713289, "learning_rate": 2.7589894736842108e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9937300384044647, "mean_gen_accuracy": 0.8532137274742126, "mean_token_accuracy": 0.8905126303434372, "num_tokens": 716662051.0, "sample_num_tokens": 8009.75, "step": 2644, "total_num_tokens": 716694090.0, "z_loss": 0.0009024455794133246 }, { "copy_logits_max": -0.7770085334777832, "copy_logits_min": -625000064.0, "copy_num_tokens": 714.375, "epoch": 0.5402093438856268, "gen_logits_max": 4.620083808898926, "gen_logits_mean": -12.735285758972168, "gen_logits_min": -25.563129425048828, "gen_logits_std": 2.5928707122802734, "gen_loss": 0.3020051121711731, "grad_norm": 0.417326590756409, "learning_rate": 2.758863157894737e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9951259791851044, "mean_gen_accuracy": 0.8563207536935806, "mean_token_accuracy": 0.8925849348306656, "num_tokens": 716944746.0, "sample_num_tokens": 9624.5, "step": 2645, "total_num_tokens": 716983244.0, "z_loss": 0.0008799700299277902 }, { "copy_logits_max": -2.833258867263794, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.9375, "epoch": 0.5404135818228236, "gen_logits_max": 4.796238422393799, "gen_logits_mean": -12.868242263793945, "gen_logits_min": -25.188968658447266, "gen_logits_std": 2.5206146240234375, "gen_loss": 0.30662697553634644, "grad_norm": 0.43991271811467636, "learning_rate": 2.7587368421052633e-05, "loss": 0.3154, "mean_copy_accuracy": 0.9951290190219879, "mean_gen_accuracy": 0.8639657497406006, "mean_token_accuracy": 0.8970963060855865, "num_tokens": 717228519.0, "sample_num_tokens": 8655.75, "step": 2646, "total_num_tokens": 717263142.0, "z_loss": 0.0007554632029496133 }, { "copy_logits_max": -1.8205870389938354, "copy_logits_min": -750000000.0, "copy_num_tokens": 771.625, "epoch": 0.5406178197600204, "gen_logits_max": 4.2990522384643555, "gen_logits_mean": -14.003010749816895, "gen_logits_min": -25.77835464477539, "gen_logits_std": 2.548330545425415, "gen_loss": 0.2943921685218811, "grad_norm": 0.42354844756931115, "learning_rate": 2.7586105263157897e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9941556751728058, "mean_gen_accuracy": 0.8603485524654388, "mean_token_accuracy": 0.8950323015451431, "num_tokens": 717497661.0, "sample_num_tokens": 9918.75, "step": 2647, "total_num_tokens": 717537336.0, "z_loss": 0.0007904074154794216 }, { "copy_logits_max": -3.2684569358825684, "copy_logits_min": -687500032.0, "copy_num_tokens": 614.0, "epoch": 0.5408220576972173, "gen_logits_max": 4.2788987159729, "gen_logits_mean": -14.491198539733887, "gen_logits_min": -26.144075393676758, "gen_logits_std": 2.5540380477905273, "gen_loss": 0.2839090824127197, "grad_norm": 0.4228091932768151, "learning_rate": 2.7584842105263158e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9950495064258575, "mean_gen_accuracy": 0.8650384396314621, "mean_token_accuracy": 0.8972783386707306, "num_tokens": 717777935.0, "sample_num_tokens": 9412.75, "step": 2648, "total_num_tokens": 717815586.0, "z_loss": 0.0008003377006389201 }, { "copy_logits_max": -4.45029878616333, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.4375, "epoch": 0.5410262956344141, "gen_logits_max": 4.724972724914551, "gen_logits_mean": -13.99612808227539, "gen_logits_min": -26.53260612487793, "gen_logits_std": 2.553567886352539, "gen_loss": 0.313262939453125, "grad_norm": 0.43763642210925036, "learning_rate": 2.7583578947368423e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9939188957214355, "mean_gen_accuracy": 0.8657821714878082, "mean_token_accuracy": 0.8958642482757568, "num_tokens": 718036802.0, "sample_num_tokens": 8834.5, "step": 2649, "total_num_tokens": 718072140.0, "z_loss": 0.0008452274487353861 }, { "copy_logits_max": -2.950002670288086, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.1875, "epoch": 0.5412305335716109, "gen_logits_max": 4.866291046142578, "gen_logits_mean": -13.481064796447754, "gen_logits_min": -25.668773651123047, "gen_logits_std": 2.5240941047668457, "gen_loss": 0.3346782624721527, "grad_norm": 0.4286323951916207, "learning_rate": 2.7582315789473683e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9936982989311218, "mean_gen_accuracy": 0.8587938845157623, "mean_token_accuracy": 0.8920146226882935, "num_tokens": 718321970.0, "sample_num_tokens": 9339.0, "step": 2650, "total_num_tokens": 718359326.0, "z_loss": 0.0009138498571701348 }, { "copy_logits_max": -4.58390474319458, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.25, "epoch": 0.5414347715088078, "gen_logits_max": 5.713387966156006, "gen_logits_mean": -12.855228424072266, "gen_logits_min": -24.802871704101562, "gen_logits_std": 2.5677413940429688, "gen_loss": 0.32452988624572754, "grad_norm": 0.5937200176614343, "learning_rate": 2.7581052631578948e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9943975210189819, "mean_gen_accuracy": 0.8629572540521622, "mean_token_accuracy": 0.897045448422432, "num_tokens": 718612452.0, "sample_num_tokens": 6890.5, "step": 2651, "total_num_tokens": 718640014.0, "z_loss": 0.000893814954906702 }, { "copy_logits_max": -1.7714259624481201, "copy_logits_min": -687500032.0, "copy_num_tokens": 658.1875, "epoch": 0.5416390094460046, "gen_logits_max": 3.6073670387268066, "gen_logits_mean": -14.05449390411377, "gen_logits_min": -26.238990783691406, "gen_logits_std": 2.568362236022949, "gen_loss": 0.2537522614002228, "grad_norm": 0.4021310308960207, "learning_rate": 2.7579789473684212e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9955917596817017, "mean_gen_accuracy": 0.8712781369686127, "mean_token_accuracy": 0.908605232834816, "num_tokens": 718882202.0, "sample_num_tokens": 8801.5, "step": 2652, "total_num_tokens": 718917408.0, "z_loss": 0.0007323700119741261 }, { "copy_logits_max": -3.207589626312256, "copy_logits_min": -750000000.0, "copy_num_tokens": 836.1875, "epoch": 0.5418432473832014, "gen_logits_max": 4.261560440063477, "gen_logits_mean": -12.945415496826172, "gen_logits_min": -25.538421630859375, "gen_logits_std": 2.570307970046997, "gen_loss": 0.27462711930274963, "grad_norm": 0.39121253621082913, "learning_rate": 2.7578526315789476e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9962492734193802, "mean_gen_accuracy": 0.8634049594402313, "mean_token_accuracy": 0.9034165143966675, "num_tokens": 719190757.0, "sample_num_tokens": 11416.75, "step": 2653, "total_num_tokens": 719236424.0, "z_loss": 0.0008217371068894863 }, { "copy_logits_max": -4.61176872253418, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.9375, "epoch": 0.5420474853203983, "gen_logits_max": 5.5808258056640625, "gen_logits_mean": -12.228156089782715, "gen_logits_min": -24.520959854125977, "gen_logits_std": 2.573636770248413, "gen_loss": 0.3269730806350708, "grad_norm": 0.4320274600890261, "learning_rate": 2.7577263157894737e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9941421002149582, "mean_gen_accuracy": 0.8637270331382751, "mean_token_accuracy": 0.8956668674945831, "num_tokens": 719467392.0, "sample_num_tokens": 8214.5, "step": 2654, "total_num_tokens": 719500250.0, "z_loss": 0.0007625161670148373 }, { "copy_logits_max": -4.476963043212891, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.3125, "epoch": 0.5422517232575951, "gen_logits_max": 4.92765998840332, "gen_logits_mean": -13.682547569274902, "gen_logits_min": -25.561294555664062, "gen_logits_std": 2.5190186500549316, "gen_loss": 0.2928272485733032, "grad_norm": 0.4396235957401461, "learning_rate": 2.7576e-05, "loss": 0.3354, "mean_copy_accuracy": 0.9934228807687759, "mean_gen_accuracy": 0.8623626381158829, "mean_token_accuracy": 0.8859893381595612, "num_tokens": 719725266.0, "sample_num_tokens": 7969.0, "step": 2655, "total_num_tokens": 719757142.0, "z_loss": 0.0007294196984730661 }, { "copy_logits_max": -2.2307419776916504, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.75, "epoch": 0.5424559611947919, "gen_logits_max": 5.030777454376221, "gen_logits_mean": -13.019865036010742, "gen_logits_min": -24.694965362548828, "gen_logits_std": 2.5465822219848633, "gen_loss": 0.37352418899536133, "grad_norm": 0.430886594965363, "learning_rate": 2.7574736842105263e-05, "loss": 0.3198, "mean_copy_accuracy": 0.9943259060382843, "mean_gen_accuracy": 0.8587074130773544, "mean_token_accuracy": 0.8942368179559708, "num_tokens": 719983805.0, "sample_num_tokens": 9058.25, "step": 2656, "total_num_tokens": 720020038.0, "z_loss": 0.0008862455142661929 }, { "copy_logits_max": -3.2152256965637207, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.4375, "epoch": 0.5426601991319888, "gen_logits_max": 5.158099174499512, "gen_logits_mean": -13.300333023071289, "gen_logits_min": -24.759130477905273, "gen_logits_std": 2.5280814170837402, "gen_loss": 0.32940101623535156, "grad_norm": 0.460857991636697, "learning_rate": 2.7573473684210527e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9928234219551086, "mean_gen_accuracy": 0.868746355175972, "mean_token_accuracy": 0.8996381759643555, "num_tokens": 720269937.0, "sample_num_tokens": 8413.25, "step": 2657, "total_num_tokens": 720303590.0, "z_loss": 0.0007005234947428107 }, { "copy_logits_max": -4.565969467163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.0, "epoch": 0.5428644370691856, "gen_logits_max": 5.248234748840332, "gen_logits_mean": -13.39967155456543, "gen_logits_min": -25.062854766845703, "gen_logits_std": 2.5143418312072754, "gen_loss": 0.3047581911087036, "grad_norm": 0.49483184776180333, "learning_rate": 2.7572210526315788e-05, "loss": 0.3253, "mean_copy_accuracy": 0.992786630988121, "mean_gen_accuracy": 0.8657228052616119, "mean_token_accuracy": 0.8930360525846481, "num_tokens": 720542814.0, "sample_num_tokens": 7970.0, "step": 2658, "total_num_tokens": 720574694.0, "z_loss": 0.0006605781964026392 }, { "copy_logits_max": -4.317801475524902, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.4375, "epoch": 0.5430686750063825, "gen_logits_max": 4.791126728057861, "gen_logits_mean": -13.649340629577637, "gen_logits_min": -25.192893981933594, "gen_logits_std": 2.5319414138793945, "gen_loss": 0.34652531147003174, "grad_norm": 0.41957892991372414, "learning_rate": 2.7570947368421052e-05, "loss": 0.3453, "mean_copy_accuracy": 0.9948215633630753, "mean_gen_accuracy": 0.8549883216619492, "mean_token_accuracy": 0.884976327419281, "num_tokens": 720802543.0, "sample_num_tokens": 9489.25, "step": 2659, "total_num_tokens": 720840500.0, "z_loss": 0.0007644861470907927 }, { "copy_logits_max": -3.399174213409424, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.5, "epoch": 0.5432729129435793, "gen_logits_max": 4.481436729431152, "gen_logits_mean": -14.002216339111328, "gen_logits_min": -26.31260871887207, "gen_logits_std": 2.5639517307281494, "gen_loss": 0.2919992208480835, "grad_norm": 0.44581855490327993, "learning_rate": 2.7569684210526316e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9947254955768585, "mean_gen_accuracy": 0.8636258542537689, "mean_token_accuracy": 0.8997431397438049, "num_tokens": 721093294.0, "sample_num_tokens": 8988.0, "step": 2660, "total_num_tokens": 721129246.0, "z_loss": 0.0007016889285296202 }, { "copy_logits_max": -3.5444602966308594, "copy_logits_min": -687500032.0, "copy_num_tokens": 372.0625, "epoch": 0.5434771508807761, "gen_logits_max": 5.1900529861450195, "gen_logits_mean": -12.86095905303955, "gen_logits_min": -24.992061614990234, "gen_logits_std": 2.568699359893799, "gen_loss": 0.33797693252563477, "grad_norm": 0.446190937485182, "learning_rate": 2.756842105263158e-05, "loss": 0.3382, "mean_copy_accuracy": 0.9943665564060211, "mean_gen_accuracy": 0.8566510826349258, "mean_token_accuracy": 0.8895731866359711, "num_tokens": 721359387.0, "sample_num_tokens": 7719.75, "step": 2661, "total_num_tokens": 721390266.0, "z_loss": 0.0007813930278643966 }, { "copy_logits_max": -3.994011878967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.4375, "epoch": 0.5436813888179729, "gen_logits_max": 5.053281307220459, "gen_logits_mean": -12.285074234008789, "gen_logits_min": -23.89335060119629, "gen_logits_std": 2.5756022930145264, "gen_loss": 0.3120080828666687, "grad_norm": 0.43505580187509335, "learning_rate": 2.7567157894736845e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9946995228528976, "mean_gen_accuracy": 0.8693471103906631, "mean_token_accuracy": 0.9022956490516663, "num_tokens": 721636368.0, "sample_num_tokens": 7827.5, "step": 2662, "total_num_tokens": 721667678.0, "z_loss": 0.0007567150751128793 }, { "copy_logits_max": -2.839906692504883, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.125, "epoch": 0.5438856267551698, "gen_logits_max": 5.217563152313232, "gen_logits_mean": -12.985821723937988, "gen_logits_min": -24.265432357788086, "gen_logits_std": 2.54774808883667, "gen_loss": 0.30103057622909546, "grad_norm": 0.4673254977143682, "learning_rate": 2.7565894736842106e-05, "loss": 0.3448, "mean_copy_accuracy": 0.9941889941692352, "mean_gen_accuracy": 0.85413758456707, "mean_token_accuracy": 0.8861981928348541, "num_tokens": 721906039.0, "sample_num_tokens": 9180.25, "step": 2663, "total_num_tokens": 721942760.0, "z_loss": 0.0007464869413524866 }, { "copy_logits_max": -3.087430477142334, "copy_logits_min": -750000064.0, "copy_num_tokens": 513.4375, "epoch": 0.5440898646923666, "gen_logits_max": 4.5190839767456055, "gen_logits_mean": -14.632813453674316, "gen_logits_min": -26.188213348388672, "gen_logits_std": 2.5517187118530273, "gen_loss": 0.2957663834095001, "grad_norm": 0.4234960107283184, "learning_rate": 2.756463157894737e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9950608313083649, "mean_gen_accuracy": 0.8682619333267212, "mean_token_accuracy": 0.8957893550395966, "num_tokens": 722175253.0, "sample_num_tokens": 8803.25, "step": 2664, "total_num_tokens": 722210466.0, "z_loss": 0.0007723314338363707 }, { "copy_logits_max": -2.1518077850341797, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.625, "epoch": 0.5442941026295635, "gen_logits_max": 5.889613628387451, "gen_logits_mean": -12.860223770141602, "gen_logits_min": -25.048004150390625, "gen_logits_std": 2.578467845916748, "gen_loss": 0.33745306730270386, "grad_norm": 0.43523490739940074, "learning_rate": 2.756336842105263e-05, "loss": 0.318, "mean_copy_accuracy": 0.993674173951149, "mean_gen_accuracy": 0.8648951500654221, "mean_token_accuracy": 0.8954751193523407, "num_tokens": 722459251.0, "sample_num_tokens": 8589.25, "step": 2665, "total_num_tokens": 722493608.0, "z_loss": 0.0008754611480981112 }, { "copy_logits_max": -4.118189334869385, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.875, "epoch": 0.5444983405667603, "gen_logits_max": 4.646230697631836, "gen_logits_mean": -14.197324752807617, "gen_logits_min": -25.95224380493164, "gen_logits_std": 2.536252737045288, "gen_loss": 0.3474881649017334, "grad_norm": 0.4697528224889452, "learning_rate": 2.7562105263157895e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9924958050251007, "mean_gen_accuracy": 0.8659690767526627, "mean_token_accuracy": 0.8979979008436203, "num_tokens": 722748680.0, "sample_num_tokens": 9361.5, "step": 2666, "total_num_tokens": 722786126.0, "z_loss": 0.0007077964837662876 }, { "copy_logits_max": -3.8391666412353516, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.375, "epoch": 0.5447025785039571, "gen_logits_max": 5.657045364379883, "gen_logits_mean": -12.4316987991333, "gen_logits_min": -24.285980224609375, "gen_logits_std": 2.5894570350646973, "gen_loss": 0.32248449325561523, "grad_norm": 0.48356239256132794, "learning_rate": 2.7560842105263156e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9943974763154984, "mean_gen_accuracy": 0.864000141620636, "mean_token_accuracy": 0.8956788331270218, "num_tokens": 723019657.0, "sample_num_tokens": 8495.25, "step": 2667, "total_num_tokens": 723053638.0, "z_loss": 0.0007746296469122171 }, { "copy_logits_max": -1.0378286838531494, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.0625, "epoch": 0.5449068164411539, "gen_logits_max": 5.1066155433654785, "gen_logits_mean": -12.770156860351562, "gen_logits_min": -24.852619171142578, "gen_logits_std": 2.5578324794769287, "gen_loss": 0.3110176920890808, "grad_norm": 0.4535965210304998, "learning_rate": 2.755957894736842e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9948185533285141, "mean_gen_accuracy": 0.8599889874458313, "mean_token_accuracy": 0.8956433832645416, "num_tokens": 723308740.0, "sample_num_tokens": 8695.0, "step": 2668, "total_num_tokens": 723343520.0, "z_loss": 0.0007592199835926294 }, { "copy_logits_max": -1.5178651809692383, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.125, "epoch": 0.5451110543783507, "gen_logits_max": 5.000412464141846, "gen_logits_mean": -12.520576477050781, "gen_logits_min": -24.54031753540039, "gen_logits_std": 2.5387678146362305, "gen_loss": 0.3133106231689453, "grad_norm": 0.48544549849703844, "learning_rate": 2.7558315789473685e-05, "loss": 0.3319, "mean_copy_accuracy": 0.993453711271286, "mean_gen_accuracy": 0.8546428233385086, "mean_token_accuracy": 0.8899732679128647, "num_tokens": 723574219.0, "sample_num_tokens": 7598.25, "step": 2669, "total_num_tokens": 723604612.0, "z_loss": 0.000734505127184093 }, { "copy_logits_max": -3.4987740516662598, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.1875, "epoch": 0.5453152923155477, "gen_logits_max": 5.008851051330566, "gen_logits_mean": -13.643682479858398, "gen_logits_min": -25.61227798461914, "gen_logits_std": 2.5337514877319336, "gen_loss": 0.29953134059906006, "grad_norm": 0.4066354213311537, "learning_rate": 2.755705263157895e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9942082315683365, "mean_gen_accuracy": 0.8630308657884598, "mean_token_accuracy": 0.8944345563650131, "num_tokens": 723847322.0, "sample_num_tokens": 8178.0, "step": 2670, "total_num_tokens": 723880034.0, "z_loss": 0.0007284783059731126 }, { "copy_logits_max": -3.6305603981018066, "copy_logits_min": -687500032.0, "copy_num_tokens": 601.8125, "epoch": 0.5455195302527445, "gen_logits_max": 4.63566780090332, "gen_logits_mean": -13.336705207824707, "gen_logits_min": -25.09294891357422, "gen_logits_std": 2.526183843612671, "gen_loss": 0.3435630798339844, "grad_norm": 0.4610996941381556, "learning_rate": 2.755578947368421e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9919595718383789, "mean_gen_accuracy": 0.8647524118423462, "mean_token_accuracy": 0.8987942337989807, "num_tokens": 724130229.0, "sample_num_tokens": 9249.25, "step": 2671, "total_num_tokens": 724167226.0, "z_loss": 0.0008115199161693454 }, { "copy_logits_max": -5.588613033294678, "copy_logits_min": -687500032.0, "copy_num_tokens": 323.0625, "epoch": 0.5457237681899413, "gen_logits_max": 5.3845672607421875, "gen_logits_mean": -14.507562637329102, "gen_logits_min": -25.89494514465332, "gen_logits_std": 2.503499984741211, "gen_loss": 0.3202778697013855, "grad_norm": 0.45109858882325754, "learning_rate": 2.7554526315789475e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9947369396686554, "mean_gen_accuracy": 0.8651837408542633, "mean_token_accuracy": 0.8964135348796844, "num_tokens": 724402071.0, "sample_num_tokens": 7857.25, "step": 2672, "total_num_tokens": 724433500.0, "z_loss": 0.0007653827778995037 }, { "copy_logits_max": -5.680006504058838, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.1875, "epoch": 0.5459280061271381, "gen_logits_max": 5.478917598724365, "gen_logits_mean": -13.668770790100098, "gen_logits_min": -25.051624298095703, "gen_logits_std": 2.5343236923217773, "gen_loss": 0.28022444248199463, "grad_norm": 0.4009965821141748, "learning_rate": 2.755326315789474e-05, "loss": 0.3006, "mean_copy_accuracy": 0.9940195083618164, "mean_gen_accuracy": 0.870690181851387, "mean_token_accuracy": 0.8998612463474274, "num_tokens": 724700935.0, "sample_num_tokens": 8511.25, "step": 2673, "total_num_tokens": 724734980.0, "z_loss": 0.0006718845688737929 }, { "copy_logits_max": -5.081644058227539, "copy_logits_min": -750000128.0, "copy_num_tokens": 386.375, "epoch": 0.5461322440643349, "gen_logits_max": 6.10014009475708, "gen_logits_mean": -12.316400527954102, "gen_logits_min": -23.482574462890625, "gen_logits_std": 2.505460262298584, "gen_loss": 0.3054814338684082, "grad_norm": 0.47069194040159734, "learning_rate": 2.7552e-05, "loss": 0.3247, "mean_copy_accuracy": 0.9934157878160477, "mean_gen_accuracy": 0.8648440539836884, "mean_token_accuracy": 0.8912073522806168, "num_tokens": 724959673.0, "sample_num_tokens": 7921.75, "step": 2674, "total_num_tokens": 724991360.0, "z_loss": 0.0007303601014427841 }, { "copy_logits_max": -3.3686888217926025, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.625, "epoch": 0.5463364820015317, "gen_logits_max": 5.869794845581055, "gen_logits_mean": -12.261041641235352, "gen_logits_min": -24.330657958984375, "gen_logits_std": 2.5644073486328125, "gen_loss": 0.3633132576942444, "grad_norm": 0.475687598942142, "learning_rate": 2.7550736842105264e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9926509410142899, "mean_gen_accuracy": 0.8562220931053162, "mean_token_accuracy": 0.8873081654310226, "num_tokens": 725206099.0, "sample_num_tokens": 7850.75, "step": 2675, "total_num_tokens": 725237502.0, "z_loss": 0.0009032967500388622 }, { "copy_logits_max": -3.708033800125122, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.5625, "epoch": 0.5465407199387287, "gen_logits_max": 4.849339485168457, "gen_logits_mean": -13.517212867736816, "gen_logits_min": -24.51815414428711, "gen_logits_std": 2.477564573287964, "gen_loss": 0.35777556896209717, "grad_norm": 0.5443110660762132, "learning_rate": 2.7549473684210525e-05, "loss": 0.3274, "mean_copy_accuracy": 0.9949923157691956, "mean_gen_accuracy": 0.8612192273139954, "mean_token_accuracy": 0.8928828686475754, "num_tokens": 725467939.0, "sample_num_tokens": 8534.75, "step": 2676, "total_num_tokens": 725502078.0, "z_loss": 0.0008209114894270897 }, { "copy_logits_max": -3.9513707160949707, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.0, "epoch": 0.5467449578759255, "gen_logits_max": 4.81990909576416, "gen_logits_mean": -14.378561973571777, "gen_logits_min": -26.369606018066406, "gen_logits_std": 2.6012279987335205, "gen_loss": 0.28584226965904236, "grad_norm": 0.4949828887546434, "learning_rate": 2.7548210526315793e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9933116734027863, "mean_gen_accuracy": 0.870317280292511, "mean_token_accuracy": 0.8993269056081772, "num_tokens": 725730404.0, "sample_num_tokens": 9104.5, "step": 2677, "total_num_tokens": 725766822.0, "z_loss": 0.0007309805369004607 }, { "copy_logits_max": -3.7925381660461426, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.0625, "epoch": 0.5469491958131223, "gen_logits_max": 5.182930946350098, "gen_logits_mean": -13.072598457336426, "gen_logits_min": -25.101449966430664, "gen_logits_std": 2.5930733680725098, "gen_loss": 0.3451051115989685, "grad_norm": 0.45578895108240153, "learning_rate": 2.7546947368421054e-05, "loss": 0.3203, "mean_copy_accuracy": 0.99391008913517, "mean_gen_accuracy": 0.8599903583526611, "mean_token_accuracy": 0.894476905465126, "num_tokens": 726015678.0, "sample_num_tokens": 8258.0, "step": 2678, "total_num_tokens": 726048710.0, "z_loss": 0.0008621421875432134 }, { "copy_logits_max": -3.7205047607421875, "copy_logits_min": -687500032.0, "copy_num_tokens": 509.1875, "epoch": 0.5471534337503191, "gen_logits_max": 5.4982500076293945, "gen_logits_mean": -12.660661697387695, "gen_logits_min": -24.187305450439453, "gen_logits_std": 2.5156970024108887, "gen_loss": 0.33452728390693665, "grad_norm": 0.4608994301085392, "learning_rate": 2.7545684210526318e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9934284538030624, "mean_gen_accuracy": 0.8675559014081955, "mean_token_accuracy": 0.8974315375089645, "num_tokens": 726285616.0, "sample_num_tokens": 9284.0, "step": 2679, "total_num_tokens": 726322752.0, "z_loss": 0.0008186212508007884 }, { "copy_logits_max": -3.483934164047241, "copy_logits_min": -750000000.0, "copy_num_tokens": 684.125, "epoch": 0.5473576716875159, "gen_logits_max": 5.152621269226074, "gen_logits_mean": -13.520411491394043, "gen_logits_min": -25.157438278198242, "gen_logits_std": 2.5679969787597656, "gen_loss": 0.31906527280807495, "grad_norm": 0.41463235772803847, "learning_rate": 2.754442105263158e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9936508387327194, "mean_gen_accuracy": 0.8672910779714584, "mean_token_accuracy": 0.8996735364198685, "num_tokens": 726569747.0, "sample_num_tokens": 10615.25, "step": 2680, "total_num_tokens": 726612208.0, "z_loss": 0.0008350233547389507 }, { "copy_logits_max": -2.372255802154541, "copy_logits_min": -687500096.0, "copy_num_tokens": 717.0, "epoch": 0.5475619096247127, "gen_logits_max": 3.9225385189056396, "gen_logits_mean": -14.330169677734375, "gen_logits_min": -26.16407585144043, "gen_logits_std": 2.5333588123321533, "gen_loss": 0.31350117921829224, "grad_norm": 0.4594701305741526, "learning_rate": 2.7543157894736843e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9937575161457062, "mean_gen_accuracy": 0.8565164506435394, "mean_token_accuracy": 0.8907842189073563, "num_tokens": 726867617.0, "sample_num_tokens": 10325.75, "step": 2681, "total_num_tokens": 726908920.0, "z_loss": 0.0007856502197682858 }, { "copy_logits_max": -4.654065132141113, "copy_logits_min": -687500096.0, "copy_num_tokens": 319.0, "epoch": 0.5477661475619097, "gen_logits_max": 5.6319966316223145, "gen_logits_mean": -13.725914001464844, "gen_logits_min": -24.8287353515625, "gen_logits_std": 2.5109152793884277, "gen_loss": 0.35857337713241577, "grad_norm": 0.5099753306134384, "learning_rate": 2.7541894736842104e-05, "loss": 0.3223, "mean_copy_accuracy": 0.9923050701618195, "mean_gen_accuracy": 0.8582149296998978, "mean_token_accuracy": 0.8932745009660721, "num_tokens": 727147335.0, "sample_num_tokens": 6943.75, "step": 2682, "total_num_tokens": 727175110.0, "z_loss": 0.0008445894927717745 }, { "copy_logits_max": -2.301730155944824, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.8125, "epoch": 0.5479703854991065, "gen_logits_max": 6.018510818481445, "gen_logits_mean": -11.482585906982422, "gen_logits_min": -23.076717376708984, "gen_logits_std": 2.553410530090332, "gen_loss": 0.37015387415885925, "grad_norm": 0.4112432990617709, "learning_rate": 2.754063157894737e-05, "loss": 0.3365, "mean_copy_accuracy": 0.994192436337471, "mean_gen_accuracy": 0.8593430519104004, "mean_token_accuracy": 0.8890819996595383, "num_tokens": 727425855.0, "sample_num_tokens": 9475.25, "step": 2683, "total_num_tokens": 727463756.0, "z_loss": 0.0009461033623665571 }, { "copy_logits_max": -4.4069318771362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.125, "epoch": 0.5481746234363033, "gen_logits_max": 5.643138885498047, "gen_logits_mean": -13.888239860534668, "gen_logits_min": -25.610671997070312, "gen_logits_std": 2.550811767578125, "gen_loss": 0.3282642960548401, "grad_norm": 0.47389051783282904, "learning_rate": 2.753936842105263e-05, "loss": 0.3311, "mean_copy_accuracy": 0.9929770231246948, "mean_gen_accuracy": 0.8652447462081909, "mean_token_accuracy": 0.8879486471414566, "num_tokens": 727688126.0, "sample_num_tokens": 7566.5, "step": 2684, "total_num_tokens": 727718392.0, "z_loss": 0.0007345081539824605 }, { "copy_logits_max": -3.9913101196289062, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.9375, "epoch": 0.5483788613735001, "gen_logits_max": 5.0583906173706055, "gen_logits_mean": -12.826133728027344, "gen_logits_min": -24.50646209716797, "gen_logits_std": 2.5391929149627686, "gen_loss": 0.301334947347641, "grad_norm": 0.4681314523757986, "learning_rate": 2.7538105263157897e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9923270791769028, "mean_gen_accuracy": 0.8626479953527451, "mean_token_accuracy": 0.8951002955436707, "num_tokens": 727967853.0, "sample_num_tokens": 8032.25, "step": 2685, "total_num_tokens": 727999982.0, "z_loss": 0.0006289739394560456 }, { "copy_logits_max": -2.508419990539551, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.4375, "epoch": 0.5485830993106969, "gen_logits_max": 5.456149101257324, "gen_logits_mean": -12.840278625488281, "gen_logits_min": -24.52181053161621, "gen_logits_std": 2.5992608070373535, "gen_loss": 0.3261243999004364, "grad_norm": 0.4108581765016414, "learning_rate": 2.7536842105263158e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9942927956581116, "mean_gen_accuracy": 0.8686866909265518, "mean_token_accuracy": 0.8976379781961441, "num_tokens": 728227349.0, "sample_num_tokens": 7523.25, "step": 2686, "total_num_tokens": 728257442.0, "z_loss": 0.0007151259342208505 }, { "copy_logits_max": -3.2931995391845703, "copy_logits_min": -750000064.0, "copy_num_tokens": 349.8125, "epoch": 0.5487873372478937, "gen_logits_max": 5.5712432861328125, "gen_logits_mean": -12.812041282653809, "gen_logits_min": -24.16078758239746, "gen_logits_std": 2.5510647296905518, "gen_loss": 0.3663364350795746, "grad_norm": 0.4888396106146964, "learning_rate": 2.7535578947368422e-05, "loss": 0.3431, "mean_copy_accuracy": 0.9940217584371567, "mean_gen_accuracy": 0.852851927280426, "mean_token_accuracy": 0.8893511593341827, "num_tokens": 728480700.0, "sample_num_tokens": 6784.0, "step": 2687, "total_num_tokens": 728507836.0, "z_loss": 0.0007627839222550392 }, { "copy_logits_max": -4.124263763427734, "copy_logits_min": -750000064.0, "copy_num_tokens": 385.1875, "epoch": 0.5489915751850907, "gen_logits_max": 5.192509174346924, "gen_logits_mean": -14.46254825592041, "gen_logits_min": -25.610340118408203, "gen_logits_std": 2.550485610961914, "gen_loss": 0.32515275478363037, "grad_norm": 0.451637092882177, "learning_rate": 2.7534315789473687e-05, "loss": 0.3106, "mean_copy_accuracy": 0.9933944940567017, "mean_gen_accuracy": 0.8703145980834961, "mean_token_accuracy": 0.8966844975948334, "num_tokens": 728729195.0, "sample_num_tokens": 7997.25, "step": 2688, "total_num_tokens": 728761184.0, "z_loss": 0.0006432933732867241 }, { "copy_logits_max": -3.0188522338867188, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.0625, "epoch": 0.5491958131222875, "gen_logits_max": 4.4462785720825195, "gen_logits_mean": -13.443595886230469, "gen_logits_min": -24.941390991210938, "gen_logits_std": 2.545792579650879, "gen_loss": 0.27230310440063477, "grad_norm": 0.40006114290642075, "learning_rate": 2.7533052631578948e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9940067082643509, "mean_gen_accuracy": 0.8691037446260452, "mean_token_accuracy": 0.9030330032110214, "num_tokens": 729005166.0, "sample_num_tokens": 9641.0, "step": 2689, "total_num_tokens": 729043730.0, "z_loss": 0.000590720446780324 }, { "copy_logits_max": -4.238008499145508, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.25, "epoch": 0.5494000510594843, "gen_logits_max": 5.566266059875488, "gen_logits_mean": -14.002372741699219, "gen_logits_min": -25.422168731689453, "gen_logits_std": 2.564150810241699, "gen_loss": 0.36241063475608826, "grad_norm": 0.40271485929522893, "learning_rate": 2.7531789473684212e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9946832656860352, "mean_gen_accuracy": 0.8691424578428268, "mean_token_accuracy": 0.8990691751241684, "num_tokens": 729294512.0, "sample_num_tokens": 9120.5, "step": 2690, "total_num_tokens": 729330994.0, "z_loss": 0.0007289138156920671 }, { "copy_logits_max": -1.5847989320755005, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.8125, "epoch": 0.5496042889966811, "gen_logits_max": 4.784670829772949, "gen_logits_mean": -13.659278869628906, "gen_logits_min": -25.214160919189453, "gen_logits_std": 2.534078359603882, "gen_loss": 0.320907860994339, "grad_norm": 0.43067435025450534, "learning_rate": 2.7530526315789473e-05, "loss": 0.3327, "mean_copy_accuracy": 0.993508517742157, "mean_gen_accuracy": 0.8615785241127014, "mean_token_accuracy": 0.8898083418607712, "num_tokens": 729561052.0, "sample_num_tokens": 8653.0, "step": 2691, "total_num_tokens": 729595664.0, "z_loss": 0.0007143911207094789 }, { "copy_logits_max": -2.3974218368530273, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.9375, "epoch": 0.5498085269338779, "gen_logits_max": 5.90961217880249, "gen_logits_mean": -13.575754165649414, "gen_logits_min": -25.43675994873047, "gen_logits_std": 2.603480100631714, "gen_loss": 0.2817505896091461, "grad_norm": 0.4743408282934565, "learning_rate": 2.7529263157894737e-05, "loss": 0.3337, "mean_copy_accuracy": 0.9937364012002945, "mean_gen_accuracy": 0.856156125664711, "mean_token_accuracy": 0.8899850100278854, "num_tokens": 729819789.0, "sample_num_tokens": 8338.75, "step": 2692, "total_num_tokens": 729853144.0, "z_loss": 0.0006972866831347346 }, { "copy_logits_max": -1.0284746885299683, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.4375, "epoch": 0.5500127648710748, "gen_logits_max": 5.146329879760742, "gen_logits_mean": -13.534010887145996, "gen_logits_min": -25.865833282470703, "gen_logits_std": 2.5884933471679688, "gen_loss": 0.3312450647354126, "grad_norm": 0.4079388408092099, "learning_rate": 2.7528e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9957546442747116, "mean_gen_accuracy": 0.8692919760942459, "mean_token_accuracy": 0.9028785526752472, "num_tokens": 730120499.0, "sample_num_tokens": 8032.75, "step": 2693, "total_num_tokens": 730152630.0, "z_loss": 0.000840101158246398 }, { "copy_logits_max": -4.957759857177734, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.25, "epoch": 0.5502170028082717, "gen_logits_max": 5.481046676635742, "gen_logits_mean": -12.640089988708496, "gen_logits_min": -24.029428482055664, "gen_logits_std": 2.576465129852295, "gen_loss": 0.3310278058052063, "grad_norm": 0.4317169154493159, "learning_rate": 2.7526736842105266e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9933293163776398, "mean_gen_accuracy": 0.8650352209806442, "mean_token_accuracy": 0.8927857875823975, "num_tokens": 730390686.0, "sample_num_tokens": 7993.0, "step": 2694, "total_num_tokens": 730422658.0, "z_loss": 0.0008087049354799092 }, { "copy_logits_max": -2.2018625736236572, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.9375, "epoch": 0.5504212407454685, "gen_logits_max": 5.8825297355651855, "gen_logits_mean": -12.102209091186523, "gen_logits_min": -24.44911766052246, "gen_logits_std": 2.626335620880127, "gen_loss": 0.28851351141929626, "grad_norm": 0.4499801096141858, "learning_rate": 2.7525473684210527e-05, "loss": 0.31, "mean_copy_accuracy": 0.9925375431776047, "mean_gen_accuracy": 0.8665967732667923, "mean_token_accuracy": 0.8980384469032288, "num_tokens": 730664265.0, "sample_num_tokens": 7934.25, "step": 2695, "total_num_tokens": 730696002.0, "z_loss": 0.0006787080783396959 }, { "copy_logits_max": -1.2802162170410156, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.1875, "epoch": 0.5506254786826653, "gen_logits_max": 5.6263651847839355, "gen_logits_mean": -12.938432693481445, "gen_logits_min": -25.084367752075195, "gen_logits_std": 2.636746883392334, "gen_loss": 0.3006758391857147, "grad_norm": 0.4178962290969768, "learning_rate": 2.752421052631579e-05, "loss": 0.318, "mean_copy_accuracy": 0.9944187253713608, "mean_gen_accuracy": 0.8600986450910568, "mean_token_accuracy": 0.8946857750415802, "num_tokens": 730941073.0, "sample_num_tokens": 7326.75, "step": 2696, "total_num_tokens": 730970380.0, "z_loss": 0.0007115170592442155 }, { "copy_logits_max": -2.7978224754333496, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.0, "epoch": 0.5508297166198621, "gen_logits_max": 5.748696327209473, "gen_logits_mean": -12.953320503234863, "gen_logits_min": -24.822555541992188, "gen_logits_std": 2.6199276447296143, "gen_loss": 0.3310444951057434, "grad_norm": 0.47524848057008695, "learning_rate": 2.7522947368421052e-05, "loss": 0.336, "mean_copy_accuracy": 0.9942556917667389, "mean_gen_accuracy": 0.85869200527668, "mean_token_accuracy": 0.890844464302063, "num_tokens": 731170740.0, "sample_num_tokens": 7522.5, "step": 2697, "total_num_tokens": 731200830.0, "z_loss": 0.0007684585871174932 }, { "copy_logits_max": -3.2805771827697754, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 0.551033954557059, "gen_logits_max": 5.312533378601074, "gen_logits_mean": -13.018106460571289, "gen_logits_min": -24.974903106689453, "gen_logits_std": 2.639878511428833, "gen_loss": 0.37577715516090393, "grad_norm": 0.41321456991447153, "learning_rate": 2.7521684210526316e-05, "loss": 0.3081, "mean_copy_accuracy": 0.9941776841878891, "mean_gen_accuracy": 0.867347314953804, "mean_token_accuracy": 0.8974533081054688, "num_tokens": 731441046.0, "sample_num_tokens": 7776.5, "step": 2698, "total_num_tokens": 731472152.0, "z_loss": 0.0007425175281241536 }, { "copy_logits_max": -5.193552017211914, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.75, "epoch": 0.5512381924942558, "gen_logits_max": 5.940784454345703, "gen_logits_mean": -13.104377746582031, "gen_logits_min": -24.60470962524414, "gen_logits_std": 2.58148455619812, "gen_loss": 0.33985447883605957, "grad_norm": 0.4835024562507659, "learning_rate": 2.7520421052631577e-05, "loss": 0.3297, "mean_copy_accuracy": 0.993516594171524, "mean_gen_accuracy": 0.8630099296569824, "mean_token_accuracy": 0.8923612534999847, "num_tokens": 731702870.0, "sample_num_tokens": 7651.5, "step": 2699, "total_num_tokens": 731733476.0, "z_loss": 0.0007367523503489792 }, { "copy_logits_max": -1.2137632369995117, "copy_logits_min": -625000064.0, "copy_num_tokens": 531.1875, "epoch": 0.5514424304314527, "gen_logits_max": 5.683394908905029, "gen_logits_mean": -12.915994644165039, "gen_logits_min": -24.873882293701172, "gen_logits_std": 2.639188766479492, "gen_loss": 0.3373255729675293, "grad_norm": 0.3884302662440249, "learning_rate": 2.751915789473684e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9942883104085922, "mean_gen_accuracy": 0.8642483055591583, "mean_token_accuracy": 0.8964893966913223, "num_tokens": 731997154.0, "sample_num_tokens": 9146.0, "step": 2700, "total_num_tokens": 732033738.0, "z_loss": 0.0007406482473015785 }, { "copy_logits_max": -2.3252522945404053, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.125, "epoch": 0.5516466683686495, "gen_logits_max": 5.215091705322266, "gen_logits_mean": -14.017938613891602, "gen_logits_min": -25.85350799560547, "gen_logits_std": 2.6360504627227783, "gen_loss": 0.3062325716018677, "grad_norm": 0.4333159687152555, "learning_rate": 2.751789473684211e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9946901053190231, "mean_gen_accuracy": 0.8699890226125717, "mean_token_accuracy": 0.9038608521223068, "num_tokens": 732272452.0, "sample_num_tokens": 7500.5, "step": 2701, "total_num_tokens": 732302454.0, "z_loss": 0.0007716049440205097 }, { "copy_logits_max": -1.4957504272460938, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.1875, "epoch": 0.5518509063058463, "gen_logits_max": 5.6854448318481445, "gen_logits_mean": -12.904561996459961, "gen_logits_min": -25.104564666748047, "gen_logits_std": 2.6658029556274414, "gen_loss": 0.2975003719329834, "grad_norm": 0.46236080052927603, "learning_rate": 2.751663157894737e-05, "loss": 0.316, "mean_copy_accuracy": 0.9944377243518829, "mean_gen_accuracy": 0.8648627251386642, "mean_token_accuracy": 0.896360456943512, "num_tokens": 732514390.0, "sample_num_tokens": 8422.0, "step": 2702, "total_num_tokens": 732548078.0, "z_loss": 0.0007368955411948264 }, { "copy_logits_max": -2.055314064025879, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.625, "epoch": 0.5520551442430431, "gen_logits_max": 5.099817752838135, "gen_logits_mean": -14.068768501281738, "gen_logits_min": -25.925735473632812, "gen_logits_std": 2.6261565685272217, "gen_loss": 0.3279559016227722, "grad_norm": 0.4012714402957978, "learning_rate": 2.7515368421052634e-05, "loss": 0.298, "mean_copy_accuracy": 0.9945079535245895, "mean_gen_accuracy": 0.865900844335556, "mean_token_accuracy": 0.899713933467865, "num_tokens": 732792840.0, "sample_num_tokens": 9100.0, "step": 2703, "total_num_tokens": 732829240.0, "z_loss": 0.0007043968071229756 }, { "copy_logits_max": -2.3335094451904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.0, "epoch": 0.55225938218024, "gen_logits_max": 5.570742607116699, "gen_logits_mean": -13.281639099121094, "gen_logits_min": -25.019073486328125, "gen_logits_std": 2.6190710067749023, "gen_loss": 0.3435167670249939, "grad_norm": 0.4276217223273483, "learning_rate": 2.7514105263157895e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9941761940717697, "mean_gen_accuracy": 0.8624185621738434, "mean_token_accuracy": 0.8955192267894745, "num_tokens": 733077676.0, "sample_num_tokens": 8333.0, "step": 2704, "total_num_tokens": 733111008.0, "z_loss": 0.0007392263505607843 }, { "copy_logits_max": 1.3408676385879517, "copy_logits_min": -687500032.0, "copy_num_tokens": 785.3125, "epoch": 0.5524636201174368, "gen_logits_max": 6.264443874359131, "gen_logits_mean": -11.9182767868042, "gen_logits_min": -24.852947235107422, "gen_logits_std": 2.707777500152588, "gen_loss": 0.2641345262527466, "grad_norm": 0.4400086237016863, "learning_rate": 2.751284210526316e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9933356940746307, "mean_gen_accuracy": 0.8644644618034363, "mean_token_accuracy": 0.8997108489274979, "num_tokens": 733357401.0, "sample_num_tokens": 10898.75, "step": 2705, "total_num_tokens": 733400996.0, "z_loss": 0.0007880249759182334 }, { "copy_logits_max": -1.1125519275665283, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.1875, "epoch": 0.5526678580546337, "gen_logits_max": 5.846550941467285, "gen_logits_mean": -12.77341365814209, "gen_logits_min": -24.689159393310547, "gen_logits_std": 2.652437448501587, "gen_loss": 0.31742405891418457, "grad_norm": 0.439085923171191, "learning_rate": 2.751157894736842e-05, "loss": 0.298, "mean_copy_accuracy": 0.995154544711113, "mean_gen_accuracy": 0.8685892075300217, "mean_token_accuracy": 0.9020449817180634, "num_tokens": 733643938.0, "sample_num_tokens": 7627.0, "step": 2706, "total_num_tokens": 733674446.0, "z_loss": 0.0008086010348051786 }, { "copy_logits_max": -0.6846325397491455, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.875, "epoch": 0.5528720959918305, "gen_logits_max": 5.595207691192627, "gen_logits_mean": -13.43016242980957, "gen_logits_min": -25.27425765991211, "gen_logits_std": 2.6111676692962646, "gen_loss": 0.33047181367874146, "grad_norm": 0.41455881335694045, "learning_rate": 2.7510315789473685e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9952739328145981, "mean_gen_accuracy": 0.8610283136367798, "mean_token_accuracy": 0.8990356177091599, "num_tokens": 733917294.0, "sample_num_tokens": 9651.0, "step": 2707, "total_num_tokens": 733955898.0, "z_loss": 0.0009063329780474305 }, { "copy_logits_max": -4.421270847320557, "copy_logits_min": -750000064.0, "copy_num_tokens": 281.1875, "epoch": 0.5530763339290273, "gen_logits_max": 6.085165977478027, "gen_logits_mean": -14.15080451965332, "gen_logits_min": -25.702098846435547, "gen_logits_std": 2.592317581176758, "gen_loss": 0.30658671259880066, "grad_norm": 0.45826325992592126, "learning_rate": 2.7509052631578946e-05, "loss": 0.331, "mean_copy_accuracy": 0.9921852797269821, "mean_gen_accuracy": 0.8592525124549866, "mean_token_accuracy": 0.890025869011879, "num_tokens": 734180603.0, "sample_num_tokens": 7372.25, "step": 2708, "total_num_tokens": 734210092.0, "z_loss": 0.000863261055201292 }, { "copy_logits_max": -4.379044532775879, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.5, "epoch": 0.5532805718662241, "gen_logits_max": 6.448633670806885, "gen_logits_mean": -14.146799087524414, "gen_logits_min": -25.415088653564453, "gen_logits_std": 2.5684549808502197, "gen_loss": 0.34822624921798706, "grad_norm": 0.4288685405590626, "learning_rate": 2.750778947368421e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9941630810499191, "mean_gen_accuracy": 0.8631597608327866, "mean_token_accuracy": 0.8971440494060516, "num_tokens": 734463086.0, "sample_num_tokens": 7272.5, "step": 2709, "total_num_tokens": 734492176.0, "z_loss": 0.0009191851131618023 }, { "copy_logits_max": -2.33764386177063, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.8125, "epoch": 0.553484809803421, "gen_logits_max": 5.358301639556885, "gen_logits_mean": -14.0494966506958, "gen_logits_min": -26.096607208251953, "gen_logits_std": 2.6360344886779785, "gen_loss": 0.2906950116157532, "grad_norm": 0.4159366023467639, "learning_rate": 2.7506526315789474e-05, "loss": 0.2985, "mean_copy_accuracy": 0.994603231549263, "mean_gen_accuracy": 0.8686459213495255, "mean_token_accuracy": 0.9030529409646988, "num_tokens": 734747612.0, "sample_num_tokens": 9503.5, "step": 2710, "total_num_tokens": 734785626.0, "z_loss": 0.0007599712116643786 }, { "copy_logits_max": -3.5431301593780518, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.625, "epoch": 0.5536890477406178, "gen_logits_max": 5.178084373474121, "gen_logits_mean": -14.402578353881836, "gen_logits_min": -26.085786819458008, "gen_logits_std": 2.6136975288391113, "gen_loss": 0.31854963302612305, "grad_norm": 0.46121753384243186, "learning_rate": 2.750526315789474e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9922689348459244, "mean_gen_accuracy": 0.8723268806934357, "mean_token_accuracy": 0.8982491195201874, "num_tokens": 735004073.0, "sample_num_tokens": 9214.25, "step": 2711, "total_num_tokens": 735040930.0, "z_loss": 0.0007570096058771014 }, { "copy_logits_max": 0.568374514579773, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.5, "epoch": 0.5538932856778147, "gen_logits_max": 6.453816890716553, "gen_logits_mean": -12.46600341796875, "gen_logits_min": -24.391529083251953, "gen_logits_std": 2.6526613235473633, "gen_loss": 0.33799928426742554, "grad_norm": 0.44941317612511616, "learning_rate": 2.7504e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9929724037647247, "mean_gen_accuracy": 0.8603399693965912, "mean_token_accuracy": 0.8945641666650772, "num_tokens": 735285846.0, "sample_num_tokens": 9170.5, "step": 2712, "total_num_tokens": 735322528.0, "z_loss": 0.0007987255812622607 }, { "copy_logits_max": 0.4696733355522156, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.9375, "epoch": 0.5540975236150115, "gen_logits_max": 5.977146148681641, "gen_logits_mean": -12.691984176635742, "gen_logits_min": -25.16596031188965, "gen_logits_std": 2.6722500324249268, "gen_loss": 0.3067048192024231, "grad_norm": 0.45339939100335747, "learning_rate": 2.7502736842105264e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9943334758281708, "mean_gen_accuracy": 0.8617575764656067, "mean_token_accuracy": 0.8966673165559769, "num_tokens": 735579259.0, "sample_num_tokens": 8807.75, "step": 2713, "total_num_tokens": 735614490.0, "z_loss": 0.000723242643289268 }, { "copy_logits_max": -1.6671862602233887, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.125, "epoch": 0.5543017615522083, "gen_logits_max": 5.819577693939209, "gen_logits_mean": -13.508686065673828, "gen_logits_min": -25.49740982055664, "gen_logits_std": 2.6302199363708496, "gen_loss": 0.3470913767814636, "grad_norm": 0.4470823202091236, "learning_rate": 2.7501473684210528e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9951683431863785, "mean_gen_accuracy": 0.8600251525640488, "mean_token_accuracy": 0.8910748064517975, "num_tokens": 735855253.0, "sample_num_tokens": 8300.25, "step": 2714, "total_num_tokens": 735888454.0, "z_loss": 0.0008457095827907324 }, { "copy_logits_max": -1.2131422758102417, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.1875, "epoch": 0.5545059994894052, "gen_logits_max": 5.0967512130737305, "gen_logits_mean": -14.051385879516602, "gen_logits_min": -25.905933380126953, "gen_logits_std": 2.6276323795318604, "gen_loss": 0.34013551473617554, "grad_norm": 0.47053256988439957, "learning_rate": 2.750021052631579e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9945570975542068, "mean_gen_accuracy": 0.8665852248668671, "mean_token_accuracy": 0.9019214808940887, "num_tokens": 736140432.0, "sample_num_tokens": 7506.5, "step": 2715, "total_num_tokens": 736170458.0, "z_loss": 0.000757525791414082 }, { "copy_logits_max": 0.11353862285614014, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.25, "epoch": 0.554710237426602, "gen_logits_max": 6.273257732391357, "gen_logits_mean": -11.451251983642578, "gen_logits_min": -23.443218231201172, "gen_logits_std": 2.6445584297180176, "gen_loss": 0.3499217629432678, "grad_norm": 0.40692025443453467, "learning_rate": 2.7498947368421053e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9951490312814713, "mean_gen_accuracy": 0.860470712184906, "mean_token_accuracy": 0.8957474827766418, "num_tokens": 736418630.0, "sample_num_tokens": 7682.0, "step": 2716, "total_num_tokens": 736449358.0, "z_loss": 0.0007574828341603279 }, { "copy_logits_max": 0.3074498176574707, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.0, "epoch": 0.5549144753637988, "gen_logits_max": 5.838966369628906, "gen_logits_mean": -11.968564987182617, "gen_logits_min": -24.03866195678711, "gen_logits_std": 2.6357669830322266, "gen_loss": 0.34187567234039307, "grad_norm": 0.44722617021504524, "learning_rate": 2.7497684210526314e-05, "loss": 0.3177, "mean_copy_accuracy": 0.9946380108594894, "mean_gen_accuracy": 0.863021045923233, "mean_token_accuracy": 0.8954639434814453, "num_tokens": 736671211.0, "sample_num_tokens": 7575.25, "step": 2717, "total_num_tokens": 736701512.0, "z_loss": 0.0008052084012888372 }, { "copy_logits_max": -1.641446828842163, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.1875, "epoch": 0.5551187133009957, "gen_logits_max": 5.134029388427734, "gen_logits_mean": -14.431185722351074, "gen_logits_min": -26.113040924072266, "gen_logits_std": 2.6068685054779053, "gen_loss": 0.3649542033672333, "grad_norm": 0.4757525809320183, "learning_rate": 2.7496421052631582e-05, "loss": 0.316, "mean_copy_accuracy": 0.9949825257062912, "mean_gen_accuracy": 0.8618673533201218, "mean_token_accuracy": 0.894659548997879, "num_tokens": 736940167.0, "sample_num_tokens": 7670.25, "step": 2718, "total_num_tokens": 736970848.0, "z_loss": 0.0007135731866583228 }, { "copy_logits_max": 0.4903656244277954, "copy_logits_min": -687500032.0, "copy_num_tokens": 578.6875, "epoch": 0.5553229512381925, "gen_logits_max": 5.470377445220947, "gen_logits_mean": -12.706541061401367, "gen_logits_min": -25.171533584594727, "gen_logits_std": 2.6497504711151123, "gen_loss": 0.294267475605011, "grad_norm": 0.5662518278394348, "learning_rate": 2.7495157894736843e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9925596117973328, "mean_gen_accuracy": 0.864213615655899, "mean_token_accuracy": 0.896333172917366, "num_tokens": 737205008.0, "sample_num_tokens": 9416.0, "step": 2719, "total_num_tokens": 737242672.0, "z_loss": 0.0006529130041599274 }, { "copy_logits_max": 0.9437446594238281, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.3125, "epoch": 0.5555271891753893, "gen_logits_max": 6.083540439605713, "gen_logits_mean": -12.745274543762207, "gen_logits_min": -24.86843490600586, "gen_logits_std": 2.626096248626709, "gen_loss": 0.3351747393608093, "grad_norm": 0.46819399391605127, "learning_rate": 2.7493894736842107e-05, "loss": 0.3217, "mean_copy_accuracy": 0.994614914059639, "mean_gen_accuracy": 0.863764226436615, "mean_token_accuracy": 0.8945183902978897, "num_tokens": 737471000.0, "sample_num_tokens": 8172.5, "step": 2720, "total_num_tokens": 737503690.0, "z_loss": 0.0008129546185955405 }, { "copy_logits_max": 0.4467412233352661, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.625, "epoch": 0.5557314271125862, "gen_logits_max": 6.434843063354492, "gen_logits_mean": -12.090923309326172, "gen_logits_min": -24.226951599121094, "gen_logits_std": 2.6538872718811035, "gen_loss": 0.27172040939331055, "grad_norm": 0.41491934033223543, "learning_rate": 2.7492631578947368e-05, "loss": 0.291, "mean_copy_accuracy": 0.994498148560524, "mean_gen_accuracy": 0.8673721104860306, "mean_token_accuracy": 0.9024985730648041, "num_tokens": 737760130.0, "sample_num_tokens": 8757.5, "step": 2721, "total_num_tokens": 737795160.0, "z_loss": 0.000752786232624203 }, { "copy_logits_max": -1.022111177444458, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.625, "epoch": 0.555935665049783, "gen_logits_max": 5.836422920227051, "gen_logits_mean": -13.516904830932617, "gen_logits_min": -26.049888610839844, "gen_logits_std": 2.620743751525879, "gen_loss": 0.3243580162525177, "grad_norm": 0.5246861750126122, "learning_rate": 2.7491368421052633e-05, "loss": 0.328, "mean_copy_accuracy": 0.993253156542778, "mean_gen_accuracy": 0.8615874499082565, "mean_token_accuracy": 0.8930291533470154, "num_tokens": 738035733.0, "sample_num_tokens": 7190.75, "step": 2722, "total_num_tokens": 738064496.0, "z_loss": 0.0008385983528569341 }, { "copy_logits_max": -1.5651230812072754, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.5625, "epoch": 0.5561399029869798, "gen_logits_max": 5.239114761352539, "gen_logits_mean": -13.920297622680664, "gen_logits_min": -25.77873420715332, "gen_logits_std": 2.6053218841552734, "gen_loss": 0.34195512533187866, "grad_norm": 0.4321466155216363, "learning_rate": 2.7490105263157893e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9946119636297226, "mean_gen_accuracy": 0.859921932220459, "mean_token_accuracy": 0.8889418989419937, "num_tokens": 738294859.0, "sample_num_tokens": 8635.25, "step": 2723, "total_num_tokens": 738329400.0, "z_loss": 0.0008607071358710527 }, { "copy_logits_max": -2.8249197006225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.5, "epoch": 0.5563441409241766, "gen_logits_max": 6.247220993041992, "gen_logits_mean": -13.183541297912598, "gen_logits_min": -25.51881980895996, "gen_logits_std": 2.619371175765991, "gen_loss": 0.33745622634887695, "grad_norm": 0.4456844040844346, "learning_rate": 2.7488842105263158e-05, "loss": 0.3121, "mean_copy_accuracy": 0.994327113032341, "mean_gen_accuracy": 0.8606991320848465, "mean_token_accuracy": 0.8949021548032761, "num_tokens": 738545908.0, "sample_num_tokens": 6571.0, "step": 2724, "total_num_tokens": 738572192.0, "z_loss": 0.0008513248176313937 }, { "copy_logits_max": 0.5966435074806213, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5, "epoch": 0.5565483788613735, "gen_logits_max": 5.176661014556885, "gen_logits_mean": -13.03327465057373, "gen_logits_min": -25.36495590209961, "gen_logits_std": 2.661874294281006, "gen_loss": 0.2798006534576416, "grad_norm": 0.39221054980528414, "learning_rate": 2.748757894736842e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9960108399391174, "mean_gen_accuracy": 0.8643947392702103, "mean_token_accuracy": 0.9001740515232086, "num_tokens": 738803740.0, "sample_num_tokens": 7572.0, "step": 2725, "total_num_tokens": 738834028.0, "z_loss": 0.0007949398132041097 }, { "copy_logits_max": -0.34158265590667725, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.4375, "epoch": 0.5567526167985704, "gen_logits_max": 6.045346736907959, "gen_logits_mean": -12.608512878417969, "gen_logits_min": -24.742053985595703, "gen_logits_std": 2.6565310955047607, "gen_loss": 0.3124372363090515, "grad_norm": 0.47997899523630344, "learning_rate": 2.7486315789473686e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9929989874362946, "mean_gen_accuracy": 0.861601248383522, "mean_token_accuracy": 0.8944294154644012, "num_tokens": 739073370.0, "sample_num_tokens": 8994.5, "step": 2726, "total_num_tokens": 739109348.0, "z_loss": 0.0008059267420321703 }, { "copy_logits_max": -2.3293418884277344, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.625, "epoch": 0.5569568547357672, "gen_logits_max": 6.013890266418457, "gen_logits_mean": -13.063835144042969, "gen_logits_min": -25.190765380859375, "gen_logits_std": 2.656816005706787, "gen_loss": 0.3372989296913147, "grad_norm": 0.44238759773284003, "learning_rate": 2.748505263157895e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9941287040710449, "mean_gen_accuracy": 0.8681029677391052, "mean_token_accuracy": 0.8948319256305695, "num_tokens": 739348245.0, "sample_num_tokens": 7686.75, "step": 2727, "total_num_tokens": 739378992.0, "z_loss": 0.0008734396542422473 }, { "copy_logits_max": -0.7812400460243225, "copy_logits_min": -687500032.0, "copy_num_tokens": 355.625, "epoch": 0.557161092672964, "gen_logits_max": 5.4930100440979, "gen_logits_mean": -14.045530319213867, "gen_logits_min": -26.15103530883789, "gen_logits_std": 2.6099112033843994, "gen_loss": 0.33723127841949463, "grad_norm": 0.483881022171004, "learning_rate": 2.748378947368421e-05, "loss": 0.3234, "mean_copy_accuracy": 0.9928485751152039, "mean_gen_accuracy": 0.860743060708046, "mean_token_accuracy": 0.8913054764270782, "num_tokens": 739588694.0, "sample_num_tokens": 7818.5, "step": 2728, "total_num_tokens": 739619968.0, "z_loss": 0.0007417533197440207 }, { "copy_logits_max": -1.6531481742858887, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.5, "epoch": 0.5573653306101608, "gen_logits_max": 5.952132225036621, "gen_logits_mean": -12.562246322631836, "gen_logits_min": -24.570871353149414, "gen_logits_std": 2.618002414703369, "gen_loss": 0.32894062995910645, "grad_norm": 0.4362504737469214, "learning_rate": 2.7482526315789476e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9947754144668579, "mean_gen_accuracy": 0.8639909327030182, "mean_token_accuracy": 0.8966533094644547, "num_tokens": 739846703.0, "sample_num_tokens": 8118.25, "step": 2729, "total_num_tokens": 739879176.0, "z_loss": 0.0007716963882558048 }, { "copy_logits_max": -1.030698299407959, "copy_logits_min": -687500032.0, "copy_num_tokens": 616.8125, "epoch": 0.5575695685473576, "gen_logits_max": 4.961425304412842, "gen_logits_mean": -12.222414016723633, "gen_logits_min": -24.720277786254883, "gen_logits_std": 2.721116542816162, "gen_loss": 0.2791525423526764, "grad_norm": 0.40757243412650884, "learning_rate": 2.7481263157894737e-05, "loss": 0.2977, "mean_copy_accuracy": 0.9949721992015839, "mean_gen_accuracy": 0.8644354343414307, "mean_token_accuracy": 0.8997391760349274, "num_tokens": 740118187.0, "sample_num_tokens": 8735.25, "step": 2730, "total_num_tokens": 740153128.0, "z_loss": 0.0006107066292315722 }, { "copy_logits_max": -2.4687623977661133, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.5625, "epoch": 0.5577738064845545, "gen_logits_max": 4.864513397216797, "gen_logits_mean": -14.60639762878418, "gen_logits_min": -26.581274032592773, "gen_logits_std": 2.649340867996216, "gen_loss": 0.30116674304008484, "grad_norm": 0.44094928683153506, "learning_rate": 2.748e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9947842657566071, "mean_gen_accuracy": 0.867816150188446, "mean_token_accuracy": 0.9025586247444153, "num_tokens": 740410190.0, "sample_num_tokens": 7743.0, "step": 2731, "total_num_tokens": 740441162.0, "z_loss": 0.0006852730875834823 }, { "copy_logits_max": -1.0456351041793823, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.5625, "epoch": 0.5579780444217514, "gen_logits_max": 4.657924175262451, "gen_logits_mean": -13.670293807983398, "gen_logits_min": -25.790557861328125, "gen_logits_std": 2.6770501136779785, "gen_loss": 0.27681398391723633, "grad_norm": 0.5162833944208715, "learning_rate": 2.7478736842105262e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9934736788272858, "mean_gen_accuracy": 0.8564089834690094, "mean_token_accuracy": 0.8886183351278305, "num_tokens": 740669662.0, "sample_num_tokens": 8466.5, "step": 2732, "total_num_tokens": 740703528.0, "z_loss": 0.0006185514503158629 }, { "copy_logits_max": -0.8410604000091553, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.625, "epoch": 0.5581822823589482, "gen_logits_max": 4.676529884338379, "gen_logits_mean": -14.33646011352539, "gen_logits_min": -26.680870056152344, "gen_logits_std": 2.6600492000579834, "gen_loss": 0.2894015312194824, "grad_norm": 0.41311314561206, "learning_rate": 2.7477473684210526e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9934146255254745, "mean_gen_accuracy": 0.8724597841501236, "mean_token_accuracy": 0.9014418125152588, "num_tokens": 740952287.0, "sample_num_tokens": 9603.25, "step": 2733, "total_num_tokens": 740990700.0, "z_loss": 0.0006368892500177026 }, { "copy_logits_max": -4.251852512359619, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.375, "epoch": 0.558386520296145, "gen_logits_max": 4.882658958435059, "gen_logits_mean": -14.95422649383545, "gen_logits_min": -26.962806701660156, "gen_logits_std": 2.6047744750976562, "gen_loss": 0.312308669090271, "grad_norm": 0.40979457550817733, "learning_rate": 2.747621052631579e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9944993853569031, "mean_gen_accuracy": 0.8685354590415955, "mean_token_accuracy": 0.899316594004631, "num_tokens": 741234114.0, "sample_num_tokens": 8037.0, "step": 2734, "total_num_tokens": 741266262.0, "z_loss": 0.0006707054562866688 }, { "copy_logits_max": -0.9023429155349731, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.75, "epoch": 0.5585907582333418, "gen_logits_max": 5.933108329772949, "gen_logits_mean": -12.072850227355957, "gen_logits_min": -24.752994537353516, "gen_logits_std": 2.6629371643066406, "gen_loss": 0.3355664610862732, "grad_norm": 0.46536982916201114, "learning_rate": 2.7474947368421055e-05, "loss": 0.3446, "mean_copy_accuracy": 0.9927437007427216, "mean_gen_accuracy": 0.8572914600372314, "mean_token_accuracy": 0.8887829482555389, "num_tokens": 741507846.0, "sample_num_tokens": 7337.5, "step": 2735, "total_num_tokens": 741537196.0, "z_loss": 0.0007321500452235341 }, { "copy_logits_max": -1.8714162111282349, "copy_logits_min": -625000000.0, "copy_num_tokens": 480.125, "epoch": 0.5587949961705386, "gen_logits_max": 5.844524383544922, "gen_logits_mean": -12.139686584472656, "gen_logits_min": -24.278318405151367, "gen_logits_std": 2.649810791015625, "gen_loss": 0.35022294521331787, "grad_norm": 0.41573775029321897, "learning_rate": 2.7473684210526316e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9937301576137543, "mean_gen_accuracy": 0.8612397611141205, "mean_token_accuracy": 0.8952812552452087, "num_tokens": 741798514.0, "sample_num_tokens": 7904.0, "step": 2736, "total_num_tokens": 741830130.0, "z_loss": 0.000806317781098187 }, { "copy_logits_max": -2.1168081760406494, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.8125, "epoch": 0.5589992341077356, "gen_logits_max": 5.752362251281738, "gen_logits_mean": -12.673099517822266, "gen_logits_min": -24.824176788330078, "gen_logits_std": 2.644211530685425, "gen_loss": 0.2897959053516388, "grad_norm": 0.410961268326089, "learning_rate": 2.747242105263158e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9937617927789688, "mean_gen_accuracy": 0.8638346195220947, "mean_token_accuracy": 0.8962420672178268, "num_tokens": 742066915.0, "sample_num_tokens": 10273.25, "step": 2737, "total_num_tokens": 742108008.0, "z_loss": 0.0007090388098731637 }, { "copy_logits_max": -2.0483648777008057, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.125, "epoch": 0.5592034720449324, "gen_logits_max": 5.1501970291137695, "gen_logits_mean": -13.821074485778809, "gen_logits_min": -25.758731842041016, "gen_logits_std": 2.6343657970428467, "gen_loss": 0.30779147148132324, "grad_norm": 0.4423903597097118, "learning_rate": 2.747115789473684e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9950414150953293, "mean_gen_accuracy": 0.868325725197792, "mean_token_accuracy": 0.9026816189289093, "num_tokens": 742357332.0, "sample_num_tokens": 7411.5, "step": 2738, "total_num_tokens": 742386978.0, "z_loss": 0.0007138577639125288 }, { "copy_logits_max": -0.43981459736824036, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.25, "epoch": 0.5594077099821292, "gen_logits_max": 5.784947395324707, "gen_logits_mean": -12.502128601074219, "gen_logits_min": -24.698820114135742, "gen_logits_std": 2.6711854934692383, "gen_loss": 0.3188033401966095, "grad_norm": 0.4234226073302434, "learning_rate": 2.7469894736842105e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9932179749011993, "mean_gen_accuracy": 0.8623332381248474, "mean_token_accuracy": 0.8944889307022095, "num_tokens": 742623070.0, "sample_num_tokens": 8527.5, "step": 2739, "total_num_tokens": 742657180.0, "z_loss": 0.0007734324317425489 }, { "copy_logits_max": -4.945137977600098, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.6875, "epoch": 0.559611947919326, "gen_logits_max": 6.692313194274902, "gen_logits_mean": -12.415704727172852, "gen_logits_min": -24.104516983032227, "gen_logits_std": 2.6150448322296143, "gen_loss": 0.351001501083374, "grad_norm": 0.4810861531785639, "learning_rate": 2.746863157894737e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9942752718925476, "mean_gen_accuracy": 0.861882671713829, "mean_token_accuracy": 0.8923076391220093, "num_tokens": 742892800.0, "sample_num_tokens": 8139.5, "step": 2740, "total_num_tokens": 742925358.0, "z_loss": 0.0008747258107177913 }, { "copy_logits_max": 0.023043513298034668, "copy_logits_min": -750000064.0, "copy_num_tokens": 644.6875, "epoch": 0.5598161858565228, "gen_logits_max": 6.008707523345947, "gen_logits_mean": -12.298044204711914, "gen_logits_min": -24.932064056396484, "gen_logits_std": 2.695554494857788, "gen_loss": 0.35485774278640747, "grad_norm": 0.45628191829413484, "learning_rate": 2.746736842105263e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9943467378616333, "mean_gen_accuracy": 0.8541192263364792, "mean_token_accuracy": 0.8910678327083588, "num_tokens": 743187787.0, "sample_num_tokens": 9671.25, "step": 2741, "total_num_tokens": 743226472.0, "z_loss": 0.0009781215339899063 }, { "copy_logits_max": -2.065946578979492, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.9375, "epoch": 0.5600204237937196, "gen_logits_max": 5.686023712158203, "gen_logits_mean": -12.904348373413086, "gen_logits_min": -25.07555389404297, "gen_logits_std": 2.6767516136169434, "gen_loss": 0.26836055517196655, "grad_norm": 0.4329948142064514, "learning_rate": 2.74661052631579e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9939872175455093, "mean_gen_accuracy": 0.8611927926540375, "mean_token_accuracy": 0.8940403461456299, "num_tokens": 743449947.0, "sample_num_tokens": 7681.75, "step": 2742, "total_num_tokens": 743480674.0, "z_loss": 0.0007030805572867393 }, { "copy_logits_max": -1.7384761571884155, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.9375, "epoch": 0.5602246617309166, "gen_logits_max": 6.004550457000732, "gen_logits_mean": -13.095884323120117, "gen_logits_min": -24.85083770751953, "gen_logits_std": 2.6089370250701904, "gen_loss": 0.3612053692340851, "grad_norm": 0.40883553754242097, "learning_rate": 2.746484210526316e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9941293001174927, "mean_gen_accuracy": 0.8656343668699265, "mean_token_accuracy": 0.9008177667856216, "num_tokens": 743735892.0, "sample_num_tokens": 8078.5, "step": 2743, "total_num_tokens": 743768206.0, "z_loss": 0.0008812588639557362 }, { "copy_logits_max": -2.7302932739257812, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.9375, "epoch": 0.5604288996681134, "gen_logits_max": 5.616521835327148, "gen_logits_mean": -13.87138557434082, "gen_logits_min": -26.111696243286133, "gen_logits_std": 2.6398215293884277, "gen_loss": 0.3214324712753296, "grad_norm": 0.4623464593852114, "learning_rate": 2.7463578947368424e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9940268397331238, "mean_gen_accuracy": 0.8559471666812897, "mean_token_accuracy": 0.8898112922906876, "num_tokens": 743993430.0, "sample_num_tokens": 7908.5, "step": 2744, "total_num_tokens": 744025064.0, "z_loss": 0.000761740724556148 }, { "copy_logits_max": -3.7232279777526855, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.4375, "epoch": 0.5606331376053102, "gen_logits_max": 5.450583457946777, "gen_logits_mean": -13.285273551940918, "gen_logits_min": -25.298873901367188, "gen_logits_std": 2.636406660079956, "gen_loss": 0.314624160528183, "grad_norm": 0.43211050715858024, "learning_rate": 2.7462315789473685e-05, "loss": 0.324, "mean_copy_accuracy": 0.9946091771125793, "mean_gen_accuracy": 0.8583963513374329, "mean_token_accuracy": 0.8909106999635696, "num_tokens": 744261694.0, "sample_num_tokens": 9210.0, "step": 2745, "total_num_tokens": 744298534.0, "z_loss": 0.0007204310386441648 }, { "copy_logits_max": -3.1241164207458496, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.375, "epoch": 0.560837375542507, "gen_logits_max": 5.549220561981201, "gen_logits_mean": -13.33019733428955, "gen_logits_min": -25.097063064575195, "gen_logits_std": 2.641684055328369, "gen_loss": 0.3138563632965088, "grad_norm": 0.4230848035301147, "learning_rate": 2.746105263157895e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9953092783689499, "mean_gen_accuracy": 0.8608500510454178, "mean_token_accuracy": 0.8962226510047913, "num_tokens": 744525715.0, "sample_num_tokens": 7977.75, "step": 2746, "total_num_tokens": 744557626.0, "z_loss": 0.0007416181033477187 }, { "copy_logits_max": -0.5652517676353455, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.6875, "epoch": 0.5610416134797038, "gen_logits_max": 5.431208610534668, "gen_logits_mean": -12.978134155273438, "gen_logits_min": -25.15781593322754, "gen_logits_std": 2.6657490730285645, "gen_loss": 0.31783026456832886, "grad_norm": 0.41394151238646276, "learning_rate": 2.745978947368421e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9956770688295364, "mean_gen_accuracy": 0.8610353767871857, "mean_token_accuracy": 0.8952926993370056, "num_tokens": 744804342.0, "sample_num_tokens": 9382.0, "step": 2747, "total_num_tokens": 744841870.0, "z_loss": 0.0007674427470192313 }, { "copy_logits_max": -1.5854586362838745, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.8125, "epoch": 0.5612458514169006, "gen_logits_max": 5.084455490112305, "gen_logits_mean": -12.559383392333984, "gen_logits_min": -25.51908302307129, "gen_logits_std": 2.6714794635772705, "gen_loss": 0.2699110805988312, "grad_norm": 0.4730727151135811, "learning_rate": 2.7458526315789474e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9939966648817062, "mean_gen_accuracy": 0.8605401813983917, "mean_token_accuracy": 0.8939568251371384, "num_tokens": 745053518.0, "sample_num_tokens": 8877.0, "step": 2748, "total_num_tokens": 745089026.0, "z_loss": 0.000678863434586674 }, { "copy_logits_max": -4.007221221923828, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.4375, "epoch": 0.5614500893540976, "gen_logits_max": 5.882384300231934, "gen_logits_mean": -13.375247955322266, "gen_logits_min": -25.593950271606445, "gen_logits_std": 2.6363205909729004, "gen_loss": 0.34972909092903137, "grad_norm": 0.5195605420823385, "learning_rate": 2.7457263157894735e-05, "loss": 0.3088, "mean_copy_accuracy": 0.993766725063324, "mean_gen_accuracy": 0.86675725877285, "mean_token_accuracy": 0.8974623382091522, "num_tokens": 745324807.0, "sample_num_tokens": 8196.75, "step": 2749, "total_num_tokens": 745357594.0, "z_loss": 0.0008564176969230175 }, { "copy_logits_max": -2.400831937789917, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.4375, "epoch": 0.5616543272912944, "gen_logits_max": 6.0981035232543945, "gen_logits_mean": -12.162424087524414, "gen_logits_min": -23.884151458740234, "gen_logits_std": 2.621530294418335, "gen_loss": 0.36661821603775024, "grad_norm": 0.42220147472014696, "learning_rate": 2.7456000000000003e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9940363019704819, "mean_gen_accuracy": 0.8623666316270828, "mean_token_accuracy": 0.8901104032993317, "num_tokens": 745578309.0, "sample_num_tokens": 8800.75, "step": 2750, "total_num_tokens": 745613512.0, "z_loss": 0.0009332192130386829 }, { "copy_logits_max": -2.8421409130096436, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.6875, "epoch": 0.5618585652284912, "gen_logits_max": 5.594951629638672, "gen_logits_mean": -13.393989562988281, "gen_logits_min": -24.9547061920166, "gen_logits_std": 2.5900611877441406, "gen_loss": 0.3159313499927521, "grad_norm": 0.46357835006241566, "learning_rate": 2.7454736842105264e-05, "loss": 0.332, "mean_copy_accuracy": 0.9942187517881393, "mean_gen_accuracy": 0.8534675240516663, "mean_token_accuracy": 0.8901835680007935, "num_tokens": 745847062.0, "sample_num_tokens": 7614.0, "step": 2751, "total_num_tokens": 745877518.0, "z_loss": 0.0007986975833773613 }, { "copy_logits_max": -2.8294501304626465, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.3125, "epoch": 0.562062803165688, "gen_logits_max": 4.733968734741211, "gen_logits_mean": -14.82537841796875, "gen_logits_min": -26.737327575683594, "gen_logits_std": 2.625056743621826, "gen_loss": 0.3029543161392212, "grad_norm": 0.4406499790942252, "learning_rate": 2.7453473684210528e-05, "loss": 0.3196, "mean_copy_accuracy": 0.9939235001802444, "mean_gen_accuracy": 0.8624260872602463, "mean_token_accuracy": 0.8948037028312683, "num_tokens": 746122318.0, "sample_num_tokens": 9053.0, "step": 2752, "total_num_tokens": 746158530.0, "z_loss": 0.0007689175545237958 }, { "copy_logits_max": -0.28555548191070557, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.0, "epoch": 0.5622670411028848, "gen_logits_max": 5.373307704925537, "gen_logits_mean": -12.618303298950195, "gen_logits_min": -25.170364379882812, "gen_logits_std": 2.685006618499756, "gen_loss": 0.3023698031902313, "grad_norm": 0.3970094632280216, "learning_rate": 2.745221052631579e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9949026256799698, "mean_gen_accuracy": 0.8651313036680222, "mean_token_accuracy": 0.8965224921703339, "num_tokens": 746376264.0, "sample_num_tokens": 8683.5, "step": 2753, "total_num_tokens": 746410998.0, "z_loss": 0.0008301606867462397 }, { "copy_logits_max": -2.3014464378356934, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.375, "epoch": 0.5624712790400817, "gen_logits_max": 5.254134178161621, "gen_logits_mean": -12.939476013183594, "gen_logits_min": -25.027082443237305, "gen_logits_std": 2.654649496078491, "gen_loss": 0.33459413051605225, "grad_norm": 0.4347901752253945, "learning_rate": 2.7450947368421053e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9947398155927658, "mean_gen_accuracy": 0.8657348155975342, "mean_token_accuracy": 0.8941463679075241, "num_tokens": 746641149.0, "sample_num_tokens": 8240.25, "step": 2754, "total_num_tokens": 746674110.0, "z_loss": 0.0007725965115241706 }, { "copy_logits_max": -2.059460163116455, "copy_logits_min": -687500032.0, "copy_num_tokens": 357.3125, "epoch": 0.5626755169772786, "gen_logits_max": 5.627655506134033, "gen_logits_mean": -13.47293758392334, "gen_logits_min": -25.63088607788086, "gen_logits_std": 2.638518810272217, "gen_loss": 0.3602425456047058, "grad_norm": 0.4067249893837301, "learning_rate": 2.7449684210526318e-05, "loss": 0.3075, "mean_copy_accuracy": 0.9938125163316727, "mean_gen_accuracy": 0.8703360110521317, "mean_token_accuracy": 0.8994106948375702, "num_tokens": 746881395.0, "sample_num_tokens": 7291.25, "step": 2755, "total_num_tokens": 746910560.0, "z_loss": 0.0008394805481657386 }, { "copy_logits_max": -3.209075450897217, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.375, "epoch": 0.5628797549144754, "gen_logits_max": 5.074283599853516, "gen_logits_mean": -15.05078411102295, "gen_logits_min": -26.50348663330078, "gen_logits_std": 2.5753822326660156, "gen_loss": 0.3378310203552246, "grad_norm": 0.46209577257154455, "learning_rate": 2.744842105263158e-05, "loss": 0.333, "mean_copy_accuracy": 0.9943645149469376, "mean_gen_accuracy": 0.8573820143938065, "mean_token_accuracy": 0.8895508199930191, "num_tokens": 747132136.0, "sample_num_tokens": 8438.0, "step": 2756, "total_num_tokens": 747165888.0, "z_loss": 0.0007544849067926407 }, { "copy_logits_max": -2.6442508697509766, "copy_logits_min": -750000000.0, "copy_num_tokens": 252.1875, "epoch": 0.5630839928516722, "gen_logits_max": 6.729816436767578, "gen_logits_mean": -12.547577857971191, "gen_logits_min": -23.786636352539062, "gen_logits_std": 2.583372116088867, "gen_loss": 0.34494516253471375, "grad_norm": 0.8485724233944821, "learning_rate": 2.7447157894736843e-05, "loss": 0.3413, "mean_copy_accuracy": 0.992148756980896, "mean_gen_accuracy": 0.8619455695152283, "mean_token_accuracy": 0.8861820697784424, "num_tokens": 747373872.0, "sample_num_tokens": 8156.5, "step": 2757, "total_num_tokens": 747406498.0, "z_loss": 0.0007778829894959927 }, { "copy_logits_max": -2.8935461044311523, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.75, "epoch": 0.563288230788869, "gen_logits_max": 5.315201759338379, "gen_logits_mean": -12.78553581237793, "gen_logits_min": -24.118532180786133, "gen_logits_std": 2.5298521518707275, "gen_loss": 0.34758085012435913, "grad_norm": 0.5373980796363761, "learning_rate": 2.7445894736842104e-05, "loss": 0.3516, "mean_copy_accuracy": 0.9939843118190765, "mean_gen_accuracy": 0.8548570722341537, "mean_token_accuracy": 0.8860516995191574, "num_tokens": 747656881.0, "sample_num_tokens": 8335.75, "step": 2758, "total_num_tokens": 747690224.0, "z_loss": 0.0007676497334614396 }, { "copy_logits_max": -2.6863503456115723, "copy_logits_min": -687500032.0, "copy_num_tokens": 337.875, "epoch": 0.5634924687260658, "gen_logits_max": 5.546020984649658, "gen_logits_mean": -13.79415512084961, "gen_logits_min": -25.41484832763672, "gen_logits_std": 2.5956435203552246, "gen_loss": 0.34341877698898315, "grad_norm": 0.45150515345499737, "learning_rate": 2.744463157894737e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9922571927309036, "mean_gen_accuracy": 0.862712487578392, "mean_token_accuracy": 0.8907988220453262, "num_tokens": 747934954.0, "sample_num_tokens": 7651.5, "step": 2759, "total_num_tokens": 747965560.0, "z_loss": 0.0007829376263543963 }, { "copy_logits_max": -4.417956829071045, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.125, "epoch": 0.5636967066632627, "gen_logits_max": 4.244669437408447, "gen_logits_mean": -15.55981731414795, "gen_logits_min": -27.161067962646484, "gen_logits_std": 2.6020820140838623, "gen_loss": 0.2788931429386139, "grad_norm": 0.43564531560766684, "learning_rate": 2.7443368421052632e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9942373782396317, "mean_gen_accuracy": 0.8671813756227493, "mean_token_accuracy": 0.9039906561374664, "num_tokens": 748224621.0, "sample_num_tokens": 9175.25, "step": 2760, "total_num_tokens": 748261322.0, "z_loss": 0.000613694079220295 }, { "copy_logits_max": -0.6297852396965027, "copy_logits_min": -750000000.0, "copy_num_tokens": 638.5625, "epoch": 0.5639009446004596, "gen_logits_max": 5.778624534606934, "gen_logits_mean": -11.264402389526367, "gen_logits_min": -23.558692932128906, "gen_logits_std": 2.659942626953125, "gen_loss": 0.318037748336792, "grad_norm": 0.48857148853665394, "learning_rate": 2.7442105263157897e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9934530854225159, "mean_gen_accuracy": 0.8601392805576324, "mean_token_accuracy": 0.893736407160759, "num_tokens": 748497410.0, "sample_num_tokens": 9866.5, "step": 2761, "total_num_tokens": 748536876.0, "z_loss": 0.0007924331584945321 }, { "copy_logits_max": -1.8407566547393799, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.8125, "epoch": 0.5641051825376564, "gen_logits_max": 6.112149238586426, "gen_logits_mean": -13.466646194458008, "gen_logits_min": -25.28531837463379, "gen_logits_std": 2.6271538734436035, "gen_loss": 0.3324880301952362, "grad_norm": 0.45353642759073454, "learning_rate": 2.7440842105263158e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9929997771978378, "mean_gen_accuracy": 0.8671735376119614, "mean_token_accuracy": 0.8960222452878952, "num_tokens": 748761143.0, "sample_num_tokens": 7810.75, "step": 2762, "total_num_tokens": 748792386.0, "z_loss": 0.0008169646025635302 }, { "copy_logits_max": -0.39770716428756714, "copy_logits_min": -687500096.0, "copy_num_tokens": 433.125, "epoch": 0.5643094204748532, "gen_logits_max": 4.94903039932251, "gen_logits_mean": -14.419549942016602, "gen_logits_min": -26.311973571777344, "gen_logits_std": 2.6130788326263428, "gen_loss": 0.35018855333328247, "grad_norm": 0.42699668077821024, "learning_rate": 2.7439578947368422e-05, "loss": 0.3323, "mean_copy_accuracy": 0.9937624782323837, "mean_gen_accuracy": 0.8544367402791977, "mean_token_accuracy": 0.8928266912698746, "num_tokens": 749041567.0, "sample_num_tokens": 8493.75, "step": 2763, "total_num_tokens": 749075542.0, "z_loss": 0.0008702431805431843 }, { "copy_logits_max": -2.9551987648010254, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.75, "epoch": 0.56451365841205, "gen_logits_max": 5.635568618774414, "gen_logits_mean": -13.784660339355469, "gen_logits_min": -25.43366241455078, "gen_logits_std": 2.629835605621338, "gen_loss": 0.3241618275642395, "grad_norm": 0.4231936095316051, "learning_rate": 2.7438315789473683e-05, "loss": 0.3116, "mean_copy_accuracy": 0.9943574219942093, "mean_gen_accuracy": 0.8660965412855148, "mean_token_accuracy": 0.8967074006795883, "num_tokens": 749318669.0, "sample_num_tokens": 7012.75, "step": 2764, "total_num_tokens": 749346720.0, "z_loss": 0.0008165219333022833 }, { "copy_logits_max": -4.267912864685059, "copy_logits_min": -750000000.0, "copy_num_tokens": 216.375, "epoch": 0.5647178963492469, "gen_logits_max": 6.2879319190979, "gen_logits_mean": -13.515551567077637, "gen_logits_min": -24.910184860229492, "gen_logits_std": 2.589054584503174, "gen_loss": 0.3326437771320343, "grad_norm": 0.515060431586617, "learning_rate": 2.7437052631578947e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9915450513362885, "mean_gen_accuracy": 0.8619946539402008, "mean_token_accuracy": 0.8900718092918396, "num_tokens": 749570198.0, "sample_num_tokens": 6780.0, "step": 2765, "total_num_tokens": 749597318.0, "z_loss": 0.0007558467332273722 }, { "copy_logits_max": -1.9661492109298706, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.25, "epoch": 0.5649221342864437, "gen_logits_max": 5.256237983703613, "gen_logits_mean": -13.108373641967773, "gen_logits_min": -24.88119125366211, "gen_logits_std": 2.610994815826416, "gen_loss": 0.2858067750930786, "grad_norm": 0.41858690009327876, "learning_rate": 2.7435789473684208e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9953192323446274, "mean_gen_accuracy": 0.8667849004268646, "mean_token_accuracy": 0.8997866064310074, "num_tokens": 749818520.0, "sample_num_tokens": 7118.5, "step": 2766, "total_num_tokens": 749846994.0, "z_loss": 0.0007092265295796096 }, { "copy_logits_max": -0.5852866172790527, "copy_logits_min": -687500032.0, "copy_num_tokens": 366.0625, "epoch": 0.5651263722236406, "gen_logits_max": 5.702122688293457, "gen_logits_mean": -12.696924209594727, "gen_logits_min": -24.423107147216797, "gen_logits_std": 2.560176134109497, "gen_loss": 0.34501129388809204, "grad_norm": 0.48648016518621373, "learning_rate": 2.7434526315789476e-05, "loss": 0.3379, "mean_copy_accuracy": 0.9945526421070099, "mean_gen_accuracy": 0.853847324848175, "mean_token_accuracy": 0.888007178902626, "num_tokens": 750075216.0, "sample_num_tokens": 7359.0, "step": 2767, "total_num_tokens": 750104652.0, "z_loss": 0.0008850631420500576 }, { "copy_logits_max": -2.921982765197754, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.0625, "epoch": 0.5653306101608374, "gen_logits_max": 5.449018478393555, "gen_logits_mean": -13.859881401062012, "gen_logits_min": -25.609594345092773, "gen_logits_std": 2.60414981842041, "gen_loss": 0.32045474648475647, "grad_norm": 0.47582542315504833, "learning_rate": 2.743326315789474e-05, "loss": 0.3216, "mean_copy_accuracy": 0.994217187166214, "mean_gen_accuracy": 0.8661996722221375, "mean_token_accuracy": 0.8964049369096756, "num_tokens": 750342127.0, "sample_num_tokens": 7504.25, "step": 2768, "total_num_tokens": 750372144.0, "z_loss": 0.0007204317953437567 }, { "copy_logits_max": -1.8205506801605225, "copy_logits_min": -687500032.0, "copy_num_tokens": 587.375, "epoch": 0.5655348480980342, "gen_logits_max": 4.784193992614746, "gen_logits_mean": -13.667852401733398, "gen_logits_min": -25.42254066467285, "gen_logits_std": 2.548173427581787, "gen_loss": 0.32076334953308105, "grad_norm": 0.4769781703979152, "learning_rate": 2.7432e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9938350021839142, "mean_gen_accuracy": 0.8581286370754242, "mean_token_accuracy": 0.8975597023963928, "num_tokens": 750636637.0, "sample_num_tokens": 9111.25, "step": 2769, "total_num_tokens": 750673082.0, "z_loss": 0.0007571113528683782 }, { "copy_logits_max": -1.1806588172912598, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.8125, "epoch": 0.565739086035231, "gen_logits_max": 6.024788856506348, "gen_logits_mean": -12.448060989379883, "gen_logits_min": -24.784069061279297, "gen_logits_std": 2.646233558654785, "gen_loss": 0.3434932827949524, "grad_norm": 0.6530809831489975, "learning_rate": 2.7430736842105265e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9938860237598419, "mean_gen_accuracy": 0.8579681515693665, "mean_token_accuracy": 0.8927683979272842, "num_tokens": 750929898.0, "sample_num_tokens": 8402.0, "step": 2770, "total_num_tokens": 750963506.0, "z_loss": 0.0008594099199399352 }, { "copy_logits_max": -2.0617737770080566, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.625, "epoch": 0.5659433239724279, "gen_logits_max": 5.252490043640137, "gen_logits_mean": -13.432623863220215, "gen_logits_min": -25.678848266601562, "gen_logits_std": 2.646507978439331, "gen_loss": 0.2977059483528137, "grad_norm": 0.4289027789430946, "learning_rate": 2.7429473684210526e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9945805817842484, "mean_gen_accuracy": 0.8630655705928802, "mean_token_accuracy": 0.8972902745008469, "num_tokens": 751186549.0, "sample_num_tokens": 7503.25, "step": 2771, "total_num_tokens": 751216562.0, "z_loss": 0.0007919067284092307 }, { "copy_logits_max": -2.0359463691711426, "copy_logits_min": -687500032.0, "copy_num_tokens": 474.0625, "epoch": 0.5661475619096247, "gen_logits_max": 4.452948570251465, "gen_logits_mean": -14.417421340942383, "gen_logits_min": -25.88976287841797, "gen_logits_std": 2.558173418045044, "gen_loss": 0.3008202910423279, "grad_norm": 0.530261088145542, "learning_rate": 2.742821052631579e-05, "loss": 0.3241, "mean_copy_accuracy": 0.991038590669632, "mean_gen_accuracy": 0.8657224923372269, "mean_token_accuracy": 0.8942476660013199, "num_tokens": 751442534.0, "sample_num_tokens": 8253.0, "step": 2772, "total_num_tokens": 751475546.0, "z_loss": 0.0006744452985003591 }, { "copy_logits_max": -1.3324518203735352, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.0, "epoch": 0.5663517998468216, "gen_logits_max": 5.141387462615967, "gen_logits_mean": -13.856306076049805, "gen_logits_min": -25.646141052246094, "gen_logits_std": 2.6244497299194336, "gen_loss": 0.3104773163795471, "grad_norm": 0.38859133482124575, "learning_rate": 2.742694736842105e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9943774342536926, "mean_gen_accuracy": 0.8605836182832718, "mean_token_accuracy": 0.8951842039823532, "num_tokens": 751714239.0, "sample_num_tokens": 8424.25, "step": 2773, "total_num_tokens": 751747936.0, "z_loss": 0.0007001255289651453 }, { "copy_logits_max": -2.514150619506836, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.5625, "epoch": 0.5665560377840184, "gen_logits_max": 4.884955406188965, "gen_logits_mean": -14.549970626831055, "gen_logits_min": -25.886573791503906, "gen_logits_std": 2.5651369094848633, "gen_loss": 0.3629176616668701, "grad_norm": 0.42859555976609814, "learning_rate": 2.7425684210526316e-05, "loss": 0.3287, "mean_copy_accuracy": 0.9929168224334717, "mean_gen_accuracy": 0.8610872626304626, "mean_token_accuracy": 0.8912704735994339, "num_tokens": 751986813.0, "sample_num_tokens": 8000.25, "step": 2774, "total_num_tokens": 752018814.0, "z_loss": 0.0007702196598984301 }, { "copy_logits_max": -0.9796146154403687, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.1875, "epoch": 0.5667602757212152, "gen_logits_max": 4.8026533126831055, "gen_logits_mean": -13.179195404052734, "gen_logits_min": -25.69746971130371, "gen_logits_std": 2.6348061561584473, "gen_loss": 0.267792284488678, "grad_norm": 0.44109007624757096, "learning_rate": 2.742442105263158e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9934410154819489, "mean_gen_accuracy": 0.8668876886367798, "mean_token_accuracy": 0.900021493434906, "num_tokens": 752252104.0, "sample_num_tokens": 9765.5, "step": 2775, "total_num_tokens": 752291166.0, "z_loss": 0.0007438163738697767 }, { "copy_logits_max": -0.7073442935943604, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.0, "epoch": 0.566964513658412, "gen_logits_max": 5.917855262756348, "gen_logits_mean": -12.505426406860352, "gen_logits_min": -24.7799072265625, "gen_logits_std": 2.6954824924468994, "gen_loss": 0.2801136076450348, "grad_norm": 0.41875392670906453, "learning_rate": 2.7423157894736844e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9941289126873016, "mean_gen_accuracy": 0.8730354905128479, "mean_token_accuracy": 0.900739774107933, "num_tokens": 752523862.0, "sample_num_tokens": 8071.0, "step": 2776, "total_num_tokens": 752556146.0, "z_loss": 0.0007525807595811784 }, { "copy_logits_max": -0.9250429272651672, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.6875, "epoch": 0.5671687515956089, "gen_logits_max": 5.112311840057373, "gen_logits_mean": -14.693108558654785, "gen_logits_min": -26.326581954956055, "gen_logits_std": 2.5869336128234863, "gen_loss": 0.3298584222793579, "grad_norm": 0.41982145597513093, "learning_rate": 2.7421894736842105e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9935309439897537, "mean_gen_accuracy": 0.8710549175739288, "mean_token_accuracy": 0.8972358256578445, "num_tokens": 752796617.0, "sample_num_tokens": 7854.25, "step": 2777, "total_num_tokens": 752828034.0, "z_loss": 0.0007597809308208525 }, { "copy_logits_max": -0.4193408191204071, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.3125, "epoch": 0.5673729895328057, "gen_logits_max": 4.330336570739746, "gen_logits_mean": -13.568437576293945, "gen_logits_min": -25.40772819519043, "gen_logits_std": 2.5943822860717773, "gen_loss": 0.2985510230064392, "grad_norm": 0.46450899254806277, "learning_rate": 2.742063157894737e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9942243099212646, "mean_gen_accuracy": 0.8583831042051315, "mean_token_accuracy": 0.8931578993797302, "num_tokens": 753061777.0, "sample_num_tokens": 7671.75, "step": 2778, "total_num_tokens": 753092464.0, "z_loss": 0.0007379999151453376 }, { "copy_logits_max": 0.5830395221710205, "copy_logits_min": -562500032.0, "copy_num_tokens": 699.25, "epoch": 0.5675772274700025, "gen_logits_max": 5.314795970916748, "gen_logits_mean": -13.497868537902832, "gen_logits_min": -25.354562759399414, "gen_logits_std": 2.583526849746704, "gen_loss": 0.32836082577705383, "grad_norm": 0.41876116442495437, "learning_rate": 2.741936842105263e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9937792271375656, "mean_gen_accuracy": 0.8674463629722595, "mean_token_accuracy": 0.9007516205310822, "num_tokens": 753347263.0, "sample_num_tokens": 10489.25, "step": 2779, "total_num_tokens": 753389220.0, "z_loss": 0.0008737021125853062 }, { "copy_logits_max": 0.1806541234254837, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.0, "epoch": 0.5677814654071994, "gen_logits_max": 5.98792839050293, "gen_logits_mean": -12.587857246398926, "gen_logits_min": -24.31857681274414, "gen_logits_std": 2.575901508331299, "gen_loss": 0.33416399359703064, "grad_norm": 0.4432743965022921, "learning_rate": 2.7418105263157895e-05, "loss": 0.3305, "mean_copy_accuracy": 0.9930663555860519, "mean_gen_accuracy": 0.8639621734619141, "mean_token_accuracy": 0.8922456651926041, "num_tokens": 753625728.0, "sample_num_tokens": 7992.5, "step": 2780, "total_num_tokens": 753657698.0, "z_loss": 0.0007990440935827792 }, { "copy_logits_max": -2.232724189758301, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.5, "epoch": 0.5679857033443962, "gen_logits_max": 5.051916599273682, "gen_logits_mean": -14.524883270263672, "gen_logits_min": -25.785194396972656, "gen_logits_std": 2.5721845626831055, "gen_loss": 0.33252662420272827, "grad_norm": 0.4630779593654232, "learning_rate": 2.741684210526316e-05, "loss": 0.3374, "mean_copy_accuracy": 0.992290809750557, "mean_gen_accuracy": 0.8547854125499725, "mean_token_accuracy": 0.8897867649793625, "num_tokens": 753916560.0, "sample_num_tokens": 8359.5, "step": 2781, "total_num_tokens": 753949998.0, "z_loss": 0.0007818914018571377 }, { "copy_logits_max": 0.0583224892616272, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 0.5681899412815931, "gen_logits_max": 6.734522342681885, "gen_logits_mean": -12.679984092712402, "gen_logits_min": -25.07740020751953, "gen_logits_std": 2.6323981285095215, "gen_loss": 0.37963879108428955, "grad_norm": 0.44591147036591827, "learning_rate": 2.741557894736842e-05, "loss": 0.3308, "mean_copy_accuracy": 0.9944809526205063, "mean_gen_accuracy": 0.8575648963451385, "mean_token_accuracy": 0.8915086388587952, "num_tokens": 754183413.0, "sample_num_tokens": 7613.75, "step": 2782, "total_num_tokens": 754213868.0, "z_loss": 0.0009076484711840749 }, { "copy_logits_max": 0.7195663452148438, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.625, "epoch": 0.5683941792187899, "gen_logits_max": 5.314772605895996, "gen_logits_mean": -13.164671897888184, "gen_logits_min": -24.922069549560547, "gen_logits_std": 2.599581003189087, "gen_loss": 0.3673846125602722, "grad_norm": 0.5230130066105335, "learning_rate": 2.7414315789473688e-05, "loss": 0.328, "mean_copy_accuracy": 0.9942738711833954, "mean_gen_accuracy": 0.8606937378644943, "mean_token_accuracy": 0.8925386220216751, "num_tokens": 754443349.0, "sample_num_tokens": 7925.75, "step": 2783, "total_num_tokens": 754475052.0, "z_loss": 0.0009685510885901749 }, { "copy_logits_max": 1.3445754051208496, "copy_logits_min": -750000064.0, "copy_num_tokens": 718.125, "epoch": 0.5685984171559867, "gen_logits_max": 4.778594017028809, "gen_logits_mean": -14.112048149108887, "gen_logits_min": -26.445688247680664, "gen_logits_std": 2.6342456340789795, "gen_loss": 0.29105642437934875, "grad_norm": 0.4338315931906727, "learning_rate": 2.741305263157895e-05, "loss": 0.3019, "mean_copy_accuracy": 0.9946606457233429, "mean_gen_accuracy": 0.8663344979286194, "mean_token_accuracy": 0.9028974920511246, "num_tokens": 754749085.0, "sample_num_tokens": 9208.25, "step": 2784, "total_num_tokens": 754785918.0, "z_loss": 0.0008742815116420388 }, { "copy_logits_max": -1.1470075845718384, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.3125, "epoch": 0.5688026550931835, "gen_logits_max": 5.611292362213135, "gen_logits_mean": -14.225065231323242, "gen_logits_min": -25.88711929321289, "gen_logits_std": 2.619575262069702, "gen_loss": 0.301130473613739, "grad_norm": 0.42554827555411945, "learning_rate": 2.7411789473684213e-05, "loss": 0.299, "mean_copy_accuracy": 0.9934339225292206, "mean_gen_accuracy": 0.8683187663555145, "mean_token_accuracy": 0.9006897211074829, "num_tokens": 755029687.0, "sample_num_tokens": 8325.75, "step": 2785, "total_num_tokens": 755062990.0, "z_loss": 0.0007764155743643641 }, { "copy_logits_max": -0.5193638801574707, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5625, "epoch": 0.5690068930303804, "gen_logits_max": 5.566082954406738, "gen_logits_mean": -13.672210693359375, "gen_logits_min": -25.98458480834961, "gen_logits_std": 2.630925416946411, "gen_loss": 0.3445085287094116, "grad_norm": 0.4547216273811181, "learning_rate": 2.7410526315789474e-05, "loss": 0.3333, "mean_copy_accuracy": 0.9944040328264236, "mean_gen_accuracy": 0.8571218103170395, "mean_token_accuracy": 0.8885617852210999, "num_tokens": 755297907.0, "sample_num_tokens": 7352.75, "step": 2786, "total_num_tokens": 755327318.0, "z_loss": 0.0008372595766559243 }, { "copy_logits_max": 0.19626986980438232, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.875, "epoch": 0.5692111309675773, "gen_logits_max": 6.412535667419434, "gen_logits_mean": -12.704000473022461, "gen_logits_min": -24.869508743286133, "gen_logits_std": 2.629201889038086, "gen_loss": 0.34113809466362, "grad_norm": 0.434966595658001, "learning_rate": 2.7409263157894738e-05, "loss": 0.336, "mean_copy_accuracy": 0.993779793381691, "mean_gen_accuracy": 0.8568453341722488, "mean_token_accuracy": 0.8909551501274109, "num_tokens": 755561512.0, "sample_num_tokens": 9271.0, "step": 2787, "total_num_tokens": 755598596.0, "z_loss": 0.0008527010213583708 }, { "copy_logits_max": 0.7463886737823486, "copy_logits_min": -687499968.0, "copy_num_tokens": 570.75, "epoch": 0.5694153689047741, "gen_logits_max": 5.976559162139893, "gen_logits_mean": -13.414191246032715, "gen_logits_min": -25.399492263793945, "gen_logits_std": 2.6350440979003906, "gen_loss": 0.301514208316803, "grad_norm": 0.4652807039662957, "learning_rate": 2.7408e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9924981445074081, "mean_gen_accuracy": 0.8619712889194489, "mean_token_accuracy": 0.8948578387498856, "num_tokens": 755812422.0, "sample_num_tokens": 9412.0, "step": 2788, "total_num_tokens": 755850070.0, "z_loss": 0.0007588597945868969 }, { "copy_logits_max": -1.2986068725585938, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.125, "epoch": 0.5696196068419709, "gen_logits_max": 5.908647537231445, "gen_logits_mean": -13.309019088745117, "gen_logits_min": -25.028852462768555, "gen_logits_std": 2.6271066665649414, "gen_loss": 0.3473685383796692, "grad_norm": 0.43454207906428455, "learning_rate": 2.7406736842105263e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9924319386482239, "mean_gen_accuracy": 0.8672482520341873, "mean_token_accuracy": 0.8964214771986008, "num_tokens": 756087144.0, "sample_num_tokens": 7438.5, "step": 2789, "total_num_tokens": 756116898.0, "z_loss": 0.0007538063218817115 }, { "copy_logits_max": -2.5377964973449707, "copy_logits_min": -687500032.0, "copy_num_tokens": 373.875, "epoch": 0.5698238447791677, "gen_logits_max": 6.224395275115967, "gen_logits_mean": -13.1177339553833, "gen_logits_min": -25.15774917602539, "gen_logits_std": 2.6310172080993652, "gen_loss": 0.32371020317077637, "grad_norm": 0.47326472733457503, "learning_rate": 2.7405473684210524e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9927418828010559, "mean_gen_accuracy": 0.866096630692482, "mean_token_accuracy": 0.8971230238676071, "num_tokens": 756356070.0, "sample_num_tokens": 8094.5, "step": 2790, "total_num_tokens": 756388448.0, "z_loss": 0.0006921482272446156 }, { "copy_logits_max": -2.2418441772460938, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.375, "epoch": 0.5700280827163645, "gen_logits_max": 5.154188632965088, "gen_logits_mean": -14.498273849487305, "gen_logits_min": -26.066791534423828, "gen_logits_std": 2.5987586975097656, "gen_loss": 0.31552278995513916, "grad_norm": 0.4807484693660519, "learning_rate": 2.7404210526315792e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9926664531230927, "mean_gen_accuracy": 0.8625026792287827, "mean_token_accuracy": 0.8949754238128662, "num_tokens": 756617868.0, "sample_num_tokens": 7420.0, "step": 2791, "total_num_tokens": 756647548.0, "z_loss": 0.0007196979713626206 }, { "copy_logits_max": 0.6515114307403564, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.0625, "epoch": 0.5702323206535614, "gen_logits_max": 6.627834320068359, "gen_logits_mean": -12.92760944366455, "gen_logits_min": -24.739974975585938, "gen_logits_std": 2.646083354949951, "gen_loss": 0.3730800747871399, "grad_norm": 0.44628078278720384, "learning_rate": 2.7402947368421053e-05, "loss": 0.3319, "mean_copy_accuracy": 0.994386151432991, "mean_gen_accuracy": 0.8597119599580765, "mean_token_accuracy": 0.8910987675189972, "num_tokens": 756901638.0, "sample_num_tokens": 8243.5, "step": 2792, "total_num_tokens": 756934612.0, "z_loss": 0.0008554186206310987 }, { "copy_logits_max": -0.810710072517395, "copy_logits_min": -750000000.0, "copy_num_tokens": 637.125, "epoch": 0.5704365585907583, "gen_logits_max": 5.605514049530029, "gen_logits_mean": -13.276602745056152, "gen_logits_min": -25.451160430908203, "gen_logits_std": 2.6969528198242188, "gen_loss": 0.2695150375366211, "grad_norm": 0.42858696036276106, "learning_rate": 2.7401684210526317e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9949116706848145, "mean_gen_accuracy": 0.8650786131620407, "mean_token_accuracy": 0.8987582325935364, "num_tokens": 757196563.0, "sample_num_tokens": 9902.25, "step": 2793, "total_num_tokens": 757236172.0, "z_loss": 0.0006682457751594484 }, { "copy_logits_max": -0.353499174118042, "copy_logits_min": -687500032.0, "copy_num_tokens": 578.375, "epoch": 0.5706407965279551, "gen_logits_max": 5.650788307189941, "gen_logits_mean": -13.016326904296875, "gen_logits_min": -25.561771392822266, "gen_logits_std": 2.6532795429229736, "gen_loss": 0.29931938648223877, "grad_norm": 0.46243695812954844, "learning_rate": 2.740042105263158e-05, "loss": 0.3088, "mean_copy_accuracy": 0.994865283370018, "mean_gen_accuracy": 0.8631486892700195, "mean_token_accuracy": 0.8988883644342422, "num_tokens": 757495920.0, "sample_num_tokens": 9204.0, "step": 2794, "total_num_tokens": 757532736.0, "z_loss": 0.000705819868016988 }, { "copy_logits_max": -1.5415736436843872, "copy_logits_min": -750000064.0, "copy_num_tokens": 535.5, "epoch": 0.5708450344651519, "gen_logits_max": 5.9910430908203125, "gen_logits_mean": -13.707801818847656, "gen_logits_min": -25.719743728637695, "gen_logits_std": 2.6083106994628906, "gen_loss": 0.3366859257221222, "grad_norm": 0.4511265725595573, "learning_rate": 2.7399157894736843e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9934740662574768, "mean_gen_accuracy": 0.8645536452531815, "mean_token_accuracy": 0.8969639986753464, "num_tokens": 757766611.0, "sample_num_tokens": 8744.25, "step": 2795, "total_num_tokens": 757801588.0, "z_loss": 0.0007765558548271656 }, { "copy_logits_max": -1.471003532409668, "copy_logits_min": -750000064.0, "copy_num_tokens": 405.5625, "epoch": 0.5710492724023487, "gen_logits_max": 5.770676612854004, "gen_logits_mean": -13.714218139648438, "gen_logits_min": -25.44415855407715, "gen_logits_std": 2.6075615882873535, "gen_loss": 0.3573440909385681, "grad_norm": 0.4292972519485093, "learning_rate": 2.7397894736842107e-05, "loss": 0.3308, "mean_copy_accuracy": 0.992861732840538, "mean_gen_accuracy": 0.861011415719986, "mean_token_accuracy": 0.8918497562408447, "num_tokens": 758050529.0, "sample_num_tokens": 9060.75, "step": 2796, "total_num_tokens": 758086772.0, "z_loss": 0.0007856243755668402 }, { "copy_logits_max": 0.022591084241867065, "copy_logits_min": -687500032.0, "copy_num_tokens": 542.625, "epoch": 0.5712535103395455, "gen_logits_max": 5.071807861328125, "gen_logits_mean": -13.689626693725586, "gen_logits_min": -26.080631256103516, "gen_logits_std": 2.6184957027435303, "gen_loss": 0.32233402132987976, "grad_norm": 0.4552329844111485, "learning_rate": 2.7396631578947368e-05, "loss": 0.3353, "mean_copy_accuracy": 0.9945606738328934, "mean_gen_accuracy": 0.8545577675104141, "mean_token_accuracy": 0.8891500979661942, "num_tokens": 758325578.0, "sample_num_tokens": 8864.0, "step": 2797, "total_num_tokens": 758361034.0, "z_loss": 0.0008505657315254211 }, { "copy_logits_max": 1.0793246030807495, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.625, "epoch": 0.5714577482767424, "gen_logits_max": 6.432332992553711, "gen_logits_mean": -12.80528450012207, "gen_logits_min": -24.935340881347656, "gen_logits_std": 2.620398759841919, "gen_loss": 0.3158210515975952, "grad_norm": 0.40611157138979737, "learning_rate": 2.7395368421052632e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9946929067373276, "mean_gen_accuracy": 0.8685994297266006, "mean_token_accuracy": 0.8953976631164551, "num_tokens": 758594588.0, "sample_num_tokens": 9497.0, "step": 2798, "total_num_tokens": 758632576.0, "z_loss": 0.0007392471889033914 }, { "copy_logits_max": -1.0855824947357178, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0625, "epoch": 0.5716619862139393, "gen_logits_max": 5.75213623046875, "gen_logits_mean": -13.947131156921387, "gen_logits_min": -25.8974609375, "gen_logits_std": 2.6289072036743164, "gen_loss": 0.3337882161140442, "grad_norm": 0.46948224043863473, "learning_rate": 2.7394105263157896e-05, "loss": 0.303, "mean_copy_accuracy": 0.9928438365459442, "mean_gen_accuracy": 0.8697586506605148, "mean_token_accuracy": 0.8991992920637131, "num_tokens": 758839874.0, "sample_num_tokens": 7777.0, "step": 2799, "total_num_tokens": 758870982.0, "z_loss": 0.0007517002522945404 }, { "copy_logits_max": -0.378947913646698, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.8125, "epoch": 0.5718662241511361, "gen_logits_max": 4.906159400939941, "gen_logits_mean": -13.984244346618652, "gen_logits_min": -26.170549392700195, "gen_logits_std": 2.671839714050293, "gen_loss": 0.3052492141723633, "grad_norm": 0.4800625732965865, "learning_rate": 2.739284210526316e-05, "loss": 0.3451, "mean_copy_accuracy": 0.9946472495794296, "mean_gen_accuracy": 0.8495980948209763, "mean_token_accuracy": 0.8851442933082581, "num_tokens": 759114465.0, "sample_num_tokens": 9909.75, "step": 2800, "total_num_tokens": 759154104.0, "z_loss": 0.0007602917612530291 }, { "copy_logits_max": 1.5782737731933594, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.3125, "epoch": 0.5720704620883329, "gen_logits_max": 5.91721773147583, "gen_logits_mean": -12.078603744506836, "gen_logits_min": -24.583953857421875, "gen_logits_std": 2.652665615081787, "gen_loss": 0.3032185137271881, "grad_norm": 0.4299293707245322, "learning_rate": 2.739157894736842e-05, "loss": 0.3416, "mean_copy_accuracy": 0.9948859810829163, "mean_gen_accuracy": 0.8558488488197327, "mean_token_accuracy": 0.8868647217750549, "num_tokens": 759380141.0, "sample_num_tokens": 9066.25, "step": 2801, "total_num_tokens": 759416406.0, "z_loss": 0.000772243132814765 }, { "copy_logits_max": -1.1099439859390259, "copy_logits_min": -750000064.0, "copy_num_tokens": 274.8125, "epoch": 0.5722747000255297, "gen_logits_max": 5.862959861755371, "gen_logits_mean": -13.379775047302246, "gen_logits_min": -24.997211456298828, "gen_logits_std": 2.6083288192749023, "gen_loss": 0.33718448877334595, "grad_norm": 0.4419091970136465, "learning_rate": 2.7390315789473686e-05, "loss": 0.3295, "mean_copy_accuracy": 0.9947179704904556, "mean_gen_accuracy": 0.8596453964710236, "mean_token_accuracy": 0.8931587636470795, "num_tokens": 759640870.0, "sample_num_tokens": 6823.0, "step": 2802, "total_num_tokens": 759668162.0, "z_loss": 0.0007566210115328431 }, { "copy_logits_max": -0.14268162846565247, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.4375, "epoch": 0.5724789379627265, "gen_logits_max": 5.889439582824707, "gen_logits_mean": -13.380624771118164, "gen_logits_min": -25.1308536529541, "gen_logits_std": 2.645765542984009, "gen_loss": 0.31641051173210144, "grad_norm": 0.43733561097913737, "learning_rate": 2.7389052631578947e-05, "loss": 0.3223, "mean_copy_accuracy": 0.9933807700872421, "mean_gen_accuracy": 0.8626837730407715, "mean_token_accuracy": 0.8925897628068924, "num_tokens": 759901156.0, "sample_num_tokens": 8583.5, "step": 2803, "total_num_tokens": 759935490.0, "z_loss": 0.0007595450151711702 }, { "copy_logits_max": 0.4461395740509033, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.625, "epoch": 0.5726831758999235, "gen_logits_max": 5.669979095458984, "gen_logits_mean": -13.437356948852539, "gen_logits_min": -25.765113830566406, "gen_logits_std": 2.676910400390625, "gen_loss": 0.29906362295150757, "grad_norm": 0.441124300778772, "learning_rate": 2.738778947368421e-05, "loss": 0.3107, "mean_copy_accuracy": 0.992305725812912, "mean_gen_accuracy": 0.8684126436710358, "mean_token_accuracy": 0.8963532447814941, "num_tokens": 760180501.0, "sample_num_tokens": 7096.25, "step": 2804, "total_num_tokens": 760208886.0, "z_loss": 0.0006886715418659151 }, { "copy_logits_max": 0.7962958812713623, "copy_logits_min": -750000000.0, "copy_num_tokens": 678.0, "epoch": 0.5728874138371203, "gen_logits_max": 5.422185897827148, "gen_logits_mean": -12.959365844726562, "gen_logits_min": -25.799449920654297, "gen_logits_std": 2.711909770965576, "gen_loss": 0.2610618472099304, "grad_norm": 0.4421994088650022, "learning_rate": 2.7386526315789472e-05, "loss": 0.317, "mean_copy_accuracy": 0.9953131824731827, "mean_gen_accuracy": 0.860098123550415, "mean_token_accuracy": 0.8967732489109039, "num_tokens": 760462436.0, "sample_num_tokens": 9287.0, "step": 2805, "total_num_tokens": 760499584.0, "z_loss": 0.0007118151988834143 }, { "copy_logits_max": -0.00914376974105835, "copy_logits_min": -687500032.0, "copy_num_tokens": 531.4375, "epoch": 0.5730916517743171, "gen_logits_max": 5.58919620513916, "gen_logits_mean": -13.084747314453125, "gen_logits_min": -25.917940139770508, "gen_logits_std": 2.682190418243408, "gen_loss": 0.31818321347236633, "grad_norm": 0.46362040161960166, "learning_rate": 2.7385263157894736e-05, "loss": 0.3306, "mean_copy_accuracy": 0.9939831048250198, "mean_gen_accuracy": 0.8556234389543533, "mean_token_accuracy": 0.8916224241256714, "num_tokens": 760718717.0, "sample_num_tokens": 8086.75, "step": 2806, "total_num_tokens": 760751064.0, "z_loss": 0.0008118137484416366 }, { "copy_logits_max": -1.2118074893951416, "copy_logits_min": -750000128.0, "copy_num_tokens": 459.25, "epoch": 0.5732958897115139, "gen_logits_max": 5.643327236175537, "gen_logits_mean": -13.633402824401855, "gen_logits_min": -26.080644607543945, "gen_logits_std": 2.6890182495117188, "gen_loss": 0.32493510842323303, "grad_norm": 0.4388062123640929, "learning_rate": 2.7383999999999997e-05, "loss": 0.3264, "mean_copy_accuracy": 0.9932603389024734, "mean_gen_accuracy": 0.8628241717815399, "mean_token_accuracy": 0.8926667273044586, "num_tokens": 760986569.0, "sample_num_tokens": 7799.25, "step": 2807, "total_num_tokens": 761017766.0, "z_loss": 0.0007573176990263164 }, { "copy_logits_max": -0.6563732624053955, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.375, "epoch": 0.5735001276487107, "gen_logits_max": 4.629899501800537, "gen_logits_mean": -14.250394821166992, "gen_logits_min": -26.729589462280273, "gen_logits_std": 2.663008213043213, "gen_loss": 0.2869707942008972, "grad_norm": 0.4028401444641936, "learning_rate": 2.7382736842105265e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9944491684436798, "mean_gen_accuracy": 0.8663344085216522, "mean_token_accuracy": 0.9016721099615097, "num_tokens": 761280946.0, "sample_num_tokens": 8904.5, "step": 2808, "total_num_tokens": 761316564.0, "z_loss": 0.000679783639498055 }, { "copy_logits_max": 0.07422298192977905, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.9375, "epoch": 0.5737043655859075, "gen_logits_max": 5.770932674407959, "gen_logits_mean": -13.235923767089844, "gen_logits_min": -25.823320388793945, "gen_logits_std": 2.6981778144836426, "gen_loss": 0.31055599451065063, "grad_norm": 0.4526590633992903, "learning_rate": 2.738147368421053e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9937339276075363, "mean_gen_accuracy": 0.8579731434583664, "mean_token_accuracy": 0.8898678123950958, "num_tokens": 761555704.0, "sample_num_tokens": 8519.0, "step": 2809, "total_num_tokens": 761589780.0, "z_loss": 0.0007080830982886255 }, { "copy_logits_max": -0.2727399468421936, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.5625, "epoch": 0.5739086035231045, "gen_logits_max": 5.136114120483398, "gen_logits_mean": -14.018348693847656, "gen_logits_min": -26.581161499023438, "gen_logits_std": 2.6707396507263184, "gen_loss": 0.30861014127731323, "grad_norm": 0.4259103296773783, "learning_rate": 2.738021052631579e-05, "loss": 0.3, "mean_copy_accuracy": 0.9936847984790802, "mean_gen_accuracy": 0.8689715415239334, "mean_token_accuracy": 0.9005894809961319, "num_tokens": 761828507.0, "sample_num_tokens": 9120.75, "step": 2810, "total_num_tokens": 761864990.0, "z_loss": 0.0006610602722503245 }, { "copy_logits_max": 1.8903337717056274, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.8125, "epoch": 0.5741128414603013, "gen_logits_max": 4.983109474182129, "gen_logits_mean": -13.600475311279297, "gen_logits_min": -26.684402465820312, "gen_logits_std": 2.7166357040405273, "gen_loss": 0.2715612053871155, "grad_norm": 0.47016114273521975, "learning_rate": 2.7378947368421055e-05, "loss": 0.305, "mean_copy_accuracy": 0.9934600293636322, "mean_gen_accuracy": 0.8632032126188278, "mean_token_accuracy": 0.8981577157974243, "num_tokens": 762082236.0, "sample_num_tokens": 9024.0, "step": 2811, "total_num_tokens": 762118332.0, "z_loss": 0.0006648855633102357 }, { "copy_logits_max": -1.8745100498199463, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.1875, "epoch": 0.5743170793974981, "gen_logits_max": 6.1403350830078125, "gen_logits_mean": -13.419391632080078, "gen_logits_min": -25.69179916381836, "gen_logits_std": 2.6933963298797607, "gen_loss": 0.3202401399612427, "grad_norm": 0.4660293328913485, "learning_rate": 2.7377684210526316e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9952703267335892, "mean_gen_accuracy": 0.8583014905452728, "mean_token_accuracy": 0.8940356969833374, "num_tokens": 762359821.0, "sample_num_tokens": 7523.25, "step": 2812, "total_num_tokens": 762389914.0, "z_loss": 0.0007798082078807056 }, { "copy_logits_max": -2.0668036937713623, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.9375, "epoch": 0.5745213173346949, "gen_logits_max": 5.202069282531738, "gen_logits_mean": -14.848179817199707, "gen_logits_min": -26.998647689819336, "gen_logits_std": 2.6541571617126465, "gen_loss": 0.30674833059310913, "grad_norm": 0.44882166471078133, "learning_rate": 2.737642105263158e-05, "loss": 0.3061, "mean_copy_accuracy": 0.9945276826620102, "mean_gen_accuracy": 0.8668603152036667, "mean_token_accuracy": 0.8998315781354904, "num_tokens": 762620156.0, "sample_num_tokens": 7430.0, "step": 2813, "total_num_tokens": 762649876.0, "z_loss": 0.0007517783669754863 }, { "copy_logits_max": -0.6445552110671997, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.125, "epoch": 0.5747255552718917, "gen_logits_max": 5.719526767730713, "gen_logits_mean": -14.180826187133789, "gen_logits_min": -26.453760147094727, "gen_logits_std": 2.676074504852295, "gen_loss": 0.32528504729270935, "grad_norm": 0.47444425124917255, "learning_rate": 2.737515789473684e-05, "loss": 0.3196, "mean_copy_accuracy": 0.9937489330768585, "mean_gen_accuracy": 0.8653497993946075, "mean_token_accuracy": 0.8932964205741882, "num_tokens": 762880045.0, "sample_num_tokens": 9142.25, "step": 2814, "total_num_tokens": 762916614.0, "z_loss": 0.0007818980957381427 }, { "copy_logits_max": -1.5829834938049316, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.5625, "epoch": 0.5749297932090885, "gen_logits_max": 6.677315711975098, "gen_logits_mean": -12.842506408691406, "gen_logits_min": -24.914085388183594, "gen_logits_std": 2.6522326469421387, "gen_loss": 0.3571673631668091, "grad_norm": 0.46348932303636553, "learning_rate": 2.7373894736842105e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9935504645109177, "mean_gen_accuracy": 0.8626797646284103, "mean_token_accuracy": 0.8990356177091599, "num_tokens": 763176931.0, "sample_num_tokens": 8416.25, "step": 2815, "total_num_tokens": 763210596.0, "z_loss": 0.0008156842086464167 }, { "copy_logits_max": 0.5419582724571228, "copy_logits_min": -687500032.0, "copy_num_tokens": 809.4375, "epoch": 0.5751340311462855, "gen_logits_max": 5.079129219055176, "gen_logits_mean": -12.890256881713867, "gen_logits_min": -25.641286849975586, "gen_logits_std": 2.7107396125793457, "gen_loss": 0.24961605668067932, "grad_norm": 0.4395858771911084, "learning_rate": 2.737263157894737e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9943339377641678, "mean_gen_accuracy": 0.8733142465353012, "mean_token_accuracy": 0.9045626074075699, "num_tokens": 763470213.0, "sample_num_tokens": 10761.75, "step": 2816, "total_num_tokens": 763513260.0, "z_loss": 0.000780136208049953 }, { "copy_logits_max": -1.4774589538574219, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.3125, "epoch": 0.5753382690834823, "gen_logits_max": 5.63074254989624, "gen_logits_mean": -14.621665954589844, "gen_logits_min": -26.435359954833984, "gen_logits_std": 2.6383273601531982, "gen_loss": 0.2903655767440796, "grad_norm": 0.5097605158391927, "learning_rate": 2.7371368421052634e-05, "loss": 0.3204, "mean_copy_accuracy": 0.993911474943161, "mean_gen_accuracy": 0.8662697970867157, "mean_token_accuracy": 0.8939346224069595, "num_tokens": 763745643.0, "sample_num_tokens": 9315.25, "step": 2817, "total_num_tokens": 763782904.0, "z_loss": 0.0008646511705592275 }, { "copy_logits_max": -0.6268818974494934, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.4375, "epoch": 0.5755425070206791, "gen_logits_max": 5.32488489151001, "gen_logits_mean": -14.088668823242188, "gen_logits_min": -26.276735305786133, "gen_logits_std": 2.6481096744537354, "gen_loss": 0.3370300829410553, "grad_norm": 0.4868412987604309, "learning_rate": 2.7370105263157895e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9944998174905777, "mean_gen_accuracy": 0.857800230383873, "mean_token_accuracy": 0.8942602872848511, "num_tokens": 764047919.0, "sample_num_tokens": 8997.75, "step": 2818, "total_num_tokens": 764083910.0, "z_loss": 0.0008932645432651043 }, { "copy_logits_max": -2.140028238296509, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.75, "epoch": 0.5757467449578759, "gen_logits_max": 6.013406276702881, "gen_logits_mean": -14.038213729858398, "gen_logits_min": -25.98595428466797, "gen_logits_std": 2.6621451377868652, "gen_loss": 0.31351375579833984, "grad_norm": 0.47474270449876027, "learning_rate": 2.736884210526316e-05, "loss": 0.3055, "mean_copy_accuracy": 0.99305759370327, "mean_gen_accuracy": 0.8693017959594727, "mean_token_accuracy": 0.8995592296123505, "num_tokens": 764360361.0, "sample_num_tokens": 9430.75, "step": 2819, "total_num_tokens": 764398084.0, "z_loss": 0.0007652536733075976 }, { "copy_logits_max": -1.4894263744354248, "copy_logits_min": -750000000.0, "copy_num_tokens": 627.3125, "epoch": 0.5759509828950727, "gen_logits_max": 6.356420040130615, "gen_logits_mean": -12.463006973266602, "gen_logits_min": -25.024585723876953, "gen_logits_std": 2.7187845706939697, "gen_loss": 0.28998008370399475, "grad_norm": 0.5340230681328805, "learning_rate": 2.736757894736842e-05, "loss": 0.316, "mean_copy_accuracy": 0.9942624270915985, "mean_gen_accuracy": 0.8601476848125458, "mean_token_accuracy": 0.8942615687847137, "num_tokens": 764624694.0, "sample_num_tokens": 9577.0, "step": 2820, "total_num_tokens": 764663002.0, "z_loss": 0.0007131964666768909 }, { "copy_logits_max": 0.12846755981445312, "copy_logits_min": -625000064.0, "copy_num_tokens": 433.5, "epoch": 0.5761552208322696, "gen_logits_max": 6.730580806732178, "gen_logits_mean": -12.338194847106934, "gen_logits_min": -24.663372039794922, "gen_logits_std": 2.713367462158203, "gen_loss": 0.3472188115119934, "grad_norm": 0.49168740244825, "learning_rate": 2.7366315789473684e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9929239153862, "mean_gen_accuracy": 0.8586226254701614, "mean_token_accuracy": 0.8892044425010681, "num_tokens": 764880780.0, "sample_num_tokens": 8296.0, "step": 2821, "total_num_tokens": 764913964.0, "z_loss": 0.0009274228359572589 }, { "copy_logits_max": -1.899490237236023, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.9375, "epoch": 0.5763594587694665, "gen_logits_max": 4.647176265716553, "gen_logits_mean": -14.982046127319336, "gen_logits_min": -27.0463809967041, "gen_logits_std": 2.6343250274658203, "gen_loss": 0.31546953320503235, "grad_norm": 0.4890709292560688, "learning_rate": 2.736505263157895e-05, "loss": 0.3313, "mean_copy_accuracy": 0.995039701461792, "mean_gen_accuracy": 0.858249306678772, "mean_token_accuracy": 0.8918433487415314, "num_tokens": 765152989.0, "sample_num_tokens": 8426.25, "step": 2822, "total_num_tokens": 765186694.0, "z_loss": 0.0008226775098592043 }, { "copy_logits_max": -2.874054431915283, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.8125, "epoch": 0.5765636967066633, "gen_logits_max": 6.272432327270508, "gen_logits_mean": -14.306215286254883, "gen_logits_min": -26.075111389160156, "gen_logits_std": 2.6255624294281006, "gen_loss": 0.30713406205177307, "grad_norm": 0.4701651446300969, "learning_rate": 2.736378947368421e-05, "loss": 0.3276, "mean_copy_accuracy": 0.99491386115551, "mean_gen_accuracy": 0.8584666103124619, "mean_token_accuracy": 0.8925571888685226, "num_tokens": 765430459.0, "sample_num_tokens": 8978.25, "step": 2823, "total_num_tokens": 765466372.0, "z_loss": 0.0007313452661037445 }, { "copy_logits_max": -1.520430564880371, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.875, "epoch": 0.5767679346438601, "gen_logits_max": 6.929766654968262, "gen_logits_mean": -12.526138305664062, "gen_logits_min": -24.787776947021484, "gen_logits_std": 2.716609001159668, "gen_loss": 0.2598247528076172, "grad_norm": 0.49465830635983354, "learning_rate": 2.7362526315789477e-05, "loss": 0.3075, "mean_copy_accuracy": 0.9938458949327469, "mean_gen_accuracy": 0.8667985945940018, "mean_token_accuracy": 0.8989409059286118, "num_tokens": 765731004.0, "sample_num_tokens": 7435.5, "step": 2824, "total_num_tokens": 765760746.0, "z_loss": 0.0006293098558671772 }, { "copy_logits_max": -2.599917411804199, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.75, "epoch": 0.5769721725810569, "gen_logits_max": 6.6229472160339355, "gen_logits_mean": -13.152610778808594, "gen_logits_min": -24.79190444946289, "gen_logits_std": 2.553740978240967, "gen_loss": 0.3099707365036011, "grad_norm": 0.4513495716578764, "learning_rate": 2.7361263157894738e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9938962012529373, "mean_gen_accuracy": 0.8654624968767166, "mean_token_accuracy": 0.8928676396608353, "num_tokens": 765973132.0, "sample_num_tokens": 7716.5, "step": 2825, "total_num_tokens": 766003998.0, "z_loss": 0.0007476532482542098 }, { "copy_logits_max": -1.7310278415679932, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.5, "epoch": 0.5771764105182537, "gen_logits_max": 6.009682655334473, "gen_logits_mean": -14.100200653076172, "gen_logits_min": -26.07216453552246, "gen_logits_std": 2.6741490364074707, "gen_loss": 0.3003401458263397, "grad_norm": 0.5452825116644399, "learning_rate": 2.7360000000000002e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9942775964736938, "mean_gen_accuracy": 0.8636640012264252, "mean_token_accuracy": 0.8991683721542358, "num_tokens": 766257265.0, "sample_num_tokens": 7899.75, "step": 2826, "total_num_tokens": 766288864.0, "z_loss": 0.0007537567289546132 }, { "copy_logits_max": 0.10761603713035583, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.6875, "epoch": 0.5773806484554506, "gen_logits_max": 6.440696716308594, "gen_logits_mean": -12.531190872192383, "gen_logits_min": -25.22681427001953, "gen_logits_std": 2.683933734893799, "gen_loss": 0.3314383625984192, "grad_norm": 0.4960439208408523, "learning_rate": 2.7358736842105263e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9930645674467087, "mean_gen_accuracy": 0.8589258939027786, "mean_token_accuracy": 0.8913383036851883, "num_tokens": 766520701.0, "sample_num_tokens": 7798.75, "step": 2827, "total_num_tokens": 766551896.0, "z_loss": 0.0008083553984761238 }, { "copy_logits_max": -3.4874050617218018, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.4375, "epoch": 0.5775848863926475, "gen_logits_max": 5.721381664276123, "gen_logits_mean": -13.84604263305664, "gen_logits_min": -26.106306076049805, "gen_logits_std": 2.6596078872680664, "gen_loss": 0.2787620425224304, "grad_norm": 0.5954661051545028, "learning_rate": 2.7357473684210528e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9931551814079285, "mean_gen_accuracy": 0.8650783151388168, "mean_token_accuracy": 0.8942180722951889, "num_tokens": 766779182.0, "sample_num_tokens": 8226.0, "step": 2828, "total_num_tokens": 766812086.0, "z_loss": 0.0006374074146151543 }, { "copy_logits_max": -1.4850102663040161, "copy_logits_min": -687500032.0, "copy_num_tokens": 397.0, "epoch": 0.5777891243298443, "gen_logits_max": 5.577096462249756, "gen_logits_mean": -14.04302978515625, "gen_logits_min": -26.355144500732422, "gen_logits_std": 2.6525888442993164, "gen_loss": 0.34735482931137085, "grad_norm": 0.5168497847012458, "learning_rate": 2.735621052631579e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9925723522901535, "mean_gen_accuracy": 0.8616810142993927, "mean_token_accuracy": 0.8963690996170044, "num_tokens": 767050913.0, "sample_num_tokens": 7575.25, "step": 2829, "total_num_tokens": 767081214.0, "z_loss": 0.000821296707727015 }, { "copy_logits_max": -1.3696470260620117, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.6875, "epoch": 0.5779933622670411, "gen_logits_max": 5.478041172027588, "gen_logits_mean": -14.152264595031738, "gen_logits_min": -26.388410568237305, "gen_logits_std": 2.6773383617401123, "gen_loss": 0.3126720190048218, "grad_norm": 0.4395302513070669, "learning_rate": 2.7354947368421053e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9959024637937546, "mean_gen_accuracy": 0.8688904196023941, "mean_token_accuracy": 0.9025922268629074, "num_tokens": 767349584.0, "sample_num_tokens": 9181.5, "step": 2830, "total_num_tokens": 767386310.0, "z_loss": 0.0007977726636454463 }, { "copy_logits_max": -2.2131810188293457, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.25, "epoch": 0.5781976002042379, "gen_logits_max": 4.279423713684082, "gen_logits_mean": -15.043675422668457, "gen_logits_min": -27.14919662475586, "gen_logits_std": 2.6689391136169434, "gen_loss": 0.31089919805526733, "grad_norm": 0.41955960210073867, "learning_rate": 2.7353684210526314e-05, "loss": 0.298, "mean_copy_accuracy": 0.9946900904178619, "mean_gen_accuracy": 0.8679190129041672, "mean_token_accuracy": 0.9011875092983246, "num_tokens": 767645328.0, "sample_num_tokens": 9194.5, "step": 2831, "total_num_tokens": 767682106.0, "z_loss": 0.0006924099288880825 }, { "copy_logits_max": -2.1251602172851562, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.5, "epoch": 0.5784018381414348, "gen_logits_max": 6.360551834106445, "gen_logits_mean": -13.427363395690918, "gen_logits_min": -25.589082717895508, "gen_logits_std": 2.649972438812256, "gen_loss": 0.3830590546131134, "grad_norm": 0.5136272779168116, "learning_rate": 2.735242105263158e-05, "loss": 0.349, "mean_copy_accuracy": 0.9926287680864334, "mean_gen_accuracy": 0.8519012480974197, "mean_token_accuracy": 0.8855956494808197, "num_tokens": 767916942.0, "sample_num_tokens": 7491.5, "step": 2832, "total_num_tokens": 767946908.0, "z_loss": 0.0008070560870692134 }, { "copy_logits_max": -2.351349353790283, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.1875, "epoch": 0.5786060760786316, "gen_logits_max": 6.186236381530762, "gen_logits_mean": -14.17707633972168, "gen_logits_min": -26.106311798095703, "gen_logits_std": 2.631798267364502, "gen_loss": 0.3380855321884155, "grad_norm": 0.46920375679597115, "learning_rate": 2.7351157894736842e-05, "loss": 0.3288, "mean_copy_accuracy": 0.993509978055954, "mean_gen_accuracy": 0.8599124401807785, "mean_token_accuracy": 0.8898723125457764, "num_tokens": 768168443.0, "sample_num_tokens": 7887.25, "step": 2833, "total_num_tokens": 768199992.0, "z_loss": 0.0006961147300899029 }, { "copy_logits_max": -0.8336426019668579, "copy_logits_min": -750000128.0, "copy_num_tokens": 524.0625, "epoch": 0.5788103140158284, "gen_logits_max": 5.323439121246338, "gen_logits_mean": -13.768464088439941, "gen_logits_min": -25.911659240722656, "gen_logits_std": 2.624826431274414, "gen_loss": 0.29882311820983887, "grad_norm": 0.4210496847822744, "learning_rate": 2.7349894736842107e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9946072399616241, "mean_gen_accuracy": 0.8665246963500977, "mean_token_accuracy": 0.8974910378456116, "num_tokens": 768437859.0, "sample_num_tokens": 8749.25, "step": 2834, "total_num_tokens": 768472856.0, "z_loss": 0.0007247474859468639 }, { "copy_logits_max": 0.17083066701889038, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.5, "epoch": 0.5790145519530253, "gen_logits_max": 5.122793197631836, "gen_logits_mean": -13.39609432220459, "gen_logits_min": -25.6124267578125, "gen_logits_std": 2.669022798538208, "gen_loss": 0.33394181728363037, "grad_norm": 0.48155607132740896, "learning_rate": 2.734863157894737e-05, "loss": 0.3384, "mean_copy_accuracy": 0.9938443601131439, "mean_gen_accuracy": 0.85250224173069, "mean_token_accuracy": 0.8882980197668076, "num_tokens": 768710263.0, "sample_num_tokens": 8140.25, "step": 2835, "total_num_tokens": 768742824.0, "z_loss": 0.0008805068209767342 }, { "copy_logits_max": -2.284315586090088, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.0, "epoch": 0.5792187898902221, "gen_logits_max": 4.822422027587891, "gen_logits_mean": -14.117055892944336, "gen_logits_min": -26.057010650634766, "gen_logits_std": 2.612264633178711, "gen_loss": 0.301266074180603, "grad_norm": 0.4928101809460719, "learning_rate": 2.7347368421052632e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9936629682779312, "mean_gen_accuracy": 0.8589118421077728, "mean_token_accuracy": 0.8908496201038361, "num_tokens": 768961980.0, "sample_num_tokens": 9098.0, "step": 2836, "total_num_tokens": 768998372.0, "z_loss": 0.0007398892776109278 }, { "copy_logits_max": -1.4410935640335083, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.8125, "epoch": 0.579423027827419, "gen_logits_max": 5.275071144104004, "gen_logits_mean": -13.267999649047852, "gen_logits_min": -24.84322738647461, "gen_logits_std": 2.6037890911102295, "gen_loss": 0.2992943227291107, "grad_norm": 0.4452495243741973, "learning_rate": 2.7346105263157896e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9944914877414703, "mean_gen_accuracy": 0.8634675145149231, "mean_token_accuracy": 0.8934486508369446, "num_tokens": 769214143.0, "sample_num_tokens": 8073.25, "step": 2837, "total_num_tokens": 769246436.0, "z_loss": 0.0007341929012909532 }, { "copy_logits_max": -0.992268443107605, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.375, "epoch": 0.5796272657646158, "gen_logits_max": 6.563157081604004, "gen_logits_mean": -12.864612579345703, "gen_logits_min": -25.362539291381836, "gen_logits_std": 2.6313185691833496, "gen_loss": 0.33042001724243164, "grad_norm": 0.46311728286070886, "learning_rate": 2.7344842105263157e-05, "loss": 0.3266, "mean_copy_accuracy": 0.9935161471366882, "mean_gen_accuracy": 0.8593531399965286, "mean_token_accuracy": 0.8942886888980865, "num_tokens": 769511372.0, "sample_num_tokens": 8819.0, "step": 2838, "total_num_tokens": 769546648.0, "z_loss": 0.0007735223043709993 }, { "copy_logits_max": -4.353763103485107, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.125, "epoch": 0.5798315037018126, "gen_logits_max": 5.613295555114746, "gen_logits_mean": -13.916187286376953, "gen_logits_min": -26.092445373535156, "gen_logits_std": 2.619142770767212, "gen_loss": 0.307422012090683, "grad_norm": 0.43290765816285526, "learning_rate": 2.734357894736842e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9948574751615524, "mean_gen_accuracy": 0.863517165184021, "mean_token_accuracy": 0.8967712372541428, "num_tokens": 769784319.0, "sample_num_tokens": 7182.25, "step": 2839, "total_num_tokens": 769813048.0, "z_loss": 0.0006863694288767874 }, { "copy_logits_max": -2.2025697231292725, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.0625, "epoch": 0.5800357416390094, "gen_logits_max": 5.718325614929199, "gen_logits_mean": -13.39511489868164, "gen_logits_min": -25.971084594726562, "gen_logits_std": 2.638843059539795, "gen_loss": 0.28252509236335754, "grad_norm": 0.42909060063282983, "learning_rate": 2.7342315789473686e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9938492327928543, "mean_gen_accuracy": 0.8683551847934723, "mean_token_accuracy": 0.8984174430370331, "num_tokens": 770033652.0, "sample_num_tokens": 7769.0, "step": 2840, "total_num_tokens": 770064728.0, "z_loss": 0.000670970301143825 }, { "copy_logits_max": -1.887349247932434, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.875, "epoch": 0.5802399795762063, "gen_logits_max": 6.069397449493408, "gen_logits_mean": -13.636119842529297, "gen_logits_min": -25.652198791503906, "gen_logits_std": 2.6288323402404785, "gen_loss": 0.32152801752090454, "grad_norm": 0.42545085113172865, "learning_rate": 2.734105263157895e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9959232658147812, "mean_gen_accuracy": 0.8598849177360535, "mean_token_accuracy": 0.8931465744972229, "num_tokens": 770287166.0, "sample_num_tokens": 8008.0, "step": 2841, "total_num_tokens": 770319198.0, "z_loss": 0.0006899908185005188 }, { "copy_logits_max": -2.605698347091675, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.375, "epoch": 0.5804442175134031, "gen_logits_max": 4.350675582885742, "gen_logits_mean": -14.529924392700195, "gen_logits_min": -27.062664031982422, "gen_logits_std": 2.650129795074463, "gen_loss": 0.29059338569641113, "grad_norm": 0.4163220536218429, "learning_rate": 2.733978947368421e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9950161874294281, "mean_gen_accuracy": 0.8622606843709946, "mean_token_accuracy": 0.892410397529602, "num_tokens": 770543688.0, "sample_num_tokens": 9359.0, "step": 2842, "total_num_tokens": 770581124.0, "z_loss": 0.00064882478909567 }, { "copy_logits_max": -2.1778738498687744, "copy_logits_min": -625000064.0, "copy_num_tokens": 369.5625, "epoch": 0.5806484554506, "gen_logits_max": 5.764042854309082, "gen_logits_mean": -13.611919403076172, "gen_logits_min": -26.043380737304688, "gen_logits_std": 2.6638550758361816, "gen_loss": 0.33218926191329956, "grad_norm": 0.4641123758721025, "learning_rate": 2.7338526315789475e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9926939457654953, "mean_gen_accuracy": 0.8688887506723404, "mean_token_accuracy": 0.896894633769989, "num_tokens": 770815540.0, "sample_num_tokens": 7393.5, "step": 2843, "total_num_tokens": 770845114.0, "z_loss": 0.0007649904582649469 }, { "copy_logits_max": -2.3569939136505127, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.6875, "epoch": 0.5808526933877968, "gen_logits_max": 5.2028350830078125, "gen_logits_mean": -13.907527923583984, "gen_logits_min": -25.69922637939453, "gen_logits_std": 2.575653076171875, "gen_loss": 0.30580607056617737, "grad_norm": 0.4934729701192468, "learning_rate": 2.7337263157894736e-05, "loss": 0.322, "mean_copy_accuracy": 0.9939814954996109, "mean_gen_accuracy": 0.857425794005394, "mean_token_accuracy": 0.8949746638536453, "num_tokens": 771085465.0, "sample_num_tokens": 8564.25, "step": 2844, "total_num_tokens": 771119722.0, "z_loss": 0.0007832768606022 }, { "copy_logits_max": -2.665731906890869, "copy_logits_min": -625000064.0, "copy_num_tokens": 438.5625, "epoch": 0.5810569313249936, "gen_logits_max": 5.95821475982666, "gen_logits_mean": -13.066019058227539, "gen_logits_min": -24.960098266601562, "gen_logits_std": 2.5358405113220215, "gen_loss": 0.35252565145492554, "grad_norm": 0.4348975316532192, "learning_rate": 2.7336e-05, "loss": 0.3291, "mean_copy_accuracy": 0.9936952441930771, "mean_gen_accuracy": 0.8593379557132721, "mean_token_accuracy": 0.8910192251205444, "num_tokens": 771357604.0, "sample_num_tokens": 8535.5, "step": 2845, "total_num_tokens": 771391746.0, "z_loss": 0.0009070865344256163 }, { "copy_logits_max": -1.6577614545822144, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.25, "epoch": 0.5812611692621904, "gen_logits_max": 5.939395904541016, "gen_logits_mean": -13.340800285339355, "gen_logits_min": -26.145477294921875, "gen_logits_std": 2.6586551666259766, "gen_loss": 0.33445867896080017, "grad_norm": 0.5146482912063708, "learning_rate": 2.733473684210526e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9912175685167313, "mean_gen_accuracy": 0.8635075837373734, "mean_token_accuracy": 0.8940049558877945, "num_tokens": 771629472.0, "sample_num_tokens": 7562.0, "step": 2846, "total_num_tokens": 771659720.0, "z_loss": 0.000846660346724093 }, { "copy_logits_max": -3.0014491081237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.625, "epoch": 0.5814654071993873, "gen_logits_max": 6.236482620239258, "gen_logits_mean": -13.877206802368164, "gen_logits_min": -26.56145668029785, "gen_logits_std": 2.63631010055542, "gen_loss": 0.29095688462257385, "grad_norm": 0.41779348434929314, "learning_rate": 2.7333473684210526e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9945424795150757, "mean_gen_accuracy": 0.8599763959646225, "mean_token_accuracy": 0.896231472492218, "num_tokens": 771905086.0, "sample_num_tokens": 8154.5, "step": 2847, "total_num_tokens": 771937704.0, "z_loss": 0.00066673185210675 }, { "copy_logits_max": -1.6046394109725952, "copy_logits_min": -687500032.0, "copy_num_tokens": 528.0, "epoch": 0.5816696451365841, "gen_logits_max": 5.860180377960205, "gen_logits_mean": -12.917425155639648, "gen_logits_min": -25.625743865966797, "gen_logits_std": 2.616034984588623, "gen_loss": 0.28263935446739197, "grad_norm": 0.453750268039437, "learning_rate": 2.7332210526315793e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9939568340778351, "mean_gen_accuracy": 0.866093099117279, "mean_token_accuracy": 0.8967646956443787, "num_tokens": 772166904.0, "sample_num_tokens": 8947.5, "step": 2848, "total_num_tokens": 772202694.0, "z_loss": 0.0007538755307905376 }, { "copy_logits_max": -0.22898858785629272, "copy_logits_min": -749999936.0, "copy_num_tokens": 630.875, "epoch": 0.581873883073781, "gen_logits_max": 5.702998161315918, "gen_logits_mean": -12.902787208557129, "gen_logits_min": -24.99921989440918, "gen_logits_std": 2.611114025115967, "gen_loss": 0.28044527769088745, "grad_norm": 0.43780274492756843, "learning_rate": 2.7330947368421054e-05, "loss": 0.3149, "mean_copy_accuracy": 0.993355318903923, "mean_gen_accuracy": 0.8686255812644958, "mean_token_accuracy": 0.8974740952253342, "num_tokens": 772453363.0, "sample_num_tokens": 10222.25, "step": 2849, "total_num_tokens": 772494252.0, "z_loss": 0.0007034888840280473 }, { "copy_logits_max": -1.4901478290557861, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.5625, "epoch": 0.5820781210109778, "gen_logits_max": 5.4342522621154785, "gen_logits_mean": -13.717428207397461, "gen_logits_min": -25.958208084106445, "gen_logits_std": 2.5790696144104004, "gen_loss": 0.3175497055053711, "grad_norm": 0.43278471325561446, "learning_rate": 2.732968421052632e-05, "loss": 0.3146, "mean_copy_accuracy": 0.995068222284317, "mean_gen_accuracy": 0.8598249405622482, "mean_token_accuracy": 0.8961535543203354, "num_tokens": 772762612.0, "sample_num_tokens": 7792.0, "step": 2850, "total_num_tokens": 772793780.0, "z_loss": 0.0007818299345672131 }, { "copy_logits_max": -4.472349166870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.375, "epoch": 0.5822823589481746, "gen_logits_max": 4.845232009887695, "gen_logits_mean": -15.733115196228027, "gen_logits_min": -27.422962188720703, "gen_logits_std": 2.5731990337371826, "gen_loss": 0.317654550075531, "grad_norm": 0.4572601438157568, "learning_rate": 2.732842105263158e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9938075095415115, "mean_gen_accuracy": 0.8636951595544815, "mean_token_accuracy": 0.8941031843423843, "num_tokens": 773020687.0, "sample_num_tokens": 7947.25, "step": 2851, "total_num_tokens": 773052476.0, "z_loss": 0.0007132198661565781 }, { "copy_logits_max": -3.7149782180786133, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 0.5824865968853714, "gen_logits_max": 5.747440338134766, "gen_logits_mean": -12.520547866821289, "gen_logits_min": -24.685131072998047, "gen_logits_std": 2.62931227684021, "gen_loss": 0.3416881561279297, "grad_norm": 0.5404112535346115, "learning_rate": 2.7327157894736844e-05, "loss": 0.3316, "mean_copy_accuracy": 0.9904190748929977, "mean_gen_accuracy": 0.8588541895151138, "mean_token_accuracy": 0.8912187665700912, "num_tokens": 773297678.0, "sample_num_tokens": 7643.0, "step": 2852, "total_num_tokens": 773328250.0, "z_loss": 0.0007858950411900878 }, { "copy_logits_max": -3.866459369659424, "copy_logits_min": -687500032.0, "copy_num_tokens": 274.3125, "epoch": 0.5826908348225683, "gen_logits_max": 6.113386154174805, "gen_logits_mean": -14.794450759887695, "gen_logits_min": -26.475292205810547, "gen_logits_std": 2.5946056842803955, "gen_loss": 0.3274667263031006, "grad_norm": 0.42906474007387846, "learning_rate": 2.7325894736842105e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9945991784334183, "mean_gen_accuracy": 0.8676915764808655, "mean_token_accuracy": 0.9006826281547546, "num_tokens": 773585789.0, "sample_num_tokens": 6942.25, "step": 2853, "total_num_tokens": 773613558.0, "z_loss": 0.0007541811792179942 }, { "copy_logits_max": -0.7778392434120178, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.75, "epoch": 0.5828950727597652, "gen_logits_max": 5.815310478210449, "gen_logits_mean": -12.983162879943848, "gen_logits_min": -25.31250762939453, "gen_logits_std": 2.5801596641540527, "gen_loss": 0.33199870586395264, "grad_norm": 0.493005943547327, "learning_rate": 2.732463157894737e-05, "loss": 0.3412, "mean_copy_accuracy": 0.9949212372303009, "mean_gen_accuracy": 0.8530399203300476, "mean_token_accuracy": 0.8870070725679398, "num_tokens": 773835083.0, "sample_num_tokens": 8283.25, "step": 2854, "total_num_tokens": 773868216.0, "z_loss": 0.0007740522851236165 }, { "copy_logits_max": -3.2353475093841553, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.3125, "epoch": 0.583099310696962, "gen_logits_max": 6.155757904052734, "gen_logits_mean": -12.546202659606934, "gen_logits_min": -24.858509063720703, "gen_logits_std": 2.60715913772583, "gen_loss": 0.33730244636535645, "grad_norm": 0.4390544244790975, "learning_rate": 2.732336842105263e-05, "loss": 0.2977, "mean_copy_accuracy": 0.995486706495285, "mean_gen_accuracy": 0.8712272942066193, "mean_token_accuracy": 0.9009748995304108, "num_tokens": 774090770.0, "sample_num_tokens": 7558.5, "step": 2855, "total_num_tokens": 774121004.0, "z_loss": 0.0007550597656518221 }, { "copy_logits_max": -5.2433624267578125, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.625, "epoch": 0.5833035486341588, "gen_logits_max": 5.461132526397705, "gen_logits_mean": -14.429676055908203, "gen_logits_min": -26.398109436035156, "gen_logits_std": 2.6348936557769775, "gen_loss": 0.2995023727416992, "grad_norm": 0.48851270931229457, "learning_rate": 2.7322105263157894e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9938996434211731, "mean_gen_accuracy": 0.872335359454155, "mean_token_accuracy": 0.90009406208992, "num_tokens": 774349646.0, "sample_num_tokens": 8263.0, "step": 2856, "total_num_tokens": 774382698.0, "z_loss": 0.0007091413135640323 }, { "copy_logits_max": -2.9801738262176514, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.5, "epoch": 0.5835077865713556, "gen_logits_max": 6.267498016357422, "gen_logits_mean": -13.190705299377441, "gen_logits_min": -24.761707305908203, "gen_logits_std": 2.5530190467834473, "gen_loss": 0.35301879048347473, "grad_norm": 0.4322551380580615, "learning_rate": 2.732084210526316e-05, "loss": 0.3248, "mean_copy_accuracy": 0.993476390838623, "mean_gen_accuracy": 0.8651094436645508, "mean_token_accuracy": 0.8938336372375488, "num_tokens": 774618433.0, "sample_num_tokens": 8192.75, "step": 2857, "total_num_tokens": 774651204.0, "z_loss": 0.0008132754010148346 }, { "copy_logits_max": -3.461367130279541, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.375, "epoch": 0.5837120245085524, "gen_logits_max": 5.414945125579834, "gen_logits_mean": -14.10947036743164, "gen_logits_min": -26.131080627441406, "gen_logits_std": 2.617694854736328, "gen_loss": 0.3209441900253296, "grad_norm": 0.44928940332793776, "learning_rate": 2.7319578947368423e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9951101839542389, "mean_gen_accuracy": 0.8618554174900055, "mean_token_accuracy": 0.8937760889530182, "num_tokens": 774890199.0, "sample_num_tokens": 8816.25, "step": 2858, "total_num_tokens": 774925464.0, "z_loss": 0.0007702766451984644 }, { "copy_logits_max": -2.8936095237731934, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.5625, "epoch": 0.5839162624457493, "gen_logits_max": 4.975432872772217, "gen_logits_mean": -14.277236938476562, "gen_logits_min": -26.164718627929688, "gen_logits_std": 2.569054126739502, "gen_loss": 0.3024337887763977, "grad_norm": 0.47650683443963354, "learning_rate": 2.7318315789473684e-05, "loss": 0.3218, "mean_copy_accuracy": 0.9959123134613037, "mean_gen_accuracy": 0.8591182082891464, "mean_token_accuracy": 0.8958945125341415, "num_tokens": 775168091.0, "sample_num_tokens": 8734.75, "step": 2859, "total_num_tokens": 775203030.0, "z_loss": 0.0007488183327950537 }, { "copy_logits_max": -2.462754249572754, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.9375, "epoch": 0.5841205003829462, "gen_logits_max": 6.5580058097839355, "gen_logits_mean": -13.339828491210938, "gen_logits_min": -25.450176239013672, "gen_logits_std": 2.6161844730377197, "gen_loss": 0.3192198872566223, "grad_norm": 0.43215236052441175, "learning_rate": 2.7317052631578948e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9935659170150757, "mean_gen_accuracy": 0.8698635250329971, "mean_token_accuracy": 0.8999087512493134, "num_tokens": 775452033.0, "sample_num_tokens": 8198.25, "step": 2860, "total_num_tokens": 775484826.0, "z_loss": 0.0007621109252795577 }, { "copy_logits_max": -3.115093946456909, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.375, "epoch": 0.584324738320143, "gen_logits_max": 5.721427917480469, "gen_logits_mean": -13.593574523925781, "gen_logits_min": -25.44479751586914, "gen_logits_std": 2.5899229049682617, "gen_loss": 0.3391057848930359, "grad_norm": 0.4095132181166677, "learning_rate": 2.7315789473684213e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9945486485958099, "mean_gen_accuracy": 0.8597147613763809, "mean_token_accuracy": 0.8937604576349258, "num_tokens": 775732484.0, "sample_num_tokens": 9370.0, "step": 2861, "total_num_tokens": 775769964.0, "z_loss": 0.0007941410876810551 }, { "copy_logits_max": -2.933711051940918, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5625, "epoch": 0.5845289762573398, "gen_logits_max": 6.114184379577637, "gen_logits_mean": -12.94505786895752, "gen_logits_min": -25.196266174316406, "gen_logits_std": 2.618527412414551, "gen_loss": 0.3017406761646271, "grad_norm": 0.45037459898050375, "learning_rate": 2.7314526315789473e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9933077245950699, "mean_gen_accuracy": 0.8693819046020508, "mean_token_accuracy": 0.8975066840648651, "num_tokens": 775987976.0, "sample_num_tokens": 7035.5, "step": 2862, "total_num_tokens": 776016118.0, "z_loss": 0.0007288479246199131 }, { "copy_logits_max": -1.8598754405975342, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.25, "epoch": 0.5847332141945366, "gen_logits_max": 6.131407260894775, "gen_logits_mean": -13.319463729858398, "gen_logits_min": -25.948646545410156, "gen_logits_std": 2.6404218673706055, "gen_loss": 0.3080967962741852, "grad_norm": 0.45790507493705385, "learning_rate": 2.7313263157894738e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9933741539716721, "mean_gen_accuracy": 0.85956971347332, "mean_token_accuracy": 0.8932225108146667, "num_tokens": 776251746.0, "sample_num_tokens": 8612.0, "step": 2863, "total_num_tokens": 776286194.0, "z_loss": 0.0006844865856692195 }, { "copy_logits_max": -0.14662861824035645, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.625, "epoch": 0.5849374521317334, "gen_logits_max": 6.835849285125732, "gen_logits_mean": -12.353548049926758, "gen_logits_min": -24.299579620361328, "gen_logits_std": 2.5921177864074707, "gen_loss": 0.36904191970825195, "grad_norm": 0.44991865582623547, "learning_rate": 2.7312e-05, "loss": 0.3443, "mean_copy_accuracy": 0.9941577464342117, "mean_gen_accuracy": 0.8597826361656189, "mean_token_accuracy": 0.8861866593360901, "num_tokens": 776506322.0, "sample_num_tokens": 7090.5, "step": 2864, "total_num_tokens": 776534684.0, "z_loss": 0.0008135338430292904 }, { "copy_logits_max": -0.175886869430542, "copy_logits_min": -625000064.0, "copy_num_tokens": 504.25, "epoch": 0.5851416900689304, "gen_logits_max": 5.798434257507324, "gen_logits_mean": -13.301279067993164, "gen_logits_min": -25.803482055664062, "gen_logits_std": 2.6451964378356934, "gen_loss": 0.31347420811653137, "grad_norm": 0.4314816685777876, "learning_rate": 2.7310736842105266e-05, "loss": 0.3189, "mean_copy_accuracy": 0.9947007596492767, "mean_gen_accuracy": 0.8592240214347839, "mean_token_accuracy": 0.8945315331220627, "num_tokens": 776766958.0, "sample_num_tokens": 8078.5, "step": 2865, "total_num_tokens": 776799272.0, "z_loss": 0.0006910621887072921 }, { "copy_logits_max": -4.901532173156738, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.0, "epoch": 0.5853459280061272, "gen_logits_max": 5.207579612731934, "gen_logits_mean": -14.929625511169434, "gen_logits_min": -26.54657554626465, "gen_logits_std": 2.613856792449951, "gen_loss": 0.2817685604095459, "grad_norm": 0.42739415529002645, "learning_rate": 2.7309473684210527e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9940522164106369, "mean_gen_accuracy": 0.8651244193315506, "mean_token_accuracy": 0.894687220454216, "num_tokens": 777014023.0, "sample_num_tokens": 7885.25, "step": 2866, "total_num_tokens": 777045564.0, "z_loss": 0.0006350161274895072 }, { "copy_logits_max": -3.7759346961975098, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.5, "epoch": 0.585550165943324, "gen_logits_max": 5.930835723876953, "gen_logits_mean": -13.005508422851562, "gen_logits_min": -25.45287322998047, "gen_logits_std": 2.6188032627105713, "gen_loss": 0.3552144169807434, "grad_norm": 0.4552362119292762, "learning_rate": 2.730821052631579e-05, "loss": 0.299, "mean_copy_accuracy": 0.9955653548240662, "mean_gen_accuracy": 0.8669181615114212, "mean_token_accuracy": 0.9008863568305969, "num_tokens": 777323156.0, "sample_num_tokens": 8217.0, "step": 2867, "total_num_tokens": 777356024.0, "z_loss": 0.0007469039410352707 }, { "copy_logits_max": -4.036078453063965, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.75, "epoch": 0.5857544038805208, "gen_logits_max": 4.738500595092773, "gen_logits_mean": -14.788009643554688, "gen_logits_min": -26.34819793701172, "gen_logits_std": 2.615776538848877, "gen_loss": 0.2842012643814087, "grad_norm": 0.4391296491884612, "learning_rate": 2.7306947368421053e-05, "loss": 0.309, "mean_copy_accuracy": 0.9954488128423691, "mean_gen_accuracy": 0.8639654964208603, "mean_token_accuracy": 0.8982774317264557, "num_tokens": 777599609.0, "sample_num_tokens": 9317.25, "step": 2868, "total_num_tokens": 777636878.0, "z_loss": 0.000569782336242497 }, { "copy_logits_max": -3.136265277862549, "copy_logits_min": -687500032.0, "copy_num_tokens": 482.4375, "epoch": 0.5859586418177176, "gen_logits_max": 5.5480194091796875, "gen_logits_mean": -13.10202693939209, "gen_logits_min": -24.99442481994629, "gen_logits_std": 2.593404769897461, "gen_loss": 0.32777705788612366, "grad_norm": 0.45447666799418435, "learning_rate": 2.7305684210526317e-05, "loss": 0.3325, "mean_copy_accuracy": 0.993814080953598, "mean_gen_accuracy": 0.8573024719953537, "mean_token_accuracy": 0.8907103687524796, "num_tokens": 777872843.0, "sample_num_tokens": 8408.75, "step": 2869, "total_num_tokens": 777906478.0, "z_loss": 0.000713711662683636 }, { "copy_logits_max": -5.688699245452881, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.5, "epoch": 0.5861628797549144, "gen_logits_max": 4.460348129272461, "gen_logits_mean": -14.412177085876465, "gen_logits_min": -25.55908966064453, "gen_logits_std": 2.5256764888763428, "gen_loss": 0.2695609927177429, "grad_norm": 0.453713145519811, "learning_rate": 2.7304421052631578e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9936524778604507, "mean_gen_accuracy": 0.867593452334404, "mean_token_accuracy": 0.8973796665668488, "num_tokens": 778125761.0, "sample_num_tokens": 8444.75, "step": 2870, "total_num_tokens": 778159540.0, "z_loss": 0.00058557657757774 }, { "copy_logits_max": -4.104844570159912, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.0, "epoch": 0.5863671176921114, "gen_logits_max": 4.851798057556152, "gen_logits_mean": -14.296125411987305, "gen_logits_min": -26.058853149414062, "gen_logits_std": 2.5963125228881836, "gen_loss": 0.31370195746421814, "grad_norm": 0.41034201407005994, "learning_rate": 2.7303157894736842e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9945775270462036, "mean_gen_accuracy": 0.8703676909208298, "mean_token_accuracy": 0.8993009030818939, "num_tokens": 778402020.0, "sample_num_tokens": 8205.5, "step": 2871, "total_num_tokens": 778434842.0, "z_loss": 0.0007299996796064079 }, { "copy_logits_max": -1.6314421892166138, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.0625, "epoch": 0.5865713556293082, "gen_logits_max": 5.832546234130859, "gen_logits_mean": -12.55810260772705, "gen_logits_min": -24.731260299682617, "gen_logits_std": 2.620333671569824, "gen_loss": 0.29655104875564575, "grad_norm": 0.4250365066164777, "learning_rate": 2.7301894736842103e-05, "loss": 0.3188, "mean_copy_accuracy": 0.994376927614212, "mean_gen_accuracy": 0.8621800541877747, "mean_token_accuracy": 0.8937957584857941, "num_tokens": 778671351.0, "sample_num_tokens": 8563.75, "step": 2872, "total_num_tokens": 778705606.0, "z_loss": 0.0007497133919969201 }, { "copy_logits_max": -2.1819448471069336, "copy_logits_min": -750000064.0, "copy_num_tokens": 594.3125, "epoch": 0.586775593566505, "gen_logits_max": 4.5565056800842285, "gen_logits_mean": -14.635906219482422, "gen_logits_min": -26.525848388671875, "gen_logits_std": 2.602010726928711, "gen_loss": 0.29643166065216064, "grad_norm": 0.48343253683510234, "learning_rate": 2.730063157894737e-05, "loss": 0.3229, "mean_copy_accuracy": 0.994038388133049, "mean_gen_accuracy": 0.861795961856842, "mean_token_accuracy": 0.8946644067764282, "num_tokens": 778930303.0, "sample_num_tokens": 9170.75, "step": 2873, "total_num_tokens": 778966986.0, "z_loss": 0.0007903539226390421 }, { "copy_logits_max": -4.33672571182251, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.4375, "epoch": 0.5869798315037018, "gen_logits_max": 6.481738090515137, "gen_logits_mean": -13.365557670593262, "gen_logits_min": -24.78514862060547, "gen_logits_std": 2.564398765563965, "gen_loss": 0.3294251561164856, "grad_norm": 0.4230740132612886, "learning_rate": 2.729936842105263e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9929240196943283, "mean_gen_accuracy": 0.8617663979530334, "mean_token_accuracy": 0.8894201815128326, "num_tokens": 779187672.0, "sample_num_tokens": 8216.0, "step": 2874, "total_num_tokens": 779220536.0, "z_loss": 0.000765887089073658 }, { "copy_logits_max": -0.9892159700393677, "copy_logits_min": -687500032.0, "copy_num_tokens": 462.5, "epoch": 0.5871840694408986, "gen_logits_max": 6.50304651260376, "gen_logits_mean": -11.365415573120117, "gen_logits_min": -23.52300262451172, "gen_logits_std": 2.643016815185547, "gen_loss": 0.3357415795326233, "grad_norm": 0.4639720402563008, "learning_rate": 2.7298105263157896e-05, "loss": 0.3329, "mean_copy_accuracy": 0.993359386920929, "mean_gen_accuracy": 0.860484391450882, "mean_token_accuracy": 0.8906913995742798, "num_tokens": 779433374.0, "sample_num_tokens": 7664.0, "step": 2875, "total_num_tokens": 779464030.0, "z_loss": 0.0008394200121983886 }, { "copy_logits_max": -3.43179988861084, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.6875, "epoch": 0.5873883073780954, "gen_logits_max": 5.294024467468262, "gen_logits_mean": -12.97014331817627, "gen_logits_min": -24.911785125732422, "gen_logits_std": 2.632319927215576, "gen_loss": 0.2593831419944763, "grad_norm": 0.3928119127991048, "learning_rate": 2.729684210526316e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9939292371273041, "mean_gen_accuracy": 0.8625429421663284, "mean_token_accuracy": 0.8939789831638336, "num_tokens": 779709639.0, "sample_num_tokens": 9438.25, "step": 2876, "total_num_tokens": 779747392.0, "z_loss": 0.0006455044494941831 }, { "copy_logits_max": -3.7833735942840576, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.0625, "epoch": 0.5875925453152924, "gen_logits_max": 5.307162284851074, "gen_logits_mean": -13.981856346130371, "gen_logits_min": -25.874095916748047, "gen_logits_std": 2.6171767711639404, "gen_loss": 0.3076251447200775, "grad_norm": 0.42057709651538727, "learning_rate": 2.729557894736842e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9946150332689285, "mean_gen_accuracy": 0.8657117635011673, "mean_token_accuracy": 0.9000237733125687, "num_tokens": 780003223.0, "sample_num_tokens": 8195.75, "step": 2877, "total_num_tokens": 780036006.0, "z_loss": 0.000748482474591583 }, { "copy_logits_max": -3.1789534091949463, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.5, "epoch": 0.5877967832524892, "gen_logits_max": 5.4100775718688965, "gen_logits_mean": -13.592639923095703, "gen_logits_min": -24.993946075439453, "gen_logits_std": 2.5951898097991943, "gen_loss": 0.285747230052948, "grad_norm": 0.4234157962599957, "learning_rate": 2.7294315789473686e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9948970228433609, "mean_gen_accuracy": 0.864788219332695, "mean_token_accuracy": 0.8992439657449722, "num_tokens": 780269487.0, "sample_num_tokens": 9140.75, "step": 2878, "total_num_tokens": 780306050.0, "z_loss": 0.0006895362748764455 }, { "copy_logits_max": -3.2092580795288086, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.75, "epoch": 0.588001021189686, "gen_logits_max": 4.993399620056152, "gen_logits_mean": -14.820274353027344, "gen_logits_min": -26.850921630859375, "gen_logits_std": 2.608582019805908, "gen_loss": 0.28719231486320496, "grad_norm": 0.46980319355947425, "learning_rate": 2.7293052631578946e-05, "loss": 0.307, "mean_copy_accuracy": 0.9933284819126129, "mean_gen_accuracy": 0.86997389793396, "mean_token_accuracy": 0.9001117795705795, "num_tokens": 780530927.0, "sample_num_tokens": 8247.25, "step": 2879, "total_num_tokens": 780563916.0, "z_loss": 0.0007524896645918489 }, { "copy_logits_max": -4.648196697235107, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.9375, "epoch": 0.5882052591268828, "gen_logits_max": 5.97592830657959, "gen_logits_mean": -12.730504989624023, "gen_logits_min": -24.31294822692871, "gen_logits_std": 2.5590505599975586, "gen_loss": 0.28884950280189514, "grad_norm": 0.45688499804722654, "learning_rate": 2.729178947368421e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9938312023878098, "mean_gen_accuracy": 0.8647774159908295, "mean_token_accuracy": 0.8940074890851974, "num_tokens": 780776535.0, "sample_num_tokens": 7754.75, "step": 2880, "total_num_tokens": 780807554.0, "z_loss": 0.0007434808649122715 }, { "copy_logits_max": -0.4138113260269165, "copy_logits_min": -750000064.0, "copy_num_tokens": 395.3125, "epoch": 0.5884094970640796, "gen_logits_max": 7.3590240478515625, "gen_logits_mean": -10.738948822021484, "gen_logits_min": -22.99268341064453, "gen_logits_std": 2.5479369163513184, "gen_loss": 0.36658960580825806, "grad_norm": 0.4471645999188083, "learning_rate": 2.7290526315789475e-05, "loss": 0.3262, "mean_copy_accuracy": 0.9949055463075638, "mean_gen_accuracy": 0.8606195449829102, "mean_token_accuracy": 0.8932922035455704, "num_tokens": 781053731.0, "sample_num_tokens": 8205.25, "step": 2881, "total_num_tokens": 781086552.0, "z_loss": 0.0008296242449432611 }, { "copy_logits_max": -5.500840663909912, "copy_logits_min": -750000000.0, "copy_num_tokens": 271.375, "epoch": 0.5886137350012765, "gen_logits_max": 6.031885147094727, "gen_logits_mean": -13.816242218017578, "gen_logits_min": -25.128463745117188, "gen_logits_std": 2.5381603240966797, "gen_loss": 0.3039170503616333, "grad_norm": 0.4362011827544118, "learning_rate": 2.728926315789474e-05, "loss": 0.3308, "mean_copy_accuracy": 0.9927958399057388, "mean_gen_accuracy": 0.8664923161268234, "mean_token_accuracy": 0.8942202776670456, "num_tokens": 781313606.0, "sample_num_tokens": 7038.0, "step": 2882, "total_num_tokens": 781341758.0, "z_loss": 0.000645193038508296 }, { "copy_logits_max": -0.5722699165344238, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.75, "epoch": 0.5888179729384733, "gen_logits_max": 5.22418212890625, "gen_logits_mean": -11.888096809387207, "gen_logits_min": -23.829265594482422, "gen_logits_std": 2.542356014251709, "gen_loss": 0.290957510471344, "grad_norm": 0.41672009449224373, "learning_rate": 2.7288e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9946825057268143, "mean_gen_accuracy": 0.8579496145248413, "mean_token_accuracy": 0.8956277966499329, "num_tokens": 781600857.0, "sample_num_tokens": 7859.25, "step": 2883, "total_num_tokens": 781632294.0, "z_loss": 0.000708293984644115 }, { "copy_logits_max": -2.585383892059326, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.375, "epoch": 0.5890222108756702, "gen_logits_max": 6.544834136962891, "gen_logits_mean": -11.45269775390625, "gen_logits_min": -23.133380889892578, "gen_logits_std": 2.558994770050049, "gen_loss": 0.3324892222881317, "grad_norm": 0.4179454285894141, "learning_rate": 2.7286736842105265e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9948079437017441, "mean_gen_accuracy": 0.8668739646673203, "mean_token_accuracy": 0.8977895975112915, "num_tokens": 781877127.0, "sample_num_tokens": 8477.25, "step": 2884, "total_num_tokens": 781911036.0, "z_loss": 0.0007989726145751774 }, { "copy_logits_max": -2.2990384101867676, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.625, "epoch": 0.589226448812867, "gen_logits_max": 5.531701564788818, "gen_logits_mean": -12.943201065063477, "gen_logits_min": -25.269149780273438, "gen_logits_std": 2.6036055088043213, "gen_loss": 0.3123161196708679, "grad_norm": 0.8893897515727041, "learning_rate": 2.7285473684210526e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9943118244409561, "mean_gen_accuracy": 0.866389736533165, "mean_token_accuracy": 0.9022621065378189, "num_tokens": 782178039.0, "sample_num_tokens": 8534.75, "step": 2885, "total_num_tokens": 782212178.0, "z_loss": 0.0007120986701920629 }, { "copy_logits_max": -5.000097751617432, "copy_logits_min": -750000064.0, "copy_num_tokens": 347.625, "epoch": 0.5894306867500638, "gen_logits_max": 6.110537052154541, "gen_logits_mean": -13.14736557006836, "gen_logits_min": -24.666324615478516, "gen_logits_std": 2.5718936920166016, "gen_loss": 0.32920631766319275, "grad_norm": 0.4159092139131634, "learning_rate": 2.728421052631579e-05, "loss": 0.304, "mean_copy_accuracy": 0.9937793761491776, "mean_gen_accuracy": 0.8670822381973267, "mean_token_accuracy": 0.8985742926597595, "num_tokens": 782452759.0, "sample_num_tokens": 7766.25, "step": 2886, "total_num_tokens": 782483824.0, "z_loss": 0.0007162051624618471 }, { "copy_logits_max": -3.09536075592041, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.625, "epoch": 0.5896349246872606, "gen_logits_max": 5.377391338348389, "gen_logits_mean": -13.77414321899414, "gen_logits_min": -25.694425582885742, "gen_logits_std": 2.62380313873291, "gen_loss": 0.33926665782928467, "grad_norm": 0.47007101728910394, "learning_rate": 2.728294736842105e-05, "loss": 0.3357, "mean_copy_accuracy": 0.9953782558441162, "mean_gen_accuracy": 0.8536648601293564, "mean_token_accuracy": 0.8900524824857712, "num_tokens": 782707006.0, "sample_num_tokens": 8168.5, "step": 2887, "total_num_tokens": 782739680.0, "z_loss": 0.0007708497578278184 }, { "copy_logits_max": -3.2326340675354004, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.9375, "epoch": 0.5898391626244575, "gen_logits_max": 5.016608715057373, "gen_logits_mean": -13.924017906188965, "gen_logits_min": -25.712589263916016, "gen_logits_std": 2.5625457763671875, "gen_loss": 0.3285020887851715, "grad_norm": 0.45536798157868175, "learning_rate": 2.7281684210526315e-05, "loss": 0.3276, "mean_copy_accuracy": 0.993407592177391, "mean_gen_accuracy": 0.8595099151134491, "mean_token_accuracy": 0.8895839899778366, "num_tokens": 782970359.0, "sample_num_tokens": 10791.75, "step": 2888, "total_num_tokens": 783013526.0, "z_loss": 0.0007722490117885172 }, { "copy_logits_max": -2.127213954925537, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.625, "epoch": 0.5900434005616543, "gen_logits_max": 6.127362251281738, "gen_logits_mean": -13.733678817749023, "gen_logits_min": -25.4581356048584, "gen_logits_std": 2.624357223510742, "gen_loss": 0.33549484610557556, "grad_norm": 0.46702004561173305, "learning_rate": 2.7280421052631583e-05, "loss": 0.3174, "mean_copy_accuracy": 0.993734210729599, "mean_gen_accuracy": 0.8631158620119095, "mean_token_accuracy": 0.8970652669668198, "num_tokens": 783261262.0, "sample_num_tokens": 9708.0, "step": 2889, "total_num_tokens": 783300094.0, "z_loss": 0.0007405323558486998 }, { "copy_logits_max": -4.26595401763916, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.3125, "epoch": 0.5902476384988512, "gen_logits_max": 5.749415397644043, "gen_logits_mean": -12.81745719909668, "gen_logits_min": -24.385501861572266, "gen_logits_std": 2.6234896183013916, "gen_loss": 0.2930070757865906, "grad_norm": 0.49021461868402727, "learning_rate": 2.7279157894736844e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9943359345197678, "mean_gen_accuracy": 0.8609730899333954, "mean_token_accuracy": 0.8977915048599243, "num_tokens": 783543248.0, "sample_num_tokens": 10100.0, "step": 2890, "total_num_tokens": 783583648.0, "z_loss": 0.0006995995063334703 }, { "copy_logits_max": -3.6965417861938477, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.1875, "epoch": 0.590451876436048, "gen_logits_max": 4.905111312866211, "gen_logits_mean": -14.432634353637695, "gen_logits_min": -26.32643699645996, "gen_logits_std": 2.6468324661254883, "gen_loss": 0.3244052231311798, "grad_norm": 0.441216952145089, "learning_rate": 2.7277894736842108e-05, "loss": 0.3214, "mean_copy_accuracy": 0.995199665427208, "mean_gen_accuracy": 0.8594763427972794, "mean_token_accuracy": 0.8932843059301376, "num_tokens": 783821195.0, "sample_num_tokens": 8392.75, "step": 2891, "total_num_tokens": 783854766.0, "z_loss": 0.0007429046090692282 }, { "copy_logits_max": -4.35155725479126, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.125, "epoch": 0.5906561143732448, "gen_logits_max": 6.552487373352051, "gen_logits_mean": -12.877248764038086, "gen_logits_min": -24.699506759643555, "gen_logits_std": 2.604703903198242, "gen_loss": 0.2965320944786072, "grad_norm": 0.4236352994687994, "learning_rate": 2.727663157894737e-05, "loss": 0.3094, "mean_copy_accuracy": 0.993274450302124, "mean_gen_accuracy": 0.8706702440977097, "mean_token_accuracy": 0.8995092958211899, "num_tokens": 784090221.0, "sample_num_tokens": 8068.75, "step": 2892, "total_num_tokens": 784122496.0, "z_loss": 0.0007131128804758191 }, { "copy_logits_max": -4.998985290527344, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.875, "epoch": 0.5908603523104416, "gen_logits_max": 5.765021800994873, "gen_logits_mean": -13.2606840133667, "gen_logits_min": -24.585994720458984, "gen_logits_std": 2.5879557132720947, "gen_loss": 0.3155462443828583, "grad_norm": 0.4544235327470817, "learning_rate": 2.7275368421052633e-05, "loss": 0.3244, "mean_copy_accuracy": 0.992880254983902, "mean_gen_accuracy": 0.8676710724830627, "mean_token_accuracy": 0.8933299779891968, "num_tokens": 784341636.0, "sample_num_tokens": 8012.5, "step": 2893, "total_num_tokens": 784373686.0, "z_loss": 0.0007187350420281291 }, { "copy_logits_max": -1.6435352563858032, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.875, "epoch": 0.5910645902476385, "gen_logits_max": 5.808538913726807, "gen_logits_mean": -13.601619720458984, "gen_logits_min": -25.05147933959961, "gen_logits_std": 2.619960308074951, "gen_loss": 0.3615148067474365, "grad_norm": 0.4489790165407958, "learning_rate": 2.7274105263157894e-05, "loss": 0.3438, "mean_copy_accuracy": 0.9930664896965027, "mean_gen_accuracy": 0.8628480136394501, "mean_token_accuracy": 0.890998899936676, "num_tokens": 784639809.0, "sample_num_tokens": 7155.75, "step": 2894, "total_num_tokens": 784668432.0, "z_loss": 0.0008603626629337668 }, { "copy_logits_max": -2.657101631164551, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.8125, "epoch": 0.5912688281848353, "gen_logits_max": 5.467752933502197, "gen_logits_mean": -13.325034141540527, "gen_logits_min": -25.033401489257812, "gen_logits_std": 2.6198508739471436, "gen_loss": 0.28067994117736816, "grad_norm": 0.47525926963018905, "learning_rate": 2.727284210526316e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9928815960884094, "mean_gen_accuracy": 0.8688039928674698, "mean_token_accuracy": 0.8959694355726242, "num_tokens": 784876198.0, "sample_num_tokens": 8737.0, "step": 2895, "total_num_tokens": 784911146.0, "z_loss": 0.000688766420353204 }, { "copy_logits_max": -4.186099052429199, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.625, "epoch": 0.5914730661220322, "gen_logits_max": 5.207019805908203, "gen_logits_mean": -14.283915519714355, "gen_logits_min": -25.966812133789062, "gen_logits_std": 2.6188387870788574, "gen_loss": 0.29885923862457275, "grad_norm": 0.38966517036414866, "learning_rate": 2.727157894736842e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9959117025136948, "mean_gen_accuracy": 0.8628276884555817, "mean_token_accuracy": 0.8998937755823135, "num_tokens": 785162269.0, "sample_num_tokens": 8467.75, "step": 2896, "total_num_tokens": 785196140.0, "z_loss": 0.000710949650965631 }, { "copy_logits_max": -4.266041278839111, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.0, "epoch": 0.591677304059229, "gen_logits_max": 5.281630516052246, "gen_logits_mean": -13.73942756652832, "gen_logits_min": -24.868906021118164, "gen_logits_std": 2.5530433654785156, "gen_loss": 0.31160420179367065, "grad_norm": 0.44453273160684137, "learning_rate": 2.7270315789473687e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9950819760560989, "mean_gen_accuracy": 0.8647156953811646, "mean_token_accuracy": 0.8991413414478302, "num_tokens": 785446394.0, "sample_num_tokens": 8444.5, "step": 2897, "total_num_tokens": 785480172.0, "z_loss": 0.0007287112530320883 }, { "copy_logits_max": -4.507089138031006, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.4375, "epoch": 0.5918815419964258, "gen_logits_max": 6.002536773681641, "gen_logits_mean": -13.716452598571777, "gen_logits_min": -24.788604736328125, "gen_logits_std": 2.533982276916504, "gen_loss": 0.3241702616214752, "grad_norm": 0.4123867399451265, "learning_rate": 2.7269052631578948e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9948074072599411, "mean_gen_accuracy": 0.8617766201496124, "mean_token_accuracy": 0.8942338228225708, "num_tokens": 785724501.0, "sample_num_tokens": 8754.25, "step": 2898, "total_num_tokens": 785759518.0, "z_loss": 0.0007590746972709894 }, { "copy_logits_max": -5.556574821472168, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.25, "epoch": 0.5920857799336227, "gen_logits_max": 5.486114501953125, "gen_logits_mean": -14.480722427368164, "gen_logits_min": -26.094120025634766, "gen_logits_std": 2.580639362335205, "gen_loss": 0.33282285928726196, "grad_norm": 0.4387240936646302, "learning_rate": 2.7267789473684212e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9931416660547256, "mean_gen_accuracy": 0.8605989068746567, "mean_token_accuracy": 0.8887081444263458, "num_tokens": 785995015.0, "sample_num_tokens": 8020.25, "step": 2899, "total_num_tokens": 786027096.0, "z_loss": 0.0007756889099255204 }, { "copy_logits_max": -4.416591167449951, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.8125, "epoch": 0.5922900178708195, "gen_logits_max": 5.012551784515381, "gen_logits_mean": -14.175758361816406, "gen_logits_min": -25.4008731842041, "gen_logits_std": 2.5755934715270996, "gen_loss": 0.3024885654449463, "grad_norm": 0.41995920107832846, "learning_rate": 2.7266526315789473e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9950116723775864, "mean_gen_accuracy": 0.8644662499427795, "mean_token_accuracy": 0.8984181880950928, "num_tokens": 786267635.0, "sample_num_tokens": 7616.25, "step": 2900, "total_num_tokens": 786298100.0, "z_loss": 0.0006660697981715202 }, { "copy_logits_max": -3.483661651611328, "copy_logits_min": -625000064.0, "copy_num_tokens": 491.75, "epoch": 0.5924942558080163, "gen_logits_max": 5.369319915771484, "gen_logits_mean": -13.425213813781738, "gen_logits_min": -24.57802963256836, "gen_logits_std": 2.539170503616333, "gen_loss": 0.3406299352645874, "grad_norm": 0.49733166910865323, "learning_rate": 2.7265263157894738e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9938157945871353, "mean_gen_accuracy": 0.8525291830301285, "mean_token_accuracy": 0.8889795988798141, "num_tokens": 786511330.0, "sample_num_tokens": 8389.0, "step": 2901, "total_num_tokens": 786544886.0, "z_loss": 0.00071622064569965 }, { "copy_logits_max": -5.480464935302734, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.625, "epoch": 0.5926984937452132, "gen_logits_max": 6.270391464233398, "gen_logits_mean": -12.252786636352539, "gen_logits_min": -24.335773468017578, "gen_logits_std": 2.640392780303955, "gen_loss": 0.309671550989151, "grad_norm": 0.42407580882292095, "learning_rate": 2.7264000000000002e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9938531219959259, "mean_gen_accuracy": 0.8605701923370361, "mean_token_accuracy": 0.893327072262764, "num_tokens": 786772481.0, "sample_num_tokens": 8719.75, "step": 2902, "total_num_tokens": 786807360.0, "z_loss": 0.0006660165963694453 }, { "copy_logits_max": -5.803398132324219, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.875, "epoch": 0.59290273168241, "gen_logits_max": 5.908130168914795, "gen_logits_mean": -13.988471031188965, "gen_logits_min": -25.155017852783203, "gen_logits_std": 2.619917869567871, "gen_loss": 0.3966050148010254, "grad_norm": 0.4426966833963724, "learning_rate": 2.7262736842105263e-05, "loss": 0.3215, "mean_copy_accuracy": 0.994752898812294, "mean_gen_accuracy": 0.860504686832428, "mean_token_accuracy": 0.8938214182853699, "num_tokens": 787057096.0, "sample_num_tokens": 8264.0, "step": 2903, "total_num_tokens": 787090152.0, "z_loss": 0.0008125034510158002 }, { "copy_logits_max": -4.460741996765137, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.3125, "epoch": 0.5931069696196068, "gen_logits_max": 5.223515033721924, "gen_logits_mean": -12.570634841918945, "gen_logits_min": -24.42029571533203, "gen_logits_std": 2.5684657096862793, "gen_loss": 0.29395002126693726, "grad_norm": 0.5013701403244444, "learning_rate": 2.7261473684210527e-05, "loss": 0.3193, "mean_copy_accuracy": 0.9956153780221939, "mean_gen_accuracy": 0.8590945452451706, "mean_token_accuracy": 0.8950058966875076, "num_tokens": 787313128.0, "sample_num_tokens": 8136.5, "step": 2904, "total_num_tokens": 787345674.0, "z_loss": 0.0006617341423407197 }, { "copy_logits_max": -4.839097023010254, "copy_logits_min": -687500096.0, "copy_num_tokens": 562.125, "epoch": 0.5933112075568037, "gen_logits_max": 5.602840423583984, "gen_logits_mean": -12.741950988769531, "gen_logits_min": -24.690528869628906, "gen_logits_std": 2.6492390632629395, "gen_loss": 0.3338314890861511, "grad_norm": 0.4458164347816989, "learning_rate": 2.7260210526315788e-05, "loss": 0.3303, "mean_copy_accuracy": 0.9945700615644455, "mean_gen_accuracy": 0.8592197000980377, "mean_token_accuracy": 0.8924859315156937, "num_tokens": 787591822.0, "sample_num_tokens": 8633.0, "step": 2905, "total_num_tokens": 787626354.0, "z_loss": 0.0006889956421218812 }, { "copy_logits_max": -7.105766773223877, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.5625, "epoch": 0.5935154454940005, "gen_logits_max": 5.737735748291016, "gen_logits_mean": -13.50071907043457, "gen_logits_min": -24.68328857421875, "gen_logits_std": 2.5543837547302246, "gen_loss": 0.3192684054374695, "grad_norm": 0.4382604887133289, "learning_rate": 2.7258947368421056e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9934379309415817, "mean_gen_accuracy": 0.8657344430685043, "mean_token_accuracy": 0.8933445364236832, "num_tokens": 787865472.0, "sample_num_tokens": 7396.0, "step": 2906, "total_num_tokens": 787895056.0, "z_loss": 0.000640170881524682 }, { "copy_logits_max": -5.080258369445801, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.625, "epoch": 0.5937196834311973, "gen_logits_max": 5.262846946716309, "gen_logits_mean": -13.24234390258789, "gen_logits_min": -24.418842315673828, "gen_logits_std": 2.492093563079834, "gen_loss": 0.3168398141860962, "grad_norm": 0.5068461433584706, "learning_rate": 2.7257684210526317e-05, "loss": 0.3505, "mean_copy_accuracy": 0.9942772835493088, "mean_gen_accuracy": 0.8544676899909973, "mean_token_accuracy": 0.8864896595478058, "num_tokens": 788135054.0, "sample_num_tokens": 8326.5, "step": 2907, "total_num_tokens": 788168360.0, "z_loss": 0.0006630887510254979 }, { "copy_logits_max": -3.852933645248413, "copy_logits_min": -750000064.0, "copy_num_tokens": 458.3125, "epoch": 0.5939239213683942, "gen_logits_max": 5.294643402099609, "gen_logits_mean": -13.430545806884766, "gen_logits_min": -24.676898956298828, "gen_logits_std": 2.5560853481292725, "gen_loss": 0.32197368144989014, "grad_norm": 0.42260227214492974, "learning_rate": 2.725642105263158e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9937991201877594, "mean_gen_accuracy": 0.8662203401327133, "mean_token_accuracy": 0.8943080753087997, "num_tokens": 788401197.0, "sample_num_tokens": 7898.75, "step": 2908, "total_num_tokens": 788432792.0, "z_loss": 0.0006539805908687413 }, { "copy_logits_max": -5.290132522583008, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.0625, "epoch": 0.594128159305591, "gen_logits_max": 5.935468673706055, "gen_logits_mean": -14.292658805847168, "gen_logits_min": -25.523658752441406, "gen_logits_std": 2.5531294345855713, "gen_loss": 0.3632608652114868, "grad_norm": 0.5185091201524935, "learning_rate": 2.7255157894736842e-05, "loss": 0.3439, "mean_copy_accuracy": 0.9923916757106781, "mean_gen_accuracy": 0.8541664183139801, "mean_token_accuracy": 0.8858971297740936, "num_tokens": 788660371.0, "sample_num_tokens": 7181.75, "step": 2909, "total_num_tokens": 788689098.0, "z_loss": 0.0008356751641258597 }, { "copy_logits_max": -5.70613956451416, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.0, "epoch": 0.5943323972427879, "gen_logits_max": 5.126341819763184, "gen_logits_mean": -13.454751014709473, "gen_logits_min": -24.58647918701172, "gen_logits_std": 2.537646770477295, "gen_loss": 0.302680104970932, "grad_norm": 0.4203026600003505, "learning_rate": 2.7253894736842106e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9934316128492355, "mean_gen_accuracy": 0.8584521412849426, "mean_token_accuracy": 0.892901822924614, "num_tokens": 788915495.0, "sample_num_tokens": 9008.75, "step": 2910, "total_num_tokens": 788951530.0, "z_loss": 0.0006997783202677965 }, { "copy_logits_max": -2.482447624206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.1875, "epoch": 0.5945366351799847, "gen_logits_max": 5.992592811584473, "gen_logits_mean": -12.295160293579102, "gen_logits_min": -23.505678176879883, "gen_logits_std": 2.5205535888671875, "gen_loss": 0.32822075486183167, "grad_norm": 0.702247019452751, "learning_rate": 2.7252631578947367e-05, "loss": 0.3378, "mean_copy_accuracy": 0.9933735430240631, "mean_gen_accuracy": 0.8608840852975845, "mean_token_accuracy": 0.8889150023460388, "num_tokens": 789183915.0, "sample_num_tokens": 8192.25, "step": 2911, "total_num_tokens": 789216684.0, "z_loss": 0.0008280504262074828 }, { "copy_logits_max": -2.351287603378296, "copy_logits_min": -750000000.0, "copy_num_tokens": 859.5, "epoch": 0.5947408731171815, "gen_logits_max": 4.748384475708008, "gen_logits_mean": -13.738038063049316, "gen_logits_min": -25.47112464904785, "gen_logits_std": 2.5930368900299072, "gen_loss": 0.3001202344894409, "grad_norm": 0.40830482430873544, "learning_rate": 2.725136842105263e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9956160485744476, "mean_gen_accuracy": 0.8664564937353134, "mean_token_accuracy": 0.9034185707569122, "num_tokens": 789456359.0, "sample_num_tokens": 11298.25, "step": 2912, "total_num_tokens": 789501552.0, "z_loss": 0.0007492919685319066 }, { "copy_logits_max": -4.73301887512207, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.0, "epoch": 0.5949451110543783, "gen_logits_max": 6.092769145965576, "gen_logits_mean": -13.428030014038086, "gen_logits_min": -24.972286224365234, "gen_logits_std": 2.527472972869873, "gen_loss": 0.3365764319896698, "grad_norm": 0.510807232120542, "learning_rate": 2.7250105263157892e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9932419508695602, "mean_gen_accuracy": 0.869801327586174, "mean_token_accuracy": 0.8962214589118958, "num_tokens": 789740302.0, "sample_num_tokens": 8123.0, "step": 2913, "total_num_tokens": 789772794.0, "z_loss": 0.0007712615188211203 }, { "copy_logits_max": -5.2233099937438965, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.375, "epoch": 0.5951493489915752, "gen_logits_max": 5.343143939971924, "gen_logits_mean": -14.161359786987305, "gen_logits_min": -25.612661361694336, "gen_logits_std": 2.571197986602783, "gen_loss": 0.2818898558616638, "grad_norm": 0.48461445470027303, "learning_rate": 2.724884210526316e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9931287318468094, "mean_gen_accuracy": 0.8647065460681915, "mean_token_accuracy": 0.8939048647880554, "num_tokens": 789990207.0, "sample_num_tokens": 7964.75, "step": 2914, "total_num_tokens": 790022066.0, "z_loss": 0.0007196972146630287 }, { "copy_logits_max": -2.9145026206970215, "copy_logits_min": -687500032.0, "copy_num_tokens": 570.75, "epoch": 0.595353586928772, "gen_logits_max": 5.270727157592773, "gen_logits_mean": -13.983685493469238, "gen_logits_min": -25.72178840637207, "gen_logits_std": 2.614025592803955, "gen_loss": 0.35353195667266846, "grad_norm": 0.4573767341676563, "learning_rate": 2.7247578947368424e-05, "loss": 0.3187, "mean_copy_accuracy": 0.9955055266618729, "mean_gen_accuracy": 0.8608054667711258, "mean_token_accuracy": 0.8966304659843445, "num_tokens": 790249109.0, "sample_num_tokens": 9810.25, "step": 2915, "total_num_tokens": 790288350.0, "z_loss": 0.0008861973765306175 }, { "copy_logits_max": -3.11257004737854, "copy_logits_min": -687500032.0, "copy_num_tokens": 565.375, "epoch": 0.5955578248659689, "gen_logits_max": 4.953797340393066, "gen_logits_mean": -13.377704620361328, "gen_logits_min": -25.30167579650879, "gen_logits_std": 2.606154680252075, "gen_loss": 0.27053356170654297, "grad_norm": 0.4330191653603496, "learning_rate": 2.7246315789473685e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9953147917985916, "mean_gen_accuracy": 0.8661564737558365, "mean_token_accuracy": 0.896005854010582, "num_tokens": 790536057.0, "sample_num_tokens": 8779.75, "step": 2916, "total_num_tokens": 790571176.0, "z_loss": 0.0008140145801007748 }, { "copy_logits_max": -1.4591964483261108, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.75, "epoch": 0.5957620628031657, "gen_logits_max": 4.885645389556885, "gen_logits_mean": -14.183213233947754, "gen_logits_min": -25.86166763305664, "gen_logits_std": 2.5946333408355713, "gen_loss": 0.29412081837654114, "grad_norm": 0.4917347756719582, "learning_rate": 2.724505263157895e-05, "loss": 0.3253, "mean_copy_accuracy": 0.9942096769809723, "mean_gen_accuracy": 0.8611854761838913, "mean_token_accuracy": 0.8916881084442139, "num_tokens": 790794603.0, "sample_num_tokens": 9166.75, "step": 2917, "total_num_tokens": 790831270.0, "z_loss": 0.0008016510400921106 }, { "copy_logits_max": -5.735558986663818, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.75, "epoch": 0.5959663007403625, "gen_logits_max": 6.051375389099121, "gen_logits_mean": -14.18160629272461, "gen_logits_min": -25.861099243164062, "gen_logits_std": 2.6128454208374023, "gen_loss": 0.2958856225013733, "grad_norm": 0.49186032828359516, "learning_rate": 2.724378947368421e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9921040385961533, "mean_gen_accuracy": 0.8647591918706894, "mean_token_accuracy": 0.8928874284029007, "num_tokens": 791047279.0, "sample_num_tokens": 7698.75, "step": 2918, "total_num_tokens": 791078074.0, "z_loss": 0.0006820360431447625 }, { "copy_logits_max": -2.6259777545928955, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.375, "epoch": 0.5961705386775593, "gen_logits_max": 5.790863037109375, "gen_logits_mean": -12.883734703063965, "gen_logits_min": -24.719585418701172, "gen_logits_std": 2.6262731552124023, "gen_loss": 0.34382596611976624, "grad_norm": 0.4357689262050614, "learning_rate": 2.7242526315789475e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9940082877874374, "mean_gen_accuracy": 0.8632104992866516, "mean_token_accuracy": 0.8949701189994812, "num_tokens": 791317499.0, "sample_num_tokens": 9067.25, "step": 2919, "total_num_tokens": 791353768.0, "z_loss": 0.0007767872884869576 }, { "copy_logits_max": -2.6126291751861572, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.625, "epoch": 0.5963747766147562, "gen_logits_max": 5.730021953582764, "gen_logits_mean": -13.57225227355957, "gen_logits_min": -24.85720443725586, "gen_logits_std": 2.5609793663024902, "gen_loss": 0.26530492305755615, "grad_norm": 0.4654748419808826, "learning_rate": 2.7241263157894736e-05, "loss": 0.302, "mean_copy_accuracy": 0.9937683790922165, "mean_gen_accuracy": 0.8675539344549179, "mean_token_accuracy": 0.8972527235746384, "num_tokens": 791584485.0, "sample_num_tokens": 7919.25, "step": 2920, "total_num_tokens": 791616162.0, "z_loss": 0.000667620450258255 }, { "copy_logits_max": -1.1117104291915894, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.5625, "epoch": 0.5965790145519531, "gen_logits_max": 6.042981147766113, "gen_logits_mean": -12.815624237060547, "gen_logits_min": -24.377166748046875, "gen_logits_std": 2.6136457920074463, "gen_loss": 0.2990769147872925, "grad_norm": 0.42289267124968144, "learning_rate": 2.724e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9951977133750916, "mean_gen_accuracy": 0.8617964535951614, "mean_token_accuracy": 0.8962628841400146, "num_tokens": 791870404.0, "sample_num_tokens": 8217.5, "step": 2921, "total_num_tokens": 791903274.0, "z_loss": 0.0007295612595044076 }, { "copy_logits_max": -3.0607218742370605, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.0625, "epoch": 0.5967832524891499, "gen_logits_max": 4.9373931884765625, "gen_logits_mean": -14.230624198913574, "gen_logits_min": -25.85516357421875, "gen_logits_std": 2.5460574626922607, "gen_loss": 0.2797483205795288, "grad_norm": 0.4771957804110773, "learning_rate": 2.7238736842105264e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9919993430376053, "mean_gen_accuracy": 0.8690833896398544, "mean_token_accuracy": 0.8957662135362625, "num_tokens": 792116445.0, "sample_num_tokens": 8392.25, "step": 2922, "total_num_tokens": 792150014.0, "z_loss": 0.0006879633292555809 }, { "copy_logits_max": -3.3580360412597656, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 0.5969874904263467, "gen_logits_max": 5.400510787963867, "gen_logits_mean": -14.41645622253418, "gen_logits_min": -26.074445724487305, "gen_logits_std": 2.600313186645508, "gen_loss": 0.3199957013130188, "grad_norm": 0.4622959718136557, "learning_rate": 2.723747368421053e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9931895285844803, "mean_gen_accuracy": 0.8675650656223297, "mean_token_accuracy": 0.8953537791967392, "num_tokens": 792365122.0, "sample_num_tokens": 8338.5, "step": 2923, "total_num_tokens": 792398476.0, "z_loss": 0.0006697053322568536 }, { "copy_logits_max": -2.2900233268737793, "copy_logits_min": -625000064.0, "copy_num_tokens": 673.1875, "epoch": 0.5971917283635435, "gen_logits_max": 4.9569854736328125, "gen_logits_mean": -12.468629837036133, "gen_logits_min": -25.252840042114258, "gen_logits_std": 2.610785484313965, "gen_loss": 0.2874022126197815, "grad_norm": 0.43987362445149486, "learning_rate": 2.723621052631579e-05, "loss": 0.322, "mean_copy_accuracy": 0.9952712953090668, "mean_gen_accuracy": 0.8510680943727493, "mean_token_accuracy": 0.8932504206895828, "num_tokens": 792648666.0, "sample_num_tokens": 9282.5, "step": 2924, "total_num_tokens": 792685796.0, "z_loss": 0.0007133393082767725 }, { "copy_logits_max": -2.300365686416626, "copy_logits_min": -625000064.0, "copy_num_tokens": 515.4375, "epoch": 0.5973959663007403, "gen_logits_max": 5.526252269744873, "gen_logits_mean": -13.591985702514648, "gen_logits_min": -25.137676239013672, "gen_logits_std": 2.5723047256469727, "gen_loss": 0.3021182715892792, "grad_norm": 0.41737298968936815, "learning_rate": 2.7234947368421054e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9954253882169724, "mean_gen_accuracy": 0.8655002117156982, "mean_token_accuracy": 0.8940582871437073, "num_tokens": 792947765.0, "sample_num_tokens": 9596.75, "step": 2925, "total_num_tokens": 792986152.0, "z_loss": 0.0006989471148699522 }, { "copy_logits_max": -2.931352138519287, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.3125, "epoch": 0.5976002042379372, "gen_logits_max": 5.169860363006592, "gen_logits_mean": -14.050817489624023, "gen_logits_min": -25.48766326904297, "gen_logits_std": 2.5516176223754883, "gen_loss": 0.30103835463523865, "grad_norm": 0.43504056297404203, "learning_rate": 2.7233684210526315e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9943432360887527, "mean_gen_accuracy": 0.8669571876525879, "mean_token_accuracy": 0.8975698053836823, "num_tokens": 793213150.0, "sample_num_tokens": 9438.0, "step": 2926, "total_num_tokens": 793250902.0, "z_loss": 0.0006779601098969579 }, { "copy_logits_max": -2.9368033409118652, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.6875, "epoch": 0.5978044421751341, "gen_logits_max": 6.68033504486084, "gen_logits_mean": -12.674524307250977, "gen_logits_min": -24.572093963623047, "gen_logits_std": 2.5983386039733887, "gen_loss": 0.31270694732666016, "grad_norm": 0.4437243758566934, "learning_rate": 2.723242105263158e-05, "loss": 0.3507, "mean_copy_accuracy": 0.9936178624629974, "mean_gen_accuracy": 0.8487192988395691, "mean_token_accuracy": 0.8843749612569809, "num_tokens": 793486926.0, "sample_num_tokens": 7173.5, "step": 2927, "total_num_tokens": 793515620.0, "z_loss": 0.0007482002256438136 }, { "copy_logits_max": -1.9637205600738525, "copy_logits_min": -750000064.0, "copy_num_tokens": 529.875, "epoch": 0.5980086801123309, "gen_logits_max": 5.832320213317871, "gen_logits_mean": -12.29864501953125, "gen_logits_min": -24.28095817565918, "gen_logits_std": 2.605069160461426, "gen_loss": 0.28940916061401367, "grad_norm": 0.4330093520136626, "learning_rate": 2.723115789473684e-05, "loss": 0.3189, "mean_copy_accuracy": 0.9948171675205231, "mean_gen_accuracy": 0.8613769114017487, "mean_token_accuracy": 0.8922872245311737, "num_tokens": 793729355.0, "sample_num_tokens": 8835.75, "step": 2928, "total_num_tokens": 793764698.0, "z_loss": 0.0006747673032805324 }, { "copy_logits_max": -1.0223257541656494, "copy_logits_min": -687500032.0, "copy_num_tokens": 530.0, "epoch": 0.5982129180495277, "gen_logits_max": 6.816806793212891, "gen_logits_mean": -9.999549865722656, "gen_logits_min": -22.35687255859375, "gen_logits_std": 2.535478115081787, "gen_loss": 0.35599619150161743, "grad_norm": 0.4779135653173917, "learning_rate": 2.7229894736842104e-05, "loss": 0.3501, "mean_copy_accuracy": 0.9919433444738388, "mean_gen_accuracy": 0.8505001366138458, "mean_token_accuracy": 0.8863328993320465, "num_tokens": 793987114.0, "sample_num_tokens": 8739.0, "step": 2929, "total_num_tokens": 794022070.0, "z_loss": 0.0008065245929174125 }, { "copy_logits_max": -2.7020151615142822, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.3125, "epoch": 0.5984171559867245, "gen_logits_max": 6.429943561553955, "gen_logits_mean": -12.315059661865234, "gen_logits_min": -23.723529815673828, "gen_logits_std": 2.530158519744873, "gen_loss": 0.3406723141670227, "grad_norm": 0.5096309496448076, "learning_rate": 2.7228631578947372e-05, "loss": 0.339, "mean_copy_accuracy": 0.9929290413856506, "mean_gen_accuracy": 0.8573321551084518, "mean_token_accuracy": 0.8870731592178345, "num_tokens": 794250425.0, "sample_num_tokens": 8783.75, "step": 2930, "total_num_tokens": 794285560.0, "z_loss": 0.0008548370678909123 }, { "copy_logits_max": -3.135068416595459, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.3125, "epoch": 0.5986213939239213, "gen_logits_max": 5.094184875488281, "gen_logits_mean": -13.697181701660156, "gen_logits_min": -26.29036521911621, "gen_logits_std": 2.567892551422119, "gen_loss": 0.26469171047210693, "grad_norm": 0.462936668189071, "learning_rate": 2.7227368421052633e-05, "loss": 0.3178, "mean_copy_accuracy": 0.993005245923996, "mean_gen_accuracy": 0.8673248738050461, "mean_token_accuracy": 0.8941805362701416, "num_tokens": 794499533.0, "sample_num_tokens": 7859.25, "step": 2931, "total_num_tokens": 794530970.0, "z_loss": 0.0006463007885031402 }, { "copy_logits_max": -1.556912899017334, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.625, "epoch": 0.5988256318611183, "gen_logits_max": 5.355092525482178, "gen_logits_mean": -12.808694839477539, "gen_logits_min": -25.031734466552734, "gen_logits_std": 2.5922913551330566, "gen_loss": 0.31976333260536194, "grad_norm": 0.48224023958315465, "learning_rate": 2.7226105263157897e-05, "loss": 0.3309, "mean_copy_accuracy": 0.994036391377449, "mean_gen_accuracy": 0.8570161312818527, "mean_token_accuracy": 0.8905882388353348, "num_tokens": 794759358.0, "sample_num_tokens": 9092.0, "step": 2932, "total_num_tokens": 794795726.0, "z_loss": 0.0007596419891342521 }, { "copy_logits_max": -2.586122989654541, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.25, "epoch": 0.5990298697983151, "gen_logits_max": 5.514862537384033, "gen_logits_mean": -13.370882987976074, "gen_logits_min": -24.962005615234375, "gen_logits_std": 2.5102105140686035, "gen_loss": 0.36458778381347656, "grad_norm": 0.47015671230178346, "learning_rate": 2.7224842105263158e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9923121482133865, "mean_gen_accuracy": 0.8598177134990692, "mean_token_accuracy": 0.8896266520023346, "num_tokens": 795026962.0, "sample_num_tokens": 8458.5, "step": 2933, "total_num_tokens": 795060796.0, "z_loss": 0.0008547124452888966 }, { "copy_logits_max": -4.236390113830566, "copy_logits_min": -687500032.0, "copy_num_tokens": 361.6875, "epoch": 0.5992341077355119, "gen_logits_max": 6.1775736808776855, "gen_logits_mean": -12.1866455078125, "gen_logits_min": -24.123920440673828, "gen_logits_std": 2.491504430770874, "gen_loss": 0.33189821243286133, "grad_norm": 0.4180978933905998, "learning_rate": 2.7223578947368423e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9955965131521225, "mean_gen_accuracy": 0.8620979934930801, "mean_token_accuracy": 0.8973061740398407, "num_tokens": 795305379.0, "sample_num_tokens": 7433.25, "step": 2934, "total_num_tokens": 795335112.0, "z_loss": 0.000828953052405268 }, { "copy_logits_max": -2.4752323627471924, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.0625, "epoch": 0.5994383456727087, "gen_logits_max": 5.590359210968018, "gen_logits_mean": -12.878501892089844, "gen_logits_min": -24.61612319946289, "gen_logits_std": 2.5550546646118164, "gen_loss": 0.3010522127151489, "grad_norm": 0.42859670125581956, "learning_rate": 2.7222315789473683e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9946351647377014, "mean_gen_accuracy": 0.8660868257284164, "mean_token_accuracy": 0.90120629966259, "num_tokens": 795575816.0, "sample_num_tokens": 9558.0, "step": 2935, "total_num_tokens": 795614048.0, "z_loss": 0.0007433179998770356 }, { "copy_logits_max": -2.9118099212646484, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.375, "epoch": 0.5996425836099055, "gen_logits_max": 6.390284061431885, "gen_logits_mean": -12.291019439697266, "gen_logits_min": -24.572948455810547, "gen_logits_std": 2.559894561767578, "gen_loss": 0.30284377932548523, "grad_norm": 0.44586977552834456, "learning_rate": 2.7221052631578948e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9947548359632492, "mean_gen_accuracy": 0.8655363321304321, "mean_token_accuracy": 0.8967824280261993, "num_tokens": 795863430.0, "sample_num_tokens": 7281.5, "step": 2936, "total_num_tokens": 795892556.0, "z_loss": 0.0007697665714658797 }, { "copy_logits_max": -1.9668787717819214, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.1875, "epoch": 0.5998468215471023, "gen_logits_max": 5.512183666229248, "gen_logits_mean": -12.388978958129883, "gen_logits_min": -24.30510711669922, "gen_logits_std": 2.5023975372314453, "gen_loss": 0.3107946515083313, "grad_norm": 0.48612644994757115, "learning_rate": 2.721978947368421e-05, "loss": 0.3075, "mean_copy_accuracy": 0.9948669224977493, "mean_gen_accuracy": 0.8652336448431015, "mean_token_accuracy": 0.8973168432712555, "num_tokens": 796117774.0, "sample_num_tokens": 8999.5, "step": 2937, "total_num_tokens": 796153772.0, "z_loss": 0.0007660546689294279 }, { "copy_logits_max": -4.437265396118164, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.8125, "epoch": 0.6000510594842992, "gen_logits_max": 5.536848068237305, "gen_logits_mean": -14.30582046508789, "gen_logits_min": -25.968130111694336, "gen_logits_std": 2.5416221618652344, "gen_loss": 0.3010746240615845, "grad_norm": 0.4113587424260148, "learning_rate": 2.7218526315789476e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9945439994335175, "mean_gen_accuracy": 0.8711688369512558, "mean_token_accuracy": 0.896918997168541, "num_tokens": 796378449.0, "sample_num_tokens": 7730.25, "step": 2938, "total_num_tokens": 796409370.0, "z_loss": 0.0006414091913029552 }, { "copy_logits_max": -1.9123848676681519, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.25, "epoch": 0.6002552974214961, "gen_logits_max": 6.146188735961914, "gen_logits_mean": -12.197324752807617, "gen_logits_min": -24.709455490112305, "gen_logits_std": 2.5363516807556152, "gen_loss": 0.3513321280479431, "grad_norm": 0.46533909238217563, "learning_rate": 2.7217263157894737e-05, "loss": 0.3248, "mean_copy_accuracy": 0.9928210377693176, "mean_gen_accuracy": 0.8623814284801483, "mean_token_accuracy": 0.8929425030946732, "num_tokens": 796642135.0, "sample_num_tokens": 8238.25, "step": 2939, "total_num_tokens": 796675088.0, "z_loss": 0.000843245186842978 }, { "copy_logits_max": -0.3218452036380768, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.3125, "epoch": 0.6004595353586929, "gen_logits_max": 5.802072525024414, "gen_logits_mean": -12.553577423095703, "gen_logits_min": -24.515140533447266, "gen_logits_std": 2.558514356613159, "gen_loss": 0.30176210403442383, "grad_norm": 0.4455407337899186, "learning_rate": 2.7216e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9942504316568375, "mean_gen_accuracy": 0.8650509417057037, "mean_token_accuracy": 0.8990884125232697, "num_tokens": 796919264.0, "sample_num_tokens": 8928.0, "step": 2940, "total_num_tokens": 796954976.0, "z_loss": 0.0007885582745075226 }, { "copy_logits_max": -0.6127786636352539, "copy_logits_min": -687500032.0, "copy_num_tokens": 563.875, "epoch": 0.6006637732958897, "gen_logits_max": 5.175045013427734, "gen_logits_mean": -13.916860580444336, "gen_logits_min": -25.985706329345703, "gen_logits_std": 2.628098487854004, "gen_loss": 0.3193134665489197, "grad_norm": 0.4926320100188234, "learning_rate": 2.7214736842105263e-05, "loss": 0.3296, "mean_copy_accuracy": 0.9952020943164825, "mean_gen_accuracy": 0.8576840758323669, "mean_token_accuracy": 0.8949006348848343, "num_tokens": 797208471.0, "sample_num_tokens": 8771.75, "step": 2941, "total_num_tokens": 797243558.0, "z_loss": 0.0008523341966792941 }, { "copy_logits_max": -1.7798751592636108, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.625, "epoch": 0.6008680112330865, "gen_logits_max": 5.452445983886719, "gen_logits_mean": -13.937444686889648, "gen_logits_min": -25.593523025512695, "gen_logits_std": 2.5714645385742188, "gen_loss": 0.3144836723804474, "grad_norm": 0.4700868307310975, "learning_rate": 2.7213473684210527e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9933674782514572, "mean_gen_accuracy": 0.8632290661334991, "mean_token_accuracy": 0.8913017511367798, "num_tokens": 797448361.0, "sample_num_tokens": 7068.25, "step": 2942, "total_num_tokens": 797476634.0, "z_loss": 0.0007446084637194872 }, { "copy_logits_max": -2.6699447631835938, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.3125, "epoch": 0.6010722491702833, "gen_logits_max": 5.699456214904785, "gen_logits_mean": -13.37175464630127, "gen_logits_min": -25.503150939941406, "gen_logits_std": 2.6036317348480225, "gen_loss": 0.3108808994293213, "grad_norm": 0.4958530583757045, "learning_rate": 2.721221052631579e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9941102117300034, "mean_gen_accuracy": 0.8632548749446869, "mean_token_accuracy": 0.8997265845537186, "num_tokens": 797723475.0, "sample_num_tokens": 7894.75, "step": 2943, "total_num_tokens": 797755054.0, "z_loss": 0.0007273056544363499 }, { "copy_logits_max": -3.976867437362671, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.6875, "epoch": 0.6012764871074802, "gen_logits_max": 4.082945346832275, "gen_logits_mean": -15.024605751037598, "gen_logits_min": -26.400312423706055, "gen_logits_std": 2.5669069290161133, "gen_loss": 0.3098767101764679, "grad_norm": 0.4402704627347813, "learning_rate": 2.7210947368421052e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9946149289608002, "mean_gen_accuracy": 0.8665392994880676, "mean_token_accuracy": 0.89849953353405, "num_tokens": 797975334.0, "sample_num_tokens": 7404.5, "step": 2944, "total_num_tokens": 798004952.0, "z_loss": 0.0005978586850687861 }, { "copy_logits_max": -3.6189627647399902, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.5625, "epoch": 0.6014807250446771, "gen_logits_max": 5.780470371246338, "gen_logits_mean": -12.94613265991211, "gen_logits_min": -24.684602737426758, "gen_logits_std": 2.5383121967315674, "gen_loss": 0.33794230222702026, "grad_norm": 0.46591453348775436, "learning_rate": 2.7209684210526316e-05, "loss": 0.3291, "mean_copy_accuracy": 0.9941906929016113, "mean_gen_accuracy": 0.8589072525501251, "mean_token_accuracy": 0.8918982148170471, "num_tokens": 798219897.0, "sample_num_tokens": 7359.75, "step": 2945, "total_num_tokens": 798249336.0, "z_loss": 0.0006785290315747261 }, { "copy_logits_max": -1.1302111148834229, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.125, "epoch": 0.6016849629818739, "gen_logits_max": 5.989686012268066, "gen_logits_mean": -12.157798767089844, "gen_logits_min": -24.064489364624023, "gen_logits_std": 2.5601518154144287, "gen_loss": 0.2940877079963684, "grad_norm": 0.41762457852429, "learning_rate": 2.720842105263158e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9952730089426041, "mean_gen_accuracy": 0.8649439811706543, "mean_token_accuracy": 0.8987978845834732, "num_tokens": 798485545.0, "sample_num_tokens": 9011.75, "step": 2946, "total_num_tokens": 798521592.0, "z_loss": 0.0007153598126024008 }, { "copy_logits_max": -3.6787643432617188, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.25, "epoch": 0.6018892009190707, "gen_logits_max": 5.530994415283203, "gen_logits_mean": -12.680261611938477, "gen_logits_min": -23.98202896118164, "gen_logits_std": 2.461395740509033, "gen_loss": 0.3217991292476654, "grad_norm": 0.44531445357031507, "learning_rate": 2.7207157894736845e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9948401600122452, "mean_gen_accuracy": 0.8602825254201889, "mean_token_accuracy": 0.8984877020120621, "num_tokens": 798775255.0, "sample_num_tokens": 8399.75, "step": 2947, "total_num_tokens": 798808854.0, "z_loss": 0.0007150991586968303 }, { "copy_logits_max": -3.094026565551758, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5625, "epoch": 0.6020934388562675, "gen_logits_max": 5.439080238342285, "gen_logits_mean": -13.050945281982422, "gen_logits_min": -24.527057647705078, "gen_logits_std": 2.5667853355407715, "gen_loss": 0.29745933413505554, "grad_norm": 0.42873662519863814, "learning_rate": 2.7205894736842106e-05, "loss": 0.3157, "mean_copy_accuracy": 0.994828537106514, "mean_gen_accuracy": 0.8641713261604309, "mean_token_accuracy": 0.8952789008617401, "num_tokens": 799038178.0, "sample_num_tokens": 8143.0, "step": 2948, "total_num_tokens": 799070750.0, "z_loss": 0.0006339907995425165 }, { "copy_logits_max": -3.2269835472106934, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.125, "epoch": 0.6022976767934644, "gen_logits_max": 4.752053260803223, "gen_logits_mean": -13.3661527633667, "gen_logits_min": -25.014694213867188, "gen_logits_std": 2.551218032836914, "gen_loss": 0.29822367429733276, "grad_norm": 0.42572082787023263, "learning_rate": 2.720463157894737e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9943665117025375, "mean_gen_accuracy": 0.8635549992322922, "mean_token_accuracy": 0.9006237685680389, "num_tokens": 799321069.0, "sample_num_tokens": 8740.25, "step": 2949, "total_num_tokens": 799356030.0, "z_loss": 0.0006933205295354128 }, { "copy_logits_max": -3.954699993133545, "copy_logits_min": -687500032.0, "copy_num_tokens": 657.1875, "epoch": 0.6025019147306612, "gen_logits_max": 5.12076473236084, "gen_logits_mean": -13.473466873168945, "gen_logits_min": -25.319908142089844, "gen_logits_std": 2.58170747756958, "gen_loss": 0.31317174434661865, "grad_norm": 0.4221576930186429, "learning_rate": 2.720336842105263e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9945573657751083, "mean_gen_accuracy": 0.8606978505849838, "mean_token_accuracy": 0.8956866264343262, "num_tokens": 799597071.0, "sample_num_tokens": 10045.75, "step": 2950, "total_num_tokens": 799637254.0, "z_loss": 0.0007085517863743007 }, { "copy_logits_max": -2.7906014919281006, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.5, "epoch": 0.6027061526678581, "gen_logits_max": 5.684385299682617, "gen_logits_mean": -12.568010330200195, "gen_logits_min": -24.384859085083008, "gen_logits_std": 2.599565029144287, "gen_loss": 0.2775668203830719, "grad_norm": 0.42923662054928574, "learning_rate": 2.7202105263157896e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9958536475896835, "mean_gen_accuracy": 0.8737820833921432, "mean_token_accuracy": 0.9067584425210953, "num_tokens": 799887540.0, "sample_num_tokens": 8886.0, "step": 2951, "total_num_tokens": 799923084.0, "z_loss": 0.000732912274543196 }, { "copy_logits_max": -3.958484649658203, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.6875, "epoch": 0.6029103906050549, "gen_logits_max": 4.783252716064453, "gen_logits_mean": -14.348812103271484, "gen_logits_min": -26.02652359008789, "gen_logits_std": 2.5614194869995117, "gen_loss": 0.289104700088501, "grad_norm": 0.4465277821648845, "learning_rate": 2.7200842105263156e-05, "loss": 0.307, "mean_copy_accuracy": 0.9949766397476196, "mean_gen_accuracy": 0.8662036657333374, "mean_token_accuracy": 0.8967991471290588, "num_tokens": 800150309.0, "sample_num_tokens": 8232.25, "step": 2952, "total_num_tokens": 800183238.0, "z_loss": 0.0006652357405982912 }, { "copy_logits_max": -2.6009693145751953, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.4375, "epoch": 0.6031146285422517, "gen_logits_max": 6.149301052093506, "gen_logits_mean": -11.879341125488281, "gen_logits_min": -23.591632843017578, "gen_logits_std": 2.502131223678589, "gen_loss": 0.3556259572505951, "grad_norm": 0.4287841906794724, "learning_rate": 2.719957894736842e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9943422675132751, "mean_gen_accuracy": 0.8747156113386154, "mean_token_accuracy": 0.9023543894290924, "num_tokens": 800438421.0, "sample_num_tokens": 8889.75, "step": 2953, "total_num_tokens": 800473980.0, "z_loss": 0.0008526432211510837 }, { "copy_logits_max": -3.92653751373291, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.75, "epoch": 0.6033188664794485, "gen_logits_max": 5.489063262939453, "gen_logits_mean": -14.456987380981445, "gen_logits_min": -26.07745361328125, "gen_logits_std": 2.5703680515289307, "gen_loss": 0.3507915139198303, "grad_norm": 0.44106900999430154, "learning_rate": 2.719831578947368e-05, "loss": 0.3135, "mean_copy_accuracy": 0.992513507604599, "mean_gen_accuracy": 0.8656027466058731, "mean_token_accuracy": 0.8944689780473709, "num_tokens": 800680127.0, "sample_num_tokens": 7489.25, "step": 2954, "total_num_tokens": 800710084.0, "z_loss": 0.0008440202218480408 }, { "copy_logits_max": -4.058786392211914, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.1875, "epoch": 0.6035231044166454, "gen_logits_max": 4.74022102355957, "gen_logits_mean": -14.34012222290039, "gen_logits_min": -26.532245635986328, "gen_logits_std": 2.617004871368408, "gen_loss": 0.25497496128082275, "grad_norm": 0.428105959543181, "learning_rate": 2.719705263157895e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9944389909505844, "mean_gen_accuracy": 0.870499461889267, "mean_token_accuracy": 0.9064838886260986, "num_tokens": 800965616.0, "sample_num_tokens": 9126.5, "step": 2955, "total_num_tokens": 801002122.0, "z_loss": 0.0006441624136641622 }, { "copy_logits_max": -2.3821938037872314, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.4375, "epoch": 0.6037273423538422, "gen_logits_max": 5.174678802490234, "gen_logits_mean": -13.265554428100586, "gen_logits_min": -24.97478675842285, "gen_logits_std": 2.5436642169952393, "gen_loss": 0.31715917587280273, "grad_norm": 0.46592899393599335, "learning_rate": 2.7195789473684214e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9917886406183243, "mean_gen_accuracy": 0.8627842664718628, "mean_token_accuracy": 0.8923493176698685, "num_tokens": 801209609.0, "sample_num_tokens": 7871.25, "step": 2956, "total_num_tokens": 801241094.0, "z_loss": 0.000763840798754245 }, { "copy_logits_max": -2.22139835357666, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.0625, "epoch": 0.6039315802910391, "gen_logits_max": 5.303520202636719, "gen_logits_mean": -12.134625434875488, "gen_logits_min": -23.859176635742188, "gen_logits_std": 2.530097007751465, "gen_loss": 0.30732035636901855, "grad_norm": 0.45194301099961076, "learning_rate": 2.7194526315789475e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9961972236633301, "mean_gen_accuracy": 0.8613824248313904, "mean_token_accuracy": 0.8953805267810822, "num_tokens": 801471440.0, "sample_num_tokens": 8167.0, "step": 2957, "total_num_tokens": 801504108.0, "z_loss": 0.0007618311792612076 }, { "copy_logits_max": -4.142560958862305, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.125, "epoch": 0.6041358182282359, "gen_logits_max": 4.6938276290893555, "gen_logits_mean": -13.09886646270752, "gen_logits_min": -24.40186309814453, "gen_logits_std": 2.529374361038208, "gen_loss": 0.27010321617126465, "grad_norm": 0.45236013509439255, "learning_rate": 2.719326315789474e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9931409060955048, "mean_gen_accuracy": 0.8675450086593628, "mean_token_accuracy": 0.8970039784908295, "num_tokens": 801727924.0, "sample_num_tokens": 9000.5, "step": 2958, "total_num_tokens": 801763926.0, "z_loss": 0.0007128031575120986 }, { "copy_logits_max": -4.5397257804870605, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.875, "epoch": 0.6043400561654327, "gen_logits_max": 5.047885894775391, "gen_logits_mean": -13.864370346069336, "gen_logits_min": -25.187923431396484, "gen_logits_std": 2.5382165908813477, "gen_loss": 0.2930018901824951, "grad_norm": 0.4358714516401734, "learning_rate": 2.7192e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9950868338346481, "mean_gen_accuracy": 0.8666741251945496, "mean_token_accuracy": 0.8967649936676025, "num_tokens": 801987769.0, "sample_num_tokens": 7719.75, "step": 2959, "total_num_tokens": 802018648.0, "z_loss": 0.0007041231729090214 }, { "copy_logits_max": -5.846164703369141, "copy_logits_min": -625000000.0, "copy_num_tokens": 485.0, "epoch": 0.6045442941026296, "gen_logits_max": 5.299108505249023, "gen_logits_mean": -13.116739273071289, "gen_logits_min": -24.31485366821289, "gen_logits_std": 2.5250134468078613, "gen_loss": 0.3064776659011841, "grad_norm": 0.5135793072585082, "learning_rate": 2.7190736842105264e-05, "loss": 0.3254, "mean_copy_accuracy": 0.9953896552324295, "mean_gen_accuracy": 0.861362636089325, "mean_token_accuracy": 0.8927479088306427, "num_tokens": 802256281.0, "sample_num_tokens": 8678.25, "step": 2960, "total_num_tokens": 802290994.0, "z_loss": 0.0007515873876400292 }, { "copy_logits_max": -2.8011746406555176, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.8125, "epoch": 0.6047485320398264, "gen_logits_max": 4.611572265625, "gen_logits_mean": -13.40585994720459, "gen_logits_min": -25.197118759155273, "gen_logits_std": 2.5829224586486816, "gen_loss": 0.2782752513885498, "grad_norm": 0.44083476220442974, "learning_rate": 2.7189473684210525e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9935502111911774, "mean_gen_accuracy": 0.8743709474802017, "mean_token_accuracy": 0.907937079668045, "num_tokens": 802529510.0, "sample_num_tokens": 8017.5, "step": 2961, "total_num_tokens": 802561580.0, "z_loss": 0.0006807752652093768 }, { "copy_logits_max": -3.258936882019043, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.5625, "epoch": 0.6049527699770232, "gen_logits_max": 5.842219352722168, "gen_logits_mean": -12.469762802124023, "gen_logits_min": -24.280155181884766, "gen_logits_std": 2.5233941078186035, "gen_loss": 0.3174958825111389, "grad_norm": 0.4278856339901043, "learning_rate": 2.718821052631579e-05, "loss": 0.2956, "mean_copy_accuracy": 0.9944543987512589, "mean_gen_accuracy": 0.8685039430856705, "mean_token_accuracy": 0.9002287536859512, "num_tokens": 802803886.0, "sample_num_tokens": 7655.5, "step": 2962, "total_num_tokens": 802834508.0, "z_loss": 0.0007336490089073777 }, { "copy_logits_max": -4.1076836585998535, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.4375, "epoch": 0.6051570079142201, "gen_logits_max": 6.28829288482666, "gen_logits_mean": -10.657796859741211, "gen_logits_min": -22.756261825561523, "gen_logits_std": 2.577038526535034, "gen_loss": 0.2934805452823639, "grad_norm": 0.5458279240241517, "learning_rate": 2.7186947368421054e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9950489550828934, "mean_gen_accuracy": 0.857262060046196, "mean_token_accuracy": 0.8937670141458511, "num_tokens": 803084049.0, "sample_num_tokens": 10451.25, "step": 2963, "total_num_tokens": 803125854.0, "z_loss": 0.000703562400303781 }, { "copy_logits_max": -5.248132228851318, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.5625, "epoch": 0.6053612458514169, "gen_logits_max": 4.611878395080566, "gen_logits_mean": -14.623482704162598, "gen_logits_min": -26.070302963256836, "gen_logits_std": 2.5899975299835205, "gen_loss": 0.2584841847419739, "grad_norm": 0.4375884917320133, "learning_rate": 2.7185684210526318e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9953732192516327, "mean_gen_accuracy": 0.8725467920303345, "mean_token_accuracy": 0.9030013382434845, "num_tokens": 803345398.0, "sample_num_tokens": 7682.5, "step": 2964, "total_num_tokens": 803376128.0, "z_loss": 0.0006196598405949771 }, { "copy_logits_max": -3.411508560180664, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.1875, "epoch": 0.6055654837886137, "gen_logits_max": 4.4779839515686035, "gen_logits_mean": -15.284022331237793, "gen_logits_min": -27.017305374145508, "gen_logits_std": 2.611898899078369, "gen_loss": 0.32615771889686584, "grad_norm": 0.3972902838971126, "learning_rate": 2.718442105263158e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9964925199747086, "mean_gen_accuracy": 0.8647376149892807, "mean_token_accuracy": 0.9034087955951691, "num_tokens": 803622976.0, "sample_num_tokens": 8068.5, "step": 2965, "total_num_tokens": 803655250.0, "z_loss": 0.0007703750161454082 }, { "copy_logits_max": -4.264089584350586, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.875, "epoch": 0.6057697217258106, "gen_logits_max": 5.20013427734375, "gen_logits_mean": -13.215312957763672, "gen_logits_min": -25.027931213378906, "gen_logits_std": 2.5174694061279297, "gen_loss": 0.2786318063735962, "grad_norm": 0.4516680111919596, "learning_rate": 2.7183157894736843e-05, "loss": 0.327, "mean_copy_accuracy": 0.9947241842746735, "mean_gen_accuracy": 0.8617022037506104, "mean_token_accuracy": 0.894509494304657, "num_tokens": 803895003.0, "sample_num_tokens": 8859.25, "step": 2966, "total_num_tokens": 803930440.0, "z_loss": 0.0005922153359279037 }, { "copy_logits_max": -4.682278633117676, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.625, "epoch": 0.6059739596630074, "gen_logits_max": 5.527131080627441, "gen_logits_mean": -13.277111053466797, "gen_logits_min": -24.80300521850586, "gen_logits_std": 2.5762274265289307, "gen_loss": 0.3174665570259094, "grad_norm": 0.4690561556561224, "learning_rate": 2.7181894736842104e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9937593042850494, "mean_gen_accuracy": 0.8774090558290482, "mean_token_accuracy": 0.9001499712467194, "num_tokens": 804147611.0, "sample_num_tokens": 7073.25, "step": 2967, "total_num_tokens": 804175904.0, "z_loss": 0.0006903600879013538 }, { "copy_logits_max": -3.3890695571899414, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.8125, "epoch": 0.6061781976002042, "gen_logits_max": 4.853716850280762, "gen_logits_mean": -13.04449462890625, "gen_logits_min": -25.111045837402344, "gen_logits_std": 2.637578010559082, "gen_loss": 0.3179880976676941, "grad_norm": 0.4223741700644296, "learning_rate": 2.718063157894737e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9926692843437195, "mean_gen_accuracy": 0.8639971613883972, "mean_token_accuracy": 0.8933022171258926, "num_tokens": 804396628.0, "sample_num_tokens": 8549.0, "step": 2968, "total_num_tokens": 804430824.0, "z_loss": 0.0007292301161214709 }, { "copy_logits_max": -4.38492488861084, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.5625, "epoch": 0.6063824355374011, "gen_logits_max": 5.603152275085449, "gen_logits_mean": -13.269500732421875, "gen_logits_min": -24.68606185913086, "gen_logits_std": 2.591763973236084, "gen_loss": 0.3564622700214386, "grad_norm": 0.4087298189897757, "learning_rate": 2.7179368421052633e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9959379732608795, "mean_gen_accuracy": 0.8608033359050751, "mean_token_accuracy": 0.8975378572940826, "num_tokens": 804682905.0, "sample_num_tokens": 9304.75, "step": 2969, "total_num_tokens": 804720124.0, "z_loss": 0.0008415093179792166 }, { "copy_logits_max": -4.068939208984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.6875, "epoch": 0.6065866734745979, "gen_logits_max": 5.318314552307129, "gen_logits_mean": -12.454059600830078, "gen_logits_min": -24.204959869384766, "gen_logits_std": 2.558046817779541, "gen_loss": 0.31234315037727356, "grad_norm": 0.4362647731137854, "learning_rate": 2.7178105263157894e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9956402033567429, "mean_gen_accuracy": 0.8655030429363251, "mean_token_accuracy": 0.8988660722970963, "num_tokens": 804974536.0, "sample_num_tokens": 8994.0, "step": 2970, "total_num_tokens": 805010512.0, "z_loss": 0.0007764284964650869 }, { "copy_logits_max": -5.804656028747559, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.625, "epoch": 0.6067909114117948, "gen_logits_max": 4.726593971252441, "gen_logits_mean": -15.02389907836914, "gen_logits_min": -26.533401489257812, "gen_logits_std": 2.5847244262695312, "gen_loss": 0.30602145195007324, "grad_norm": 0.4542982862010669, "learning_rate": 2.717684210526316e-05, "loss": 0.3391, "mean_copy_accuracy": 0.9947459995746613, "mean_gen_accuracy": 0.8518044352531433, "mean_token_accuracy": 0.8864514529705048, "num_tokens": 805245954.0, "sample_num_tokens": 8648.0, "step": 2971, "total_num_tokens": 805280546.0, "z_loss": 0.000678928685374558 }, { "copy_logits_max": -5.754271030426025, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.5, "epoch": 0.6069951493489916, "gen_logits_max": 5.337053298950195, "gen_logits_mean": -12.962757110595703, "gen_logits_min": -24.398502349853516, "gen_logits_std": 2.5720081329345703, "gen_loss": 0.3176417648792267, "grad_norm": 0.44071243520140807, "learning_rate": 2.7175578947368422e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9940944463014603, "mean_gen_accuracy": 0.8664510399103165, "mean_token_accuracy": 0.8971105515956879, "num_tokens": 805528399.0, "sample_num_tokens": 7861.25, "step": 2972, "total_num_tokens": 805559844.0, "z_loss": 0.0007162087131291628 }, { "copy_logits_max": -3.8468616008758545, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.375, "epoch": 0.6071993872861884, "gen_logits_max": 4.792011260986328, "gen_logits_mean": -14.277793884277344, "gen_logits_min": -25.524131774902344, "gen_logits_std": 2.56715726852417, "gen_loss": 0.30710369348526, "grad_norm": 0.5337253337071954, "learning_rate": 2.7174315789473687e-05, "loss": 0.314, "mean_copy_accuracy": 0.9936568588018417, "mean_gen_accuracy": 0.8674641251564026, "mean_token_accuracy": 0.8931988030672073, "num_tokens": 805782313.0, "sample_num_tokens": 8120.25, "step": 2973, "total_num_tokens": 805814794.0, "z_loss": 0.0007411973783746362 }, { "copy_logits_max": -3.4329867362976074, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.875, "epoch": 0.6074036252233852, "gen_logits_max": 4.972500801086426, "gen_logits_mean": -13.621063232421875, "gen_logits_min": -25.092411041259766, "gen_logits_std": 2.5339536666870117, "gen_loss": 0.35306257009506226, "grad_norm": 0.4593002686823523, "learning_rate": 2.7173052631578948e-05, "loss": 0.3433, "mean_copy_accuracy": 0.9941136986017227, "mean_gen_accuracy": 0.8576588779687881, "mean_token_accuracy": 0.889173611998558, "num_tokens": 806046543.0, "sample_num_tokens": 9422.75, "step": 2974, "total_num_tokens": 806084234.0, "z_loss": 0.000832511461339891 }, { "copy_logits_max": -3.564816474914551, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.9375, "epoch": 0.6076078631605821, "gen_logits_max": 5.971823692321777, "gen_logits_mean": -12.305258750915527, "gen_logits_min": -23.85061264038086, "gen_logits_std": 2.5732266902923584, "gen_loss": 0.32562652230262756, "grad_norm": 0.46790376705345216, "learning_rate": 2.7171789473684212e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9931921064853668, "mean_gen_accuracy": 0.8658370971679688, "mean_token_accuracy": 0.9014099985361099, "num_tokens": 806321551.0, "sample_num_tokens": 8747.25, "step": 2975, "total_num_tokens": 806356540.0, "z_loss": 0.0008029098971746862 }, { "copy_logits_max": -5.501537799835205, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.625, "epoch": 0.607812101097779, "gen_logits_max": 5.672355651855469, "gen_logits_mean": -12.91785717010498, "gen_logits_min": -24.42967987060547, "gen_logits_std": 2.5279672145843506, "gen_loss": 0.32731160521507263, "grad_norm": 0.43558905474740706, "learning_rate": 2.7170526315789473e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9937451630830765, "mean_gen_accuracy": 0.8641991168260574, "mean_token_accuracy": 0.8950270861387253, "num_tokens": 806603920.0, "sample_num_tokens": 9078.0, "step": 2976, "total_num_tokens": 806640232.0, "z_loss": 0.0007609215099364519 }, { "copy_logits_max": -3.9446206092834473, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.5625, "epoch": 0.6080163390349758, "gen_logits_max": 5.010025501251221, "gen_logits_mean": -11.731897354125977, "gen_logits_min": -24.044918060302734, "gen_logits_std": 2.6249873638153076, "gen_loss": 0.2832818329334259, "grad_norm": 0.4707908655143927, "learning_rate": 2.7169263157894737e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9949481636285782, "mean_gen_accuracy": 0.859903022646904, "mean_token_accuracy": 0.8979920595884323, "num_tokens": 806881277.0, "sample_num_tokens": 9119.75, "step": 2977, "total_num_tokens": 806917756.0, "z_loss": 0.0007343130419030786 }, { "copy_logits_max": -4.690462112426758, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.75, "epoch": 0.6082205769721726, "gen_logits_max": 5.6106672286987305, "gen_logits_mean": -12.825591087341309, "gen_logits_min": -23.943023681640625, "gen_logits_std": 2.5624427795410156, "gen_loss": 0.33032089471817017, "grad_norm": 0.4193917222745211, "learning_rate": 2.7167999999999998e-05, "loss": 0.3342, "mean_copy_accuracy": 0.9934539645910263, "mean_gen_accuracy": 0.861533522605896, "mean_token_accuracy": 0.890818402171135, "num_tokens": 807168301.0, "sample_num_tokens": 9640.75, "step": 2978, "total_num_tokens": 807206864.0, "z_loss": 0.0007570956950075924 }, { "copy_logits_max": -3.960838794708252, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.4375, "epoch": 0.6084248149093694, "gen_logits_max": 4.97163724899292, "gen_logits_mean": -13.561212539672852, "gen_logits_min": -24.939668655395508, "gen_logits_std": 2.572322368621826, "gen_loss": 0.3222612142562866, "grad_norm": 0.4690430823850682, "learning_rate": 2.7166736842105266e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9943415820598602, "mean_gen_accuracy": 0.8604343980550766, "mean_token_accuracy": 0.8967832922935486, "num_tokens": 807447079.0, "sample_num_tokens": 8554.75, "step": 2979, "total_num_tokens": 807481298.0, "z_loss": 0.0007900646887719631 }, { "copy_logits_max": -4.452474594116211, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.5, "epoch": 0.6086290528465662, "gen_logits_max": 5.620530605316162, "gen_logits_mean": -12.681323051452637, "gen_logits_min": -23.978683471679688, "gen_logits_std": 2.543327808380127, "gen_loss": 0.36152809858322144, "grad_norm": 0.4221576297660071, "learning_rate": 2.7165473684210527e-05, "loss": 0.3052, "mean_copy_accuracy": 0.994587853550911, "mean_gen_accuracy": 0.8690318912267685, "mean_token_accuracy": 0.9013085812330246, "num_tokens": 807711819.0, "sample_num_tokens": 8588.75, "step": 2980, "total_num_tokens": 807746174.0, "z_loss": 0.000819275388494134 }, { "copy_logits_max": -2.346257448196411, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.1875, "epoch": 0.6088332907837631, "gen_logits_max": 4.166155815124512, "gen_logits_mean": -14.247328758239746, "gen_logits_min": -26.064973831176758, "gen_logits_std": 2.606743097305298, "gen_loss": 0.2860882878303528, "grad_norm": 0.4403420053249128, "learning_rate": 2.716421052631579e-05, "loss": 0.2979, "mean_copy_accuracy": 0.995767280459404, "mean_gen_accuracy": 0.8640819638967514, "mean_token_accuracy": 0.9038504958152771, "num_tokens": 808012380.0, "sample_num_tokens": 9139.0, "step": 2981, "total_num_tokens": 808048936.0, "z_loss": 0.0006560584297403693 }, { "copy_logits_max": -4.758455276489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5625, "epoch": 0.60903752872096, "gen_logits_max": 4.627306938171387, "gen_logits_mean": -14.104283332824707, "gen_logits_min": -25.17066192626953, "gen_logits_std": 2.505608081817627, "gen_loss": 0.32311853766441345, "grad_norm": 0.4365461451886633, "learning_rate": 2.7162947368421055e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9946006089448929, "mean_gen_accuracy": 0.8605962097644806, "mean_token_accuracy": 0.8941957354545593, "num_tokens": 808276084.0, "sample_num_tokens": 9085.0, "step": 2982, "total_num_tokens": 808312424.0, "z_loss": 0.0007364319171756506 }, { "copy_logits_max": -4.267033576965332, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.875, "epoch": 0.6092417666581568, "gen_logits_max": 4.380715370178223, "gen_logits_mean": -13.967196464538574, "gen_logits_min": -25.321147918701172, "gen_logits_std": 2.539818286895752, "gen_loss": 0.3159247636795044, "grad_norm": 0.44331383190565593, "learning_rate": 2.7161684210526316e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9934639036655426, "mean_gen_accuracy": 0.8651856333017349, "mean_token_accuracy": 0.8969209939241409, "num_tokens": 808563261.0, "sample_num_tokens": 7785.25, "step": 2983, "total_num_tokens": 808594402.0, "z_loss": 0.0006719417287968099 }, { "copy_logits_max": -2.7224490642547607, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.9375, "epoch": 0.6094460045953536, "gen_logits_max": 3.5743625164031982, "gen_logits_mean": -14.805351257324219, "gen_logits_min": -26.165462493896484, "gen_logits_std": 2.539759635925293, "gen_loss": 0.308698445558548, "grad_norm": 0.4244200648029609, "learning_rate": 2.716042105263158e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9948277026414871, "mean_gen_accuracy": 0.8601116687059402, "mean_token_accuracy": 0.8951181322336197, "num_tokens": 808832577.0, "sample_num_tokens": 9260.25, "step": 2984, "total_num_tokens": 808869618.0, "z_loss": 0.0007302434532903135 }, { "copy_logits_max": -4.1044511795043945, "copy_logits_min": -687499968.0, "copy_num_tokens": 527.0, "epoch": 0.6096502425325504, "gen_logits_max": 4.816112518310547, "gen_logits_mean": -13.915960311889648, "gen_logits_min": -24.74112892150879, "gen_logits_std": 2.493309497833252, "gen_loss": 0.24587345123291016, "grad_norm": 0.4270064533334166, "learning_rate": 2.715915789473684e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9937587976455688, "mean_gen_accuracy": 0.8739401698112488, "mean_token_accuracy": 0.9014714658260345, "num_tokens": 809093700.0, "sample_num_tokens": 9027.0, "step": 2985, "total_num_tokens": 809129808.0, "z_loss": 0.0005875751376152039 }, { "copy_logits_max": -5.762951850891113, "copy_logits_min": -687500032.0, "copy_num_tokens": 456.1875, "epoch": 0.6098544804697472, "gen_logits_max": 5.575437545776367, "gen_logits_mean": -12.434553146362305, "gen_logits_min": -23.405529022216797, "gen_logits_std": 2.4713215827941895, "gen_loss": 0.293682336807251, "grad_norm": 0.49297451721317154, "learning_rate": 2.7157894736842106e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9948503822088242, "mean_gen_accuracy": 0.8628200590610504, "mean_token_accuracy": 0.8948867470026016, "num_tokens": 809363354.0, "sample_num_tokens": 8652.5, "step": 2986, "total_num_tokens": 809397964.0, "z_loss": 0.0007238360121846199 }, { "copy_logits_max": -3.5314979553222656, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.875, "epoch": 0.6100587184069441, "gen_logits_max": 4.665497303009033, "gen_logits_mean": -14.135181427001953, "gen_logits_min": -25.452434539794922, "gen_logits_std": 2.535165786743164, "gen_loss": 0.34074854850769043, "grad_norm": 0.47989577991741755, "learning_rate": 2.715663157894737e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9927189648151398, "mean_gen_accuracy": 0.8606235235929489, "mean_token_accuracy": 0.8937096148729324, "num_tokens": 809630298.0, "sample_num_tokens": 7498.0, "step": 2987, "total_num_tokens": 809660290.0, "z_loss": 0.0007510732393711805 }, { "copy_logits_max": -5.468548774719238, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.625, "epoch": 0.610262956344141, "gen_logits_max": 5.102050304412842, "gen_logits_mean": -13.84250545501709, "gen_logits_min": -24.408390045166016, "gen_logits_std": 2.4545295238494873, "gen_loss": 0.3390139043331146, "grad_norm": 0.41726229686171545, "learning_rate": 2.7155368421052634e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9946044981479645, "mean_gen_accuracy": 0.8656208366155624, "mean_token_accuracy": 0.8974706381559372, "num_tokens": 809902251.0, "sample_num_tokens": 6701.75, "step": 2988, "total_num_tokens": 809929058.0, "z_loss": 0.0007539411308243871 }, { "copy_logits_max": -5.010071754455566, "copy_logits_min": -687500032.0, "copy_num_tokens": 315.0625, "epoch": 0.6104671942813378, "gen_logits_max": 5.240386009216309, "gen_logits_mean": -12.815803527832031, "gen_logits_min": -23.450735092163086, "gen_logits_std": 2.424302577972412, "gen_loss": 0.3174704313278198, "grad_norm": 0.41393204773097814, "learning_rate": 2.7154105263157895e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9951478242874146, "mean_gen_accuracy": 0.8698462694883347, "mean_token_accuracy": 0.8985612690448761, "num_tokens": 810174019.0, "sample_num_tokens": 6989.25, "step": 2989, "total_num_tokens": 810201976.0, "z_loss": 0.0008370372233912349 }, { "copy_logits_max": -2.6813905239105225, "copy_logits_min": -750000000.0, "copy_num_tokens": 724.4375, "epoch": 0.6106714322185346, "gen_logits_max": 4.007645606994629, "gen_logits_mean": -13.47210693359375, "gen_logits_min": -24.385995864868164, "gen_logits_std": 2.4503912925720215, "gen_loss": 0.33024096488952637, "grad_norm": 0.43958292297752866, "learning_rate": 2.715284210526316e-05, "loss": 0.333, "mean_copy_accuracy": 0.9946060925722122, "mean_gen_accuracy": 0.8556906580924988, "mean_token_accuracy": 0.8926662057638168, "num_tokens": 810421455.0, "sample_num_tokens": 9809.25, "step": 2990, "total_num_tokens": 810460692.0, "z_loss": 0.0008526133024133742 }, { "copy_logits_max": -3.1496963500976562, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.625, "epoch": 0.6108756701557314, "gen_logits_max": 4.434242248535156, "gen_logits_mean": -14.113264083862305, "gen_logits_min": -25.038394927978516, "gen_logits_std": 2.474792242050171, "gen_loss": 0.29275211691856384, "grad_norm": 0.44003473100927626, "learning_rate": 2.715157894736842e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9943994879722595, "mean_gen_accuracy": 0.86088427901268, "mean_token_accuracy": 0.8938403427600861, "num_tokens": 810686127.0, "sample_num_tokens": 7630.75, "step": 2991, "total_num_tokens": 810716650.0, "z_loss": 0.0007609567255713046 }, { "copy_logits_max": -4.281445503234863, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.375, "epoch": 0.6110799080929282, "gen_logits_max": 4.009361743927002, "gen_logits_mean": -15.146150588989258, "gen_logits_min": -25.965917587280273, "gen_logits_std": 2.4759340286254883, "gen_loss": 0.31267207860946655, "grad_norm": 0.43494599450333216, "learning_rate": 2.7150315789473685e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9941556453704834, "mean_gen_accuracy": 0.8628238588571548, "mean_token_accuracy": 0.8929636180400848, "num_tokens": 810949024.0, "sample_num_tokens": 8009.5, "step": 2992, "total_num_tokens": 810981062.0, "z_loss": 0.0007152510806918144 }, { "copy_logits_max": -2.0897507667541504, "copy_logits_min": -750000064.0, "copy_num_tokens": 454.875, "epoch": 0.611284146030125, "gen_logits_max": 4.841577529907227, "gen_logits_mean": -12.338203430175781, "gen_logits_min": -23.582277297973633, "gen_logits_std": 2.5040478706359863, "gen_loss": 0.32195135951042175, "grad_norm": 0.7428865586877434, "learning_rate": 2.7149052631578946e-05, "loss": 0.305, "mean_copy_accuracy": 0.9940050691366196, "mean_gen_accuracy": 0.8667640686035156, "mean_token_accuracy": 0.8998347669839859, "num_tokens": 811213191.0, "sample_num_tokens": 7516.25, "step": 2993, "total_num_tokens": 811243256.0, "z_loss": 0.0008388285059481859 }, { "copy_logits_max": -3.551002025604248, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.6875, "epoch": 0.611488383967322, "gen_logits_max": 4.387421131134033, "gen_logits_mean": -13.370260238647461, "gen_logits_min": -24.135765075683594, "gen_logits_std": 2.4548768997192383, "gen_loss": 0.28458285331726074, "grad_norm": 0.39959494052761, "learning_rate": 2.714778947368421e-05, "loss": 0.2983, "mean_copy_accuracy": 0.994673103094101, "mean_gen_accuracy": 0.8699089586734772, "mean_token_accuracy": 0.9013354480266571, "num_tokens": 811505332.0, "sample_num_tokens": 8366.0, "step": 2994, "total_num_tokens": 811538796.0, "z_loss": 0.000725637364666909 }, { "copy_logits_max": -2.780062437057495, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0625, "epoch": 0.6116926219045188, "gen_logits_max": 4.4293928146362305, "gen_logits_mean": -12.184286117553711, "gen_logits_min": -22.980735778808594, "gen_logits_std": 2.4319357872009277, "gen_loss": 0.29140645265579224, "grad_norm": 0.4105776148074117, "learning_rate": 2.7146526315789474e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9951076656579971, "mean_gen_accuracy": 0.8583104312419891, "mean_token_accuracy": 0.8921440690755844, "num_tokens": 811781982.0, "sample_num_tokens": 8549.5, "step": 2995, "total_num_tokens": 811816180.0, "z_loss": 0.0007229420589283109 }, { "copy_logits_max": -3.9569637775421143, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.5625, "epoch": 0.6118968598417156, "gen_logits_max": 4.876266956329346, "gen_logits_mean": -13.425786972045898, "gen_logits_min": -24.218069076538086, "gen_logits_std": 2.4440810680389404, "gen_loss": 0.32349467277526855, "grad_norm": 0.4556606520306217, "learning_rate": 2.714526315789474e-05, "loss": 0.3142, "mean_copy_accuracy": 0.993724524974823, "mean_gen_accuracy": 0.8686117231845856, "mean_token_accuracy": 0.8966941684484482, "num_tokens": 812028884.0, "sample_num_tokens": 7756.5, "step": 2996, "total_num_tokens": 812059910.0, "z_loss": 0.0007619456155225635 }, { "copy_logits_max": -4.382645606994629, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.25, "epoch": 0.6121010977789124, "gen_logits_max": 4.140723705291748, "gen_logits_mean": -14.549476623535156, "gen_logits_min": -25.05230712890625, "gen_logits_std": 2.4500160217285156, "gen_loss": 0.31658250093460083, "grad_norm": 0.425035851179577, "learning_rate": 2.7144000000000003e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9952617734670639, "mean_gen_accuracy": 0.8638221621513367, "mean_token_accuracy": 0.8966668844223022, "num_tokens": 812286101.0, "sample_num_tokens": 9373.75, "step": 2997, "total_num_tokens": 812323596.0, "z_loss": 0.0006623931112699211 }, { "copy_logits_max": -3.5812418460845947, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.0, "epoch": 0.6123053357161092, "gen_logits_max": 5.055791854858398, "gen_logits_mean": -12.456436157226562, "gen_logits_min": -23.05227279663086, "gen_logits_std": 2.41756534576416, "gen_loss": 0.4051188826560974, "grad_norm": 0.44569049421845663, "learning_rate": 2.7142736842105264e-05, "loss": 0.3305, "mean_copy_accuracy": 0.9953272938728333, "mean_gen_accuracy": 0.8561907857656479, "mean_token_accuracy": 0.8913374245166779, "num_tokens": 812561561.0, "sample_num_tokens": 7980.75, "step": 2998, "total_num_tokens": 812593484.0, "z_loss": 0.0008802780066616833 }, { "copy_logits_max": -3.261042356491089, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.75, "epoch": 0.612509573653306, "gen_logits_max": 3.9783616065979004, "gen_logits_mean": -14.483785629272461, "gen_logits_min": -25.40866470336914, "gen_logits_std": 2.482487201690674, "gen_loss": 0.328987181186676, "grad_norm": 0.43844275572743496, "learning_rate": 2.7141473684210528e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9936820864677429, "mean_gen_accuracy": 0.8656584471464157, "mean_token_accuracy": 0.8988614231348038, "num_tokens": 812847467.0, "sample_num_tokens": 8163.75, "step": 2999, "total_num_tokens": 812880122.0, "z_loss": 0.0007248710026033223 }, { "epoch": 0.612713811590503, "grad_norm": 0.41642242461850015, "learning_rate": 2.714021052631579e-05, "loss": 0.3133, "step": 3000 }, { "epoch": 0.612713811590503, "eval_copy_logits_max": -5.1385674476623535, "eval_copy_logits_min": -74.57744598388672, "eval_gen_logits_max": 4.237589359283447, "eval_gen_logits_mean": -18.32798194885254, "eval_gen_logits_min": -29.07415199279785, "eval_gen_logits_std": 2.560828685760498, "eval_gen_loss": 0.360080748796463, "eval_loss": 0.3380771279335022, "eval_mean_copy_accuracy": 0.9913989007472992, "eval_mean_gen_accuracy": 0.8698131144046783, "eval_mean_token_accuracy": 0.8853568434715271, "eval_num_tokens": 813134290.0, "eval_runtime": 0.7808, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.561, "eval_total_num_tokens": 813134290.0, "eval_z_loss": 0.0006858622655272484, "step": 3000 }, { "copy_logits_max": -2.275082588195801, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.25, "epoch": 0.6129180495276998, "gen_logits_max": 3.324248790740967, "gen_logits_mean": -14.383867263793945, "gen_logits_min": -25.210477828979492, "gen_logits_std": 2.426462173461914, "gen_loss": 0.2657278776168823, "grad_norm": 0.40302674032470365, "learning_rate": 2.7138947368421054e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9940215274691582, "mean_gen_accuracy": 0.8646709397435188, "mean_token_accuracy": 0.8972183614969254, "num_tokens": 813385024.0, "sample_num_tokens": 8200.0, "step": 3001, "total_num_tokens": 813417824.0, "z_loss": 0.0007025189697742462 }, { "copy_logits_max": -4.4435319900512695, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.4375, "epoch": 0.6131222874648966, "gen_logits_max": 5.086735725402832, "gen_logits_mean": -13.015064239501953, "gen_logits_min": -23.581043243408203, "gen_logits_std": 2.433563232421875, "gen_loss": 0.3564378321170807, "grad_norm": 0.432203493701536, "learning_rate": 2.7137684210526314e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9936317503452301, "mean_gen_accuracy": 0.8656439632177353, "mean_token_accuracy": 0.8962517082691193, "num_tokens": 813668102.0, "sample_num_tokens": 8727.0, "step": 3002, "total_num_tokens": 813703010.0, "z_loss": 0.0007379422313533723 }, { "copy_logits_max": -3.835063934326172, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.25, "epoch": 0.6133265254020934, "gen_logits_max": 4.6600494384765625, "gen_logits_mean": -14.094066619873047, "gen_logits_min": -24.83580207824707, "gen_logits_std": 2.465151071548462, "gen_loss": 0.3579533100128174, "grad_norm": 0.4212557581694276, "learning_rate": 2.713642105263158e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9957849681377411, "mean_gen_accuracy": 0.8610161989927292, "mean_token_accuracy": 0.8969990462064743, "num_tokens": 813948548.0, "sample_num_tokens": 7110.0, "step": 3003, "total_num_tokens": 813976988.0, "z_loss": 0.0007425518124364316 }, { "copy_logits_max": -3.971726417541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.25, "epoch": 0.6135307633392902, "gen_logits_max": 4.6083221435546875, "gen_logits_mean": -12.811067581176758, "gen_logits_min": -23.79629898071289, "gen_logits_std": 2.431036949157715, "gen_loss": 0.31749868392944336, "grad_norm": 0.45586362997615804, "learning_rate": 2.7135157894736843e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9925585091114044, "mean_gen_accuracy": 0.8636544495820999, "mean_token_accuracy": 0.8963803201913834, "num_tokens": 814221202.0, "sample_num_tokens": 8784.0, "step": 3004, "total_num_tokens": 814256338.0, "z_loss": 0.0006749875610694289 }, { "copy_logits_max": -3.9308958053588867, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.3125, "epoch": 0.6137350012764871, "gen_logits_max": 4.243766784667969, "gen_logits_mean": -12.836769104003906, "gen_logits_min": -23.985780715942383, "gen_logits_std": 2.419769763946533, "gen_loss": 0.2991390526294708, "grad_norm": 0.4263063732524035, "learning_rate": 2.7133894736842107e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9953683316707611, "mean_gen_accuracy": 0.8581215292215347, "mean_token_accuracy": 0.8968720436096191, "num_tokens": 814501056.0, "sample_num_tokens": 7568.5, "step": 3005, "total_num_tokens": 814531330.0, "z_loss": 0.0006500178715214133 }, { "copy_logits_max": -2.8708722591400146, "copy_logits_min": -750000000.0, "copy_num_tokens": 684.0625, "epoch": 0.613939239213684, "gen_logits_max": 4.348979949951172, "gen_logits_mean": -13.105560302734375, "gen_logits_min": -24.43785858154297, "gen_logits_std": 2.450965404510498, "gen_loss": 0.2731523811817169, "grad_norm": 0.4451991808869346, "learning_rate": 2.7132631578947368e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9949632883071899, "mean_gen_accuracy": 0.8661757707595825, "mean_token_accuracy": 0.9002400040626526, "num_tokens": 814758065.0, "sample_num_tokens": 10251.25, "step": 3006, "total_num_tokens": 814799070.0, "z_loss": 0.000609540962614119 }, { "copy_logits_max": -5.028058052062988, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.5625, "epoch": 0.6141434771508808, "gen_logits_max": 4.152729034423828, "gen_logits_mean": -13.57116985321045, "gen_logits_min": -24.68994903564453, "gen_logits_std": 2.4675445556640625, "gen_loss": 0.2907322645187378, "grad_norm": 0.45466236294500106, "learning_rate": 2.7131368421052633e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9934304803609848, "mean_gen_accuracy": 0.8672309070825577, "mean_token_accuracy": 0.8955698311328888, "num_tokens": 815012140.0, "sample_num_tokens": 8375.5, "step": 3007, "total_num_tokens": 815045642.0, "z_loss": 0.0006260266527533531 }, { "copy_logits_max": -5.796541690826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.1875, "epoch": 0.6143477150880776, "gen_logits_max": 4.437185287475586, "gen_logits_mean": -15.276998519897461, "gen_logits_min": -26.164478302001953, "gen_logits_std": 2.4952609539031982, "gen_loss": 0.3095609247684479, "grad_norm": 0.42894796174296845, "learning_rate": 2.7130105263157894e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9948862195014954, "mean_gen_accuracy": 0.8638767600059509, "mean_token_accuracy": 0.8955787569284439, "num_tokens": 815290098.0, "sample_num_tokens": 8833.5, "step": 3008, "total_num_tokens": 815325432.0, "z_loss": 0.0007186421426013112 }, { "copy_logits_max": -2.864638328552246, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.25, "epoch": 0.6145519530252744, "gen_logits_max": 4.932880401611328, "gen_logits_mean": -12.492372512817383, "gen_logits_min": -24.55757713317871, "gen_logits_std": 2.536996841430664, "gen_loss": 0.28011319041252136, "grad_norm": 0.45090959382431567, "learning_rate": 2.7128842105263158e-05, "loss": 0.3237, "mean_copy_accuracy": 0.9935150146484375, "mean_gen_accuracy": 0.8633168786764145, "mean_token_accuracy": 0.8930318206548691, "num_tokens": 815556031.0, "sample_num_tokens": 8772.25, "step": 3009, "total_num_tokens": 815591120.0, "z_loss": 0.0006154902512207627 }, { "copy_logits_max": -5.6731109619140625, "copy_logits_min": -750000064.0, "copy_num_tokens": 534.4375, "epoch": 0.6147561909624712, "gen_logits_max": 4.383365631103516, "gen_logits_mean": -14.312599182128906, "gen_logits_min": -25.268142700195312, "gen_logits_std": 2.4833872318267822, "gen_loss": 0.2953006327152252, "grad_norm": 0.4190044595590253, "learning_rate": 2.7127578947368422e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9952310919761658, "mean_gen_accuracy": 0.864080548286438, "mean_token_accuracy": 0.895220935344696, "num_tokens": 815831425.0, "sample_num_tokens": 9201.75, "step": 3010, "total_num_tokens": 815868232.0, "z_loss": 0.0006551395053975284 }, { "copy_logits_max": -4.737811088562012, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.125, "epoch": 0.6149604288996681, "gen_logits_max": 4.988125324249268, "gen_logits_mean": -14.08919620513916, "gen_logits_min": -25.267719268798828, "gen_logits_std": 2.527353525161743, "gen_loss": 0.3751767873764038, "grad_norm": 0.4270209059762446, "learning_rate": 2.7126315789473683e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9944703876972198, "mean_gen_accuracy": 0.8676179647445679, "mean_token_accuracy": 0.8977688997983932, "num_tokens": 816095087.0, "sample_num_tokens": 8247.25, "step": 3011, "total_num_tokens": 816128076.0, "z_loss": 0.0008128351764753461 }, { "copy_logits_max": -5.484751224517822, "copy_logits_min": -687500032.0, "copy_num_tokens": 269.5625, "epoch": 0.615164666836865, "gen_logits_max": 5.417751312255859, "gen_logits_mean": -13.990589141845703, "gen_logits_min": -25.090564727783203, "gen_logits_std": 2.5119128227233887, "gen_loss": 0.3564426302909851, "grad_norm": 0.46271176431840005, "learning_rate": 2.712505263157895e-05, "loss": 0.3384, "mean_copy_accuracy": 0.9937599897384644, "mean_gen_accuracy": 0.8597779124975204, "mean_token_accuracy": 0.8885704874992371, "num_tokens": 816357561.0, "sample_num_tokens": 5895.25, "step": 3012, "total_num_tokens": 816381142.0, "z_loss": 0.0007517677731812 }, { "copy_logits_max": -3.9670684337615967, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.6875, "epoch": 0.6153689047740618, "gen_logits_max": 4.339205265045166, "gen_logits_mean": -14.308356285095215, "gen_logits_min": -25.495014190673828, "gen_logits_std": 2.4979379177093506, "gen_loss": 0.28153470158576965, "grad_norm": 0.41508070869018615, "learning_rate": 2.712378947368421e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9944890737533569, "mean_gen_accuracy": 0.8720062971115112, "mean_token_accuracy": 0.902799054980278, "num_tokens": 816632772.0, "sample_num_tokens": 10970.5, "step": 3013, "total_num_tokens": 816676654.0, "z_loss": 0.0006505074561573565 }, { "copy_logits_max": -3.1476705074310303, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.375, "epoch": 0.6155731427112586, "gen_logits_max": 5.299066066741943, "gen_logits_mean": -13.792617797851562, "gen_logits_min": -25.15215301513672, "gen_logits_std": 2.528474807739258, "gen_loss": 0.36182713508605957, "grad_norm": 0.46729071983439646, "learning_rate": 2.7122526315789476e-05, "loss": 0.351, "mean_copy_accuracy": 0.9935210943222046, "mean_gen_accuracy": 0.856575608253479, "mean_token_accuracy": 0.8836779147386551, "num_tokens": 816893122.0, "sample_num_tokens": 7773.0, "step": 3014, "total_num_tokens": 816924214.0, "z_loss": 0.0008543077856302261 }, { "copy_logits_max": -4.303346633911133, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.9375, "epoch": 0.6157773806484554, "gen_logits_max": 5.144700527191162, "gen_logits_mean": -12.315923690795898, "gen_logits_min": -23.59886932373047, "gen_logits_std": 2.4625818729400635, "gen_loss": 0.33746957778930664, "grad_norm": 0.434062114447805, "learning_rate": 2.7121263157894737e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9936240464448929, "mean_gen_accuracy": 0.8607888668775558, "mean_token_accuracy": 0.8910745978355408, "num_tokens": 817152125.0, "sample_num_tokens": 7727.25, "step": 3015, "total_num_tokens": 817183034.0, "z_loss": 0.0008199748117476702 }, { "copy_logits_max": -4.318336486816406, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.25, "epoch": 0.6159816185856523, "gen_logits_max": 5.006990432739258, "gen_logits_mean": -13.928479194641113, "gen_logits_min": -25.547760009765625, "gen_logits_std": 2.5245773792266846, "gen_loss": 0.3299756944179535, "grad_norm": 0.4155577798755885, "learning_rate": 2.712e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9946661740541458, "mean_gen_accuracy": 0.8611913472414017, "mean_token_accuracy": 0.8951127529144287, "num_tokens": 817417124.0, "sample_num_tokens": 8762.0, "step": 3016, "total_num_tokens": 817452172.0, "z_loss": 0.0008386134868487716 }, { "copy_logits_max": -4.030488014221191, "copy_logits_min": -625000064.0, "copy_num_tokens": 411.8125, "epoch": 0.6161858565228491, "gen_logits_max": 5.659242630004883, "gen_logits_mean": -12.702489852905273, "gen_logits_min": -24.629684448242188, "gen_logits_std": 2.5277915000915527, "gen_loss": 0.33132028579711914, "grad_norm": 0.4274143114630442, "learning_rate": 2.7118736842105262e-05, "loss": 0.3298, "mean_copy_accuracy": 0.994215577840805, "mean_gen_accuracy": 0.8593299090862274, "mean_token_accuracy": 0.8901133239269257, "num_tokens": 817681048.0, "sample_num_tokens": 7812.5, "step": 3017, "total_num_tokens": 817712298.0, "z_loss": 0.0007403012132272124 }, { "copy_logits_max": -2.8460092544555664, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.75, "epoch": 0.616390094460046, "gen_logits_max": 4.832099914550781, "gen_logits_mean": -13.252906799316406, "gen_logits_min": -25.024993896484375, "gen_logits_std": 2.5893774032592773, "gen_loss": 0.2660764157772064, "grad_norm": 0.4591239091175084, "learning_rate": 2.7117473684210526e-05, "loss": 0.3, "mean_copy_accuracy": 0.9939850121736526, "mean_gen_accuracy": 0.8702231794595718, "mean_token_accuracy": 0.9035581350326538, "num_tokens": 817974721.0, "sample_num_tokens": 8106.75, "step": 3018, "total_num_tokens": 818007148.0, "z_loss": 0.000698201940394938 }, { "copy_logits_max": -3.5044796466827393, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.9375, "epoch": 0.6165943323972428, "gen_logits_max": 4.457426071166992, "gen_logits_mean": -13.614337921142578, "gen_logits_min": -25.283374786376953, "gen_logits_std": 2.551132917404175, "gen_loss": 0.322036474943161, "grad_norm": 0.4220976901739597, "learning_rate": 2.7116210526315787e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9956602901220322, "mean_gen_accuracy": 0.857834666967392, "mean_token_accuracy": 0.8969308733940125, "num_tokens": 818255751.0, "sample_num_tokens": 8125.75, "step": 3019, "total_num_tokens": 818288254.0, "z_loss": 0.0007617464289069176 }, { "copy_logits_max": -3.0289833545684814, "copy_logits_min": -750000000.0, "copy_num_tokens": 752.625, "epoch": 0.6167985703344396, "gen_logits_max": 3.766986131668091, "gen_logits_mean": -13.540122985839844, "gen_logits_min": -24.998567581176758, "gen_logits_std": 2.4963889122009277, "gen_loss": 0.26977357268333435, "grad_norm": 0.4263745117220798, "learning_rate": 2.7114947368421055e-05, "loss": 0.297, "mean_copy_accuracy": 0.9939577579498291, "mean_gen_accuracy": 0.8711321353912354, "mean_token_accuracy": 0.9023113250732422, "num_tokens": 818542161.0, "sample_num_tokens": 10625.25, "step": 3020, "total_num_tokens": 818584662.0, "z_loss": 0.0006980195757932961 }, { "copy_logits_max": -5.502260684967041, "copy_logits_min": -750000064.0, "copy_num_tokens": 225.3125, "epoch": 0.6170028082716364, "gen_logits_max": 5.684388160705566, "gen_logits_mean": -13.6069974899292, "gen_logits_min": -24.224040985107422, "gen_logits_std": 2.4398341178894043, "gen_loss": 0.3790879249572754, "grad_norm": 0.45898112000938573, "learning_rate": 2.7113684210526316e-05, "loss": 0.313, "mean_copy_accuracy": 0.9944351017475128, "mean_gen_accuracy": 0.8672571629285812, "mean_token_accuracy": 0.8961213231086731, "num_tokens": 818842682.0, "sample_num_tokens": 6667.5, "step": 3021, "total_num_tokens": 818869352.0, "z_loss": 0.0007641700794920325 }, { "copy_logits_max": -4.312698841094971, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.9375, "epoch": 0.6172070462088333, "gen_logits_max": 5.274337291717529, "gen_logits_mean": -14.197126388549805, "gen_logits_min": -25.61740493774414, "gen_logits_std": 2.531737804412842, "gen_loss": 0.34121736884117126, "grad_norm": 0.4545112761984424, "learning_rate": 2.711242105263158e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9932138174772263, "mean_gen_accuracy": 0.8610251545906067, "mean_token_accuracy": 0.8953818827867508, "num_tokens": 819106871.0, "sample_num_tokens": 7943.75, "step": 3022, "total_num_tokens": 819138646.0, "z_loss": 0.000737236812710762 }, { "copy_logits_max": -2.0525341033935547, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.625, "epoch": 0.6174112841460301, "gen_logits_max": 4.98411226272583, "gen_logits_mean": -12.669448852539062, "gen_logits_min": -23.453346252441406, "gen_logits_std": 2.450270891189575, "gen_loss": 0.33728376030921936, "grad_norm": 0.40474966388302386, "learning_rate": 2.7111157894736845e-05, "loss": 0.327, "mean_copy_accuracy": 0.9949037581682205, "mean_gen_accuracy": 0.8588346689939499, "mean_token_accuracy": 0.890516996383667, "num_tokens": 819380881.0, "sample_num_tokens": 7889.75, "step": 3023, "total_num_tokens": 819412440.0, "z_loss": 0.0007533914176747203 }, { "copy_logits_max": -2.759368896484375, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.375, "epoch": 0.617615522083227, "gen_logits_max": 4.355108261108398, "gen_logits_mean": -13.666109085083008, "gen_logits_min": -24.613588333129883, "gen_logits_std": 2.478703498840332, "gen_loss": 0.3356870412826538, "grad_norm": 0.45397941854822305, "learning_rate": 2.7109894736842106e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9938381016254425, "mean_gen_accuracy": 0.859307736158371, "mean_token_accuracy": 0.8911393582820892, "num_tokens": 819652564.0, "sample_num_tokens": 8272.0, "step": 3024, "total_num_tokens": 819685652.0, "z_loss": 0.000771320192143321 }, { "copy_logits_max": -4.0196614265441895, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.6875, "epoch": 0.6178197600204238, "gen_logits_max": 4.311623573303223, "gen_logits_mean": -13.860658645629883, "gen_logits_min": -24.869029998779297, "gen_logits_std": 2.485956907272339, "gen_loss": 0.31854793429374695, "grad_norm": 0.4474573666318608, "learning_rate": 2.710863157894737e-05, "loss": 0.3344, "mean_copy_accuracy": 0.9938980489969254, "mean_gen_accuracy": 0.8571808040142059, "mean_token_accuracy": 0.8892971724271774, "num_tokens": 819920905.0, "sample_num_tokens": 9447.75, "step": 3025, "total_num_tokens": 819958696.0, "z_loss": 0.0007180743268691003 }, { "copy_logits_max": -2.5619916915893555, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.6875, "epoch": 0.6180239979576206, "gen_logits_max": 4.010832786560059, "gen_logits_mean": -14.162899017333984, "gen_logits_min": -25.583545684814453, "gen_logits_std": 2.535116672515869, "gen_loss": 0.2980566620826721, "grad_norm": 0.40532751202981, "learning_rate": 2.710736842105263e-05, "loss": 0.3101, "mean_copy_accuracy": 0.9960358142852783, "mean_gen_accuracy": 0.8603502511978149, "mean_token_accuracy": 0.8978903442621231, "num_tokens": 820202952.0, "sample_num_tokens": 8601.0, "step": 3026, "total_num_tokens": 820237356.0, "z_loss": 0.0006564413197338581 }, { "copy_logits_max": -1.489530086517334, "copy_logits_min": -687500032.0, "copy_num_tokens": 535.8125, "epoch": 0.6182282358948175, "gen_logits_max": 4.4020280838012695, "gen_logits_mean": -13.737014770507812, "gen_logits_min": -24.963756561279297, "gen_logits_std": 2.5134949684143066, "gen_loss": 0.3100993037223816, "grad_norm": 0.40759724281961546, "learning_rate": 2.7106105263157895e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9948596358299255, "mean_gen_accuracy": 0.863865539431572, "mean_token_accuracy": 0.9005471765995026, "num_tokens": 820482709.0, "sample_num_tokens": 9498.75, "step": 3027, "total_num_tokens": 820520704.0, "z_loss": 0.0007006691885180771 }, { "copy_logits_max": -2.471391201019287, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.125, "epoch": 0.6184324738320143, "gen_logits_max": 4.8518218994140625, "gen_logits_mean": -13.581755638122559, "gen_logits_min": -25.281848907470703, "gen_logits_std": 2.5565900802612305, "gen_loss": 0.29260319471359253, "grad_norm": 0.42677098629758053, "learning_rate": 2.710484210526316e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9950716495513916, "mean_gen_accuracy": 0.8632651418447495, "mean_token_accuracy": 0.8941078782081604, "num_tokens": 820747926.0, "sample_num_tokens": 8715.0, "step": 3028, "total_num_tokens": 820782786.0, "z_loss": 0.0007584282429888844 }, { "copy_logits_max": -2.0406885147094727, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.0625, "epoch": 0.6186367117692111, "gen_logits_max": 4.219460487365723, "gen_logits_mean": -14.10047435760498, "gen_logits_min": -25.584192276000977, "gen_logits_std": 2.562898874282837, "gen_loss": 0.31180113554000854, "grad_norm": 0.4418872087888916, "learning_rate": 2.7103578947368424e-05, "loss": 0.3232, "mean_copy_accuracy": 0.9951095283031464, "mean_gen_accuracy": 0.8606140911579132, "mean_token_accuracy": 0.89293073117733, "num_tokens": 821014908.0, "sample_num_tokens": 8479.0, "step": 3029, "total_num_tokens": 821048824.0, "z_loss": 0.0008474913192912936 }, { "copy_logits_max": -2.984157085418701, "copy_logits_min": -687500032.0, "copy_num_tokens": 345.0, "epoch": 0.618840949706408, "gen_logits_max": 5.023650169372559, "gen_logits_mean": -14.132429122924805, "gen_logits_min": -25.061729431152344, "gen_logits_std": 2.5248401165008545, "gen_loss": 0.30536121129989624, "grad_norm": 0.40309929172111275, "learning_rate": 2.7102315789473685e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9944299906492233, "mean_gen_accuracy": 0.8674998432397842, "mean_token_accuracy": 0.8985576778650284, "num_tokens": 821307841.0, "sample_num_tokens": 7556.25, "step": 3030, "total_num_tokens": 821338066.0, "z_loss": 0.0007055304013192654 }, { "copy_logits_max": -3.795750379562378, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.25, "epoch": 0.6190451876436048, "gen_logits_max": 5.1627912521362305, "gen_logits_mean": -13.734014511108398, "gen_logits_min": -24.930009841918945, "gen_logits_std": 2.5653696060180664, "gen_loss": 0.2744457423686981, "grad_norm": 0.39748139501154695, "learning_rate": 2.710105263157895e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9932025372982025, "mean_gen_accuracy": 0.8780815154314041, "mean_token_accuracy": 0.9039178639650345, "num_tokens": 821567405.0, "sample_num_tokens": 8440.75, "step": 3031, "total_num_tokens": 821601168.0, "z_loss": 0.0006611923454329371 }, { "copy_logits_max": -4.071913719177246, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.875, "epoch": 0.6192494255808016, "gen_logits_max": 5.420140743255615, "gen_logits_mean": -13.043984413146973, "gen_logits_min": -24.19164276123047, "gen_logits_std": 2.528815746307373, "gen_loss": 0.3146687150001526, "grad_norm": 0.41518463061097804, "learning_rate": 2.709978947368421e-05, "loss": 0.3208, "mean_copy_accuracy": 0.9948581755161285, "mean_gen_accuracy": 0.8544278591871262, "mean_token_accuracy": 0.8921158462762833, "num_tokens": 821831747.0, "sample_num_tokens": 8717.75, "step": 3032, "total_num_tokens": 821866618.0, "z_loss": 0.0007741783047094941 }, { "copy_logits_max": -2.851837396621704, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.625, "epoch": 0.6194536635179985, "gen_logits_max": 5.435440540313721, "gen_logits_mean": -12.467978477478027, "gen_logits_min": -23.768917083740234, "gen_logits_std": 2.5045881271362305, "gen_loss": 0.31988221406936646, "grad_norm": 0.43342711440392334, "learning_rate": 2.7098526315789474e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9962251931428909, "mean_gen_accuracy": 0.8722767233848572, "mean_token_accuracy": 0.9058777987957001, "num_tokens": 822115433.0, "sample_num_tokens": 9234.75, "step": 3033, "total_num_tokens": 822152372.0, "z_loss": 0.0007845803047530353 }, { "copy_logits_max": -4.501672744750977, "copy_logits_min": -750000064.0, "copy_num_tokens": 444.0, "epoch": 0.6196579014551953, "gen_logits_max": 4.5720601081848145, "gen_logits_mean": -13.612146377563477, "gen_logits_min": -24.801748275756836, "gen_logits_std": 2.4936561584472656, "gen_loss": 0.30500221252441406, "grad_norm": 0.41088612417513287, "learning_rate": 2.7097263157894735e-05, "loss": 0.3266, "mean_copy_accuracy": 0.9947057962417603, "mean_gen_accuracy": 0.8632079660892487, "mean_token_accuracy": 0.8938946574926376, "num_tokens": 822385219.0, "sample_num_tokens": 7974.75, "step": 3034, "total_num_tokens": 822417118.0, "z_loss": 0.0007285658502951264 }, { "copy_logits_max": -3.2362542152404785, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.5, "epoch": 0.6198621393923921, "gen_logits_max": 5.230424404144287, "gen_logits_mean": -12.19985294342041, "gen_logits_min": -23.612722396850586, "gen_logits_std": 2.5552520751953125, "gen_loss": 0.28687843680381775, "grad_norm": 0.45561503862585445, "learning_rate": 2.7096e-05, "loss": 0.3172, "mean_copy_accuracy": 0.994454637169838, "mean_gen_accuracy": 0.8642674684524536, "mean_token_accuracy": 0.8949753046035767, "num_tokens": 822650863.0, "sample_num_tokens": 9735.25, "step": 3035, "total_num_tokens": 822689804.0, "z_loss": 0.0006792999338358641 }, { "copy_logits_max": -3.853076934814453, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.9375, "epoch": 0.620066377329589, "gen_logits_max": 5.043439865112305, "gen_logits_mean": -14.119359970092773, "gen_logits_min": -25.515729904174805, "gen_logits_std": 2.5341954231262207, "gen_loss": 0.3187948167324066, "grad_norm": 0.4247555406082038, "learning_rate": 2.7094736842105267e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9939470887184143, "mean_gen_accuracy": 0.8701580464839935, "mean_token_accuracy": 0.899968221783638, "num_tokens": 822931535.0, "sample_num_tokens": 7273.25, "step": 3036, "total_num_tokens": 822960628.0, "z_loss": 0.0007532325689680874 }, { "copy_logits_max": -3.324613571166992, "copy_logits_min": -625000064.0, "copy_num_tokens": 729.6875, "epoch": 0.6202706152667858, "gen_logits_max": 4.810637950897217, "gen_logits_mean": -12.945685386657715, "gen_logits_min": -24.34162139892578, "gen_logits_std": 2.545332431793213, "gen_loss": 0.3257130980491638, "grad_norm": 0.40359862873780356, "learning_rate": 2.7093473684210528e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9964428097009659, "mean_gen_accuracy": 0.8668183982372284, "mean_token_accuracy": 0.9011489152908325, "num_tokens": 823213549.0, "sample_num_tokens": 9895.75, "step": 3037, "total_num_tokens": 823253132.0, "z_loss": 0.0007826857035979629 }, { "copy_logits_max": -5.437067031860352, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.625, "epoch": 0.6204748532039827, "gen_logits_max": 5.250179290771484, "gen_logits_mean": -13.341310501098633, "gen_logits_min": -24.475914001464844, "gen_logits_std": 2.5023903846740723, "gen_loss": 0.28999775648117065, "grad_norm": 0.3855144445963223, "learning_rate": 2.7092210526315792e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9962176084518433, "mean_gen_accuracy": 0.8654404729604721, "mean_token_accuracy": 0.9014749377965927, "num_tokens": 823519956.0, "sample_num_tokens": 8377.5, "step": 3038, "total_num_tokens": 823553466.0, "z_loss": 0.0007241432904265821 }, { "copy_logits_max": -1.7688024044036865, "copy_logits_min": -687500032.0, "copy_num_tokens": 391.8125, "epoch": 0.6206790911411795, "gen_logits_max": 5.521678924560547, "gen_logits_mean": -13.25739860534668, "gen_logits_min": -24.673267364501953, "gen_logits_std": 2.57175874710083, "gen_loss": 0.34069356322288513, "grad_norm": 0.4339545153198941, "learning_rate": 2.7090947368421053e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9932468235492706, "mean_gen_accuracy": 0.8684323877096176, "mean_token_accuracy": 0.8980312049388885, "num_tokens": 823777422.0, "sample_num_tokens": 7675.5, "step": 3039, "total_num_tokens": 823808124.0, "z_loss": 0.0008261924376711249 }, { "copy_logits_max": -3.6801950931549072, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.8125, "epoch": 0.6208833290783763, "gen_logits_max": 5.2499494552612305, "gen_logits_mean": -13.11497688293457, "gen_logits_min": -24.53525161743164, "gen_logits_std": 2.5417442321777344, "gen_loss": 0.3156661093235016, "grad_norm": 0.42895034569262164, "learning_rate": 2.7089684210526318e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9948709905147552, "mean_gen_accuracy": 0.8594217747449875, "mean_token_accuracy": 0.8932302594184875, "num_tokens": 824034180.0, "sample_num_tokens": 7923.5, "step": 3040, "total_num_tokens": 824065874.0, "z_loss": 0.0008177892887033522 }, { "copy_logits_max": -4.101927757263184, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.5625, "epoch": 0.6210875670155731, "gen_logits_max": 6.10600471496582, "gen_logits_mean": -12.014842987060547, "gen_logits_min": -23.147289276123047, "gen_logits_std": 2.48268461227417, "gen_loss": 0.3919103443622589, "grad_norm": 0.42656332971067706, "learning_rate": 2.708842105263158e-05, "loss": 0.3274, "mean_copy_accuracy": 0.9941416084766388, "mean_gen_accuracy": 0.8671348094940186, "mean_token_accuracy": 0.893343135714531, "num_tokens": 824283630.0, "sample_num_tokens": 7275.0, "step": 3041, "total_num_tokens": 824312730.0, "z_loss": 0.0009011648944579065 }, { "copy_logits_max": -2.9233832359313965, "copy_logits_min": -687500032.0, "copy_num_tokens": 488.5, "epoch": 0.6212918049527699, "gen_logits_max": 5.257730007171631, "gen_logits_mean": -12.096545219421387, "gen_logits_min": -23.910655975341797, "gen_logits_std": 2.5675811767578125, "gen_loss": 0.3066873550415039, "grad_norm": 0.41906140319113366, "learning_rate": 2.7087157894736843e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9936521351337433, "mean_gen_accuracy": 0.8632282316684723, "mean_token_accuracy": 0.8963722139596939, "num_tokens": 824547636.0, "sample_num_tokens": 8091.5, "step": 3042, "total_num_tokens": 824580002.0, "z_loss": 0.0007233776850625873 }, { "copy_logits_max": -4.178197383880615, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.8125, "epoch": 0.6214960428899668, "gen_logits_max": 5.909930229187012, "gen_logits_mean": -13.071077346801758, "gen_logits_min": -24.503074645996094, "gen_logits_std": 2.5814387798309326, "gen_loss": 0.3678767681121826, "grad_norm": 0.435901841885159, "learning_rate": 2.7085894736842104e-05, "loss": 0.3314, "mean_copy_accuracy": 0.9948882460594177, "mean_gen_accuracy": 0.8597862720489502, "mean_token_accuracy": 0.8909742087125778, "num_tokens": 824813770.0, "sample_num_tokens": 6815.5, "step": 3043, "total_num_tokens": 824841032.0, "z_loss": 0.0008031761972233653 }, { "copy_logits_max": -2.789076328277588, "copy_logits_min": -750000064.0, "copy_num_tokens": 412.3125, "epoch": 0.6217002808271637, "gen_logits_max": 4.937826156616211, "gen_logits_mean": -12.820242881774902, "gen_logits_min": -23.817886352539062, "gen_logits_std": 2.5033957958221436, "gen_loss": 0.28995242714881897, "grad_norm": 0.4536772400118153, "learning_rate": 2.7084631578947368e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9943513125181198, "mean_gen_accuracy": 0.8763315081596375, "mean_token_accuracy": 0.9044889658689499, "num_tokens": 825082116.0, "sample_num_tokens": 8695.0, "step": 3044, "total_num_tokens": 825116896.0, "z_loss": 0.0006444210303016007 }, { "copy_logits_max": -2.573148250579834, "copy_logits_min": -687500032.0, "copy_num_tokens": 430.1875, "epoch": 0.6219045187643605, "gen_logits_max": 4.82515811920166, "gen_logits_mean": -13.036023139953613, "gen_logits_min": -23.91756820678711, "gen_logits_std": 2.451871633529663, "gen_loss": 0.36981695890426636, "grad_norm": 0.4012072219380178, "learning_rate": 2.7083368421052632e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9957510977983475, "mean_gen_accuracy": 0.8610900193452835, "mean_token_accuracy": 0.8938743025064468, "num_tokens": 825362013.0, "sample_num_tokens": 8593.25, "step": 3045, "total_num_tokens": 825396386.0, "z_loss": 0.0007734125247225165 }, { "copy_logits_max": -5.198599815368652, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.9375, "epoch": 0.6221087567015573, "gen_logits_max": 4.575278282165527, "gen_logits_mean": -15.19257926940918, "gen_logits_min": -26.138111114501953, "gen_logits_std": 2.490804433822632, "gen_loss": 0.31645655632019043, "grad_norm": 0.6146843450716737, "learning_rate": 2.7082105263157897e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9949114620685577, "mean_gen_accuracy": 0.8686492443084717, "mean_token_accuracy": 0.9008079171180725, "num_tokens": 825658241.0, "sample_num_tokens": 9015.25, "step": 3046, "total_num_tokens": 825694302.0, "z_loss": 0.0006916715065017343 }, { "copy_logits_max": -4.098318099975586, "copy_logits_min": -687500032.0, "copy_num_tokens": 670.375, "epoch": 0.6223129946387541, "gen_logits_max": 4.357789039611816, "gen_logits_mean": -13.319631576538086, "gen_logits_min": -24.774730682373047, "gen_logits_std": 2.51982045173645, "gen_loss": 0.28473448753356934, "grad_norm": 0.43793630343161255, "learning_rate": 2.7080842105263158e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9946337193250656, "mean_gen_accuracy": 0.8673924207687378, "mean_token_accuracy": 0.9035660475492477, "num_tokens": 825949719.0, "sample_num_tokens": 8869.75, "step": 3047, "total_num_tokens": 825985198.0, "z_loss": 0.0006052330136299133 }, { "copy_logits_max": -5.153371334075928, "copy_logits_min": -749999936.0, "copy_num_tokens": 477.875, "epoch": 0.6225172325759509, "gen_logits_max": 4.537605285644531, "gen_logits_mean": -13.407791137695312, "gen_logits_min": -24.45230484008789, "gen_logits_std": 2.4642138481140137, "gen_loss": 0.30174630880355835, "grad_norm": 0.39612049765772595, "learning_rate": 2.7079578947368422e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9951792359352112, "mean_gen_accuracy": 0.8675976693630219, "mean_token_accuracy": 0.8983201682567596, "num_tokens": 826230122.0, "sample_num_tokens": 9080.0, "step": 3048, "total_num_tokens": 826266442.0, "z_loss": 0.0006335749058052897 }, { "copy_logits_max": -2.271811008453369, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.25, "epoch": 0.6227214705131479, "gen_logits_max": 4.461960792541504, "gen_logits_mean": -13.537664413452148, "gen_logits_min": -24.723608016967773, "gen_logits_std": 2.5139803886413574, "gen_loss": 0.31307679414749146, "grad_norm": 0.4474557351501947, "learning_rate": 2.7078315789473683e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9953561574220657, "mean_gen_accuracy": 0.8618805855512619, "mean_token_accuracy": 0.8966860324144363, "num_tokens": 826493384.0, "sample_num_tokens": 8973.0, "step": 3049, "total_num_tokens": 826529276.0, "z_loss": 0.0007314133690670133 }, { "copy_logits_max": -3.891183853149414, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.4375, "epoch": 0.6229257084503447, "gen_logits_max": 5.6454901695251465, "gen_logits_mean": -10.925378799438477, "gen_logits_min": -22.144935607910156, "gen_logits_std": 2.4146621227264404, "gen_loss": 0.29978689551353455, "grad_norm": 0.5047573599284723, "learning_rate": 2.7077052631578947e-05, "loss": 0.2979, "mean_copy_accuracy": 0.9948714375495911, "mean_gen_accuracy": 0.8685277849435806, "mean_token_accuracy": 0.9004956781864166, "num_tokens": 826766503.0, "sample_num_tokens": 9241.25, "step": 3050, "total_num_tokens": 826803468.0, "z_loss": 0.0006077252328395844 }, { "copy_logits_max": -4.086259841918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.75, "epoch": 0.6231299463875415, "gen_logits_max": 4.360088348388672, "gen_logits_mean": -13.123054504394531, "gen_logits_min": -23.880138397216797, "gen_logits_std": 2.457651376724243, "gen_loss": 0.26595965027809143, "grad_norm": 0.4163906537549367, "learning_rate": 2.707578947368421e-05, "loss": 0.305, "mean_copy_accuracy": 0.9964449852705002, "mean_gen_accuracy": 0.8647834807634354, "mean_token_accuracy": 0.8994164913892746, "num_tokens": 827040659.0, "sample_num_tokens": 9129.25, "step": 3051, "total_num_tokens": 827077176.0, "z_loss": 0.0006510058301500976 }, { "copy_logits_max": -2.1961910724639893, "copy_logits_min": -625000064.0, "copy_num_tokens": 472.5625, "epoch": 0.6233341843247383, "gen_logits_max": 4.979963302612305, "gen_logits_mean": -13.358073234558105, "gen_logits_min": -24.170543670654297, "gen_logits_std": 2.4460415840148926, "gen_loss": 0.35233139991760254, "grad_norm": 0.44258558239561846, "learning_rate": 2.7074526315789472e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9952572137117386, "mean_gen_accuracy": 0.8645853102207184, "mean_token_accuracy": 0.8975912034511566, "num_tokens": 827306221.0, "sample_num_tokens": 9000.25, "step": 3052, "total_num_tokens": 827342222.0, "z_loss": 0.0009022451122291386 }, { "copy_logits_max": -3.0888514518737793, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.1875, "epoch": 0.6235384222619351, "gen_logits_max": 5.367876052856445, "gen_logits_mean": -11.14708423614502, "gen_logits_min": -21.87872314453125, "gen_logits_std": 2.3965325355529785, "gen_loss": 0.3222410976886749, "grad_norm": 0.4520327789930377, "learning_rate": 2.707326315789474e-05, "loss": 0.3311, "mean_copy_accuracy": 0.992047980427742, "mean_gen_accuracy": 0.8606596142053604, "mean_token_accuracy": 0.8891693353652954, "num_tokens": 827562755.0, "sample_num_tokens": 9891.25, "step": 3053, "total_num_tokens": 827602320.0, "z_loss": 0.0008719149045646191 }, { "copy_logits_max": -1.2956581115722656, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.875, "epoch": 0.6237426601991319, "gen_logits_max": 4.320940017700195, "gen_logits_mean": -13.014691352844238, "gen_logits_min": -23.77841567993164, "gen_logits_std": 2.464008092880249, "gen_loss": 0.2786409854888916, "grad_norm": 0.3931939859375467, "learning_rate": 2.7072e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9956018477678299, "mean_gen_accuracy": 0.8607891798019409, "mean_token_accuracy": 0.8978658765554428, "num_tokens": 827857950.0, "sample_num_tokens": 8359.0, "step": 3054, "total_num_tokens": 827891386.0, "z_loss": 0.000919046753551811 }, { "copy_logits_max": -2.300445079803467, "copy_logits_min": -750000000.0, "copy_num_tokens": 606.6875, "epoch": 0.6239468981363289, "gen_logits_max": 4.045863151550293, "gen_logits_mean": -13.910833358764648, "gen_logits_min": -24.995548248291016, "gen_logits_std": 2.4765000343322754, "gen_loss": 0.29400181770324707, "grad_norm": 0.39476573813023375, "learning_rate": 2.7070736842105265e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9940664619207382, "mean_gen_accuracy": 0.874779611825943, "mean_token_accuracy": 0.9033192545175552, "num_tokens": 828154209.0, "sample_num_tokens": 9064.25, "step": 3055, "total_num_tokens": 828190466.0, "z_loss": 0.000866673537530005 }, { "copy_logits_max": -3.788166046142578, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.125, "epoch": 0.6241511360735257, "gen_logits_max": 5.757270812988281, "gen_logits_mean": -12.967520713806152, "gen_logits_min": -23.474611282348633, "gen_logits_std": 2.4357492923736572, "gen_loss": 0.3431939482688904, "grad_norm": 0.4289062433151266, "learning_rate": 2.7069473684210526e-05, "loss": 0.3264, "mean_copy_accuracy": 0.9931445270776749, "mean_gen_accuracy": 0.86054927110672, "mean_token_accuracy": 0.8902627378702164, "num_tokens": 828426016.0, "sample_num_tokens": 8498.0, "step": 3056, "total_num_tokens": 828460008.0, "z_loss": 0.0008570370846427977 }, { "copy_logits_max": -1.076546311378479, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.125, "epoch": 0.6243553740107225, "gen_logits_max": 4.84037971496582, "gen_logits_mean": -12.12625503540039, "gen_logits_min": -23.241893768310547, "gen_logits_std": 2.4860262870788574, "gen_loss": 0.32295382022857666, "grad_norm": 0.42637687684446546, "learning_rate": 2.706821052631579e-05, "loss": 0.3101, "mean_copy_accuracy": 0.993722677230835, "mean_gen_accuracy": 0.8636506050825119, "mean_token_accuracy": 0.8979820758104324, "num_tokens": 828703826.0, "sample_num_tokens": 7618.0, "step": 3057, "total_num_tokens": 828734298.0, "z_loss": 0.0008776627946645021 }, { "copy_logits_max": -2.0179600715637207, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.875, "epoch": 0.6245596119479193, "gen_logits_max": 4.596724510192871, "gen_logits_mean": -14.434833526611328, "gen_logits_min": -25.704341888427734, "gen_logits_std": 2.5259649753570557, "gen_loss": 0.31359851360321045, "grad_norm": 0.44664028429819613, "learning_rate": 2.706694736842105e-05, "loss": 0.321, "mean_copy_accuracy": 0.993254229426384, "mean_gen_accuracy": 0.8633983433246613, "mean_token_accuracy": 0.8941497057676315, "num_tokens": 828963397.0, "sample_num_tokens": 7616.25, "step": 3058, "total_num_tokens": 828993862.0, "z_loss": 0.0008676353027112782 }, { "copy_logits_max": -1.1360294818878174, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.375, "epoch": 0.6247638498851161, "gen_logits_max": 5.027107238769531, "gen_logits_mean": -11.973341941833496, "gen_logits_min": -23.218360900878906, "gen_logits_std": 2.523686408996582, "gen_loss": 0.2687883973121643, "grad_norm": 0.46701828197678513, "learning_rate": 2.7065684210526316e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9941861629486084, "mean_gen_accuracy": 0.8760445266962051, "mean_token_accuracy": 0.902888685464859, "num_tokens": 829241781.0, "sample_num_tokens": 9693.25, "step": 3059, "total_num_tokens": 829280554.0, "z_loss": 0.0007336019771173596 }, { "copy_logits_max": -3.4405806064605713, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.375, "epoch": 0.624968087822313, "gen_logits_max": 5.227419853210449, "gen_logits_mean": -12.422067642211914, "gen_logits_min": -23.078886032104492, "gen_logits_std": 2.4419283866882324, "gen_loss": 0.3242977559566498, "grad_norm": 0.4154330625331047, "learning_rate": 2.7064421052631577e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9956527054309845, "mean_gen_accuracy": 0.8605926483869553, "mean_token_accuracy": 0.9004964530467987, "num_tokens": 829528163.0, "sample_num_tokens": 8227.75, "step": 3060, "total_num_tokens": 829561074.0, "z_loss": 0.0007961421506479383 }, { "copy_logits_max": -3.1945273876190186, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.8125, "epoch": 0.6251723257595099, "gen_logits_max": 5.112782955169678, "gen_logits_mean": -13.476253509521484, "gen_logits_min": -24.782926559448242, "gen_logits_std": 2.518394947052002, "gen_loss": 0.3087084889411926, "grad_norm": 0.45140581378045763, "learning_rate": 2.7063157894736844e-05, "loss": 0.3181, "mean_copy_accuracy": 0.994316354393959, "mean_gen_accuracy": 0.8665080815553665, "mean_token_accuracy": 0.8970166444778442, "num_tokens": 829801632.0, "sample_num_tokens": 7588.0, "step": 3061, "total_num_tokens": 829831984.0, "z_loss": 0.0007477195467799902 }, { "copy_logits_max": -3.2694435119628906, "copy_logits_min": -750000064.0, "copy_num_tokens": 350.625, "epoch": 0.6253765636967067, "gen_logits_max": 5.164329528808594, "gen_logits_mean": -12.723668098449707, "gen_logits_min": -23.571884155273438, "gen_logits_std": 2.478848695755005, "gen_loss": 0.33547067642211914, "grad_norm": 0.4222017091981559, "learning_rate": 2.7061894736842105e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9943787902593613, "mean_gen_accuracy": 0.8674822598695755, "mean_token_accuracy": 0.9013239741325378, "num_tokens": 830072052.0, "sample_num_tokens": 6976.5, "step": 3062, "total_num_tokens": 830099958.0, "z_loss": 0.0007076305919326842 }, { "copy_logits_max": -2.6594078540802, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.0, "epoch": 0.6255808016339035, "gen_logits_max": 5.846841812133789, "gen_logits_mean": -11.570255279541016, "gen_logits_min": -22.733440399169922, "gen_logits_std": 2.4597744941711426, "gen_loss": 0.31674110889434814, "grad_norm": 0.4281062625971736, "learning_rate": 2.706063157894737e-05, "loss": 0.3127, "mean_copy_accuracy": 0.9951064735651016, "mean_gen_accuracy": 0.8578886538743973, "mean_token_accuracy": 0.8979831486940384, "num_tokens": 830340902.0, "sample_num_tokens": 8259.0, "step": 3063, "total_num_tokens": 830373938.0, "z_loss": 0.0007389391539618373 }, { "copy_logits_max": -3.896888017654419, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.0625, "epoch": 0.6257850395711003, "gen_logits_max": 5.483461380004883, "gen_logits_mean": -11.613628387451172, "gen_logits_min": -22.7913818359375, "gen_logits_std": 2.488164186477661, "gen_loss": 0.2617071866989136, "grad_norm": 0.48110540295564275, "learning_rate": 2.7059368421052634e-05, "loss": 0.3169, "mean_copy_accuracy": 0.9921332895755768, "mean_gen_accuracy": 0.8677767515182495, "mean_token_accuracy": 0.8946487754583359, "num_tokens": 830589873.0, "sample_num_tokens": 8781.25, "step": 3064, "total_num_tokens": 830624998.0, "z_loss": 0.000655986019410193 }, { "copy_logits_max": -3.797650098800659, "copy_logits_min": -687500032.0, "copy_num_tokens": 430.0625, "epoch": 0.6259892775082971, "gen_logits_max": 5.805110931396484, "gen_logits_mean": -12.190194129943848, "gen_logits_min": -23.016294479370117, "gen_logits_std": 2.464776039123535, "gen_loss": 0.358306884765625, "grad_norm": 0.4183284961762915, "learning_rate": 2.7058105263157895e-05, "loss": 0.3345, "mean_copy_accuracy": 0.9934430420398712, "mean_gen_accuracy": 0.8568595498800278, "mean_token_accuracy": 0.8899122625589371, "num_tokens": 830864232.0, "sample_num_tokens": 9202.5, "step": 3065, "total_num_tokens": 830901042.0, "z_loss": 0.0008280002512037754 }, { "copy_logits_max": -4.081933975219727, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.5, "epoch": 0.626193515445494, "gen_logits_max": 5.6573872566223145, "gen_logits_mean": -13.527437210083008, "gen_logits_min": -24.97298812866211, "gen_logits_std": 2.5363447666168213, "gen_loss": 0.35871583223342896, "grad_norm": 0.45000923396264114, "learning_rate": 2.705684210526316e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9936419278383255, "mean_gen_accuracy": 0.8742989599704742, "mean_token_accuracy": 0.901794970035553, "num_tokens": 831141500.0, "sample_num_tokens": 6711.0, "step": 3066, "total_num_tokens": 831168344.0, "z_loss": 0.000794282415881753 }, { "copy_logits_max": -4.592830181121826, "copy_logits_min": -687500032.0, "copy_num_tokens": 290.625, "epoch": 0.6263977533826909, "gen_logits_max": 5.661032199859619, "gen_logits_mean": -13.36054515838623, "gen_logits_min": -24.312740325927734, "gen_logits_std": 2.4978044033050537, "gen_loss": 0.34107476472854614, "grad_norm": 0.45192155653851185, "learning_rate": 2.705557894736842e-05, "loss": 0.316, "mean_copy_accuracy": 0.9939541071653366, "mean_gen_accuracy": 0.8633156716823578, "mean_token_accuracy": 0.8940479159355164, "num_tokens": 831391381.0, "sample_num_tokens": 7329.75, "step": 3067, "total_num_tokens": 831420700.0, "z_loss": 0.0007298082928173244 }, { "copy_logits_max": -3.601226329803467, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.5625, "epoch": 0.6266019913198877, "gen_logits_max": 5.171625137329102, "gen_logits_mean": -12.911483764648438, "gen_logits_min": -24.050819396972656, "gen_logits_std": 2.5531506538391113, "gen_loss": 0.30211636424064636, "grad_norm": 0.44047543394673205, "learning_rate": 2.7054315789473684e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9947995841503143, "mean_gen_accuracy": 0.8624177277088165, "mean_token_accuracy": 0.8913022875785828, "num_tokens": 831636996.0, "sample_num_tokens": 7036.5, "step": 3068, "total_num_tokens": 831665142.0, "z_loss": 0.0007401186157949269 }, { "copy_logits_max": -3.2305517196655273, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.8125, "epoch": 0.6268062292570845, "gen_logits_max": 4.95046854019165, "gen_logits_mean": -13.570409774780273, "gen_logits_min": -24.606857299804688, "gen_logits_std": 2.4845125675201416, "gen_loss": 0.30853089690208435, "grad_norm": 0.4891250710762753, "learning_rate": 2.705305263157895e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9944233298301697, "mean_gen_accuracy": 0.8664393872022629, "mean_token_accuracy": 0.8956512659788132, "num_tokens": 831893844.0, "sample_num_tokens": 9000.5, "step": 3069, "total_num_tokens": 831929846.0, "z_loss": 0.0006942697800695896 }, { "copy_logits_max": -5.080461502075195, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.125, "epoch": 0.6270104671942813, "gen_logits_max": 5.61688232421875, "gen_logits_mean": -13.556377410888672, "gen_logits_min": -24.633193969726562, "gen_logits_std": 2.531938314437866, "gen_loss": 0.3034459352493286, "grad_norm": 0.4310575416631865, "learning_rate": 2.7051789473684213e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9947557598352432, "mean_gen_accuracy": 0.8621495217084885, "mean_token_accuracy": 0.9000456631183624, "num_tokens": 832179472.0, "sample_num_tokens": 7429.0, "step": 3070, "total_num_tokens": 832209188.0, "z_loss": 0.0007221272680908442 }, { "copy_logits_max": -4.38193941116333, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.3125, "epoch": 0.6272147051314781, "gen_logits_max": 5.284303188323975, "gen_logits_mean": -12.428380966186523, "gen_logits_min": -23.12149429321289, "gen_logits_std": 2.464634895324707, "gen_loss": 0.3091675937175751, "grad_norm": 0.41071066457396366, "learning_rate": 2.7050526315789474e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9951478242874146, "mean_gen_accuracy": 0.8669769465923309, "mean_token_accuracy": 0.9016067534685135, "num_tokens": 832451264.0, "sample_num_tokens": 8400.0, "step": 3071, "total_num_tokens": 832484864.0, "z_loss": 0.0007335570990107954 }, { "copy_logits_max": -2.592343330383301, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.875, "epoch": 0.627418943068675, "gen_logits_max": 5.510295391082764, "gen_logits_mean": -12.509994506835938, "gen_logits_min": -24.262231826782227, "gen_logits_std": 2.558192729949951, "gen_loss": 0.31781041622161865, "grad_norm": 0.5400649390611401, "learning_rate": 2.7049263157894738e-05, "loss": 0.3356, "mean_copy_accuracy": 0.9935256540775299, "mean_gen_accuracy": 0.8535783439874649, "mean_token_accuracy": 0.8907770812511444, "num_tokens": 832711708.0, "sample_num_tokens": 8012.5, "step": 3072, "total_num_tokens": 832743758.0, "z_loss": 0.0007445194059982896 }, { "copy_logits_max": -1.1704273223876953, "copy_logits_min": -750000000.0, "copy_num_tokens": 730.5625, "epoch": 0.6276231810058719, "gen_logits_max": 4.446481704711914, "gen_logits_mean": -13.41199016571045, "gen_logits_min": -25.18509292602539, "gen_logits_std": 2.5386271476745605, "gen_loss": 0.29953134059906006, "grad_norm": 0.4567617451817865, "learning_rate": 2.7048e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9952019155025482, "mean_gen_accuracy": 0.8693878650665283, "mean_token_accuracy": 0.9007946848869324, "num_tokens": 832978238.0, "sample_num_tokens": 9554.0, "step": 3073, "total_num_tokens": 833016454.0, "z_loss": 0.0007421335903927684 }, { "copy_logits_max": -2.99173641204834, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.625, "epoch": 0.6278274189430687, "gen_logits_max": 5.303621768951416, "gen_logits_mean": -14.022377967834473, "gen_logits_min": -24.990692138671875, "gen_logits_std": 2.4768190383911133, "gen_loss": 0.32476353645324707, "grad_norm": 0.5434396567960585, "learning_rate": 2.7046736842105264e-05, "loss": 0.332, "mean_copy_accuracy": 0.9933378547430038, "mean_gen_accuracy": 0.8592293113470078, "mean_token_accuracy": 0.8879416286945343, "num_tokens": 833231230.0, "sample_num_tokens": 7930.0, "step": 3074, "total_num_tokens": 833262950.0, "z_loss": 0.0007971181767061353 }, { "copy_logits_max": -0.6500217318534851, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.0, "epoch": 0.6280316568802655, "gen_logits_max": 5.424463272094727, "gen_logits_mean": -12.986191749572754, "gen_logits_min": -24.155025482177734, "gen_logits_std": 2.519498825073242, "gen_loss": 0.3404686748981476, "grad_norm": 0.5330641714935133, "learning_rate": 2.7045473684210524e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9937752485275269, "mean_gen_accuracy": 0.8641234934329987, "mean_token_accuracy": 0.8942878693342209, "num_tokens": 833501711.0, "sample_num_tokens": 8076.75, "step": 3075, "total_num_tokens": 833534018.0, "z_loss": 0.0008371329167857766 }, { "copy_logits_max": -2.199672222137451, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.0625, "epoch": 0.6282358948174623, "gen_logits_max": 5.4623589515686035, "gen_logits_mean": -12.784775733947754, "gen_logits_min": -23.550952911376953, "gen_logits_std": 2.487698554992676, "gen_loss": 0.3396448791027069, "grad_norm": 0.515578256039177, "learning_rate": 2.704421052631579e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9919154345989227, "mean_gen_accuracy": 0.8630048036575317, "mean_token_accuracy": 0.8899448215961456, "num_tokens": 833755089.0, "sample_num_tokens": 7364.25, "step": 3076, "total_num_tokens": 833784546.0, "z_loss": 0.0007608211017213762 }, { "copy_logits_max": -2.321833610534668, "copy_logits_min": -750000000.0, "copy_num_tokens": 242.3125, "epoch": 0.6284401327546592, "gen_logits_max": 6.291436195373535, "gen_logits_mean": -12.97159194946289, "gen_logits_min": -23.880615234375, "gen_logits_std": 2.487372875213623, "gen_loss": 0.33408641815185547, "grad_norm": 0.463825162842532, "learning_rate": 2.7042947368421056e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9926761090755463, "mean_gen_accuracy": 0.8623493760824203, "mean_token_accuracy": 0.891996294260025, "num_tokens": 834012575.0, "sample_num_tokens": 6878.25, "step": 3077, "total_num_tokens": 834040088.0, "z_loss": 0.000815074541606009 }, { "copy_logits_max": -2.3297829627990723, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.4375, "epoch": 0.628644370691856, "gen_logits_max": 5.406806945800781, "gen_logits_mean": -14.19734001159668, "gen_logits_min": -25.220613479614258, "gen_logits_std": 2.5363528728485107, "gen_loss": 0.3404559791088104, "grad_norm": 0.520102139710505, "learning_rate": 2.7041684210526317e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9934729486703873, "mean_gen_accuracy": 0.8665594458580017, "mean_token_accuracy": 0.8975940048694611, "num_tokens": 834280763.0, "sample_num_tokens": 7933.75, "step": 3078, "total_num_tokens": 834312498.0, "z_loss": 0.000875512370839715 }, { "copy_logits_max": -3.5836238861083984, "copy_logits_min": -750000064.0, "copy_num_tokens": 403.75, "epoch": 0.6288486086290529, "gen_logits_max": 5.651388168334961, "gen_logits_mean": -13.798497200012207, "gen_logits_min": -24.778629302978516, "gen_logits_std": 2.5288286209106445, "gen_loss": 0.32835686206817627, "grad_norm": 0.44973725365636114, "learning_rate": 2.704042105263158e-05, "loss": 0.301, "mean_copy_accuracy": 0.9943915158510208, "mean_gen_accuracy": 0.8697220236063004, "mean_token_accuracy": 0.9007073640823364, "num_tokens": 834546425.0, "sample_num_tokens": 7621.75, "step": 3079, "total_num_tokens": 834576912.0, "z_loss": 0.0008474973728880286 }, { "copy_logits_max": -2.3682305812835693, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.5625, "epoch": 0.6290528465662497, "gen_logits_max": 6.093698024749756, "gen_logits_mean": -12.734066009521484, "gen_logits_min": -23.747159957885742, "gen_logits_std": 2.472003936767578, "gen_loss": 0.3746418356895447, "grad_norm": 0.4986942387070336, "learning_rate": 2.7039157894736843e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9928385615348816, "mean_gen_accuracy": 0.8585337698459625, "mean_token_accuracy": 0.891612246632576, "num_tokens": 834814828.0, "sample_num_tokens": 9570.0, "step": 3080, "total_num_tokens": 834853108.0, "z_loss": 0.0008754052687436342 }, { "copy_logits_max": -2.3608744144439697, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.1875, "epoch": 0.6292570845034465, "gen_logits_max": 5.771142959594727, "gen_logits_mean": -13.244039535522461, "gen_logits_min": -24.227115631103516, "gen_logits_std": 2.5133211612701416, "gen_loss": 0.33224374055862427, "grad_norm": 0.5500460426801048, "learning_rate": 2.7037894736842107e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9929632544517517, "mean_gen_accuracy": 0.8717680126428604, "mean_token_accuracy": 0.9011905342340469, "num_tokens": 835093608.0, "sample_num_tokens": 9154.0, "step": 3081, "total_num_tokens": 835130224.0, "z_loss": 0.0008066939772106707 }, { "copy_logits_max": -3.2822265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.875, "epoch": 0.6294613224406433, "gen_logits_max": 5.461643695831299, "gen_logits_mean": -14.226449966430664, "gen_logits_min": -24.9694766998291, "gen_logits_std": 2.4805610179901123, "gen_loss": 0.3127383589744568, "grad_norm": 0.373626711818095, "learning_rate": 2.7036631578947368e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9945369213819504, "mean_gen_accuracy": 0.8749829530715942, "mean_token_accuracy": 0.9014821797609329, "num_tokens": 835381768.0, "sample_num_tokens": 8981.5, "step": 3082, "total_num_tokens": 835417694.0, "z_loss": 0.0007301319856196642 }, { "copy_logits_max": -1.961935043334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.6875, "epoch": 0.6296655603778402, "gen_logits_max": 5.312255382537842, "gen_logits_mean": -13.75157356262207, "gen_logits_min": -25.10944366455078, "gen_logits_std": 2.555267333984375, "gen_loss": 0.37902700901031494, "grad_norm": 0.5475919045052964, "learning_rate": 2.7035368421052632e-05, "loss": 0.329, "mean_copy_accuracy": 0.9936414510011673, "mean_gen_accuracy": 0.8588757961988449, "mean_token_accuracy": 0.8906843513250351, "num_tokens": 835652214.0, "sample_num_tokens": 8079.5, "step": 3083, "total_num_tokens": 835684532.0, "z_loss": 0.0009207751136273146 }, { "copy_logits_max": -1.1006221771240234, "copy_logits_min": -625000064.0, "copy_num_tokens": 414.875, "epoch": 0.629869798315037, "gen_logits_max": 5.307159423828125, "gen_logits_mean": -13.411667823791504, "gen_logits_min": -24.984834671020508, "gen_logits_std": 2.579437732696533, "gen_loss": 0.35502517223358154, "grad_norm": 0.4392993847461656, "learning_rate": 2.7034105263157893e-05, "loss": 0.333, "mean_copy_accuracy": 0.9949802905321121, "mean_gen_accuracy": 0.8551005870103836, "mean_token_accuracy": 0.8891952335834503, "num_tokens": 835911564.0, "sample_num_tokens": 8454.5, "step": 3084, "total_num_tokens": 835945382.0, "z_loss": 0.0007930720457807183 }, { "copy_logits_max": -1.7208952903747559, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.1875, "epoch": 0.6300740362522339, "gen_logits_max": 5.626873016357422, "gen_logits_mean": -12.398834228515625, "gen_logits_min": -23.3959903717041, "gen_logits_std": 2.539048194885254, "gen_loss": 0.36333009600639343, "grad_norm": 0.5332874094605922, "learning_rate": 2.703284210526316e-05, "loss": 0.3317, "mean_copy_accuracy": 0.992302656173706, "mean_gen_accuracy": 0.858977660536766, "mean_token_accuracy": 0.889775350689888, "num_tokens": 836169293.0, "sample_num_tokens": 7979.25, "step": 3085, "total_num_tokens": 836201210.0, "z_loss": 0.000702305871527642 }, { "copy_logits_max": -3.263476610183716, "copy_logits_min": -750000064.0, "copy_num_tokens": 335.125, "epoch": 0.6302782741894307, "gen_logits_max": 5.591003894805908, "gen_logits_mean": -13.156020164489746, "gen_logits_min": -24.176530838012695, "gen_logits_std": 2.475510358810425, "gen_loss": 0.3464757204055786, "grad_norm": 0.5450419770313715, "learning_rate": 2.7031578947368422e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9937867820262909, "mean_gen_accuracy": 0.8623291999101639, "mean_token_accuracy": 0.8973374515771866, "num_tokens": 836457509.0, "sample_num_tokens": 7587.25, "step": 3086, "total_num_tokens": 836487858.0, "z_loss": 0.0007620544056408107 }, { "copy_logits_max": -0.28303495049476624, "copy_logits_min": -750000064.0, "copy_num_tokens": 607.8125, "epoch": 0.6304825121266275, "gen_logits_max": 4.705266952514648, "gen_logits_mean": -13.5895357131958, "gen_logits_min": -25.263282775878906, "gen_logits_std": 2.535684585571289, "gen_loss": 0.3182520270347595, "grad_norm": 0.5805046000020643, "learning_rate": 2.7030315789473686e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9935079663991928, "mean_gen_accuracy": 0.862431138753891, "mean_token_accuracy": 0.8964934945106506, "num_tokens": 836731576.0, "sample_num_tokens": 10143.5, "step": 3087, "total_num_tokens": 836772150.0, "z_loss": 0.0008178243297152221 }, { "copy_logits_max": -1.2825731039047241, "copy_logits_min": -687500032.0, "copy_num_tokens": 789.0, "epoch": 0.6306867500638244, "gen_logits_max": 4.561947822570801, "gen_logits_mean": -11.972875595092773, "gen_logits_min": -23.29673194885254, "gen_logits_std": 2.4966001510620117, "gen_loss": 0.267849862575531, "grad_norm": 0.42137258104405045, "learning_rate": 2.7029052631578947e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9952597320079803, "mean_gen_accuracy": 0.8727793246507645, "mean_token_accuracy": 0.9058020859956741, "num_tokens": 837022907.0, "sample_num_tokens": 10378.75, "step": 3088, "total_num_tokens": 837064422.0, "z_loss": 0.0007422065827995539 }, { "copy_logits_max": -2.58833646774292, "copy_logits_min": -687500032.0, "copy_num_tokens": 430.5, "epoch": 0.6308909880010212, "gen_logits_max": 5.1168742179870605, "gen_logits_mean": -14.047910690307617, "gen_logits_min": -25.046138763427734, "gen_logits_std": 2.5086545944213867, "gen_loss": 0.3421340882778168, "grad_norm": 0.5362035352558833, "learning_rate": 2.702778947368421e-05, "loss": 0.3177, "mean_copy_accuracy": 0.991669163107872, "mean_gen_accuracy": 0.8708025813102722, "mean_token_accuracy": 0.8949364125728607, "num_tokens": 837273371.0, "sample_num_tokens": 8532.25, "step": 3089, "total_num_tokens": 837307500.0, "z_loss": 0.0007256576209329069 }, { "copy_logits_max": -1.6756216287612915, "copy_logits_min": -687500032.0, "copy_num_tokens": 272.4375, "epoch": 0.631095225938218, "gen_logits_max": 6.054329872131348, "gen_logits_mean": -11.591839790344238, "gen_logits_min": -22.41134262084961, "gen_logits_std": 2.441927909851074, "gen_loss": 0.3454592823982239, "grad_norm": 0.4535287168107436, "learning_rate": 2.7026526315789476e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9927611500024796, "mean_gen_accuracy": 0.8646689206361771, "mean_token_accuracy": 0.8971187323331833, "num_tokens": 837567614.0, "sample_num_tokens": 7390.0, "step": 3090, "total_num_tokens": 837597174.0, "z_loss": 0.0006421295693144202 }, { "copy_logits_max": -0.0072179436683654785, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.5, "epoch": 0.6312994638754149, "gen_logits_max": 4.010807991027832, "gen_logits_mean": -13.444892883300781, "gen_logits_min": -24.669883728027344, "gen_logits_std": 2.4956250190734863, "gen_loss": 0.2817273437976837, "grad_norm": 0.4293254746548466, "learning_rate": 2.7025263157894736e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9946920573711395, "mean_gen_accuracy": 0.8630947172641754, "mean_token_accuracy": 0.898773729801178, "num_tokens": 837836551.0, "sample_num_tokens": 9491.75, "step": 3091, "total_num_tokens": 837874518.0, "z_loss": 0.0008524779113940895 }, { "copy_logits_max": 0.06751851737499237, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.8125, "epoch": 0.6315037018126117, "gen_logits_max": 4.514898300170898, "gen_logits_mean": -13.378314971923828, "gen_logits_min": -24.529144287109375, "gen_logits_std": 2.533792734146118, "gen_loss": 0.30411481857299805, "grad_norm": 0.43421898678746773, "learning_rate": 2.7024e-05, "loss": 0.294, "mean_copy_accuracy": 0.9962109476327896, "mean_gen_accuracy": 0.8690502345561981, "mean_token_accuracy": 0.9022295624017715, "num_tokens": 838114009.0, "sample_num_tokens": 9089.25, "step": 3092, "total_num_tokens": 838150366.0, "z_loss": 0.0008408227004110813 }, { "copy_logits_max": -3.400742530822754, "copy_logits_min": -687500032.0, "copy_num_tokens": 184.3125, "epoch": 0.6317079397498085, "gen_logits_max": 5.600099563598633, "gen_logits_mean": -14.689390182495117, "gen_logits_min": -25.61219024658203, "gen_logits_std": 2.516756534576416, "gen_loss": 0.3515879511833191, "grad_norm": 0.5183418228183924, "learning_rate": 2.7022736842105262e-05, "loss": 0.3303, "mean_copy_accuracy": 0.9933843314647675, "mean_gen_accuracy": 0.8625239580869675, "mean_token_accuracy": 0.8908735513687134, "num_tokens": 838357961.0, "sample_num_tokens": 5942.75, "step": 3093, "total_num_tokens": 838381732.0, "z_loss": 0.0007706278702244163 }, { "copy_logits_max": 0.131814107298851, "copy_logits_min": -687500096.0, "copy_num_tokens": 434.125, "epoch": 0.6319121776870054, "gen_logits_max": 5.29052734375, "gen_logits_mean": -12.892401695251465, "gen_logits_min": -24.11463165283203, "gen_logits_std": 2.513047218322754, "gen_loss": 0.3038060665130615, "grad_norm": 0.5071048443853322, "learning_rate": 2.702147368421053e-05, "loss": 0.2926, "mean_copy_accuracy": 0.991997241973877, "mean_gen_accuracy": 0.8727135956287384, "mean_token_accuracy": 0.902226448059082, "num_tokens": 838618221.0, "sample_num_tokens": 7515.25, "step": 3094, "total_num_tokens": 838648282.0, "z_loss": 0.0007516461773775518 }, { "copy_logits_max": -2.0650548934936523, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.3125, "epoch": 0.6321164156242022, "gen_logits_max": 6.200530529022217, "gen_logits_mean": -13.350861549377441, "gen_logits_min": -24.42923355102539, "gen_logits_std": 2.520315647125244, "gen_loss": 0.3594561815261841, "grad_norm": 0.48741748766166554, "learning_rate": 2.702021052631579e-05, "loss": 0.3245, "mean_copy_accuracy": 0.993459016084671, "mean_gen_accuracy": 0.864401787519455, "mean_token_accuracy": 0.8930562138557434, "num_tokens": 838879044.0, "sample_num_tokens": 6689.0, "step": 3095, "total_num_tokens": 838905800.0, "z_loss": 0.0008154664537869394 }, { "copy_logits_max": -0.4361955523490906, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.1875, "epoch": 0.632320653561399, "gen_logits_max": 5.01192569732666, "gen_logits_mean": -13.493027687072754, "gen_logits_min": -24.703697204589844, "gen_logits_std": 2.546964168548584, "gen_loss": 0.29136842489242554, "grad_norm": 0.4126328360290194, "learning_rate": 2.7018947368421055e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9949670881032944, "mean_gen_accuracy": 0.8654332607984543, "mean_token_accuracy": 0.8970371335744858, "num_tokens": 839146511.0, "sample_num_tokens": 8720.75, "step": 3096, "total_num_tokens": 839181394.0, "z_loss": 0.0007843116763979197 }, { "copy_logits_max": -1.1553287506103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.5625, "epoch": 0.6325248914985958, "gen_logits_max": 6.449995994567871, "gen_logits_mean": -11.904769897460938, "gen_logits_min": -22.931198120117188, "gen_logits_std": 2.497241973876953, "gen_loss": 0.32384610176086426, "grad_norm": 0.5175303447295398, "learning_rate": 2.7017684210526316e-05, "loss": 0.3195, "mean_copy_accuracy": 0.994369626045227, "mean_gen_accuracy": 0.8677868098020554, "mean_token_accuracy": 0.8957697749137878, "num_tokens": 839412073.0, "sample_num_tokens": 8373.25, "step": 3097, "total_num_tokens": 839445566.0, "z_loss": 0.0007628381717950106 }, { "copy_logits_max": -0.5362403392791748, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.5, "epoch": 0.6327291294357927, "gen_logits_max": 4.6399383544921875, "gen_logits_mean": -13.050420761108398, "gen_logits_min": -24.458751678466797, "gen_logits_std": 2.5154128074645996, "gen_loss": 0.29444754123687744, "grad_norm": 0.4183640364129488, "learning_rate": 2.701642105263158e-05, "loss": 0.303, "mean_copy_accuracy": 0.9947276264429092, "mean_gen_accuracy": 0.8662891089916229, "mean_token_accuracy": 0.8988534808158875, "num_tokens": 839683379.0, "sample_num_tokens": 9201.25, "step": 3098, "total_num_tokens": 839720184.0, "z_loss": 0.0006701412494294345 }, { "copy_logits_max": -1.7806578874588013, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.125, "epoch": 0.6329333673729896, "gen_logits_max": 5.1174211502075195, "gen_logits_mean": -13.603693008422852, "gen_logits_min": -24.798826217651367, "gen_logits_std": 2.5751585960388184, "gen_loss": 0.3337671160697937, "grad_norm": 0.5725909509867664, "learning_rate": 2.701515789473684e-05, "loss": 0.333, "mean_copy_accuracy": 0.992339015007019, "mean_gen_accuracy": 0.8535802364349365, "mean_token_accuracy": 0.8877212107181549, "num_tokens": 839953369.0, "sample_num_tokens": 8827.25, "step": 3099, "total_num_tokens": 839988678.0, "z_loss": 0.0007224594592116773 }, { "copy_logits_max": -1.3615609407424927, "copy_logits_min": -750000128.0, "copy_num_tokens": 409.8125, "epoch": 0.6331376053101864, "gen_logits_max": 5.416725158691406, "gen_logits_mean": -12.600449562072754, "gen_logits_min": -23.41508674621582, "gen_logits_std": 2.439781665802002, "gen_loss": 0.3283264935016632, "grad_norm": 0.45734278394472183, "learning_rate": 2.7013894736842105e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9934750199317932, "mean_gen_accuracy": 0.8654452413320541, "mean_token_accuracy": 0.8933145701885223, "num_tokens": 840217570.0, "sample_num_tokens": 8129.5, "step": 3100, "total_num_tokens": 840250088.0, "z_loss": 0.000741106690838933 }, { "copy_logits_max": -2.573167562484741, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.6333418432473832, "gen_logits_max": 5.006866455078125, "gen_logits_mean": -14.586091995239258, "gen_logits_min": -25.540889739990234, "gen_logits_std": 2.5390965938568115, "gen_loss": 0.31864112615585327, "grad_norm": 0.5216689079145499, "learning_rate": 2.7012631578947366e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9950498789548874, "mean_gen_accuracy": 0.8638540804386139, "mean_token_accuracy": 0.8932133615016937, "num_tokens": 840486095.0, "sample_num_tokens": 8055.25, "step": 3101, "total_num_tokens": 840518316.0, "z_loss": 0.0007850799593143165 }, { "copy_logits_max": -2.031599521636963, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.5625, "epoch": 0.63354608118458, "gen_logits_max": 4.895458221435547, "gen_logits_mean": -13.554262161254883, "gen_logits_min": -24.967422485351562, "gen_logits_std": 2.4599270820617676, "gen_loss": 0.29198989272117615, "grad_norm": 0.4412884280640313, "learning_rate": 2.7011368421052634e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9942347556352615, "mean_gen_accuracy": 0.8627827167510986, "mean_token_accuracy": 0.8981587886810303, "num_tokens": 840784514.0, "sample_num_tokens": 8518.5, "step": 3102, "total_num_tokens": 840818588.0, "z_loss": 0.000757134985178709 }, { "copy_logits_max": -1.4947761297225952, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.3125, "epoch": 0.6337503191217768, "gen_logits_max": 5.167792320251465, "gen_logits_mean": -12.457228660583496, "gen_logits_min": -23.47882652282715, "gen_logits_std": 2.4417309761047363, "gen_loss": 0.3076411485671997, "grad_norm": 0.4190534075085761, "learning_rate": 2.7010105263157898e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9938932210206985, "mean_gen_accuracy": 0.8622792065143585, "mean_token_accuracy": 0.8958840519189835, "num_tokens": 841070418.0, "sample_num_tokens": 9002.0, "step": 3103, "total_num_tokens": 841106426.0, "z_loss": 0.0007092328742146492 }, { "copy_logits_max": -1.0232337713241577, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.6875, "epoch": 0.6339545570589737, "gen_logits_max": 6.281455039978027, "gen_logits_mean": -12.358275413513184, "gen_logits_min": -23.53167152404785, "gen_logits_std": 2.515934467315674, "gen_loss": 0.3543410897254944, "grad_norm": 0.4810779640888535, "learning_rate": 2.700884210526316e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9925367534160614, "mean_gen_accuracy": 0.8624681681394577, "mean_token_accuracy": 0.8959627151489258, "num_tokens": 841351973.0, "sample_num_tokens": 6902.25, "step": 3104, "total_num_tokens": 841379582.0, "z_loss": 0.0007633958593942225 }, { "copy_logits_max": 0.0014705359935760498, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.5625, "epoch": 0.6341587949961706, "gen_logits_max": 6.341470241546631, "gen_logits_mean": -12.434850692749023, "gen_logits_min": -23.58762550354004, "gen_logits_std": 2.5103273391723633, "gen_loss": 0.3320704400539398, "grad_norm": 0.44470776213602964, "learning_rate": 2.7007578947368423e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9939658641815186, "mean_gen_accuracy": 0.862269937992096, "mean_token_accuracy": 0.8954876959323883, "num_tokens": 841634161.0, "sample_num_tokens": 6914.75, "step": 3105, "total_num_tokens": 841661820.0, "z_loss": 0.0008136143442243338 }, { "copy_logits_max": 0.5551299452781677, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.5625, "epoch": 0.6343630329333674, "gen_logits_max": 6.08290958404541, "gen_logits_mean": -11.943965911865234, "gen_logits_min": -22.902332305908203, "gen_logits_std": 2.457880735397339, "gen_loss": 0.3154356777667999, "grad_norm": 0.44747613932184177, "learning_rate": 2.7006315789473684e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9952689409255981, "mean_gen_accuracy": 0.8592957556247711, "mean_token_accuracy": 0.8919142037630081, "num_tokens": 841889832.0, "sample_num_tokens": 8598.5, "step": 3106, "total_num_tokens": 841924226.0, "z_loss": 0.0008154803654178977 }, { "copy_logits_max": -0.24517065286636353, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.25, "epoch": 0.6345672708705642, "gen_logits_max": 5.902961730957031, "gen_logits_mean": -12.239484786987305, "gen_logits_min": -23.95501136779785, "gen_logits_std": 2.5231494903564453, "gen_loss": 0.3481913208961487, "grad_norm": 0.4221426843959472, "learning_rate": 2.700505263157895e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9939126521348953, "mean_gen_accuracy": 0.8590519726276398, "mean_token_accuracy": 0.8907093852758408, "num_tokens": 842146985.0, "sample_num_tokens": 8124.25, "step": 3107, "total_num_tokens": 842179482.0, "z_loss": 0.0008482512785121799 }, { "copy_logits_max": -1.7126052379608154, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.3125, "epoch": 0.634771508807761, "gen_logits_max": 4.852265357971191, "gen_logits_mean": -13.874795913696289, "gen_logits_min": -24.900287628173828, "gen_logits_std": 2.514636754989624, "gen_loss": 0.3264826536178589, "grad_norm": 0.44532457852503193, "learning_rate": 2.700378947368421e-05, "loss": 0.3236, "mean_copy_accuracy": 0.9935036599636078, "mean_gen_accuracy": 0.8598766624927521, "mean_token_accuracy": 0.8929112553596497, "num_tokens": 842410298.0, "sample_num_tokens": 8116.5, "step": 3108, "total_num_tokens": 842442764.0, "z_loss": 0.0006801355630159378 }, { "copy_logits_max": -5.166391372680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.875, "epoch": 0.6349757467449578, "gen_logits_max": 5.354386329650879, "gen_logits_mean": -14.970243453979492, "gen_logits_min": -26.171337127685547, "gen_logits_std": 2.5329031944274902, "gen_loss": 0.3018982708454132, "grad_norm": 0.4220816699456865, "learning_rate": 2.7002526315789474e-05, "loss": 0.2926, "mean_copy_accuracy": 0.993931770324707, "mean_gen_accuracy": 0.8701609820127487, "mean_token_accuracy": 0.9027664512395859, "num_tokens": 842695249.0, "sample_num_tokens": 7336.25, "step": 3109, "total_num_tokens": 842724594.0, "z_loss": 0.000715774018317461 }, { "copy_logits_max": -2.4398956298828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.0625, "epoch": 0.6351799846821548, "gen_logits_max": 5.736626625061035, "gen_logits_mean": -13.878332138061523, "gen_logits_min": -25.14397621154785, "gen_logits_std": 2.555842876434326, "gen_loss": 0.3863508403301239, "grad_norm": 0.4944785998962977, "learning_rate": 2.7001263157894738e-05, "loss": 0.3538, "mean_copy_accuracy": 0.9916882514953613, "mean_gen_accuracy": 0.8569040298461914, "mean_token_accuracy": 0.8834716081619263, "num_tokens": 842936984.0, "sample_num_tokens": 6739.0, "step": 3110, "total_num_tokens": 842963940.0, "z_loss": 0.0008343932568095624 }, { "copy_logits_max": 1.0010404586791992, "copy_logits_min": -750000064.0, "copy_num_tokens": 500.75, "epoch": 0.6353842226193516, "gen_logits_max": 6.638179779052734, "gen_logits_mean": -11.187128067016602, "gen_logits_min": -22.22089958190918, "gen_logits_std": 2.4789962768554688, "gen_loss": 0.26914846897125244, "grad_norm": 0.5629457488318945, "learning_rate": 2.7000000000000002e-05, "loss": 0.31, "mean_copy_accuracy": 0.9946271926164627, "mean_gen_accuracy": 0.8676191568374634, "mean_token_accuracy": 0.8980059027671814, "num_tokens": 843192669.0, "sample_num_tokens": 8472.25, "step": 3111, "total_num_tokens": 843226558.0, "z_loss": 0.0006370184128172696 }, { "copy_logits_max": -0.3297872543334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.875, "epoch": 0.6355884605565484, "gen_logits_max": 6.154097557067871, "gen_logits_mean": -11.87643051147461, "gen_logits_min": -23.947628021240234, "gen_logits_std": 2.586099147796631, "gen_loss": 0.320138543844223, "grad_norm": 0.44563004494567726, "learning_rate": 2.6998736842105263e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9955675452947617, "mean_gen_accuracy": 0.8642330765724182, "mean_token_accuracy": 0.8994108289480209, "num_tokens": 843472046.0, "sample_num_tokens": 8720.0, "step": 3112, "total_num_tokens": 843506926.0, "z_loss": 0.0007683931034989655 }, { "copy_logits_max": -0.6664935946464539, "copy_logits_min": -687500032.0, "copy_num_tokens": 357.25, "epoch": 0.6357926984937452, "gen_logits_max": 5.971327304840088, "gen_logits_mean": -12.393589973449707, "gen_logits_min": -24.30792999267578, "gen_logits_std": 2.5343029499053955, "gen_loss": 0.3706592321395874, "grad_norm": 0.42846736072829467, "learning_rate": 2.6997473684210528e-05, "loss": 0.3365, "mean_copy_accuracy": 0.994315966963768, "mean_gen_accuracy": 0.8548854738473892, "mean_token_accuracy": 0.88911272585392, "num_tokens": 843743985.0, "sample_num_tokens": 8022.75, "step": 3113, "total_num_tokens": 843776076.0, "z_loss": 0.0007826411165297031 }, { "copy_logits_max": -0.5679435729980469, "copy_logits_min": -687500032.0, "copy_num_tokens": 695.875, "epoch": 0.635996936430942, "gen_logits_max": 3.933152198791504, "gen_logits_mean": -14.278084754943848, "gen_logits_min": -26.07648468017578, "gen_logits_std": 2.59425950050354, "gen_loss": 0.2335631102323532, "grad_norm": 0.4008103045735185, "learning_rate": 2.699621052631579e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9952374696731567, "mean_gen_accuracy": 0.8695603609085083, "mean_token_accuracy": 0.9031155854463577, "num_tokens": 844015840.0, "sample_num_tokens": 9369.5, "step": 3114, "total_num_tokens": 844053318.0, "z_loss": 0.0005583535530604422 }, { "copy_logits_max": 1.6598260402679443, "copy_logits_min": -687500032.0, "copy_num_tokens": 529.75, "epoch": 0.6362011743681388, "gen_logits_max": 4.54750394821167, "gen_logits_mean": -13.696311950683594, "gen_logits_min": -25.253877639770508, "gen_logits_std": 2.5847418308258057, "gen_loss": 0.38854074478149414, "grad_norm": 0.40571576173813084, "learning_rate": 2.6994947368421053e-05, "loss": 0.3228, "mean_copy_accuracy": 0.9948775172233582, "mean_gen_accuracy": 0.8600466251373291, "mean_token_accuracy": 0.8969578593969345, "num_tokens": 844290018.0, "sample_num_tokens": 8679.0, "step": 3115, "total_num_tokens": 844324734.0, "z_loss": 0.0007943619857542217 }, { "copy_logits_max": -1.2942850589752197, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.0625, "epoch": 0.6364054123053358, "gen_logits_max": 3.901003360748291, "gen_logits_mean": -14.729476928710938, "gen_logits_min": -25.762691497802734, "gen_logits_std": 2.5185861587524414, "gen_loss": 0.27990394830703735, "grad_norm": 0.43048849981997833, "learning_rate": 2.6993684210526314e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9948324859142303, "mean_gen_accuracy": 0.8607500493526459, "mean_token_accuracy": 0.8954249769449234, "num_tokens": 844543342.0, "sample_num_tokens": 9116.0, "step": 3116, "total_num_tokens": 844579806.0, "z_loss": 0.0006013050442561507 }, { "copy_logits_max": -2.546935558319092, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.5625, "epoch": 0.6366096502425326, "gen_logits_max": 5.24580192565918, "gen_logits_mean": -12.884689331054688, "gen_logits_min": -23.982257843017578, "gen_logits_std": 2.4673776626586914, "gen_loss": 0.3553238809108734, "grad_norm": 0.40400525805742044, "learning_rate": 2.6992421052631578e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9942765235900879, "mean_gen_accuracy": 0.8677537143230438, "mean_token_accuracy": 0.8967706263065338, "num_tokens": 844816465.0, "sample_num_tokens": 7457.75, "step": 3117, "total_num_tokens": 844846296.0, "z_loss": 0.0006764247082173824 }, { "copy_logits_max": -1.8794474601745605, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.8125, "epoch": 0.6368138881797294, "gen_logits_max": 5.8609299659729, "gen_logits_mean": -11.491856575012207, "gen_logits_min": -22.224292755126953, "gen_logits_std": 2.3918139934539795, "gen_loss": 0.3287580907344818, "grad_norm": 0.45227540094029167, "learning_rate": 2.6991157894736846e-05, "loss": 0.314, "mean_copy_accuracy": 0.9949317276477814, "mean_gen_accuracy": 0.8654613345861435, "mean_token_accuracy": 0.8956918269395828, "num_tokens": 845078582.0, "sample_num_tokens": 8147.5, "step": 3118, "total_num_tokens": 845111172.0, "z_loss": 0.0006921369931660593 }, { "copy_logits_max": 0.9001756906509399, "copy_logits_min": -750000000.0, "copy_num_tokens": 852.6875, "epoch": 0.6370181261169262, "gen_logits_max": 5.5914225578308105, "gen_logits_mean": -10.941305160522461, "gen_logits_min": -22.319210052490234, "gen_logits_std": 2.429464340209961, "gen_loss": 0.2866339087486267, "grad_norm": 0.4177218655020719, "learning_rate": 2.6989894736842107e-05, "loss": 0.3084, "mean_copy_accuracy": 0.994655966758728, "mean_gen_accuracy": 0.8656736463308334, "mean_token_accuracy": 0.8999140858650208, "num_tokens": 845348739.0, "sample_num_tokens": 11145.25, "step": 3119, "total_num_tokens": 845393320.0, "z_loss": 0.0007629011524841189 }, { "copy_logits_max": -4.215346336364746, "copy_logits_min": -750000064.0, "copy_num_tokens": 320.0625, "epoch": 0.637222364054123, "gen_logits_max": 6.2471795082092285, "gen_logits_mean": -11.860011100769043, "gen_logits_min": -22.583343505859375, "gen_logits_std": 2.430224657058716, "gen_loss": 0.3585074841976166, "grad_norm": 0.47652622233346614, "learning_rate": 2.698863157894737e-05, "loss": 0.3101, "mean_copy_accuracy": 0.994996190071106, "mean_gen_accuracy": 0.8666970729827881, "mean_token_accuracy": 0.8964788764715195, "num_tokens": 845618417.0, "sample_num_tokens": 7809.75, "step": 3120, "total_num_tokens": 845649656.0, "z_loss": 0.0007405783399008214 }, { "copy_logits_max": -5.800621032714844, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.0, "epoch": 0.6374266019913198, "gen_logits_max": 5.625931739807129, "gen_logits_mean": -13.112201690673828, "gen_logits_min": -23.480037689208984, "gen_logits_std": 2.439980983734131, "gen_loss": 0.3101074695587158, "grad_norm": 0.43954991117311293, "learning_rate": 2.6987368421052632e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9924485385417938, "mean_gen_accuracy": 0.8618511110544205, "mean_token_accuracy": 0.8873684853315353, "num_tokens": 845870527.0, "sample_num_tokens": 8777.75, "step": 3121, "total_num_tokens": 845905638.0, "z_loss": 0.0006295447237789631 }, { "copy_logits_max": -4.221308708190918, "copy_logits_min": -687500032.0, "copy_num_tokens": 258.3125, "epoch": 0.6376308399285168, "gen_logits_max": 5.604087829589844, "gen_logits_mean": -13.647293090820312, "gen_logits_min": -24.691055297851562, "gen_logits_std": 2.466064214706421, "gen_loss": 0.3412628173828125, "grad_norm": 0.43311887817887096, "learning_rate": 2.6986105263157896e-05, "loss": 0.329, "mean_copy_accuracy": 0.9949456006288528, "mean_gen_accuracy": 0.8591061383485794, "mean_token_accuracy": 0.8892767131328583, "num_tokens": 846132768.0, "sample_num_tokens": 6525.5, "step": 3122, "total_num_tokens": 846158870.0, "z_loss": 0.0007304640021175146 }, { "copy_logits_max": -3.3661651611328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.0, "epoch": 0.6378350778657136, "gen_logits_max": 5.693180084228516, "gen_logits_mean": -12.876174926757812, "gen_logits_min": -23.305721282958984, "gen_logits_std": 2.427351236343384, "gen_loss": 0.3382114768028259, "grad_norm": 0.410620144185046, "learning_rate": 2.6984842105263157e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9945865273475647, "mean_gen_accuracy": 0.871586948633194, "mean_token_accuracy": 0.8990301191806793, "num_tokens": 846409047.0, "sample_num_tokens": 8921.25, "step": 3123, "total_num_tokens": 846444732.0, "z_loss": 0.0006953872507438064 }, { "copy_logits_max": -1.5738084316253662, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.5, "epoch": 0.6380393158029104, "gen_logits_max": 5.734445571899414, "gen_logits_mean": -12.864888191223145, "gen_logits_min": -24.081401824951172, "gen_logits_std": 2.4874513149261475, "gen_loss": 0.30632880330085754, "grad_norm": 0.4513475449706444, "learning_rate": 2.698357894736842e-05, "loss": 0.325, "mean_copy_accuracy": 0.9956724643707275, "mean_gen_accuracy": 0.8626226037740707, "mean_token_accuracy": 0.8934532105922699, "num_tokens": 846679134.0, "sample_num_tokens": 7959.0, "step": 3124, "total_num_tokens": 846710970.0, "z_loss": 0.000793289567809552 }, { "copy_logits_max": -3.33937406539917, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.3125, "epoch": 0.6382435537401072, "gen_logits_max": 6.198172092437744, "gen_logits_mean": -13.02174186706543, "gen_logits_min": -23.943649291992188, "gen_logits_std": 2.51814603805542, "gen_loss": 0.2909727096557617, "grad_norm": 0.40402525748090445, "learning_rate": 2.6982315789473682e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9961510598659515, "mean_gen_accuracy": 0.8705713897943497, "mean_token_accuracy": 0.9020891934633255, "num_tokens": 846950315.0, "sample_num_tokens": 9120.25, "step": 3125, "total_num_tokens": 846986796.0, "z_loss": 0.0006793413194827735 }, { "copy_logits_max": -5.496799468994141, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.375, "epoch": 0.638447791677304, "gen_logits_max": 5.353682994842529, "gen_logits_mean": -13.995162010192871, "gen_logits_min": -24.606739044189453, "gen_logits_std": 2.4607303142547607, "gen_loss": 0.34983426332473755, "grad_norm": 0.41457240289617986, "learning_rate": 2.698105263157895e-05, "loss": 0.3307, "mean_copy_accuracy": 0.994553342461586, "mean_gen_accuracy": 0.8577483594417572, "mean_token_accuracy": 0.8896222561597824, "num_tokens": 847204661.0, "sample_num_tokens": 7887.75, "step": 3126, "total_num_tokens": 847236212.0, "z_loss": 0.000830231118015945 }, { "copy_logits_max": -4.26093053817749, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.5625, "epoch": 0.6386520296145008, "gen_logits_max": 5.447432041168213, "gen_logits_mean": -15.192444801330566, "gen_logits_min": -26.233154296875, "gen_logits_std": 2.5368502140045166, "gen_loss": 0.3678343892097473, "grad_norm": 0.41994938611029076, "learning_rate": 2.697978947368421e-05, "loss": 0.3198, "mean_copy_accuracy": 0.9932047873735428, "mean_gen_accuracy": 0.8691394627094269, "mean_token_accuracy": 0.893404170870781, "num_tokens": 847464342.0, "sample_num_tokens": 8787.0, "step": 3127, "total_num_tokens": 847499490.0, "z_loss": 0.000804800190962851 }, { "copy_logits_max": -2.743459463119507, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.0, "epoch": 0.6388562675516978, "gen_logits_max": 5.472082138061523, "gen_logits_mean": -12.926322937011719, "gen_logits_min": -23.97743034362793, "gen_logits_std": 2.5111074447631836, "gen_loss": 0.26072508096694946, "grad_norm": 0.4102497181751032, "learning_rate": 2.6978526315789475e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9948278814554214, "mean_gen_accuracy": 0.8730112016201019, "mean_token_accuracy": 0.9030015170574188, "num_tokens": 847720684.0, "sample_num_tokens": 7929.0, "step": 3128, "total_num_tokens": 847752400.0, "z_loss": 0.0006312160403467715 }, { "copy_logits_max": -4.563832759857178, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.3125, "epoch": 0.6390605054888946, "gen_logits_max": 5.66966438293457, "gen_logits_mean": -13.86527156829834, "gen_logits_min": -25.111042022705078, "gen_logits_std": 2.504941463470459, "gen_loss": 0.291378915309906, "grad_norm": 0.4059591116332821, "learning_rate": 2.6977263157894736e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9955162853002548, "mean_gen_accuracy": 0.8698322474956512, "mean_token_accuracy": 0.9020327478647232, "num_tokens": 847995729.0, "sample_num_tokens": 7753.75, "step": 3129, "total_num_tokens": 848026744.0, "z_loss": 0.0006555980071425438 }, { "copy_logits_max": -2.7158162593841553, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.3125, "epoch": 0.6392647434260914, "gen_logits_max": 6.028899192810059, "gen_logits_mean": -12.520384788513184, "gen_logits_min": -23.498367309570312, "gen_logits_std": 2.455033302307129, "gen_loss": 0.3417842984199524, "grad_norm": 0.41157658413998166, "learning_rate": 2.6976e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9956894516944885, "mean_gen_accuracy": 0.8647034764289856, "mean_token_accuracy": 0.8967259973287582, "num_tokens": 848266262.0, "sample_num_tokens": 6984.5, "step": 3130, "total_num_tokens": 848294200.0, "z_loss": 0.0008155829273164272 }, { "copy_logits_max": -2.0439682006835938, "copy_logits_min": -687500032.0, "copy_num_tokens": 473.5625, "epoch": 0.6394689813632882, "gen_logits_max": 5.274651527404785, "gen_logits_mean": -14.062031745910645, "gen_logits_min": -25.412776947021484, "gen_logits_std": 2.5604028701782227, "gen_loss": 0.3140204846858978, "grad_norm": 0.4900825470286317, "learning_rate": 2.6974736842105265e-05, "loss": 0.3282, "mean_copy_accuracy": 0.9934517741203308, "mean_gen_accuracy": 0.8632116913795471, "mean_token_accuracy": 0.8914774060249329, "num_tokens": 848511460.0, "sample_num_tokens": 7839.0, "step": 3131, "total_num_tokens": 848542816.0, "z_loss": 0.0008542832802049816 }, { "copy_logits_max": -3.444181442260742, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.1875, "epoch": 0.639673219300485, "gen_logits_max": 6.270498275756836, "gen_logits_mean": -11.77619743347168, "gen_logits_min": -22.834171295166016, "gen_logits_std": 2.4508888721466064, "gen_loss": 0.33953097462654114, "grad_norm": 0.4209426824671102, "learning_rate": 2.6973473684210526e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9952301979064941, "mean_gen_accuracy": 0.8575335592031479, "mean_token_accuracy": 0.8968924880027771, "num_tokens": 848780299.0, "sample_num_tokens": 7819.25, "step": 3132, "total_num_tokens": 848811576.0, "z_loss": 0.0007789666997268796 }, { "copy_logits_max": -3.16373872756958, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.375, "epoch": 0.6398774572376819, "gen_logits_max": 4.419683456420898, "gen_logits_mean": -12.980411529541016, "gen_logits_min": -24.647048950195312, "gen_logits_std": 2.4985549449920654, "gen_loss": 0.2824714779853821, "grad_norm": 0.4213828116200381, "learning_rate": 2.697221052631579e-05, "loss": 0.2952, "mean_copy_accuracy": 0.9961007535457611, "mean_gen_accuracy": 0.8684345185756683, "mean_token_accuracy": 0.9013712108135223, "num_tokens": 849046829.0, "sample_num_tokens": 8354.25, "step": 3133, "total_num_tokens": 849080246.0, "z_loss": 0.0006687861168757081 }, { "copy_logits_max": -3.0942025184631348, "copy_logits_min": -687500032.0, "copy_num_tokens": 535.0625, "epoch": 0.6400816951748788, "gen_logits_max": 5.955869674682617, "gen_logits_mean": -12.292808532714844, "gen_logits_min": -23.594066619873047, "gen_logits_std": 2.5582823753356934, "gen_loss": 0.31596678495407104, "grad_norm": 0.45704531404025983, "learning_rate": 2.6970947368421054e-05, "loss": 0.3247, "mean_copy_accuracy": 0.9935660511255264, "mean_gen_accuracy": 0.8635517358779907, "mean_token_accuracy": 0.895832970738411, "num_tokens": 849304472.0, "sample_num_tokens": 9153.0, "step": 3134, "total_num_tokens": 849341084.0, "z_loss": 0.0007077819900587201 }, { "copy_logits_max": -4.076694011688232, "copy_logits_min": -687500032.0, "copy_num_tokens": 294.1875, "epoch": 0.6402859331120756, "gen_logits_max": 5.815505504608154, "gen_logits_mean": -13.864090919494629, "gen_logits_min": -25.067909240722656, "gen_logits_std": 2.5429327487945557, "gen_loss": 0.3327253758907318, "grad_norm": 0.44867525502573236, "learning_rate": 2.696968421052632e-05, "loss": 0.3458, "mean_copy_accuracy": 0.995394229888916, "mean_gen_accuracy": 0.8526926636695862, "mean_token_accuracy": 0.8854677677154541, "num_tokens": 849581839.0, "sample_num_tokens": 6598.25, "step": 3135, "total_num_tokens": 849608232.0, "z_loss": 0.000681136385537684 }, { "copy_logits_max": -3.6532344818115234, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.9375, "epoch": 0.6404901710492724, "gen_logits_max": 4.7088470458984375, "gen_logits_mean": -13.519536018371582, "gen_logits_min": -25.01290512084961, "gen_logits_std": 2.5235776901245117, "gen_loss": 0.27192968130111694, "grad_norm": 0.41618656052636527, "learning_rate": 2.696842105263158e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9944219291210175, "mean_gen_accuracy": 0.8651384115219116, "mean_token_accuracy": 0.8957008421421051, "num_tokens": 849842490.0, "sample_num_tokens": 8136.5, "step": 3136, "total_num_tokens": 849875036.0, "z_loss": 0.0006131641566753387 }, { "copy_logits_max": -4.523025035858154, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.0, "epoch": 0.6406944089864692, "gen_logits_max": 6.088824272155762, "gen_logits_mean": -13.145544052124023, "gen_logits_min": -24.327789306640625, "gen_logits_std": 2.5477077960968018, "gen_loss": 0.31381314992904663, "grad_norm": 0.4373767307610979, "learning_rate": 2.6967157894736844e-05, "loss": 0.3185, "mean_copy_accuracy": 0.994922325015068, "mean_gen_accuracy": 0.8622383922338486, "mean_token_accuracy": 0.894360139966011, "num_tokens": 850087025.0, "sample_num_tokens": 7935.25, "step": 3137, "total_num_tokens": 850118766.0, "z_loss": 0.0006106332875788212 }, { "copy_logits_max": -3.9679276943206787, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.5, "epoch": 0.640898646923666, "gen_logits_max": 6.100059986114502, "gen_logits_mean": -13.254843711853027, "gen_logits_min": -24.601057052612305, "gen_logits_std": 2.5802178382873535, "gen_loss": 0.35664433240890503, "grad_norm": 0.4389152363176712, "learning_rate": 2.6965894736842105e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9941310584545135, "mean_gen_accuracy": 0.8625919222831726, "mean_token_accuracy": 0.8940790593624115, "num_tokens": 850356566.0, "sample_num_tokens": 8379.0, "step": 3138, "total_num_tokens": 850390082.0, "z_loss": 0.0006630493444390595 }, { "copy_logits_max": -3.4063191413879395, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.0625, "epoch": 0.6411028848608629, "gen_logits_max": 5.991357326507568, "gen_logits_mean": -11.987643241882324, "gen_logits_min": -22.841289520263672, "gen_logits_std": 2.4556541442871094, "gen_loss": 0.34691378474235535, "grad_norm": 0.41251174366732335, "learning_rate": 2.696463157894737e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9951401799917221, "mean_gen_accuracy": 0.8617819100618362, "mean_token_accuracy": 0.893963098526001, "num_tokens": 850635465.0, "sample_num_tokens": 8812.25, "step": 3139, "total_num_tokens": 850670714.0, "z_loss": 0.0007198003586381674 }, { "copy_logits_max": -3.9704456329345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.4375, "epoch": 0.6413071227980598, "gen_logits_max": 5.306734085083008, "gen_logits_mean": -13.764249801635742, "gen_logits_min": -25.153610229492188, "gen_logits_std": 2.552487850189209, "gen_loss": 0.299222469329834, "grad_norm": 0.48091028426887156, "learning_rate": 2.696336842105263e-05, "loss": 0.3155, "mean_copy_accuracy": 0.9946769922971725, "mean_gen_accuracy": 0.8679112195968628, "mean_token_accuracy": 0.8970657140016556, "num_tokens": 850903003.0, "sample_num_tokens": 7920.25, "step": 3140, "total_num_tokens": 850934684.0, "z_loss": 0.0006638988270424306 }, { "copy_logits_max": -8.255584716796875, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.6875, "epoch": 0.6415113607352566, "gen_logits_max": 5.453656196594238, "gen_logits_mean": -14.461214065551758, "gen_logits_min": -25.338600158691406, "gen_logits_std": 2.5336248874664307, "gen_loss": 0.29687583446502686, "grad_norm": 0.3973764859206545, "learning_rate": 2.6962105263157894e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9959125816822052, "mean_gen_accuracy": 0.8721825778484344, "mean_token_accuracy": 0.9016731083393097, "num_tokens": 851182288.0, "sample_num_tokens": 7612.0, "step": 3141, "total_num_tokens": 851212736.0, "z_loss": 0.0005911518237553537 }, { "copy_logits_max": -4.092792510986328, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.1875, "epoch": 0.6417155986724534, "gen_logits_max": 5.263789176940918, "gen_logits_mean": -12.607664108276367, "gen_logits_min": -23.33041000366211, "gen_logits_std": 2.415426731109619, "gen_loss": 0.32192692160606384, "grad_norm": 0.46464311244985107, "learning_rate": 2.696084210526316e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9933137446641922, "mean_gen_accuracy": 0.8683728724718094, "mean_token_accuracy": 0.897154688835144, "num_tokens": 851443726.0, "sample_num_tokens": 8523.0, "step": 3142, "total_num_tokens": 851477818.0, "z_loss": 0.0006482755998149514 }, { "copy_logits_max": -2.599459171295166, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.4375, "epoch": 0.6419198366096502, "gen_logits_max": 5.65693998336792, "gen_logits_mean": -12.077651977539062, "gen_logits_min": -23.46786117553711, "gen_logits_std": 2.537374496459961, "gen_loss": 0.35889822244644165, "grad_norm": 0.4787784220963676, "learning_rate": 2.6959578947368423e-05, "loss": 0.3472, "mean_copy_accuracy": 0.9923371821641922, "mean_gen_accuracy": 0.8532108962535858, "mean_token_accuracy": 0.8863012790679932, "num_tokens": 851722605.0, "sample_num_tokens": 8836.75, "step": 3143, "total_num_tokens": 851757952.0, "z_loss": 0.0008191422093659639 }, { "copy_logits_max": -2.7341020107269287, "copy_logits_min": -687500032.0, "copy_num_tokens": 656.6875, "epoch": 0.6421240745468471, "gen_logits_max": 6.027730941772461, "gen_logits_mean": -12.608396530151367, "gen_logits_min": -24.05141830444336, "gen_logits_std": 2.5753836631774902, "gen_loss": 0.288341224193573, "grad_norm": 0.5324932372672172, "learning_rate": 2.6958315789473687e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9949999451637268, "mean_gen_accuracy": 0.8723306804895401, "mean_token_accuracy": 0.9032352566719055, "num_tokens": 851996732.0, "sample_num_tokens": 10371.0, "step": 3144, "total_num_tokens": 852038216.0, "z_loss": 0.0008287022938020527 }, { "copy_logits_max": -1.2202000617980957, "copy_logits_min": -750000000.0, "copy_num_tokens": 672.5, "epoch": 0.6423283124840439, "gen_logits_max": 7.02225923538208, "gen_logits_mean": -10.687999725341797, "gen_logits_min": -22.566390991210938, "gen_logits_std": 2.5026254653930664, "gen_loss": 0.2934882938861847, "grad_norm": 0.43373971283396756, "learning_rate": 2.6957052631578948e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9963356405496597, "mean_gen_accuracy": 0.8532282710075378, "mean_token_accuracy": 0.8903290182352066, "num_tokens": 852284208.0, "sample_num_tokens": 9682.0, "step": 3145, "total_num_tokens": 852322936.0, "z_loss": 0.0008287126547656953 }, { "copy_logits_max": -2.9466781616210938, "copy_logits_min": -687500032.0, "copy_num_tokens": 354.3125, "epoch": 0.6425325504212408, "gen_logits_max": 5.917864799499512, "gen_logits_mean": -13.391061782836914, "gen_logits_min": -25.027048110961914, "gen_logits_std": 2.553891897201538, "gen_loss": 0.3088277578353882, "grad_norm": 0.4891147311512858, "learning_rate": 2.6955789473684213e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9935608059167862, "mean_gen_accuracy": 0.8571874797344208, "mean_token_accuracy": 0.8893444538116455, "num_tokens": 852547201.0, "sample_num_tokens": 7406.25, "step": 3146, "total_num_tokens": 852576826.0, "z_loss": 0.0007641263655386865 }, { "copy_logits_max": -3.9497804641723633, "copy_logits_min": -687500096.0, "copy_num_tokens": 416.25, "epoch": 0.6427367883584376, "gen_logits_max": 6.33806037902832, "gen_logits_mean": -12.608565330505371, "gen_logits_min": -24.437381744384766, "gen_logits_std": 2.5238287448883057, "gen_loss": 0.3137703239917755, "grad_norm": 0.5346822676180785, "learning_rate": 2.6954526315789474e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9934319406747818, "mean_gen_accuracy": 0.8572301268577576, "mean_token_accuracy": 0.8908982127904892, "num_tokens": 852786203.0, "sample_num_tokens": 7362.25, "step": 3147, "total_num_tokens": 852815652.0, "z_loss": 0.0008355214959010482 }, { "copy_logits_max": -4.533756256103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 255.4375, "epoch": 0.6429410262956344, "gen_logits_max": 7.326637268066406, "gen_logits_mean": -12.082467079162598, "gen_logits_min": -23.633602142333984, "gen_logits_std": 2.4749650955200195, "gen_loss": 0.37653419375419617, "grad_norm": 0.4401738310691064, "learning_rate": 2.6953263157894738e-05, "loss": 0.3362, "mean_copy_accuracy": 0.9944793432950974, "mean_gen_accuracy": 0.8620597124099731, "mean_token_accuracy": 0.8892742246389389, "num_tokens": 853032155.0, "sample_num_tokens": 6720.25, "step": 3148, "total_num_tokens": 853059036.0, "z_loss": 0.0008932615164667368 }, { "copy_logits_max": -1.4143857955932617, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.375, "epoch": 0.6431452642328312, "gen_logits_max": 5.161813735961914, "gen_logits_mean": -12.440631866455078, "gen_logits_min": -23.548809051513672, "gen_logits_std": 2.4616663455963135, "gen_loss": 0.2687202990055084, "grad_norm": 0.4137428368685618, "learning_rate": 2.6952e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9958143085241318, "mean_gen_accuracy": 0.8600880056619644, "mean_token_accuracy": 0.8981817662715912, "num_tokens": 853313444.0, "sample_num_tokens": 8948.0, "step": 3149, "total_num_tokens": 853349236.0, "z_loss": 0.0008460960816591978 }, { "copy_logits_max": -5.456489562988281, "copy_logits_min": -750000000.0, "copy_num_tokens": 208.0625, "epoch": 0.6433495021700281, "gen_logits_max": 6.244307994842529, "gen_logits_mean": -13.445829391479492, "gen_logits_min": -24.056489944458008, "gen_logits_std": 2.4493908882141113, "gen_loss": 0.31876543164253235, "grad_norm": 0.4368271914287383, "learning_rate": 2.6950736842105263e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9941848814487457, "mean_gen_accuracy": 0.8708552271127701, "mean_token_accuracy": 0.8996060639619827, "num_tokens": 853587374.0, "sample_num_tokens": 6272.5, "step": 3150, "total_num_tokens": 853612464.0, "z_loss": 0.000762796844355762 }, { "copy_logits_max": -2.501694917678833, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.125, "epoch": 0.6435537401072249, "gen_logits_max": 5.473233222961426, "gen_logits_mean": -13.901374816894531, "gen_logits_min": -25.875333786010742, "gen_logits_std": 2.594666004180908, "gen_loss": 0.3187914490699768, "grad_norm": 0.5223600987492449, "learning_rate": 2.6949473684210527e-05, "loss": 0.3234, "mean_copy_accuracy": 0.9944902211427689, "mean_gen_accuracy": 0.8627989590167999, "mean_token_accuracy": 0.893028199672699, "num_tokens": 853824051.0, "sample_num_tokens": 8658.25, "step": 3151, "total_num_tokens": 853858684.0, "z_loss": 0.0007559274090453982 }, { "copy_logits_max": -4.263257026672363, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.0, "epoch": 0.6437579780444217, "gen_logits_max": 5.697022438049316, "gen_logits_mean": -12.316295623779297, "gen_logits_min": -23.378868103027344, "gen_logits_std": 2.497148036956787, "gen_loss": 0.2947043180465698, "grad_norm": 0.5470164766063567, "learning_rate": 2.6948210526315792e-05, "loss": 0.3141, "mean_copy_accuracy": 0.99336177110672, "mean_gen_accuracy": 0.8640391379594803, "mean_token_accuracy": 0.8947493731975555, "num_tokens": 854081243.0, "sample_num_tokens": 6742.75, "step": 3152, "total_num_tokens": 854108214.0, "z_loss": 0.0006969560636207461 }, { "copy_logits_max": -2.722740888595581, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.25, "epoch": 0.6439622159816186, "gen_logits_max": 5.810068607330322, "gen_logits_mean": -13.378427505493164, "gen_logits_min": -24.243160247802734, "gen_logits_std": 2.50120210647583, "gen_loss": 0.3557828664779663, "grad_norm": 0.4329236099602855, "learning_rate": 2.6946947368421053e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9951889216899872, "mean_gen_accuracy": 0.8657436668872833, "mean_token_accuracy": 0.8970046192407608, "num_tokens": 854355005.0, "sample_num_tokens": 7340.25, "step": 3153, "total_num_tokens": 854384366.0, "z_loss": 0.0008196150884032249 }, { "copy_logits_max": -3.846097946166992, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.875, "epoch": 0.6441664539188154, "gen_logits_max": 5.661754608154297, "gen_logits_mean": -14.094644546508789, "gen_logits_min": -25.70928955078125, "gen_logits_std": 2.5749876499176025, "gen_loss": 0.31655800342559814, "grad_norm": 0.4318449929714574, "learning_rate": 2.6945684210526317e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9963968694210052, "mean_gen_accuracy": 0.8739431351423264, "mean_token_accuracy": 0.9067256301641464, "num_tokens": 854636226.0, "sample_num_tokens": 8517.0, "step": 3154, "total_num_tokens": 854670294.0, "z_loss": 0.0007407148368656635 }, { "copy_logits_max": -2.781162738800049, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.9375, "epoch": 0.6443706918560123, "gen_logits_max": 5.586288928985596, "gen_logits_mean": -12.075202941894531, "gen_logits_min": -23.23651123046875, "gen_logits_std": 2.5261263847351074, "gen_loss": 0.3002108931541443, "grad_norm": 0.609894890764666, "learning_rate": 2.6944421052631578e-05, "loss": 0.325, "mean_copy_accuracy": 0.9941080808639526, "mean_gen_accuracy": 0.8600969016551971, "mean_token_accuracy": 0.8930304795503616, "num_tokens": 854905014.0, "sample_num_tokens": 8299.5, "step": 3155, "total_num_tokens": 854938212.0, "z_loss": 0.0006952002295292914 }, { "copy_logits_max": -3.5437073707580566, "copy_logits_min": -750000064.0, "copy_num_tokens": 504.0, "epoch": 0.6445749297932091, "gen_logits_max": 5.303915023803711, "gen_logits_mean": -12.981792449951172, "gen_logits_min": -24.175189971923828, "gen_logits_std": 2.533717155456543, "gen_loss": 0.29592251777648926, "grad_norm": 0.3614652016834863, "learning_rate": 2.6943157894736842e-05, "loss": 0.2776, "mean_copy_accuracy": 0.995151549577713, "mean_gen_accuracy": 0.8753985911607742, "mean_token_accuracy": 0.9056826084852219, "num_tokens": 855186458.0, "sample_num_tokens": 9183.5, "step": 3156, "total_num_tokens": 855223192.0, "z_loss": 0.0006925977650098503 }, { "copy_logits_max": -1.955110788345337, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.125, "epoch": 0.6447791677304059, "gen_logits_max": 5.409968852996826, "gen_logits_mean": -12.646867752075195, "gen_logits_min": -23.565868377685547, "gen_logits_std": 2.475330114364624, "gen_loss": 0.3450159430503845, "grad_norm": 0.5466393085292446, "learning_rate": 2.6941894736842106e-05, "loss": 0.3388, "mean_copy_accuracy": 0.9940963685512543, "mean_gen_accuracy": 0.8502392470836639, "mean_token_accuracy": 0.8889926671981812, "num_tokens": 855465694.0, "sample_num_tokens": 8034.5, "step": 3157, "total_num_tokens": 855497832.0, "z_loss": 0.0008080926490947604 }, { "copy_logits_max": -2.9740939140319824, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.1875, "epoch": 0.6449834056676027, "gen_logits_max": 4.866909503936768, "gen_logits_mean": -13.761432647705078, "gen_logits_min": -25.101543426513672, "gen_logits_std": 2.5791571140289307, "gen_loss": 0.30746519565582275, "grad_norm": 0.43009168052395635, "learning_rate": 2.6940631578947367e-05, "loss": 0.303, "mean_copy_accuracy": 0.9948249310255051, "mean_gen_accuracy": 0.8618987947702408, "mean_token_accuracy": 0.9001747667789459, "num_tokens": 855764471.0, "sample_num_tokens": 7901.75, "step": 3158, "total_num_tokens": 855796078.0, "z_loss": 0.000662770529743284 }, { "copy_logits_max": -4.806161880493164, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.375, "epoch": 0.6451876436047996, "gen_logits_max": 4.550893783569336, "gen_logits_mean": -14.480504989624023, "gen_logits_min": -26.16056251525879, "gen_logits_std": 2.5922975540161133, "gen_loss": 0.2925146222114563, "grad_norm": 0.45632322902485045, "learning_rate": 2.6939368421052635e-05, "loss": 0.316, "mean_copy_accuracy": 0.9946427792310715, "mean_gen_accuracy": 0.8584063798189163, "mean_token_accuracy": 0.8941463083028793, "num_tokens": 856024094.0, "sample_num_tokens": 8928.0, "step": 3159, "total_num_tokens": 856059806.0, "z_loss": 0.0005610964726656675 }, { "copy_logits_max": -4.331898212432861, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.3125, "epoch": 0.6453918815419964, "gen_logits_max": 5.360933303833008, "gen_logits_mean": -13.229440689086914, "gen_logits_min": -24.839557647705078, "gen_logits_std": 2.5638575553894043, "gen_loss": 0.31845948100090027, "grad_norm": 0.4518024304142286, "learning_rate": 2.6938105263157896e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9939853996038437, "mean_gen_accuracy": 0.857799306511879, "mean_token_accuracy": 0.8882991522550583, "num_tokens": 856298406.0, "sample_num_tokens": 6684.5, "step": 3160, "total_num_tokens": 856325144.0, "z_loss": 0.0006276241037994623 }, { "copy_logits_max": -4.954105377197266, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.25, "epoch": 0.6455961194791933, "gen_logits_max": 4.2907867431640625, "gen_logits_mean": -15.835021018981934, "gen_logits_min": -27.092817306518555, "gen_logits_std": 2.590815544128418, "gen_loss": 0.3159610331058502, "grad_norm": 0.447267440821902, "learning_rate": 2.693684210526316e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9933653622865677, "mean_gen_accuracy": 0.8641011863946915, "mean_token_accuracy": 0.8939270079135895, "num_tokens": 856547852.0, "sample_num_tokens": 6811.0, "step": 3161, "total_num_tokens": 856575096.0, "z_loss": 0.0006347278831526637 }, { "copy_logits_max": -3.776196002960205, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.3125, "epoch": 0.6458003574163901, "gen_logits_max": 4.408483505249023, "gen_logits_mean": -14.924328804016113, "gen_logits_min": -26.70393180847168, "gen_logits_std": 2.579010248184204, "gen_loss": 0.34335413575172424, "grad_norm": 0.5065486377667692, "learning_rate": 2.693557894736842e-05, "loss": 0.3377, "mean_copy_accuracy": 0.9943688213825226, "mean_gen_accuracy": 0.8539350032806396, "mean_token_accuracy": 0.8882558941841125, "num_tokens": 856817060.0, "sample_num_tokens": 8869.5, "step": 3162, "total_num_tokens": 856852538.0, "z_loss": 0.0006995581206865609 }, { "copy_logits_max": -3.229315757751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 702.5625, "epoch": 0.6460045953535869, "gen_logits_max": 4.485605239868164, "gen_logits_mean": -13.212357521057129, "gen_logits_min": -25.184432983398438, "gen_logits_std": 2.5619988441467285, "gen_loss": 0.2848006784915924, "grad_norm": 0.44123278454993503, "learning_rate": 2.6934315789473686e-05, "loss": 0.303, "mean_copy_accuracy": 0.9939838200807571, "mean_gen_accuracy": 0.8676490187644958, "mean_token_accuracy": 0.8990266770124435, "num_tokens": 857093104.0, "sample_num_tokens": 10904.5, "step": 3163, "total_num_tokens": 857136722.0, "z_loss": 0.0006119320751167834 }, { "copy_logits_max": -2.8291897773742676, "copy_logits_min": -750000064.0, "copy_num_tokens": 478.6875, "epoch": 0.6462088332907837, "gen_logits_max": 5.835694313049316, "gen_logits_mean": -12.745779037475586, "gen_logits_min": -24.496034622192383, "gen_logits_std": 2.5451018810272217, "gen_loss": 0.32426971197128296, "grad_norm": 0.41387488808022554, "learning_rate": 2.6933052631578947e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9947282373905182, "mean_gen_accuracy": 0.8686306327581406, "mean_token_accuracy": 0.8991615623235703, "num_tokens": 857378812.0, "sample_num_tokens": 8878.0, "step": 3164, "total_num_tokens": 857414324.0, "z_loss": 0.0007374424603767693 }, { "copy_logits_max": -2.2388741970062256, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.625, "epoch": 0.6464130712279806, "gen_logits_max": 5.760904312133789, "gen_logits_mean": -13.23619270324707, "gen_logits_min": -25.179344177246094, "gen_logits_std": 2.589752197265625, "gen_loss": 0.3560580611228943, "grad_norm": 0.4637137483172841, "learning_rate": 2.693178947368421e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9939384013414383, "mean_gen_accuracy": 0.8603923320770264, "mean_token_accuracy": 0.8921817243099213, "num_tokens": 857655742.0, "sample_num_tokens": 7996.5, "step": 3165, "total_num_tokens": 857687728.0, "z_loss": 0.0007819275488145649 }, { "copy_logits_max": -5.032228946685791, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.4375, "epoch": 0.6466173091651775, "gen_logits_max": 5.543455123901367, "gen_logits_mean": -14.59765625, "gen_logits_min": -26.40774917602539, "gen_logits_std": 2.599517345428467, "gen_loss": 0.31362372636795044, "grad_norm": 0.4291901412813106, "learning_rate": 2.6930526315789472e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9943965077400208, "mean_gen_accuracy": 0.8715279400348663, "mean_token_accuracy": 0.9011354893445969, "num_tokens": 857916414.0, "sample_num_tokens": 7635.5, "step": 3166, "total_num_tokens": 857946956.0, "z_loss": 0.0007483236840926111 }, { "copy_logits_max": -3.5553975105285645, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.875, "epoch": 0.6468215471023743, "gen_logits_max": 4.968758583068848, "gen_logits_mean": -14.780303955078125, "gen_logits_min": -26.73413848876953, "gen_logits_std": 2.6171674728393555, "gen_loss": 0.3292371332645416, "grad_norm": 0.41328704706872393, "learning_rate": 2.692926315789474e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9945795089006424, "mean_gen_accuracy": 0.8734411597251892, "mean_token_accuracy": 0.905305489897728, "num_tokens": 858189831.0, "sample_num_tokens": 7423.75, "step": 3167, "total_num_tokens": 858219526.0, "z_loss": 0.0007268398767337203 }, { "copy_logits_max": -5.878206253051758, "copy_logits_min": -687500032.0, "copy_num_tokens": 354.1875, "epoch": 0.6470257850395711, "gen_logits_max": 5.202759742736816, "gen_logits_mean": -14.51974105834961, "gen_logits_min": -26.14672088623047, "gen_logits_std": 2.601301670074463, "gen_loss": 0.29714715480804443, "grad_norm": 0.4125657107063012, "learning_rate": 2.6928e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9958447813987732, "mean_gen_accuracy": 0.8643105030059814, "mean_token_accuracy": 0.8985404223203659, "num_tokens": 858453246.0, "sample_num_tokens": 7338.0, "step": 3168, "total_num_tokens": 858482598.0, "z_loss": 0.0006960925529710948 }, { "copy_logits_max": -5.600683212280273, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.75, "epoch": 0.6472300229767679, "gen_logits_max": 5.466757774353027, "gen_logits_mean": -12.81232738494873, "gen_logits_min": -24.782094955444336, "gen_logits_std": 2.565943479537964, "gen_loss": 0.31430888175964355, "grad_norm": 0.4535949851099319, "learning_rate": 2.6926736842105265e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9949386119842529, "mean_gen_accuracy": 0.8604380786418915, "mean_token_accuracy": 0.8932132869958878, "num_tokens": 858714309.0, "sample_num_tokens": 7652.25, "step": 3169, "total_num_tokens": 858744918.0, "z_loss": 0.0006436273106373847 }, { "copy_logits_max": -5.255319118499756, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.9375, "epoch": 0.6474342609139647, "gen_logits_max": 5.669700622558594, "gen_logits_mean": -13.517269134521484, "gen_logits_min": -26.221101760864258, "gen_logits_std": 2.6555380821228027, "gen_loss": 0.3203948140144348, "grad_norm": 0.3962084961870799, "learning_rate": 2.6925473684210526e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9960088282823563, "mean_gen_accuracy": 0.8596159368753433, "mean_token_accuracy": 0.8927249014377594, "num_tokens": 858995394.0, "sample_num_tokens": 8887.5, "step": 3170, "total_num_tokens": 859030944.0, "z_loss": 0.0007507356931455433 }, { "copy_logits_max": -4.921853065490723, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.8125, "epoch": 0.6476384988511616, "gen_logits_max": 5.73416805267334, "gen_logits_mean": -12.623403549194336, "gen_logits_min": -24.76924705505371, "gen_logits_std": 2.62617826461792, "gen_loss": 0.325495183467865, "grad_norm": 0.48244558544829885, "learning_rate": 2.692421052631579e-05, "loss": 0.3184, "mean_copy_accuracy": 0.9939575344324112, "mean_gen_accuracy": 0.8597883582115173, "mean_token_accuracy": 0.8966555893421173, "num_tokens": 859279220.0, "sample_num_tokens": 8692.0, "step": 3171, "total_num_tokens": 859313988.0, "z_loss": 0.0006871981313452125 }, { "copy_logits_max": -5.415596008300781, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.0, "epoch": 0.6478427367883585, "gen_logits_max": 5.673766613006592, "gen_logits_mean": -13.572242736816406, "gen_logits_min": -25.538558959960938, "gen_logits_std": 2.6381759643554688, "gen_loss": 0.32076939940452576, "grad_norm": 0.46707815926874835, "learning_rate": 2.6922947368421054e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9948231130838394, "mean_gen_accuracy": 0.8672250956296921, "mean_token_accuracy": 0.896228164434433, "num_tokens": 859545325.0, "sample_num_tokens": 7682.75, "step": 3172, "total_num_tokens": 859576056.0, "z_loss": 0.0007142365793697536 }, { "copy_logits_max": -5.143559455871582, "copy_logits_min": -687500032.0, "copy_num_tokens": 545.75, "epoch": 0.6480469747255553, "gen_logits_max": 5.430452346801758, "gen_logits_mean": -12.77456283569336, "gen_logits_min": -24.123863220214844, "gen_logits_std": 2.5959763526916504, "gen_loss": 0.30818986892700195, "grad_norm": 0.41642540961973656, "learning_rate": 2.6921684210526315e-05, "loss": 0.3026, "mean_copy_accuracy": 0.995454266667366, "mean_gen_accuracy": 0.8676923513412476, "mean_token_accuracy": 0.9011351764202118, "num_tokens": 859836177.0, "sample_num_tokens": 9557.25, "step": 3173, "total_num_tokens": 859874406.0, "z_loss": 0.0007340693846344948 }, { "copy_logits_max": -2.8001184463500977, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.625, "epoch": 0.6482512126627521, "gen_logits_max": 5.018959045410156, "gen_logits_mean": -13.629425048828125, "gen_logits_min": -26.088342666625977, "gen_logits_std": 2.6615147590637207, "gen_loss": 0.2949157953262329, "grad_norm": 0.41001591298356393, "learning_rate": 2.692042105263158e-05, "loss": 0.298, "mean_copy_accuracy": 0.9949013590812683, "mean_gen_accuracy": 0.8683267831802368, "mean_token_accuracy": 0.9030311405658722, "num_tokens": 860149856.0, "sample_num_tokens": 9193.5, "step": 3174, "total_num_tokens": 860186630.0, "z_loss": 0.0007114564650692046 }, { "copy_logits_max": -4.8554487228393555, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.6875, "epoch": 0.6484554505999489, "gen_logits_max": 5.617717266082764, "gen_logits_mean": -13.575860023498535, "gen_logits_min": -24.981281280517578, "gen_logits_std": 2.640251636505127, "gen_loss": 0.3326350450515747, "grad_norm": 0.41249219721509156, "learning_rate": 2.6919157894736844e-05, "loss": 0.3435, "mean_copy_accuracy": 0.9936249852180481, "mean_gen_accuracy": 0.858747273683548, "mean_token_accuracy": 0.8878521174192429, "num_tokens": 860423182.0, "sample_num_tokens": 7779.5, "step": 3175, "total_num_tokens": 860454300.0, "z_loss": 0.000752679246943444 }, { "copy_logits_max": -4.665478706359863, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.75, "epoch": 0.6486596885371457, "gen_logits_max": 5.219296932220459, "gen_logits_mean": -13.856974601745605, "gen_logits_min": -25.593765258789062, "gen_logits_std": 2.6392674446105957, "gen_loss": 0.3181436061859131, "grad_norm": 0.4384396550709159, "learning_rate": 2.6917894736842108e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9946995973587036, "mean_gen_accuracy": 0.8636731058359146, "mean_token_accuracy": 0.894936665892601, "num_tokens": 860680376.0, "sample_num_tokens": 8727.0, "step": 3176, "total_num_tokens": 860715284.0, "z_loss": 0.0007668443722650409 }, { "copy_logits_max": -2.844733238220215, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.0625, "epoch": 0.6488639264743427, "gen_logits_max": 5.433157920837402, "gen_logits_mean": -13.714707374572754, "gen_logits_min": -25.3425235748291, "gen_logits_std": 2.6320040225982666, "gen_loss": 0.3365580439567566, "grad_norm": 0.4526395837035687, "learning_rate": 2.691663157894737e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9942492693662643, "mean_gen_accuracy": 0.8630553632974625, "mean_token_accuracy": 0.8974771350622177, "num_tokens": 860951322.0, "sample_num_tokens": 7641.5, "step": 3177, "total_num_tokens": 860981888.0, "z_loss": 0.0007922506192699075 }, { "copy_logits_max": -5.421266555786133, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.6875, "epoch": 0.6490681644115395, "gen_logits_max": 6.192662239074707, "gen_logits_mean": -12.884628295898438, "gen_logits_min": -24.236270904541016, "gen_logits_std": 2.604631185531616, "gen_loss": 0.33416056632995605, "grad_norm": 0.4284595240522966, "learning_rate": 2.6915368421052633e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9943282008171082, "mean_gen_accuracy": 0.8611647188663483, "mean_token_accuracy": 0.8926694840192795, "num_tokens": 861222016.0, "sample_num_tokens": 7747.0, "step": 3178, "total_num_tokens": 861253004.0, "z_loss": 0.0008060292457230389 }, { "copy_logits_max": -3.1558663845062256, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.625, "epoch": 0.6492724023487363, "gen_logits_max": 5.764575958251953, "gen_logits_mean": -12.577339172363281, "gen_logits_min": -24.669273376464844, "gen_logits_std": 2.659524440765381, "gen_loss": 0.3232668340206146, "grad_norm": 0.43637423533376396, "learning_rate": 2.6914105263157894e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9939362406730652, "mean_gen_accuracy": 0.8650714457035065, "mean_token_accuracy": 0.8963737189769745, "num_tokens": 861487058.0, "sample_num_tokens": 8772.5, "step": 3179, "total_num_tokens": 861522148.0, "z_loss": 0.0007336988346651196 }, { "copy_logits_max": -3.669402599334717, "copy_logits_min": -687500032.0, "copy_num_tokens": 556.6875, "epoch": 0.6494766402859331, "gen_logits_max": 5.179806709289551, "gen_logits_mean": -12.650487899780273, "gen_logits_min": -24.436752319335938, "gen_logits_std": 2.6706109046936035, "gen_loss": 0.281782865524292, "grad_norm": 0.4353539557387949, "learning_rate": 2.691284210526316e-05, "loss": 0.299, "mean_copy_accuracy": 0.9942804276943207, "mean_gen_accuracy": 0.8682655692100525, "mean_token_accuracy": 0.9020759761333466, "num_tokens": 861757066.0, "sample_num_tokens": 8879.0, "step": 3180, "total_num_tokens": 861792582.0, "z_loss": 0.0006293347105383873 }, { "copy_logits_max": -2.4491820335388184, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.4375, "epoch": 0.6496808782231299, "gen_logits_max": 5.260963439941406, "gen_logits_mean": -14.03605842590332, "gen_logits_min": -26.237777709960938, "gen_logits_std": 2.6599061489105225, "gen_loss": 0.3292921185493469, "grad_norm": 0.431370816211978, "learning_rate": 2.691157894736842e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9944839477539062, "mean_gen_accuracy": 0.858939602971077, "mean_token_accuracy": 0.8946758806705475, "num_tokens": 862022647.0, "sample_num_tokens": 8670.75, "step": 3181, "total_num_tokens": 862057330.0, "z_loss": 0.0007336413254961371 }, { "copy_logits_max": -5.076940536499023, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.4375, "epoch": 0.6498851161603267, "gen_logits_max": 6.7530951499938965, "gen_logits_mean": -12.784067153930664, "gen_logits_min": -24.41155242919922, "gen_logits_std": 2.66963529586792, "gen_loss": 0.32994014024734497, "grad_norm": 0.4568019894937664, "learning_rate": 2.6910315789473684e-05, "loss": 0.3155, "mean_copy_accuracy": 0.9940787553787231, "mean_gen_accuracy": 0.8660454005002975, "mean_token_accuracy": 0.8970757871866226, "num_tokens": 862285867.0, "sample_num_tokens": 8032.75, "step": 3182, "total_num_tokens": 862317998.0, "z_loss": 0.0006893261452205479 }, { "copy_logits_max": -2.949460983276367, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.125, "epoch": 0.6500893540975237, "gen_logits_max": 5.40549373626709, "gen_logits_mean": -14.29850959777832, "gen_logits_min": -25.981914520263672, "gen_logits_std": 2.6608211994171143, "gen_loss": 0.32236143946647644, "grad_norm": 0.430603485490152, "learning_rate": 2.6909052631578948e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9947172999382019, "mean_gen_accuracy": 0.8690294027328491, "mean_token_accuracy": 0.899482473731041, "num_tokens": 862558489.0, "sample_num_tokens": 8472.25, "step": 3183, "total_num_tokens": 862592378.0, "z_loss": 0.0007375587010756135 }, { "copy_logits_max": -1.019906997680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.9375, "epoch": 0.6502935920347205, "gen_logits_max": 4.851833343505859, "gen_logits_mean": -13.61086654663086, "gen_logits_min": -25.741592407226562, "gen_logits_std": 2.6470882892608643, "gen_loss": 0.3035278916358948, "grad_norm": 0.4500039910910483, "learning_rate": 2.6907789473684212e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9937274605035782, "mean_gen_accuracy": 0.8639096766710281, "mean_token_accuracy": 0.8960138559341431, "num_tokens": 862820320.0, "sample_num_tokens": 9266.5, "step": 3184, "total_num_tokens": 862857386.0, "z_loss": 0.0007019439362920821 }, { "copy_logits_max": -3.9180452823638916, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.375, "epoch": 0.6504978299719173, "gen_logits_max": 5.0684380531311035, "gen_logits_mean": -13.983683586120605, "gen_logits_min": -25.937931060791016, "gen_logits_std": 2.6299173831939697, "gen_loss": 0.31060150265693665, "grad_norm": 0.4514243450464035, "learning_rate": 2.6906526315789477e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9937293976545334, "mean_gen_accuracy": 0.8623317927122116, "mean_token_accuracy": 0.8920579552650452, "num_tokens": 863101598.0, "sample_num_tokens": 9640.5, "step": 3185, "total_num_tokens": 863140160.0, "z_loss": 0.0006818868569098413 }, { "copy_logits_max": -1.3297357559204102, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.0, "epoch": 0.6507020679091141, "gen_logits_max": 5.6302642822265625, "gen_logits_mean": -11.739938735961914, "gen_logits_min": -24.24512481689453, "gen_logits_std": 2.6994824409484863, "gen_loss": 0.33480873703956604, "grad_norm": 0.43673377881895964, "learning_rate": 2.6905263157894738e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9951726198196411, "mean_gen_accuracy": 0.8614640533924103, "mean_token_accuracy": 0.892249658703804, "num_tokens": 863392176.0, "sample_num_tokens": 8649.5, "step": 3186, "total_num_tokens": 863426774.0, "z_loss": 0.0007854568539187312 }, { "copy_logits_max": -2.4300103187561035, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.8125, "epoch": 0.6509063058463109, "gen_logits_max": 5.920950889587402, "gen_logits_mean": -12.751953125, "gen_logits_min": -25.147247314453125, "gen_logits_std": 2.621229410171509, "gen_loss": 0.3158741593360901, "grad_norm": 0.5094565948961193, "learning_rate": 2.6904000000000002e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9942564368247986, "mean_gen_accuracy": 0.8634619414806366, "mean_token_accuracy": 0.8900420963764191, "num_tokens": 863653166.0, "sample_num_tokens": 9127.0, "step": 3187, "total_num_tokens": 863689674.0, "z_loss": 0.0006695531192235649 }, { "copy_logits_max": -3.4220118522644043, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.9375, "epoch": 0.6511105437835077, "gen_logits_max": 5.711167335510254, "gen_logits_mean": -13.495241165161133, "gen_logits_min": -25.535598754882812, "gen_logits_std": 2.6929469108581543, "gen_loss": 0.3113466501235962, "grad_norm": 0.49491744411926897, "learning_rate": 2.6902736842105263e-05, "loss": 0.3403, "mean_copy_accuracy": 0.9946579188108444, "mean_gen_accuracy": 0.8509694039821625, "mean_token_accuracy": 0.8882315754890442, "num_tokens": 863923361.0, "sample_num_tokens": 7217.75, "step": 3188, "total_num_tokens": 863952232.0, "z_loss": 0.0007135369232855737 }, { "copy_logits_max": -3.0139119625091553, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.875, "epoch": 0.6513147817207047, "gen_logits_max": 4.909951210021973, "gen_logits_mean": -13.612590789794922, "gen_logits_min": -25.746784210205078, "gen_logits_std": 2.6236283779144287, "gen_loss": 0.3023577332496643, "grad_norm": 0.3851449012401678, "learning_rate": 2.6901473684210527e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9958468079566956, "mean_gen_accuracy": 0.8687760978937149, "mean_token_accuracy": 0.9020828902721405, "num_tokens": 864204297.0, "sample_num_tokens": 8324.25, "step": 3189, "total_num_tokens": 864237594.0, "z_loss": 0.0007151447935029864 }, { "copy_logits_max": -0.6497088074684143, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.0, "epoch": 0.6515190196579015, "gen_logits_max": 5.293050765991211, "gen_logits_mean": -13.569354057312012, "gen_logits_min": -25.782154083251953, "gen_logits_std": 2.6205265522003174, "gen_loss": 0.32301557064056396, "grad_norm": 0.46822994801980156, "learning_rate": 2.6900210526315788e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9941661804914474, "mean_gen_accuracy": 0.857321634888649, "mean_token_accuracy": 0.8898105472326279, "num_tokens": 864462465.0, "sample_num_tokens": 7442.25, "step": 3190, "total_num_tokens": 864492234.0, "z_loss": 0.0008635949343442917 }, { "copy_logits_max": -4.340371131896973, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.4375, "epoch": 0.6517232575950983, "gen_logits_max": 5.6767778396606445, "gen_logits_mean": -13.550098419189453, "gen_logits_min": -24.606504440307617, "gen_logits_std": 2.5495944023132324, "gen_loss": 0.3392706513404846, "grad_norm": 0.44556861787840074, "learning_rate": 2.6898947368421052e-05, "loss": 0.331, "mean_copy_accuracy": 0.9950746148824692, "mean_gen_accuracy": 0.8613272160291672, "mean_token_accuracy": 0.8912501782178879, "num_tokens": 864740406.0, "sample_num_tokens": 8063.0, "step": 3191, "total_num_tokens": 864772658.0, "z_loss": 0.000753172964323312 }, { "copy_logits_max": -3.2245712280273438, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.875, "epoch": 0.6519274955322951, "gen_logits_max": 5.4572248458862305, "gen_logits_mean": -12.751104354858398, "gen_logits_min": -24.5858154296875, "gen_logits_std": 2.6245555877685547, "gen_loss": 0.29106926918029785, "grad_norm": 0.47755851472527416, "learning_rate": 2.6897684210526317e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9936801642179489, "mean_gen_accuracy": 0.8676706999540329, "mean_token_accuracy": 0.898337721824646, "num_tokens": 865008542.0, "sample_num_tokens": 8285.5, "step": 3192, "total_num_tokens": 865041684.0, "z_loss": 0.0006134277791716158 }, { "copy_logits_max": -3.2682456970214844, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.5, "epoch": 0.6521317334694919, "gen_logits_max": 5.744068145751953, "gen_logits_mean": -13.713619232177734, "gen_logits_min": -25.159086227416992, "gen_logits_std": 2.574000597000122, "gen_loss": 0.32497313618659973, "grad_norm": 0.3860188841019959, "learning_rate": 2.689642105263158e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9946471303701401, "mean_gen_accuracy": 0.8571217507123947, "mean_token_accuracy": 0.8920090347528458, "num_tokens": 865286014.0, "sample_num_tokens": 8031.5, "step": 3193, "total_num_tokens": 865318140.0, "z_loss": 0.0007336671696975827 }, { "copy_logits_max": -1.2439876794815063, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.25, "epoch": 0.6523359714066888, "gen_logits_max": 4.913119316101074, "gen_logits_mean": -13.17314338684082, "gen_logits_min": -24.266071319580078, "gen_logits_std": 2.5317516326904297, "gen_loss": 0.28544020652770996, "grad_norm": 0.7361948623456595, "learning_rate": 2.6895157894736842e-05, "loss": 0.307, "mean_copy_accuracy": 0.9957268089056015, "mean_gen_accuracy": 0.8657429218292236, "mean_token_accuracy": 0.897743359208107, "num_tokens": 865569796.0, "sample_num_tokens": 9856.0, "step": 3194, "total_num_tokens": 865609220.0, "z_loss": 0.0008129252819344401 }, { "copy_logits_max": -3.630331516265869, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.0, "epoch": 0.6525402093438857, "gen_logits_max": 5.273193836212158, "gen_logits_mean": -13.820049285888672, "gen_logits_min": -25.567981719970703, "gen_logits_std": 2.612222194671631, "gen_loss": 0.36286723613739014, "grad_norm": 0.49116572378666173, "learning_rate": 2.6893894736842106e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9956521242856979, "mean_gen_accuracy": 0.8602935373783112, "mean_token_accuracy": 0.8954441398382187, "num_tokens": 865835531.0, "sample_num_tokens": 7729.75, "step": 3195, "total_num_tokens": 865866450.0, "z_loss": 0.000813222723081708 }, { "copy_logits_max": -0.326732337474823, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.1875, "epoch": 0.6527444472810825, "gen_logits_max": 5.704763889312744, "gen_logits_mean": -12.611122131347656, "gen_logits_min": -24.684864044189453, "gen_logits_std": 2.6274337768554688, "gen_loss": 0.322262704372406, "grad_norm": 0.47356072048627307, "learning_rate": 2.6892631578947367e-05, "loss": 0.3228, "mean_copy_accuracy": 0.9926962107419968, "mean_gen_accuracy": 0.8617285192012787, "mean_token_accuracy": 0.8927808701992035, "num_tokens": 866090552.0, "sample_num_tokens": 7302.5, "step": 3196, "total_num_tokens": 866119762.0, "z_loss": 0.0007565633277408779 }, { "copy_logits_max": -4.522183418273926, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.0625, "epoch": 0.6529486852182793, "gen_logits_max": 5.6900482177734375, "gen_logits_mean": -12.3765230178833, "gen_logits_min": -24.026424407958984, "gen_logits_std": 2.589883804321289, "gen_loss": 0.32043883204460144, "grad_norm": 0.5051329670580255, "learning_rate": 2.689136842105263e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9917066395282745, "mean_gen_accuracy": 0.859235867857933, "mean_token_accuracy": 0.8908606022596359, "num_tokens": 866342717.0, "sample_num_tokens": 8088.75, "step": 3197, "total_num_tokens": 866375072.0, "z_loss": 0.000765683245845139 }, { "copy_logits_max": -3.8406484127044678, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.4375, "epoch": 0.6531529231554761, "gen_logits_max": 5.702341079711914, "gen_logits_mean": -12.713862419128418, "gen_logits_min": -24.605541229248047, "gen_logits_std": 2.6618735790252686, "gen_loss": 0.33276012539863586, "grad_norm": 0.4129276716813656, "learning_rate": 2.6890105263157896e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9950828701257706, "mean_gen_accuracy": 0.8638343065977097, "mean_token_accuracy": 0.8952393680810928, "num_tokens": 866611834.0, "sample_num_tokens": 7966.0, "step": 3198, "total_num_tokens": 866643698.0, "z_loss": 0.0007643016870133579 }, { "copy_logits_max": -4.45717716217041, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.5, "epoch": 0.653357161092673, "gen_logits_max": 5.477336883544922, "gen_logits_mean": -13.813240051269531, "gen_logits_min": -24.689109802246094, "gen_logits_std": 2.575868606567383, "gen_loss": 0.3578055799007416, "grad_norm": 0.4608176048520108, "learning_rate": 2.6888842105263157e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9938436448574066, "mean_gen_accuracy": 0.8622223287820816, "mean_token_accuracy": 0.891926646232605, "num_tokens": 866881906.0, "sample_num_tokens": 8332.5, "step": 3199, "total_num_tokens": 866915236.0, "z_loss": 0.0008022902766242623 }, { "copy_logits_max": -2.3966801166534424, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.75, "epoch": 0.6535613990298698, "gen_logits_max": 5.289178371429443, "gen_logits_mean": -13.327543258666992, "gen_logits_min": -25.512014389038086, "gen_logits_std": 2.598658561706543, "gen_loss": 0.31409114599227905, "grad_norm": 0.459335788939064, "learning_rate": 2.6887578947368424e-05, "loss": 0.3226, "mean_copy_accuracy": 0.9944180250167847, "mean_gen_accuracy": 0.8598814010620117, "mean_token_accuracy": 0.892568364739418, "num_tokens": 867158573.0, "sample_num_tokens": 6706.75, "step": 3200, "total_num_tokens": 867185400.0, "z_loss": 0.00067862868309021 }, { "copy_logits_max": -5.051501274108887, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5, "epoch": 0.6537656369670667, "gen_logits_max": 5.1492815017700195, "gen_logits_mean": -15.214071273803711, "gen_logits_min": -26.547693252563477, "gen_logits_std": 2.6053872108459473, "gen_loss": 0.3234096169471741, "grad_norm": 0.43986481941814376, "learning_rate": 2.6886315789473685e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9921525567770004, "mean_gen_accuracy": 0.8622123301029205, "mean_token_accuracy": 0.8879999816417694, "num_tokens": 867407214.0, "sample_num_tokens": 7853.5, "step": 3201, "total_num_tokens": 867438628.0, "z_loss": 0.0007026129751466215 }, { "copy_logits_max": -1.3928802013397217, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.5625, "epoch": 0.6539698749042635, "gen_logits_max": 5.296928405761719, "gen_logits_mean": -12.612293243408203, "gen_logits_min": -24.485610961914062, "gen_logits_std": 2.6217355728149414, "gen_loss": 0.3452339470386505, "grad_norm": 0.5096877137984172, "learning_rate": 2.688505263157895e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9924323409795761, "mean_gen_accuracy": 0.8616430461406708, "mean_token_accuracy": 0.8950536102056503, "num_tokens": 867678117.0, "sample_num_tokens": 9015.25, "step": 3202, "total_num_tokens": 867714178.0, "z_loss": 0.0007717458065599203 }, { "copy_logits_max": -4.697989463806152, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.0, "epoch": 0.6541741128414603, "gen_logits_max": 5.960639953613281, "gen_logits_mean": -12.076128005981445, "gen_logits_min": -23.489015579223633, "gen_logits_std": 2.588914155960083, "gen_loss": 0.35162943601608276, "grad_norm": 0.4389639375303174, "learning_rate": 2.688378947368421e-05, "loss": 0.3273, "mean_copy_accuracy": 0.9945585131645203, "mean_gen_accuracy": 0.8640298247337341, "mean_token_accuracy": 0.8926163017749786, "num_tokens": 867944919.0, "sample_num_tokens": 7237.25, "step": 3203, "total_num_tokens": 867973868.0, "z_loss": 0.0008170166984200478 }, { "copy_logits_max": -4.330035209655762, "copy_logits_min": -750000064.0, "copy_num_tokens": 357.9375, "epoch": 0.6543783507786571, "gen_logits_max": 5.036203384399414, "gen_logits_mean": -13.574566841125488, "gen_logits_min": -24.998859405517578, "gen_logits_std": 2.6018011569976807, "gen_loss": 0.24307502806186676, "grad_norm": 0.4637743232515354, "learning_rate": 2.6882526315789475e-05, "loss": 0.3198, "mean_copy_accuracy": 0.9952726513147354, "mean_gen_accuracy": 0.8629326671361923, "mean_token_accuracy": 0.893837496638298, "num_tokens": 868205440.0, "sample_num_tokens": 8426.0, "step": 3204, "total_num_tokens": 868239144.0, "z_loss": 0.0006517808069474995 }, { "copy_logits_max": -1.8700087070465088, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.125, "epoch": 0.654582588715854, "gen_logits_max": 4.860010147094727, "gen_logits_mean": -13.397306442260742, "gen_logits_min": -25.07174301147461, "gen_logits_std": 2.640676498413086, "gen_loss": 0.32064491510391235, "grad_norm": 0.46738926194741637, "learning_rate": 2.6881263157894736e-05, "loss": 0.3196, "mean_copy_accuracy": 0.993580773472786, "mean_gen_accuracy": 0.8640781193971634, "mean_token_accuracy": 0.894474446773529, "num_tokens": 868444413.0, "sample_num_tokens": 7910.25, "step": 3205, "total_num_tokens": 868476054.0, "z_loss": 0.0006896493723616004 }, { "copy_logits_max": 0.9037031531333923, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.75, "epoch": 0.6547868266530508, "gen_logits_max": 6.247019290924072, "gen_logits_mean": -11.907690048217773, "gen_logits_min": -23.836292266845703, "gen_logits_std": 2.6736578941345215, "gen_loss": 0.31006115674972534, "grad_norm": 0.4660863980105486, "learning_rate": 2.688e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9936081618070602, "mean_gen_accuracy": 0.8651765733957291, "mean_token_accuracy": 0.8983189910650253, "num_tokens": 868744332.0, "sample_num_tokens": 8026.5, "step": 3206, "total_num_tokens": 868776438.0, "z_loss": 0.0006648417329415679 }, { "copy_logits_max": -1.27057683467865, "copy_logits_min": -687500032.0, "copy_num_tokens": 349.75, "epoch": 0.6549910645902476, "gen_logits_max": 5.213657379150391, "gen_logits_mean": -13.84580135345459, "gen_logits_min": -25.658435821533203, "gen_logits_std": 2.6217527389526367, "gen_loss": 0.31428825855255127, "grad_norm": 0.40611252764495015, "learning_rate": 2.687873684210526e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9941367357969284, "mean_gen_accuracy": 0.8703973144292831, "mean_token_accuracy": 0.9010427296161652, "num_tokens": 869021133.0, "sample_num_tokens": 7416.75, "step": 3207, "total_num_tokens": 869050800.0, "z_loss": 0.0007083365926519036 }, { "copy_logits_max": 0.3000698387622833, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.4375, "epoch": 0.6551953025274445, "gen_logits_max": 7.195102214813232, "gen_logits_mean": -10.84239387512207, "gen_logits_min": -22.940181732177734, "gen_logits_std": 2.7009172439575195, "gen_loss": 0.32208889722824097, "grad_norm": 0.48400381000776715, "learning_rate": 2.687747368421053e-05, "loss": 0.3299, "mean_copy_accuracy": 0.994965672492981, "mean_gen_accuracy": 0.853624239563942, "mean_token_accuracy": 0.890491709113121, "num_tokens": 869271566.0, "sample_num_tokens": 8310.5, "step": 3208, "total_num_tokens": 869304808.0, "z_loss": 0.0007810357492417097 }, { "copy_logits_max": -1.6370488405227661, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.0625, "epoch": 0.6553995404646413, "gen_logits_max": 6.067641258239746, "gen_logits_mean": -13.534907341003418, "gen_logits_min": -25.227277755737305, "gen_logits_std": 2.664011001586914, "gen_loss": 0.36852407455444336, "grad_norm": 0.43850683032755766, "learning_rate": 2.687621052631579e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9933921545743942, "mean_gen_accuracy": 0.8584174662828445, "mean_token_accuracy": 0.8910897523164749, "num_tokens": 869560115.0, "sample_num_tokens": 7660.75, "step": 3209, "total_num_tokens": 869590758.0, "z_loss": 0.000832927122246474 }, { "copy_logits_max": 0.7536698579788208, "copy_logits_min": -625000064.0, "copy_num_tokens": 357.4375, "epoch": 0.6556037784018381, "gen_logits_max": 7.080949783325195, "gen_logits_mean": -11.800482749938965, "gen_logits_min": -23.791507720947266, "gen_logits_std": 2.6926817893981934, "gen_loss": 0.31757673621177673, "grad_norm": 0.47128444795126734, "learning_rate": 2.6874947368421054e-05, "loss": 0.3306, "mean_copy_accuracy": 0.9930520355701447, "mean_gen_accuracy": 0.8594277948141098, "mean_token_accuracy": 0.889323890209198, "num_tokens": 869826893.0, "sample_num_tokens": 7638.25, "step": 3210, "total_num_tokens": 869857446.0, "z_loss": 0.0007316713454201818 }, { "copy_logits_max": -0.6719822883605957, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.875, "epoch": 0.655808016339035, "gen_logits_max": 6.240044593811035, "gen_logits_mean": -12.91905403137207, "gen_logits_min": -24.501052856445312, "gen_logits_std": 2.651911973953247, "gen_loss": 0.3138374388217926, "grad_norm": 0.4635221317412735, "learning_rate": 2.687368421052632e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9919851869344711, "mean_gen_accuracy": 0.8722766786813736, "mean_token_accuracy": 0.8932281732559204, "num_tokens": 870082178.0, "sample_num_tokens": 8421.5, "step": 3211, "total_num_tokens": 870115864.0, "z_loss": 0.0006752136396244168 }, { "copy_logits_max": -0.9414768218994141, "copy_logits_min": -750000128.0, "copy_num_tokens": 460.8125, "epoch": 0.6560122542762318, "gen_logits_max": 6.07338285446167, "gen_logits_mean": -13.43099308013916, "gen_logits_min": -25.512773513793945, "gen_logits_std": 2.671492576599121, "gen_loss": 0.34700095653533936, "grad_norm": 0.4092144069285071, "learning_rate": 2.687242105263158e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9940008968114853, "mean_gen_accuracy": 0.8576137125492096, "mean_token_accuracy": 0.8957148492336273, "num_tokens": 870361558.0, "sample_num_tokens": 9022.0, "step": 3212, "total_num_tokens": 870397646.0, "z_loss": 0.0008170955698005855 }, { "copy_logits_max": 0.6864093542098999, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.6875, "epoch": 0.6562164922134286, "gen_logits_max": 5.994899272918701, "gen_logits_mean": -12.703423500061035, "gen_logits_min": -24.722816467285156, "gen_logits_std": 2.7165184020996094, "gen_loss": 0.27690836787223816, "grad_norm": 0.4521957901692565, "learning_rate": 2.6871157894736844e-05, "loss": 0.3166, "mean_copy_accuracy": 0.9953499287366867, "mean_gen_accuracy": 0.8599024415016174, "mean_token_accuracy": 0.8954154402017593, "num_tokens": 870613197.0, "sample_num_tokens": 8200.25, "step": 3213, "total_num_tokens": 870645998.0, "z_loss": 0.0006470080697908998 }, { "copy_logits_max": -3.6209561824798584, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.4375, "epoch": 0.6564207301506255, "gen_logits_max": 5.692488670349121, "gen_logits_mean": -14.004817962646484, "gen_logits_min": -25.524290084838867, "gen_logits_std": 2.6495437622070312, "gen_loss": 0.3116355538368225, "grad_norm": 0.4321068999139982, "learning_rate": 2.6869894736842104e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9938294291496277, "mean_gen_accuracy": 0.8546173423528671, "mean_token_accuracy": 0.891060009598732, "num_tokens": 870888515.0, "sample_num_tokens": 7446.25, "step": 3214, "total_num_tokens": 870918300.0, "z_loss": 0.0007660384871996939 }, { "copy_logits_max": -0.8698397874832153, "copy_logits_min": -625000064.0, "copy_num_tokens": 475.375, "epoch": 0.6566249680878223, "gen_logits_max": 6.222629547119141, "gen_logits_mean": -11.477243423461914, "gen_logits_min": -23.281267166137695, "gen_logits_std": 2.7009761333465576, "gen_loss": 0.3242574632167816, "grad_norm": 0.40475846594939363, "learning_rate": 2.686863157894737e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9933031648397446, "mean_gen_accuracy": 0.8712847083806992, "mean_token_accuracy": 0.8992365598678589, "num_tokens": 871169202.0, "sample_num_tokens": 8818.5, "step": 3215, "total_num_tokens": 871204476.0, "z_loss": 0.0007839822210371494 }, { "copy_logits_max": -2.6406636238098145, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.1875, "epoch": 0.6568292060250192, "gen_logits_max": 6.978736877441406, "gen_logits_mean": -12.165531158447266, "gen_logits_min": -23.688175201416016, "gen_logits_std": 2.6465911865234375, "gen_loss": 0.34248751401901245, "grad_norm": 0.4380351632676754, "learning_rate": 2.6867368421052633e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9940302670001984, "mean_gen_accuracy": 0.8651532530784607, "mean_token_accuracy": 0.8969749957323074, "num_tokens": 871451075.0, "sample_num_tokens": 8734.25, "step": 3216, "total_num_tokens": 871486012.0, "z_loss": 0.0007884805672802031 }, { "copy_logits_max": 0.1210685670375824, "copy_logits_min": -750000064.0, "copy_num_tokens": 505.6875, "epoch": 0.657033443962216, "gen_logits_max": 6.119935989379883, "gen_logits_mean": -12.098243713378906, "gen_logits_min": -24.42719841003418, "gen_logits_std": 2.6984755992889404, "gen_loss": 0.322775274515152, "grad_norm": 0.38352200475123055, "learning_rate": 2.6866105263157897e-05, "loss": 0.3028, "mean_copy_accuracy": 0.994555190205574, "mean_gen_accuracy": 0.8642131835222244, "mean_token_accuracy": 0.899687871336937, "num_tokens": 871746501.0, "sample_num_tokens": 8176.75, "step": 3217, "total_num_tokens": 871779208.0, "z_loss": 0.0007631075568497181 }, { "copy_logits_max": -5.261760711669922, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5, "epoch": 0.6572376818994128, "gen_logits_max": 5.812833786010742, "gen_logits_mean": -13.05569076538086, "gen_logits_min": -25.047021865844727, "gen_logits_std": 2.710533857345581, "gen_loss": 0.25399431586265564, "grad_norm": 0.3930263704053359, "learning_rate": 2.686484210526316e-05, "loss": 0.295, "mean_copy_accuracy": 0.9945186674594879, "mean_gen_accuracy": 0.8687658458948135, "mean_token_accuracy": 0.9039180874824524, "num_tokens": 872013502.0, "sample_num_tokens": 8811.5, "step": 3218, "total_num_tokens": 872048748.0, "z_loss": 0.0005792205920442939 }, { "copy_logits_max": -3.622973918914795, "copy_logits_min": -687500032.0, "copy_num_tokens": 640.875, "epoch": 0.6574419198366096, "gen_logits_max": 5.058869361877441, "gen_logits_mean": -13.603096961975098, "gen_logits_min": -25.5152645111084, "gen_logits_std": 2.7100584506988525, "gen_loss": 0.26679205894470215, "grad_norm": 0.4218738696948901, "learning_rate": 2.6863578947368423e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9937543869018555, "mean_gen_accuracy": 0.867321714758873, "mean_token_accuracy": 0.8955198675394058, "num_tokens": 872291432.0, "sample_num_tokens": 9726.5, "step": 3219, "total_num_tokens": 872330338.0, "z_loss": 0.0006565570365637541 }, { "copy_logits_max": -3.8559141159057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.75, "epoch": 0.6576461577738065, "gen_logits_max": 6.562845706939697, "gen_logits_mean": -13.174229621887207, "gen_logits_min": -25.076465606689453, "gen_logits_std": 2.6531176567077637, "gen_loss": 0.3491293787956238, "grad_norm": 0.43992952618510794, "learning_rate": 2.6862315789473684e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9945710301399231, "mean_gen_accuracy": 0.862051859498024, "mean_token_accuracy": 0.8954005837440491, "num_tokens": 872573740.0, "sample_num_tokens": 8346.5, "step": 3220, "total_num_tokens": 872607126.0, "z_loss": 0.0008169825887307525 }, { "copy_logits_max": -5.00063419342041, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.125, "epoch": 0.6578503957110033, "gen_logits_max": 6.329531669616699, "gen_logits_mean": -12.944124221801758, "gen_logits_min": -24.391155242919922, "gen_logits_std": 2.68703031539917, "gen_loss": 0.32530519366264343, "grad_norm": 0.40470966580193135, "learning_rate": 2.6861052631578948e-05, "loss": 0.322, "mean_copy_accuracy": 0.994642972946167, "mean_gen_accuracy": 0.8605592548847198, "mean_token_accuracy": 0.8922309726476669, "num_tokens": 872843478.0, "sample_num_tokens": 7899.5, "step": 3221, "total_num_tokens": 872875076.0, "z_loss": 0.0007255111122503877 }, { "copy_logits_max": -2.896653890609741, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.1875, "epoch": 0.6580546336482002, "gen_logits_max": 5.859633922576904, "gen_logits_mean": -13.383256912231445, "gen_logits_min": -25.427438735961914, "gen_logits_std": 2.697499990463257, "gen_loss": 0.34713834524154663, "grad_norm": 0.413461935759029, "learning_rate": 2.685978947368421e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9944224506616592, "mean_gen_accuracy": 0.8688791692256927, "mean_token_accuracy": 0.8963985741138458, "num_tokens": 873117653.0, "sample_num_tokens": 7636.75, "step": 3222, "total_num_tokens": 873148200.0, "z_loss": 0.0007393551641143858 }, { "copy_logits_max": -1.9126012325286865, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.0625, "epoch": 0.658258871585397, "gen_logits_max": 5.798274040222168, "gen_logits_mean": -13.142457008361816, "gen_logits_min": -25.100872039794922, "gen_logits_std": 2.6818325519561768, "gen_loss": 0.3055245876312256, "grad_norm": 0.4539157122874622, "learning_rate": 2.6858526315789473e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9929541349411011, "mean_gen_accuracy": 0.8682622611522675, "mean_token_accuracy": 0.8945322930812836, "num_tokens": 873377282.0, "sample_num_tokens": 10582.0, "step": 3223, "total_num_tokens": 873419610.0, "z_loss": 0.0007626940496265888 }, { "copy_logits_max": -5.200538635253906, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.75, "epoch": 0.6584631095225938, "gen_logits_max": 5.429067611694336, "gen_logits_mean": -13.970842361450195, "gen_logits_min": -25.353469848632812, "gen_logits_std": 2.6558384895324707, "gen_loss": 0.34390193223953247, "grad_norm": 0.4627555550391727, "learning_rate": 2.685726315789474e-05, "loss": 0.3273, "mean_copy_accuracy": 0.993632435798645, "mean_gen_accuracy": 0.8621670305728912, "mean_token_accuracy": 0.8910309076309204, "num_tokens": 873636438.0, "sample_num_tokens": 8770.0, "step": 3224, "total_num_tokens": 873671518.0, "z_loss": 0.000704231031704694 }, { "copy_logits_max": -4.523551940917969, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.1875, "epoch": 0.6586673474597906, "gen_logits_max": 4.929511547088623, "gen_logits_mean": -14.57136058807373, "gen_logits_min": -26.26095199584961, "gen_logits_std": 2.653876543045044, "gen_loss": 0.34783104062080383, "grad_norm": 0.40611726920058067, "learning_rate": 2.6856000000000002e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9945498406887054, "mean_gen_accuracy": 0.8618933409452438, "mean_token_accuracy": 0.8914471119642258, "num_tokens": 873889009.0, "sample_num_tokens": 8146.25, "step": 3225, "total_num_tokens": 873921594.0, "z_loss": 0.0007210481562651694 }, { "copy_logits_max": -4.808581352233887, "copy_logits_min": -687500032.0, "copy_num_tokens": 331.5625, "epoch": 0.6588715853969875, "gen_logits_max": 4.848133563995361, "gen_logits_mean": -14.72659683227539, "gen_logits_min": -26.430025100708008, "gen_logits_std": 2.670503616333008, "gen_loss": 0.3343391418457031, "grad_norm": 0.42007614425100404, "learning_rate": 2.6854736842105266e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9945705682039261, "mean_gen_accuracy": 0.8620370477437973, "mean_token_accuracy": 0.8925818651914597, "num_tokens": 874157317.0, "sample_num_tokens": 7105.75, "step": 3226, "total_num_tokens": 874185740.0, "z_loss": 0.0007572191534563899 }, { "copy_logits_max": -3.9215147495269775, "copy_logits_min": -750000064.0, "copy_num_tokens": 602.5625, "epoch": 0.6590758233341844, "gen_logits_max": 4.6898956298828125, "gen_logits_mean": -13.32327651977539, "gen_logits_min": -25.561506271362305, "gen_logits_std": 2.7282121181488037, "gen_loss": 0.27954819798469543, "grad_norm": 0.40788008257473246, "learning_rate": 2.6853473684210527e-05, "loss": 0.297, "mean_copy_accuracy": 0.9950122684240341, "mean_gen_accuracy": 0.8672935962677002, "mean_token_accuracy": 0.9015118330717087, "num_tokens": 874430745.0, "sample_num_tokens": 8156.25, "step": 3227, "total_num_tokens": 874463370.0, "z_loss": 0.00066900416277349 }, { "copy_logits_max": -6.455751419067383, "copy_logits_min": -625000064.0, "copy_num_tokens": 343.0625, "epoch": 0.6592800612713812, "gen_logits_max": 5.007902145385742, "gen_logits_mean": -15.198089599609375, "gen_logits_min": -26.76383399963379, "gen_logits_std": 2.6606857776641846, "gen_loss": 0.3445403575897217, "grad_norm": 0.4482141880758134, "learning_rate": 2.685221052631579e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9926687031984329, "mean_gen_accuracy": 0.8604580163955688, "mean_token_accuracy": 0.8870550245046616, "num_tokens": 874705586.0, "sample_num_tokens": 8200.5, "step": 3228, "total_num_tokens": 874738388.0, "z_loss": 0.0007121115922927856 }, { "copy_logits_max": -6.120471000671387, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.0, "epoch": 0.659484299208578, "gen_logits_max": 5.236198425292969, "gen_logits_mean": -13.62991714477539, "gen_logits_min": -25.31850242614746, "gen_logits_std": 2.661808967590332, "gen_loss": 0.28006911277770996, "grad_norm": 0.45280553246603183, "learning_rate": 2.6850947368421052e-05, "loss": 0.3273, "mean_copy_accuracy": 0.9945676326751709, "mean_gen_accuracy": 0.8638797700405121, "mean_token_accuracy": 0.8932735472917557, "num_tokens": 874962376.0, "sample_num_tokens": 7667.0, "step": 3229, "total_num_tokens": 874993044.0, "z_loss": 0.0006336477817967534 }, { "copy_logits_max": -6.758665561676025, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.25, "epoch": 0.6596885371457748, "gen_logits_max": 5.178072929382324, "gen_logits_mean": -14.633874893188477, "gen_logits_min": -26.19858741760254, "gen_logits_std": 2.6720378398895264, "gen_loss": 0.309338241815567, "grad_norm": 0.4348786152989738, "learning_rate": 2.6849684210526317e-05, "loss": 0.3329, "mean_copy_accuracy": 0.9951224029064178, "mean_gen_accuracy": 0.8603646904230118, "mean_token_accuracy": 0.8888018727302551, "num_tokens": 875223015.0, "sample_num_tokens": 7897.75, "step": 3230, "total_num_tokens": 875254606.0, "z_loss": 0.0006766445003449917 }, { "copy_logits_max": -4.260876655578613, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.125, "epoch": 0.6598927750829716, "gen_logits_max": 5.443375110626221, "gen_logits_mean": -13.321695327758789, "gen_logits_min": -25.46785545349121, "gen_logits_std": 2.708014488220215, "gen_loss": 0.3358251750469208, "grad_norm": 0.4608743277209249, "learning_rate": 2.6848421052631577e-05, "loss": 0.3457, "mean_copy_accuracy": 0.9939552694559097, "mean_gen_accuracy": 0.8559414744377136, "mean_token_accuracy": 0.8886754214763641, "num_tokens": 875488659.0, "sample_num_tokens": 8941.75, "step": 3231, "total_num_tokens": 875524426.0, "z_loss": 0.0007031251443549991 }, { "copy_logits_max": -5.663575172424316, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.375, "epoch": 0.6600970130201685, "gen_logits_max": 5.818965911865234, "gen_logits_mean": -13.521363258361816, "gen_logits_min": -25.593486785888672, "gen_logits_std": 2.7255678176879883, "gen_loss": 0.2869799733161926, "grad_norm": 0.41077033280867437, "learning_rate": 2.6847157894736845e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9945303797721863, "mean_gen_accuracy": 0.8667211830615997, "mean_token_accuracy": 0.8972431272268295, "num_tokens": 875754571.0, "sample_num_tokens": 8923.25, "step": 3232, "total_num_tokens": 875790264.0, "z_loss": 0.0006993525894358754 }, { "copy_logits_max": -4.268364906311035, "copy_logits_min": -750000064.0, "copy_num_tokens": 557.4375, "epoch": 0.6603012509573654, "gen_logits_max": 5.042238712310791, "gen_logits_mean": -13.743019104003906, "gen_logits_min": -25.26978302001953, "gen_logits_std": 2.677145004272461, "gen_loss": 0.32014310359954834, "grad_norm": 0.41731002101445097, "learning_rate": 2.6845894736842106e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9956590980291367, "mean_gen_accuracy": 0.85786372423172, "mean_token_accuracy": 0.8944307565689087, "num_tokens": 876035533.0, "sample_num_tokens": 9354.25, "step": 3233, "total_num_tokens": 876072950.0, "z_loss": 0.0007968220743350685 }, { "copy_logits_max": 0.12028300762176514, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.125, "epoch": 0.6605054888945622, "gen_logits_max": 5.318504333496094, "gen_logits_mean": -13.368598937988281, "gen_logits_min": -25.180288314819336, "gen_logits_std": 2.697695255279541, "gen_loss": 0.3244696259498596, "grad_norm": 0.44550454077581075, "learning_rate": 2.684463157894737e-05, "loss": 0.3189, "mean_copy_accuracy": 0.9942164719104767, "mean_gen_accuracy": 0.8571668416261673, "mean_token_accuracy": 0.8934154659509659, "num_tokens": 876309629.0, "sample_num_tokens": 8732.75, "step": 3234, "total_num_tokens": 876344560.0, "z_loss": 0.0007768908399157226 }, { "copy_logits_max": -5.793728828430176, "copy_logits_min": -750000000.0, "copy_num_tokens": 661.9375, "epoch": 0.660709726831759, "gen_logits_max": 5.1950578689575195, "gen_logits_mean": -12.808223724365234, "gen_logits_min": -24.966093063354492, "gen_logits_std": 2.688575029373169, "gen_loss": 0.2862420678138733, "grad_norm": 0.48008900587025344, "learning_rate": 2.684336842105263e-05, "loss": 0.3125, "mean_copy_accuracy": 0.994101271033287, "mean_gen_accuracy": 0.8632145524024963, "mean_token_accuracy": 0.8962835967540741, "num_tokens": 876573368.0, "sample_num_tokens": 10348.5, "step": 3235, "total_num_tokens": 876614762.0, "z_loss": 0.0006446146289817989 }, { "copy_logits_max": -3.6121509075164795, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.75, "epoch": 0.6609139647689558, "gen_logits_max": 5.729750633239746, "gen_logits_mean": -13.51754379272461, "gen_logits_min": -25.90689468383789, "gen_logits_std": 2.721876859664917, "gen_loss": 0.3344673216342926, "grad_norm": 0.449509627584838, "learning_rate": 2.6842105263157896e-05, "loss": 0.331, "mean_copy_accuracy": 0.9935998320579529, "mean_gen_accuracy": 0.8623005002737045, "mean_token_accuracy": 0.8904079347848892, "num_tokens": 876838704.0, "sample_num_tokens": 7737.5, "step": 3236, "total_num_tokens": 876869654.0, "z_loss": 0.0007661979179829359 }, { "copy_logits_max": -4.600228309631348, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.75, "epoch": 0.6611182027061526, "gen_logits_max": 5.538542747497559, "gen_logits_mean": -14.753377914428711, "gen_logits_min": -26.422964096069336, "gen_logits_std": 2.66837739944458, "gen_loss": 0.31634753942489624, "grad_norm": 0.4363547975611083, "learning_rate": 2.6840842105263157e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9946903139352798, "mean_gen_accuracy": 0.8644507676362991, "mean_token_accuracy": 0.8949930816888809, "num_tokens": 877107954.0, "sample_num_tokens": 9265.0, "step": 3237, "total_num_tokens": 877145014.0, "z_loss": 0.0007690777420066297 }, { "copy_logits_max": -0.5765156149864197, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.125, "epoch": 0.6613224406433496, "gen_logits_max": 4.984832763671875, "gen_logits_mean": -13.952306747436523, "gen_logits_min": -25.99839210510254, "gen_logits_std": 2.6931519508361816, "gen_loss": 0.32111990451812744, "grad_norm": 0.4146567117238744, "learning_rate": 2.683957894736842e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9953668415546417, "mean_gen_accuracy": 0.8581647276878357, "mean_token_accuracy": 0.8964884877204895, "num_tokens": 877398483.0, "sample_num_tokens": 7929.75, "step": 3238, "total_num_tokens": 877430202.0, "z_loss": 0.000778495566919446 }, { "copy_logits_max": -1.9438589811325073, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.0, "epoch": 0.6615266785805464, "gen_logits_max": 5.273159027099609, "gen_logits_mean": -13.666794776916504, "gen_logits_min": -25.940616607666016, "gen_logits_std": 2.6964054107666016, "gen_loss": 0.3304508924484253, "grad_norm": 0.4085001674137162, "learning_rate": 2.6838315789473685e-05, "loss": 0.31, "mean_copy_accuracy": 0.9954717606306076, "mean_gen_accuracy": 0.8635012209415436, "mean_token_accuracy": 0.8964357376098633, "num_tokens": 877677833.0, "sample_num_tokens": 8174.25, "step": 3239, "total_num_tokens": 877710530.0, "z_loss": 0.0007679713307879865 }, { "copy_logits_max": -2.1104323863983154, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.75, "epoch": 0.6617309165177432, "gen_logits_max": 4.959593296051025, "gen_logits_mean": -13.788240432739258, "gen_logits_min": -26.133556365966797, "gen_logits_std": 2.722390651702881, "gen_loss": 0.3087884783744812, "grad_norm": 0.46251844139354475, "learning_rate": 2.6837052631578946e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9940802603960037, "mean_gen_accuracy": 0.8623377531766891, "mean_token_accuracy": 0.895465299487114, "num_tokens": 877944084.0, "sample_num_tokens": 8138.5, "step": 3240, "total_num_tokens": 877976638.0, "z_loss": 0.0007224120781756938 }, { "copy_logits_max": -0.9276079535484314, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.3125, "epoch": 0.66193515445494, "gen_logits_max": 5.828546524047852, "gen_logits_mean": -12.904281616210938, "gen_logits_min": -24.884605407714844, "gen_logits_std": 2.6757302284240723, "gen_loss": 0.3189879357814789, "grad_norm": 0.4383173049214765, "learning_rate": 2.6835789473684214e-05, "loss": 0.3131, "mean_copy_accuracy": 0.995405986905098, "mean_gen_accuracy": 0.863493487238884, "mean_token_accuracy": 0.8976231068372726, "num_tokens": 878210604.0, "sample_num_tokens": 8735.0, "step": 3241, "total_num_tokens": 878245544.0, "z_loss": 0.0007721285219304264 }, { "copy_logits_max": -4.968420028686523, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.8125, "epoch": 0.6621393923921368, "gen_logits_max": 5.466757774353027, "gen_logits_mean": -14.881877899169922, "gen_logits_min": -26.585548400878906, "gen_logits_std": 2.678868293762207, "gen_loss": 0.29569756984710693, "grad_norm": 0.40972480406418754, "learning_rate": 2.6834526315789475e-05, "loss": 0.3243, "mean_copy_accuracy": 0.9943841993808746, "mean_gen_accuracy": 0.8666600435972214, "mean_token_accuracy": 0.8930311053991318, "num_tokens": 878493682.0, "sample_num_tokens": 8138.5, "step": 3242, "total_num_tokens": 878526236.0, "z_loss": 0.0006495849811471999 }, { "copy_logits_max": -2.9207210540771484, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.0625, "epoch": 0.6623436303293336, "gen_logits_max": 5.825061798095703, "gen_logits_mean": -14.001646041870117, "gen_logits_min": -25.71442222595215, "gen_logits_std": 2.6751227378845215, "gen_loss": 0.3310544490814209, "grad_norm": 0.4081834615468343, "learning_rate": 2.683326315789474e-05, "loss": 0.3259, "mean_copy_accuracy": 0.9939278364181519, "mean_gen_accuracy": 0.8630921393632889, "mean_token_accuracy": 0.8922597616910934, "num_tokens": 878763716.0, "sample_num_tokens": 7539.5, "step": 3243, "total_num_tokens": 878793874.0, "z_loss": 0.0007203398854471743 }, { "copy_logits_max": -4.400496006011963, "copy_logits_min": -625000064.0, "copy_num_tokens": 545.3125, "epoch": 0.6625478682665306, "gen_logits_max": 5.04702091217041, "gen_logits_mean": -14.018991470336914, "gen_logits_min": -26.062429428100586, "gen_logits_std": 2.7083239555358887, "gen_loss": 0.33233311772346497, "grad_norm": 0.40779239735661177, "learning_rate": 2.6832e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9940207451581955, "mean_gen_accuracy": 0.8604647517204285, "mean_token_accuracy": 0.8909143656492233, "num_tokens": 879027758.0, "sample_num_tokens": 9601.5, "step": 3244, "total_num_tokens": 879066164.0, "z_loss": 0.0006732254987582564 }, { "copy_logits_max": -2.117690086364746, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.4375, "epoch": 0.6627521062037274, "gen_logits_max": 5.9657511711120605, "gen_logits_mean": -13.18336296081543, "gen_logits_min": -25.464155197143555, "gen_logits_std": 2.7176804542541504, "gen_loss": 0.3428346812725067, "grad_norm": 0.43513634141436475, "learning_rate": 2.6830736842105264e-05, "loss": 0.3184, "mean_copy_accuracy": 0.9951886385679245, "mean_gen_accuracy": 0.8635530918836594, "mean_token_accuracy": 0.8953674584627151, "num_tokens": 879303324.0, "sample_num_tokens": 8421.5, "step": 3245, "total_num_tokens": 879337010.0, "z_loss": 0.0007559866644442081 }, { "copy_logits_max": -5.607939720153809, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.8125, "epoch": 0.6629563441409242, "gen_logits_max": 5.755472183227539, "gen_logits_mean": -12.949851989746094, "gen_logits_min": -24.909271240234375, "gen_logits_std": 2.7147161960601807, "gen_loss": 0.31600886583328247, "grad_norm": 0.4821975847293874, "learning_rate": 2.6829473684210525e-05, "loss": 0.326, "mean_copy_accuracy": 0.994451493024826, "mean_gen_accuracy": 0.8651434481143951, "mean_token_accuracy": 0.8933097571134567, "num_tokens": 879554268.0, "sample_num_tokens": 8517.5, "step": 3246, "total_num_tokens": 879588338.0, "z_loss": 0.0006688821013085544 }, { "copy_logits_max": -4.328823089599609, "copy_logits_min": -687500096.0, "copy_num_tokens": 476.6875, "epoch": 0.663160582078121, "gen_logits_max": 5.382737636566162, "gen_logits_mean": -14.104801177978516, "gen_logits_min": -26.308778762817383, "gen_logits_std": 2.7190380096435547, "gen_loss": 0.2931872606277466, "grad_norm": 0.4015949539161335, "learning_rate": 2.682821052631579e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9929213970899582, "mean_gen_accuracy": 0.8676536381244659, "mean_token_accuracy": 0.8960962444543839, "num_tokens": 879827262.0, "sample_num_tokens": 9241.0, "step": 3247, "total_num_tokens": 879864226.0, "z_loss": 0.0007235731463879347 }, { "copy_logits_max": -5.952716827392578, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.8125, "epoch": 0.6633648200153178, "gen_logits_max": 5.302407264709473, "gen_logits_mean": -13.99708080291748, "gen_logits_min": -26.099430084228516, "gen_logits_std": 2.6877248287200928, "gen_loss": 0.3215668797492981, "grad_norm": 0.42209312491758144, "learning_rate": 2.682694736842105e-05, "loss": 0.3038, "mean_copy_accuracy": 0.994383916258812, "mean_gen_accuracy": 0.8688156753778458, "mean_token_accuracy": 0.8990042805671692, "num_tokens": 880102622.0, "sample_num_tokens": 7890.0, "step": 3248, "total_num_tokens": 880134182.0, "z_loss": 0.0006940740859135985 }, { "copy_logits_max": -1.3827109336853027, "copy_logits_min": -687500032.0, "copy_num_tokens": 533.1875, "epoch": 0.6635690579525146, "gen_logits_max": 3.8623127937316895, "gen_logits_mean": -15.186178207397461, "gen_logits_min": -27.34524154663086, "gen_logits_std": 2.7487704753875732, "gen_loss": 0.2940257489681244, "grad_norm": 0.4281802926356265, "learning_rate": 2.6825684210526318e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9955688416957855, "mean_gen_accuracy": 0.8683896958827972, "mean_token_accuracy": 0.8973926156759262, "num_tokens": 880365587.0, "sample_num_tokens": 8168.75, "step": 3249, "total_num_tokens": 880398262.0, "z_loss": 0.0007063779048621655 }, { "copy_logits_max": -3.620699405670166, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.8125, "epoch": 0.6637732958897116, "gen_logits_max": 6.00105619430542, "gen_logits_mean": -13.852506637573242, "gen_logits_min": -25.706768035888672, "gen_logits_std": 2.6866025924682617, "gen_loss": 0.36674803495407104, "grad_norm": 0.4910017610683997, "learning_rate": 2.682442105263158e-05, "loss": 0.344, "mean_copy_accuracy": 0.9940992891788483, "mean_gen_accuracy": 0.8580222427845001, "mean_token_accuracy": 0.8859251886606216, "num_tokens": 880608515.0, "sample_num_tokens": 6675.25, "step": 3250, "total_num_tokens": 880635216.0, "z_loss": 0.0008064869325608015 }, { "copy_logits_max": -4.681334495544434, "copy_logits_min": -750000000.0, "copy_num_tokens": 611.4375, "epoch": 0.6639775338269084, "gen_logits_max": 4.327165603637695, "gen_logits_mean": -14.694438934326172, "gen_logits_min": -26.768770217895508, "gen_logits_std": 2.7317540645599365, "gen_loss": 0.30511242151260376, "grad_norm": 0.3961750819242682, "learning_rate": 2.6823157894736843e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9951276183128357, "mean_gen_accuracy": 0.8689727932214737, "mean_token_accuracy": 0.9028077721595764, "num_tokens": 880887975.0, "sample_num_tokens": 9224.25, "step": 3251, "total_num_tokens": 880924872.0, "z_loss": 0.0006280082161538303 }, { "copy_logits_max": -5.30828857421875, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.5, "epoch": 0.6641817717641052, "gen_logits_max": 4.949246883392334, "gen_logits_mean": -14.461130142211914, "gen_logits_min": -26.29227066040039, "gen_logits_std": 2.706744432449341, "gen_loss": 0.30725088715553284, "grad_norm": 0.43518518648464877, "learning_rate": 2.6821894736842108e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9943281710147858, "mean_gen_accuracy": 0.8642604053020477, "mean_token_accuracy": 0.9001717418432236, "num_tokens": 881179825.0, "sample_num_tokens": 7439.75, "step": 3252, "total_num_tokens": 881209584.0, "z_loss": 0.0005936413072049618 }, { "copy_logits_max": -2.5937838554382324, "copy_logits_min": -687500032.0, "copy_num_tokens": 741.1875, "epoch": 0.664386009701302, "gen_logits_max": 5.145938873291016, "gen_logits_mean": -12.883783340454102, "gen_logits_min": -24.977920532226562, "gen_logits_std": 2.6957993507385254, "gen_loss": 0.2812378406524658, "grad_norm": 0.42218262731949247, "learning_rate": 2.682063157894737e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9939178079366684, "mean_gen_accuracy": 0.873891070485115, "mean_token_accuracy": 0.9052629917860031, "num_tokens": 881426503.0, "sample_num_tokens": 9677.25, "step": 3253, "total_num_tokens": 881465212.0, "z_loss": 0.0006760730175301433 }, { "copy_logits_max": -4.385331153869629, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.1875, "epoch": 0.6645902476384988, "gen_logits_max": 5.569457054138184, "gen_logits_mean": -13.520082473754883, "gen_logits_min": -25.65488052368164, "gen_logits_std": 2.693314790725708, "gen_loss": 0.36912867426872253, "grad_norm": 0.46563096555460376, "learning_rate": 2.6819368421052633e-05, "loss": 0.3341, "mean_copy_accuracy": 0.9934121966362, "mean_gen_accuracy": 0.8632089793682098, "mean_token_accuracy": 0.8899298459291458, "num_tokens": 881676201.0, "sample_num_tokens": 8163.75, "step": 3254, "total_num_tokens": 881708856.0, "z_loss": 0.0008420217782258987 }, { "copy_logits_max": -3.286506175994873, "copy_logits_min": -749999936.0, "copy_num_tokens": 595.125, "epoch": 0.6647944855756956, "gen_logits_max": 4.895359039306641, "gen_logits_mean": -14.141775131225586, "gen_logits_min": -26.238540649414062, "gen_logits_std": 2.703500747680664, "gen_loss": 0.2942863702774048, "grad_norm": 0.39923734857165144, "learning_rate": 2.6818105263157894e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9950143098831177, "mean_gen_accuracy": 0.8715115189552307, "mean_token_accuracy": 0.9018073379993439, "num_tokens": 881941423.0, "sample_num_tokens": 8941.25, "step": 3255, "total_num_tokens": 881977188.0, "z_loss": 0.0007463042857125401 }, { "copy_logits_max": -5.93062686920166, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.0625, "epoch": 0.6649987235128925, "gen_logits_max": 5.412355422973633, "gen_logits_mean": -13.718866348266602, "gen_logits_min": -25.55493927001953, "gen_logits_std": 2.669816493988037, "gen_loss": 0.29407066106796265, "grad_norm": 0.4107550762271188, "learning_rate": 2.6816842105263158e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9938722252845764, "mean_gen_accuracy": 0.866808295249939, "mean_token_accuracy": 0.9012071043252945, "num_tokens": 882222262.0, "sample_num_tokens": 8156.0, "step": 3256, "total_num_tokens": 882254886.0, "z_loss": 0.0006898105493746698 }, { "copy_logits_max": -6.163049221038818, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.8125, "epoch": 0.6652029614500894, "gen_logits_max": 6.396453857421875, "gen_logits_mean": -13.218564987182617, "gen_logits_min": -24.506114959716797, "gen_logits_std": 2.62359619140625, "gen_loss": 0.3314898908138275, "grad_norm": 0.4028456297094158, "learning_rate": 2.6815578947368422e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9951096922159195, "mean_gen_accuracy": 0.8672535717487335, "mean_token_accuracy": 0.8954194784164429, "num_tokens": 882470553.0, "sample_num_tokens": 7789.25, "step": 3257, "total_num_tokens": 882501710.0, "z_loss": 0.0007528576534241438 }, { "copy_logits_max": -4.923938274383545, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.1875, "epoch": 0.6654071993872862, "gen_logits_max": 4.169985771179199, "gen_logits_mean": -14.77661418914795, "gen_logits_min": -26.368873596191406, "gen_logits_std": 2.634089469909668, "gen_loss": 0.28543901443481445, "grad_norm": 0.3973508081082848, "learning_rate": 2.6814315789473687e-05, "loss": 0.3096, "mean_copy_accuracy": 0.995167151093483, "mean_gen_accuracy": 0.8672026842832565, "mean_token_accuracy": 0.8988318741321564, "num_tokens": 882733046.0, "sample_num_tokens": 9626.0, "step": 3258, "total_num_tokens": 882771550.0, "z_loss": 0.0006673004245385528 }, { "copy_logits_max": -3.7553770542144775, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.125, "epoch": 0.665611437324483, "gen_logits_max": 5.061899662017822, "gen_logits_mean": -14.933538436889648, "gen_logits_min": -26.78550148010254, "gen_logits_std": 2.6674275398254395, "gen_loss": 0.32307446002960205, "grad_norm": 0.39531561657904324, "learning_rate": 2.6813052631578948e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9943547248840332, "mean_gen_accuracy": 0.871020957827568, "mean_token_accuracy": 0.8996732532978058, "num_tokens": 883012321.0, "sample_num_tokens": 7347.75, "step": 3259, "total_num_tokens": 883041712.0, "z_loss": 0.0007422225899063051 }, { "copy_logits_max": -4.438296318054199, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 0.6658156752616798, "gen_logits_max": 5.666691780090332, "gen_logits_mean": -12.93109130859375, "gen_logits_min": -24.831222534179688, "gen_logits_std": 2.659001111984253, "gen_loss": 0.3058679699897766, "grad_norm": 0.4032265047609247, "learning_rate": 2.6811789473684212e-05, "loss": 0.3082, "mean_copy_accuracy": 0.9942163974046707, "mean_gen_accuracy": 0.8641120791435242, "mean_token_accuracy": 0.8966493308544159, "num_tokens": 883287712.0, "sample_num_tokens": 7682.0, "step": 3260, "total_num_tokens": 883318440.0, "z_loss": 0.0006780086550861597 }, { "copy_logits_max": -5.4783220291137695, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.6875, "epoch": 0.6660199131988767, "gen_logits_max": 4.867822647094727, "gen_logits_mean": -14.187751770019531, "gen_logits_min": -25.552461624145508, "gen_logits_std": 2.64327335357666, "gen_loss": 0.320357084274292, "grad_norm": 0.41955073747813815, "learning_rate": 2.6810526315789473e-05, "loss": 0.322, "mean_copy_accuracy": 0.9946896731853485, "mean_gen_accuracy": 0.8564065098762512, "mean_token_accuracy": 0.8945430368185043, "num_tokens": 883560158.0, "sample_num_tokens": 9028.5, "step": 3261, "total_num_tokens": 883596272.0, "z_loss": 0.0007088305428624153 }, { "copy_logits_max": -6.145987510681152, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.4375, "epoch": 0.6662241511360735, "gen_logits_max": 5.052131652832031, "gen_logits_mean": -14.193974494934082, "gen_logits_min": -25.503698348999023, "gen_logits_std": 2.6446571350097656, "gen_loss": 0.29420578479766846, "grad_norm": 0.41052099172087964, "learning_rate": 2.6809263157894737e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9954564869403839, "mean_gen_accuracy": 0.8716720044612885, "mean_token_accuracy": 0.9043553024530411, "num_tokens": 883849048.0, "sample_num_tokens": 9907.0, "step": 3262, "total_num_tokens": 883888676.0, "z_loss": 0.000641087768599391 }, { "copy_logits_max": -6.235836029052734, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.5625, "epoch": 0.6664283890732704, "gen_logits_max": 5.989798069000244, "gen_logits_mean": -13.328603744506836, "gen_logits_min": -25.208986282348633, "gen_logits_std": 2.6622419357299805, "gen_loss": 0.28830772638320923, "grad_norm": 0.4520247237402088, "learning_rate": 2.6807999999999998e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9942978024482727, "mean_gen_accuracy": 0.8692743927240372, "mean_token_accuracy": 0.8973967134952545, "num_tokens": 884124814.0, "sample_num_tokens": 7106.0, "step": 3263, "total_num_tokens": 884153238.0, "z_loss": 0.0006493323598988354 }, { "copy_logits_max": -5.719529151916504, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.625, "epoch": 0.6666326270104672, "gen_logits_max": 4.638185501098633, "gen_logits_mean": -13.191494941711426, "gen_logits_min": -25.052940368652344, "gen_logits_std": 2.6449642181396484, "gen_loss": 0.2879170775413513, "grad_norm": 0.4051769577283897, "learning_rate": 2.6806736842105262e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9959156513214111, "mean_gen_accuracy": 0.8576115667819977, "mean_token_accuracy": 0.8997060358524323, "num_tokens": 884406238.0, "sample_num_tokens": 9222.5, "step": 3264, "total_num_tokens": 884443128.0, "z_loss": 0.0006556871230714023 }, { "copy_logits_max": -5.266590118408203, "copy_logits_min": -625000064.0, "copy_num_tokens": 446.1875, "epoch": 0.666836864947664, "gen_logits_max": 5.598927974700928, "gen_logits_mean": -13.41794204711914, "gen_logits_min": -24.697097778320312, "gen_logits_std": 2.6011312007904053, "gen_loss": 0.3553888201713562, "grad_norm": 0.4093480450120218, "learning_rate": 2.680547368421053e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9940289705991745, "mean_gen_accuracy": 0.8656361699104309, "mean_token_accuracy": 0.8962275981903076, "num_tokens": 884672179.0, "sample_num_tokens": 8648.25, "step": 3265, "total_num_tokens": 884706772.0, "z_loss": 0.0007810502429492772 }, { "copy_logits_max": -6.1150712966918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 280.1875, "epoch": 0.6670411028848608, "gen_logits_max": 6.455411911010742, "gen_logits_mean": -13.384262084960938, "gen_logits_min": -24.939002990722656, "gen_logits_std": 2.623828649520874, "gen_loss": 0.320701539516449, "grad_norm": 0.4071278201850902, "learning_rate": 2.680421052631579e-05, "loss": 0.3083, "mean_copy_accuracy": 0.9955000132322311, "mean_gen_accuracy": 0.8678725510835648, "mean_token_accuracy": 0.8960848152637482, "num_tokens": 884937198.0, "sample_num_tokens": 7488.0, "step": 3266, "total_num_tokens": 884967150.0, "z_loss": 0.0007430022233165801 }, { "copy_logits_max": -4.752109050750732, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.25, "epoch": 0.6672453408220577, "gen_logits_max": 5.670029640197754, "gen_logits_mean": -12.495007514953613, "gen_logits_min": -24.30133819580078, "gen_logits_std": 2.7132210731506348, "gen_loss": 0.2725783586502075, "grad_norm": 0.4217908878852439, "learning_rate": 2.6802947368421055e-05, "loss": 0.3041, "mean_copy_accuracy": 0.9964998960494995, "mean_gen_accuracy": 0.8666934669017792, "mean_token_accuracy": 0.8985792547464371, "num_tokens": 885191141.0, "sample_num_tokens": 9200.75, "step": 3267, "total_num_tokens": 885227944.0, "z_loss": 0.0006522510666400194 }, { "copy_logits_max": -6.613717555999756, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.5, "epoch": 0.6674495787592545, "gen_logits_max": 6.51982307434082, "gen_logits_mean": -12.627862930297852, "gen_logits_min": -24.19334602355957, "gen_logits_std": 2.614729881286621, "gen_loss": 0.3716604709625244, "grad_norm": 0.44937804192927866, "learning_rate": 2.6801684210526316e-05, "loss": 0.302, "mean_copy_accuracy": 0.9953883290290833, "mean_gen_accuracy": 0.8652714341878891, "mean_token_accuracy": 0.8985888212919235, "num_tokens": 885494118.0, "sample_num_tokens": 7969.5, "step": 3268, "total_num_tokens": 885525996.0, "z_loss": 0.0007776591228321195 }, { "copy_logits_max": -5.73436164855957, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.9375, "epoch": 0.6676538166964514, "gen_logits_max": 4.760236740112305, "gen_logits_mean": -14.289321899414062, "gen_logits_min": -26.526958465576172, "gen_logits_std": 2.66190242767334, "gen_loss": 0.29508763551712036, "grad_norm": 0.4581207926004728, "learning_rate": 2.680042105263158e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9945099651813507, "mean_gen_accuracy": 0.8577504605054855, "mean_token_accuracy": 0.8874923884868622, "num_tokens": 885756674.0, "sample_num_tokens": 7518.0, "step": 3269, "total_num_tokens": 885786746.0, "z_loss": 0.0006548319943249226 }, { "copy_logits_max": -5.692628860473633, "copy_logits_min": -750000064.0, "copy_num_tokens": 503.375, "epoch": 0.6678580546336482, "gen_logits_max": 5.784982204437256, "gen_logits_mean": -12.908607482910156, "gen_logits_min": -24.816669464111328, "gen_logits_std": 2.661334991455078, "gen_loss": 0.3208698332309723, "grad_norm": 0.43958504269749893, "learning_rate": 2.679915789473684e-05, "loss": 0.3262, "mean_copy_accuracy": 0.995380088686943, "mean_gen_accuracy": 0.8607358783483505, "mean_token_accuracy": 0.8924928158521652, "num_tokens": 886035649.0, "sample_num_tokens": 8898.75, "step": 3270, "total_num_tokens": 886071244.0, "z_loss": 0.0007149768061935902 }, { "copy_logits_max": -4.08195686340332, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.4375, "epoch": 0.668062292570845, "gen_logits_max": 5.360324859619141, "gen_logits_mean": -13.726211547851562, "gen_logits_min": -25.371631622314453, "gen_logits_std": 2.6375536918640137, "gen_loss": 0.3255833387374878, "grad_norm": 0.3985506828132431, "learning_rate": 2.6797894736842106e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9951219856739044, "mean_gen_accuracy": 0.8687203675508499, "mean_token_accuracy": 0.8994627892971039, "num_tokens": 886330331.0, "sample_num_tokens": 7876.25, "step": 3271, "total_num_tokens": 886361836.0, "z_loss": 0.0008038417436182499 }, { "copy_logits_max": -3.5604701042175293, "copy_logits_min": -625000064.0, "copy_num_tokens": 591.125, "epoch": 0.6682665305080419, "gen_logits_max": 5.111910820007324, "gen_logits_mean": -12.574423789978027, "gen_logits_min": -24.67867660522461, "gen_logits_std": 2.665212631225586, "gen_loss": 0.29970818758010864, "grad_norm": 0.4386398000244137, "learning_rate": 2.6796631578947367e-05, "loss": 0.317, "mean_copy_accuracy": 0.9953591078519821, "mean_gen_accuracy": 0.8633636087179184, "mean_token_accuracy": 0.8952001929283142, "num_tokens": 886592736.0, "sample_num_tokens": 9384.0, "step": 3272, "total_num_tokens": 886630272.0, "z_loss": 0.0007961574010550976 }, { "copy_logits_max": -4.376567840576172, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.6875, "epoch": 0.6684707684452387, "gen_logits_max": 4.538334846496582, "gen_logits_mean": -15.131498336791992, "gen_logits_min": -27.0069580078125, "gen_logits_std": 2.661315441131592, "gen_loss": 0.3115752041339874, "grad_norm": 0.4239105672786924, "learning_rate": 2.6795368421052634e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9946658909320831, "mean_gen_accuracy": 0.861546128988266, "mean_token_accuracy": 0.8945450782775879, "num_tokens": 886865010.0, "sample_num_tokens": 8468.0, "step": 3273, "total_num_tokens": 886898882.0, "z_loss": 0.000738086411729455 }, { "copy_logits_max": -6.044609069824219, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.9375, "epoch": 0.6686750063824355, "gen_logits_max": 4.569380760192871, "gen_logits_mean": -14.547362327575684, "gen_logits_min": -26.10590171813965, "gen_logits_std": 2.6440539360046387, "gen_loss": 0.2715386152267456, "grad_norm": 0.43956306050240374, "learning_rate": 2.6794105263157895e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9946993291378021, "mean_gen_accuracy": 0.8620122522115707, "mean_token_accuracy": 0.8969658017158508, "num_tokens": 887148552.0, "sample_num_tokens": 8967.0, "step": 3274, "total_num_tokens": 887184420.0, "z_loss": 0.0006324772257357836 }, { "copy_logits_max": -7.511041641235352, "copy_logits_min": -750000064.0, "copy_num_tokens": 383.0625, "epoch": 0.6688792443196324, "gen_logits_max": 6.2720046043396, "gen_logits_mean": -13.071422576904297, "gen_logits_min": -24.676109313964844, "gen_logits_std": 2.6153011322021484, "gen_loss": 0.3588443994522095, "grad_norm": 0.41231001302608505, "learning_rate": 2.679284210526316e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9953401833772659, "mean_gen_accuracy": 0.8687040209770203, "mean_token_accuracy": 0.8994437903165817, "num_tokens": 887426725.0, "sample_num_tokens": 8492.75, "step": 3275, "total_num_tokens": 887460696.0, "z_loss": 0.0007839109166525304 }, { "copy_logits_max": -7.404932975769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 257.4375, "epoch": 0.6690834822568292, "gen_logits_max": 5.78525447845459, "gen_logits_mean": -14.458037376403809, "gen_logits_min": -26.18904685974121, "gen_logits_std": 2.6190543174743652, "gen_loss": 0.3461146056652069, "grad_norm": 0.41025369513645854, "learning_rate": 2.679157894736842e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9951165467500687, "mean_gen_accuracy": 0.8692616373300552, "mean_token_accuracy": 0.8989580124616623, "num_tokens": 887697661.0, "sample_num_tokens": 7255.25, "step": 3276, "total_num_tokens": 887726682.0, "z_loss": 0.0007701449794694781 }, { "copy_logits_max": -5.930250644683838, "copy_logits_min": -687500032.0, "copy_num_tokens": 556.8125, "epoch": 0.669287720194026, "gen_logits_max": 4.673669338226318, "gen_logits_mean": -13.285470962524414, "gen_logits_min": -25.21710205078125, "gen_logits_std": 2.6137027740478516, "gen_loss": 0.25390443205833435, "grad_norm": 0.4530162484833234, "learning_rate": 2.6790315789473685e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9954799711704254, "mean_gen_accuracy": 0.8604298233985901, "mean_token_accuracy": 0.893245667219162, "num_tokens": 887947765.0, "sample_num_tokens": 8459.75, "step": 3277, "total_num_tokens": 887981604.0, "z_loss": 0.0006204219534993172 }, { "copy_logits_max": -5.722484111785889, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.625, "epoch": 0.6694919581312229, "gen_logits_max": 5.278502464294434, "gen_logits_mean": -13.334589004516602, "gen_logits_min": -25.299758911132812, "gen_logits_std": 2.6852736473083496, "gen_loss": 0.29581546783447266, "grad_norm": 0.3872753332182185, "learning_rate": 2.678905263157895e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9952192008495331, "mean_gen_accuracy": 0.868478387594223, "mean_token_accuracy": 0.9013027995824814, "num_tokens": 888227039.0, "sample_num_tokens": 8140.25, "step": 3278, "total_num_tokens": 888259600.0, "z_loss": 0.0007238902035169303 }, { "copy_logits_max": -6.751269340515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.5625, "epoch": 0.6696961960684197, "gen_logits_max": 5.272256374359131, "gen_logits_mean": -13.070234298706055, "gen_logits_min": -25.16649627685547, "gen_logits_std": 2.6671342849731445, "gen_loss": 0.30664485692977905, "grad_norm": 0.3851346610217131, "learning_rate": 2.678778947368421e-05, "loss": 0.2931, "mean_copy_accuracy": 0.9956700503826141, "mean_gen_accuracy": 0.8701359778642654, "mean_token_accuracy": 0.9040064364671707, "num_tokens": 888510793.0, "sample_num_tokens": 9285.25, "step": 3279, "total_num_tokens": 888547934.0, "z_loss": 0.000705473474226892 }, { "copy_logits_max": -6.059713363647461, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.625, "epoch": 0.6699004340056165, "gen_logits_max": 6.264035224914551, "gen_logits_mean": -12.870649337768555, "gen_logits_min": -25.522319793701172, "gen_logits_std": 2.6835570335388184, "gen_loss": 0.3359125852584839, "grad_norm": 0.48320536540038134, "learning_rate": 2.6786526315789474e-05, "loss": 0.33, "mean_copy_accuracy": 0.9919006824493408, "mean_gen_accuracy": 0.8602014183998108, "mean_token_accuracy": 0.8892549574375153, "num_tokens": 888784815.0, "sample_num_tokens": 8476.25, "step": 3280, "total_num_tokens": 888818720.0, "z_loss": 0.0007256803219206631 }, { "copy_logits_max": -5.63254451751709, "copy_logits_min": -687500032.0, "copy_num_tokens": 423.4375, "epoch": 0.6701046719428134, "gen_logits_max": 5.8117876052856445, "gen_logits_mean": -14.250075340270996, "gen_logits_min": -26.138092041015625, "gen_logits_std": 2.670771598815918, "gen_loss": 0.316092848777771, "grad_norm": 0.43638959751133843, "learning_rate": 2.678526315789474e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9951174706220627, "mean_gen_accuracy": 0.8620914220809937, "mean_token_accuracy": 0.8983037620782852, "num_tokens": 889059567.0, "sample_num_tokens": 7884.25, "step": 3281, "total_num_tokens": 889091104.0, "z_loss": 0.0006666334229521453 }, { "copy_logits_max": -5.68779182434082, "copy_logits_min": -750000064.0, "copy_num_tokens": 541.6875, "epoch": 0.6703089098800102, "gen_logits_max": 5.842756748199463, "gen_logits_mean": -12.268854141235352, "gen_logits_min": -24.836475372314453, "gen_logits_std": 2.6891613006591797, "gen_loss": 0.293357253074646, "grad_norm": 0.42653482637938767, "learning_rate": 2.6784000000000003e-05, "loss": 0.2952, "mean_copy_accuracy": 0.9953395128250122, "mean_gen_accuracy": 0.8675946295261383, "mean_token_accuracy": 0.9024988561868668, "num_tokens": 889354488.0, "sample_num_tokens": 8193.0, "step": 3282, "total_num_tokens": 889387260.0, "z_loss": 0.0006491863750852644 }, { "copy_logits_max": -6.649111747741699, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.0, "epoch": 0.6705131478172071, "gen_logits_max": 4.8555121421813965, "gen_logits_mean": -13.665544509887695, "gen_logits_min": -25.75372314453125, "gen_logits_std": 2.6574134826660156, "gen_loss": 0.3166797161102295, "grad_norm": 0.44578659608290017, "learning_rate": 2.6782736842105264e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9955971837043762, "mean_gen_accuracy": 0.8598037213087082, "mean_token_accuracy": 0.8950423151254654, "num_tokens": 889606863.0, "sample_num_tokens": 7840.75, "step": 3283, "total_num_tokens": 889638226.0, "z_loss": 0.0006470434600487351 }, { "copy_logits_max": -5.410915851593018, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.6875, "epoch": 0.6707173857544039, "gen_logits_max": 4.806429862976074, "gen_logits_mean": -14.727890014648438, "gen_logits_min": -26.762252807617188, "gen_logits_std": 2.6593735218048096, "gen_loss": 0.3188783526420593, "grad_norm": 0.41556801905679314, "learning_rate": 2.678147368421053e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9948091804981232, "mean_gen_accuracy": 0.8657120168209076, "mean_token_accuracy": 0.8973328769207001, "num_tokens": 889873011.0, "sample_num_tokens": 8459.25, "step": 3284, "total_num_tokens": 889906848.0, "z_loss": 0.000661055906675756 }, { "copy_logits_max": -2.548556327819824, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.5625, "epoch": 0.6709216236916007, "gen_logits_max": 5.23166561126709, "gen_logits_mean": -14.231779098510742, "gen_logits_min": -26.270952224731445, "gen_logits_std": 2.66042423248291, "gen_loss": 0.2578171491622925, "grad_norm": 0.4421065571213377, "learning_rate": 2.678021052631579e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9939817190170288, "mean_gen_accuracy": 0.8767559379339218, "mean_token_accuracy": 0.9054470360279083, "num_tokens": 890148177.0, "sample_num_tokens": 9252.25, "step": 3285, "total_num_tokens": 890185186.0, "z_loss": 0.0005821987870149314 }, { "copy_logits_max": -4.508970260620117, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.0, "epoch": 0.6711258616287975, "gen_logits_max": 5.857210159301758, "gen_logits_mean": -13.175924301147461, "gen_logits_min": -25.603412628173828, "gen_logits_std": 2.7134649753570557, "gen_loss": 0.2892487347126007, "grad_norm": 0.41003554874169695, "learning_rate": 2.6778947368421054e-05, "loss": 0.299, "mean_copy_accuracy": 0.9944371730089188, "mean_gen_accuracy": 0.8662842512130737, "mean_token_accuracy": 0.9024792015552521, "num_tokens": 890425077.0, "sample_num_tokens": 7611.75, "step": 3286, "total_num_tokens": 890455524.0, "z_loss": 0.0006801767740398645 }, { "copy_logits_max": -3.158580780029297, "copy_logits_min": -750000064.0, "copy_num_tokens": 595.0625, "epoch": 0.6713300995659944, "gen_logits_max": 4.823236465454102, "gen_logits_mean": -13.068490982055664, "gen_logits_min": -25.42449378967285, "gen_logits_std": 2.684316873550415, "gen_loss": 0.29793503880500793, "grad_norm": 0.4491114356461295, "learning_rate": 2.6777684210526314e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9936355203390121, "mean_gen_accuracy": 0.867002084851265, "mean_token_accuracy": 0.8976616561412811, "num_tokens": 890673635.0, "sample_num_tokens": 8988.25, "step": 3287, "total_num_tokens": 890709588.0, "z_loss": 0.0006811564089730382 }, { "copy_logits_max": -4.80737829208374, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.375, "epoch": 0.6715343375031912, "gen_logits_max": 5.475702285766602, "gen_logits_mean": -14.546806335449219, "gen_logits_min": -26.508636474609375, "gen_logits_std": 2.663752555847168, "gen_loss": 0.3459954559803009, "grad_norm": 0.4290321122928772, "learning_rate": 2.677642105263158e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9942093938589096, "mean_gen_accuracy": 0.8593007475137711, "mean_token_accuracy": 0.8904630094766617, "num_tokens": 890952008.0, "sample_num_tokens": 8168.0, "step": 3288, "total_num_tokens": 890984680.0, "z_loss": 0.0007416896987706423 }, { "copy_logits_max": -3.80314564704895, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.8125, "epoch": 0.6717385754403881, "gen_logits_max": 4.830482482910156, "gen_logits_mean": -13.722526550292969, "gen_logits_min": -26.020221710205078, "gen_logits_std": 2.6857383251190186, "gen_loss": 0.2921423017978668, "grad_norm": 0.5035961629198132, "learning_rate": 2.677515789473684e-05, "loss": 0.3039, "mean_copy_accuracy": 0.993779256939888, "mean_gen_accuracy": 0.86761873960495, "mean_token_accuracy": 0.8982445746660233, "num_tokens": 891214219.0, "sample_num_tokens": 8116.25, "step": 3289, "total_num_tokens": 891246684.0, "z_loss": 0.0007080084760673344 }, { "copy_logits_max": -5.349772930145264, "copy_logits_min": -750000064.0, "copy_num_tokens": 565.0625, "epoch": 0.6719428133775849, "gen_logits_max": 5.260119438171387, "gen_logits_mean": -13.750981330871582, "gen_logits_min": -25.378116607666016, "gen_logits_std": 2.62473464012146, "gen_loss": 0.3059841990470886, "grad_norm": 0.39130413467562647, "learning_rate": 2.6773894736842107e-05, "loss": 0.3101, "mean_copy_accuracy": 0.9939577728509903, "mean_gen_accuracy": 0.8647731691598892, "mean_token_accuracy": 0.8972291350364685, "num_tokens": 891497748.0, "sample_num_tokens": 10644.5, "step": 3290, "total_num_tokens": 891540326.0, "z_loss": 0.0006931141251698136 }, { "copy_logits_max": -2.7087130546569824, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.5625, "epoch": 0.6721470513147817, "gen_logits_max": 6.077828407287598, "gen_logits_mean": -12.732701301574707, "gen_logits_min": -25.012794494628906, "gen_logits_std": 2.6590659618377686, "gen_loss": 0.30952250957489014, "grad_norm": 0.4511148058212794, "learning_rate": 2.677263157894737e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9939722716808319, "mean_gen_accuracy": 0.8713438510894775, "mean_token_accuracy": 0.9006704986095428, "num_tokens": 891774843.0, "sample_num_tokens": 9354.75, "step": 3291, "total_num_tokens": 891812262.0, "z_loss": 0.0006616306491196156 }, { "copy_logits_max": -4.462625503540039, "copy_logits_min": -562499968.0, "copy_num_tokens": 730.0625, "epoch": 0.6723512892519785, "gen_logits_max": 5.560434341430664, "gen_logits_mean": -12.530522346496582, "gen_logits_min": -25.29104995727539, "gen_logits_std": 2.7118325233459473, "gen_loss": 0.302797794342041, "grad_norm": 0.425912661303472, "learning_rate": 2.6771368421052633e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9937316179275513, "mean_gen_accuracy": 0.8653999269008636, "mean_token_accuracy": 0.8949847221374512, "num_tokens": 892025762.0, "sample_num_tokens": 10234.0, "step": 3292, "total_num_tokens": 892066698.0, "z_loss": 0.0007359772571362555 }, { "copy_logits_max": -6.1901021003723145, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.5, "epoch": 0.6725555271891754, "gen_logits_max": 4.712477684020996, "gen_logits_mean": -15.285492897033691, "gen_logits_min": -26.861225128173828, "gen_logits_std": 2.6520004272460938, "gen_loss": 0.32045966386795044, "grad_norm": 0.37915544706653415, "learning_rate": 2.6770105263157897e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9961913079023361, "mean_gen_accuracy": 0.862651452422142, "mean_token_accuracy": 0.8987108916044235, "num_tokens": 892341586.0, "sample_num_tokens": 9625.5, "step": 3293, "total_num_tokens": 892380088.0, "z_loss": 0.0006740289973095059 }, { "copy_logits_max": -5.953151702880859, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5625, "epoch": 0.6727597651263723, "gen_logits_max": 5.436059951782227, "gen_logits_mean": -13.847785949707031, "gen_logits_min": -25.55559539794922, "gen_logits_std": 2.6460609436035156, "gen_loss": 0.3319011926651001, "grad_norm": 0.45149786893573485, "learning_rate": 2.6768842105263158e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9958836287260056, "mean_gen_accuracy": 0.856871172785759, "mean_token_accuracy": 0.8936545848846436, "num_tokens": 892621781.0, "sample_num_tokens": 9283.75, "step": 3294, "total_num_tokens": 892658916.0, "z_loss": 0.0006386874592863023 }, { "copy_logits_max": -6.114450931549072, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.375, "epoch": 0.6729640030635691, "gen_logits_max": 4.639703273773193, "gen_logits_mean": -14.987762451171875, "gen_logits_min": -26.416797637939453, "gen_logits_std": 2.633613109588623, "gen_loss": 0.32091090083122253, "grad_norm": 0.43628203736132726, "learning_rate": 2.6767578947368422e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9941782057285309, "mean_gen_accuracy": 0.8694368302822113, "mean_token_accuracy": 0.901318296790123, "num_tokens": 892903725.0, "sample_num_tokens": 7030.25, "step": 3295, "total_num_tokens": 892931846.0, "z_loss": 0.0006344165303744376 }, { "copy_logits_max": -5.985138416290283, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.25, "epoch": 0.6731682410007659, "gen_logits_max": 5.483140468597412, "gen_logits_mean": -12.602601051330566, "gen_logits_min": -25.34396743774414, "gen_logits_std": 2.6397790908813477, "gen_loss": 0.2927814722061157, "grad_norm": 0.4163990848833877, "learning_rate": 2.6766315789473683e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9921290427446365, "mean_gen_accuracy": 0.8743849992752075, "mean_token_accuracy": 0.902700200676918, "num_tokens": 893173430.0, "sample_num_tokens": 7728.5, "step": 3296, "total_num_tokens": 893204344.0, "z_loss": 0.0006209000712260604 }, { "copy_logits_max": -6.578629016876221, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.8125, "epoch": 0.6733724789379627, "gen_logits_max": 5.809396743774414, "gen_logits_mean": -13.952568054199219, "gen_logits_min": -25.208438873291016, "gen_logits_std": 2.643737316131592, "gen_loss": 0.33016836643218994, "grad_norm": 0.41285391080528555, "learning_rate": 2.6765052631578947e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9954687654972076, "mean_gen_accuracy": 0.8746185898780823, "mean_token_accuracy": 0.8987863063812256, "num_tokens": 893428572.0, "sample_num_tokens": 7531.0, "step": 3297, "total_num_tokens": 893458696.0, "z_loss": 0.000667697167955339 }, { "copy_logits_max": -6.488887786865234, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.25, "epoch": 0.6735767168751595, "gen_logits_max": 5.27968692779541, "gen_logits_mean": -13.362445831298828, "gen_logits_min": -25.072376251220703, "gen_logits_std": 2.6023080348968506, "gen_loss": 0.30589061975479126, "grad_norm": 0.4205321427135293, "learning_rate": 2.6763789473684212e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9955088198184967, "mean_gen_accuracy": 0.8594519048929214, "mean_token_accuracy": 0.8958758115768433, "num_tokens": 893711581.0, "sample_num_tokens": 7548.25, "step": 3298, "total_num_tokens": 893741774.0, "z_loss": 0.0007436256855726242 }, { "copy_logits_max": -3.428110122680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.125, "epoch": 0.6737809548123564, "gen_logits_max": 5.747344017028809, "gen_logits_mean": -13.442472457885742, "gen_logits_min": -25.621950149536133, "gen_logits_std": 2.6601548194885254, "gen_loss": 0.3520928621292114, "grad_norm": 0.4212705725556114, "learning_rate": 2.6762526315789476e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9943195283412933, "mean_gen_accuracy": 0.8601125031709671, "mean_token_accuracy": 0.8929844945669174, "num_tokens": 893973356.0, "sample_num_tokens": 7804.5, "step": 3299, "total_num_tokens": 894004574.0, "z_loss": 0.000830169185064733 }, { "copy_logits_max": -6.130685806274414, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.9375, "epoch": 0.6739851927495533, "gen_logits_max": 4.815500259399414, "gen_logits_mean": -14.9219970703125, "gen_logits_min": -26.04009437561035, "gen_logits_std": 2.565399646759033, "gen_loss": 0.31452447175979614, "grad_norm": 0.40961995850865507, "learning_rate": 2.6761263157894737e-05, "loss": 0.3198, "mean_copy_accuracy": 0.9935425370931625, "mean_gen_accuracy": 0.8702698796987534, "mean_token_accuracy": 0.8960038274526596, "num_tokens": 894241209.0, "sample_num_tokens": 8207.25, "step": 3300, "total_num_tokens": 894274038.0, "z_loss": 0.0006685920525342226 }, { "copy_logits_max": -4.367159843444824, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.4375, "epoch": 0.6741894306867501, "gen_logits_max": 4.990637302398682, "gen_logits_mean": -13.433703422546387, "gen_logits_min": -25.105995178222656, "gen_logits_std": 2.635528087615967, "gen_loss": 0.3106582760810852, "grad_norm": 0.37712982108917653, "learning_rate": 2.676e-05, "loss": 0.3076, "mean_copy_accuracy": 0.9961893111467361, "mean_gen_accuracy": 0.8549489974975586, "mean_token_accuracy": 0.8971179872751236, "num_tokens": 894525366.0, "sample_num_tokens": 9103.0, "step": 3301, "total_num_tokens": 894561778.0, "z_loss": 0.0007683264557272196 }, { "copy_logits_max": -6.608102798461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.5625, "epoch": 0.6743936686239469, "gen_logits_max": 4.08840274810791, "gen_logits_mean": -15.66541862487793, "gen_logits_min": -27.21055793762207, "gen_logits_std": 2.6408534049987793, "gen_loss": 0.2769576907157898, "grad_norm": 0.39898962114658976, "learning_rate": 2.6758736842105262e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9957433044910431, "mean_gen_accuracy": 0.8717624396085739, "mean_token_accuracy": 0.9002868980169296, "num_tokens": 894794973.0, "sample_num_tokens": 8969.25, "step": 3302, "total_num_tokens": 894830850.0, "z_loss": 0.0006052880780771375 }, { "copy_logits_max": -6.226556301116943, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.875, "epoch": 0.6745979065611437, "gen_logits_max": 5.491274833679199, "gen_logits_mean": -13.903359413146973, "gen_logits_min": -26.015060424804688, "gen_logits_std": 2.669128894805908, "gen_loss": 0.34525641798973083, "grad_norm": 0.3978539786974081, "learning_rate": 2.6757473684210527e-05, "loss": 0.302, "mean_copy_accuracy": 0.9947252720594406, "mean_gen_accuracy": 0.8675441741943359, "mean_token_accuracy": 0.900098979473114, "num_tokens": 895094434.0, "sample_num_tokens": 7883.5, "step": 3303, "total_num_tokens": 895125968.0, "z_loss": 0.0007235313532873988 }, { "copy_logits_max": -5.795349597930908, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.5625, "epoch": 0.6748021444983405, "gen_logits_max": 6.325939655303955, "gen_logits_mean": -11.833698272705078, "gen_logits_min": -24.48087501525879, "gen_logits_std": 2.682671546936035, "gen_loss": 0.311768114566803, "grad_norm": 0.4191249743755066, "learning_rate": 2.6756210526315787e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9948983788490295, "mean_gen_accuracy": 0.8639901876449585, "mean_token_accuracy": 0.8974743485450745, "num_tokens": 895379985.0, "sample_num_tokens": 9004.75, "step": 3304, "total_num_tokens": 895416004.0, "z_loss": 0.0006744872080162168 }, { "copy_logits_max": -6.351657867431641, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.625, "epoch": 0.6750063824355375, "gen_logits_max": 4.958728790283203, "gen_logits_mean": -14.43878173828125, "gen_logits_min": -26.331693649291992, "gen_logits_std": 2.7086973190307617, "gen_loss": 0.30867254734039307, "grad_norm": 0.4261548795767172, "learning_rate": 2.6754947368421052e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9949504882097244, "mean_gen_accuracy": 0.8629265874624252, "mean_token_accuracy": 0.8997434824705124, "num_tokens": 895664563.0, "sample_num_tokens": 8110.75, "step": 3305, "total_num_tokens": 895697006.0, "z_loss": 0.0006621445063501596 }, { "copy_logits_max": -4.973626136779785, "copy_logits_min": -687500032.0, "copy_num_tokens": 668.375, "epoch": 0.6752106203727343, "gen_logits_max": 4.030279159545898, "gen_logits_mean": -14.333076477050781, "gen_logits_min": -26.6292724609375, "gen_logits_std": 2.7157187461853027, "gen_loss": 0.31305158138275146, "grad_norm": 0.4228063948046986, "learning_rate": 2.675368421052632e-05, "loss": 0.309, "mean_copy_accuracy": 0.9948065876960754, "mean_gen_accuracy": 0.8635625541210175, "mean_token_accuracy": 0.8954679667949677, "num_tokens": 895918981.0, "sample_num_tokens": 9140.75, "step": 3306, "total_num_tokens": 895955544.0, "z_loss": 0.000713453977368772 }, { "copy_logits_max": -5.642221927642822, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.875, "epoch": 0.6754148583099311, "gen_logits_max": 5.317718505859375, "gen_logits_mean": -13.59608268737793, "gen_logits_min": -25.427762985229492, "gen_logits_std": 2.674543619155884, "gen_loss": 0.285043329000473, "grad_norm": 0.4411260575564237, "learning_rate": 2.675242105263158e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9952645599842072, "mean_gen_accuracy": 0.8703533560037613, "mean_token_accuracy": 0.8999684303998947, "num_tokens": 896196747.0, "sample_num_tokens": 8723.25, "step": 3307, "total_num_tokens": 896231640.0, "z_loss": 0.0006053770775906742 }, { "copy_logits_max": -6.326395034790039, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.75, "epoch": 0.6756190962471279, "gen_logits_max": 6.458610534667969, "gen_logits_mean": -13.142414093017578, "gen_logits_min": -25.538658142089844, "gen_logits_std": 2.668724536895752, "gen_loss": 0.33724167943000793, "grad_norm": 0.41289584995507994, "learning_rate": 2.6751157894736845e-05, "loss": 0.3127, "mean_copy_accuracy": 0.993696540594101, "mean_gen_accuracy": 0.8683346658945084, "mean_token_accuracy": 0.8959973752498627, "num_tokens": 896485588.0, "sample_num_tokens": 7238.5, "step": 3308, "total_num_tokens": 896514542.0, "z_loss": 0.0007907820399850607 }, { "copy_logits_max": -6.142658233642578, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.4375, "epoch": 0.6758233341843247, "gen_logits_max": 6.210304260253906, "gen_logits_mean": -13.525169372558594, "gen_logits_min": -25.12082290649414, "gen_logits_std": 2.6445131301879883, "gen_loss": 0.3389472961425781, "grad_norm": 0.4293397513397531, "learning_rate": 2.6749894736842106e-05, "loss": 0.3188, "mean_copy_accuracy": 0.9941000044345856, "mean_gen_accuracy": 0.8692230433225632, "mean_token_accuracy": 0.8955346643924713, "num_tokens": 896740827.0, "sample_num_tokens": 8227.75, "step": 3309, "total_num_tokens": 896773738.0, "z_loss": 0.0007399986498057842 }, { "copy_logits_max": -4.59623384475708, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.625, "epoch": 0.6760275721215215, "gen_logits_max": 4.095397472381592, "gen_logits_mean": -14.473426818847656, "gen_logits_min": -26.774633407592773, "gen_logits_std": 2.660038471221924, "gen_loss": 0.3043040633201599, "grad_norm": 0.41227990362469447, "learning_rate": 2.674863157894737e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9956555068492889, "mean_gen_accuracy": 0.8666835725307465, "mean_token_accuracy": 0.9011159688234329, "num_tokens": 897014494.0, "sample_num_tokens": 9251.5, "step": 3310, "total_num_tokens": 897051500.0, "z_loss": 0.0007006963714957237 }, { "copy_logits_max": -6.254307746887207, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.6875, "epoch": 0.6762318100587184, "gen_logits_max": 5.222551345825195, "gen_logits_mean": -14.181559562683105, "gen_logits_min": -26.195520401000977, "gen_logits_std": 2.664285659790039, "gen_loss": 0.34467387199401855, "grad_norm": 0.4542164840991043, "learning_rate": 2.674736842105263e-05, "loss": 0.3325, "mean_copy_accuracy": 0.9952260851860046, "mean_gen_accuracy": 0.8600129783153534, "mean_token_accuracy": 0.8899937570095062, "num_tokens": 897272309.0, "sample_num_tokens": 9204.75, "step": 3311, "total_num_tokens": 897309128.0, "z_loss": 0.0007599439122714102 }, { "copy_logits_max": -3.8465311527252197, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.75, "epoch": 0.6764360479959153, "gen_logits_max": 4.869203567504883, "gen_logits_mean": -13.571534156799316, "gen_logits_min": -25.775775909423828, "gen_logits_std": 2.6884169578552246, "gen_loss": 0.3108476996421814, "grad_norm": 0.42021157930242864, "learning_rate": 2.6746105263157895e-05, "loss": 0.3246, "mean_copy_accuracy": 0.9959973692893982, "mean_gen_accuracy": 0.8577123135328293, "mean_token_accuracy": 0.8931379318237305, "num_tokens": 897542328.0, "sample_num_tokens": 9170.5, "step": 3312, "total_num_tokens": 897579010.0, "z_loss": 0.0007177679799497128 }, { "copy_logits_max": -1.6064720153808594, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.5625, "epoch": 0.6766402859331121, "gen_logits_max": 4.712050437927246, "gen_logits_mean": -14.779035568237305, "gen_logits_min": -26.848758697509766, "gen_logits_std": 2.7024621963500977, "gen_loss": 0.3398522138595581, "grad_norm": 0.46863943725891966, "learning_rate": 2.6744842105263156e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9946446120738983, "mean_gen_accuracy": 0.8529535382986069, "mean_token_accuracy": 0.8925100713968277, "num_tokens": 897825082.0, "sample_num_tokens": 7884.5, "step": 3313, "total_num_tokens": 897856620.0, "z_loss": 0.0007658319664187729 }, { "copy_logits_max": -3.4058475494384766, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.1875, "epoch": 0.6768445238703089, "gen_logits_max": 4.729520797729492, "gen_logits_mean": -14.102363586425781, "gen_logits_min": -26.399198532104492, "gen_logits_std": 2.6619181632995605, "gen_loss": 0.27763375639915466, "grad_norm": 0.4109619040220681, "learning_rate": 2.6743578947368424e-05, "loss": 0.3041, "mean_copy_accuracy": 0.9943185746669769, "mean_gen_accuracy": 0.8684735745191574, "mean_token_accuracy": 0.8997338265180588, "num_tokens": 898091286.0, "sample_num_tokens": 8831.5, "step": 3314, "total_num_tokens": 898126612.0, "z_loss": 0.0006621062057092786 }, { "copy_logits_max": -4.168949127197266, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.875, "epoch": 0.6770487618075057, "gen_logits_max": 6.228281021118164, "gen_logits_mean": -13.852666854858398, "gen_logits_min": -25.2381649017334, "gen_logits_std": 2.618313789367676, "gen_loss": 0.3076251745223999, "grad_norm": 0.42455884275556666, "learning_rate": 2.6742315789473685e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9945283234119415, "mean_gen_accuracy": 0.8683340400457382, "mean_token_accuracy": 0.8986364156007767, "num_tokens": 898354021.0, "sample_num_tokens": 8152.25, "step": 3315, "total_num_tokens": 898386630.0, "z_loss": 0.0007006992236711085 }, { "copy_logits_max": -4.86395263671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.8125, "epoch": 0.6772529997447025, "gen_logits_max": 6.0961480140686035, "gen_logits_mean": -13.585161209106445, "gen_logits_min": -25.338253021240234, "gen_logits_std": 2.6504368782043457, "gen_loss": 0.29929399490356445, "grad_norm": 0.39475009251718285, "learning_rate": 2.674105263157895e-05, "loss": 0.2946, "mean_copy_accuracy": 0.9941260516643524, "mean_gen_accuracy": 0.8745507150888443, "mean_token_accuracy": 0.9019224941730499, "num_tokens": 898616590.0, "sample_num_tokens": 7439.5, "step": 3316, "total_num_tokens": 898646348.0, "z_loss": 0.0006043602479621768 }, { "copy_logits_max": -3.0168890953063965, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.3125, "epoch": 0.6774572376818994, "gen_logits_max": 6.017241477966309, "gen_logits_mean": -13.901102066040039, "gen_logits_min": -25.687273025512695, "gen_logits_std": 2.6828627586364746, "gen_loss": 0.2953200340270996, "grad_norm": 0.39144649781079505, "learning_rate": 2.673978947368421e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9960199594497681, "mean_gen_accuracy": 0.8655889183282852, "mean_token_accuracy": 0.8970262706279755, "num_tokens": 898881797.0, "sample_num_tokens": 9562.25, "step": 3317, "total_num_tokens": 898920046.0, "z_loss": 0.0006483410252258182 }, { "copy_logits_max": -2.9011855125427246, "copy_logits_min": -750000064.0, "copy_num_tokens": 384.75, "epoch": 0.6776614756190963, "gen_logits_max": 5.366448879241943, "gen_logits_mean": -14.316204071044922, "gen_logits_min": -26.073837280273438, "gen_logits_std": 2.6704885959625244, "gen_loss": 0.34810665249824524, "grad_norm": 0.45608519364243805, "learning_rate": 2.6738526315789474e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9930213242769241, "mean_gen_accuracy": 0.866714671254158, "mean_token_accuracy": 0.8952442705631256, "num_tokens": 899144844.0, "sample_num_tokens": 8180.5, "step": 3318, "total_num_tokens": 899177566.0, "z_loss": 0.0007602674304507673 }, { "copy_logits_max": -1.725069284439087, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.0625, "epoch": 0.6778657135562931, "gen_logits_max": 5.600547790527344, "gen_logits_mean": -13.882205963134766, "gen_logits_min": -25.83255386352539, "gen_logits_std": 2.6841554641723633, "gen_loss": 0.29217177629470825, "grad_norm": 0.37681332295379427, "learning_rate": 2.673726315789474e-05, "loss": 0.3043, "mean_copy_accuracy": 0.9959110170602798, "mean_gen_accuracy": 0.8664386123418808, "mean_token_accuracy": 0.8990859538316727, "num_tokens": 899413596.0, "sample_num_tokens": 8122.5, "step": 3319, "total_num_tokens": 899446086.0, "z_loss": 0.0006913359393365681 }, { "copy_logits_max": -6.337961673736572, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.6875, "epoch": 0.6780699514934899, "gen_logits_max": 5.8386454582214355, "gen_logits_mean": -14.336132049560547, "gen_logits_min": -25.836734771728516, "gen_logits_std": 2.607886552810669, "gen_loss": 0.35638242959976196, "grad_norm": 0.41676904779136, "learning_rate": 2.6736e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9950374811887741, "mean_gen_accuracy": 0.8667150437831879, "mean_token_accuracy": 0.8949855118989944, "num_tokens": 899684289.0, "sample_num_tokens": 7440.75, "step": 3320, "total_num_tokens": 899714052.0, "z_loss": 0.0007249053451232612 }, { "copy_logits_max": -5.137044906616211, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.1875, "epoch": 0.6782741894306867, "gen_logits_max": 5.421318054199219, "gen_logits_mean": -14.363988876342773, "gen_logits_min": -26.11524200439453, "gen_logits_std": 2.6788787841796875, "gen_loss": 0.32464975118637085, "grad_norm": 0.4503378532742131, "learning_rate": 2.6734736842105264e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9943758845329285, "mean_gen_accuracy": 0.866705447435379, "mean_token_accuracy": 0.8952936828136444, "num_tokens": 899953406.0, "sample_num_tokens": 8113.0, "step": 3321, "total_num_tokens": 899985858.0, "z_loss": 0.000779115769546479 }, { "copy_logits_max": -3.317152976989746, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.0, "epoch": 0.6784784273678836, "gen_logits_max": 5.501339912414551, "gen_logits_mean": -15.026179313659668, "gen_logits_min": -26.758155822753906, "gen_logits_std": 2.694370985031128, "gen_loss": 0.2893163561820984, "grad_norm": 0.4475366170818731, "learning_rate": 2.6733473684210528e-05, "loss": 0.3203, "mean_copy_accuracy": 0.994989350438118, "mean_gen_accuracy": 0.8664486110210419, "mean_token_accuracy": 0.8943920582532883, "num_tokens": 900191095.0, "sample_num_tokens": 8122.75, "step": 3322, "total_num_tokens": 900223586.0, "z_loss": 0.0006814380176365376 }, { "copy_logits_max": -2.413144588470459, "copy_logits_min": -750000000.0, "copy_num_tokens": 813.5625, "epoch": 0.6786826653050804, "gen_logits_max": 4.54152774810791, "gen_logits_mean": -14.272493362426758, "gen_logits_min": -26.385417938232422, "gen_logits_std": 2.757692813873291, "gen_loss": 0.268391877412796, "grad_norm": 0.478085104013758, "learning_rate": 2.6732210526315792e-05, "loss": 0.311, "mean_copy_accuracy": 0.9927960932254791, "mean_gen_accuracy": 0.866208866238594, "mean_token_accuracy": 0.8985386937856674, "num_tokens": 900448418.0, "sample_num_tokens": 9134.5, "step": 3323, "total_num_tokens": 900484956.0, "z_loss": 0.000646948697976768 }, { "copy_logits_max": -3.1123642921447754, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.5, "epoch": 0.6788869032422773, "gen_logits_max": 5.231541156768799, "gen_logits_mean": -14.684572219848633, "gen_logits_min": -26.790550231933594, "gen_logits_std": 2.711392879486084, "gen_loss": 0.30887967348098755, "grad_norm": 0.42468780542935664, "learning_rate": 2.6730947368421053e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9960017949342728, "mean_gen_accuracy": 0.857537031173706, "mean_token_accuracy": 0.8924150317907333, "num_tokens": 900725514.0, "sample_num_tokens": 9508.5, "step": 3324, "total_num_tokens": 900763548.0, "z_loss": 0.0007044784142635763 }, { "copy_logits_max": -2.7795581817626953, "copy_logits_min": -625000064.0, "copy_num_tokens": 392.4375, "epoch": 0.6790911411794741, "gen_logits_max": 5.360805034637451, "gen_logits_mean": -14.707460403442383, "gen_logits_min": -26.939098358154297, "gen_logits_std": 2.7263526916503906, "gen_loss": 0.32378584146499634, "grad_norm": 0.4631226430797048, "learning_rate": 2.6729684210526318e-05, "loss": 0.3094, "mean_copy_accuracy": 0.994457870721817, "mean_gen_accuracy": 0.8660553842782974, "mean_token_accuracy": 0.8968250155448914, "num_tokens": 900996343.0, "sample_num_tokens": 7151.75, "step": 3325, "total_num_tokens": 901024950.0, "z_loss": 0.0007565323612652719 }, { "copy_logits_max": -3.6833884716033936, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.375, "epoch": 0.6792953791166709, "gen_logits_max": 6.8461785316467285, "gen_logits_mean": -12.896628379821777, "gen_logits_min": -24.913368225097656, "gen_logits_std": 2.7195844650268555, "gen_loss": 0.3480519652366638, "grad_norm": 0.5461737474907559, "learning_rate": 2.672842105263158e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9933053106069565, "mean_gen_accuracy": 0.8662443161010742, "mean_token_accuracy": 0.8958221673965454, "num_tokens": 901254134.0, "sample_num_tokens": 8449.5, "step": 3326, "total_num_tokens": 901287932.0, "z_loss": 0.0006953405099920928 }, { "copy_logits_max": -4.56376314163208, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.375, "epoch": 0.6794996170538677, "gen_logits_max": 5.5873212814331055, "gen_logits_mean": -15.514790534973145, "gen_logits_min": -27.34443473815918, "gen_logits_std": 2.7505102157592773, "gen_loss": 0.31633591651916504, "grad_norm": 0.41590616657224266, "learning_rate": 2.6727157894736843e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9959790408611298, "mean_gen_accuracy": 0.8651429861783981, "mean_token_accuracy": 0.8953571617603302, "num_tokens": 901522796.0, "sample_num_tokens": 9226.5, "step": 3327, "total_num_tokens": 901559702.0, "z_loss": 0.0007143120747059584 }, { "copy_logits_max": -2.079503059387207, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.8125, "epoch": 0.6797038549910646, "gen_logits_max": 4.816215991973877, "gen_logits_mean": -15.681558609008789, "gen_logits_min": -27.383520126342773, "gen_logits_std": 2.7147068977355957, "gen_loss": 0.31065672636032104, "grad_norm": 0.44597011036795253, "learning_rate": 2.6725894736842104e-05, "loss": 0.2903, "mean_copy_accuracy": 0.993582546710968, "mean_gen_accuracy": 0.8703425526618958, "mean_token_accuracy": 0.9016133248806, "num_tokens": 901818625.0, "sample_num_tokens": 6856.75, "step": 3328, "total_num_tokens": 901846052.0, "z_loss": 0.0008902779081836343 }, { "copy_logits_max": 1.20814847946167, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.375, "epoch": 0.6799080929282614, "gen_logits_max": 5.2621989250183105, "gen_logits_mean": -14.439596176147461, "gen_logits_min": -26.64586639404297, "gen_logits_std": 2.7586400508880615, "gen_loss": 0.3101833462715149, "grad_norm": 0.43711390997514793, "learning_rate": 2.6724631578947368e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9944479316473007, "mean_gen_accuracy": 0.8649354428052902, "mean_token_accuracy": 0.8957139998674393, "num_tokens": 902075887.0, "sample_num_tokens": 8016.25, "step": 3329, "total_num_tokens": 902107952.0, "z_loss": 0.001149053918197751 }, { "copy_logits_max": 4.921622276306152, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.0, "epoch": 0.6801123308654583, "gen_logits_max": 6.248661994934082, "gen_logits_mean": -12.739505767822266, "gen_logits_min": -25.21783447265625, "gen_logits_std": 2.784212350845337, "gen_loss": 0.306286484003067, "grad_norm": 0.4067133888222094, "learning_rate": 2.6723368421052632e-05, "loss": 0.303, "mean_copy_accuracy": 0.9946351200342178, "mean_gen_accuracy": 0.8702284246683121, "mean_token_accuracy": 0.8996918350458145, "num_tokens": 902346275.0, "sample_num_tokens": 8618.75, "step": 3330, "total_num_tokens": 902380750.0, "z_loss": 0.001059508416801691 }, { "copy_logits_max": -0.2046276479959488, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.625, "epoch": 0.6803165688026551, "gen_logits_max": 5.605086326599121, "gen_logits_mean": -13.836067199707031, "gen_logits_min": -25.35304832458496, "gen_logits_std": 2.7386887073516846, "gen_loss": 0.35744917392730713, "grad_norm": 0.5388610234411844, "learning_rate": 2.6722105263157897e-05, "loss": 0.3393, "mean_copy_accuracy": 0.992773100733757, "mean_gen_accuracy": 0.8560008257627487, "mean_token_accuracy": 0.8871454447507858, "num_tokens": 902605878.0, "sample_num_tokens": 7874.5, "step": 3331, "total_num_tokens": 902637376.0, "z_loss": 0.0008718195022083819 }, { "copy_logits_max": -1.0103052854537964, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.6875, "epoch": 0.6805208067398519, "gen_logits_max": 5.009286880493164, "gen_logits_mean": -15.47846794128418, "gen_logits_min": -27.200284957885742, "gen_logits_std": 2.722463369369507, "gen_loss": 0.2977234125137329, "grad_norm": 0.43512598804769176, "learning_rate": 2.672084210526316e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9948161244392395, "mean_gen_accuracy": 0.8655660599470139, "mean_token_accuracy": 0.8974646180868149, "num_tokens": 902876065.0, "sample_num_tokens": 7453.75, "step": 3332, "total_num_tokens": 902905880.0, "z_loss": 0.0008330881246365607 }, { "copy_logits_max": 1.481971025466919, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.5, "epoch": 0.6807250446770488, "gen_logits_max": 4.508203983306885, "gen_logits_mean": -15.080070495605469, "gen_logits_min": -27.150634765625, "gen_logits_std": 2.7323288917541504, "gen_loss": 0.2831266522407532, "grad_norm": 0.4903586730322245, "learning_rate": 2.6719578947368422e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9946489483118057, "mean_gen_accuracy": 0.8686046153306961, "mean_token_accuracy": 0.8994058519601822, "num_tokens": 903152137.0, "sample_num_tokens": 11269.75, "step": 3333, "total_num_tokens": 903197216.0, "z_loss": 0.0006862896261736751 }, { "copy_logits_max": -2.4923524856567383, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.625, "epoch": 0.6809292826142456, "gen_logits_max": 4.897708415985107, "gen_logits_mean": -14.440906524658203, "gen_logits_min": -25.971574783325195, "gen_logits_std": 2.720973253250122, "gen_loss": 0.2955611050128937, "grad_norm": 0.42492155931519626, "learning_rate": 2.6718315789473686e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9955431967973709, "mean_gen_accuracy": 0.8745090365409851, "mean_token_accuracy": 0.906415730714798, "num_tokens": 903434706.0, "sample_num_tokens": 9365.5, "step": 3334, "total_num_tokens": 903472168.0, "z_loss": 0.0006552984123118222 }, { "copy_logits_max": -1.95573091506958, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.6875, "epoch": 0.6811335205514424, "gen_logits_max": 5.657707691192627, "gen_logits_mean": -14.25546646118164, "gen_logits_min": -25.975387573242188, "gen_logits_std": 2.7301714420318604, "gen_loss": 0.30189067125320435, "grad_norm": 0.4766672399568982, "learning_rate": 2.6717052631578947e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9933636784553528, "mean_gen_accuracy": 0.8609696328639984, "mean_token_accuracy": 0.8929184228181839, "num_tokens": 903702200.0, "sample_num_tokens": 9356.5, "step": 3335, "total_num_tokens": 903739626.0, "z_loss": 0.0006788918981328607 }, { "copy_logits_max": 0.16403448581695557, "copy_logits_min": -687500032.0, "copy_num_tokens": 474.4375, "epoch": 0.6813377584886393, "gen_logits_max": 5.123122215270996, "gen_logits_mean": -14.52501106262207, "gen_logits_min": -26.58786392211914, "gen_logits_std": 2.7684426307678223, "gen_loss": 0.28117895126342773, "grad_norm": 0.43143366773017483, "learning_rate": 2.671578947368421e-05, "loss": 0.3061, "mean_copy_accuracy": 0.9943967908620834, "mean_gen_accuracy": 0.8647186011075974, "mean_token_accuracy": 0.898223415017128, "num_tokens": 903980731.0, "sample_num_tokens": 8358.75, "step": 3336, "total_num_tokens": 904014166.0, "z_loss": 0.0007177486550062895 }, { "copy_logits_max": -0.695281445980072, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.1875, "epoch": 0.6815419964258361, "gen_logits_max": 5.330358982086182, "gen_logits_mean": -13.80369758605957, "gen_logits_min": -25.713640213012695, "gen_logits_std": 2.7160403728485107, "gen_loss": 0.3222014307975769, "grad_norm": 0.41870403846708026, "learning_rate": 2.6714526315789472e-05, "loss": 0.3306, "mean_copy_accuracy": 0.9947437793016434, "mean_gen_accuracy": 0.8600637167692184, "mean_token_accuracy": 0.8890818953514099, "num_tokens": 904238940.0, "sample_num_tokens": 10108.0, "step": 3337, "total_num_tokens": 904279372.0, "z_loss": 0.0008071707561612129 }, { "copy_logits_max": -0.9333351254463196, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0625, "epoch": 0.6817462343630329, "gen_logits_max": 5.329444885253906, "gen_logits_mean": -14.907413482666016, "gen_logits_min": -26.294097900390625, "gen_logits_std": 2.675375461578369, "gen_loss": 0.3504810333251953, "grad_norm": 0.4242506240036287, "learning_rate": 2.6713263157894737e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9954233914613724, "mean_gen_accuracy": 0.8653082549571991, "mean_token_accuracy": 0.8987036496400833, "num_tokens": 904523507.0, "sample_num_tokens": 8028.25, "step": 3338, "total_num_tokens": 904555620.0, "z_loss": 0.0008248510421253741 }, { "copy_logits_max": -0.25440657138824463, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.9375, "epoch": 0.6819504723002298, "gen_logits_max": 4.941169738769531, "gen_logits_mean": -14.42523193359375, "gen_logits_min": -26.716012954711914, "gen_logits_std": 2.7132511138916016, "gen_loss": 0.3215731680393219, "grad_norm": 0.4126816329458178, "learning_rate": 2.6712e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9942582100629807, "mean_gen_accuracy": 0.8640043586492538, "mean_token_accuracy": 0.8971186578273773, "num_tokens": 904776595.0, "sample_num_tokens": 9049.25, "step": 3339, "total_num_tokens": 904812792.0, "z_loss": 0.000812059675808996 }, { "copy_logits_max": -2.773667335510254, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.6875, "epoch": 0.6821547102374266, "gen_logits_max": 5.25745964050293, "gen_logits_mean": -15.290897369384766, "gen_logits_min": -27.052837371826172, "gen_logits_std": 2.695560932159424, "gen_loss": 0.30992716550827026, "grad_norm": 0.39543977729315744, "learning_rate": 2.6710736842105265e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9948337525129318, "mean_gen_accuracy": 0.8640754222869873, "mean_token_accuracy": 0.8955938220024109, "num_tokens": 905038799.0, "sample_num_tokens": 7704.25, "step": 3340, "total_num_tokens": 905069616.0, "z_loss": 0.000738698523491621 }, { "copy_logits_max": -3.019467353820801, "copy_logits_min": -750000064.0, "copy_num_tokens": 349.6875, "epoch": 0.6823589481746234, "gen_logits_max": 5.790889739990234, "gen_logits_mean": -15.373936653137207, "gen_logits_min": -26.893741607666016, "gen_logits_std": 2.709321975708008, "gen_loss": 0.3066157400608063, "grad_norm": 0.44128738189759936, "learning_rate": 2.6709473684210526e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9937836527824402, "mean_gen_accuracy": 0.8618467152118683, "mean_token_accuracy": 0.8908782601356506, "num_tokens": 905294143.0, "sample_num_tokens": 7645.25, "step": 3341, "total_num_tokens": 905324724.0, "z_loss": 0.0007277890108525753 }, { "copy_logits_max": 1.5797538757324219, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.25, "epoch": 0.6825631861118203, "gen_logits_max": 6.77889347076416, "gen_logits_mean": -12.942667007446289, "gen_logits_min": -25.50843048095703, "gen_logits_std": 2.7512331008911133, "gen_loss": 0.30081453919410706, "grad_norm": 0.41578275881369475, "learning_rate": 2.670821052631579e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9961225837469101, "mean_gen_accuracy": 0.8668017983436584, "mean_token_accuracy": 0.9025186151266098, "num_tokens": 905558874.0, "sample_num_tokens": 8769.5, "step": 3342, "total_num_tokens": 905593952.0, "z_loss": 0.0008263927884399891 }, { "copy_logits_max": 0.5245798826217651, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.375, "epoch": 0.6827674240490171, "gen_logits_max": 5.641638278961182, "gen_logits_mean": -14.18193531036377, "gen_logits_min": -25.853015899658203, "gen_logits_std": 2.6747941970825195, "gen_loss": 0.3254215717315674, "grad_norm": 0.42475123077326893, "learning_rate": 2.670694736842105e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9951234608888626, "mean_gen_accuracy": 0.8610270768404007, "mean_token_accuracy": 0.8987099230289459, "num_tokens": 905829639.0, "sample_num_tokens": 7183.25, "step": 3343, "total_num_tokens": 905858372.0, "z_loss": 0.0007834420539438725 }, { "copy_logits_max": -4.730445861816406, "copy_logits_min": -750000064.0, "copy_num_tokens": 299.6875, "epoch": 0.682971661986214, "gen_logits_max": 6.225008010864258, "gen_logits_mean": -14.449592590332031, "gen_logits_min": -26.34494972229004, "gen_logits_std": 2.7150440216064453, "gen_loss": 0.30109697580337524, "grad_norm": 0.5197091685154556, "learning_rate": 2.6705684210526316e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9954275339841843, "mean_gen_accuracy": 0.8663529753684998, "mean_token_accuracy": 0.8982131630182266, "num_tokens": 906080863.0, "sample_num_tokens": 6496.75, "step": 3344, "total_num_tokens": 906106850.0, "z_loss": 0.0006674023461528122 }, { "copy_logits_max": -0.017523914575576782, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.75, "epoch": 0.6831758999234108, "gen_logits_max": 5.625706195831299, "gen_logits_mean": -14.026510238647461, "gen_logits_min": -26.036556243896484, "gen_logits_std": 2.7679381370544434, "gen_loss": 0.32081395387649536, "grad_norm": 0.42988207789576455, "learning_rate": 2.6704421052631577e-05, "loss": 0.2983, "mean_copy_accuracy": 0.993876188993454, "mean_gen_accuracy": 0.8717257082462311, "mean_token_accuracy": 0.900455042719841, "num_tokens": 906356248.0, "sample_num_tokens": 7814.5, "step": 3345, "total_num_tokens": 906387506.0, "z_loss": 0.0007308681961148977 }, { "copy_logits_max": -0.6835871338844299, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.9375, "epoch": 0.6833801378606076, "gen_logits_max": 6.537817001342773, "gen_logits_mean": -14.149086952209473, "gen_logits_min": -26.070499420166016, "gen_logits_std": 2.7414493560791016, "gen_loss": 0.27534613013267517, "grad_norm": 0.43414242049494517, "learning_rate": 2.670315789473684e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9939276874065399, "mean_gen_accuracy": 0.8672804981470108, "mean_token_accuracy": 0.8959842622280121, "num_tokens": 906611910.0, "sample_num_tokens": 7387.0, "step": 3346, "total_num_tokens": 906641458.0, "z_loss": 0.0006322302506305277 }, { "copy_logits_max": -1.7706801891326904, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.625, "epoch": 0.6835843757978044, "gen_logits_max": 5.6759443283081055, "gen_logits_mean": -14.517332077026367, "gen_logits_min": -26.749378204345703, "gen_logits_std": 2.7270002365112305, "gen_loss": 0.3558955490589142, "grad_norm": 0.42991915573837114, "learning_rate": 2.670189473684211e-05, "loss": 0.3093, "mean_copy_accuracy": 0.9947403818368912, "mean_gen_accuracy": 0.8627073168754578, "mean_token_accuracy": 0.8970791846513748, "num_tokens": 906881806.0, "sample_num_tokens": 7726.5, "step": 3347, "total_num_tokens": 906912712.0, "z_loss": 0.000797073938883841 }, { "copy_logits_max": -0.6107455492019653, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.9375, "epoch": 0.6837886137350013, "gen_logits_max": 5.453004360198975, "gen_logits_mean": -14.249019622802734, "gen_logits_min": -26.084985733032227, "gen_logits_std": 2.7033214569091797, "gen_loss": 0.3208010196685791, "grad_norm": 0.5019530365222439, "learning_rate": 2.670063157894737e-05, "loss": 0.333, "mean_copy_accuracy": 0.9948305487632751, "mean_gen_accuracy": 0.8588448613882065, "mean_token_accuracy": 0.888036698102951, "num_tokens": 907128057.0, "sample_num_tokens": 8593.25, "step": 3348, "total_num_tokens": 907162430.0, "z_loss": 0.0006629338022321463 }, { "copy_logits_max": -2.2559046745300293, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.9375, "epoch": 0.6839928516721981, "gen_logits_max": 4.898279666900635, "gen_logits_mean": -15.08372688293457, "gen_logits_min": -26.96074104309082, "gen_logits_std": 2.7275729179382324, "gen_loss": 0.29599976539611816, "grad_norm": 0.42874154264978226, "learning_rate": 2.6699368421052634e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9943856447935104, "mean_gen_accuracy": 0.860662654042244, "mean_token_accuracy": 0.8937088400125504, "num_tokens": 907414694.0, "sample_num_tokens": 7854.5, "step": 3349, "total_num_tokens": 907446112.0, "z_loss": 0.0006227253470569849 }, { "copy_logits_max": 0.15073496103286743, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.375, "epoch": 0.684197089609395, "gen_logits_max": 5.169412612915039, "gen_logits_mean": -14.221311569213867, "gen_logits_min": -26.112682342529297, "gen_logits_std": 2.7159180641174316, "gen_loss": 0.31167423725128174, "grad_norm": 0.39214062159010465, "learning_rate": 2.6698105263157895e-05, "loss": 0.297, "mean_copy_accuracy": 0.9953473061323166, "mean_gen_accuracy": 0.8663181662559509, "mean_token_accuracy": 0.9006007015705109, "num_tokens": 907695220.0, "sample_num_tokens": 8830.0, "step": 3350, "total_num_tokens": 907730540.0, "z_loss": 0.0007825132342986763 }, { "copy_logits_max": -1.5655200481414795, "copy_logits_min": -687500032.0, "copy_num_tokens": 585.4375, "epoch": 0.6844013275465918, "gen_logits_max": 5.271512985229492, "gen_logits_mean": -14.322696685791016, "gen_logits_min": -26.57508087158203, "gen_logits_std": 2.7473883628845215, "gen_loss": 0.29375189542770386, "grad_norm": 0.4356320336452649, "learning_rate": 2.669684210526316e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9951346516609192, "mean_gen_accuracy": 0.8630962371826172, "mean_token_accuracy": 0.8980398029088974, "num_tokens": 907967589.0, "sample_num_tokens": 8798.25, "step": 3351, "total_num_tokens": 908002782.0, "z_loss": 0.0007097319466993213 }, { "copy_logits_max": -0.7139624953269958, "copy_logits_min": -562500032.0, "copy_num_tokens": 422.75, "epoch": 0.6846055654837886, "gen_logits_max": 4.984500885009766, "gen_logits_mean": -15.024980545043945, "gen_logits_min": -26.786739349365234, "gen_logits_std": 2.6959431171417236, "gen_loss": 0.3025127053260803, "grad_norm": 0.37624049212834065, "learning_rate": 2.669557894736842e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9949540346860886, "mean_gen_accuracy": 0.8612718880176544, "mean_token_accuracy": 0.8977220207452774, "num_tokens": 908243884.0, "sample_num_tokens": 8522.5, "step": 3352, "total_num_tokens": 908277974.0, "z_loss": 0.0006701437523588538 }, { "copy_logits_max": 1.1047110557556152, "copy_logits_min": -687500032.0, "copy_num_tokens": 713.125, "epoch": 0.6848098034209854, "gen_logits_max": 4.08322286605835, "gen_logits_mean": -14.942159652709961, "gen_logits_min": -27.641387939453125, "gen_logits_std": 2.7433605194091797, "gen_loss": 0.2969108819961548, "grad_norm": 0.4681537567607503, "learning_rate": 2.6694315789473684e-05, "loss": 0.3235, "mean_copy_accuracy": 0.9929527938365936, "mean_gen_accuracy": 0.861428901553154, "mean_token_accuracy": 0.8949724584817886, "num_tokens": 908520253.0, "sample_num_tokens": 10425.25, "step": 3353, "total_num_tokens": 908561954.0, "z_loss": 0.0007203970453701913 }, { "copy_logits_max": -0.7673349380493164, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.5625, "epoch": 0.6850140413581823, "gen_logits_max": 5.336250305175781, "gen_logits_mean": -14.508222579956055, "gen_logits_min": -26.62166976928711, "gen_logits_std": 2.717496395111084, "gen_loss": 0.3198871612548828, "grad_norm": 0.4200509073734994, "learning_rate": 2.6693052631578945e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9945390075445175, "mean_gen_accuracy": 0.868126854300499, "mean_token_accuracy": 0.8951445072889328, "num_tokens": 908781074.0, "sample_num_tokens": 8136.5, "step": 3354, "total_num_tokens": 908813620.0, "z_loss": 0.0007767698843963444 }, { "copy_logits_max": 1.9454253911972046, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.125, "epoch": 0.6852182792953792, "gen_logits_max": 4.955406188964844, "gen_logits_mean": -14.265449523925781, "gen_logits_min": -26.824764251708984, "gen_logits_std": 2.755387306213379, "gen_loss": 0.2801969051361084, "grad_norm": 0.5000008049827109, "learning_rate": 2.6691789473684213e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9956530928611755, "mean_gen_accuracy": 0.862565353512764, "mean_token_accuracy": 0.8973855823278427, "num_tokens": 909041602.0, "sample_num_tokens": 8719.0, "step": 3355, "total_num_tokens": 909076478.0, "z_loss": 0.0006946874782443047 }, { "copy_logits_max": -1.0366557836532593, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.75, "epoch": 0.685422517232576, "gen_logits_max": 5.9347124099731445, "gen_logits_mean": -14.330134391784668, "gen_logits_min": -26.65109634399414, "gen_logits_std": 2.7222414016723633, "gen_loss": 0.3085804581642151, "grad_norm": 0.4218624379887043, "learning_rate": 2.6690526315789474e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9944131374359131, "mean_gen_accuracy": 0.8600215911865234, "mean_token_accuracy": 0.8901148587465286, "num_tokens": 909305577.0, "sample_num_tokens": 7614.75, "step": 3356, "total_num_tokens": 909336036.0, "z_loss": 0.000747353711631149 }, { "copy_logits_max": -2.152766704559326, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.4375, "epoch": 0.6856267551697728, "gen_logits_max": 5.467042446136475, "gen_logits_mean": -14.071727752685547, "gen_logits_min": -26.116317749023438, "gen_logits_std": 2.7336244583129883, "gen_loss": 0.2967248558998108, "grad_norm": 0.41861254083302357, "learning_rate": 2.668926315789474e-05, "loss": 0.3106, "mean_copy_accuracy": 0.9937562942504883, "mean_gen_accuracy": 0.8688441514968872, "mean_token_accuracy": 0.8982440829277039, "num_tokens": 909557135.0, "sample_num_tokens": 8684.25, "step": 3357, "total_num_tokens": 909591872.0, "z_loss": 0.0006549215177074075 }, { "copy_logits_max": 0.17199689149856567, "copy_logits_min": -750000064.0, "copy_num_tokens": 758.4375, "epoch": 0.6858309931069696, "gen_logits_max": 4.877786636352539, "gen_logits_mean": -13.46219253540039, "gen_logits_min": -26.392805099487305, "gen_logits_std": 2.792834997177124, "gen_loss": 0.2804749310016632, "grad_norm": 0.40235172171490985, "learning_rate": 2.6688e-05, "loss": 0.324, "mean_copy_accuracy": 0.9956947565078735, "mean_gen_accuracy": 0.8613694757223129, "mean_token_accuracy": 0.8965510576963425, "num_tokens": 909848425.0, "sample_num_tokens": 10753.75, "step": 3358, "total_num_tokens": 909891440.0, "z_loss": 0.0006388179608620703 }, { "copy_logits_max": -1.0904419422149658, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.625, "epoch": 0.6860352310441664, "gen_logits_max": 5.686554431915283, "gen_logits_mean": -14.2012357711792, "gen_logits_min": -26.102519989013672, "gen_logits_std": 2.734848976135254, "gen_loss": 0.32366999983787537, "grad_norm": 0.4043096092797418, "learning_rate": 2.6686736842105264e-05, "loss": 0.3203, "mean_copy_accuracy": 0.9961217641830444, "mean_gen_accuracy": 0.8660403192043304, "mean_token_accuracy": 0.891492635011673, "num_tokens": 910119562.0, "sample_num_tokens": 7507.5, "step": 3359, "total_num_tokens": 910149592.0, "z_loss": 0.0007184514543041587 }, { "copy_logits_max": -0.4431915879249573, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.6875, "epoch": 0.6862394689813633, "gen_logits_max": 4.953088760375977, "gen_logits_mean": -14.234052658081055, "gen_logits_min": -26.785356521606445, "gen_logits_std": 2.7859272956848145, "gen_loss": 0.2976534366607666, "grad_norm": 0.4286601481637009, "learning_rate": 2.6685473684210528e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9952949285507202, "mean_gen_accuracy": 0.8626740872859955, "mean_token_accuracy": 0.89364093542099, "num_tokens": 910378059.0, "sample_num_tokens": 8138.25, "step": 3360, "total_num_tokens": 910410612.0, "z_loss": 0.0006608180701732635 }, { "copy_logits_max": -2.514716625213623, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0625, "epoch": 0.6864437069185602, "gen_logits_max": 5.045092582702637, "gen_logits_mean": -15.399280548095703, "gen_logits_min": -26.92185401916504, "gen_logits_std": 2.7166123390197754, "gen_loss": 0.32736673951148987, "grad_norm": 0.43472781044504755, "learning_rate": 2.668421052631579e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9951656311750412, "mean_gen_accuracy": 0.8699062764644623, "mean_token_accuracy": 0.8970035016536713, "num_tokens": 910623245.0, "sample_num_tokens": 7843.75, "step": 3361, "total_num_tokens": 910654620.0, "z_loss": 0.0007087705889716744 }, { "copy_logits_max": -0.6379243731498718, "copy_logits_min": -687500032.0, "copy_num_tokens": 489.0, "epoch": 0.686647944855757, "gen_logits_max": 5.528940200805664, "gen_logits_mean": -13.875761032104492, "gen_logits_min": -25.433317184448242, "gen_logits_std": 2.705303192138672, "gen_loss": 0.3466891944408417, "grad_norm": 0.4469909047882483, "learning_rate": 2.6682947368421053e-05, "loss": 0.3194, "mean_copy_accuracy": 0.994395524263382, "mean_gen_accuracy": 0.8617327213287354, "mean_token_accuracy": 0.8952540159225464, "num_tokens": 910877928.0, "sample_num_tokens": 8139.0, "step": 3362, "total_num_tokens": 910910484.0, "z_loss": 0.0007893192232586443 }, { "copy_logits_max": -0.3466886878013611, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.9375, "epoch": 0.6868521827929538, "gen_logits_max": 5.501019477844238, "gen_logits_mean": -14.363687515258789, "gen_logits_min": -26.603885650634766, "gen_logits_std": 2.7522146701812744, "gen_loss": 0.31824660301208496, "grad_norm": 0.429923847740335, "learning_rate": 2.6681684210526317e-05, "loss": 0.325, "mean_copy_accuracy": 0.9944430887699127, "mean_gen_accuracy": 0.8605962097644806, "mean_token_accuracy": 0.8929593563079834, "num_tokens": 911151356.0, "sample_num_tokens": 9432.0, "step": 3363, "total_num_tokens": 911189084.0, "z_loss": 0.0007218008395284414 }, { "copy_logits_max": -2.363615036010742, "copy_logits_min": -687500032.0, "copy_num_tokens": 508.875, "epoch": 0.6870564207301506, "gen_logits_max": 4.437591552734375, "gen_logits_mean": -15.877141952514648, "gen_logits_min": -27.84906578063965, "gen_logits_std": 2.706324577331543, "gen_loss": 0.31633588671684265, "grad_norm": 0.39992075640166175, "learning_rate": 2.6680421052631582e-05, "loss": 0.3257, "mean_copy_accuracy": 0.9951479732990265, "mean_gen_accuracy": 0.8652237355709076, "mean_token_accuracy": 0.8935987204313278, "num_tokens": 911420799.0, "sample_num_tokens": 8681.25, "step": 3364, "total_num_tokens": 911455524.0, "z_loss": 0.0007112300954759121 }, { "copy_logits_max": 1.142958641052246, "copy_logits_min": -687500032.0, "copy_num_tokens": 513.1875, "epoch": 0.6872606586673474, "gen_logits_max": 5.513175010681152, "gen_logits_mean": -14.204273223876953, "gen_logits_min": -25.85745620727539, "gen_logits_std": 2.734757423400879, "gen_loss": 0.3505750894546509, "grad_norm": 0.4165745058622904, "learning_rate": 2.6679157894736843e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9946221858263016, "mean_gen_accuracy": 0.8642972409725189, "mean_token_accuracy": 0.892818421125412, "num_tokens": 911684453.0, "sample_num_tokens": 8971.25, "step": 3365, "total_num_tokens": 911720338.0, "z_loss": 0.0007459205808117986 }, { "copy_logits_max": 0.3070957064628601, "copy_logits_min": -750000000.0, "copy_num_tokens": 716.4375, "epoch": 0.6874648966045442, "gen_logits_max": 4.88955020904541, "gen_logits_mean": -13.953387260437012, "gen_logits_min": -26.62596321105957, "gen_logits_std": 2.745706081390381, "gen_loss": 0.2749466896057129, "grad_norm": 0.3801817831546554, "learning_rate": 2.6677894736842107e-05, "loss": 0.2849, "mean_copy_accuracy": 0.994980126619339, "mean_gen_accuracy": 0.8700267672538757, "mean_token_accuracy": 0.9056469649076462, "num_tokens": 911986156.0, "sample_num_tokens": 10071.5, "step": 3366, "total_num_tokens": 912026442.0, "z_loss": 0.0006274307379499078 }, { "copy_logits_max": -1.4830713272094727, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.0, "epoch": 0.6876691345417412, "gen_logits_max": 5.353893280029297, "gen_logits_mean": -15.125502586364746, "gen_logits_min": -26.964031219482422, "gen_logits_std": 2.71713924407959, "gen_loss": 0.3262658417224884, "grad_norm": 0.38649107321396003, "learning_rate": 2.6676631578947368e-05, "loss": 0.3117, "mean_copy_accuracy": 0.995379626750946, "mean_gen_accuracy": 0.8591833710670471, "mean_token_accuracy": 0.8954437971115112, "num_tokens": 912284830.0, "sample_num_tokens": 7086.5, "step": 3367, "total_num_tokens": 912313176.0, "z_loss": 0.0006834260420873761 }, { "copy_logits_max": 1.8065333366394043, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.25, "epoch": 0.687873372478938, "gen_logits_max": 5.403020858764648, "gen_logits_mean": -13.590133666992188, "gen_logits_min": -25.81641387939453, "gen_logits_std": 2.742290496826172, "gen_loss": 0.33343565464019775, "grad_norm": 0.39460049681850806, "learning_rate": 2.6675368421052632e-05, "loss": 0.32, "mean_copy_accuracy": 0.9957771152257919, "mean_gen_accuracy": 0.854908898472786, "mean_token_accuracy": 0.8942706733942032, "num_tokens": 912561891.0, "sample_num_tokens": 8549.25, "step": 3368, "total_num_tokens": 912596088.0, "z_loss": 0.0006760135293006897 }, { "copy_logits_max": -0.1176801323890686, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.1875, "epoch": 0.6880776104161348, "gen_logits_max": 4.876480579376221, "gen_logits_mean": -13.837425231933594, "gen_logits_min": -25.780738830566406, "gen_logits_std": 2.703038215637207, "gen_loss": 0.31998729705810547, "grad_norm": 0.3867465034149663, "learning_rate": 2.6674105263157893e-05, "loss": 0.3041, "mean_copy_accuracy": 0.9955993443727493, "mean_gen_accuracy": 0.8672354817390442, "mean_token_accuracy": 0.8984262943267822, "num_tokens": 912846755.0, "sample_num_tokens": 8544.75, "step": 3369, "total_num_tokens": 912880934.0, "z_loss": 0.000654293573461473 }, { "copy_logits_max": -1.5924983024597168, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.875, "epoch": 0.6882818483533316, "gen_logits_max": 5.422947883605957, "gen_logits_mean": -14.379128456115723, "gen_logits_min": -26.416412353515625, "gen_logits_std": 2.720797538757324, "gen_loss": 0.3303617238998413, "grad_norm": 0.42494331302035004, "learning_rate": 2.6672842105263157e-05, "loss": 0.3334, "mean_copy_accuracy": 0.9959262758493423, "mean_gen_accuracy": 0.8580332547426224, "mean_token_accuracy": 0.889487698674202, "num_tokens": 913116493.0, "sample_num_tokens": 7970.25, "step": 3370, "total_num_tokens": 913148374.0, "z_loss": 0.0006378327962011099 }, { "copy_logits_max": 1.9691379070281982, "copy_logits_min": -625000064.0, "copy_num_tokens": 675.6875, "epoch": 0.6884860862905284, "gen_logits_max": 4.167448997497559, "gen_logits_mean": -14.421070098876953, "gen_logits_min": -26.969446182250977, "gen_logits_std": 2.748934030532837, "gen_loss": 0.2961600124835968, "grad_norm": 0.4133123045178724, "learning_rate": 2.6671578947368422e-05, "loss": 0.3201, "mean_copy_accuracy": 0.9956868588924408, "mean_gen_accuracy": 0.8621815890073776, "mean_token_accuracy": 0.8944580256938934, "num_tokens": 913381975.0, "sample_num_tokens": 9054.25, "step": 3371, "total_num_tokens": 913418192.0, "z_loss": 0.0006242012605071068 }, { "copy_logits_max": -0.6405490040779114, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.1875, "epoch": 0.6886903242277252, "gen_logits_max": 5.18461275100708, "gen_logits_mean": -13.83338737487793, "gen_logits_min": -25.625680923461914, "gen_logits_std": 2.746962070465088, "gen_loss": 0.30949828028678894, "grad_norm": 0.4046565704047593, "learning_rate": 2.6670315789473686e-05, "loss": 0.323, "mean_copy_accuracy": 0.9952278435230255, "mean_gen_accuracy": 0.8660959303379059, "mean_token_accuracy": 0.8960720896720886, "num_tokens": 913662581.0, "sample_num_tokens": 7978.25, "step": 3372, "total_num_tokens": 913694494.0, "z_loss": 0.0006199184572324157 }, { "copy_logits_max": -4.077255725860596, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.75, "epoch": 0.6888945621649222, "gen_logits_max": 4.527449607849121, "gen_logits_mean": -15.368751525878906, "gen_logits_min": -26.75674057006836, "gen_logits_std": 2.6857893466949463, "gen_loss": 0.30686065554618835, "grad_norm": 0.44686343348316915, "learning_rate": 2.666905263157895e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9943146109580994, "mean_gen_accuracy": 0.8598633706569672, "mean_token_accuracy": 0.8923052251338959, "num_tokens": 913931007.0, "sample_num_tokens": 8700.25, "step": 3373, "total_num_tokens": 913965808.0, "z_loss": 0.0006017200066708028 }, { "copy_logits_max": 1.3449357748031616, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.1875, "epoch": 0.689098800102119, "gen_logits_max": 3.9175877571105957, "gen_logits_mean": -15.16232681274414, "gen_logits_min": -27.342853546142578, "gen_logits_std": 2.7432401180267334, "gen_loss": 0.2807208001613617, "grad_norm": 0.3928685792433492, "learning_rate": 2.666778947368421e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9962980002164841, "mean_gen_accuracy": 0.8633153289556503, "mean_token_accuracy": 0.8991543352603912, "num_tokens": 914206785.0, "sample_num_tokens": 7904.25, "step": 3374, "total_num_tokens": 914238402.0, "z_loss": 0.0005830248119309545 }, { "copy_logits_max": 0.5260649919509888, "copy_logits_min": -750000000.0, "copy_num_tokens": 791.3125, "epoch": 0.6893030380393158, "gen_logits_max": 3.79244065284729, "gen_logits_mean": -15.107383728027344, "gen_logits_min": -27.687397003173828, "gen_logits_std": 2.7413032054901123, "gen_loss": 0.2862709164619446, "grad_norm": 0.3957758450456801, "learning_rate": 2.6666526315789476e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9954182952642441, "mean_gen_accuracy": 0.8631915748119354, "mean_token_accuracy": 0.9040699452161789, "num_tokens": 914493714.0, "sample_num_tokens": 10992.0, "step": 3375, "total_num_tokens": 914537682.0, "z_loss": 0.0006321253604255617 }, { "copy_logits_max": 3.612294912338257, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.5, "epoch": 0.6895072759765126, "gen_logits_max": 4.307945251464844, "gen_logits_mean": -14.014031410217285, "gen_logits_min": -25.610631942749023, "gen_logits_std": 2.706054210662842, "gen_loss": 0.29867130517959595, "grad_norm": 0.41639683810715633, "learning_rate": 2.6665263157894737e-05, "loss": 0.296, "mean_copy_accuracy": 0.9949827939271927, "mean_gen_accuracy": 0.8702155500650406, "mean_token_accuracy": 0.9011921435594559, "num_tokens": 914800407.0, "sample_num_tokens": 8784.75, "step": 3376, "total_num_tokens": 914835546.0, "z_loss": 0.000642305298242718 }, { "copy_logits_max": 1.622177243232727, "copy_logits_min": -687500032.0, "copy_num_tokens": 615.5625, "epoch": 0.6897115139137094, "gen_logits_max": 4.857973098754883, "gen_logits_mean": -14.236763000488281, "gen_logits_min": -26.595577239990234, "gen_logits_std": 2.717277765274048, "gen_loss": 0.30934756994247437, "grad_norm": 0.4344708165738309, "learning_rate": 2.6664e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9959267377853394, "mean_gen_accuracy": 0.8582935780286789, "mean_token_accuracy": 0.8980061411857605, "num_tokens": 915073228.0, "sample_num_tokens": 8993.5, "step": 3377, "total_num_tokens": 915109202.0, "z_loss": 0.0008174289832822978 }, { "copy_logits_max": 1.9392595291137695, "copy_logits_min": -750000064.0, "copy_num_tokens": 514.0625, "epoch": 0.6899157518509063, "gen_logits_max": 4.737743377685547, "gen_logits_mean": -13.907011985778809, "gen_logits_min": -25.948457717895508, "gen_logits_std": 2.7274556159973145, "gen_loss": 0.3139776587486267, "grad_norm": 0.39607985987761407, "learning_rate": 2.6662736842105262e-05, "loss": 0.3061, "mean_copy_accuracy": 0.995194599032402, "mean_gen_accuracy": 0.8663802891969681, "mean_token_accuracy": 0.8987463265657425, "num_tokens": 915337024.0, "sample_num_tokens": 8632.5, "step": 3378, "total_num_tokens": 915371554.0, "z_loss": 0.0007636822410859168 }, { "copy_logits_max": 1.5928475856781006, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.5625, "epoch": 0.6901199897881032, "gen_logits_max": 5.304359436035156, "gen_logits_mean": -13.68708324432373, "gen_logits_min": -26.596839904785156, "gen_logits_std": 2.736091136932373, "gen_loss": 0.3128371834754944, "grad_norm": 0.43797411241681145, "learning_rate": 2.6661473684210526e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9939584881067276, "mean_gen_accuracy": 0.8638897240161896, "mean_token_accuracy": 0.8931006044149399, "num_tokens": 915616025.0, "sample_num_tokens": 9000.75, "step": 3379, "total_num_tokens": 915652028.0, "z_loss": 0.0007177457446232438 }, { "copy_logits_max": 0.45797717571258545, "copy_logits_min": -750000064.0, "copy_num_tokens": 420.75, "epoch": 0.6903242277253, "gen_logits_max": 5.151571750640869, "gen_logits_mean": -14.455304145812988, "gen_logits_min": -26.217790603637695, "gen_logits_std": 2.716970443725586, "gen_loss": 0.2997068166732788, "grad_norm": 0.44646720115200283, "learning_rate": 2.666021052631579e-05, "loss": 0.3066, "mean_copy_accuracy": 0.994834378361702, "mean_gen_accuracy": 0.8646819442510605, "mean_token_accuracy": 0.8970385789871216, "num_tokens": 915869287.0, "sample_num_tokens": 7716.25, "step": 3380, "total_num_tokens": 915900152.0, "z_loss": 0.0006978226592764258 }, { "copy_logits_max": -0.6149766445159912, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.0625, "epoch": 0.6905284656624968, "gen_logits_max": 5.179139614105225, "gen_logits_mean": -14.213019371032715, "gen_logits_min": -26.19134521484375, "gen_logits_std": 2.7228989601135254, "gen_loss": 0.32746243476867676, "grad_norm": 0.4390555172499063, "learning_rate": 2.6658947368421055e-05, "loss": 0.3083, "mean_copy_accuracy": 0.9942707717418671, "mean_gen_accuracy": 0.8656432032585144, "mean_token_accuracy": 0.8978792577981949, "num_tokens": 916131472.0, "sample_num_tokens": 8305.0, "step": 3381, "total_num_tokens": 916164692.0, "z_loss": 0.0007765196496620774 }, { "copy_logits_max": -1.6487799882888794, "copy_logits_min": -687500032.0, "copy_num_tokens": 542.3125, "epoch": 0.6907327035996936, "gen_logits_max": 5.141498565673828, "gen_logits_mean": -14.469616889953613, "gen_logits_min": -26.64993667602539, "gen_logits_std": 2.7466955184936523, "gen_loss": 0.3031172454357147, "grad_norm": 0.4184789549171653, "learning_rate": 2.6657684210526316e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9947270601987839, "mean_gen_accuracy": 0.86656054854393, "mean_token_accuracy": 0.8984016329050064, "num_tokens": 916398307.0, "sample_num_tokens": 9039.75, "step": 3382, "total_num_tokens": 916434466.0, "z_loss": 0.0007502140942960978 }, { "copy_logits_max": -1.6467570066452026, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.875, "epoch": 0.6909369415368904, "gen_logits_max": 5.549952983856201, "gen_logits_mean": -14.339147567749023, "gen_logits_min": -26.487037658691406, "gen_logits_std": 2.7072722911834717, "gen_loss": 0.32455289363861084, "grad_norm": 0.47218918763972767, "learning_rate": 2.665642105263158e-05, "loss": 0.2986, "mean_copy_accuracy": 0.993900716304779, "mean_gen_accuracy": 0.8705515414476395, "mean_token_accuracy": 0.9003164768218994, "num_tokens": 916675931.0, "sample_num_tokens": 7835.75, "step": 3383, "total_num_tokens": 916707274.0, "z_loss": 0.0007140830275602639 }, { "copy_logits_max": -2.169821262359619, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.125, "epoch": 0.6911411794740873, "gen_logits_max": 4.002530574798584, "gen_logits_mean": -15.571521759033203, "gen_logits_min": -27.400550842285156, "gen_logits_std": 2.7006185054779053, "gen_loss": 0.2983855605125427, "grad_norm": 0.448011828853935, "learning_rate": 2.665515789473684e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9945276826620102, "mean_gen_accuracy": 0.8606439232826233, "mean_token_accuracy": 0.8962763398885727, "num_tokens": 916940912.0, "sample_num_tokens": 7901.0, "step": 3384, "total_num_tokens": 916972516.0, "z_loss": 0.0006817999528720975 }, { "copy_logits_max": -1.6484286785125732, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.625, "epoch": 0.6913454174112842, "gen_logits_max": 4.611551284790039, "gen_logits_mean": -12.767744064331055, "gen_logits_min": -25.159608840942383, "gen_logits_std": 2.7640271186828613, "gen_loss": 0.27391138672828674, "grad_norm": 0.4223239398731848, "learning_rate": 2.6653894736842105e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9959823191165924, "mean_gen_accuracy": 0.8613948225975037, "mean_token_accuracy": 0.8939129412174225, "num_tokens": 917192614.0, "sample_num_tokens": 9195.0, "step": 3385, "total_num_tokens": 917229394.0, "z_loss": 0.0006701268721371889 }, { "copy_logits_max": -3.2586610317230225, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.625, "epoch": 0.691549655348481, "gen_logits_max": 4.200533866882324, "gen_logits_mean": -15.442889213562012, "gen_logits_min": -27.255783081054688, "gen_logits_std": 2.716179847717285, "gen_loss": 0.3364589512348175, "grad_norm": 0.4992329575274606, "learning_rate": 2.665263157894737e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9932815730571747, "mean_gen_accuracy": 0.8630229234695435, "mean_token_accuracy": 0.8945265412330627, "num_tokens": 917460222.0, "sample_num_tokens": 7817.5, "step": 3386, "total_num_tokens": 917491492.0, "z_loss": 0.0007179898675531149 }, { "copy_logits_max": -2.753446578979492, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.0, "epoch": 0.6917538932856778, "gen_logits_max": 4.72336483001709, "gen_logits_mean": -14.772720336914062, "gen_logits_min": -26.57403564453125, "gen_logits_std": 2.729869842529297, "gen_loss": 0.2965856194496155, "grad_norm": 0.4120293281561023, "learning_rate": 2.665136842105263e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9951601624488831, "mean_gen_accuracy": 0.8690483570098877, "mean_token_accuracy": 0.9046612977981567, "num_tokens": 917757169.0, "sample_num_tokens": 8373.75, "step": 3387, "total_num_tokens": 917790664.0, "z_loss": 0.0006230399012565613 }, { "copy_logits_max": -1.7883139848709106, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.5, "epoch": 0.6919581312228746, "gen_logits_max": 4.633273601531982, "gen_logits_mean": -14.73283576965332, "gen_logits_min": -26.51396942138672, "gen_logits_std": 2.715529203414917, "gen_loss": 0.2984866499900818, "grad_norm": 0.4647054783616198, "learning_rate": 2.6650105263157898e-05, "loss": 0.3389, "mean_copy_accuracy": 0.9922236502170563, "mean_gen_accuracy": 0.8604826033115387, "mean_token_accuracy": 0.8871070891618729, "num_tokens": 918001830.0, "sample_num_tokens": 8792.5, "step": 3388, "total_num_tokens": 918037000.0, "z_loss": 0.0006276701460592449 }, { "copy_logits_max": -0.5754803419113159, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.625, "epoch": 0.6921623691600715, "gen_logits_max": 4.6641740798950195, "gen_logits_mean": -15.267253875732422, "gen_logits_min": -27.137582778930664, "gen_logits_std": 2.695169448852539, "gen_loss": 0.31507617235183716, "grad_norm": 0.42091694320415024, "learning_rate": 2.664884210526316e-05, "loss": 0.307, "mean_copy_accuracy": 0.994186133146286, "mean_gen_accuracy": 0.867706373333931, "mean_token_accuracy": 0.8972873985767365, "num_tokens": 918283978.0, "sample_num_tokens": 8578.5, "step": 3389, "total_num_tokens": 918318292.0, "z_loss": 0.0006326116272248328 }, { "copy_logits_max": -1.809398889541626, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.6875, "epoch": 0.6923666070972683, "gen_logits_max": 4.738344192504883, "gen_logits_mean": -14.264081954956055, "gen_logits_min": -26.378772735595703, "gen_logits_std": 2.6935696601867676, "gen_loss": 0.36126136779785156, "grad_norm": 0.41679241219997654, "learning_rate": 2.6647578947368423e-05, "loss": 0.3387, "mean_copy_accuracy": 0.9942537695169449, "mean_gen_accuracy": 0.8590042293071747, "mean_token_accuracy": 0.8885692059993744, "num_tokens": 918545377.0, "sample_num_tokens": 9088.75, "step": 3390, "total_num_tokens": 918581732.0, "z_loss": 0.0007770075462758541 }, { "copy_logits_max": -1.494387149810791, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.375, "epoch": 0.6925708450344652, "gen_logits_max": 3.31400728225708, "gen_logits_mean": -15.812728881835938, "gen_logits_min": -27.34420394897461, "gen_logits_std": 2.6623013019561768, "gen_loss": 0.31576722860336304, "grad_norm": 0.48766894640837155, "learning_rate": 2.6646315789473684e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9924829006195068, "mean_gen_accuracy": 0.8681691437959671, "mean_token_accuracy": 0.8969496488571167, "num_tokens": 918797131.0, "sample_num_tokens": 8788.75, "step": 3391, "total_num_tokens": 918832286.0, "z_loss": 0.0007603487465530634 }, { "copy_logits_max": -3.532414436340332, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.4375, "epoch": 0.692775082971662, "gen_logits_max": 4.409643173217773, "gen_logits_mean": -14.446939468383789, "gen_logits_min": -27.05396270751953, "gen_logits_std": 2.7247672080993652, "gen_loss": 0.306048721075058, "grad_norm": 0.44279419584701973, "learning_rate": 2.664505263157895e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9946582764387131, "mean_gen_accuracy": 0.868969663977623, "mean_token_accuracy": 0.9006960541009903, "num_tokens": 919055716.0, "sample_num_tokens": 8396.5, "step": 3392, "total_num_tokens": 919089302.0, "z_loss": 0.0006869133794680238 }, { "copy_logits_max": -3.258030414581299, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.125, "epoch": 0.6929793209088588, "gen_logits_max": 4.732907772064209, "gen_logits_mean": -13.474874496459961, "gen_logits_min": -25.44009780883789, "gen_logits_std": 2.700932741165161, "gen_loss": 0.30069488286972046, "grad_norm": 0.43937802105488716, "learning_rate": 2.664378947368421e-05, "loss": 0.2954, "mean_copy_accuracy": 0.993959978222847, "mean_gen_accuracy": 0.8630362302064896, "mean_token_accuracy": 0.9000741690397263, "num_tokens": 919343769.0, "sample_num_tokens": 10009.75, "step": 3393, "total_num_tokens": 919383808.0, "z_loss": 0.0006579997716471553 }, { "copy_logits_max": -4.932279586791992, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.3125, "epoch": 0.6931835588460556, "gen_logits_max": 4.996831893920898, "gen_logits_mean": -13.121904373168945, "gen_logits_min": -24.81420135498047, "gen_logits_std": 2.6920080184936523, "gen_loss": 0.30999356508255005, "grad_norm": 0.40248315455747125, "learning_rate": 2.6642526315789474e-05, "loss": 0.3135, "mean_copy_accuracy": 0.994683638215065, "mean_gen_accuracy": 0.8607077747583389, "mean_token_accuracy": 0.8960022330284119, "num_tokens": 919619935.0, "sample_num_tokens": 7987.75, "step": 3394, "total_num_tokens": 919651886.0, "z_loss": 0.0006405323510989547 }, { "copy_logits_max": -4.419149398803711, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.3125, "epoch": 0.6933877967832525, "gen_logits_max": 5.109134197235107, "gen_logits_mean": -13.035235404968262, "gen_logits_min": -24.92416000366211, "gen_logits_std": 2.6378207206726074, "gen_loss": 0.2534328103065491, "grad_norm": 0.35897121941625454, "learning_rate": 2.6641263157894735e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9950522482395172, "mean_gen_accuracy": 0.8765015602111816, "mean_token_accuracy": 0.9060846865177155, "num_tokens": 919903008.0, "sample_num_tokens": 9658.5, "step": 3395, "total_num_tokens": 919941642.0, "z_loss": 0.000615566736087203 }, { "copy_logits_max": -4.3713274002075195, "copy_logits_min": -687500032.0, "copy_num_tokens": 717.25, "epoch": 0.6935920347204493, "gen_logits_max": 4.002408504486084, "gen_logits_mean": -14.586217880249023, "gen_logits_min": -26.989931106567383, "gen_logits_std": 2.736098051071167, "gen_loss": 0.25381407141685486, "grad_norm": 0.40208665971252594, "learning_rate": 2.6640000000000002e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9955833107233047, "mean_gen_accuracy": 0.8681335747241974, "mean_token_accuracy": 0.9029460549354553, "num_tokens": 920187067.0, "sample_num_tokens": 10111.25, "step": 3396, "total_num_tokens": 920227512.0, "z_loss": 0.0006496284622699022 }, { "copy_logits_max": -2.1554956436157227, "copy_logits_min": -750000000.0, "copy_num_tokens": 741.5, "epoch": 0.6937962726576462, "gen_logits_max": 3.297700881958008, "gen_logits_mean": -14.97295093536377, "gen_logits_min": -27.63275718688965, "gen_logits_std": 2.711137294769287, "gen_loss": 0.25192373991012573, "grad_norm": 0.3978229700235829, "learning_rate": 2.6638736842105263e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9955995082855225, "mean_gen_accuracy": 0.8706548511981964, "mean_token_accuracy": 0.9055463671684265, "num_tokens": 920477251.0, "sample_num_tokens": 9671.25, "step": 3397, "total_num_tokens": 920515936.0, "z_loss": 0.0006489744991995394 }, { "copy_logits_max": -4.1018476486206055, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 0.694000510594843, "gen_logits_max": 5.090558052062988, "gen_logits_mean": -14.030852317810059, "gen_logits_min": -26.183677673339844, "gen_logits_std": 2.6818926334381104, "gen_loss": 0.3396739363670349, "grad_norm": 0.39952442937398996, "learning_rate": 2.6637473684210528e-05, "loss": 0.2949, "mean_copy_accuracy": 0.994453564286232, "mean_gen_accuracy": 0.8716563135385513, "mean_token_accuracy": 0.9023068398237228, "num_tokens": 920755403.0, "sample_num_tokens": 7356.25, "step": 3398, "total_num_tokens": 920784828.0, "z_loss": 0.0007905507227405906 }, { "copy_logits_max": -6.545393943786621, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.625, "epoch": 0.6942047485320398, "gen_logits_max": 4.183053016662598, "gen_logits_mean": -16.432228088378906, "gen_logits_min": -27.967056274414062, "gen_logits_std": 2.6821911334991455, "gen_loss": 0.3047371506690979, "grad_norm": 0.4130165152660404, "learning_rate": 2.6636210526315792e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9942604899406433, "mean_gen_accuracy": 0.8659479171037674, "mean_token_accuracy": 0.8976940512657166, "num_tokens": 921021648.0, "sample_num_tokens": 7900.0, "step": 3399, "total_num_tokens": 921053248.0, "z_loss": 0.0006641405634582043 }, { "copy_logits_max": -6.362427234649658, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.0625, "epoch": 0.6944089864692367, "gen_logits_max": 5.0220866203308105, "gen_logits_mean": -13.914487838745117, "gen_logits_min": -25.730224609375, "gen_logits_std": 2.6961557865142822, "gen_loss": 0.3082234263420105, "grad_norm": 0.4173885062707374, "learning_rate": 2.6634947368421053e-05, "loss": 0.2996, "mean_copy_accuracy": 0.994350865483284, "mean_gen_accuracy": 0.8673172891139984, "mean_token_accuracy": 0.898713618516922, "num_tokens": 921299866.0, "sample_num_tokens": 8678.5, "step": 3400, "total_num_tokens": 921334580.0, "z_loss": 0.0006524099735543132 }, { "copy_logits_max": -7.1151299476623535, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.25, "epoch": 0.6946132244064335, "gen_logits_max": 6.5085530281066895, "gen_logits_mean": -11.992301940917969, "gen_logits_min": -23.7236385345459, "gen_logits_std": 2.646894931793213, "gen_loss": 0.31686556339263916, "grad_norm": 0.39494454091978837, "learning_rate": 2.6633684210526317e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9948969334363937, "mean_gen_accuracy": 0.8673236072063446, "mean_token_accuracy": 0.8976714462041855, "num_tokens": 921588182.0, "sample_num_tokens": 8587.5, "step": 3401, "total_num_tokens": 921622532.0, "z_loss": 0.0007301943842321634 }, { "copy_logits_max": -6.654818534851074, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.8125, "epoch": 0.6948174623436303, "gen_logits_max": 5.126193523406982, "gen_logits_mean": -12.706989288330078, "gen_logits_min": -26.32489013671875, "gen_logits_std": 2.78171968460083, "gen_loss": 0.28381869196891785, "grad_norm": 0.4371105867471632, "learning_rate": 2.6632421052631578e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9946269541978836, "mean_gen_accuracy": 0.867296427488327, "mean_token_accuracy": 0.8992497473955154, "num_tokens": 921863111.0, "sample_num_tokens": 9284.25, "step": 3402, "total_num_tokens": 921900248.0, "z_loss": 0.0006764681893400848 }, { "copy_logits_max": -6.545161247253418, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.9375, "epoch": 0.6950217002808272, "gen_logits_max": 4.367899417877197, "gen_logits_mean": -15.580781936645508, "gen_logits_min": -27.138395309448242, "gen_logits_std": 2.6444993019104004, "gen_loss": 0.29837241768836975, "grad_norm": 0.3634070784612376, "learning_rate": 2.6631157894736842e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9957295507192612, "mean_gen_accuracy": 0.8748359680175781, "mean_token_accuracy": 0.905900850892067, "num_tokens": 922147032.0, "sample_num_tokens": 8624.5, "step": 3403, "total_num_tokens": 922181530.0, "z_loss": 0.0006629342096857727 }, { "copy_logits_max": -5.781683921813965, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.375, "epoch": 0.695225938218024, "gen_logits_max": 5.120030403137207, "gen_logits_mean": -14.516079902648926, "gen_logits_min": -26.09186363220215, "gen_logits_std": 2.665536403656006, "gen_loss": 0.30791234970092773, "grad_norm": 0.4078195482792437, "learning_rate": 2.6629894736842107e-05, "loss": 0.305, "mean_copy_accuracy": 0.9947818517684937, "mean_gen_accuracy": 0.8671262562274933, "mean_token_accuracy": 0.8975998759269714, "num_tokens": 922414134.0, "sample_num_tokens": 8378.5, "step": 3404, "total_num_tokens": 922447648.0, "z_loss": 0.0007175841019488871 }, { "copy_logits_max": -8.352912902832031, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.625, "epoch": 0.6954301761552208, "gen_logits_max": 5.649226665496826, "gen_logits_mean": -13.326507568359375, "gen_logits_min": -25.179285049438477, "gen_logits_std": 2.692549705505371, "gen_loss": 0.3099897503852844, "grad_norm": 0.40902683925014516, "learning_rate": 2.662863157894737e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9953954070806503, "mean_gen_accuracy": 0.8675698786973953, "mean_token_accuracy": 0.8982105404138565, "num_tokens": 922683905.0, "sample_num_tokens": 8269.25, "step": 3405, "total_num_tokens": 922716982.0, "z_loss": 0.0006706003332510591 }, { "copy_logits_max": -7.034516334533691, "copy_logits_min": -687500032.0, "copy_num_tokens": 249.0625, "epoch": 0.6956344140924177, "gen_logits_max": 6.073573589324951, "gen_logits_mean": -13.847121238708496, "gen_logits_min": -25.299442291259766, "gen_logits_std": 2.632007598876953, "gen_loss": 0.3453057110309601, "grad_norm": 0.4484421914456124, "learning_rate": 2.6627368421052632e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9940154850482941, "mean_gen_accuracy": 0.8649656623601913, "mean_token_accuracy": 0.8920713365077972, "num_tokens": 922935162.0, "sample_num_tokens": 6944.5, "step": 3406, "total_num_tokens": 922962940.0, "z_loss": 0.0007026050006970763 }, { "copy_logits_max": -5.464483261108398, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.5, "epoch": 0.6958386520296145, "gen_logits_max": 5.463345527648926, "gen_logits_mean": -14.454313278198242, "gen_logits_min": -26.333568572998047, "gen_logits_std": 2.6816205978393555, "gen_loss": 0.3232518136501312, "grad_norm": 0.40898467799585936, "learning_rate": 2.6626105263157896e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9939704984426498, "mean_gen_accuracy": 0.8654691874980927, "mean_token_accuracy": 0.8931077718734741, "num_tokens": 923195691.0, "sample_num_tokens": 7871.75, "step": 3407, "total_num_tokens": 923227178.0, "z_loss": 0.000695183640345931 }, { "copy_logits_max": -6.39957332611084, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.9375, "epoch": 0.6960428899668113, "gen_logits_max": 4.5099592208862305, "gen_logits_mean": -15.649325370788574, "gen_logits_min": -27.306943893432617, "gen_logits_std": 2.6927061080932617, "gen_loss": 0.3132348656654358, "grad_norm": 0.4188938159398532, "learning_rate": 2.6624842105263157e-05, "loss": 0.3244, "mean_copy_accuracy": 0.9952952116727829, "mean_gen_accuracy": 0.8569879978895187, "mean_token_accuracy": 0.8932945281267166, "num_tokens": 923471381.0, "sample_num_tokens": 8693.25, "step": 3408, "total_num_tokens": 923506154.0, "z_loss": 0.0007306140614673495 }, { "copy_logits_max": -5.389143466949463, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.6875, "epoch": 0.6962471279040082, "gen_logits_max": 4.708277702331543, "gen_logits_mean": -14.678966522216797, "gen_logits_min": -26.526683807373047, "gen_logits_std": 2.6794276237487793, "gen_loss": 0.30104342103004456, "grad_norm": 0.37653762923718853, "learning_rate": 2.662357894736842e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9961870014667511, "mean_gen_accuracy": 0.8684682548046112, "mean_token_accuracy": 0.9010355770587921, "num_tokens": 923750273.0, "sample_num_tokens": 9541.75, "step": 3409, "total_num_tokens": 923788440.0, "z_loss": 0.0007280917488969862 }, { "copy_logits_max": -3.786672353744507, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.9375, "epoch": 0.696451365841205, "gen_logits_max": 4.212699890136719, "gen_logits_mean": -14.946091651916504, "gen_logits_min": -26.879636764526367, "gen_logits_std": 2.689635753631592, "gen_loss": 0.3220619857311249, "grad_norm": 0.43826936805209715, "learning_rate": 2.6622315789473682e-05, "loss": 0.3073, "mean_copy_accuracy": 0.994391605257988, "mean_gen_accuracy": 0.86395063996315, "mean_token_accuracy": 0.8988714665174484, "num_tokens": 924033362.0, "sample_num_tokens": 10174.5, "step": 3410, "total_num_tokens": 924074060.0, "z_loss": 0.0007154872291721404 }, { "copy_logits_max": -6.350688934326172, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.5625, "epoch": 0.6966556037784019, "gen_logits_max": 4.98321533203125, "gen_logits_mean": -14.241090774536133, "gen_logits_min": -26.048900604248047, "gen_logits_std": 2.698441743850708, "gen_loss": 0.3183867931365967, "grad_norm": 0.47671478016488555, "learning_rate": 2.6621052631578947e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9948954880237579, "mean_gen_accuracy": 0.8633519411087036, "mean_token_accuracy": 0.8956647217273712, "num_tokens": 924327286.0, "sample_num_tokens": 7899.0, "step": 3411, "total_num_tokens": 924358882.0, "z_loss": 0.0007104979013092816 }, { "copy_logits_max": -3.445505380630493, "copy_logits_min": -687500032.0, "copy_num_tokens": 607.1875, "epoch": 0.6968598417155987, "gen_logits_max": 4.264244079589844, "gen_logits_mean": -14.021612167358398, "gen_logits_min": -26.504512786865234, "gen_logits_std": 2.7130696773529053, "gen_loss": 0.3327845335006714, "grad_norm": 0.4441744242243087, "learning_rate": 2.661978947368421e-05, "loss": 0.3184, "mean_copy_accuracy": 0.994739294052124, "mean_gen_accuracy": 0.860645204782486, "mean_token_accuracy": 0.8951026052236557, "num_tokens": 924594734.0, "sample_num_tokens": 9152.0, "step": 3412, "total_num_tokens": 924631342.0, "z_loss": 0.0007677962421439588 }, { "copy_logits_max": -6.605905532836914, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.375, "epoch": 0.6970640796527955, "gen_logits_max": 5.756182670593262, "gen_logits_mean": -13.211923599243164, "gen_logits_min": -24.919816970825195, "gen_logits_std": 2.6976633071899414, "gen_loss": 0.33157721161842346, "grad_norm": 0.39860130279275946, "learning_rate": 2.6618526315789475e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9954243451356888, "mean_gen_accuracy": 0.8697441816329956, "mean_token_accuracy": 0.9010238945484161, "num_tokens": 924880109.0, "sample_num_tokens": 8023.75, "step": 3413, "total_num_tokens": 924912204.0, "z_loss": 0.0007236741366796196 }, { "copy_logits_max": -7.009189605712891, "copy_logits_min": -750000000.0, "copy_num_tokens": 266.0625, "epoch": 0.6972683175899923, "gen_logits_max": 5.693807125091553, "gen_logits_mean": -13.889686584472656, "gen_logits_min": -26.610416412353516, "gen_logits_std": 2.6904945373535156, "gen_loss": 0.36078503727912903, "grad_norm": 0.4257970680744817, "learning_rate": 2.661726315789474e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9941803067922592, "mean_gen_accuracy": 0.8674516528844833, "mean_token_accuracy": 0.8952583521604538, "num_tokens": 925160887.0, "sample_num_tokens": 7609.75, "step": 3414, "total_num_tokens": 925191326.0, "z_loss": 0.0006857654079794884 }, { "copy_logits_max": -5.4909257888793945, "copy_logits_min": -625000064.0, "copy_num_tokens": 451.875, "epoch": 0.6974725555271892, "gen_logits_max": 3.468252420425415, "gen_logits_mean": -16.317546844482422, "gen_logits_min": -28.44851303100586, "gen_logits_std": 2.701647996902466, "gen_loss": 0.2806175649166107, "grad_norm": 0.5823707182850376, "learning_rate": 2.6616e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9947641789913177, "mean_gen_accuracy": 0.8660682290792465, "mean_token_accuracy": 0.8988510370254517, "num_tokens": 925431368.0, "sample_num_tokens": 8153.5, "step": 3415, "total_num_tokens": 925463982.0, "z_loss": 0.0005830537411384284 }, { "copy_logits_max": -5.6641011238098145, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0, "epoch": 0.697676793464386, "gen_logits_max": 5.718820571899414, "gen_logits_mean": -13.115180969238281, "gen_logits_min": -24.972793579101562, "gen_logits_std": 2.695842742919922, "gen_loss": 0.34960222244262695, "grad_norm": 0.417745067617801, "learning_rate": 2.6614736842105265e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9946293383836746, "mean_gen_accuracy": 0.8629720658063889, "mean_token_accuracy": 0.8950964212417603, "num_tokens": 925709985.0, "sample_num_tokens": 7541.25, "step": 3416, "total_num_tokens": 925740150.0, "z_loss": 0.0007579962257295847 }, { "copy_logits_max": -6.002743721008301, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.5625, "epoch": 0.6978810314015829, "gen_logits_max": 5.510333061218262, "gen_logits_mean": -13.232728004455566, "gen_logits_min": -25.753284454345703, "gen_logits_std": 2.7291905879974365, "gen_loss": 0.31867048144340515, "grad_norm": 0.4301467551851882, "learning_rate": 2.6613473684210526e-05, "loss": 0.291, "mean_copy_accuracy": 0.9951616078615189, "mean_gen_accuracy": 0.8721167594194412, "mean_token_accuracy": 0.9027216881513596, "num_tokens": 925974775.0, "sample_num_tokens": 8024.75, "step": 3417, "total_num_tokens": 926006874.0, "z_loss": 0.000667956774123013 }, { "copy_logits_max": -6.009773254394531, "copy_logits_min": -687500032.0, "copy_num_tokens": 601.625, "epoch": 0.6980852693387797, "gen_logits_max": 4.871137619018555, "gen_logits_mean": -14.141357421875, "gen_logits_min": -25.94124984741211, "gen_logits_std": 2.6583492755889893, "gen_loss": 0.3107645511627197, "grad_norm": 0.45922695777018885, "learning_rate": 2.661221052631579e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9925150722265244, "mean_gen_accuracy": 0.8672332912683487, "mean_token_accuracy": 0.894528865814209, "num_tokens": 926219748.0, "sample_num_tokens": 10215.5, "step": 3418, "total_num_tokens": 926260610.0, "z_loss": 0.0006762279663234949 }, { "copy_logits_max": -5.50045108795166, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.75, "epoch": 0.6982895072759765, "gen_logits_max": 5.063089370727539, "gen_logits_mean": -13.748881340026855, "gen_logits_min": -25.958309173583984, "gen_logits_std": 2.703284740447998, "gen_loss": 0.3139573931694031, "grad_norm": 0.47717775572218146, "learning_rate": 2.661094736842105e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9936101287603378, "mean_gen_accuracy": 0.8687906116247177, "mean_token_accuracy": 0.8974799662828445, "num_tokens": 926476661.0, "sample_num_tokens": 8207.75, "step": 3419, "total_num_tokens": 926509492.0, "z_loss": 0.000723794219084084 }, { "copy_logits_max": -5.74746561050415, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.125, "epoch": 0.6984937452131733, "gen_logits_max": 4.630647659301758, "gen_logits_mean": -14.667463302612305, "gen_logits_min": -26.99472427368164, "gen_logits_std": 2.672718048095703, "gen_loss": 0.30173271894454956, "grad_norm": 0.40802487507551705, "learning_rate": 2.660968421052632e-05, "loss": 0.319, "mean_copy_accuracy": 0.9950023144483566, "mean_gen_accuracy": 0.8640243858098984, "mean_token_accuracy": 0.8935327678918839, "num_tokens": 926736255.0, "sample_num_tokens": 8413.25, "step": 3420, "total_num_tokens": 926769908.0, "z_loss": 0.0007104036048986018 }, { "copy_logits_max": -5.007349491119385, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.625, "epoch": 0.6986979831503701, "gen_logits_max": 4.273599624633789, "gen_logits_mean": -14.869133949279785, "gen_logits_min": -26.69318199157715, "gen_logits_std": 2.686053991317749, "gen_loss": 0.288519024848938, "grad_norm": 0.4767102743466434, "learning_rate": 2.660842105263158e-05, "loss": 0.3053, "mean_copy_accuracy": 0.9934470355510712, "mean_gen_accuracy": 0.8653799444437027, "mean_token_accuracy": 0.8969313055276871, "num_tokens": 927004127.0, "sample_num_tokens": 8111.75, "step": 3421, "total_num_tokens": 927036574.0, "z_loss": 0.0007136575295589864 }, { "copy_logits_max": -5.171595096588135, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.8125, "epoch": 0.698902221087567, "gen_logits_max": 4.149771213531494, "gen_logits_mean": -15.59045696258545, "gen_logits_min": -27.303144454956055, "gen_logits_std": 2.725801706314087, "gen_loss": 0.26409590244293213, "grad_norm": 0.4435905497403177, "learning_rate": 2.6607157894736844e-05, "loss": 0.3011, "mean_copy_accuracy": 0.995213508605957, "mean_gen_accuracy": 0.8730269819498062, "mean_token_accuracy": 0.9012960642576218, "num_tokens": 927263212.0, "sample_num_tokens": 8108.0, "step": 3422, "total_num_tokens": 927295644.0, "z_loss": 0.0006536681903526187 }, { "copy_logits_max": -5.508707523345947, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.3125, "epoch": 0.6991064590247639, "gen_logits_max": 4.160292148590088, "gen_logits_mean": -16.037517547607422, "gen_logits_min": -27.84942054748535, "gen_logits_std": 2.7037160396575928, "gen_loss": 0.32208460569381714, "grad_norm": 0.42843120142348684, "learning_rate": 2.6605894736842105e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9958034455776215, "mean_gen_accuracy": 0.8579400926828384, "mean_token_accuracy": 0.8947236388921738, "num_tokens": 927549124.0, "sample_num_tokens": 9338.5, "step": 3423, "total_num_tokens": 927586478.0, "z_loss": 0.000735532259568572 }, { "copy_logits_max": -3.821695566177368, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.125, "epoch": 0.6993106969619607, "gen_logits_max": 5.633703231811523, "gen_logits_mean": -14.176742553710938, "gen_logits_min": -25.94302749633789, "gen_logits_std": 2.707313299179077, "gen_loss": 0.3499196171760559, "grad_norm": 0.4714862341301393, "learning_rate": 2.660463157894737e-05, "loss": 0.3388, "mean_copy_accuracy": 0.994186669588089, "mean_gen_accuracy": 0.8567032963037491, "mean_token_accuracy": 0.8891101330518723, "num_tokens": 927804776.0, "sample_num_tokens": 7178.0, "step": 3424, "total_num_tokens": 927833488.0, "z_loss": 0.0007702921866439283 }, { "copy_logits_max": -3.4715960025787354, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.3125, "epoch": 0.6995149348991575, "gen_logits_max": 5.419436454772949, "gen_logits_mean": -13.973997116088867, "gen_logits_min": -25.974197387695312, "gen_logits_std": 2.7292351722717285, "gen_loss": 0.3345959186553955, "grad_norm": 0.5277564926580275, "learning_rate": 2.660336842105263e-05, "loss": 0.321, "mean_copy_accuracy": 0.9945143610239029, "mean_gen_accuracy": 0.8598712533712387, "mean_token_accuracy": 0.8938532173633575, "num_tokens": 928083093.0, "sample_num_tokens": 7201.75, "step": 3425, "total_num_tokens": 928111900.0, "z_loss": 0.0006929397932253778 }, { "copy_logits_max": -5.863729476928711, "copy_logits_min": -687500032.0, "copy_num_tokens": 507.9375, "epoch": 0.6997191728363543, "gen_logits_max": 4.58404016494751, "gen_logits_mean": -14.755424499511719, "gen_logits_min": -26.519886016845703, "gen_logits_std": 2.705108165740967, "gen_loss": 0.3216932713985443, "grad_norm": 0.4442377471282397, "learning_rate": 2.6602105263157895e-05, "loss": 0.311, "mean_copy_accuracy": 0.9936773329973221, "mean_gen_accuracy": 0.8642244189977646, "mean_token_accuracy": 0.8957323729991913, "num_tokens": 928348025.0, "sample_num_tokens": 9485.25, "step": 3426, "total_num_tokens": 928385966.0, "z_loss": 0.0006890819640830159 }, { "copy_logits_max": -5.269015312194824, "copy_logits_min": -687500032.0, "copy_num_tokens": 367.6875, "epoch": 0.6999234107735511, "gen_logits_max": 5.487105846405029, "gen_logits_mean": -13.589489936828613, "gen_logits_min": -25.42001724243164, "gen_logits_std": 2.7084851264953613, "gen_loss": 0.36579301953315735, "grad_norm": 0.44143329971354084, "learning_rate": 2.660084210526316e-05, "loss": 0.3263, "mean_copy_accuracy": 0.9944451302289963, "mean_gen_accuracy": 0.8570843935012817, "mean_token_accuracy": 0.8926193863153458, "num_tokens": 928636978.0, "sample_num_tokens": 7919.0, "step": 3427, "total_num_tokens": 928668654.0, "z_loss": 0.0007823235355317593 }, { "copy_logits_max": -4.067793846130371, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.0625, "epoch": 0.7001276487107481, "gen_logits_max": 4.529632091522217, "gen_logits_mean": -14.321945190429688, "gen_logits_min": -26.697982788085938, "gen_logits_std": 2.7130250930786133, "gen_loss": 0.2913576066493988, "grad_norm": 0.4981843380539396, "learning_rate": 2.6599578947368423e-05, "loss": 0.3193, "mean_copy_accuracy": 0.9937015324831009, "mean_gen_accuracy": 0.8632394373416901, "mean_token_accuracy": 0.8930970579385757, "num_tokens": 928891757.0, "sample_num_tokens": 7616.25, "step": 3428, "total_num_tokens": 928922222.0, "z_loss": 0.0006231543375179172 }, { "copy_logits_max": -5.99139404296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.375, "epoch": 0.7003318866479449, "gen_logits_max": 5.008238315582275, "gen_logits_mean": -14.29019832611084, "gen_logits_min": -26.380054473876953, "gen_logits_std": 2.7278828620910645, "gen_loss": 0.2996203601360321, "grad_norm": 0.4398187136173753, "learning_rate": 2.6598315789473687e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9940473586320877, "mean_gen_accuracy": 0.8734537065029144, "mean_token_accuracy": 0.9006585031747818, "num_tokens": 929146637.0, "sample_num_tokens": 7342.25, "step": 3429, "total_num_tokens": 929176006.0, "z_loss": 0.0006028544157743454 }, { "copy_logits_max": -3.559140920639038, "copy_logits_min": -687500032.0, "copy_num_tokens": 787.0, "epoch": 0.7005361245851417, "gen_logits_max": 3.647770881652832, "gen_logits_mean": -14.508098602294922, "gen_logits_min": -26.825613021850586, "gen_logits_std": 2.7377192974090576, "gen_loss": 0.2910674810409546, "grad_norm": 0.4543545407802986, "learning_rate": 2.659705263157895e-05, "loss": 0.3101, "mean_copy_accuracy": 0.9947453737258911, "mean_gen_accuracy": 0.8581659346818924, "mean_token_accuracy": 0.8964371234178543, "num_tokens": 929417649.0, "sample_num_tokens": 10824.25, "step": 3430, "total_num_tokens": 929460946.0, "z_loss": 0.0006398452678695321 }, { "copy_logits_max": -4.274088382720947, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.9375, "epoch": 0.7007403625223385, "gen_logits_max": 5.217923164367676, "gen_logits_mean": -13.386941909790039, "gen_logits_min": -26.03022003173828, "gen_logits_std": 2.7718396186828613, "gen_loss": 0.26477041840553284, "grad_norm": 0.4166690082030424, "learning_rate": 2.6595789473684213e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9953108876943588, "mean_gen_accuracy": 0.8598159849643707, "mean_token_accuracy": 0.8967047780752182, "num_tokens": 929714848.0, "sample_num_tokens": 7959.0, "step": 3431, "total_num_tokens": 929746684.0, "z_loss": 0.0006259874207898974 }, { "copy_logits_max": -2.387035369873047, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.0625, "epoch": 0.7009446004595353, "gen_logits_max": 5.548235893249512, "gen_logits_mean": -13.129758834838867, "gen_logits_min": -25.390789031982422, "gen_logits_std": 2.7432422637939453, "gen_loss": 0.34349188208580017, "grad_norm": 0.572412938616163, "learning_rate": 2.6594526315789474e-05, "loss": 0.346, "mean_copy_accuracy": 0.9923046231269836, "mean_gen_accuracy": 0.8556738197803497, "mean_token_accuracy": 0.8873748779296875, "num_tokens": 929966581.0, "sample_num_tokens": 9906.75, "step": 3432, "total_num_tokens": 930006208.0, "z_loss": 0.0007290739449672401 }, { "copy_logits_max": -6.014167785644531, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.4375, "epoch": 0.7011488383967321, "gen_logits_max": 5.174897193908691, "gen_logits_mean": -14.970701217651367, "gen_logits_min": -26.406591415405273, "gen_logits_std": 2.672083854675293, "gen_loss": 0.35695838928222656, "grad_norm": 0.4163587203085159, "learning_rate": 2.6593263157894738e-05, "loss": 0.3346, "mean_copy_accuracy": 0.9930706918239594, "mean_gen_accuracy": 0.8619936406612396, "mean_token_accuracy": 0.8899631649255753, "num_tokens": 930233243.0, "sample_num_tokens": 8629.75, "step": 3433, "total_num_tokens": 930267762.0, "z_loss": 0.000769263890106231 }, { "copy_logits_max": -4.4507904052734375, "copy_logits_min": -750000000.0, "copy_num_tokens": 777.375, "epoch": 0.7013530763339291, "gen_logits_max": 4.62129545211792, "gen_logits_mean": -14.361433029174805, "gen_logits_min": -26.358219146728516, "gen_logits_std": 2.7013864517211914, "gen_loss": 0.28137391805648804, "grad_norm": 0.44901165280134964, "learning_rate": 2.6592e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9934114813804626, "mean_gen_accuracy": 0.8669904470443726, "mean_token_accuracy": 0.8998551517724991, "num_tokens": 930510844.0, "sample_num_tokens": 10196.0, "step": 3434, "total_num_tokens": 930551628.0, "z_loss": 0.0006909752264618874 }, { "copy_logits_max": -5.718876838684082, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.375, "epoch": 0.7015573142711259, "gen_logits_max": 4.437902450561523, "gen_logits_mean": -14.90294361114502, "gen_logits_min": -27.05561637878418, "gen_logits_std": 2.6736953258514404, "gen_loss": 0.2911771535873413, "grad_norm": 0.4197790016546233, "learning_rate": 2.6590736842105263e-05, "loss": 0.3187, "mean_copy_accuracy": 0.9902398437261581, "mean_gen_accuracy": 0.8677554279565811, "mean_token_accuracy": 0.8923031687736511, "num_tokens": 930772508.0, "sample_num_tokens": 7847.0, "step": 3435, "total_num_tokens": 930803896.0, "z_loss": 0.0006801560521125793 }, { "copy_logits_max": -3.3619911670684814, "copy_logits_min": -687500032.0, "copy_num_tokens": 545.9375, "epoch": 0.7017615522083227, "gen_logits_max": 4.324135780334473, "gen_logits_mean": -14.455193519592285, "gen_logits_min": -26.545148849487305, "gen_logits_std": 2.7165849208831787, "gen_loss": 0.3331158757209778, "grad_norm": 0.4276547860356135, "learning_rate": 2.6589473684210524e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9933899641036987, "mean_gen_accuracy": 0.8604847490787506, "mean_token_accuracy": 0.8921038061380386, "num_tokens": 931036194.0, "sample_num_tokens": 8420.5, "step": 3436, "total_num_tokens": 931069876.0, "z_loss": 0.0007789962110109627 }, { "copy_logits_max": -4.782473087310791, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.5625, "epoch": 0.7019657901455195, "gen_logits_max": 4.843072891235352, "gen_logits_mean": -14.688691139221191, "gen_logits_min": -26.745100021362305, "gen_logits_std": 2.7093324661254883, "gen_loss": 0.3351261019706726, "grad_norm": 0.45370041826084284, "learning_rate": 2.6588210526315792e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9931701123714447, "mean_gen_accuracy": 0.8590734452009201, "mean_token_accuracy": 0.890876054763794, "num_tokens": 931319701.0, "sample_num_tokens": 8016.25, "step": 3437, "total_num_tokens": 931351766.0, "z_loss": 0.0007217008387669921 }, { "copy_logits_max": -3.359550952911377, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.375, "epoch": 0.7021700280827163, "gen_logits_max": 4.725640773773193, "gen_logits_mean": -13.335023880004883, "gen_logits_min": -25.751237869262695, "gen_logits_std": 2.714719772338867, "gen_loss": 0.293794184923172, "grad_norm": 0.42153460383719543, "learning_rate": 2.6586947368421053e-05, "loss": 0.3049, "mean_copy_accuracy": 0.994714766740799, "mean_gen_accuracy": 0.8662473559379578, "mean_token_accuracy": 0.9007826298475266, "num_tokens": 931604018.0, "sample_num_tokens": 8783.0, "step": 3438, "total_num_tokens": 931639150.0, "z_loss": 0.0007332680397666991 }, { "copy_logits_max": -5.346463680267334, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.375, "epoch": 0.7023742660199132, "gen_logits_max": 5.282703399658203, "gen_logits_mean": -14.449623107910156, "gen_logits_min": -26.292566299438477, "gen_logits_std": 2.66056227684021, "gen_loss": 0.3354170024394989, "grad_norm": 0.4047616013263552, "learning_rate": 2.6585684210526317e-05, "loss": 0.2974, "mean_copy_accuracy": 0.9953380674123764, "mean_gen_accuracy": 0.870791956782341, "mean_token_accuracy": 0.9019177258014679, "num_tokens": 931870070.0, "sample_num_tokens": 8376.0, "step": 3439, "total_num_tokens": 931903574.0, "z_loss": 0.0008164889295585454 }, { "copy_logits_max": -5.613792896270752, "copy_logits_min": -750000064.0, "copy_num_tokens": 379.6875, "epoch": 0.7025785039571101, "gen_logits_max": 6.320745944976807, "gen_logits_mean": -12.583831787109375, "gen_logits_min": -24.431346893310547, "gen_logits_std": 2.6842665672302246, "gen_loss": 0.3706357777118683, "grad_norm": 0.43677259280689124, "learning_rate": 2.658442105263158e-05, "loss": 0.3222, "mean_copy_accuracy": 0.9949921816587448, "mean_gen_accuracy": 0.8553895801305771, "mean_token_accuracy": 0.8930992037057877, "num_tokens": 932135977.0, "sample_num_tokens": 8184.75, "step": 3440, "total_num_tokens": 932168716.0, "z_loss": 0.000894324854016304 }, { "copy_logits_max": -6.615694046020508, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.8125, "epoch": 0.7027827418943069, "gen_logits_max": 5.253199577331543, "gen_logits_mean": -12.732255935668945, "gen_logits_min": -25.294103622436523, "gen_logits_std": 2.748670816421509, "gen_loss": 0.30767861008644104, "grad_norm": 0.4414527013795838, "learning_rate": 2.6583157894736842e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9935760945081711, "mean_gen_accuracy": 0.8648577630519867, "mean_token_accuracy": 0.9002810269594193, "num_tokens": 932412770.0, "sample_num_tokens": 9329.0, "step": 3441, "total_num_tokens": 932450086.0, "z_loss": 0.0007130309822969139 }, { "copy_logits_max": -7.870322227478027, "copy_logits_min": -687500032.0, "copy_num_tokens": 327.0, "epoch": 0.7029869798315037, "gen_logits_max": 5.705115795135498, "gen_logits_mean": -14.033480644226074, "gen_logits_min": -26.151437759399414, "gen_logits_std": 2.661637306213379, "gen_loss": 0.28009501099586487, "grad_norm": 0.41785837310073953, "learning_rate": 2.6581894736842107e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9935983270406723, "mean_gen_accuracy": 0.8699917048215866, "mean_token_accuracy": 0.8983055651187897, "num_tokens": 932684180.0, "sample_num_tokens": 8328.0, "step": 3442, "total_num_tokens": 932717492.0, "z_loss": 0.0006279101362451911 }, { "copy_logits_max": -7.125988960266113, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.4375, "epoch": 0.7031912177687005, "gen_logits_max": 5.757381916046143, "gen_logits_mean": -12.637502670288086, "gen_logits_min": -24.706127166748047, "gen_logits_std": 2.6848080158233643, "gen_loss": 0.33288970589637756, "grad_norm": 0.7051631973229028, "learning_rate": 2.6580631578947367e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9949478656053543, "mean_gen_accuracy": 0.8581222593784332, "mean_token_accuracy": 0.8931961506605148, "num_tokens": 932962143.0, "sample_num_tokens": 8720.25, "step": 3443, "total_num_tokens": 932997024.0, "z_loss": 0.0007371273241005838 }, { "copy_logits_max": -5.617654800415039, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.75, "epoch": 0.7033954557058973, "gen_logits_max": 4.9137468338012695, "gen_logits_mean": -14.548802375793457, "gen_logits_min": -26.702056884765625, "gen_logits_std": 2.7071409225463867, "gen_loss": 0.2988733947277069, "grad_norm": 0.4378301984744584, "learning_rate": 2.6579368421052632e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9943189918994904, "mean_gen_accuracy": 0.8737790137529373, "mean_token_accuracy": 0.9026924520730972, "num_tokens": 933232643.0, "sample_num_tokens": 8039.25, "step": 3444, "total_num_tokens": 933264800.0, "z_loss": 0.0006848897319287062 }, { "copy_logits_max": -6.0867919921875, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.125, "epoch": 0.7035996936430942, "gen_logits_max": 5.67560338973999, "gen_logits_mean": -12.868738174438477, "gen_logits_min": -24.512920379638672, "gen_logits_std": 2.7033891677856445, "gen_loss": 0.2985914647579193, "grad_norm": 0.4574012706135882, "learning_rate": 2.6578105263157896e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9948129206895828, "mean_gen_accuracy": 0.8636440187692642, "mean_token_accuracy": 0.8994162976741791, "num_tokens": 933501814.0, "sample_num_tokens": 7795.5, "step": 3445, "total_num_tokens": 933532996.0, "z_loss": 0.0006884156027808785 }, { "copy_logits_max": -6.411890029907227, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.6875, "epoch": 0.7038039315802911, "gen_logits_max": 5.574309825897217, "gen_logits_mean": -13.915868759155273, "gen_logits_min": -25.190671920776367, "gen_logits_std": 2.6708929538726807, "gen_loss": 0.3439271152019501, "grad_norm": 0.4453191474611918, "learning_rate": 2.657684210526316e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9942545145750046, "mean_gen_accuracy": 0.8663731515407562, "mean_token_accuracy": 0.8968432992696762, "num_tokens": 933770043.0, "sample_num_tokens": 7728.25, "step": 3446, "total_num_tokens": 933800956.0, "z_loss": 0.0007107051787897944 }, { "copy_logits_max": -5.321479320526123, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.625, "epoch": 0.7040081695174879, "gen_logits_max": 5.147669315338135, "gen_logits_mean": -14.611820220947266, "gen_logits_min": -26.64679718017578, "gen_logits_std": 2.673370361328125, "gen_loss": 0.3218991458415985, "grad_norm": 0.4044718411535327, "learning_rate": 2.657557894736842e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9949581921100616, "mean_gen_accuracy": 0.8678363263607025, "mean_token_accuracy": 0.8993511646986008, "num_tokens": 934046097.0, "sample_num_tokens": 8045.25, "step": 3447, "total_num_tokens": 934078278.0, "z_loss": 0.0006427335320040584 }, { "copy_logits_max": -5.189752578735352, "copy_logits_min": -750000064.0, "copy_num_tokens": 428.875, "epoch": 0.7042124074546847, "gen_logits_max": 4.198336601257324, "gen_logits_mean": -14.343584060668945, "gen_logits_min": -25.766647338867188, "gen_logits_std": 2.69754695892334, "gen_loss": 0.3095707893371582, "grad_norm": 0.4452029823171762, "learning_rate": 2.6574315789473686e-05, "loss": 0.3373, "mean_copy_accuracy": 0.9949913173913956, "mean_gen_accuracy": 0.8568544238805771, "mean_token_accuracy": 0.8874043822288513, "num_tokens": 934303895.0, "sample_num_tokens": 7379.75, "step": 3448, "total_num_tokens": 934333414.0, "z_loss": 0.0006348425522446632 }, { "copy_logits_max": -4.332684516906738, "copy_logits_min": -687500032.0, "copy_num_tokens": 568.9375, "epoch": 0.7044166453918815, "gen_logits_max": 3.752692222595215, "gen_logits_mean": -15.880036354064941, "gen_logits_min": -27.352771759033203, "gen_logits_std": 2.6716461181640625, "gen_loss": 0.28779417276382446, "grad_norm": 0.39636176831426634, "learning_rate": 2.6573052631578947e-05, "loss": 0.2989, "mean_copy_accuracy": 0.995056077837944, "mean_gen_accuracy": 0.8659501820802689, "mean_token_accuracy": 0.90128393471241, "num_tokens": 934575926.0, "sample_num_tokens": 9144.5, "step": 3449, "total_num_tokens": 934612504.0, "z_loss": 0.0005974473897367716 }, { "copy_logits_max": -4.0508131980896, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.1875, "epoch": 0.7046208833290784, "gen_logits_max": 4.705865859985352, "gen_logits_mean": -13.867097854614258, "gen_logits_min": -25.5621337890625, "gen_logits_std": 2.698809862136841, "gen_loss": 0.2927722930908203, "grad_norm": 0.4816607993430005, "learning_rate": 2.657178947368421e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9943608045578003, "mean_gen_accuracy": 0.8643907010555267, "mean_token_accuracy": 0.8963331282138824, "num_tokens": 934838633.0, "sample_num_tokens": 7924.25, "step": 3450, "total_num_tokens": 934870330.0, "z_loss": 0.0006365389563143253 }, { "copy_logits_max": -7.158585071563721, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.0625, "epoch": 0.7048251212662752, "gen_logits_max": 4.978373050689697, "gen_logits_mean": -14.903189659118652, "gen_logits_min": -26.00621795654297, "gen_logits_std": 2.6784744262695312, "gen_loss": 0.2869986593723297, "grad_norm": 0.4037038823802911, "learning_rate": 2.6570526315789472e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9941059947013855, "mean_gen_accuracy": 0.8742242157459259, "mean_token_accuracy": 0.9011022746562958, "num_tokens": 935095943.0, "sample_num_tokens": 7665.25, "step": 3451, "total_num_tokens": 935126604.0, "z_loss": 0.0005653846892528236 }, { "copy_logits_max": -5.817676544189453, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.375, "epoch": 0.7050293592034721, "gen_logits_max": 5.546218395233154, "gen_logits_mean": -12.316498756408691, "gen_logits_min": -24.215282440185547, "gen_logits_std": 2.6951146125793457, "gen_loss": 0.33145368099212646, "grad_norm": 0.40806805688082165, "learning_rate": 2.6569263157894736e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9948699176311493, "mean_gen_accuracy": 0.8639654070138931, "mean_token_accuracy": 0.8986119478940964, "num_tokens": 935364899.0, "sample_num_tokens": 6619.25, "step": 3452, "total_num_tokens": 935391376.0, "z_loss": 0.0006778566748835146 }, { "copy_logits_max": -4.308789253234863, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.75, "epoch": 0.7052335971406689, "gen_logits_max": 4.206602096557617, "gen_logits_mean": -14.736492156982422, "gen_logits_min": -26.712177276611328, "gen_logits_std": 2.6876120567321777, "gen_loss": 0.28107380867004395, "grad_norm": 0.41061486349257376, "learning_rate": 2.6568000000000004e-05, "loss": 0.2956, "mean_copy_accuracy": 0.9947868138551712, "mean_gen_accuracy": 0.8683904707431793, "mean_token_accuracy": 0.9031448811292648, "num_tokens": 935653049.0, "sample_num_tokens": 9470.75, "step": 3453, "total_num_tokens": 935690932.0, "z_loss": 0.0006106265354901552 }, { "copy_logits_max": -5.742777347564697, "copy_logits_min": -750000064.0, "copy_num_tokens": 518.25, "epoch": 0.7054378350778657, "gen_logits_max": 4.3245086669921875, "gen_logits_mean": -14.580952644348145, "gen_logits_min": -26.185880661010742, "gen_logits_std": 2.6965208053588867, "gen_loss": 0.28072893619537354, "grad_norm": 0.39202248679447876, "learning_rate": 2.6566736842105265e-05, "loss": 0.3056, "mean_copy_accuracy": 0.9944917410612106, "mean_gen_accuracy": 0.8663826882839203, "mean_token_accuracy": 0.8981097489595413, "num_tokens": 935941919.0, "sample_num_tokens": 8354.25, "step": 3454, "total_num_tokens": 935975336.0, "z_loss": 0.0005891983164474368 }, { "copy_logits_max": -4.52366828918457, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.0625, "epoch": 0.7056420730150625, "gen_logits_max": 5.882333755493164, "gen_logits_mean": -13.035679817199707, "gen_logits_min": -24.513423919677734, "gen_logits_std": 2.6604433059692383, "gen_loss": 0.32111844420433044, "grad_norm": 0.4781053078101916, "learning_rate": 2.656547368421053e-05, "loss": 0.3361, "mean_copy_accuracy": 0.9933242946863174, "mean_gen_accuracy": 0.854501411318779, "mean_token_accuracy": 0.8868684023618698, "num_tokens": 936201110.0, "sample_num_tokens": 7780.5, "step": 3455, "total_num_tokens": 936232232.0, "z_loss": 0.0006548117962665856 }, { "copy_logits_max": -4.674373149871826, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.9375, "epoch": 0.7058463109522594, "gen_logits_max": 4.416147708892822, "gen_logits_mean": -14.008472442626953, "gen_logits_min": -25.54368782043457, "gen_logits_std": 2.6906023025512695, "gen_loss": 0.29623299837112427, "grad_norm": 0.38495446308282816, "learning_rate": 2.656421052631579e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9954167902469635, "mean_gen_accuracy": 0.8673099875450134, "mean_token_accuracy": 0.8994289487600327, "num_tokens": 936456934.0, "sample_num_tokens": 8614.5, "step": 3456, "total_num_tokens": 936491392.0, "z_loss": 0.0006521903560496867 }, { "copy_logits_max": -4.797703266143799, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.25, "epoch": 0.7060505488894562, "gen_logits_max": 6.1700358390808105, "gen_logits_mean": -12.45826530456543, "gen_logits_min": -24.02819061279297, "gen_logits_std": 2.6498019695281982, "gen_loss": 0.35337045788764954, "grad_norm": 0.43977581095458734, "learning_rate": 2.6562947368421054e-05, "loss": 0.3463, "mean_copy_accuracy": 0.9949120432138443, "mean_gen_accuracy": 0.8569867759943008, "mean_token_accuracy": 0.8869470208883286, "num_tokens": 936727628.0, "sample_num_tokens": 7849.5, "step": 3457, "total_num_tokens": 936759026.0, "z_loss": 0.000786535267252475 }, { "copy_logits_max": -4.529630661010742, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.9375, "epoch": 0.7062547868266531, "gen_logits_max": 5.20607852935791, "gen_logits_mean": -13.967187881469727, "gen_logits_min": -25.884532928466797, "gen_logits_std": 2.7069883346557617, "gen_loss": 0.28993886709213257, "grad_norm": 0.42422984505705885, "learning_rate": 2.6561684210526315e-05, "loss": 0.3097, "mean_copy_accuracy": 0.994404524564743, "mean_gen_accuracy": 0.8650785833597183, "mean_token_accuracy": 0.8967908471822739, "num_tokens": 936997799.0, "sample_num_tokens": 8414.25, "step": 3458, "total_num_tokens": 937031456.0, "z_loss": 0.0007139267399907112 }, { "copy_logits_max": -5.679169654846191, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 0.7064590247638499, "gen_logits_max": 4.436242580413818, "gen_logits_mean": -15.803625106811523, "gen_logits_min": -27.087635040283203, "gen_logits_std": 2.6645867824554443, "gen_loss": 0.2947594225406647, "grad_norm": 0.3961877827757412, "learning_rate": 2.656042105263158e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9955260157585144, "mean_gen_accuracy": 0.8561562299728394, "mean_token_accuracy": 0.8903543651103973, "num_tokens": 937277153.0, "sample_num_tokens": 7262.25, "step": 3459, "total_num_tokens": 937306202.0, "z_loss": 0.0006818972178734839 }, { "copy_logits_max": -4.094460487365723, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.6875, "epoch": 0.7066632627010467, "gen_logits_max": 6.039013862609863, "gen_logits_mean": -12.63686752319336, "gen_logits_min": -24.22391128540039, "gen_logits_std": 2.630124568939209, "gen_loss": 0.2878953218460083, "grad_norm": 0.43115810682633, "learning_rate": 2.655915789473684e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9949347823858261, "mean_gen_accuracy": 0.8626856803894043, "mean_token_accuracy": 0.893589198589325, "num_tokens": 937536263.0, "sample_num_tokens": 8573.75, "step": 3460, "total_num_tokens": 937570558.0, "z_loss": 0.0007313569076359272 }, { "copy_logits_max": -5.632257461547852, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.3125, "epoch": 0.7068675006382436, "gen_logits_max": 5.394482135772705, "gen_logits_mean": -12.722597122192383, "gen_logits_min": -24.302711486816406, "gen_logits_std": 2.6006717681884766, "gen_loss": 0.32236823439598083, "grad_norm": 0.39803975711885864, "learning_rate": 2.6557894736842108e-05, "loss": 0.3003, "mean_copy_accuracy": 0.995978593826294, "mean_gen_accuracy": 0.8654332607984543, "mean_token_accuracy": 0.8998549282550812, "num_tokens": 937807252.0, "sample_num_tokens": 8540.5, "step": 3461, "total_num_tokens": 937841414.0, "z_loss": 0.0007838160381652415 }, { "copy_logits_max": -5.286154747009277, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.5, "epoch": 0.7070717385754404, "gen_logits_max": 5.612484455108643, "gen_logits_mean": -13.105103492736816, "gen_logits_min": -24.42949104309082, "gen_logits_std": 2.623509407043457, "gen_loss": 0.30358749628067017, "grad_norm": 0.4295668575054313, "learning_rate": 2.655663157894737e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9956371486186981, "mean_gen_accuracy": 0.8637914657592773, "mean_token_accuracy": 0.8987313508987427, "num_tokens": 938086291.0, "sample_num_tokens": 8006.75, "step": 3462, "total_num_tokens": 938118318.0, "z_loss": 0.0007786300266161561 }, { "copy_logits_max": -6.614049911499023, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.25, "epoch": 0.7072759765126372, "gen_logits_max": 6.173618316650391, "gen_logits_mean": -12.499643325805664, "gen_logits_min": -24.130178451538086, "gen_logits_std": 2.574110746383667, "gen_loss": 0.34533756971359253, "grad_norm": 0.41728844745041893, "learning_rate": 2.6555368421052633e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9942744821310043, "mean_gen_accuracy": 0.8638542294502258, "mean_token_accuracy": 0.8953614383935928, "num_tokens": 938362963.0, "sample_num_tokens": 8003.75, "step": 3463, "total_num_tokens": 938394978.0, "z_loss": 0.0008279205649159849 }, { "copy_logits_max": -4.256946563720703, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.9375, "epoch": 0.7074802144498341, "gen_logits_max": 4.000702381134033, "gen_logits_mean": -15.322654724121094, "gen_logits_min": -27.175395965576172, "gen_logits_std": 2.670576810836792, "gen_loss": 0.3306789994239807, "grad_norm": 0.395182685911372, "learning_rate": 2.6554105263157894e-05, "loss": 0.3133, "mean_copy_accuracy": 0.9937128871679306, "mean_gen_accuracy": 0.864826425909996, "mean_token_accuracy": 0.8968636244535446, "num_tokens": 938628001.0, "sample_num_tokens": 8330.25, "step": 3464, "total_num_tokens": 938661322.0, "z_loss": 0.0007327843923121691 }, { "copy_logits_max": -3.4570465087890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.6875, "epoch": 0.7076844523870309, "gen_logits_max": 4.2691497802734375, "gen_logits_mean": -14.674995422363281, "gen_logits_min": -26.840152740478516, "gen_logits_std": 2.7211050987243652, "gen_loss": 0.2695578336715698, "grad_norm": 0.42455884752542566, "learning_rate": 2.655284210526316e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9947259575128555, "mean_gen_accuracy": 0.859528049826622, "mean_token_accuracy": 0.9002024084329605, "num_tokens": 938916390.0, "sample_num_tokens": 10413.0, "step": 3465, "total_num_tokens": 938958042.0, "z_loss": 0.0006189770065248013 }, { "copy_logits_max": -5.5634636878967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.3125, "epoch": 0.7078886903242277, "gen_logits_max": 5.172947883605957, "gen_logits_mean": -13.42151165008545, "gen_logits_min": -25.12151336669922, "gen_logits_std": 2.680319309234619, "gen_loss": 0.3132183849811554, "grad_norm": 0.4143709340602237, "learning_rate": 2.655157894736842e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9948645383119583, "mean_gen_accuracy": 0.8614133894443512, "mean_token_accuracy": 0.8930603712797165, "num_tokens": 939187896.0, "sample_num_tokens": 8573.5, "step": 3466, "total_num_tokens": 939222190.0, "z_loss": 0.0006591464625671506 }, { "copy_logits_max": -5.155753135681152, "copy_logits_min": -687500032.0, "copy_num_tokens": 771.9375, "epoch": 0.7080929282614246, "gen_logits_max": 4.111923694610596, "gen_logits_mean": -14.133058547973633, "gen_logits_min": -26.361217498779297, "gen_logits_std": 2.710279941558838, "gen_loss": 0.25301510095596313, "grad_norm": 0.41528006462267875, "learning_rate": 2.6550315789473684e-05, "loss": 0.302, "mean_copy_accuracy": 0.9947319328784943, "mean_gen_accuracy": 0.8632534444332123, "mean_token_accuracy": 0.8982708752155304, "num_tokens": 939459655.0, "sample_num_tokens": 10606.75, "step": 3467, "total_num_tokens": 939502082.0, "z_loss": 0.0005902881966903806 }, { "copy_logits_max": -2.8525314331054688, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.5, "epoch": 0.7082971661986214, "gen_logits_max": 4.882391929626465, "gen_logits_mean": -14.298196792602539, "gen_logits_min": -25.62188720703125, "gen_logits_std": 2.6557562351226807, "gen_loss": 0.3312199115753174, "grad_norm": 0.40876674184052825, "learning_rate": 2.6549052631578948e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9938800781965256, "mean_gen_accuracy": 0.8603823781013489, "mean_token_accuracy": 0.8889361619949341, "num_tokens": 939740471.0, "sample_num_tokens": 8758.75, "step": 3468, "total_num_tokens": 939775506.0, "z_loss": 0.0006383494473993778 }, { "copy_logits_max": -3.697934150695801, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.5625, "epoch": 0.7085014041358182, "gen_logits_max": 4.218715190887451, "gen_logits_mean": -15.735921859741211, "gen_logits_min": -27.618181228637695, "gen_logits_std": 2.6928305625915527, "gen_loss": 0.3135410249233246, "grad_norm": 0.4063490409814962, "learning_rate": 2.6547789473684212e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9946915954351425, "mean_gen_accuracy": 0.8695974051952362, "mean_token_accuracy": 0.9025420546531677, "num_tokens": 940035597.0, "sample_num_tokens": 10188.75, "step": 3469, "total_num_tokens": 940076352.0, "z_loss": 0.0006498588481917977 }, { "copy_logits_max": -2.526076078414917, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.375, "epoch": 0.708705642073015, "gen_logits_max": 5.526678085327148, "gen_logits_mean": -13.911127090454102, "gen_logits_min": -25.633495330810547, "gen_logits_std": 2.7155728340148926, "gen_loss": 0.3329429030418396, "grad_norm": 0.47346507954184025, "learning_rate": 2.6546526315789477e-05, "loss": 0.3096, "mean_copy_accuracy": 0.9942213743925095, "mean_gen_accuracy": 0.864310085773468, "mean_token_accuracy": 0.8970104157924652, "num_tokens": 940328610.0, "sample_num_tokens": 8109.0, "step": 3470, "total_num_tokens": 940361046.0, "z_loss": 0.0006801068084314466 }, { "copy_logits_max": -4.92301607131958, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.8125, "epoch": 0.7089098800102119, "gen_logits_max": 4.803584098815918, "gen_logits_mean": -15.029641151428223, "gen_logits_min": -26.481578826904297, "gen_logits_std": 2.691481113433838, "gen_loss": 0.30434849858283997, "grad_norm": 0.3945994775576478, "learning_rate": 2.6545263157894738e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9950974583625793, "mean_gen_accuracy": 0.866102397441864, "mean_token_accuracy": 0.8993921875953674, "num_tokens": 940605647.0, "sample_num_tokens": 9053.25, "step": 3471, "total_num_tokens": 940641860.0, "z_loss": 0.0007290862267836928 }, { "copy_logits_max": -4.315116882324219, "copy_logits_min": -750000064.0, "copy_num_tokens": 437.1875, "epoch": 0.7091141179474088, "gen_logits_max": 5.039546966552734, "gen_logits_mean": -14.027854919433594, "gen_logits_min": -26.671512603759766, "gen_logits_std": 2.7581987380981445, "gen_loss": 0.25520414113998413, "grad_norm": 0.40299860990910574, "learning_rate": 2.6544000000000002e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9951293915510178, "mean_gen_accuracy": 0.8697111308574677, "mean_token_accuracy": 0.9004717171192169, "num_tokens": 940891565.0, "sample_num_tokens": 8018.25, "step": 3472, "total_num_tokens": 940923638.0, "z_loss": 0.0006114768330007792 }, { "copy_logits_max": -4.606884479522705, "copy_logits_min": -750000064.0, "copy_num_tokens": 613.5, "epoch": 0.7093183558846056, "gen_logits_max": 4.47092342376709, "gen_logits_mean": -15.93912124633789, "gen_logits_min": -27.22048568725586, "gen_logits_std": 2.680020332336426, "gen_loss": 0.32889413833618164, "grad_norm": 0.422015138767747, "learning_rate": 2.6542736842105263e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9948436468839645, "mean_gen_accuracy": 0.8597063273191452, "mean_token_accuracy": 0.8900002688169479, "num_tokens": 941169295.0, "sample_num_tokens": 9584.75, "step": 3473, "total_num_tokens": 941207634.0, "z_loss": 0.0007457836763933301 }, { "copy_logits_max": -5.949135780334473, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.5625, "epoch": 0.7095225938218024, "gen_logits_max": 5.679346084594727, "gen_logits_mean": -14.30255126953125, "gen_logits_min": -26.016422271728516, "gen_logits_std": 2.677556276321411, "gen_loss": 0.33257195353507996, "grad_norm": 0.6498150450660938, "learning_rate": 2.6541473684210527e-05, "loss": 0.3474, "mean_copy_accuracy": 0.993122398853302, "mean_gen_accuracy": 0.857418566942215, "mean_token_accuracy": 0.8863936066627502, "num_tokens": 941422909.0, "sample_num_tokens": 9613.75, "step": 3474, "total_num_tokens": 941461364.0, "z_loss": 0.0007598306401632726 }, { "copy_logits_max": -3.7717273235321045, "copy_logits_min": -750000064.0, "copy_num_tokens": 547.125, "epoch": 0.7097268317589992, "gen_logits_max": 5.600773334503174, "gen_logits_mean": -12.913349151611328, "gen_logits_min": -25.316654205322266, "gen_logits_std": 2.711264133453369, "gen_loss": 0.31387966871261597, "grad_norm": 0.8415438408290429, "learning_rate": 2.6540210526315788e-05, "loss": 0.3116, "mean_copy_accuracy": 0.9942470192909241, "mean_gen_accuracy": 0.8670196086168289, "mean_token_accuracy": 0.896616667509079, "num_tokens": 941685960.0, "sample_num_tokens": 8790.5, "step": 3475, "total_num_tokens": 941721122.0, "z_loss": 0.0007886899402365088 }, { "copy_logits_max": -4.338913917541504, "copy_logits_min": -687500032.0, "copy_num_tokens": 351.6875, "epoch": 0.709931069696196, "gen_logits_max": 6.368677139282227, "gen_logits_mean": -13.298604965209961, "gen_logits_min": -25.762619018554688, "gen_logits_std": 2.7292490005493164, "gen_loss": 0.3468359708786011, "grad_norm": 0.48990920864096665, "learning_rate": 2.6538947368421052e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9943762868642807, "mean_gen_accuracy": 0.8653193265199661, "mean_token_accuracy": 0.8985873758792877, "num_tokens": 941972517.0, "sample_num_tokens": 7896.75, "step": 3476, "total_num_tokens": 942004104.0, "z_loss": 0.0007889048429206014 }, { "copy_logits_max": -4.355323791503906, "copy_logits_min": -687500032.0, "copy_num_tokens": 466.6875, "epoch": 0.7101353076333929, "gen_logits_max": 4.979581832885742, "gen_logits_mean": -15.195747375488281, "gen_logits_min": -26.621803283691406, "gen_logits_std": 2.6848597526550293, "gen_loss": 0.3295435905456543, "grad_norm": 0.46344704991417945, "learning_rate": 2.6537684210526317e-05, "loss": 0.331, "mean_copy_accuracy": 0.9938345551490784, "mean_gen_accuracy": 0.8565081506967545, "mean_token_accuracy": 0.8916617929935455, "num_tokens": 942242489.0, "sample_num_tokens": 8604.75, "step": 3477, "total_num_tokens": 942276908.0, "z_loss": 0.000643707811832428 }, { "copy_logits_max": -5.093229293823242, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.25, "epoch": 0.7103395455705898, "gen_logits_max": 4.862975597381592, "gen_logits_mean": -15.677604675292969, "gen_logits_min": -27.028371810913086, "gen_logits_std": 2.6739907264709473, "gen_loss": 0.3193090558052063, "grad_norm": 0.41763713344949016, "learning_rate": 2.653642105263158e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9940609037876129, "mean_gen_accuracy": 0.859164759516716, "mean_token_accuracy": 0.8873666226863861, "num_tokens": 942524423.0, "sample_num_tokens": 7719.25, "step": 3478, "total_num_tokens": 942555300.0, "z_loss": 0.0007035174639895558 }, { "copy_logits_max": -3.4605588912963867, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.0625, "epoch": 0.7105437835077866, "gen_logits_max": 5.578961372375488, "gen_logits_mean": -12.970595359802246, "gen_logits_min": -25.16605567932129, "gen_logits_std": 2.716787338256836, "gen_loss": 0.3303379416465759, "grad_norm": 0.5312319945968778, "learning_rate": 2.6535157894736842e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9934538751840591, "mean_gen_accuracy": 0.8562683761119843, "mean_token_accuracy": 0.889097586274147, "num_tokens": 942792095.0, "sample_num_tokens": 8287.75, "step": 3479, "total_num_tokens": 942825246.0, "z_loss": 0.0008018937660381198 }, { "copy_logits_max": -3.511049747467041, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.9375, "epoch": 0.7107480214449834, "gen_logits_max": 4.610044479370117, "gen_logits_mean": -14.52271842956543, "gen_logits_min": -26.12106704711914, "gen_logits_std": 2.695277214050293, "gen_loss": 0.3111259937286377, "grad_norm": 0.6156082181259257, "learning_rate": 2.6533894736842106e-05, "loss": 0.3203, "mean_copy_accuracy": 0.9930778741836548, "mean_gen_accuracy": 0.869136169552803, "mean_token_accuracy": 0.8927036821842194, "num_tokens": 943034701.0, "sample_num_tokens": 7732.25, "step": 3480, "total_num_tokens": 943065630.0, "z_loss": 0.0007235086522996426 }, { "copy_logits_max": -4.686227798461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.375, "epoch": 0.7109522593821802, "gen_logits_max": 5.179123878479004, "gen_logits_mean": -15.5006103515625, "gen_logits_min": -26.870235443115234, "gen_logits_std": 2.6410374641418457, "gen_loss": 0.3394601345062256, "grad_norm": 0.4489299550079759, "learning_rate": 2.653263157894737e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9951074868440628, "mean_gen_accuracy": 0.8651643693447113, "mean_token_accuracy": 0.8934648633003235, "num_tokens": 943280924.0, "sample_num_tokens": 7346.5, "step": 3481, "total_num_tokens": 943310310.0, "z_loss": 0.0007243806030601263 }, { "copy_logits_max": -3.789400339126587, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.4375, "epoch": 0.711156497319377, "gen_logits_max": 5.111515998840332, "gen_logits_mean": -14.461694717407227, "gen_logits_min": -26.084270477294922, "gen_logits_std": 2.702622413635254, "gen_loss": 0.3307252526283264, "grad_norm": 0.4581194983516588, "learning_rate": 2.653136842105263e-05, "loss": 0.2897, "mean_copy_accuracy": 0.9929471164941788, "mean_gen_accuracy": 0.8709161281585693, "mean_token_accuracy": 0.90242700278759, "num_tokens": 943556807.0, "sample_num_tokens": 7740.75, "step": 3482, "total_num_tokens": 943587770.0, "z_loss": 0.0007339486037380993 }, { "copy_logits_max": -2.683061122894287, "copy_logits_min": -750000000.0, "copy_num_tokens": 651.375, "epoch": 0.711360735256574, "gen_logits_max": 3.9979891777038574, "gen_logits_mean": -15.380880355834961, "gen_logits_min": -27.3765811920166, "gen_logits_std": 2.7128093242645264, "gen_loss": 0.27473893761634827, "grad_norm": 0.4293600223831869, "learning_rate": 2.6530105263157896e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9936193376779556, "mean_gen_accuracy": 0.8621286153793335, "mean_token_accuracy": 0.894547700881958, "num_tokens": 943817692.0, "sample_num_tokens": 9343.0, "step": 3483, "total_num_tokens": 943855064.0, "z_loss": 0.0006237453781068325 }, { "copy_logits_max": -3.2166786193847656, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.8125, "epoch": 0.7115649731937708, "gen_logits_max": 5.26780891418457, "gen_logits_mean": -14.852815628051758, "gen_logits_min": -26.919734954833984, "gen_logits_std": 2.7089767456054688, "gen_loss": 0.323358416557312, "grad_norm": 0.4150218259704317, "learning_rate": 2.6528842105263157e-05, "loss": 0.3172, "mean_copy_accuracy": 0.994754284620285, "mean_gen_accuracy": 0.8632987886667252, "mean_token_accuracy": 0.8951723128557205, "num_tokens": 944095478.0, "sample_num_tokens": 7502.5, "step": 3484, "total_num_tokens": 944125488.0, "z_loss": 0.0007223867578431964 }, { "copy_logits_max": -2.7578346729278564, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.4375, "epoch": 0.7117692111309676, "gen_logits_max": 4.690936088562012, "gen_logits_mean": -14.335034370422363, "gen_logits_min": -26.126178741455078, "gen_logits_std": 2.6933486461639404, "gen_loss": 0.3239430785179138, "grad_norm": 0.44342651366003394, "learning_rate": 2.652757894736842e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9948517382144928, "mean_gen_accuracy": 0.8639288544654846, "mean_token_accuracy": 0.8955731093883514, "num_tokens": 944362881.0, "sample_num_tokens": 8879.25, "step": 3485, "total_num_tokens": 944398398.0, "z_loss": 0.0007305759936571121 }, { "copy_logits_max": -3.0762746334075928, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.875, "epoch": 0.7119734490681644, "gen_logits_max": 5.535905838012695, "gen_logits_mean": -13.736263275146484, "gen_logits_min": -25.594676971435547, "gen_logits_std": 2.6320154666900635, "gen_loss": 0.3541165292263031, "grad_norm": 0.480358382547791, "learning_rate": 2.6526315789473685e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9940725117921829, "mean_gen_accuracy": 0.861147090792656, "mean_token_accuracy": 0.8919972777366638, "num_tokens": 944638945.0, "sample_num_tokens": 7779.25, "step": 3486, "total_num_tokens": 944670062.0, "z_loss": 0.0007683064904995263 }, { "copy_logits_max": -5.312968730926514, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.875, "epoch": 0.7121776870053612, "gen_logits_max": 5.214402198791504, "gen_logits_mean": -14.06230640411377, "gen_logits_min": -25.885860443115234, "gen_logits_std": 2.662515163421631, "gen_loss": 0.3162420690059662, "grad_norm": 0.4556071634990751, "learning_rate": 2.652505263157895e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9938665181398392, "mean_gen_accuracy": 0.8652266561985016, "mean_token_accuracy": 0.896465927362442, "num_tokens": 944897571.0, "sample_num_tokens": 7284.25, "step": 3487, "total_num_tokens": 944926708.0, "z_loss": 0.0006456418195739388 }, { "copy_logits_max": -2.985227584838867, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.8125, "epoch": 0.712381924942558, "gen_logits_max": 5.199576377868652, "gen_logits_mean": -12.993061065673828, "gen_logits_min": -24.759159088134766, "gen_logits_std": 2.6989707946777344, "gen_loss": 0.3074282705783844, "grad_norm": 0.4205101082236674, "learning_rate": 2.652378947368421e-05, "loss": 0.3214, "mean_copy_accuracy": 0.9939371794462204, "mean_gen_accuracy": 0.864350751042366, "mean_token_accuracy": 0.8925109654664993, "num_tokens": 945160359.0, "sample_num_tokens": 10162.25, "step": 3488, "total_num_tokens": 945201008.0, "z_loss": 0.0006472617969848216 }, { "copy_logits_max": -2.7176334857940674, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.0, "epoch": 0.712586162879755, "gen_logits_max": 5.289973735809326, "gen_logits_mean": -13.273107528686523, "gen_logits_min": -25.044235229492188, "gen_logits_std": 2.6682724952697754, "gen_loss": 0.31946247816085815, "grad_norm": 0.44451375117566694, "learning_rate": 2.6522526315789475e-05, "loss": 0.2974, "mean_copy_accuracy": 0.99432772397995, "mean_gen_accuracy": 0.8674557507038116, "mean_token_accuracy": 0.9018732607364655, "num_tokens": 945438500.0, "sample_num_tokens": 7994.5, "step": 3489, "total_num_tokens": 945470478.0, "z_loss": 0.0007004916551522911 }, { "copy_logits_max": -3.919348955154419, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.375, "epoch": 0.7127904008169518, "gen_logits_max": 5.109643936157227, "gen_logits_mean": -13.6392183303833, "gen_logits_min": -24.80042266845703, "gen_logits_std": 2.6082801818847656, "gen_loss": 0.3099040687084198, "grad_norm": 0.4232305850632594, "learning_rate": 2.6521263157894736e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9956786930561066, "mean_gen_accuracy": 0.8655774593353271, "mean_token_accuracy": 0.9019953012466431, "num_tokens": 945730028.0, "sample_num_tokens": 8722.5, "step": 3490, "total_num_tokens": 945764918.0, "z_loss": 0.0006867226911708713 }, { "copy_logits_max": -5.028321743011475, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.6875, "epoch": 0.7129946387541486, "gen_logits_max": 5.064675331115723, "gen_logits_mean": -14.692543983459473, "gen_logits_min": -26.673049926757812, "gen_logits_std": 2.718592882156372, "gen_loss": 0.35260385274887085, "grad_norm": 0.4359770571171945, "learning_rate": 2.652e-05, "loss": 0.3511, "mean_copy_accuracy": 0.9940802901983261, "mean_gen_accuracy": 0.8579207807779312, "mean_token_accuracy": 0.8854379951953888, "num_tokens": 945984178.0, "sample_num_tokens": 6915.0, "step": 3491, "total_num_tokens": 946011838.0, "z_loss": 0.0007362932665273547 }, { "copy_logits_max": -5.380413055419922, "copy_logits_min": -750000064.0, "copy_num_tokens": 444.3125, "epoch": 0.7131988766913454, "gen_logits_max": 4.739095687866211, "gen_logits_mean": -15.3095703125, "gen_logits_min": -26.927833557128906, "gen_logits_std": 2.682889223098755, "gen_loss": 0.3128228187561035, "grad_norm": 0.4460808573351448, "learning_rate": 2.651873684210526e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9947545975446701, "mean_gen_accuracy": 0.8641934394836426, "mean_token_accuracy": 0.8968496173620224, "num_tokens": 946241155.0, "sample_num_tokens": 8501.75, "step": 3492, "total_num_tokens": 946275162.0, "z_loss": 0.0006230277940630913 }, { "copy_logits_max": -5.246346950531006, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.375, "epoch": 0.7134031146285422, "gen_logits_max": 4.528989791870117, "gen_logits_mean": -14.878028869628906, "gen_logits_min": -26.904932022094727, "gen_logits_std": 2.676276922225952, "gen_loss": 0.3005993664264679, "grad_norm": 0.4744853127802855, "learning_rate": 2.6517473684210525e-05, "loss": 0.315, "mean_copy_accuracy": 0.994721308350563, "mean_gen_accuracy": 0.8636335283517838, "mean_token_accuracy": 0.8974685072898865, "num_tokens": 946518014.0, "sample_num_tokens": 7999.5, "step": 3493, "total_num_tokens": 946550012.0, "z_loss": 0.000640499813016504 }, { "copy_logits_max": -6.289493083953857, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.375, "epoch": 0.713607352565739, "gen_logits_max": 4.8507585525512695, "gen_logits_mean": -14.042617797851562, "gen_logits_min": -26.063785552978516, "gen_logits_std": 2.678865671157837, "gen_loss": 0.27860334515571594, "grad_norm": 0.42023882381855954, "learning_rate": 2.6516210526315793e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9954485297203064, "mean_gen_accuracy": 0.86391282081604, "mean_token_accuracy": 0.8982354253530502, "num_tokens": 946797292.0, "sample_num_tokens": 7954.0, "step": 3494, "total_num_tokens": 946829108.0, "z_loss": 0.0005661908071488142 }, { "copy_logits_max": -3.99821138381958, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.625, "epoch": 0.713811590502936, "gen_logits_max": 4.374917030334473, "gen_logits_mean": -15.277528762817383, "gen_logits_min": -26.91020393371582, "gen_logits_std": 2.692857503890991, "gen_loss": 0.3411540389060974, "grad_norm": 0.44157454441785343, "learning_rate": 2.6514947368421054e-05, "loss": 0.3478, "mean_copy_accuracy": 0.9949340671300888, "mean_gen_accuracy": 0.8527664840221405, "mean_token_accuracy": 0.8844391256570816, "num_tokens": 947055362.0, "sample_num_tokens": 8349.0, "step": 3495, "total_num_tokens": 947088758.0, "z_loss": 0.0007008719258010387 }, { "copy_logits_max": -5.024377822875977, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.375, "epoch": 0.7140158284401328, "gen_logits_max": 4.482924461364746, "gen_logits_mean": -14.631986618041992, "gen_logits_min": -26.689374923706055, "gen_logits_std": 2.724557399749756, "gen_loss": 0.3162982761859894, "grad_norm": 0.4731005105448188, "learning_rate": 2.651368421052632e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9940678030252457, "mean_gen_accuracy": 0.8645064681768417, "mean_token_accuracy": 0.8951964825391769, "num_tokens": 947326415.0, "sample_num_tokens": 8455.75, "step": 3496, "total_num_tokens": 947360238.0, "z_loss": 0.0005968725308775902 }, { "copy_logits_max": -3.9449734687805176, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.5, "epoch": 0.7142200663773296, "gen_logits_max": 4.823294639587402, "gen_logits_mean": -15.166261672973633, "gen_logits_min": -26.74374008178711, "gen_logits_std": 2.698957920074463, "gen_loss": 0.3566969931125641, "grad_norm": 0.4366075078161212, "learning_rate": 2.651242105263158e-05, "loss": 0.3381, "mean_copy_accuracy": 0.9954660534858704, "mean_gen_accuracy": 0.8537412136793137, "mean_token_accuracy": 0.8882188647985458, "num_tokens": 947596146.0, "sample_num_tokens": 8388.0, "step": 3497, "total_num_tokens": 947629698.0, "z_loss": 0.0007497949991375208 }, { "copy_logits_max": -1.4786889553070068, "copy_logits_min": -750000064.0, "copy_num_tokens": 374.125, "epoch": 0.7144243043145264, "gen_logits_max": 5.8258209228515625, "gen_logits_mean": -13.769872665405273, "gen_logits_min": -25.518999099731445, "gen_logits_std": 2.6971538066864014, "gen_loss": 0.34181439876556396, "grad_norm": 0.4458677321432078, "learning_rate": 2.6511157894736844e-05, "loss": 0.3247, "mean_copy_accuracy": 0.9941631406545639, "mean_gen_accuracy": 0.8594775050878525, "mean_token_accuracy": 0.8921755999326706, "num_tokens": 947868174.0, "sample_num_tokens": 8090.0, "step": 3498, "total_num_tokens": 947900534.0, "z_loss": 0.0008087779278866947 }, { "copy_logits_max": -4.6030731201171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.75, "epoch": 0.7146285422517232, "gen_logits_max": 5.5591139793396, "gen_logits_mean": -13.685440063476562, "gen_logits_min": -25.43231773376465, "gen_logits_std": 2.678316593170166, "gen_loss": 0.32389774918556213, "grad_norm": 0.42053375306763463, "learning_rate": 2.6509894736842105e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9953875243663788, "mean_gen_accuracy": 0.8572683930397034, "mean_token_accuracy": 0.8963107168674469, "num_tokens": 948145140.0, "sample_num_tokens": 8517.0, "step": 3499, "total_num_tokens": 948179208.0, "z_loss": 0.0007175140781328082 }, { "epoch": 0.71483278018892, "grad_norm": 0.4227026200210539, "learning_rate": 2.650863157894737e-05, "loss": 0.3008, "step": 3500 }, { "epoch": 0.71483278018892, "eval_copy_logits_max": -5.394382476806641, "eval_copy_logits_min": -77.31293487548828, "eval_gen_logits_max": 4.527896881103516, "eval_gen_logits_mean": -18.336315155029297, "eval_gen_logits_min": -29.050193786621094, "eval_gen_logits_std": 2.634082317352295, "eval_gen_loss": 0.3574363589286804, "eval_loss": 0.33803266286849976, "eval_mean_copy_accuracy": 0.9902251958847046, "eval_mean_gen_accuracy": 0.8701052367687225, "eval_mean_token_accuracy": 0.8854676187038422, "eval_num_tokens": 948454778.0, "eval_runtime": 0.7582, "eval_samples_per_second": 10.552, "eval_steps_per_second": 2.638, "eval_total_num_tokens": 948454778.0, "eval_z_loss": 0.0007080455543473363, "step": 3500 }, { "copy_logits_max": -2.276925802230835, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.6875, "epoch": 0.715037018126117, "gen_logits_max": 4.219297409057617, "gen_logits_mean": -15.358402252197266, "gen_logits_min": -27.422256469726562, "gen_logits_std": 2.719839334487915, "gen_loss": 0.30252590775489807, "grad_norm": 0.4672814901727264, "learning_rate": 2.650736842105263e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9933265969157219, "mean_gen_accuracy": 0.865325354039669, "mean_token_accuracy": 0.8966019377112389, "num_tokens": 948678719.0, "sample_num_tokens": 8074.75, "step": 3501, "total_num_tokens": 948711018.0, "z_loss": 0.0007206961745396256 }, { "copy_logits_max": -2.118767261505127, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.875, "epoch": 0.7152412560633138, "gen_logits_max": 4.889564514160156, "gen_logits_mean": -13.903326988220215, "gen_logits_min": -25.537675857543945, "gen_logits_std": 2.7232627868652344, "gen_loss": 0.27575474977493286, "grad_norm": 0.3973205206115736, "learning_rate": 2.6506105263157897e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9955122470855713, "mean_gen_accuracy": 0.871636226773262, "mean_token_accuracy": 0.9057501256465912, "num_tokens": 948967368.0, "sample_num_tokens": 9002.5, "step": 3502, "total_num_tokens": 949003378.0, "z_loss": 0.0006824383744969964 }, { "copy_logits_max": -1.8783469200134277, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.8125, "epoch": 0.7154454940005106, "gen_logits_max": 4.976993083953857, "gen_logits_mean": -12.973143577575684, "gen_logits_min": -24.835554122924805, "gen_logits_std": 2.6995129585266113, "gen_loss": 0.34285709261894226, "grad_norm": 0.4045756074410364, "learning_rate": 2.650484210526316e-05, "loss": 0.3081, "mean_copy_accuracy": 0.9955119490623474, "mean_gen_accuracy": 0.8611710220575333, "mean_token_accuracy": 0.8994982540607452, "num_tokens": 949259342.0, "sample_num_tokens": 9470.0, "step": 3503, "total_num_tokens": 949297222.0, "z_loss": 0.0007930909050628543 }, { "copy_logits_max": -3.4236574172973633, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.125, "epoch": 0.7156497319377074, "gen_logits_max": 5.392917633056641, "gen_logits_mean": -13.585277557373047, "gen_logits_min": -25.5745849609375, "gen_logits_std": 2.7520437240600586, "gen_loss": 0.2759721875190735, "grad_norm": 0.4059080268020917, "learning_rate": 2.6503578947368423e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9940570741891861, "mean_gen_accuracy": 0.8624840527772903, "mean_token_accuracy": 0.8960945159196854, "num_tokens": 949551058.0, "sample_num_tokens": 8770.0, "step": 3504, "total_num_tokens": 949586138.0, "z_loss": 0.0006477964343503118 }, { "copy_logits_max": -5.112118721008301, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.125, "epoch": 0.7158539698749042, "gen_logits_max": 6.202757358551025, "gen_logits_mean": -13.613014221191406, "gen_logits_min": -25.095102310180664, "gen_logits_std": 2.6951966285705566, "gen_loss": 0.32487785816192627, "grad_norm": 0.4963746558607362, "learning_rate": 2.6502315789473684e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9931455552577972, "mean_gen_accuracy": 0.8673980683088303, "mean_token_accuracy": 0.8978423476219177, "num_tokens": 949805441.0, "sample_num_tokens": 7173.25, "step": 3505, "total_num_tokens": 949834134.0, "z_loss": 0.0007507956470362842 }, { "copy_logits_max": -5.284148693084717, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.6875, "epoch": 0.716058207812101, "gen_logits_max": 6.355111122131348, "gen_logits_mean": -13.666943550109863, "gen_logits_min": -25.01697540283203, "gen_logits_std": 2.6418707370758057, "gen_loss": 0.32369208335876465, "grad_norm": 0.42267755152670716, "learning_rate": 2.6501052631578948e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9941378235816956, "mean_gen_accuracy": 0.8724780678749084, "mean_token_accuracy": 0.8967662900686264, "num_tokens": 950053122.0, "sample_num_tokens": 8117.0, "step": 3506, "total_num_tokens": 950085590.0, "z_loss": 0.0007180642569437623 }, { "copy_logits_max": -2.6898956298828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.25, "epoch": 0.716262445749298, "gen_logits_max": 4.515959739685059, "gen_logits_mean": -14.81300163269043, "gen_logits_min": -26.513992309570312, "gen_logits_std": 2.68928861618042, "gen_loss": 0.32455015182495117, "grad_norm": 0.4462103305328867, "learning_rate": 2.6499789473684212e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9955762177705765, "mean_gen_accuracy": 0.8673661649227142, "mean_token_accuracy": 0.8966815173625946, "num_tokens": 950309312.0, "sample_num_tokens": 7399.5, "step": 3507, "total_num_tokens": 950338910.0, "z_loss": 0.000729061895981431 }, { "copy_logits_max": -4.728271484375, "copy_logits_min": -687500032.0, "copy_num_tokens": 358.0625, "epoch": 0.7164666836864948, "gen_logits_max": 5.137794494628906, "gen_logits_mean": -14.324153900146484, "gen_logits_min": -25.59669303894043, "gen_logits_std": 2.6621623039245605, "gen_loss": 0.32950490713119507, "grad_norm": 0.3917801450743825, "learning_rate": 2.6498526315789473e-05, "loss": 0.3193, "mean_copy_accuracy": 0.9950806200504303, "mean_gen_accuracy": 0.8642957210540771, "mean_token_accuracy": 0.8936222642660141, "num_tokens": 950596099.0, "sample_num_tokens": 8307.75, "step": 3508, "total_num_tokens": 950629330.0, "z_loss": 0.0007236084784381092 }, { "copy_logits_max": -4.225439071655273, "copy_logits_min": -750000064.0, "copy_num_tokens": 443.8125, "epoch": 0.7166709216236916, "gen_logits_max": 5.252480506896973, "gen_logits_mean": -13.399138450622559, "gen_logits_min": -25.260929107666016, "gen_logits_std": 2.7038121223449707, "gen_loss": 0.2864416837692261, "grad_norm": 0.4815683410691575, "learning_rate": 2.6497263157894737e-05, "loss": 0.296, "mean_copy_accuracy": 0.9940285533666611, "mean_gen_accuracy": 0.8651814311742783, "mean_token_accuracy": 0.9003272950649261, "num_tokens": 950862170.0, "sample_num_tokens": 7489.5, "step": 3509, "total_num_tokens": 950892128.0, "z_loss": 0.0006632297299802303 }, { "copy_logits_max": -2.5795493125915527, "copy_logits_min": -687500032.0, "copy_num_tokens": 590.375, "epoch": 0.7168751595608884, "gen_logits_max": 4.941044807434082, "gen_logits_mean": -13.4384126663208, "gen_logits_min": -25.106277465820312, "gen_logits_std": 2.713454246520996, "gen_loss": 0.2899853587150574, "grad_norm": 0.39222435695132757, "learning_rate": 2.6496000000000002e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9946389347314835, "mean_gen_accuracy": 0.868561714887619, "mean_token_accuracy": 0.8972695171833038, "num_tokens": 951112771.0, "sample_num_tokens": 9735.25, "step": 3510, "total_num_tokens": 951151712.0, "z_loss": 0.0006485702469944954 }, { "copy_logits_max": -5.382847309112549, "copy_logits_min": -625000064.0, "copy_num_tokens": 350.8125, "epoch": 0.7170793974980852, "gen_logits_max": 5.143692970275879, "gen_logits_mean": -14.156349182128906, "gen_logits_min": -25.971982955932617, "gen_logits_std": 2.7209973335266113, "gen_loss": 0.32071393728256226, "grad_norm": 0.46227560573844745, "learning_rate": 2.6494736842105266e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9950191825628281, "mean_gen_accuracy": 0.8676531612873077, "mean_token_accuracy": 0.8955542743206024, "num_tokens": 951381477.0, "sample_num_tokens": 7296.75, "step": 3511, "total_num_tokens": 951410664.0, "z_loss": 0.000781092734541744 }, { "copy_logits_max": -2.9864540100097656, "copy_logits_min": -687500032.0, "copy_num_tokens": 488.6875, "epoch": 0.7172836354352821, "gen_logits_max": 5.222692489624023, "gen_logits_mean": -13.259866714477539, "gen_logits_min": -25.211578369140625, "gen_logits_std": 2.705599308013916, "gen_loss": 0.33218204975128174, "grad_norm": 0.5182651459347456, "learning_rate": 2.6493473684210527e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9947454035282135, "mean_gen_accuracy": 0.8651274591684341, "mean_token_accuracy": 0.8986235558986664, "num_tokens": 951631138.0, "sample_num_tokens": 8456.0, "step": 3512, "total_num_tokens": 951664962.0, "z_loss": 0.000762730953283608 }, { "copy_logits_max": -4.320342063903809, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.25, "epoch": 0.717487873372479, "gen_logits_max": 5.4970879554748535, "gen_logits_mean": -13.721704483032227, "gen_logits_min": -25.20641326904297, "gen_logits_std": 2.6799752712249756, "gen_loss": 0.36243483424186707, "grad_norm": 0.42119700229214324, "learning_rate": 2.649221052631579e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9948619604110718, "mean_gen_accuracy": 0.863994762301445, "mean_token_accuracy": 0.8949975818395615, "num_tokens": 951905272.0, "sample_num_tokens": 8090.0, "step": 3513, "total_num_tokens": 951937632.0, "z_loss": 0.0007485650712624192 }, { "copy_logits_max": -2.365957736968994, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.0625, "epoch": 0.7176921113096758, "gen_logits_max": 5.039834022521973, "gen_logits_mean": -14.180383682250977, "gen_logits_min": -25.97551155090332, "gen_logits_std": 2.7021312713623047, "gen_loss": 0.3465777635574341, "grad_norm": 0.41234542732672996, "learning_rate": 2.6490947368421052e-05, "loss": 0.3419, "mean_copy_accuracy": 0.9934659451246262, "mean_gen_accuracy": 0.8568620532751083, "mean_token_accuracy": 0.8884478211402893, "num_tokens": 952164304.0, "sample_num_tokens": 7608.0, "step": 3514, "total_num_tokens": 952194736.0, "z_loss": 0.0006983681814745069 }, { "copy_logits_max": -4.36635160446167, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.6875, "epoch": 0.7178963492468726, "gen_logits_max": 5.0389404296875, "gen_logits_mean": -14.754425048828125, "gen_logits_min": -26.622539520263672, "gen_logits_std": 2.6885862350463867, "gen_loss": 0.37976810336112976, "grad_norm": 0.4419252613951862, "learning_rate": 2.6489684210526317e-05, "loss": 0.3377, "mean_copy_accuracy": 0.9943190068006516, "mean_gen_accuracy": 0.8571300059556961, "mean_token_accuracy": 0.8899430185556412, "num_tokens": 952422773.0, "sample_num_tokens": 7827.75, "step": 3515, "total_num_tokens": 952454084.0, "z_loss": 0.0007908102707006037 }, { "copy_logits_max": -4.968994617462158, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.1875, "epoch": 0.7181005871840694, "gen_logits_max": 5.23556661605835, "gen_logits_mean": -13.139533996582031, "gen_logits_min": -24.964427947998047, "gen_logits_std": 2.7188611030578613, "gen_loss": 0.33278703689575195, "grad_norm": 0.3937210162040469, "learning_rate": 2.6488421052631577e-05, "loss": 0.3067, "mean_copy_accuracy": 0.994954839348793, "mean_gen_accuracy": 0.8647888153791428, "mean_token_accuracy": 0.8985218554735184, "num_tokens": 952698954.0, "sample_num_tokens": 7846.0, "step": 3516, "total_num_tokens": 952730338.0, "z_loss": 0.0007111956365406513 }, { "copy_logits_max": -4.754637718200684, "copy_logits_min": -750000064.0, "copy_num_tokens": 345.375, "epoch": 0.7183048251212663, "gen_logits_max": 5.042662620544434, "gen_logits_mean": -14.75796127319336, "gen_logits_min": -26.439739227294922, "gen_logits_std": 2.6993064880371094, "gen_loss": 0.3053598403930664, "grad_norm": 0.44118227319311887, "learning_rate": 2.6487157894736842e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9927530884742737, "mean_gen_accuracy": 0.8685369789600372, "mean_token_accuracy": 0.8965973556041718, "num_tokens": 952963980.0, "sample_num_tokens": 8123.0, "step": 3517, "total_num_tokens": 952996472.0, "z_loss": 0.0006510091479867697 }, { "copy_logits_max": -5.949579238891602, "copy_logits_min": -687500032.0, "copy_num_tokens": 370.4375, "epoch": 0.7185090630584631, "gen_logits_max": 5.345137119293213, "gen_logits_mean": -13.770981788635254, "gen_logits_min": -25.353912353515625, "gen_logits_std": 2.682512044906616, "gen_loss": 0.30209046602249146, "grad_norm": 0.41908269896916245, "learning_rate": 2.6485894736842106e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9957545399665833, "mean_gen_accuracy": 0.8650348037481308, "mean_token_accuracy": 0.8989560306072235, "num_tokens": 953216736.0, "sample_num_tokens": 8031.0, "step": 3518, "total_num_tokens": 953248860.0, "z_loss": 0.0006360993720591068 }, { "copy_logits_max": -3.8947789669036865, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.375, "epoch": 0.71871330099566, "gen_logits_max": 4.36285400390625, "gen_logits_mean": -13.95827865600586, "gen_logits_min": -25.763811111450195, "gen_logits_std": 2.7121775150299072, "gen_loss": 0.29550307989120483, "grad_norm": 0.406707213141963, "learning_rate": 2.648463157894737e-05, "loss": 0.2903, "mean_copy_accuracy": 0.994803786277771, "mean_gen_accuracy": 0.8725414425134659, "mean_token_accuracy": 0.9026043117046356, "num_tokens": 953490406.0, "sample_num_tokens": 8588.0, "step": 3519, "total_num_tokens": 953524758.0, "z_loss": 0.0006301308167167008 }, { "copy_logits_max": -5.592258453369141, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.625, "epoch": 0.7189175389328568, "gen_logits_max": 5.514666557312012, "gen_logits_mean": -13.851432800292969, "gen_logits_min": -25.463747024536133, "gen_logits_std": 2.670067310333252, "gen_loss": 0.39033764600753784, "grad_norm": 0.4316126291759316, "learning_rate": 2.6483368421052635e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9956639558076859, "mean_gen_accuracy": 0.8628389686346054, "mean_token_accuracy": 0.899106815457344, "num_tokens": 953758791.0, "sample_num_tokens": 8029.25, "step": 3520, "total_num_tokens": 953790908.0, "z_loss": 0.0008055781945586205 }, { "copy_logits_max": -4.406829357147217, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.375, "epoch": 0.7191217768700536, "gen_logits_max": 5.090047836303711, "gen_logits_mean": -14.495376586914062, "gen_logits_min": -26.467079162597656, "gen_logits_std": 2.689864158630371, "gen_loss": 0.36178022623062134, "grad_norm": 0.4591041369052236, "learning_rate": 2.6482105263157896e-05, "loss": 0.3296, "mean_copy_accuracy": 0.9925274699926376, "mean_gen_accuracy": 0.8638667613267899, "mean_token_accuracy": 0.8913892507553101, "num_tokens": 953998442.0, "sample_num_tokens": 7426.5, "step": 3521, "total_num_tokens": 954028148.0, "z_loss": 0.0008137768600136042 }, { "copy_logits_max": -5.140257835388184, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.25, "epoch": 0.7193260148072504, "gen_logits_max": 5.086655616760254, "gen_logits_mean": -13.134891510009766, "gen_logits_min": -24.85555648803711, "gen_logits_std": 2.6747994422912598, "gen_loss": 0.30576828122138977, "grad_norm": 0.40664417191518515, "learning_rate": 2.648084210526316e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9950535297393799, "mean_gen_accuracy": 0.8598029166460037, "mean_token_accuracy": 0.8979941606521606, "num_tokens": 954290490.0, "sample_num_tokens": 8352.0, "step": 3522, "total_num_tokens": 954323898.0, "z_loss": 0.0006773625500500202 }, { "copy_logits_max": -3.761723041534424, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.3125, "epoch": 0.7195302527444473, "gen_logits_max": 4.758701324462891, "gen_logits_mean": -14.71968936920166, "gen_logits_min": -26.542705535888672, "gen_logits_std": 2.6817498207092285, "gen_loss": 0.3187894821166992, "grad_norm": 0.43310994750540693, "learning_rate": 2.647957894736842e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9954828172922134, "mean_gen_accuracy": 0.8613673448562622, "mean_token_accuracy": 0.8985244482755661, "num_tokens": 954592804.0, "sample_num_tokens": 8983.0, "step": 3523, "total_num_tokens": 954628736.0, "z_loss": 0.0007200618274509907 }, { "copy_logits_max": -6.987441062927246, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.375, "epoch": 0.7197344906816441, "gen_logits_max": 5.129122734069824, "gen_logits_mean": -15.548243522644043, "gen_logits_min": -26.849679946899414, "gen_logits_std": 2.645857334136963, "gen_loss": 0.3044591248035431, "grad_norm": 0.4305992993721458, "learning_rate": 2.6478315789473685e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9933268874883652, "mean_gen_accuracy": 0.8728544563055038, "mean_token_accuracy": 0.9007101505994797, "num_tokens": 954863536.0, "sample_num_tokens": 7252.5, "step": 3524, "total_num_tokens": 954892546.0, "z_loss": 0.0006716470816172659 }, { "copy_logits_max": -5.9235615730285645, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.6875, "epoch": 0.7199387286188409, "gen_logits_max": 4.538167476654053, "gen_logits_mean": -14.656013488769531, "gen_logits_min": -25.786937713623047, "gen_logits_std": 2.6302223205566406, "gen_loss": 0.3018799126148224, "grad_norm": 0.4180413886972331, "learning_rate": 2.6477052631578946e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9957716166973114, "mean_gen_accuracy": 0.8708586692810059, "mean_token_accuracy": 0.9020431786775589, "num_tokens": 955133753.0, "sample_num_tokens": 8564.75, "step": 3525, "total_num_tokens": 955168012.0, "z_loss": 0.0007144748233258724 }, { "copy_logits_max": -4.851987838745117, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.75, "epoch": 0.7201429665560378, "gen_logits_max": 4.876923561096191, "gen_logits_mean": -14.608980178833008, "gen_logits_min": -26.05223846435547, "gen_logits_std": 2.6785569190979004, "gen_loss": 0.3082258105278015, "grad_norm": 0.42563320141599087, "learning_rate": 2.647578947368421e-05, "loss": 0.3354, "mean_copy_accuracy": 0.9937938004732132, "mean_gen_accuracy": 0.8612082302570343, "mean_token_accuracy": 0.8894567042589188, "num_tokens": 955400861.0, "sample_num_tokens": 8623.75, "step": 3526, "total_num_tokens": 955435356.0, "z_loss": 0.0006835819222033024 }, { "copy_logits_max": -5.2451491355896, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.8125, "epoch": 0.7203472044932346, "gen_logits_max": 4.905720233917236, "gen_logits_mean": -14.55212116241455, "gen_logits_min": -25.936622619628906, "gen_logits_std": 2.6412196159362793, "gen_loss": 0.2926030457019806, "grad_norm": 0.390138628029276, "learning_rate": 2.6474526315789475e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9950813502073288, "mean_gen_accuracy": 0.8646518141031265, "mean_token_accuracy": 0.897559255361557, "num_tokens": 955673596.0, "sample_num_tokens": 9373.5, "step": 3527, "total_num_tokens": 955711090.0, "z_loss": 0.0006385138840414584 }, { "copy_logits_max": -2.561272621154785, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.0, "epoch": 0.7205514424304315, "gen_logits_max": 4.534855365753174, "gen_logits_mean": -14.197835922241211, "gen_logits_min": -26.30736541748047, "gen_logits_std": 2.708435297012329, "gen_loss": 0.290965735912323, "grad_norm": 0.42241111478713056, "learning_rate": 2.647326315789474e-05, "loss": 0.3154, "mean_copy_accuracy": 0.9950249940156937, "mean_gen_accuracy": 0.8610918819904327, "mean_token_accuracy": 0.8967165350914001, "num_tokens": 955974196.0, "sample_num_tokens": 8415.0, "step": 3528, "total_num_tokens": 956007856.0, "z_loss": 0.0006707198917865753 }, { "copy_logits_max": -5.210354328155518, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.75, "epoch": 0.7207556803676283, "gen_logits_max": 5.671415328979492, "gen_logits_mean": -13.451728820800781, "gen_logits_min": -24.67478370666504, "gen_logits_std": 2.670055389404297, "gen_loss": 0.28156769275665283, "grad_norm": 0.40011553975956476, "learning_rate": 2.6472e-05, "loss": 0.3015, "mean_copy_accuracy": 0.9962891489267349, "mean_gen_accuracy": 0.865935742855072, "mean_token_accuracy": 0.8993148505687714, "num_tokens": 956270448.0, "sample_num_tokens": 9246.0, "step": 3529, "total_num_tokens": 956307432.0, "z_loss": 0.0006006655748933554 }, { "copy_logits_max": -4.096435546875, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.375, "epoch": 0.7209599183048251, "gen_logits_max": 4.799007415771484, "gen_logits_mean": -13.972208023071289, "gen_logits_min": -25.389930725097656, "gen_logits_std": 2.6579995155334473, "gen_loss": 0.28597545623779297, "grad_norm": 0.368096710728892, "learning_rate": 2.6470736842105264e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9970461428165436, "mean_gen_accuracy": 0.8718100339174271, "mean_token_accuracy": 0.9059010744094849, "num_tokens": 956566296.0, "sample_num_tokens": 9740.0, "step": 3530, "total_num_tokens": 956605256.0, "z_loss": 0.0006695372285321355 }, { "copy_logits_max": -4.4609856605529785, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.75, "epoch": 0.7211641562420219, "gen_logits_max": 4.133578777313232, "gen_logits_mean": -15.306282043457031, "gen_logits_min": -27.145751953125, "gen_logits_std": 2.699491024017334, "gen_loss": 0.3316155672073364, "grad_norm": 0.45298449968175225, "learning_rate": 2.6469473684210525e-05, "loss": 0.3223, "mean_copy_accuracy": 0.9952287524938583, "mean_gen_accuracy": 0.8571629971265793, "mean_token_accuracy": 0.8943050354719162, "num_tokens": 956853205.0, "sample_num_tokens": 8593.25, "step": 3531, "total_num_tokens": 956887578.0, "z_loss": 0.0007566749700345099 }, { "copy_logits_max": -4.6204304695129395, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.5625, "epoch": 0.7213683941792188, "gen_logits_max": 5.866865158081055, "gen_logits_mean": -13.85014820098877, "gen_logits_min": -25.073375701904297, "gen_logits_std": 2.618091106414795, "gen_loss": 0.33920371532440186, "grad_norm": 0.4074094319251918, "learning_rate": 2.646821052631579e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9954174011945724, "mean_gen_accuracy": 0.8557573556900024, "mean_token_accuracy": 0.8886093348264694, "num_tokens": 957136403.0, "sample_num_tokens": 6890.25, "step": 3532, "total_num_tokens": 957163964.0, "z_loss": 0.0007688896730542183 }, { "copy_logits_max": -3.231297254562378, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.3125, "epoch": 0.7215726321164156, "gen_logits_max": 4.559884071350098, "gen_logits_mean": -13.81190299987793, "gen_logits_min": -25.3527889251709, "gen_logits_std": 2.6547622680664062, "gen_loss": 0.30198395252227783, "grad_norm": 0.41249518554354314, "learning_rate": 2.646694736842105e-05, "loss": 0.3121, "mean_copy_accuracy": 0.993772953748703, "mean_gen_accuracy": 0.8641737997531891, "mean_token_accuracy": 0.8961748778820038, "num_tokens": 957396605.0, "sample_num_tokens": 9602.75, "step": 3533, "total_num_tokens": 957435016.0, "z_loss": 0.000661686179228127 }, { "copy_logits_max": -0.42546170949935913, "copy_logits_min": -750000064.0, "copy_num_tokens": 580.0, "epoch": 0.7217768700536125, "gen_logits_max": 4.600554943084717, "gen_logits_mean": -14.45145034790039, "gen_logits_min": -26.554237365722656, "gen_logits_std": 2.6776440143585205, "gen_loss": 0.2981073558330536, "grad_norm": 0.4464968623001787, "learning_rate": 2.6465684210526315e-05, "loss": 0.3096, "mean_copy_accuracy": 0.9957038462162018, "mean_gen_accuracy": 0.8627988696098328, "mean_token_accuracy": 0.8946153819561005, "num_tokens": 957649566.0, "sample_num_tokens": 8788.0, "step": 3534, "total_num_tokens": 957684718.0, "z_loss": 0.0006469084764830768 }, { "copy_logits_max": -2.8588576316833496, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.25, "epoch": 0.7219811079908093, "gen_logits_max": 5.068107604980469, "gen_logits_mean": -14.06716537475586, "gen_logits_min": -25.921932220458984, "gen_logits_std": 2.6712968349456787, "gen_loss": 0.25475162267684937, "grad_norm": 0.3858041296961412, "learning_rate": 2.6464421052631582e-05, "loss": 0.3045, "mean_copy_accuracy": 0.996045246720314, "mean_gen_accuracy": 0.8647270351648331, "mean_token_accuracy": 0.899879589676857, "num_tokens": 957921138.0, "sample_num_tokens": 7274.0, "step": 3535, "total_num_tokens": 957950234.0, "z_loss": 0.0006328810704872012 }, { "copy_logits_max": -2.816624641418457, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.375, "epoch": 0.7221853459280061, "gen_logits_max": 5.049505710601807, "gen_logits_mean": -13.346793174743652, "gen_logits_min": -24.775724411010742, "gen_logits_std": 2.656930923461914, "gen_loss": 0.3168926239013672, "grad_norm": 0.5041676345141611, "learning_rate": 2.6463157894736843e-05, "loss": 0.3337, "mean_copy_accuracy": 0.9957747459411621, "mean_gen_accuracy": 0.8539795577526093, "mean_token_accuracy": 0.8898839950561523, "num_tokens": 958187261.0, "sample_num_tokens": 8428.25, "step": 3536, "total_num_tokens": 958220974.0, "z_loss": 0.0007773288525640965 }, { "copy_logits_max": -5.053494453430176, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.0625, "epoch": 0.7223895838652029, "gen_logits_max": 5.01021146774292, "gen_logits_mean": -13.098518371582031, "gen_logits_min": -24.52056121826172, "gen_logits_std": 2.640591859817505, "gen_loss": 0.3467997908592224, "grad_norm": 0.3948979868412641, "learning_rate": 2.6461894736842108e-05, "loss": 0.308, "mean_copy_accuracy": 0.9958061724901199, "mean_gen_accuracy": 0.8601032793521881, "mean_token_accuracy": 0.9002644717693329, "num_tokens": 958478010.0, "sample_num_tokens": 8016.5, "step": 3537, "total_num_tokens": 958510076.0, "z_loss": 0.0007067305850796402 }, { "copy_logits_max": -6.683572769165039, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.4375, "epoch": 0.7225938218023998, "gen_logits_max": 5.114012241363525, "gen_logits_mean": -14.432124137878418, "gen_logits_min": -25.94696044921875, "gen_logits_std": 2.6845648288726807, "gen_loss": 0.33085525035858154, "grad_norm": 0.41575133140192033, "learning_rate": 2.646063157894737e-05, "loss": 0.3235, "mean_copy_accuracy": 0.9946018159389496, "mean_gen_accuracy": 0.862166240811348, "mean_token_accuracy": 0.8926480114459991, "num_tokens": 958766635.0, "sample_num_tokens": 9525.25, "step": 3538, "total_num_tokens": 958804736.0, "z_loss": 0.0006461773300543427 }, { "copy_logits_max": -6.3283538818359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.75, "epoch": 0.7227980597395967, "gen_logits_max": 4.268795967102051, "gen_logits_mean": -15.866571426391602, "gen_logits_min": -26.901159286499023, "gen_logits_std": 2.6256942749023438, "gen_loss": 0.3431248664855957, "grad_norm": 0.5855305511045619, "learning_rate": 2.6459368421052633e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9940904527902603, "mean_gen_accuracy": 0.8615823239088058, "mean_token_accuracy": 0.8936312645673752, "num_tokens": 959045961.0, "sample_num_tokens": 7875.75, "step": 3539, "total_num_tokens": 959077464.0, "z_loss": 0.0005970105994492769 }, { "copy_logits_max": -3.305875778198242, "copy_logits_min": -687500032.0, "copy_num_tokens": 585.75, "epoch": 0.7230022976767935, "gen_logits_max": 3.4289426803588867, "gen_logits_mean": -15.718631744384766, "gen_logits_min": -26.99250030517578, "gen_logits_std": 2.6857752799987793, "gen_loss": 0.30805152654647827, "grad_norm": 0.4075953965072044, "learning_rate": 2.6458105263157894e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9958992898464203, "mean_gen_accuracy": 0.8651017248630524, "mean_token_accuracy": 0.9000267535448074, "num_tokens": 959319074.0, "sample_num_tokens": 9769.0, "step": 3540, "total_num_tokens": 959358150.0, "z_loss": 0.0005780620849691331 }, { "copy_logits_max": -2.7354705333709717, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.0625, "epoch": 0.7232065356139903, "gen_logits_max": 3.883753776550293, "gen_logits_mean": -15.119820594787598, "gen_logits_min": -26.778179168701172, "gen_logits_std": 2.6272058486938477, "gen_loss": 0.33590540289878845, "grad_norm": 0.4755763258234316, "learning_rate": 2.6456842105263158e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9955954998731613, "mean_gen_accuracy": 0.8666884899139404, "mean_token_accuracy": 0.8988727480173111, "num_tokens": 959596566.0, "sample_num_tokens": 6659.5, "step": 3541, "total_num_tokens": 959623204.0, "z_loss": 0.0006289193988777697 }, { "copy_logits_max": -4.401137351989746, "copy_logits_min": -625000064.0, "copy_num_tokens": 387.125, "epoch": 0.7234107735511871, "gen_logits_max": 4.806022644042969, "gen_logits_mean": -13.78221321105957, "gen_logits_min": -25.040925979614258, "gen_logits_std": 2.650519609451294, "gen_loss": 0.3045479655265808, "grad_norm": 0.40472840647135844, "learning_rate": 2.645557894736842e-05, "loss": 0.299, "mean_copy_accuracy": 0.9951188415288925, "mean_gen_accuracy": 0.8716484904289246, "mean_token_accuracy": 0.9007148891687393, "num_tokens": 959869395.0, "sample_num_tokens": 8517.25, "step": 3542, "total_num_tokens": 959903464.0, "z_loss": 0.0005899507086724043 }, { "copy_logits_max": -3.8578596115112305, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.0, "epoch": 0.7236150114883839, "gen_logits_max": 4.075889587402344, "gen_logits_mean": -14.211736679077148, "gen_logits_min": -25.391387939453125, "gen_logits_std": 2.6256983280181885, "gen_loss": 0.29901206493377686, "grad_norm": 0.4433466623801094, "learning_rate": 2.6454315789473687e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9945139288902283, "mean_gen_accuracy": 0.864877387881279, "mean_token_accuracy": 0.897332176566124, "num_tokens": 960144743.0, "sample_num_tokens": 8687.75, "step": 3543, "total_num_tokens": 960179494.0, "z_loss": 0.0006072382675483823 }, { "copy_logits_max": -5.592858791351318, "copy_logits_min": -687500096.0, "copy_num_tokens": 301.9375, "epoch": 0.7238192494255808, "gen_logits_max": 4.630314826965332, "gen_logits_mean": -14.906299591064453, "gen_logits_min": -26.187623977661133, "gen_logits_std": 2.601402759552002, "gen_loss": 0.3181615471839905, "grad_norm": 0.4085198449199114, "learning_rate": 2.6453052631578948e-05, "loss": 0.2977, "mean_copy_accuracy": 0.9956269264221191, "mean_gen_accuracy": 0.8685324490070343, "mean_token_accuracy": 0.9003519117832184, "num_tokens": 960422921.0, "sample_num_tokens": 7116.25, "step": 3544, "total_num_tokens": 960451386.0, "z_loss": 0.0006647843983955681 }, { "copy_logits_max": -5.250367641448975, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.8125, "epoch": 0.7240234873627777, "gen_logits_max": 4.4092912673950195, "gen_logits_mean": -14.779853820800781, "gen_logits_min": -26.17132568359375, "gen_logits_std": 2.6435091495513916, "gen_loss": 0.31118881702423096, "grad_norm": 0.42108143936230513, "learning_rate": 2.6451789473684212e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9946111589670181, "mean_gen_accuracy": 0.8635488301515579, "mean_token_accuracy": 0.8954726308584213, "num_tokens": 960697163.0, "sample_num_tokens": 8473.75, "step": 3545, "total_num_tokens": 960731058.0, "z_loss": 0.0007027137326076627 }, { "copy_logits_max": -2.316678285598755, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.0, "epoch": 0.7242277252999745, "gen_logits_max": 3.903791904449463, "gen_logits_mean": -13.08062744140625, "gen_logits_min": -24.77799415588379, "gen_logits_std": 2.6483962535858154, "gen_loss": 0.2618046998977661, "grad_norm": 0.41331408129168407, "learning_rate": 2.6450526315789473e-05, "loss": 0.32, "mean_copy_accuracy": 0.9960353374481201, "mean_gen_accuracy": 0.8571977764368057, "mean_token_accuracy": 0.8951205164194107, "num_tokens": 960970950.0, "sample_num_tokens": 7782.0, "step": 3546, "total_num_tokens": 961002078.0, "z_loss": 0.0006279434892348945 }, { "copy_logits_max": -5.448204040527344, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5, "epoch": 0.7244319632371713, "gen_logits_max": 5.2769856452941895, "gen_logits_mean": -13.395059585571289, "gen_logits_min": -24.764955520629883, "gen_logits_std": 2.6290018558502197, "gen_loss": 0.34617018699645996, "grad_norm": 0.490319803692533, "learning_rate": 2.6449263157894737e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9941657483577728, "mean_gen_accuracy": 0.8666499853134155, "mean_token_accuracy": 0.8973181545734406, "num_tokens": 961234777.0, "sample_num_tokens": 7831.75, "step": 3547, "total_num_tokens": 961266104.0, "z_loss": 0.0007335793343372643 }, { "copy_logits_max": -4.977783679962158, "copy_logits_min": -625000064.0, "copy_num_tokens": 457.5625, "epoch": 0.7246362011743681, "gen_logits_max": 4.886571884155273, "gen_logits_mean": -14.466346740722656, "gen_logits_min": -25.72412109375, "gen_logits_std": 2.600515365600586, "gen_loss": 0.3187037706375122, "grad_norm": 0.4026662596102988, "learning_rate": 2.6448e-05, "loss": 0.3014, "mean_copy_accuracy": 0.9953574240207672, "mean_gen_accuracy": 0.869057297706604, "mean_token_accuracy": 0.9008902460336685, "num_tokens": 961503171.0, "sample_num_tokens": 8657.25, "step": 3548, "total_num_tokens": 961537800.0, "z_loss": 0.0007650437764823437 }, { "copy_logits_max": -3.120211601257324, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.6875, "epoch": 0.7248404391115649, "gen_logits_max": 4.115359306335449, "gen_logits_mean": -14.099525451660156, "gen_logits_min": -26.000553131103516, "gen_logits_std": 2.6751112937927246, "gen_loss": 0.2755052447319031, "grad_norm": 0.3769868417510915, "learning_rate": 2.6446736842105262e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9964847564697266, "mean_gen_accuracy": 0.8681069910526276, "mean_token_accuracy": 0.9067179709672928, "num_tokens": 961791068.0, "sample_num_tokens": 8157.0, "step": 3549, "total_num_tokens": 961823696.0, "z_loss": 0.0008504037978127599 }, { "copy_logits_max": -5.722260475158691, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.9375, "epoch": 0.7250446770487619, "gen_logits_max": 5.438105583190918, "gen_logits_mean": -14.088417053222656, "gen_logits_min": -25.20037078857422, "gen_logits_std": 2.650813102722168, "gen_loss": 0.3513756990432739, "grad_norm": 0.3967397759704951, "learning_rate": 2.6445473684210527e-05, "loss": 0.3253, "mean_copy_accuracy": 0.9963171035051346, "mean_gen_accuracy": 0.859237551689148, "mean_token_accuracy": 0.8924422115087509, "num_tokens": 962072140.0, "sample_num_tokens": 7393.0, "step": 3550, "total_num_tokens": 962101712.0, "z_loss": 0.0009272912866435945 }, { "copy_logits_max": -6.251894950866699, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.125, "epoch": 0.7252489149859587, "gen_logits_max": 5.339747428894043, "gen_logits_mean": -11.836873054504395, "gen_logits_min": -23.00765037536621, "gen_logits_std": 2.5635204315185547, "gen_loss": 0.29027464985847473, "grad_norm": 0.4847883936444301, "learning_rate": 2.644421052631579e-05, "loss": 0.31, "mean_copy_accuracy": 0.9952033758163452, "mean_gen_accuracy": 0.8653374463319778, "mean_token_accuracy": 0.8955236822366714, "num_tokens": 962339535.0, "sample_num_tokens": 9583.25, "step": 3551, "total_num_tokens": 962377868.0, "z_loss": 0.0008824141696095467 }, { "copy_logits_max": -4.79755973815918, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.25, "epoch": 0.7254531529231555, "gen_logits_max": 5.567139148712158, "gen_logits_mean": -14.329947471618652, "gen_logits_min": -25.869144439697266, "gen_logits_std": 2.67360258102417, "gen_loss": 0.3303500711917877, "grad_norm": 0.43620736310127595, "learning_rate": 2.6442947368421055e-05, "loss": 0.318, "mean_copy_accuracy": 0.9946403652429581, "mean_gen_accuracy": 0.8643195331096649, "mean_token_accuracy": 0.8944924026727676, "num_tokens": 962593692.0, "sample_num_tokens": 7937.5, "step": 3552, "total_num_tokens": 962625442.0, "z_loss": 0.0008932255441322923 }, { "copy_logits_max": -5.922755718231201, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5625, "epoch": 0.7256573908603523, "gen_logits_max": 4.853643417358398, "gen_logits_mean": -14.362202644348145, "gen_logits_min": -25.519018173217773, "gen_logits_std": 2.6527822017669678, "gen_loss": 0.31985947489738464, "grad_norm": 0.41397938220419495, "learning_rate": 2.6441684210526316e-05, "loss": 0.3207, "mean_copy_accuracy": 0.994606226682663, "mean_gen_accuracy": 0.8617765605449677, "mean_token_accuracy": 0.8936237394809723, "num_tokens": 962854675.0, "sample_num_tokens": 8070.75, "step": 3553, "total_num_tokens": 962886958.0, "z_loss": 0.0007503186934627593 }, { "copy_logits_max": -6.227782726287842, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.8125, "epoch": 0.7258616287975491, "gen_logits_max": 4.6513261795043945, "gen_logits_mean": -14.70795726776123, "gen_logits_min": -25.99920654296875, "gen_logits_std": 2.6664350032806396, "gen_loss": 0.2704496681690216, "grad_norm": 0.36872785963913557, "learning_rate": 2.644042105263158e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9957167208194733, "mean_gen_accuracy": 0.8650121539831161, "mean_token_accuracy": 0.9015151262283325, "num_tokens": 963125637.0, "sample_num_tokens": 8494.25, "step": 3554, "total_num_tokens": 963159614.0, "z_loss": 0.0006683978717774153 }, { "copy_logits_max": -4.826403617858887, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.875, "epoch": 0.7260658667347459, "gen_logits_max": 4.190464496612549, "gen_logits_mean": -14.885334014892578, "gen_logits_min": -26.110671997070312, "gen_logits_std": 2.6323533058166504, "gen_loss": 0.2819868326187134, "grad_norm": 0.41627524759153606, "learning_rate": 2.643915789473684e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9945453405380249, "mean_gen_accuracy": 0.862990215420723, "mean_token_accuracy": 0.8963836431503296, "num_tokens": 963408111.0, "sample_num_tokens": 10543.25, "step": 3555, "total_num_tokens": 963450284.0, "z_loss": 0.0006696259370073676 }, { "copy_logits_max": -3.7650222778320312, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.125, "epoch": 0.7262701046719429, "gen_logits_max": 5.564258575439453, "gen_logits_mean": -13.803022384643555, "gen_logits_min": -25.083698272705078, "gen_logits_std": 2.6495659351348877, "gen_loss": 0.3755846619606018, "grad_norm": 0.42346120535080006, "learning_rate": 2.6437894736842106e-05, "loss": 0.3261, "mean_copy_accuracy": 0.9951592683792114, "mean_gen_accuracy": 0.8659761846065521, "mean_token_accuracy": 0.8919588923454285, "num_tokens": 963683244.0, "sample_num_tokens": 7560.5, "step": 3556, "total_num_tokens": 963713486.0, "z_loss": 0.0007799913873896003 }, { "copy_logits_max": -4.930206298828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.3125, "epoch": 0.7264743426091397, "gen_logits_max": 4.903191566467285, "gen_logits_mean": -13.884725570678711, "gen_logits_min": -25.46770477294922, "gen_logits_std": 2.6739964485168457, "gen_loss": 0.3135862648487091, "grad_norm": 0.43328709539044385, "learning_rate": 2.6436631578947367e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9947031587362289, "mean_gen_accuracy": 0.8696029037237167, "mean_token_accuracy": 0.8987954705953598, "num_tokens": 963944373.0, "sample_num_tokens": 7604.25, "step": 3557, "total_num_tokens": 963974790.0, "z_loss": 0.0006822478026151657 }, { "copy_logits_max": -5.263641834259033, "copy_logits_min": -687500032.0, "copy_num_tokens": 558.25, "epoch": 0.7266785805463365, "gen_logits_max": 3.9593217372894287, "gen_logits_mean": -14.748620986938477, "gen_logits_min": -25.859786987304688, "gen_logits_std": 2.642242908477783, "gen_loss": 0.27342700958251953, "grad_norm": 0.39989248189884746, "learning_rate": 2.643536842105263e-05, "loss": 0.2838, "mean_copy_accuracy": 0.99536994099617, "mean_gen_accuracy": 0.8730953186750412, "mean_token_accuracy": 0.9054318964481354, "num_tokens": 964216535.0, "sample_num_tokens": 9062.75, "step": 3558, "total_num_tokens": 964252786.0, "z_loss": 0.0005756831378675997 }, { "copy_logits_max": -5.23075008392334, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.5, "epoch": 0.7268828184835333, "gen_logits_max": 4.098339080810547, "gen_logits_mean": -14.48499584197998, "gen_logits_min": -25.865455627441406, "gen_logits_std": 2.660672664642334, "gen_loss": 0.26384037733078003, "grad_norm": 0.43202824540679496, "learning_rate": 2.6434105263157895e-05, "loss": 0.309, "mean_copy_accuracy": 0.9948323667049408, "mean_gen_accuracy": 0.8644604086875916, "mean_token_accuracy": 0.8988343775272369, "num_tokens": 964488041.0, "sample_num_tokens": 8799.75, "step": 3559, "total_num_tokens": 964523240.0, "z_loss": 0.0005007586441934109 }, { "copy_logits_max": -4.6915998458862305, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.6875, "epoch": 0.7270870564207301, "gen_logits_max": 4.73824405670166, "gen_logits_mean": -14.923215866088867, "gen_logits_min": -26.096057891845703, "gen_logits_std": 2.6471457481384277, "gen_loss": 0.3660385310649872, "grad_norm": 0.4491315228394383, "learning_rate": 2.643284210526316e-05, "loss": 0.3354, "mean_copy_accuracy": 0.995351254940033, "mean_gen_accuracy": 0.856739729642868, "mean_token_accuracy": 0.8893094956874847, "num_tokens": 964742668.0, "sample_num_tokens": 7937.5, "step": 3560, "total_num_tokens": 964774418.0, "z_loss": 0.0007377126603387296 }, { "copy_logits_max": -4.568894863128662, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.1875, "epoch": 0.7272912943579269, "gen_logits_max": 4.871748447418213, "gen_logits_mean": -15.189653396606445, "gen_logits_min": -26.224632263183594, "gen_logits_std": 2.6436376571655273, "gen_loss": 0.30458134412765503, "grad_norm": 0.4255245038370153, "learning_rate": 2.6431578947368424e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9958030134439468, "mean_gen_accuracy": 0.8595416992902756, "mean_token_accuracy": 0.8965427130460739, "num_tokens": 965020826.0, "sample_num_tokens": 6739.5, "step": 3561, "total_num_tokens": 965047784.0, "z_loss": 0.0006326708244159818 }, { "copy_logits_max": -2.826594352722168, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.625, "epoch": 0.7274955322951239, "gen_logits_max": 5.5575103759765625, "gen_logits_mean": -13.101633071899414, "gen_logits_min": -24.21799659729004, "gen_logits_std": 2.632417917251587, "gen_loss": 0.2970457673072815, "grad_norm": 0.48391222703475417, "learning_rate": 2.6430315789473685e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9925050139427185, "mean_gen_accuracy": 0.8620219528675079, "mean_token_accuracy": 0.8920390456914902, "num_tokens": 965272326.0, "sample_num_tokens": 9451.5, "step": 3562, "total_num_tokens": 965310132.0, "z_loss": 0.0006624551606364548 }, { "copy_logits_max": -4.155256271362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.3125, "epoch": 0.7276997702323207, "gen_logits_max": 6.025700569152832, "gen_logits_mean": -13.558847427368164, "gen_logits_min": -24.398998260498047, "gen_logits_std": 2.5338101387023926, "gen_loss": 0.3658476173877716, "grad_norm": 0.4197669495819524, "learning_rate": 2.642905263157895e-05, "loss": 0.3508, "mean_copy_accuracy": 0.9935757964849472, "mean_gen_accuracy": 0.8550853133201599, "mean_token_accuracy": 0.8841132074594498, "num_tokens": 965538302.0, "sample_num_tokens": 9221.0, "step": 3563, "total_num_tokens": 965575186.0, "z_loss": 0.0007672704523429275 }, { "copy_logits_max": -4.414559364318848, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.3125, "epoch": 0.7279040081695175, "gen_logits_max": 4.952969551086426, "gen_logits_mean": -15.031858444213867, "gen_logits_min": -26.44971466064453, "gen_logits_std": 2.6687071323394775, "gen_loss": 0.337510347366333, "grad_norm": 0.4673269436388068, "learning_rate": 2.642778947368421e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9944998174905777, "mean_gen_accuracy": 0.8565884083509445, "mean_token_accuracy": 0.8983481228351593, "num_tokens": 965839979.0, "sample_num_tokens": 7843.75, "step": 3564, "total_num_tokens": 965871354.0, "z_loss": 0.0006759492680430412 }, { "copy_logits_max": -2.8245034217834473, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.5625, "epoch": 0.7281082461067143, "gen_logits_max": 4.183501243591309, "gen_logits_mean": -14.352678298950195, "gen_logits_min": -25.949501037597656, "gen_logits_std": 2.6596970558166504, "gen_loss": 0.28116053342819214, "grad_norm": 0.4159751962430944, "learning_rate": 2.6426526315789475e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9941732287406921, "mean_gen_accuracy": 0.8687418550252914, "mean_token_accuracy": 0.8970643430948257, "num_tokens": 966105181.0, "sample_num_tokens": 7882.25, "step": 3565, "total_num_tokens": 966136710.0, "z_loss": 0.0007115120533853769 }, { "copy_logits_max": -4.925230979919434, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.25, "epoch": 0.7283124840439111, "gen_logits_max": 4.900879859924316, "gen_logits_mean": -15.524707794189453, "gen_logits_min": -26.426063537597656, "gen_logits_std": 2.5970606803894043, "gen_loss": 0.3318997919559479, "grad_norm": 0.39515067581044594, "learning_rate": 2.6425263157894735e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9956829696893692, "mean_gen_accuracy": 0.8695597350597382, "mean_token_accuracy": 0.9022658169269562, "num_tokens": 966381582.0, "sample_num_tokens": 7335.5, "step": 3566, "total_num_tokens": 966410924.0, "z_loss": 0.0007568000582978129 }, { "copy_logits_max": -2.2105846405029297, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.625, "epoch": 0.728516721981108, "gen_logits_max": 4.634044647216797, "gen_logits_mean": -13.475278854370117, "gen_logits_min": -25.11176300048828, "gen_logits_std": 2.644284725189209, "gen_loss": 0.3198622465133667, "grad_norm": 0.42334130378273344, "learning_rate": 2.6424000000000003e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9957418143749237, "mean_gen_accuracy": 0.8588613867759705, "mean_token_accuracy": 0.8929937183856964, "num_tokens": 966662036.0, "sample_num_tokens": 9079.0, "step": 3567, "total_num_tokens": 966698352.0, "z_loss": 0.000742862350307405 }, { "copy_logits_max": -2.573561191558838, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.9375, "epoch": 0.7287209599183049, "gen_logits_max": 3.8190789222717285, "gen_logits_mean": -15.217107772827148, "gen_logits_min": -26.84188461303711, "gen_logits_std": 2.6831424236297607, "gen_loss": 0.27478957176208496, "grad_norm": 0.4277145952708614, "learning_rate": 2.6422736842105264e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9952731281518936, "mean_gen_accuracy": 0.8628349453210831, "mean_token_accuracy": 0.8971185982227325, "num_tokens": 966922153.0, "sample_num_tokens": 8443.75, "step": 3568, "total_num_tokens": 966955928.0, "z_loss": 0.0006288377335295081 }, { "copy_logits_max": -2.3523831367492676, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.4375, "epoch": 0.7289251978555017, "gen_logits_max": 4.559053421020508, "gen_logits_mean": -14.25812816619873, "gen_logits_min": -25.54561424255371, "gen_logits_std": 2.6257052421569824, "gen_loss": 0.3209008574485779, "grad_norm": 0.47793894245628094, "learning_rate": 2.642147368421053e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9934802800416946, "mean_gen_accuracy": 0.8613500744104385, "mean_token_accuracy": 0.8902207463979721, "num_tokens": 967169778.0, "sample_num_tokens": 8284.5, "step": 3569, "total_num_tokens": 967202916.0, "z_loss": 0.0007089655846357346 }, { "copy_logits_max": -2.4008073806762695, "copy_logits_min": -687500032.0, "copy_num_tokens": 563.25, "epoch": 0.7291294357926985, "gen_logits_max": 4.223737716674805, "gen_logits_mean": -14.614501953125, "gen_logits_min": -26.36979103088379, "gen_logits_std": 2.6865172386169434, "gen_loss": 0.3080580234527588, "grad_norm": 0.43528790020421093, "learning_rate": 2.642021052631579e-05, "loss": 0.317, "mean_copy_accuracy": 0.9927686303853989, "mean_gen_accuracy": 0.8644997626543045, "mean_token_accuracy": 0.893589973449707, "num_tokens": 967441471.0, "sample_num_tokens": 9054.75, "step": 3570, "total_num_tokens": 967477690.0, "z_loss": 0.0006731686880812049 }, { "copy_logits_max": -2.262726306915283, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.5, "epoch": 0.7293336737298953, "gen_logits_max": 4.003139019012451, "gen_logits_mean": -14.872496604919434, "gen_logits_min": -26.48677635192871, "gen_logits_std": 2.660381555557251, "gen_loss": 0.2793780565261841, "grad_norm": 0.49388451751039425, "learning_rate": 2.6418947368421054e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9952423274517059, "mean_gen_accuracy": 0.8688949793577194, "mean_token_accuracy": 0.9040257632732391, "num_tokens": 967709478.0, "sample_num_tokens": 7058.0, "step": 3571, "total_num_tokens": 967737710.0, "z_loss": 0.0006529301754198968 }, { "copy_logits_max": -0.7131137251853943, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.625, "epoch": 0.7295379116670921, "gen_logits_max": 5.058233261108398, "gen_logits_mean": -12.320806503295898, "gen_logits_min": -23.858734130859375, "gen_logits_std": 2.6590325832366943, "gen_loss": 0.31434905529022217, "grad_norm": 0.46992459322918156, "learning_rate": 2.6417684210526315e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9963352680206299, "mean_gen_accuracy": 0.8636115938425064, "mean_token_accuracy": 0.902601569890976, "num_tokens": 967981024.0, "sample_num_tokens": 8060.5, "step": 3572, "total_num_tokens": 968013266.0, "z_loss": 0.0007849984103813767 }, { "copy_logits_max": -1.4155007600784302, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.5625, "epoch": 0.729742149604289, "gen_logits_max": 4.546001434326172, "gen_logits_mean": -14.762127876281738, "gen_logits_min": -26.017972946166992, "gen_logits_std": 2.6170284748077393, "gen_loss": 0.3069344758987427, "grad_norm": 0.46694284335479386, "learning_rate": 2.641642105263158e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9948433041572571, "mean_gen_accuracy": 0.8646566420793533, "mean_token_accuracy": 0.8959227651357651, "num_tokens": 968242700.0, "sample_num_tokens": 8062.0, "step": 3573, "total_num_tokens": 968274948.0, "z_loss": 0.0007185074500739574 }, { "copy_logits_max": -5.370193958282471, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.4375, "epoch": 0.7299463875414859, "gen_logits_max": 4.817629814147949, "gen_logits_mean": -14.969700813293457, "gen_logits_min": -25.91415786743164, "gen_logits_std": 2.584291696548462, "gen_loss": 0.3483562469482422, "grad_norm": 0.47673715328117183, "learning_rate": 2.6415157894736843e-05, "loss": 0.3359, "mean_copy_accuracy": 0.993074044585228, "mean_gen_accuracy": 0.86003178358078, "mean_token_accuracy": 0.8879861384630203, "num_tokens": 968501261.0, "sample_num_tokens": 8199.75, "step": 3574, "total_num_tokens": 968534060.0, "z_loss": 0.0007289015920832753 }, { "copy_logits_max": -6.3822126388549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.0625, "epoch": 0.7301506254786827, "gen_logits_max": 4.886390686035156, "gen_logits_mean": -14.939167022705078, "gen_logits_min": -25.81130599975586, "gen_logits_std": 2.603877067565918, "gen_loss": 0.334714412689209, "grad_norm": 0.49632020418927286, "learning_rate": 2.6413894736842104e-05, "loss": 0.3343, "mean_copy_accuracy": 0.9934042692184448, "mean_gen_accuracy": 0.8582383990287781, "mean_token_accuracy": 0.8886930048465729, "num_tokens": 968762263.0, "sample_num_tokens": 9619.75, "step": 3575, "total_num_tokens": 968800742.0, "z_loss": 0.0007014770526438951 }, { "copy_logits_max": -3.322617530822754, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.9375, "epoch": 0.7303548634158795, "gen_logits_max": 4.182947635650635, "gen_logits_mean": -14.079814910888672, "gen_logits_min": -25.332592010498047, "gen_logits_std": 2.624934673309326, "gen_loss": 0.2854522168636322, "grad_norm": 0.4050848282185621, "learning_rate": 2.6412631578947372e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9949632883071899, "mean_gen_accuracy": 0.868584543466568, "mean_token_accuracy": 0.9018940776586533, "num_tokens": 969050358.0, "sample_num_tokens": 9601.0, "step": 3576, "total_num_tokens": 969088762.0, "z_loss": 0.0006335540674626827 }, { "copy_logits_max": -1.3106192350387573, "copy_logits_min": -750000000.0, "copy_num_tokens": 677.25, "epoch": 0.7305591013530763, "gen_logits_max": 4.182764530181885, "gen_logits_mean": -13.429588317871094, "gen_logits_min": -24.65369415283203, "gen_logits_std": 2.5684404373168945, "gen_loss": 0.34280338883399963, "grad_norm": 0.4567408340477131, "learning_rate": 2.6411368421052633e-05, "loss": 0.3308, "mean_copy_accuracy": 0.9948443621397018, "mean_gen_accuracy": 0.8603669255971909, "mean_token_accuracy": 0.8893737345933914, "num_tokens": 969313576.0, "sample_num_tokens": 10398.0, "step": 3577, "total_num_tokens": 969355168.0, "z_loss": 0.0008069595787674189 }, { "copy_logits_max": -3.6419475078582764, "copy_logits_min": -687500032.0, "copy_num_tokens": 500.75, "epoch": 0.7307633392902732, "gen_logits_max": 4.406643867492676, "gen_logits_mean": -14.350503921508789, "gen_logits_min": -25.013891220092773, "gen_logits_std": 2.540269374847412, "gen_loss": 0.2954191565513611, "grad_norm": 0.4588568743424305, "learning_rate": 2.6410105263157897e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9949448555707932, "mean_gen_accuracy": 0.864809662103653, "mean_token_accuracy": 0.896210789680481, "num_tokens": 969591531.0, "sample_num_tokens": 9395.25, "step": 3578, "total_num_tokens": 969629112.0, "z_loss": 0.0006940665189176798 }, { "copy_logits_max": -3.519139051437378, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.8125, "epoch": 0.73096757722747, "gen_logits_max": 5.235665321350098, "gen_logits_mean": -14.864349365234375, "gen_logits_min": -26.080684661865234, "gen_logits_std": 2.638817548751831, "gen_loss": 0.33504271507263184, "grad_norm": 0.4353434715494308, "learning_rate": 2.6408842105263158e-05, "loss": 0.3107, "mean_copy_accuracy": 0.9934997856616974, "mean_gen_accuracy": 0.8628466129302979, "mean_token_accuracy": 0.895983874797821, "num_tokens": 969847154.0, "sample_num_tokens": 7135.0, "step": 3579, "total_num_tokens": 969875694.0, "z_loss": 0.0007589512970298529 }, { "copy_logits_max": -3.100673198699951, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.125, "epoch": 0.7311718151646668, "gen_logits_max": 4.399717330932617, "gen_logits_mean": -14.589455604553223, "gen_logits_min": -26.10956382751465, "gen_logits_std": 2.650883674621582, "gen_loss": 0.31969982385635376, "grad_norm": 0.45476979977656756, "learning_rate": 2.6407578947368422e-05, "loss": 0.3399, "mean_copy_accuracy": 0.9946228563785553, "mean_gen_accuracy": 0.8534034639596939, "mean_token_accuracy": 0.8889105021953583, "num_tokens": 970093966.0, "sample_num_tokens": 8926.0, "step": 3580, "total_num_tokens": 970129670.0, "z_loss": 0.000717991148121655 }, { "copy_logits_max": -3.8663241863250732, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.75, "epoch": 0.7313760531018637, "gen_logits_max": 4.483181953430176, "gen_logits_mean": -14.620229721069336, "gen_logits_min": -25.861923217773438, "gen_logits_std": 2.615811586380005, "gen_loss": 0.2882913053035736, "grad_norm": 0.37769061819449135, "learning_rate": 2.6406315789473683e-05, "loss": 0.289, "mean_copy_accuracy": 0.9945372641086578, "mean_gen_accuracy": 0.8672447353601456, "mean_token_accuracy": 0.9025228470563889, "num_tokens": 970395362.0, "sample_num_tokens": 9979.0, "step": 3581, "total_num_tokens": 970435278.0, "z_loss": 0.0006607947871088982 }, { "copy_logits_max": -6.045426845550537, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.9375, "epoch": 0.7315802910390605, "gen_logits_max": 4.419266700744629, "gen_logits_mean": -15.128374099731445, "gen_logits_min": -26.082298278808594, "gen_logits_std": 2.653787612915039, "gen_loss": 0.2868252992630005, "grad_norm": 0.4144168314287711, "learning_rate": 2.6405052631578947e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9945223927497864, "mean_gen_accuracy": 0.8648493885993958, "mean_token_accuracy": 0.8954626321792603, "num_tokens": 970677946.0, "sample_num_tokens": 7652.0, "step": 3582, "total_num_tokens": 970708554.0, "z_loss": 0.0006330400938168168 }, { "copy_logits_max": -5.3243021965026855, "copy_logits_min": -687500032.0, "copy_num_tokens": 292.6875, "epoch": 0.7317845289762573, "gen_logits_max": 4.769911766052246, "gen_logits_mean": -14.462465286254883, "gen_logits_min": -25.932140350341797, "gen_logits_std": 2.6775248050689697, "gen_loss": 0.32788321375846863, "grad_norm": 0.4704164510943727, "learning_rate": 2.640378947368421e-05, "loss": 0.3074, "mean_copy_accuracy": 0.99420565366745, "mean_gen_accuracy": 0.8660137057304382, "mean_token_accuracy": 0.8977125138044357, "num_tokens": 970938161.0, "sample_num_tokens": 6554.25, "step": 3583, "total_num_tokens": 970964378.0, "z_loss": 0.0007090225117281079 }, { "copy_logits_max": -4.307767868041992, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.8125, "epoch": 0.7319887669134542, "gen_logits_max": 4.112529754638672, "gen_logits_mean": -14.704497337341309, "gen_logits_min": -25.98093032836914, "gen_logits_std": 2.640103340148926, "gen_loss": 0.31040143966674805, "grad_norm": 0.4572665049317709, "learning_rate": 2.6402526315789476e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9918961524963379, "mean_gen_accuracy": 0.8656754046678543, "mean_token_accuracy": 0.8935083001852036, "num_tokens": 971199873.0, "sample_num_tokens": 9303.25, "step": 3584, "total_num_tokens": 971237086.0, "z_loss": 0.0007105547119863331 }, { "copy_logits_max": -6.212338447570801, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.75, "epoch": 0.732193004850651, "gen_logits_max": 4.479545593261719, "gen_logits_mean": -15.645546913146973, "gen_logits_min": -27.23770523071289, "gen_logits_std": 2.667386293411255, "gen_loss": 0.3100101053714752, "grad_norm": 0.4125794016493009, "learning_rate": 2.6401263157894737e-05, "loss": 0.319, "mean_copy_accuracy": 0.9948627054691315, "mean_gen_accuracy": 0.8641998469829559, "mean_token_accuracy": 0.8941592127084732, "num_tokens": 971460586.0, "sample_num_tokens": 8780.5, "step": 3585, "total_num_tokens": 971495708.0, "z_loss": 0.0006859390414319932 }, { "copy_logits_max": -8.158241271972656, "copy_logits_min": -750000128.0, "copy_num_tokens": 253.9375, "epoch": 0.7323972427878478, "gen_logits_max": 4.9693498611450195, "gen_logits_mean": -15.513057708740234, "gen_logits_min": -26.4140567779541, "gen_logits_std": 2.601189136505127, "gen_loss": 0.314181923866272, "grad_norm": 0.4179345743109877, "learning_rate": 2.64e-05, "loss": 0.3116, "mean_copy_accuracy": 0.9935789406299591, "mean_gen_accuracy": 0.865685448050499, "mean_token_accuracy": 0.8967934995889664, "num_tokens": 971739264.0, "sample_num_tokens": 6867.0, "step": 3586, "total_num_tokens": 971766732.0, "z_loss": 0.0006823100848123431 }, { "copy_logits_max": -6.015178680419922, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.25, "epoch": 0.7326014807250447, "gen_logits_max": 4.699766159057617, "gen_logits_mean": -14.64661979675293, "gen_logits_min": -26.02634048461914, "gen_logits_std": 2.6348142623901367, "gen_loss": 0.29906466603279114, "grad_norm": 0.4802188540123384, "learning_rate": 2.6398736842105262e-05, "loss": 0.341, "mean_copy_accuracy": 0.991719201207161, "mean_gen_accuracy": 0.8608347773551941, "mean_token_accuracy": 0.886828139424324, "num_tokens": 971980414.0, "sample_num_tokens": 9384.0, "step": 3587, "total_num_tokens": 972017950.0, "z_loss": 0.0006595837185159326 }, { "copy_logits_max": -6.1183342933654785, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.4375, "epoch": 0.7328057186622415, "gen_logits_max": 5.414313316345215, "gen_logits_mean": -13.958452224731445, "gen_logits_min": -24.964473724365234, "gen_logits_std": 2.628349781036377, "gen_loss": 0.34440284967422485, "grad_norm": 0.43996881359928297, "learning_rate": 2.6397473684210527e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9948936402797699, "mean_gen_accuracy": 0.8619727492332458, "mean_token_accuracy": 0.8951316177845001, "num_tokens": 972252844.0, "sample_num_tokens": 7879.5, "step": 3588, "total_num_tokens": 972284362.0, "z_loss": 0.0006845168536528945 }, { "copy_logits_max": -5.259603023529053, "copy_logits_min": -750000064.0, "copy_num_tokens": 461.5625, "epoch": 0.7330099565994384, "gen_logits_max": 4.392197132110596, "gen_logits_mean": -15.03346061706543, "gen_logits_min": -26.27309799194336, "gen_logits_std": 2.640829086303711, "gen_loss": 0.317959189414978, "grad_norm": 0.40264918900187635, "learning_rate": 2.639621052631579e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9948671460151672, "mean_gen_accuracy": 0.8727942854166031, "mean_token_accuracy": 0.9039463400840759, "num_tokens": 972548752.0, "sample_num_tokens": 8481.0, "step": 3589, "total_num_tokens": 972582676.0, "z_loss": 0.0006964938947930932 }, { "copy_logits_max": -3.900625705718994, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.5625, "epoch": 0.7332141945366352, "gen_logits_max": 5.55031681060791, "gen_logits_mean": -14.306697845458984, "gen_logits_min": -25.957408905029297, "gen_logits_std": 2.6503982543945312, "gen_loss": 0.37302303314208984, "grad_norm": 0.4383910660419792, "learning_rate": 2.6394947368421052e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9944771081209183, "mean_gen_accuracy": 0.8594493716955185, "mean_token_accuracy": 0.8905608803033829, "num_tokens": 972810996.0, "sample_num_tokens": 7785.0, "step": 3590, "total_num_tokens": 972842136.0, "z_loss": 0.0008439908269792795 }, { "copy_logits_max": -6.229744911193848, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.8125, "epoch": 0.733418432473832, "gen_logits_max": 3.7962265014648438, "gen_logits_mean": -16.717838287353516, "gen_logits_min": -27.820240020751953, "gen_logits_std": 2.6364991664886475, "gen_loss": 0.2817457914352417, "grad_norm": 0.40344253326491686, "learning_rate": 2.6393684210526316e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9954931735992432, "mean_gen_accuracy": 0.861867293715477, "mean_token_accuracy": 0.8932445496320724, "num_tokens": 973095885.0, "sample_num_tokens": 10273.25, "step": 3591, "total_num_tokens": 973136978.0, "z_loss": 0.0006187680410221219 }, { "copy_logits_max": -2.699357032775879, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.375, "epoch": 0.7336226704110288, "gen_logits_max": 4.942432403564453, "gen_logits_mean": -13.230613708496094, "gen_logits_min": -25.07233428955078, "gen_logits_std": 2.6638851165771484, "gen_loss": 0.2963487505912781, "grad_norm": 0.43421174754295466, "learning_rate": 2.639242105263158e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9958389848470688, "mean_gen_accuracy": 0.8677397817373276, "mean_token_accuracy": 0.9052110016345978, "num_tokens": 973381566.0, "sample_num_tokens": 9235.5, "step": 3592, "total_num_tokens": 973418508.0, "z_loss": 0.0006387021858245134 }, { "copy_logits_max": -5.5087480545043945, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.625, "epoch": 0.7338269083482257, "gen_logits_max": 5.451394081115723, "gen_logits_mean": -13.994528770446777, "gen_logits_min": -25.944059371948242, "gen_logits_std": 2.6733484268188477, "gen_loss": 0.32672277092933655, "grad_norm": 0.4407049262733405, "learning_rate": 2.6391157894736845e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9919639676809311, "mean_gen_accuracy": 0.8741903007030487, "mean_token_accuracy": 0.9001504629850388, "num_tokens": 973645271.0, "sample_num_tokens": 7889.25, "step": 3593, "total_num_tokens": 973676828.0, "z_loss": 0.0007319280994124711 }, { "copy_logits_max": -4.115743637084961, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.4375, "epoch": 0.7340311462854225, "gen_logits_max": 4.968374252319336, "gen_logits_mean": -13.603757858276367, "gen_logits_min": -25.127466201782227, "gen_logits_std": 2.6347153186798096, "gen_loss": 0.3231298327445984, "grad_norm": 0.4263194568098529, "learning_rate": 2.6389894736842106e-05, "loss": 0.3228, "mean_copy_accuracy": 0.9941708296537399, "mean_gen_accuracy": 0.8665269762277603, "mean_token_accuracy": 0.8918057829141617, "num_tokens": 973889750.0, "sample_num_tokens": 9702.5, "step": 3594, "total_num_tokens": 973928560.0, "z_loss": 0.0007685588207095861 }, { "copy_logits_max": -2.46417236328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 0.7342353842226194, "gen_logits_max": 5.441466331481934, "gen_logits_mean": -13.371628761291504, "gen_logits_min": -24.79869842529297, "gen_logits_std": 2.611630916595459, "gen_loss": 0.3556296229362488, "grad_norm": 0.46576607631892547, "learning_rate": 2.638863157894737e-05, "loss": 0.3295, "mean_copy_accuracy": 0.9937222003936768, "mean_gen_accuracy": 0.8616902828216553, "mean_token_accuracy": 0.8913859575986862, "num_tokens": 974152324.0, "sample_num_tokens": 7934.0, "step": 3595, "total_num_tokens": 974184060.0, "z_loss": 0.0008144477033056319 }, { "copy_logits_max": -4.887077808380127, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.875, "epoch": 0.7344396221598162, "gen_logits_max": 5.359322547912598, "gen_logits_mean": -12.356321334838867, "gen_logits_min": -23.405921936035156, "gen_logits_std": 2.5979666709899902, "gen_loss": 0.2949822247028351, "grad_norm": 0.3829176103383967, "learning_rate": 2.638736842105263e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9944038093090057, "mean_gen_accuracy": 0.8618120551109314, "mean_token_accuracy": 0.8986640125513077, "num_tokens": 974431155.0, "sample_num_tokens": 8002.25, "step": 3596, "total_num_tokens": 974463164.0, "z_loss": 0.000705016078427434 }, { "copy_logits_max": -3.688119888305664, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.9375, "epoch": 0.734643860097013, "gen_logits_max": 5.733270645141602, "gen_logits_mean": -12.533203125, "gen_logits_min": -24.09395408630371, "gen_logits_std": 2.649512767791748, "gen_loss": 0.3422228693962097, "grad_norm": 0.39716808435991524, "learning_rate": 2.6386105263157895e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9942571669816971, "mean_gen_accuracy": 0.8691733926534653, "mean_token_accuracy": 0.8969509899616241, "num_tokens": 974710123.0, "sample_num_tokens": 8160.25, "step": 3597, "total_num_tokens": 974742764.0, "z_loss": 0.000753417203668505 }, { "copy_logits_max": -7.022656440734863, "copy_logits_min": -750000064.0, "copy_num_tokens": 569.4375, "epoch": 0.7348480980342098, "gen_logits_max": 4.711929798126221, "gen_logits_mean": -12.671024322509766, "gen_logits_min": -23.818004608154297, "gen_logits_std": 2.5758187770843506, "gen_loss": 0.2590979337692261, "grad_norm": 0.39280122115282057, "learning_rate": 2.6384842105263156e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9951056391000748, "mean_gen_accuracy": 0.8690135926008224, "mean_token_accuracy": 0.8999917805194855, "num_tokens": 974998580.0, "sample_num_tokens": 9486.5, "step": 3598, "total_num_tokens": 975036526.0, "z_loss": 0.0005684708012267947 }, { "copy_logits_max": -3.4987540245056152, "copy_logits_min": -750000064.0, "copy_num_tokens": 544.9375, "epoch": 0.7350523359714067, "gen_logits_max": 5.220708847045898, "gen_logits_mean": -13.600847244262695, "gen_logits_min": -25.11560821533203, "gen_logits_std": 2.6556596755981445, "gen_loss": 0.2922325134277344, "grad_norm": 0.42528487037774754, "learning_rate": 2.638357894736842e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9960650354623795, "mean_gen_accuracy": 0.8628902286291122, "mean_token_accuracy": 0.8966746032238007, "num_tokens": 975283214.0, "sample_num_tokens": 9670.0, "step": 3599, "total_num_tokens": 975321894.0, "z_loss": 0.0005957122193649411 }, { "copy_logits_max": -2.6691336631774902, "copy_logits_min": -687500032.0, "copy_num_tokens": 875.1875, "epoch": 0.7352565739086036, "gen_logits_max": 4.29941463470459, "gen_logits_mean": -13.747591018676758, "gen_logits_min": -25.739892959594727, "gen_logits_std": 2.7187747955322266, "gen_loss": 0.25737494230270386, "grad_norm": 0.4202558251897293, "learning_rate": 2.6382315789473685e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9937494993209839, "mean_gen_accuracy": 0.8663569688796997, "mean_token_accuracy": 0.8978700637817383, "num_tokens": 975550592.0, "sample_num_tokens": 10920.5, "step": 3600, "total_num_tokens": 975594274.0, "z_loss": 0.0006165258819237351 }, { "copy_logits_max": -4.720270156860352, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.8125, "epoch": 0.7354608118458004, "gen_logits_max": 4.861839771270752, "gen_logits_mean": -13.725500106811523, "gen_logits_min": -25.278234481811523, "gen_logits_std": 2.693953037261963, "gen_loss": 0.30980807542800903, "grad_norm": 0.4005268389397369, "learning_rate": 2.638105263157895e-05, "loss": 0.29, "mean_copy_accuracy": 0.9958133846521378, "mean_gen_accuracy": 0.8739177286624908, "mean_token_accuracy": 0.9030966758728027, "num_tokens": 975850408.0, "sample_num_tokens": 10207.5, "step": 3601, "total_num_tokens": 975891238.0, "z_loss": 0.0006391325732693076 }, { "copy_logits_max": -5.122414588928223, "copy_logits_min": -687500032.0, "copy_num_tokens": 563.8125, "epoch": 0.7356650497829972, "gen_logits_max": 4.77893590927124, "gen_logits_mean": -13.713423728942871, "gen_logits_min": -25.36774253845215, "gen_logits_std": 2.6972901821136475, "gen_loss": 0.3036381006240845, "grad_norm": 0.4914155674916544, "learning_rate": 2.6379789473684213e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9941276162862778, "mean_gen_accuracy": 0.8562890738248825, "mean_token_accuracy": 0.8905106335878372, "num_tokens": 976131145.0, "sample_num_tokens": 9029.25, "step": 3602, "total_num_tokens": 976167262.0, "z_loss": 0.0005676813889294863 }, { "copy_logits_max": -3.57637619972229, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.5, "epoch": 0.735869287720194, "gen_logits_max": 4.024989128112793, "gen_logits_mean": -14.432598114013672, "gen_logits_min": -25.931644439697266, "gen_logits_std": 2.647350788116455, "gen_loss": 0.26894742250442505, "grad_norm": 0.38853989436653236, "learning_rate": 2.6378526315789474e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9960163682699203, "mean_gen_accuracy": 0.866034597158432, "mean_token_accuracy": 0.9005640000104904, "num_tokens": 976421190.0, "sample_num_tokens": 9459.5, "step": 3603, "total_num_tokens": 976459028.0, "z_loss": 0.0005626332713291049 }, { "copy_logits_max": -5.24744176864624, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.0, "epoch": 0.7360735256573908, "gen_logits_max": 4.682546138763428, "gen_logits_mean": -15.002262115478516, "gen_logits_min": -26.013221740722656, "gen_logits_std": 2.65358304977417, "gen_loss": 0.27845969796180725, "grad_norm": 0.48273601799617216, "learning_rate": 2.637726315789474e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9954033493995667, "mean_gen_accuracy": 0.8734299391508102, "mean_token_accuracy": 0.9024714976549149, "num_tokens": 976681108.0, "sample_num_tokens": 8426.5, "step": 3604, "total_num_tokens": 976714814.0, "z_loss": 0.0006541743641719222 }, { "copy_logits_max": -3.976346492767334, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.875, "epoch": 0.7362777635945877, "gen_logits_max": 4.401632308959961, "gen_logits_mean": -14.708428382873535, "gen_logits_min": -26.79140853881836, "gen_logits_std": 2.7170701026916504, "gen_loss": 0.3111988604068756, "grad_norm": 0.42827834515653207, "learning_rate": 2.6376e-05, "loss": 0.3079, "mean_copy_accuracy": 0.995143473148346, "mean_gen_accuracy": 0.8616849482059479, "mean_token_accuracy": 0.8974030464887619, "num_tokens": 976970169.0, "sample_num_tokens": 9277.25, "step": 3605, "total_num_tokens": 977007278.0, "z_loss": 0.0006598902982659638 }, { "copy_logits_max": -5.239418029785156, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.6875, "epoch": 0.7364820015317846, "gen_logits_max": 4.92427921295166, "gen_logits_mean": -13.83646011352539, "gen_logits_min": -25.38623809814453, "gen_logits_std": 2.650596857070923, "gen_loss": 0.2798610329627991, "grad_norm": 0.4547537088732803, "learning_rate": 2.6374736842105264e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9947940707206726, "mean_gen_accuracy": 0.8701060563325882, "mean_token_accuracy": 0.9021308869123459, "num_tokens": 977251778.0, "sample_num_tokens": 9334.0, "step": 3606, "total_num_tokens": 977289114.0, "z_loss": 0.0005598800489678979 }, { "copy_logits_max": -4.394381046295166, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.75, "epoch": 0.7366862394689814, "gen_logits_max": 5.2897562980651855, "gen_logits_mean": -13.742115020751953, "gen_logits_min": -25.336273193359375, "gen_logits_std": 2.701542377471924, "gen_loss": 0.2892669439315796, "grad_norm": 0.4593059487884186, "learning_rate": 2.6373473684210525e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9931834489107132, "mean_gen_accuracy": 0.8726637810468674, "mean_token_accuracy": 0.9046529084444046, "num_tokens": 977517835.0, "sample_num_tokens": 8611.75, "step": 3607, "total_num_tokens": 977552282.0, "z_loss": 0.0005694697611033916 }, { "copy_logits_max": -3.11606502532959, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.25, "epoch": 0.7368904774061782, "gen_logits_max": 5.182861328125, "gen_logits_mean": -13.016845703125, "gen_logits_min": -25.005558013916016, "gen_logits_std": 2.727214813232422, "gen_loss": 0.28838253021240234, "grad_norm": 0.43971627219086146, "learning_rate": 2.6372210526315792e-05, "loss": 0.2945, "mean_copy_accuracy": 0.995430514216423, "mean_gen_accuracy": 0.8703526109457016, "mean_token_accuracy": 0.9019954055547714, "num_tokens": 977784582.0, "sample_num_tokens": 6642.0, "step": 3608, "total_num_tokens": 977811150.0, "z_loss": 0.0006384182488545775 }, { "copy_logits_max": -1.3315539360046387, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.125, "epoch": 0.737094715343375, "gen_logits_max": 5.173984527587891, "gen_logits_mean": -13.614215850830078, "gen_logits_min": -25.290014266967773, "gen_logits_std": 2.675243377685547, "gen_loss": 0.3174774646759033, "grad_norm": 0.43783674512911097, "learning_rate": 2.6370947368421053e-05, "loss": 0.308, "mean_copy_accuracy": 0.9943557381629944, "mean_gen_accuracy": 0.8634162843227386, "mean_token_accuracy": 0.8964714109897614, "num_tokens": 978025935.0, "sample_num_tokens": 8948.75, "step": 3609, "total_num_tokens": 978061730.0, "z_loss": 0.0007085995748639107 }, { "copy_logits_max": -2.3200550079345703, "copy_logits_min": -687500032.0, "copy_num_tokens": 708.5, "epoch": 0.7372989532805718, "gen_logits_max": 4.463010787963867, "gen_logits_mean": -14.039789199829102, "gen_logits_min": -25.723430633544922, "gen_logits_std": 2.698575973510742, "gen_loss": 0.27700746059417725, "grad_norm": 0.41885099909995405, "learning_rate": 2.6369684210526318e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9955088198184967, "mean_gen_accuracy": 0.8741203993558884, "mean_token_accuracy": 0.9074069410562515, "num_tokens": 978298962.0, "sample_num_tokens": 9935.5, "step": 3610, "total_num_tokens": 978338704.0, "z_loss": 0.0006412985967472196 }, { "copy_logits_max": -1.5229604244232178, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.375, "epoch": 0.7375031912177687, "gen_logits_max": 4.679963111877441, "gen_logits_mean": -12.957756996154785, "gen_logits_min": -24.523380279541016, "gen_logits_std": 2.6110968589782715, "gen_loss": 0.3186878561973572, "grad_norm": 0.48461103057510746, "learning_rate": 2.636842105263158e-05, "loss": 0.3083, "mean_copy_accuracy": 0.992949828505516, "mean_gen_accuracy": 0.8689199835062027, "mean_token_accuracy": 0.8971685022115707, "num_tokens": 978561318.0, "sample_num_tokens": 9197.0, "step": 3611, "total_num_tokens": 978598106.0, "z_loss": 0.0006760827964171767 }, { "copy_logits_max": -3.4551193714141846, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.6875, "epoch": 0.7377074291549656, "gen_logits_max": 5.377079963684082, "gen_logits_mean": -14.202366828918457, "gen_logits_min": -25.52357292175293, "gen_logits_std": 2.6519975662231445, "gen_loss": 0.30700570344924927, "grad_norm": 0.3891860901116848, "learning_rate": 2.6367157894736843e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9942848682403564, "mean_gen_accuracy": 0.8726020902395248, "mean_token_accuracy": 0.9014828205108643, "num_tokens": 978852911.0, "sample_num_tokens": 7291.75, "step": 3612, "total_num_tokens": 978882078.0, "z_loss": 0.0007125661359168589 }, { "copy_logits_max": -0.9801964163780212, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.6875, "epoch": 0.7379116670921624, "gen_logits_max": 5.422762393951416, "gen_logits_mean": -13.479869842529297, "gen_logits_min": -25.03893280029297, "gen_logits_std": 2.6931684017181396, "gen_loss": 0.3083713948726654, "grad_norm": 0.4585304079866684, "learning_rate": 2.6365894736842104e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9949846267700195, "mean_gen_accuracy": 0.8701167404651642, "mean_token_accuracy": 0.9002718180418015, "num_tokens": 979125234.0, "sample_num_tokens": 6533.0, "step": 3613, "total_num_tokens": 979151366.0, "z_loss": 0.000848385039716959 }, { "copy_logits_max": 1.3255704641342163, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.125, "epoch": 0.7381159050293592, "gen_logits_max": 5.978001594543457, "gen_logits_mean": -12.575859069824219, "gen_logits_min": -24.0200138092041, "gen_logits_std": 2.7086386680603027, "gen_loss": 0.3012620806694031, "grad_norm": 0.408605508412523, "learning_rate": 2.6364631578947368e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9951894730329514, "mean_gen_accuracy": 0.8720495402812958, "mean_token_accuracy": 0.9024677872657776, "num_tokens": 979400070.0, "sample_num_tokens": 8875.0, "step": 3614, "total_num_tokens": 979435570.0, "z_loss": 0.0007747460622340441 }, { "copy_logits_max": -3.3480918407440186, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.5625, "epoch": 0.738320142966556, "gen_logits_max": 6.326743125915527, "gen_logits_mean": -13.795259475708008, "gen_logits_min": -25.020605087280273, "gen_logits_std": 2.6587600708007812, "gen_loss": 0.3608013391494751, "grad_norm": 0.4630467480277954, "learning_rate": 2.6363368421052632e-05, "loss": 0.3294, "mean_copy_accuracy": 0.9940907657146454, "mean_gen_accuracy": 0.8657567054033279, "mean_token_accuracy": 0.8947541564702988, "num_tokens": 979660473.0, "sample_num_tokens": 6837.75, "step": 3615, "total_num_tokens": 979687824.0, "z_loss": 0.0008467377629131079 }, { "copy_logits_max": -3.291780948638916, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.375, "epoch": 0.7385243809037528, "gen_logits_max": 4.431445121765137, "gen_logits_mean": -14.536555290222168, "gen_logits_min": -26.42142105102539, "gen_logits_std": 2.7180967330932617, "gen_loss": 0.2745078206062317, "grad_norm": 0.443573025954772, "learning_rate": 2.6362105263157897e-05, "loss": 0.321, "mean_copy_accuracy": 0.9935040920972824, "mean_gen_accuracy": 0.8610819280147552, "mean_token_accuracy": 0.8934333026409149, "num_tokens": 979942352.0, "sample_num_tokens": 9407.0, "step": 3616, "total_num_tokens": 979979980.0, "z_loss": 0.0006588506512343884 }, { "copy_logits_max": -4.612067699432373, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.625, "epoch": 0.7387286188409498, "gen_logits_max": 4.4807538986206055, "gen_logits_mean": -13.070767402648926, "gen_logits_min": -24.619848251342773, "gen_logits_std": 2.6543426513671875, "gen_loss": 0.25233572721481323, "grad_norm": 0.42187978471300425, "learning_rate": 2.636084210526316e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9959966987371445, "mean_gen_accuracy": 0.8605867773294449, "mean_token_accuracy": 0.8952974230051041, "num_tokens": 980205353.0, "sample_num_tokens": 8564.75, "step": 3617, "total_num_tokens": 980239612.0, "z_loss": 0.0006716757779940963 }, { "copy_logits_max": -3.8493504524230957, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.875, "epoch": 0.7389328567781466, "gen_logits_max": 6.123787879943848, "gen_logits_mean": -13.841602325439453, "gen_logits_min": -25.571439743041992, "gen_logits_std": 2.687777042388916, "gen_loss": 0.3386325240135193, "grad_norm": 0.3721557533238761, "learning_rate": 2.6359578947368422e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9960191547870636, "mean_gen_accuracy": 0.869329109787941, "mean_token_accuracy": 0.8980102986097336, "num_tokens": 980469562.0, "sample_num_tokens": 7995.0, "step": 3618, "total_num_tokens": 980501542.0, "z_loss": 0.0007932792650535703 }, { "copy_logits_max": 1.713130235671997, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.25, "epoch": 0.7391370947153434, "gen_logits_max": 5.9539923667907715, "gen_logits_mean": -12.78538703918457, "gen_logits_min": -24.847858428955078, "gen_logits_std": 2.6819636821746826, "gen_loss": 0.30451780557632446, "grad_norm": 0.39359178944833667, "learning_rate": 2.6358315789473686e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9957896173000336, "mean_gen_accuracy": 0.8729659616947174, "mean_token_accuracy": 0.9069595038890839, "num_tokens": 980751970.0, "sample_num_tokens": 9529.5, "step": 3619, "total_num_tokens": 980790088.0, "z_loss": 0.0007053372100926936 }, { "copy_logits_max": -1.9553396701812744, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.0625, "epoch": 0.7393413326525402, "gen_logits_max": 4.335477352142334, "gen_logits_mean": -14.448530197143555, "gen_logits_min": -26.273170471191406, "gen_logits_std": 2.725632905960083, "gen_loss": 0.2972845435142517, "grad_norm": 0.41941408030936095, "learning_rate": 2.6357052631578947e-05, "loss": 0.3357, "mean_copy_accuracy": 0.9934184700250626, "mean_gen_accuracy": 0.8593588173389435, "mean_token_accuracy": 0.8903151154518127, "num_tokens": 981024669.0, "sample_num_tokens": 8229.75, "step": 3620, "total_num_tokens": 981057588.0, "z_loss": 0.000715858768671751 }, { "copy_logits_max": -2.5412180423736572, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.5, "epoch": 0.739545570589737, "gen_logits_max": 5.963693618774414, "gen_logits_mean": -12.92479133605957, "gen_logits_min": -24.0504093170166, "gen_logits_std": 2.6784582138061523, "gen_loss": 0.316987544298172, "grad_norm": 0.41622124379473224, "learning_rate": 2.635578947368421e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9946605861186981, "mean_gen_accuracy": 0.8656821846961975, "mean_token_accuracy": 0.8934749811887741, "num_tokens": 981280906.0, "sample_num_tokens": 9082.5, "step": 3621, "total_num_tokens": 981317236.0, "z_loss": 0.0006740755634382367 }, { "copy_logits_max": -0.6009276509284973, "copy_logits_min": -687500032.0, "copy_num_tokens": 683.6875, "epoch": 0.7397498085269338, "gen_logits_max": 4.883029937744141, "gen_logits_mean": -13.102973937988281, "gen_logits_min": -25.22771453857422, "gen_logits_std": 2.7390716075897217, "gen_loss": 0.2904018759727478, "grad_norm": 0.38606890982233905, "learning_rate": 2.6354526315789473e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9960873872041702, "mean_gen_accuracy": 0.8682171255350113, "mean_token_accuracy": 0.9005701690912247, "num_tokens": 981567386.0, "sample_num_tokens": 9830.0, "step": 3622, "total_num_tokens": 981606706.0, "z_loss": 0.0007139946101233363 }, { "copy_logits_max": -0.7544130086898804, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.875, "epoch": 0.7399540464641308, "gen_logits_max": 4.714422702789307, "gen_logits_mean": -14.502314567565918, "gen_logits_min": -26.494403839111328, "gen_logits_std": 2.741044282913208, "gen_loss": 0.31131118535995483, "grad_norm": 0.40279448019294234, "learning_rate": 2.6353263157894737e-05, "loss": 0.3076, "mean_copy_accuracy": 0.995433434844017, "mean_gen_accuracy": 0.8699000924825668, "mean_token_accuracy": 0.8985822945833206, "num_tokens": 981844224.0, "sample_num_tokens": 8363.0, "step": 3623, "total_num_tokens": 981877676.0, "z_loss": 0.0007225588196888566 }, { "copy_logits_max": -2.0108728408813477, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.625, "epoch": 0.7401582844013276, "gen_logits_max": 5.762872695922852, "gen_logits_mean": -12.710396766662598, "gen_logits_min": -23.895179748535156, "gen_logits_std": 2.671988010406494, "gen_loss": 0.3150097727775574, "grad_norm": 0.4207708217798588, "learning_rate": 2.6351999999999998e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9938492029905319, "mean_gen_accuracy": 0.871715784072876, "mean_token_accuracy": 0.9016262143850327, "num_tokens": 982122614.0, "sample_num_tokens": 6946.0, "step": 3624, "total_num_tokens": 982150398.0, "z_loss": 0.0006335775833576918 }, { "copy_logits_max": -4.6309814453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.1875, "epoch": 0.7403625223385244, "gen_logits_max": 5.420750617980957, "gen_logits_mean": -13.888440132141113, "gen_logits_min": -25.705120086669922, "gen_logits_std": 2.7179856300354004, "gen_loss": 0.32249414920806885, "grad_norm": 0.44421148205761196, "learning_rate": 2.6350736842105265e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9941908270120621, "mean_gen_accuracy": 0.8648647218942642, "mean_token_accuracy": 0.8981329947710037, "num_tokens": 982380406.0, "sample_num_tokens": 8123.5, "step": 3625, "total_num_tokens": 982412900.0, "z_loss": 0.0006467972416430712 }, { "copy_logits_max": -4.362940788269043, "copy_logits_min": -750000000.0, "copy_num_tokens": 240.9375, "epoch": 0.7405667602757212, "gen_logits_max": 5.396737098693848, "gen_logits_mean": -14.270833015441895, "gen_logits_min": -25.518680572509766, "gen_logits_std": 2.6699113845825195, "gen_loss": 0.3193703591823578, "grad_norm": 0.39035495718084817, "learning_rate": 2.6349473684210526e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9950180053710938, "mean_gen_accuracy": 0.8698994666337967, "mean_token_accuracy": 0.8978354930877686, "num_tokens": 982642904.0, "sample_num_tokens": 6522.0, "step": 3626, "total_num_tokens": 982668992.0, "z_loss": 0.0006480654701590538 }, { "copy_logits_max": -2.0758495330810547, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.8125, "epoch": 0.740770998212918, "gen_logits_max": 5.257019519805908, "gen_logits_mean": -13.088163375854492, "gen_logits_min": -24.77958869934082, "gen_logits_std": 2.64095401763916, "gen_loss": 0.3295162320137024, "grad_norm": 0.4469708787276, "learning_rate": 2.634821052631579e-05, "loss": 0.3271, "mean_copy_accuracy": 0.9956596046686172, "mean_gen_accuracy": 0.8579068183898926, "mean_token_accuracy": 0.8935984820127487, "num_tokens": 982912109.0, "sample_num_tokens": 9639.25, "step": 3627, "total_num_tokens": 982950666.0, "z_loss": 0.0007388937519863248 }, { "copy_logits_max": -0.4552854299545288, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.125, "epoch": 0.7409752361501148, "gen_logits_max": 3.744947910308838, "gen_logits_mean": -15.789670944213867, "gen_logits_min": -27.934471130371094, "gen_logits_std": 2.7219133377075195, "gen_loss": 0.27748286724090576, "grad_norm": 0.40402205961076526, "learning_rate": 2.6346947368421055e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9949462115764618, "mean_gen_accuracy": 0.8711745589971542, "mean_token_accuracy": 0.9026355594396591, "num_tokens": 983192114.0, "sample_num_tokens": 10006.0, "step": 3628, "total_num_tokens": 983232138.0, "z_loss": 0.0006222804076969624 }, { "copy_logits_max": -5.652496337890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.0625, "epoch": 0.7411794740873117, "gen_logits_max": 5.521878242492676, "gen_logits_mean": -13.958710670471191, "gen_logits_min": -24.770551681518555, "gen_logits_std": 2.561680316925049, "gen_loss": 0.3242824971675873, "grad_norm": 0.41897112556560295, "learning_rate": 2.6345684210526316e-05, "loss": 0.2877, "mean_copy_accuracy": 0.994912177324295, "mean_gen_accuracy": 0.877074584364891, "mean_token_accuracy": 0.9058976769447327, "num_tokens": 983468592.0, "sample_num_tokens": 8585.5, "step": 3629, "total_num_tokens": 983502934.0, "z_loss": 0.0006355191580951214 }, { "copy_logits_max": -0.8933901786804199, "copy_logits_min": -625000000.0, "copy_num_tokens": 578.75, "epoch": 0.7413837120245086, "gen_logits_max": 5.144920349121094, "gen_logits_mean": -13.182821273803711, "gen_logits_min": -24.840858459472656, "gen_logits_std": 2.663247585296631, "gen_loss": 0.3029515743255615, "grad_norm": 0.4016769034812388, "learning_rate": 2.634442105263158e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9946445375680923, "mean_gen_accuracy": 0.8614028096199036, "mean_token_accuracy": 0.8970619440078735, "num_tokens": 983761656.0, "sample_num_tokens": 8631.0, "step": 3630, "total_num_tokens": 983796180.0, "z_loss": 0.0006162136560305953 }, { "copy_logits_max": -0.8962031602859497, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.5625, "epoch": 0.7415879499617054, "gen_logits_max": 5.580327033996582, "gen_logits_mean": -13.806523323059082, "gen_logits_min": -24.86927032470703, "gen_logits_std": 2.5947518348693848, "gen_loss": 0.31284886598587036, "grad_norm": 0.38135277180701677, "learning_rate": 2.634315789473684e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9946921169757843, "mean_gen_accuracy": 0.874515950679779, "mean_token_accuracy": 0.9008282423019409, "num_tokens": 984045079.0, "sample_num_tokens": 7812.25, "step": 3631, "total_num_tokens": 984076328.0, "z_loss": 0.0006333664059638977 }, { "copy_logits_max": 0.09608511626720428, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.6875, "epoch": 0.7417921878989022, "gen_logits_max": 4.871983528137207, "gen_logits_mean": -13.378324508666992, "gen_logits_min": -24.971786499023438, "gen_logits_std": 2.661407947540283, "gen_loss": 0.3013190031051636, "grad_norm": 0.5433060577247636, "learning_rate": 2.6341894736842105e-05, "loss": 0.306, "mean_copy_accuracy": 0.9958538711071014, "mean_gen_accuracy": 0.8566901683807373, "mean_token_accuracy": 0.8975649178028107, "num_tokens": 984320095.0, "sample_num_tokens": 7671.25, "step": 3632, "total_num_tokens": 984350780.0, "z_loss": 0.0007313722162507474 }, { "copy_logits_max": -3.655233383178711, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.8125, "epoch": 0.741996425836099, "gen_logits_max": 5.615217685699463, "gen_logits_mean": -14.43991756439209, "gen_logits_min": -25.84126091003418, "gen_logits_std": 2.6289865970611572, "gen_loss": 0.33172357082366943, "grad_norm": 0.4072581131940021, "learning_rate": 2.634063157894737e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9944742023944855, "mean_gen_accuracy": 0.8655126243829727, "mean_token_accuracy": 0.89652518928051, "num_tokens": 984587694.0, "sample_num_tokens": 7504.0, "step": 3633, "total_num_tokens": 984617710.0, "z_loss": 0.0007240786799229681 }, { "copy_logits_max": 1.0883618593215942, "copy_logits_min": -625000064.0, "copy_num_tokens": 580.4375, "epoch": 0.7422006637732959, "gen_logits_max": 6.469902038574219, "gen_logits_mean": -12.064767837524414, "gen_logits_min": -23.64788246154785, "gen_logits_std": 2.67366361618042, "gen_loss": 0.29168951511383057, "grad_norm": 0.41581319101702086, "learning_rate": 2.6339368421052634e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9946993887424469, "mean_gen_accuracy": 0.8664580881595612, "mean_token_accuracy": 0.8981537073850632, "num_tokens": 984861958.0, "sample_num_tokens": 9195.0, "step": 3634, "total_num_tokens": 984898738.0, "z_loss": 0.0006874797400087118 }, { "copy_logits_max": 1.1899926662445068, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.25, "epoch": 0.7424049017104927, "gen_logits_max": 5.8751373291015625, "gen_logits_mean": -13.105249404907227, "gen_logits_min": -24.817161560058594, "gen_logits_std": 2.7124428749084473, "gen_loss": 0.2919924855232239, "grad_norm": 0.40872411459368224, "learning_rate": 2.6338105263157895e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9943063259124756, "mean_gen_accuracy": 0.8601809293031693, "mean_token_accuracy": 0.8938238620758057, "num_tokens": 985134991.0, "sample_num_tokens": 9326.25, "step": 3635, "total_num_tokens": 985172296.0, "z_loss": 0.0006897400016896427 }, { "copy_logits_max": -0.17438015341758728, "copy_logits_min": -750000128.0, "copy_num_tokens": 433.5, "epoch": 0.7426091396476896, "gen_logits_max": 5.7474517822265625, "gen_logits_mean": -12.663046836853027, "gen_logits_min": -24.089916229248047, "gen_logits_std": 2.657536506652832, "gen_loss": 0.3180296719074249, "grad_norm": 0.43722044091766044, "learning_rate": 2.633684210526316e-05, "loss": 0.3386, "mean_copy_accuracy": 0.9940707385540009, "mean_gen_accuracy": 0.8554423600435257, "mean_token_accuracy": 0.8874967992305756, "num_tokens": 985406436.0, "sample_num_tokens": 8090.0, "step": 3636, "total_num_tokens": 985438796.0, "z_loss": 0.0007475849706679583 }, { "copy_logits_max": -2.4979047775268555, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.0625, "epoch": 0.7428133775848864, "gen_logits_max": 4.638599872589111, "gen_logits_mean": -15.199975967407227, "gen_logits_min": -26.340410232543945, "gen_logits_std": 2.654035806655884, "gen_loss": 0.3323725163936615, "grad_norm": 0.39759479139855386, "learning_rate": 2.633557894736842e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9958109706640244, "mean_gen_accuracy": 0.869160458445549, "mean_token_accuracy": 0.9037921130657196, "num_tokens": 985670412.0, "sample_num_tokens": 8851.0, "step": 3637, "total_num_tokens": 985705816.0, "z_loss": 0.0007266770116984844 }, { "copy_logits_max": -0.7009270191192627, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.875, "epoch": 0.7430176155220832, "gen_logits_max": 4.705173015594482, "gen_logits_mean": -14.062644958496094, "gen_logits_min": -25.74625015258789, "gen_logits_std": 2.721245527267456, "gen_loss": 0.32488709688186646, "grad_norm": 0.4583052814588657, "learning_rate": 2.6334315789473685e-05, "loss": 0.3108, "mean_copy_accuracy": 0.993745744228363, "mean_gen_accuracy": 0.8648690283298492, "mean_token_accuracy": 0.8992953449487686, "num_tokens": 985945576.0, "sample_num_tokens": 7959.5, "step": 3638, "total_num_tokens": 985977414.0, "z_loss": 0.0006672514136880636 }, { "copy_logits_max": -2.633704423904419, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.8125, "epoch": 0.74322185345928, "gen_logits_max": 6.000463485717773, "gen_logits_mean": -13.360580444335938, "gen_logits_min": -24.800886154174805, "gen_logits_std": 2.658411741256714, "gen_loss": 0.30872902274131775, "grad_norm": 0.427898616904996, "learning_rate": 2.6333052631578945e-05, "loss": 0.304, "mean_copy_accuracy": 0.9939078390598297, "mean_gen_accuracy": 0.8671894371509552, "mean_token_accuracy": 0.9000058621168137, "num_tokens": 986214895.0, "sample_num_tokens": 8428.25, "step": 3639, "total_num_tokens": 986248608.0, "z_loss": 0.000656401738524437 }, { "copy_logits_max": -0.7554834485054016, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.0625, "epoch": 0.7434260913964769, "gen_logits_max": 5.1976823806762695, "gen_logits_mean": -13.405226707458496, "gen_logits_min": -25.511425018310547, "gen_logits_std": 2.735240936279297, "gen_loss": 0.28728508949279785, "grad_norm": 0.4103581255591005, "learning_rate": 2.633178947368421e-05, "loss": 0.2969, "mean_copy_accuracy": 0.9943780899047852, "mean_gen_accuracy": 0.8703406155109406, "mean_token_accuracy": 0.9017230272293091, "num_tokens": 986503560.0, "sample_num_tokens": 9254.5, "step": 3640, "total_num_tokens": 986540578.0, "z_loss": 0.0006540374597534537 }, { "copy_logits_max": -3.6346793174743652, "copy_logits_min": -687500032.0, "copy_num_tokens": 270.0625, "epoch": 0.7436303293336737, "gen_logits_max": 5.544600963592529, "gen_logits_mean": -14.329366683959961, "gen_logits_min": -25.863426208496094, "gen_logits_std": 2.6758151054382324, "gen_loss": 0.3334817588329315, "grad_norm": 0.4074779731752401, "learning_rate": 2.6330526315789477e-05, "loss": 0.3166, "mean_copy_accuracy": 0.995350107550621, "mean_gen_accuracy": 0.8624949604272842, "mean_token_accuracy": 0.8947092443704605, "num_tokens": 986785576.0, "sample_num_tokens": 7320.5, "step": 3641, "total_num_tokens": 986814858.0, "z_loss": 0.0007423408096656203 }, { "copy_logits_max": -1.3964380025863647, "copy_logits_min": -687500032.0, "copy_num_tokens": 668.8125, "epoch": 0.7438345672708706, "gen_logits_max": 3.975778579711914, "gen_logits_mean": -14.531801223754883, "gen_logits_min": -26.672964096069336, "gen_logits_std": 2.704155921936035, "gen_loss": 0.31035393476486206, "grad_norm": 0.4088370288940985, "learning_rate": 2.632926315789474e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9957615584135056, "mean_gen_accuracy": 0.8685528784990311, "mean_token_accuracy": 0.9028954803943634, "num_tokens": 987073513.0, "sample_num_tokens": 8360.25, "step": 3642, "total_num_tokens": 987106954.0, "z_loss": 0.0007063586381264031 }, { "copy_logits_max": -1.0915799140930176, "copy_logits_min": -687500032.0, "copy_num_tokens": 529.8125, "epoch": 0.7440388052080674, "gen_logits_max": 4.831946849822998, "gen_logits_mean": -13.36744213104248, "gen_logits_min": -25.12997817993164, "gen_logits_std": 2.695166826248169, "gen_loss": 0.3251616954803467, "grad_norm": 0.4287904982530846, "learning_rate": 2.6328000000000003e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9957208335399628, "mean_gen_accuracy": 0.8640217781066895, "mean_token_accuracy": 0.8971636295318604, "num_tokens": 987352937.0, "sample_num_tokens": 8407.25, "step": 3643, "total_num_tokens": 987386566.0, "z_loss": 0.0006962938932701945 }, { "copy_logits_max": -2.4338719844818115, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.5, "epoch": 0.7442430431452642, "gen_logits_max": 4.88879919052124, "gen_logits_mean": -14.136200904846191, "gen_logits_min": -25.70240592956543, "gen_logits_std": 2.6557059288024902, "gen_loss": 0.2884940505027771, "grad_norm": 0.4250187056958043, "learning_rate": 2.6326736842105264e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9948611110448837, "mean_gen_accuracy": 0.8634660691022873, "mean_token_accuracy": 0.8965165615081787, "num_tokens": 987620759.0, "sample_num_tokens": 9211.75, "step": 3644, "total_num_tokens": 987657606.0, "z_loss": 0.0006170488195493817 }, { "copy_logits_max": -3.5392303466796875, "copy_logits_min": -687500032.0, "copy_num_tokens": 361.9375, "epoch": 0.744447281082461, "gen_logits_max": 5.64903450012207, "gen_logits_mean": -13.631753921508789, "gen_logits_min": -25.2003116607666, "gen_logits_std": 2.674659252166748, "gen_loss": 0.30352067947387695, "grad_norm": 0.4695058991086407, "learning_rate": 2.6325473684210528e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9945858120918274, "mean_gen_accuracy": 0.8654900193214417, "mean_token_accuracy": 0.893648087978363, "num_tokens": 987871011.0, "sample_num_tokens": 7412.75, "step": 3645, "total_num_tokens": 987900662.0, "z_loss": 0.0006350307958200574 }, { "copy_logits_max": -4.895991802215576, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.6875, "epoch": 0.7446515190196579, "gen_logits_max": 4.751340866088867, "gen_logits_mean": -13.606988906860352, "gen_logits_min": -25.33357810974121, "gen_logits_std": 2.6488871574401855, "gen_loss": 0.30975234508514404, "grad_norm": 0.4548799557407122, "learning_rate": 2.632421052631579e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9927914142608643, "mean_gen_accuracy": 0.8695853799581528, "mean_token_accuracy": 0.9004345238208771, "num_tokens": 988131964.0, "sample_num_tokens": 7726.5, "step": 3646, "total_num_tokens": 988162870.0, "z_loss": 0.000616918783634901 }, { "copy_logits_max": -4.313773155212402, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.875, "epoch": 0.7448557569568547, "gen_logits_max": 4.26704216003418, "gen_logits_mean": -14.16335391998291, "gen_logits_min": -25.219276428222656, "gen_logits_std": 2.6025381088256836, "gen_loss": 0.29405078291893005, "grad_norm": 0.4124074896484368, "learning_rate": 2.6322947368421053e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9951727986335754, "mean_gen_accuracy": 0.8681188821792603, "mean_token_accuracy": 0.9000483900308609, "num_tokens": 988399305.0, "sample_num_tokens": 8825.25, "step": 3647, "total_num_tokens": 988434606.0, "z_loss": 0.000613240001257509 }, { "copy_logits_max": -1.8953131437301636, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.125, "epoch": 0.7450599948940516, "gen_logits_max": 5.045271873474121, "gen_logits_mean": -13.555567741394043, "gen_logits_min": -25.20563507080078, "gen_logits_std": 2.693518877029419, "gen_loss": 0.3256531357765198, "grad_norm": 0.45228900701197866, "learning_rate": 2.6321684210526314e-05, "loss": 0.3089, "mean_copy_accuracy": 0.994349405169487, "mean_gen_accuracy": 0.8629025220870972, "mean_token_accuracy": 0.8962123095989227, "num_tokens": 988677726.0, "sample_num_tokens": 8611.5, "step": 3648, "total_num_tokens": 988712172.0, "z_loss": 0.0007117923232726753 }, { "copy_logits_max": -0.9223875999450684, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.875, "epoch": 0.7452642328312484, "gen_logits_max": 4.95930290222168, "gen_logits_mean": -14.083572387695312, "gen_logits_min": -25.992441177368164, "gen_logits_std": 2.695246696472168, "gen_loss": 0.2763206362724304, "grad_norm": 0.4511532380050805, "learning_rate": 2.6320421052631582e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9963133037090302, "mean_gen_accuracy": 0.8712309747934341, "mean_token_accuracy": 0.9029527306556702, "num_tokens": 988957962.0, "sample_num_tokens": 8634.5, "step": 3649, "total_num_tokens": 988992500.0, "z_loss": 0.0005706031224690378 }, { "copy_logits_max": -5.270218849182129, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.375, "epoch": 0.7454684707684452, "gen_logits_max": 5.811293601989746, "gen_logits_mean": -12.647744178771973, "gen_logits_min": -24.684404373168945, "gen_logits_std": 2.6439764499664307, "gen_loss": 0.29622071981430054, "grad_norm": 0.4210763808840196, "learning_rate": 2.6319157894736843e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9947409331798553, "mean_gen_accuracy": 0.8738650381565094, "mean_token_accuracy": 0.9012785404920578, "num_tokens": 989237238.0, "sample_num_tokens": 9332.0, "step": 3650, "total_num_tokens": 989274566.0, "z_loss": 0.0006565882358700037 }, { "copy_logits_max": -5.07508659362793, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.9375, "epoch": 0.7456727087056421, "gen_logits_max": 5.964259147644043, "gen_logits_mean": -13.54902172088623, "gen_logits_min": -25.356277465820312, "gen_logits_std": 2.6619536876678467, "gen_loss": 0.3265801668167114, "grad_norm": 0.5014724914898616, "learning_rate": 2.6317894736842107e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9933685511350632, "mean_gen_accuracy": 0.8684845566749573, "mean_token_accuracy": 0.8960273563861847, "num_tokens": 989500363.0, "sample_num_tokens": 8848.25, "step": 3651, "total_num_tokens": 989535756.0, "z_loss": 0.0006543145282194018 }, { "copy_logits_max": -4.881356716156006, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.25, "epoch": 0.7458769466428389, "gen_logits_max": 5.561545372009277, "gen_logits_mean": -13.979201316833496, "gen_logits_min": -25.445783615112305, "gen_logits_std": 2.638711452484131, "gen_loss": 0.30411022901535034, "grad_norm": 0.39479064548729376, "learning_rate": 2.6316631578947368e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9953467547893524, "mean_gen_accuracy": 0.8621715605258942, "mean_token_accuracy": 0.8963501900434494, "num_tokens": 989779682.0, "sample_num_tokens": 7931.5, "step": 3652, "total_num_tokens": 989811408.0, "z_loss": 0.000648540910333395 }, { "copy_logits_max": 0.17436471581459045, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.875, "epoch": 0.7460811845800357, "gen_logits_max": 5.340203285217285, "gen_logits_mean": -13.051994323730469, "gen_logits_min": -24.337881088256836, "gen_logits_std": 2.615832805633545, "gen_loss": 0.3053746819496155, "grad_norm": 0.45305307077003654, "learning_rate": 2.6315368421052632e-05, "loss": 0.3196, "mean_copy_accuracy": 0.9956171661615372, "mean_gen_accuracy": 0.8614921420812607, "mean_token_accuracy": 0.8956485390663147, "num_tokens": 990046939.0, "sample_num_tokens": 8558.75, "step": 3653, "total_num_tokens": 990081174.0, "z_loss": 0.0008158920099958777 }, { "copy_logits_max": 0.14203602075576782, "copy_logits_min": -687500032.0, "copy_num_tokens": 746.125, "epoch": 0.7462854225172326, "gen_logits_max": 5.37603759765625, "gen_logits_mean": -12.627664566040039, "gen_logits_min": -24.4653263092041, "gen_logits_std": 2.650621175765991, "gen_loss": 0.30697911977767944, "grad_norm": 0.42577407396069866, "learning_rate": 2.6314105263157893e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9963643401861191, "mean_gen_accuracy": 0.8607020676136017, "mean_token_accuracy": 0.8999712914228439, "num_tokens": 990325264.0, "sample_num_tokens": 9492.0, "step": 3654, "total_num_tokens": 990363232.0, "z_loss": 0.0009331782930530608 }, { "copy_logits_max": -1.5019817352294922, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.75, "epoch": 0.7464896604544294, "gen_logits_max": 5.121125221252441, "gen_logits_mean": -13.523120880126953, "gen_logits_min": -25.250732421875, "gen_logits_std": 2.615779161453247, "gen_loss": 0.2582724094390869, "grad_norm": 0.3936775849765039, "learning_rate": 2.6312842105263158e-05, "loss": 0.2963, "mean_copy_accuracy": 0.993619978427887, "mean_gen_accuracy": 0.8722153604030609, "mean_token_accuracy": 0.9017761051654816, "num_tokens": 990589563.0, "sample_num_tokens": 8748.25, "step": 3655, "total_num_tokens": 990624556.0, "z_loss": 0.0007441035122610629 }, { "copy_logits_max": -1.5091878175735474, "copy_logits_min": -750000064.0, "copy_num_tokens": 403.625, "epoch": 0.7466938983916263, "gen_logits_max": 4.904823303222656, "gen_logits_mean": -15.161477088928223, "gen_logits_min": -26.22342300415039, "gen_logits_std": 2.6438698768615723, "gen_loss": 0.29762914776802063, "grad_norm": 0.39476655832023594, "learning_rate": 2.6311578947368422e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9934970736503601, "mean_gen_accuracy": 0.8672231733798981, "mean_token_accuracy": 0.8956551998853683, "num_tokens": 990870409.0, "sample_num_tokens": 8779.25, "step": 3656, "total_num_tokens": 990905526.0, "z_loss": 0.0007688145851716399 }, { "copy_logits_max": -1.1770395040512085, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.25, "epoch": 0.7468981363288231, "gen_logits_max": 5.992386817932129, "gen_logits_mean": -13.761813163757324, "gen_logits_min": -25.301122665405273, "gen_logits_std": 2.674701690673828, "gen_loss": 0.31348612904548645, "grad_norm": 0.3737740491585182, "learning_rate": 2.6310315789473686e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9941786527633667, "mean_gen_accuracy": 0.8748227804899216, "mean_token_accuracy": 0.9020838290452957, "num_tokens": 991145885.0, "sample_num_tokens": 10213.75, "step": 3657, "total_num_tokens": 991186740.0, "z_loss": 0.0007744516478851438 }, { "copy_logits_max": 0.3719399571418762, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.6875, "epoch": 0.7471023742660199, "gen_logits_max": 5.619915008544922, "gen_logits_mean": -14.011013984680176, "gen_logits_min": -25.955883026123047, "gen_logits_std": 2.710622787475586, "gen_loss": 0.28657618165016174, "grad_norm": 0.3753701749497238, "learning_rate": 2.630905263157895e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9943366944789886, "mean_gen_accuracy": 0.8711769729852676, "mean_token_accuracy": 0.8971067816019058, "num_tokens": 991440506.0, "sample_num_tokens": 8465.0, "step": 3658, "total_num_tokens": 991474366.0, "z_loss": 0.0006844180170446634 }, { "copy_logits_max": -2.076982259750366, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.625, "epoch": 0.7473066122032167, "gen_logits_max": 4.803393840789795, "gen_logits_mean": -14.821451187133789, "gen_logits_min": -25.868194580078125, "gen_logits_std": 2.603266716003418, "gen_loss": 0.3055112659931183, "grad_norm": 0.43075130492505653, "learning_rate": 2.630778947368421e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9957921355962753, "mean_gen_accuracy": 0.8641548305749893, "mean_token_accuracy": 0.8984784483909607, "num_tokens": 991723685.0, "sample_num_tokens": 9151.75, "step": 3659, "total_num_tokens": 991760292.0, "z_loss": 0.0006626619724556804 }, { "copy_logits_max": -5.960503578186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 195.8125, "epoch": 0.7475108501404136, "gen_logits_max": 5.937899112701416, "gen_logits_mean": -15.170075416564941, "gen_logits_min": -26.11301612854004, "gen_logits_std": 2.6039440631866455, "gen_loss": 0.3410497307777405, "grad_norm": 0.40287187116255324, "learning_rate": 2.6306526315789476e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9953384697437286, "mean_gen_accuracy": 0.8698647767305374, "mean_token_accuracy": 0.895779550075531, "num_tokens": 991983523.0, "sample_num_tokens": 6974.25, "step": 3660, "total_num_tokens": 992011420.0, "z_loss": 0.0007147707510739565 }, { "copy_logits_max": -0.9738487601280212, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.9375, "epoch": 0.7477150880776104, "gen_logits_max": 5.613539695739746, "gen_logits_mean": -13.303421020507812, "gen_logits_min": -24.425025939941406, "gen_logits_std": 2.650078296661377, "gen_loss": 0.3081984519958496, "grad_norm": 0.4346555459398811, "learning_rate": 2.6305263157894737e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9946594685316086, "mean_gen_accuracy": 0.8630969375371933, "mean_token_accuracy": 0.8925131410360336, "num_tokens": 992256796.0, "sample_num_tokens": 7866.0, "step": 3661, "total_num_tokens": 992288260.0, "z_loss": 0.0006422207225114107 }, { "copy_logits_max": -3.360100746154785, "copy_logits_min": -687500032.0, "copy_num_tokens": 714.75, "epoch": 0.7479193260148073, "gen_logits_max": 4.014991760253906, "gen_logits_mean": -14.713187217712402, "gen_logits_min": -26.369091033935547, "gen_logits_std": 2.690978765487671, "gen_loss": 0.274367094039917, "grad_norm": 0.7342479380346154, "learning_rate": 2.6304e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9944053292274475, "mean_gen_accuracy": 0.8621446788311005, "mean_token_accuracy": 0.8949691206216812, "num_tokens": 992531236.0, "sample_num_tokens": 10445.5, "step": 3662, "total_num_tokens": 992573018.0, "z_loss": 0.0006076921708881855 }, { "copy_logits_max": 0.8707684874534607, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.0625, "epoch": 0.7481235639520041, "gen_logits_max": 4.394349098205566, "gen_logits_mean": -14.561607360839844, "gen_logits_min": -26.079177856445312, "gen_logits_std": 2.701704978942871, "gen_loss": 0.27408769726753235, "grad_norm": 0.38013701488862006, "learning_rate": 2.6302736842105262e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9961493611335754, "mean_gen_accuracy": 0.8652877807617188, "mean_token_accuracy": 0.8985224515199661, "num_tokens": 992822935.0, "sample_num_tokens": 9179.25, "step": 3663, "total_num_tokens": 992859652.0, "z_loss": 0.0006829799385741353 }, { "copy_logits_max": 1.99850332736969, "copy_logits_min": -687500032.0, "copy_num_tokens": 650.375, "epoch": 0.7483278018892009, "gen_logits_max": 5.306805610656738, "gen_logits_mean": -13.37144660949707, "gen_logits_min": -25.424278259277344, "gen_logits_std": 2.7046432495117188, "gen_loss": 0.2866433560848236, "grad_norm": 0.3727452874023377, "learning_rate": 2.6301473684210526e-05, "loss": 0.275, "mean_copy_accuracy": 0.9967640936374664, "mean_gen_accuracy": 0.8729568719863892, "mean_token_accuracy": 0.9079132974147797, "num_tokens": 993114803.0, "sample_num_tokens": 10008.75, "step": 3664, "total_num_tokens": 993154838.0, "z_loss": 0.0007325370097532868 }, { "copy_logits_max": -2.6490859985351562, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.875, "epoch": 0.7485320398263977, "gen_logits_max": 4.944334030151367, "gen_logits_mean": -15.096317291259766, "gen_logits_min": -26.32022476196289, "gen_logits_std": 2.6604182720184326, "gen_loss": 0.30195924639701843, "grad_norm": 0.40572931404443735, "learning_rate": 2.630021052631579e-05, "loss": 0.322, "mean_copy_accuracy": 0.9944973886013031, "mean_gen_accuracy": 0.8648334294557571, "mean_token_accuracy": 0.893167033791542, "num_tokens": 993392932.0, "sample_num_tokens": 8287.0, "step": 3665, "total_num_tokens": 993426080.0, "z_loss": 0.0007447157986462116 }, { "copy_logits_max": -0.9197518825531006, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.1875, "epoch": 0.7487362777635946, "gen_logits_max": 6.143579959869385, "gen_logits_mean": -13.34655475616455, "gen_logits_min": -24.4731502532959, "gen_logits_std": 2.6593544483184814, "gen_loss": 0.31412002444267273, "grad_norm": 0.44522362985072433, "learning_rate": 2.6298947368421055e-05, "loss": 0.3268, "mean_copy_accuracy": 0.995013102889061, "mean_gen_accuracy": 0.8630413264036179, "mean_token_accuracy": 0.894955962896347, "num_tokens": 993659557.0, "sample_num_tokens": 7400.25, "step": 3666, "total_num_tokens": 993689158.0, "z_loss": 0.000638828962109983 }, { "copy_logits_max": -4.569618225097656, "copy_logits_min": -750000000.0, "copy_num_tokens": 202.875, "epoch": 0.7489405157007915, "gen_logits_max": 6.364438056945801, "gen_logits_mean": -14.202802658081055, "gen_logits_min": -25.34880828857422, "gen_logits_std": 2.6639504432678223, "gen_loss": 0.3689269423484802, "grad_norm": 0.4416444249228253, "learning_rate": 2.6297684210526316e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9936835914850235, "mean_gen_accuracy": 0.8704364150762558, "mean_token_accuracy": 0.895932525396347, "num_tokens": 993896788.0, "sample_num_tokens": 6368.0, "step": 3667, "total_num_tokens": 993922260.0, "z_loss": 0.0007563638500869274 }, { "copy_logits_max": -1.572525978088379, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.4375, "epoch": 0.7491447536379883, "gen_logits_max": 5.026776313781738, "gen_logits_mean": -14.511387825012207, "gen_logits_min": -25.990459442138672, "gen_logits_std": 2.6637210845947266, "gen_loss": 0.34585869312286377, "grad_norm": 0.3998952043314351, "learning_rate": 2.629642105263158e-05, "loss": 0.3271, "mean_copy_accuracy": 0.9937475919723511, "mean_gen_accuracy": 0.8573592007160187, "mean_token_accuracy": 0.8893257230520248, "num_tokens": 994156220.0, "sample_num_tokens": 7380.5, "step": 3668, "total_num_tokens": 994185742.0, "z_loss": 0.0007621589465998113 }, { "copy_logits_max": -1.0789908170700073, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.625, "epoch": 0.7493489915751851, "gen_logits_max": 5.508376121520996, "gen_logits_mean": -13.729025840759277, "gen_logits_min": -25.378284454345703, "gen_logits_std": 2.6825170516967773, "gen_loss": 0.31366682052612305, "grad_norm": 0.4347115414744353, "learning_rate": 2.6295157894736844e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9954613745212555, "mean_gen_accuracy": 0.8543278872966766, "mean_token_accuracy": 0.8910884410142899, "num_tokens": 994410876.0, "sample_num_tokens": 8174.5, "step": 3669, "total_num_tokens": 994443574.0, "z_loss": 0.0006937716389074922 }, { "copy_logits_max": -4.644925117492676, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.9375, "epoch": 0.7495532295123819, "gen_logits_max": 5.493027687072754, "gen_logits_mean": -14.1400785446167, "gen_logits_min": -25.00863265991211, "gen_logits_std": 2.5931479930877686, "gen_loss": 0.3383476436138153, "grad_norm": 0.38648955312995786, "learning_rate": 2.6293894736842105e-05, "loss": 0.3253, "mean_copy_accuracy": 0.9952693581581116, "mean_gen_accuracy": 0.866531252861023, "mean_token_accuracy": 0.8925569802522659, "num_tokens": 994685259.0, "sample_num_tokens": 7426.25, "step": 3670, "total_num_tokens": 994714964.0, "z_loss": 0.0006695922929793596 }, { "copy_logits_max": 1.116868495941162, "copy_logits_min": -750000064.0, "copy_num_tokens": 534.375, "epoch": 0.7497574674495787, "gen_logits_max": 5.864092826843262, "gen_logits_mean": -12.459449768066406, "gen_logits_min": -24.452974319458008, "gen_logits_std": 2.6767680644989014, "gen_loss": 0.3120241165161133, "grad_norm": 0.42365850261608834, "learning_rate": 2.629263157894737e-05, "loss": 0.332, "mean_copy_accuracy": 0.99466872215271, "mean_gen_accuracy": 0.8588019758462906, "mean_token_accuracy": 0.8910556733608246, "num_tokens": 994961574.0, "sample_num_tokens": 8938.0, "step": 3671, "total_num_tokens": 994997326.0, "z_loss": 0.0006784545257687569 }, { "copy_logits_max": -4.0022053718566895, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.4375, "epoch": 0.7499617053867756, "gen_logits_max": 4.692110061645508, "gen_logits_mean": -15.037477493286133, "gen_logits_min": -26.59379005432129, "gen_logits_std": 2.708616018295288, "gen_loss": 0.2924222946166992, "grad_norm": 0.4205519936874978, "learning_rate": 2.629136842105263e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9942710697650909, "mean_gen_accuracy": 0.8666634112596512, "mean_token_accuracy": 0.8989045917987823, "num_tokens": 995222431.0, "sample_num_tokens": 7705.25, "step": 3672, "total_num_tokens": 995253252.0, "z_loss": 0.000570928503293544 }, { "copy_logits_max": -0.6265170574188232, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.625, "epoch": 0.7501659433239725, "gen_logits_max": 4.636144638061523, "gen_logits_mean": -14.15118408203125, "gen_logits_min": -25.735881805419922, "gen_logits_std": 2.6774230003356934, "gen_loss": 0.3392658531665802, "grad_norm": 0.38541692676605793, "learning_rate": 2.6290105263157895e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9959118217229843, "mean_gen_accuracy": 0.859464630484581, "mean_token_accuracy": 0.8978559225797653, "num_tokens": 995499348.0, "sample_num_tokens": 8705.5, "step": 3673, "total_num_tokens": 995534170.0, "z_loss": 0.0007190625183284283 }, { "copy_logits_max": -2.0645642280578613, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.3125, "epoch": 0.7503701812611693, "gen_logits_max": 5.179425239562988, "gen_logits_mean": -14.816804885864258, "gen_logits_min": -26.17028045654297, "gen_logits_std": 2.673251152038574, "gen_loss": 0.33776015043258667, "grad_norm": 0.4043375608050652, "learning_rate": 2.628884210526316e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9959805756807327, "mean_gen_accuracy": 0.8614097088575363, "mean_token_accuracy": 0.9010828733444214, "num_tokens": 995778749.0, "sample_num_tokens": 7323.75, "step": 3674, "total_num_tokens": 995808044.0, "z_loss": 0.0007833319250494242 }, { "copy_logits_max": 0.004459202289581299, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.375, "epoch": 0.7505744191983661, "gen_logits_max": 4.944606304168701, "gen_logits_mean": -13.215593338012695, "gen_logits_min": -24.684959411621094, "gen_logits_std": 2.6638240814208984, "gen_loss": 0.2936176359653473, "grad_norm": 0.39564601491242773, "learning_rate": 2.6287578947368423e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9950553625822067, "mean_gen_accuracy": 0.8636422902345657, "mean_token_accuracy": 0.8972235471010208, "num_tokens": 996043027.0, "sample_num_tokens": 7179.75, "step": 3675, "total_num_tokens": 996071746.0, "z_loss": 0.0007252307841554284 }, { "copy_logits_max": -2.779698610305786, "copy_logits_min": -687500032.0, "copy_num_tokens": 336.1875, "epoch": 0.7507786571355629, "gen_logits_max": 5.999271392822266, "gen_logits_mean": -13.623636245727539, "gen_logits_min": -24.880878448486328, "gen_logits_std": 2.652451515197754, "gen_loss": 0.3347353935241699, "grad_norm": 0.40250602662307156, "learning_rate": 2.6286315789473684e-05, "loss": 0.3149, "mean_copy_accuracy": 0.995215967297554, "mean_gen_accuracy": 0.8645195364952087, "mean_token_accuracy": 0.8958206176757812, "num_tokens": 996317996.0, "sample_num_tokens": 8087.5, "step": 3676, "total_num_tokens": 996350346.0, "z_loss": 0.0007078337948769331 }, { "copy_logits_max": -3.4893381595611572, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.1875, "epoch": 0.7509828950727597, "gen_logits_max": 5.652634620666504, "gen_logits_mean": -13.37144660949707, "gen_logits_min": -25.015111923217773, "gen_logits_std": 2.6531078815460205, "gen_loss": 0.3321782946586609, "grad_norm": 0.4233860815580358, "learning_rate": 2.628505263157895e-05, "loss": 0.3177, "mean_copy_accuracy": 0.9944516122341156, "mean_gen_accuracy": 0.8603192716836929, "mean_token_accuracy": 0.8945315331220627, "num_tokens": 996568495.0, "sample_num_tokens": 8328.75, "step": 3677, "total_num_tokens": 996601810.0, "z_loss": 0.0007821092731319368 }, { "copy_logits_max": -1.6162625551223755, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.9375, "epoch": 0.7511871330099567, "gen_logits_max": 5.351980209350586, "gen_logits_mean": -13.800724029541016, "gen_logits_min": -25.236162185668945, "gen_logits_std": 2.650399684906006, "gen_loss": 0.34685346484184265, "grad_norm": 0.4640431408555099, "learning_rate": 2.628378947368421e-05, "loss": 0.329, "mean_copy_accuracy": 0.9944083541631699, "mean_gen_accuracy": 0.8593444377183914, "mean_token_accuracy": 0.8924151062965393, "num_tokens": 996823761.0, "sample_num_tokens": 8601.25, "step": 3678, "total_num_tokens": 996858166.0, "z_loss": 0.0008106925524771214 }, { "copy_logits_max": -1.4249825477600098, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.5625, "epoch": 0.7513913709471535, "gen_logits_max": 5.54754638671875, "gen_logits_mean": -12.511701583862305, "gen_logits_min": -24.276517868041992, "gen_logits_std": 2.689903736114502, "gen_loss": 0.28087514638900757, "grad_norm": 0.40216647186215776, "learning_rate": 2.6282526315789474e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9956268221139908, "mean_gen_accuracy": 0.8608876168727875, "mean_token_accuracy": 0.8997603058815002, "num_tokens": 997107407.0, "sample_num_tokens": 8607.25, "step": 3679, "total_num_tokens": 997141836.0, "z_loss": 0.0006921055028215051 }, { "copy_logits_max": -2.268472671508789, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.5625, "epoch": 0.7515956088843503, "gen_logits_max": 3.9101898670196533, "gen_logits_mean": -15.34520149230957, "gen_logits_min": -26.928674697875977, "gen_logits_std": 2.675231456756592, "gen_loss": 0.26608794927597046, "grad_norm": 0.4053949333887448, "learning_rate": 2.6281263157894735e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9952910840511322, "mean_gen_accuracy": 0.8675562739372253, "mean_token_accuracy": 0.8994274735450745, "num_tokens": 997378566.0, "sample_num_tokens": 8332.5, "step": 3680, "total_num_tokens": 997411896.0, "z_loss": 0.0006459139985963702 }, { "copy_logits_max": -3.149157762527466, "copy_logits_min": -687500032.0, "copy_num_tokens": 282.75, "epoch": 0.7517998468215471, "gen_logits_max": 5.615869522094727, "gen_logits_mean": -14.077569007873535, "gen_logits_min": -25.08232879638672, "gen_logits_std": 2.6357083320617676, "gen_loss": 0.3869931697845459, "grad_norm": 0.4037809957777093, "learning_rate": 2.628e-05, "loss": 0.3436, "mean_copy_accuracy": 0.9945059716701508, "mean_gen_accuracy": 0.8562625646591187, "mean_token_accuracy": 0.8855199813842773, "num_tokens": 997642217.0, "sample_num_tokens": 6930.25, "step": 3681, "total_num_tokens": 997669938.0, "z_loss": 0.0007876278250478208 }, { "copy_logits_max": -3.190901279449463, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.4375, "epoch": 0.7520040847587439, "gen_logits_max": 5.0421905517578125, "gen_logits_mean": -13.818522453308105, "gen_logits_min": -24.699113845825195, "gen_logits_std": 2.601052761077881, "gen_loss": 0.3600004315376282, "grad_norm": 0.42685239343727777, "learning_rate": 2.6278736842105267e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9959026128053665, "mean_gen_accuracy": 0.8563120365142822, "mean_token_accuracy": 0.8956235647201538, "num_tokens": 997925959.0, "sample_num_tokens": 8490.25, "step": 3682, "total_num_tokens": 997959920.0, "z_loss": 0.0007102119270712137 }, { "copy_logits_max": -0.4537677764892578, "copy_logits_min": -687500032.0, "copy_num_tokens": 810.0, "epoch": 0.7522083226959407, "gen_logits_max": 5.014954566955566, "gen_logits_mean": -11.504888534545898, "gen_logits_min": -23.893692016601562, "gen_logits_std": 2.687523126602173, "gen_loss": 0.27855294942855835, "grad_norm": 0.43951924092236905, "learning_rate": 2.6277473684210528e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9952983111143112, "mean_gen_accuracy": 0.860515832901001, "mean_token_accuracy": 0.9036357551813126, "num_tokens": 998235052.0, "sample_num_tokens": 10012.0, "step": 3683, "total_num_tokens": 998275100.0, "z_loss": 0.0005813388852402568 }, { "copy_logits_max": -4.483543395996094, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.125, "epoch": 0.7524125606331376, "gen_logits_max": 5.486760139465332, "gen_logits_mean": -13.341187477111816, "gen_logits_min": -24.791542053222656, "gen_logits_std": 2.65659236907959, "gen_loss": 0.27627724409103394, "grad_norm": 0.4105700522916786, "learning_rate": 2.6276210526315792e-05, "loss": 0.3075, "mean_copy_accuracy": 0.9955422729253769, "mean_gen_accuracy": 0.8689400106668472, "mean_token_accuracy": 0.8985240161418915, "num_tokens": 998512935.0, "sample_num_tokens": 7680.75, "step": 3684, "total_num_tokens": 998543658.0, "z_loss": 0.0006217039190232754 }, { "copy_logits_max": -3.481997013092041, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.875, "epoch": 0.7526167985703345, "gen_logits_max": 5.2575273513793945, "gen_logits_mean": -12.894899368286133, "gen_logits_min": -23.714519500732422, "gen_logits_std": 2.574263095855713, "gen_loss": 0.33887791633605957, "grad_norm": 0.4149602469779216, "learning_rate": 2.6274947368421053e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9958776533603668, "mean_gen_accuracy": 0.8629983365535736, "mean_token_accuracy": 0.8933872729539871, "num_tokens": 998778862.0, "sample_num_tokens": 8198.0, "step": 3685, "total_num_tokens": 998811654.0, "z_loss": 0.0007021636702120304 }, { "copy_logits_max": -4.502652168273926, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.5625, "epoch": 0.7528210365075313, "gen_logits_max": 4.545734405517578, "gen_logits_mean": -15.444171905517578, "gen_logits_min": -26.41057777404785, "gen_logits_std": 2.6292366981506348, "gen_loss": 0.3116944432258606, "grad_norm": 0.479296646608652, "learning_rate": 2.6273684210526317e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9940951317548752, "mean_gen_accuracy": 0.8615942150354385, "mean_token_accuracy": 0.8913048952817917, "num_tokens": 999044831.0, "sample_num_tokens": 7631.75, "step": 3686, "total_num_tokens": 999075358.0, "z_loss": 0.0007084489916451275 }, { "copy_logits_max": -6.534337520599365, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.0625, "epoch": 0.7530252744447281, "gen_logits_max": 4.92409610748291, "gen_logits_mean": -14.410697937011719, "gen_logits_min": -25.292261123657227, "gen_logits_std": 2.6322693824768066, "gen_loss": 0.3301061987876892, "grad_norm": 0.4401462383801607, "learning_rate": 2.6272421052631578e-05, "loss": 0.3231, "mean_copy_accuracy": 0.9923727214336395, "mean_gen_accuracy": 0.867014691233635, "mean_token_accuracy": 0.8932571709156036, "num_tokens": 999288086.0, "sample_num_tokens": 9429.0, "step": 3687, "total_num_tokens": 999325802.0, "z_loss": 0.0006771720363758504 }, { "copy_logits_max": -3.336460590362549, "copy_logits_min": -687500032.0, "copy_num_tokens": 544.1875, "epoch": 0.7532295123819249, "gen_logits_max": 5.172072410583496, "gen_logits_mean": -13.511703491210938, "gen_logits_min": -24.400863647460938, "gen_logits_std": 2.603473424911499, "gen_loss": 0.30507615208625793, "grad_norm": 0.41282865366674526, "learning_rate": 2.6271157894736843e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9935590624809265, "mean_gen_accuracy": 0.8707401901483536, "mean_token_accuracy": 0.8991500288248062, "num_tokens": 999572747.0, "sample_num_tokens": 9667.75, "step": 3688, "total_num_tokens": 999611418.0, "z_loss": 0.0005849332083016634 }, { "copy_logits_max": -3.6793816089630127, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.3125, "epoch": 0.7534337503191217, "gen_logits_max": 3.9360828399658203, "gen_logits_mean": -14.90835189819336, "gen_logits_min": -26.22824478149414, "gen_logits_std": 2.6501779556274414, "gen_loss": 0.2917390465736389, "grad_norm": 0.41016648993304483, "learning_rate": 2.6269894736842103e-05, "loss": 0.297, "mean_copy_accuracy": 0.9940109699964523, "mean_gen_accuracy": 0.8696056753396988, "mean_token_accuracy": 0.9012311100959778, "num_tokens": 999868272.0, "sample_num_tokens": 9099.5, "step": 3689, "total_num_tokens": 999904670.0, "z_loss": 0.0005669429083354771 }, { "copy_logits_max": -2.720404624938965, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.875, "epoch": 0.7536379882563186, "gen_logits_max": 3.9541609287261963, "gen_logits_mean": -14.076923370361328, "gen_logits_min": -25.405651092529297, "gen_logits_std": 2.6357688903808594, "gen_loss": 0.23520098626613617, "grad_norm": 0.39121361619306255, "learning_rate": 2.626863157894737e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9939796924591064, "mean_gen_accuracy": 0.8737121522426605, "mean_token_accuracy": 0.9066972732543945, "num_tokens": 1000149338.0, "sample_num_tokens": 9627.0, "step": 3690, "total_num_tokens": 1000187846.0, "z_loss": 0.0005289503023959696 }, { "copy_logits_max": -5.156269073486328, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.0625, "epoch": 0.7538422261935155, "gen_logits_max": 4.994932174682617, "gen_logits_mean": -14.610831260681152, "gen_logits_min": -25.101482391357422, "gen_logits_std": 2.594482898712158, "gen_loss": 0.3186912536621094, "grad_norm": 0.40292940185920617, "learning_rate": 2.6267368421052632e-05, "loss": 0.3103, "mean_copy_accuracy": 0.996324747800827, "mean_gen_accuracy": 0.8593772798776627, "mean_token_accuracy": 0.8969229012727737, "num_tokens": 1000436006.0, "sample_num_tokens": 8512.5, "step": 3691, "total_num_tokens": 1000470056.0, "z_loss": 0.0005839516525156796 }, { "copy_logits_max": -2.292323112487793, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.125, "epoch": 0.7540464641307123, "gen_logits_max": 4.945046424865723, "gen_logits_mean": -13.022574424743652, "gen_logits_min": -24.6080322265625, "gen_logits_std": 2.6804535388946533, "gen_loss": 0.30299487709999084, "grad_norm": 0.4473972573328268, "learning_rate": 2.6266105263157896e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9949583560228348, "mean_gen_accuracy": 0.8678819984197617, "mean_token_accuracy": 0.9011895805597305, "num_tokens": 1000687723.0, "sample_num_tokens": 9119.75, "step": 3692, "total_num_tokens": 1000724202.0, "z_loss": 0.0006972230039536953 }, { "copy_logits_max": -5.223236083984375, "copy_logits_min": -750000128.0, "copy_num_tokens": 481.75, "epoch": 0.7542507020679091, "gen_logits_max": 4.223751068115234, "gen_logits_mean": -14.749456405639648, "gen_logits_min": -26.791305541992188, "gen_logits_std": 2.674407958984375, "gen_loss": 0.28225111961364746, "grad_norm": 0.38636756581609993, "learning_rate": 2.6264842105263157e-05, "loss": 0.3101, "mean_copy_accuracy": 0.9953051060438156, "mean_gen_accuracy": 0.8668596297502518, "mean_token_accuracy": 0.8986319899559021, "num_tokens": 1000978190.0, "sample_num_tokens": 8110.5, "step": 3693, "total_num_tokens": 1001010632.0, "z_loss": 0.0005263545899651945 }, { "copy_logits_max": -3.691603899002075, "copy_logits_min": -687500032.0, "copy_num_tokens": 397.875, "epoch": 0.7544549400051059, "gen_logits_max": 5.48179292678833, "gen_logits_mean": -13.396240234375, "gen_logits_min": -25.117847442626953, "gen_logits_std": 2.677940845489502, "gen_loss": 0.31527742743492126, "grad_norm": 0.5248414802992859, "learning_rate": 2.626357894736842e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9951155632734299, "mean_gen_accuracy": 0.8644576370716095, "mean_token_accuracy": 0.8994081765413284, "num_tokens": 1001250713.0, "sample_num_tokens": 7409.75, "step": 3694, "total_num_tokens": 1001280352.0, "z_loss": 0.0006015108083374798 }, { "copy_logits_max": -0.5757114887237549, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.0, "epoch": 0.7546591779423028, "gen_logits_max": 6.5235795974731445, "gen_logits_mean": -11.446809768676758, "gen_logits_min": -22.94801139831543, "gen_logits_std": 2.688431739807129, "gen_loss": 0.3089612126350403, "grad_norm": 0.38064884195129906, "learning_rate": 2.6262315789473686e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9951186925172806, "mean_gen_accuracy": 0.8679045140743256, "mean_token_accuracy": 0.8988848030567169, "num_tokens": 1001507229.0, "sample_num_tokens": 7754.75, "step": 3695, "total_num_tokens": 1001538248.0, "z_loss": 0.0006943029584363103 }, { "copy_logits_max": 0.5129830241203308, "copy_logits_min": -625000000.0, "copy_num_tokens": 652.9375, "epoch": 0.7548634158794996, "gen_logits_max": 5.470816612243652, "gen_logits_mean": -13.096906661987305, "gen_logits_min": -25.590925216674805, "gen_logits_std": 2.7282493114471436, "gen_loss": 0.29978305101394653, "grad_norm": 0.4392176810112877, "learning_rate": 2.6261052631578947e-05, "loss": 0.313, "mean_copy_accuracy": 0.9942294210195541, "mean_gen_accuracy": 0.862054780125618, "mean_token_accuracy": 0.895089328289032, "num_tokens": 1001772208.0, "sample_num_tokens": 10130.0, "step": 3696, "total_num_tokens": 1001812728.0, "z_loss": 0.0007453170255757868 }, { "copy_logits_max": -1.6118876934051514, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.125, "epoch": 0.7550676538166965, "gen_logits_max": 5.82116174697876, "gen_logits_mean": -12.601799011230469, "gen_logits_min": -24.714244842529297, "gen_logits_std": 2.6990270614624023, "gen_loss": 0.3148925304412842, "grad_norm": 0.40663771922424513, "learning_rate": 2.625978947368421e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9946667850017548, "mean_gen_accuracy": 0.8613286018371582, "mean_token_accuracy": 0.8994998335838318, "num_tokens": 1002057939.0, "sample_num_tokens": 8556.75, "step": 3697, "total_num_tokens": 1002092166.0, "z_loss": 0.0008611571975052357 }, { "copy_logits_max": -4.613587856292725, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.8125, "epoch": 0.7552718917538933, "gen_logits_max": 4.98148250579834, "gen_logits_mean": -14.132255554199219, "gen_logits_min": -25.528301239013672, "gen_logits_std": 2.6456780433654785, "gen_loss": 0.33136627078056335, "grad_norm": 0.41006642612562927, "learning_rate": 2.6258526315789475e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9956642836332321, "mean_gen_accuracy": 0.8635677695274353, "mean_token_accuracy": 0.8973998576402664, "num_tokens": 1002325179.0, "sample_num_tokens": 7737.25, "step": 3698, "total_num_tokens": 1002356128.0, "z_loss": 0.0007104035466909409 }, { "copy_logits_max": -3.4977216720581055, "copy_logits_min": -750000064.0, "copy_num_tokens": 356.1875, "epoch": 0.7554761296910901, "gen_logits_max": 5.411330223083496, "gen_logits_mean": -14.388383865356445, "gen_logits_min": -25.9603271484375, "gen_logits_std": 2.711386203765869, "gen_loss": 0.3318428099155426, "grad_norm": 0.4254099280471971, "learning_rate": 2.625726315789474e-05, "loss": 0.3219, "mean_copy_accuracy": 0.9948441535234451, "mean_gen_accuracy": 0.8613208383321762, "mean_token_accuracy": 0.8929518163204193, "num_tokens": 1002593711.0, "sample_num_tokens": 7613.75, "step": 3699, "total_num_tokens": 1002624166.0, "z_loss": 0.0006781023112125695 }, { "copy_logits_max": -3.1703133583068848, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.75, "epoch": 0.7556803676282869, "gen_logits_max": 5.226122856140137, "gen_logits_mean": -12.81888198852539, "gen_logits_min": -24.463661193847656, "gen_logits_std": 2.6250531673431396, "gen_loss": 0.3166101276874542, "grad_norm": 0.45030859492119585, "learning_rate": 2.6256e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9953653663396835, "mean_gen_accuracy": 0.8629772514104843, "mean_token_accuracy": 0.8986823409795761, "num_tokens": 1002858389.0, "sample_num_tokens": 8704.25, "step": 3700, "total_num_tokens": 1002893206.0, "z_loss": 0.0006936087738722563 }, { "copy_logits_max": -4.264827251434326, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.8125, "epoch": 0.7558846055654838, "gen_logits_max": 4.48448371887207, "gen_logits_mean": -15.099913597106934, "gen_logits_min": -26.63050079345703, "gen_logits_std": 2.6727304458618164, "gen_loss": 0.3083266019821167, "grad_norm": 0.6419852539072713, "learning_rate": 2.6254736842105265e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9947226345539093, "mean_gen_accuracy": 0.8676827251911163, "mean_token_accuracy": 0.8984862715005875, "num_tokens": 1003121848.0, "sample_num_tokens": 8866.0, "step": 3701, "total_num_tokens": 1003157312.0, "z_loss": 0.0006595579907298088 }, { "copy_logits_max": -3.610760450363159, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.9375, "epoch": 0.7560888435026806, "gen_logits_max": 5.222417831420898, "gen_logits_mean": -14.471620559692383, "gen_logits_min": -26.227840423583984, "gen_logits_std": 2.666393280029297, "gen_loss": 0.36169105768203735, "grad_norm": 0.4025361193083142, "learning_rate": 2.6253473684210526e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9948853999376297, "mean_gen_accuracy": 0.8687487542629242, "mean_token_accuracy": 0.8973894715309143, "num_tokens": 1003403971.0, "sample_num_tokens": 7261.25, "step": 3702, "total_num_tokens": 1003433016.0, "z_loss": 0.0008108313195407391 }, { "copy_logits_max": -4.646151065826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.4375, "epoch": 0.7562930814398775, "gen_logits_max": 5.149287223815918, "gen_logits_mean": -13.713235855102539, "gen_logits_min": -24.945831298828125, "gen_logits_std": 2.599555015563965, "gen_loss": 0.3761588931083679, "grad_norm": 0.4336065298510751, "learning_rate": 2.625221052631579e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9953306764364243, "mean_gen_accuracy": 0.8596756458282471, "mean_token_accuracy": 0.8928393721580505, "num_tokens": 1003660441.0, "sample_num_tokens": 7954.75, "step": 3703, "total_num_tokens": 1003692260.0, "z_loss": 0.0007230874616652727 }, { "copy_logits_max": -7.674300670623779, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.0625, "epoch": 0.7564973193770743, "gen_logits_max": 5.556629180908203, "gen_logits_mean": -13.296113967895508, "gen_logits_min": -24.649883270263672, "gen_logits_std": 2.587514638900757, "gen_loss": 0.3145347833633423, "grad_norm": 0.40112471287624973, "learning_rate": 2.625094736842105e-05, "loss": 0.3218, "mean_copy_accuracy": 0.99473936855793, "mean_gen_accuracy": 0.8630228787660599, "mean_token_accuracy": 0.8932156413793564, "num_tokens": 1003920279.0, "sample_num_tokens": 6508.75, "step": 3704, "total_num_tokens": 1003946314.0, "z_loss": 0.0006407040636986494 }, { "copy_logits_max": -3.9176712036132812, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.6875, "epoch": 0.7567015573142711, "gen_logits_max": 6.115788459777832, "gen_logits_mean": -12.718338012695312, "gen_logits_min": -23.87750244140625, "gen_logits_std": 2.60115385055542, "gen_loss": 0.3380258083343506, "grad_norm": 0.3847233582524224, "learning_rate": 2.6249684210526315e-05, "loss": 0.3109, "mean_copy_accuracy": 0.994674414396286, "mean_gen_accuracy": 0.8686050623655319, "mean_token_accuracy": 0.8980284333229065, "num_tokens": 1004195583.0, "sample_num_tokens": 8524.75, "step": 3705, "total_num_tokens": 1004229682.0, "z_loss": 0.0006592790014110506 }, { "copy_logits_max": -3.5144312381744385, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.75, "epoch": 0.756905795251468, "gen_logits_max": 5.5324273109436035, "gen_logits_mean": -13.21893310546875, "gen_logits_min": -24.36489486694336, "gen_logits_std": 2.5650882720947266, "gen_loss": 0.2809292674064636, "grad_norm": 0.4311710089820739, "learning_rate": 2.624842105263158e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9943568259477615, "mean_gen_accuracy": 0.8669203370809555, "mean_token_accuracy": 0.8984286189079285, "num_tokens": 1004469371.0, "sample_num_tokens": 7972.75, "step": 3706, "total_num_tokens": 1004501262.0, "z_loss": 0.0005692130653187633 }, { "copy_logits_max": -4.452785015106201, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.0625, "epoch": 0.7571100331886648, "gen_logits_max": 4.23665714263916, "gen_logits_mean": -15.577779769897461, "gen_logits_min": -26.94502830505371, "gen_logits_std": 2.657902956008911, "gen_loss": 0.3103872537612915, "grad_norm": 0.3968292908011719, "learning_rate": 2.6247157894736844e-05, "loss": 0.31, "mean_copy_accuracy": 0.9952571243047714, "mean_gen_accuracy": 0.8691351562738419, "mean_token_accuracy": 0.8984930217266083, "num_tokens": 1004758277.0, "sample_num_tokens": 8483.75, "step": 3707, "total_num_tokens": 1004792212.0, "z_loss": 0.0006435724790208042 }, { "copy_logits_max": -2.937370777130127, "copy_logits_min": -562500096.0, "copy_num_tokens": 479.6875, "epoch": 0.7573142711258616, "gen_logits_max": 4.541688919067383, "gen_logits_mean": -13.076038360595703, "gen_logits_min": -24.732717514038086, "gen_logits_std": 2.6713671684265137, "gen_loss": 0.3220590054988861, "grad_norm": 0.40571153022063255, "learning_rate": 2.6245894736842105e-05, "loss": 0.3205, "mean_copy_accuracy": 0.996235728263855, "mean_gen_accuracy": 0.8582378625869751, "mean_token_accuracy": 0.8954320549964905, "num_tokens": 1005028669.0, "sample_num_tokens": 7760.75, "step": 3708, "total_num_tokens": 1005059712.0, "z_loss": 0.0006817749235779047 }, { "copy_logits_max": -3.4889636039733887, "copy_logits_min": -687500032.0, "copy_num_tokens": 604.875, "epoch": 0.7575185090630585, "gen_logits_max": 4.505486488342285, "gen_logits_mean": -14.096288681030273, "gen_logits_min": -25.683639526367188, "gen_logits_std": 2.6750521659851074, "gen_loss": 0.26421093940734863, "grad_norm": 0.44038154683497893, "learning_rate": 2.624463157894737e-05, "loss": 0.3006, "mean_copy_accuracy": 0.9953824728727341, "mean_gen_accuracy": 0.8685138523578644, "mean_token_accuracy": 0.9001073241233826, "num_tokens": 1005300539.0, "sample_num_tokens": 9660.25, "step": 3709, "total_num_tokens": 1005339180.0, "z_loss": 0.0005707938107661903 }, { "copy_logits_max": -6.623673439025879, "copy_logits_min": -750000000.0, "copy_num_tokens": 253.3125, "epoch": 0.7577227470002553, "gen_logits_max": 5.758749008178711, "gen_logits_mean": -14.237349510192871, "gen_logits_min": -25.169126510620117, "gen_logits_std": 2.5731005668640137, "gen_loss": 0.32639065384864807, "grad_norm": 0.3858565194796048, "learning_rate": 2.6243368421052634e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9960825741291046, "mean_gen_accuracy": 0.8667486310005188, "mean_token_accuracy": 0.8963466733694077, "num_tokens": 1005563158.0, "sample_num_tokens": 6917.0, "step": 3710, "total_num_tokens": 1005590826.0, "z_loss": 0.0006491367239505053 }, { "copy_logits_max": -6.344420433044434, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.4375, "epoch": 0.7579269849374521, "gen_logits_max": 5.40806770324707, "gen_logits_mean": -12.73043441772461, "gen_logits_min": -24.41744613647461, "gen_logits_std": 2.6364896297454834, "gen_loss": 0.3611346483230591, "grad_norm": 0.4161005177997084, "learning_rate": 2.6242105263157895e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9955001920461655, "mean_gen_accuracy": 0.8636093139648438, "mean_token_accuracy": 0.8997106701135635, "num_tokens": 1005827878.0, "sample_num_tokens": 7412.5, "step": 3711, "total_num_tokens": 1005857528.0, "z_loss": 0.0006742017576470971 }, { "copy_logits_max": -6.664952754974365, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.875, "epoch": 0.758131222874649, "gen_logits_max": 5.724453926086426, "gen_logits_mean": -13.26004695892334, "gen_logits_min": -24.287574768066406, "gen_logits_std": 2.5825278759002686, "gen_loss": 0.31155329942703247, "grad_norm": 0.4463618992617261, "learning_rate": 2.624084210526316e-05, "loss": 0.3188, "mean_copy_accuracy": 0.9931811988353729, "mean_gen_accuracy": 0.8655926585197449, "mean_token_accuracy": 0.8933213651180267, "num_tokens": 1006076417.0, "sample_num_tokens": 7915.75, "step": 3712, "total_num_tokens": 1006108080.0, "z_loss": 0.000624581181909889 }, { "copy_logits_max": -5.030503749847412, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.4375, "epoch": 0.7583354608118458, "gen_logits_max": 4.4043707847595215, "gen_logits_mean": -14.22243595123291, "gen_logits_min": -25.734811782836914, "gen_logits_std": 2.6420767307281494, "gen_loss": 0.29904332756996155, "grad_norm": 0.4265147361721421, "learning_rate": 2.623957894736842e-05, "loss": 0.297, "mean_copy_accuracy": 0.9961730688810349, "mean_gen_accuracy": 0.8612092286348343, "mean_token_accuracy": 0.9041229337453842, "num_tokens": 1006372819.0, "sample_num_tokens": 8826.75, "step": 3713, "total_num_tokens": 1006408126.0, "z_loss": 0.0006223646341823041 }, { "copy_logits_max": -6.057040691375732, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.25, "epoch": 0.7585396987490426, "gen_logits_max": 5.710301399230957, "gen_logits_mean": -12.394787788391113, "gen_logits_min": -23.461477279663086, "gen_logits_std": 2.579705238342285, "gen_loss": 0.3442533016204834, "grad_norm": 0.46629065985166207, "learning_rate": 2.6238315789473687e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9942284971475601, "mean_gen_accuracy": 0.8541771173477173, "mean_token_accuracy": 0.8894385546445847, "num_tokens": 1006662128.0, "sample_num_tokens": 8884.0, "step": 3714, "total_num_tokens": 1006697664.0, "z_loss": 0.0007102666422724724 }, { "copy_logits_max": -2.6301746368408203, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.75, "epoch": 0.7587439366862395, "gen_logits_max": 3.7976067066192627, "gen_logits_mean": -14.394248962402344, "gen_logits_min": -25.972393035888672, "gen_logits_std": 2.627852439880371, "gen_loss": 0.2635698914527893, "grad_norm": 0.43646262204342384, "learning_rate": 2.623705263157895e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9959659576416016, "mean_gen_accuracy": 0.862522765994072, "mean_token_accuracy": 0.9016029387712479, "num_tokens": 1006936144.0, "sample_num_tokens": 8714.5, "step": 3715, "total_num_tokens": 1006971002.0, "z_loss": 0.0006441725417971611 }, { "copy_logits_max": -5.906288146972656, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.0, "epoch": 0.7589481746234363, "gen_logits_max": 4.230929374694824, "gen_logits_mean": -14.111130714416504, "gen_logits_min": -25.100772857666016, "gen_logits_std": 2.603821277618408, "gen_loss": 0.25801655650138855, "grad_norm": 0.3856865882629296, "learning_rate": 2.6235789473684213e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9950399249792099, "mean_gen_accuracy": 0.8674106448888779, "mean_token_accuracy": 0.8962151110172272, "num_tokens": 1007202168.0, "sample_num_tokens": 8918.0, "step": 3716, "total_num_tokens": 1007237840.0, "z_loss": 0.0005554466042667627 }, { "copy_logits_max": -3.1639909744262695, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.125, "epoch": 0.7591524125606331, "gen_logits_max": 4.50659704208374, "gen_logits_mean": -14.295612335205078, "gen_logits_min": -26.15547752380371, "gen_logits_std": 2.628366470336914, "gen_loss": 0.29796266555786133, "grad_norm": 0.5074303418428392, "learning_rate": 2.6234526315789474e-05, "loss": 0.3149, "mean_copy_accuracy": 0.993236631155014, "mean_gen_accuracy": 0.8651264160871506, "mean_token_accuracy": 0.8949266374111176, "num_tokens": 1007467786.0, "sample_num_tokens": 9712.0, "step": 3717, "total_num_tokens": 1007506634.0, "z_loss": 0.0006006523035466671 }, { "copy_logits_max": -3.981186628341675, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.4375, "epoch": 0.75935665049783, "gen_logits_max": 4.8784637451171875, "gen_logits_mean": -14.068605422973633, "gen_logits_min": -26.130863189697266, "gen_logits_std": 2.5991714000701904, "gen_loss": 0.33001184463500977, "grad_norm": 0.41863868171620416, "learning_rate": 2.6233263157894738e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9957737475633621, "mean_gen_accuracy": 0.860280454158783, "mean_token_accuracy": 0.8912164717912674, "num_tokens": 1007739890.0, "sample_num_tokens": 7452.5, "step": 3718, "total_num_tokens": 1007769700.0, "z_loss": 0.000670659530442208 }, { "copy_logits_max": -5.409815311431885, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.6875, "epoch": 0.7595608884350268, "gen_logits_max": 4.5720062255859375, "gen_logits_mean": -14.613576889038086, "gen_logits_min": -25.858579635620117, "gen_logits_std": 2.6295127868652344, "gen_loss": 0.328341007232666, "grad_norm": 0.4434837704811757, "learning_rate": 2.6232e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9947786033153534, "mean_gen_accuracy": 0.8646118938922882, "mean_token_accuracy": 0.8965364694595337, "num_tokens": 1008026840.0, "sample_num_tokens": 8975.5, "step": 3719, "total_num_tokens": 1008062742.0, "z_loss": 0.0006822238210588694 }, { "copy_logits_max": -3.436613082885742, "copy_logits_min": -687500032.0, "copy_num_tokens": 689.1875, "epoch": 0.7597651263722236, "gen_logits_max": 5.588894844055176, "gen_logits_mean": -12.043402671813965, "gen_logits_min": -23.167617797851562, "gen_logits_std": 2.5472970008850098, "gen_loss": 0.31717872619628906, "grad_norm": 0.393311259185307, "learning_rate": 2.6230736842105263e-05, "loss": 0.3275, "mean_copy_accuracy": 0.9951552003622055, "mean_gen_accuracy": 0.8575735837221146, "mean_token_accuracy": 0.8925119787454605, "num_tokens": 1008304952.0, "sample_num_tokens": 10017.5, "step": 3720, "total_num_tokens": 1008345022.0, "z_loss": 0.000712356879375875 }, { "copy_logits_max": -3.193251371383667, "copy_logits_min": -750000064.0, "copy_num_tokens": 336.0625, "epoch": 0.7599693643094205, "gen_logits_max": 6.376314640045166, "gen_logits_mean": -12.679328918457031, "gen_logits_min": -23.907142639160156, "gen_logits_std": 2.529341220855713, "gen_loss": 0.3700457811355591, "grad_norm": 0.47390325563160746, "learning_rate": 2.6229473684210524e-05, "loss": 0.3482, "mean_copy_accuracy": 0.9941090494394302, "mean_gen_accuracy": 0.8563769161701202, "mean_token_accuracy": 0.886313870549202, "num_tokens": 1008572853.0, "sample_num_tokens": 7604.25, "step": 3721, "total_num_tokens": 1008603270.0, "z_loss": 0.0007676002569496632 }, { "copy_logits_max": -3.636793851852417, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.9375, "epoch": 0.7601736022466173, "gen_logits_max": 5.937176704406738, "gen_logits_mean": -12.832609176635742, "gen_logits_min": -23.910545349121094, "gen_logits_std": 2.6148805618286133, "gen_loss": 0.32382726669311523, "grad_norm": 0.5552605010624719, "learning_rate": 2.622821052631579e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9944128543138504, "mean_gen_accuracy": 0.8706973046064377, "mean_token_accuracy": 0.9016900956630707, "num_tokens": 1008829779.0, "sample_num_tokens": 8335.75, "step": 3722, "total_num_tokens": 1008863122.0, "z_loss": 0.000685748877003789 }, { "copy_logits_max": -3.4043405055999756, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.125, "epoch": 0.7603778401838142, "gen_logits_max": 5.257171630859375, "gen_logits_mean": -14.392654418945312, "gen_logits_min": -25.748682022094727, "gen_logits_std": 2.6624913215637207, "gen_loss": 0.38026827573776245, "grad_norm": 0.5155062134866328, "learning_rate": 2.6226947368421056e-05, "loss": 0.3451, "mean_copy_accuracy": 0.994367703795433, "mean_gen_accuracy": 0.85569167137146, "mean_token_accuracy": 0.8851304203271866, "num_tokens": 1009099081.0, "sample_num_tokens": 8735.25, "step": 3723, "total_num_tokens": 1009134022.0, "z_loss": 0.0008077441016212106 }, { "copy_logits_max": 0.9557422995567322, "copy_logits_min": -687500032.0, "copy_num_tokens": 572.25, "epoch": 0.760582078121011, "gen_logits_max": 3.978466510772705, "gen_logits_mean": -14.559038162231445, "gen_logits_min": -26.20905303955078, "gen_logits_std": 2.730177879333496, "gen_loss": 0.2929638624191284, "grad_norm": 0.41272421642853885, "learning_rate": 2.6225684210526317e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9956231713294983, "mean_gen_accuracy": 0.8646386116743088, "mean_token_accuracy": 0.8934907764196396, "num_tokens": 1009362427.0, "sample_num_tokens": 8870.25, "step": 3724, "total_num_tokens": 1009397908.0, "z_loss": 0.0009072826942428946 }, { "copy_logits_max": -1.2939016819000244, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.875, "epoch": 0.7607863160582078, "gen_logits_max": 5.507290840148926, "gen_logits_mean": -13.790145874023438, "gen_logits_min": -24.964393615722656, "gen_logits_std": 2.6312899589538574, "gen_loss": 0.31870830059051514, "grad_norm": 0.42885372134013283, "learning_rate": 2.622442105263158e-05, "loss": 0.305, "mean_copy_accuracy": 0.9946126788854599, "mean_gen_accuracy": 0.8702115267515182, "mean_token_accuracy": 0.8996340185403824, "num_tokens": 1009650938.0, "sample_num_tokens": 8821.5, "step": 3725, "total_num_tokens": 1009686224.0, "z_loss": 0.0009428859921172261 }, { "copy_logits_max": 0.12715435028076172, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.4375, "epoch": 0.7609905539954046, "gen_logits_max": 5.003981590270996, "gen_logits_mean": -13.625638961791992, "gen_logits_min": -25.404863357543945, "gen_logits_std": 2.6382861137390137, "gen_loss": 0.32030755281448364, "grad_norm": 0.5475223247467852, "learning_rate": 2.6223157894736842e-05, "loss": 0.3114, "mean_copy_accuracy": 0.993153527379036, "mean_gen_accuracy": 0.8638411611318588, "mean_token_accuracy": 0.8970078676939011, "num_tokens": 1009927791.0, "sample_num_tokens": 7319.25, "step": 3726, "total_num_tokens": 1009957068.0, "z_loss": 0.0009710850426927209 }, { "copy_logits_max": -1.482419490814209, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.0625, "epoch": 0.7611947919326015, "gen_logits_max": 5.005542755126953, "gen_logits_mean": -13.529807090759277, "gen_logits_min": -25.22781753540039, "gen_logits_std": 2.6714906692504883, "gen_loss": 0.2854863405227661, "grad_norm": 0.4489047254430094, "learning_rate": 2.6221894736842107e-05, "loss": 0.287, "mean_copy_accuracy": 0.9943295121192932, "mean_gen_accuracy": 0.8750575929880142, "mean_token_accuracy": 0.9048458188772202, "num_tokens": 1010197599.0, "sample_num_tokens": 10040.75, "step": 3727, "total_num_tokens": 1010237762.0, "z_loss": 0.000865500420331955 }, { "copy_logits_max": -1.9962916374206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.6875, "epoch": 0.7613990298697983, "gen_logits_max": 5.706228733062744, "gen_logits_mean": -12.585044860839844, "gen_logits_min": -24.316051483154297, "gen_logits_std": 2.6746397018432617, "gen_loss": 0.29435986280441284, "grad_norm": 0.40538923085798073, "learning_rate": 2.6220631578947368e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9953642934560776, "mean_gen_accuracy": 0.8644457459449768, "mean_token_accuracy": 0.899928092956543, "num_tokens": 1010490493.0, "sample_num_tokens": 8721.25, "step": 3728, "total_num_tokens": 1010525378.0, "z_loss": 0.0007260990678332746 }, { "copy_logits_max": -0.7078591585159302, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5, "epoch": 0.7616032678069952, "gen_logits_max": 5.640920639038086, "gen_logits_mean": -12.991300582885742, "gen_logits_min": -24.04608154296875, "gen_logits_std": 2.6029489040374756, "gen_loss": 0.3326619565486908, "grad_norm": 0.4435053258453492, "learning_rate": 2.6219368421052632e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9931895434856415, "mean_gen_accuracy": 0.8760831356048584, "mean_token_accuracy": 0.9050111919641495, "num_tokens": 1010770598.0, "sample_num_tokens": 8642.0, "step": 3729, "total_num_tokens": 1010805166.0, "z_loss": 0.0006681117229163647 }, { "copy_logits_max": -1.5335149765014648, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.75, "epoch": 0.761807505744192, "gen_logits_max": 5.066689491271973, "gen_logits_mean": -13.414384841918945, "gen_logits_min": -24.842809677124023, "gen_logits_std": 2.6536247730255127, "gen_loss": 0.33904698491096497, "grad_norm": 0.433001369697124, "learning_rate": 2.6218105263157893e-05, "loss": 0.332, "mean_copy_accuracy": 0.9932424426078796, "mean_gen_accuracy": 0.8618936687707901, "mean_token_accuracy": 0.8897751271724701, "num_tokens": 1011042994.0, "sample_num_tokens": 8280.5, "step": 3730, "total_num_tokens": 1011076116.0, "z_loss": 0.0006836181273683906 }, { "copy_logits_max": -2.5662758350372314, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.0, "epoch": 0.7620117436813888, "gen_logits_max": 4.370943546295166, "gen_logits_mean": -14.493843078613281, "gen_logits_min": -26.079364776611328, "gen_logits_std": 2.6646852493286133, "gen_loss": 0.31997257471084595, "grad_norm": 0.39574488290784426, "learning_rate": 2.621684210526316e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9959870874881744, "mean_gen_accuracy": 0.8636007905006409, "mean_token_accuracy": 0.899571567773819, "num_tokens": 1011341937.0, "sample_num_tokens": 8521.25, "step": 3731, "total_num_tokens": 1011376022.0, "z_loss": 0.0006394506199285388 }, { "copy_logits_max": -2.323014974594116, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.5, "epoch": 0.7622159816185856, "gen_logits_max": 5.070229530334473, "gen_logits_mean": -12.622635841369629, "gen_logits_min": -23.96385955810547, "gen_logits_std": 2.672899007797241, "gen_loss": 0.2895183265209198, "grad_norm": 0.40354853705327015, "learning_rate": 2.621557894736842e-05, "loss": 0.3075, "mean_copy_accuracy": 0.995135948061943, "mean_gen_accuracy": 0.8695753663778305, "mean_token_accuracy": 0.897163137793541, "num_tokens": 1011607460.0, "sample_num_tokens": 9590.5, "step": 3732, "total_num_tokens": 1011645822.0, "z_loss": 0.0005718350294046104 }, { "copy_logits_max": -2.3978114128112793, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.125, "epoch": 0.7624202195557825, "gen_logits_max": 6.172264099121094, "gen_logits_mean": -11.75339126586914, "gen_logits_min": -23.65850257873535, "gen_logits_std": 2.658531904220581, "gen_loss": 0.3686983287334442, "grad_norm": 0.43425540131855545, "learning_rate": 2.6214315789473686e-05, "loss": 0.3273, "mean_copy_accuracy": 0.9959001392126083, "mean_gen_accuracy": 0.8536784499883652, "mean_token_accuracy": 0.8940642923116684, "num_tokens": 1011907092.0, "sample_num_tokens": 9272.0, "step": 3733, "total_num_tokens": 1011944180.0, "z_loss": 0.0007561745587736368 }, { "copy_logits_max": -2.6761856079101562, "copy_logits_min": -625000000.0, "copy_num_tokens": 366.8125, "epoch": 0.7626244574929794, "gen_logits_max": 5.706080436706543, "gen_logits_mean": -13.898366928100586, "gen_logits_min": -25.562885284423828, "gen_logits_std": 2.712136745452881, "gen_loss": 0.3118321895599365, "grad_norm": 0.45018770104254024, "learning_rate": 2.6213052631578947e-05, "loss": 0.3253, "mean_copy_accuracy": 0.9948842972517014, "mean_gen_accuracy": 0.8667621463537216, "mean_token_accuracy": 0.8917213976383209, "num_tokens": 1012141846.0, "sample_num_tokens": 7757.0, "step": 3734, "total_num_tokens": 1012172874.0, "z_loss": 0.0006523850024677813 }, { "copy_logits_max": -3.8703017234802246, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.0625, "epoch": 0.7628286954301762, "gen_logits_max": 4.915409088134766, "gen_logits_mean": -14.179008483886719, "gen_logits_min": -26.094478607177734, "gen_logits_std": 2.7059383392333984, "gen_loss": 0.3298913240432739, "grad_norm": 0.4378041845423854, "learning_rate": 2.621178947368421e-05, "loss": 0.3248, "mean_copy_accuracy": 0.9948990941047668, "mean_gen_accuracy": 0.859445333480835, "mean_token_accuracy": 0.8939987272024155, "num_tokens": 1012414508.0, "sample_num_tokens": 9285.5, "step": 3735, "total_num_tokens": 1012451650.0, "z_loss": 0.000697942276019603 }, { "copy_logits_max": -1.042022705078125, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.875, "epoch": 0.763032933367373, "gen_logits_max": 4.7598161697387695, "gen_logits_mean": -13.817522048950195, "gen_logits_min": -25.805767059326172, "gen_logits_std": 2.7474684715270996, "gen_loss": 0.3239842355251312, "grad_norm": 0.4597186311998683, "learning_rate": 2.6210526315789475e-05, "loss": 0.3473, "mean_copy_accuracy": 0.9954363703727722, "mean_gen_accuracy": 0.8542570918798447, "mean_token_accuracy": 0.8859633505344391, "num_tokens": 1012675487.0, "sample_num_tokens": 8167.75, "step": 3736, "total_num_tokens": 1012708158.0, "z_loss": 0.0006864099414087832 }, { "copy_logits_max": -3.871872901916504, "copy_logits_min": -687500032.0, "copy_num_tokens": 337.9375, "epoch": 0.7632371713045698, "gen_logits_max": 4.597417831420898, "gen_logits_mean": -15.397758483886719, "gen_logits_min": -27.000778198242188, "gen_logits_std": 2.7342402935028076, "gen_loss": 0.30256786942481995, "grad_norm": 0.5079179746620129, "learning_rate": 2.6209263157894736e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9936688542366028, "mean_gen_accuracy": 0.8668032288551331, "mean_token_accuracy": 0.8943367302417755, "num_tokens": 1012925019.0, "sample_num_tokens": 7317.75, "step": 3737, "total_num_tokens": 1012954290.0, "z_loss": 0.0006113243871368468 }, { "copy_logits_max": -4.674139022827148, "copy_logits_min": -687500032.0, "copy_num_tokens": 380.0625, "epoch": 0.7634414092417666, "gen_logits_max": 5.331602096557617, "gen_logits_mean": -14.496487617492676, "gen_logits_min": -26.020687103271484, "gen_logits_std": 2.691671848297119, "gen_loss": 0.31927162408828735, "grad_norm": 0.37872081979652183, "learning_rate": 2.6208e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9951236844062805, "mean_gen_accuracy": 0.8812005966901779, "mean_token_accuracy": 0.9069334417581558, "num_tokens": 1013185344.0, "sample_num_tokens": 8857.0, "step": 3738, "total_num_tokens": 1013220772.0, "z_loss": 0.0006828808691352606 }, { "copy_logits_max": -1.6299469470977783, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.5625, "epoch": 0.7636456471789634, "gen_logits_max": 5.090930461883545, "gen_logits_mean": -14.689722061157227, "gen_logits_min": -26.399972915649414, "gen_logits_std": 2.732761859893799, "gen_loss": 0.2985597252845764, "grad_norm": 0.47628640590970717, "learning_rate": 2.6206736842105265e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9942955374717712, "mean_gen_accuracy": 0.8717498481273651, "mean_token_accuracy": 0.896260678768158, "num_tokens": 1013440256.0, "sample_num_tokens": 7061.0, "step": 3739, "total_num_tokens": 1013468500.0, "z_loss": 0.0006888110656291246 }, { "copy_logits_max": -0.8959765434265137, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.8125, "epoch": 0.7638498851161604, "gen_logits_max": 5.025460720062256, "gen_logits_mean": -13.018166542053223, "gen_logits_min": -24.944637298583984, "gen_logits_std": 2.737581253051758, "gen_loss": 0.28313812613487244, "grad_norm": 0.43987059966999864, "learning_rate": 2.620547368421053e-05, "loss": 0.327, "mean_copy_accuracy": 0.9945341348648071, "mean_gen_accuracy": 0.8594806492328644, "mean_token_accuracy": 0.8921737968921661, "num_tokens": 1013699935.0, "sample_num_tokens": 8139.75, "step": 3740, "total_num_tokens": 1013732494.0, "z_loss": 0.0006638418417423964 }, { "copy_logits_max": -2.815882444381714, "copy_logits_min": -687500032.0, "copy_num_tokens": 422.75, "epoch": 0.7640541230533572, "gen_logits_max": 6.103676795959473, "gen_logits_mean": -12.711158752441406, "gen_logits_min": -24.875625610351562, "gen_logits_std": 2.7433242797851562, "gen_loss": 0.32504385709762573, "grad_norm": 0.45947841206518575, "learning_rate": 2.620421052631579e-05, "loss": 0.3185, "mean_copy_accuracy": 0.993118092417717, "mean_gen_accuracy": 0.8627828359603882, "mean_token_accuracy": 0.8939661532640457, "num_tokens": 1013958042.0, "sample_num_tokens": 7313.5, "step": 3741, "total_num_tokens": 1013987296.0, "z_loss": 0.0007236807141453028 }, { "copy_logits_max": -2.2216901779174805, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.875, "epoch": 0.764258360990554, "gen_logits_max": 6.376006603240967, "gen_logits_mean": -12.85105037689209, "gen_logits_min": -24.819555282592773, "gen_logits_std": 2.756755828857422, "gen_loss": 0.3077540695667267, "grad_norm": 0.3989945085334259, "learning_rate": 2.6202947368421054e-05, "loss": 0.287, "mean_copy_accuracy": 0.9956464916467667, "mean_gen_accuracy": 0.8761028349399567, "mean_token_accuracy": 0.9058631807565689, "num_tokens": 1014212686.0, "sample_num_tokens": 8143.0, "step": 3742, "total_num_tokens": 1014245258.0, "z_loss": 0.0007156039355322719 }, { "copy_logits_max": -4.286089897155762, "copy_logits_min": -687500032.0, "copy_num_tokens": 432.1875, "epoch": 0.7644625989277508, "gen_logits_max": 6.001955986022949, "gen_logits_mean": -12.883363723754883, "gen_logits_min": -25.199813842773438, "gen_logits_std": 2.7203872203826904, "gen_loss": 0.3257068395614624, "grad_norm": 0.4166864199657346, "learning_rate": 2.6201684210526315e-05, "loss": 0.3108, "mean_copy_accuracy": 0.9947643131017685, "mean_gen_accuracy": 0.8660613596439362, "mean_token_accuracy": 0.8980714678764343, "num_tokens": 1014493003.0, "sample_num_tokens": 7811.75, "step": 3743, "total_num_tokens": 1014524250.0, "z_loss": 0.0007494005258195102 }, { "copy_logits_max": -2.476285219192505, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.6875, "epoch": 0.7646668368649476, "gen_logits_max": 5.266067028045654, "gen_logits_mean": -13.09965705871582, "gen_logits_min": -25.040058135986328, "gen_logits_std": 2.7002902030944824, "gen_loss": 0.3102196156978607, "grad_norm": 0.4166453840837527, "learning_rate": 2.620042105263158e-05, "loss": 0.3063, "mean_copy_accuracy": 0.994181215763092, "mean_gen_accuracy": 0.8706507980823517, "mean_token_accuracy": 0.8970261961221695, "num_tokens": 1014746227.0, "sample_num_tokens": 7992.25, "step": 3744, "total_num_tokens": 1014778196.0, "z_loss": 0.0007794824196025729 }, { "copy_logits_max": -4.67746114730835, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.3125, "epoch": 0.7648710748021444, "gen_logits_max": 5.801100730895996, "gen_logits_mean": -11.812545776367188, "gen_logits_min": -23.91072654724121, "gen_logits_std": 2.6836647987365723, "gen_loss": 0.24622172117233276, "grad_norm": 0.4615478114186386, "learning_rate": 2.619915789473684e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9944016188383102, "mean_gen_accuracy": 0.8629037737846375, "mean_token_accuracy": 0.9004344344139099, "num_tokens": 1015016641.0, "sample_num_tokens": 8377.75, "step": 3745, "total_num_tokens": 1015050152.0, "z_loss": 0.0006249917205423117 }, { "copy_logits_max": -3.8606808185577393, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.8125, "epoch": 0.7650753127393414, "gen_logits_max": 6.688027381896973, "gen_logits_mean": -12.547722816467285, "gen_logits_min": -24.206031799316406, "gen_logits_std": 2.5936427116394043, "gen_loss": 0.31088411808013916, "grad_norm": 0.39264147405214067, "learning_rate": 2.6197894736842105e-05, "loss": 0.3229, "mean_copy_accuracy": 0.9949134588241577, "mean_gen_accuracy": 0.8650824725627899, "mean_token_accuracy": 0.8941230773925781, "num_tokens": 1015294732.0, "sample_num_tokens": 8598.5, "step": 3746, "total_num_tokens": 1015329126.0, "z_loss": 0.0007398193120025098 }, { "copy_logits_max": -4.423829078674316, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.8125, "epoch": 0.7652795506765382, "gen_logits_max": 5.258439064025879, "gen_logits_mean": -14.130456924438477, "gen_logits_min": -25.687442779541016, "gen_logits_std": 2.6676340103149414, "gen_loss": 0.27992838621139526, "grad_norm": 0.39917490675468315, "learning_rate": 2.619663157894737e-05, "loss": 0.3265, "mean_copy_accuracy": 0.993196427822113, "mean_gen_accuracy": 0.864482194185257, "mean_token_accuracy": 0.8913469761610031, "num_tokens": 1015557528.0, "sample_num_tokens": 8288.0, "step": 3747, "total_num_tokens": 1015590680.0, "z_loss": 0.0006867138436064124 }, { "copy_logits_max": -4.012407302856445, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.6875, "epoch": 0.765483788613735, "gen_logits_max": 4.530461311340332, "gen_logits_mean": -15.260007858276367, "gen_logits_min": -27.12112808227539, "gen_logits_std": 2.7283387184143066, "gen_loss": 0.28472334146499634, "grad_norm": 0.38984631011297427, "learning_rate": 2.6195368421052633e-05, "loss": 0.2995, "mean_copy_accuracy": 0.9941572695970535, "mean_gen_accuracy": 0.870673730969429, "mean_token_accuracy": 0.8992619514465332, "num_tokens": 1015803807.0, "sample_num_tokens": 8044.25, "step": 3748, "total_num_tokens": 1015835984.0, "z_loss": 0.0006991823320277035 }, { "copy_logits_max": -3.1782615184783936, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.125, "epoch": 0.7656880265509318, "gen_logits_max": 4.625703811645508, "gen_logits_mean": -14.364031791687012, "gen_logits_min": -26.416540145874023, "gen_logits_std": 2.7137598991394043, "gen_loss": 0.2773972749710083, "grad_norm": 0.4006629034822754, "learning_rate": 2.6194105263157898e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9952901154756546, "mean_gen_accuracy": 0.8718093037605286, "mean_token_accuracy": 0.905060738325119, "num_tokens": 1016086057.0, "sample_num_tokens": 7980.25, "step": 3749, "total_num_tokens": 1016117978.0, "z_loss": 0.0007581167737953365 }, { "copy_logits_max": -3.8031206130981445, "copy_logits_min": -625000064.0, "copy_num_tokens": 399.3125, "epoch": 0.7658922644881286, "gen_logits_max": 6.916891574859619, "gen_logits_mean": -11.728476524353027, "gen_logits_min": -23.231353759765625, "gen_logits_std": 2.6423959732055664, "gen_loss": 0.36778098344802856, "grad_norm": 0.4038697819271688, "learning_rate": 2.619284210526316e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9948583990335464, "mean_gen_accuracy": 0.858087033033371, "mean_token_accuracy": 0.8939530104398727, "num_tokens": 1016364075.0, "sample_num_tokens": 9067.75, "step": 3750, "total_num_tokens": 1016400346.0, "z_loss": 0.0008370804134756327 }, { "copy_logits_max": -3.2891597747802734, "copy_logits_min": -687500096.0, "copy_num_tokens": 432.625, "epoch": 0.7660965024253255, "gen_logits_max": 5.047746658325195, "gen_logits_mean": -13.479024887084961, "gen_logits_min": -25.218463897705078, "gen_logits_std": 2.6669163703918457, "gen_loss": 0.3032841682434082, "grad_norm": 0.40245326123114616, "learning_rate": 2.6191578947368423e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9948728233575821, "mean_gen_accuracy": 0.8651756644248962, "mean_token_accuracy": 0.8977499604225159, "num_tokens": 1016618580.0, "sample_num_tokens": 7810.5, "step": 3751, "total_num_tokens": 1016649822.0, "z_loss": 0.0006870774086564779 }, { "copy_logits_max": -4.191687107086182, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.5, "epoch": 0.7663007403625224, "gen_logits_max": 4.580184459686279, "gen_logits_mean": -15.543691635131836, "gen_logits_min": -27.023365020751953, "gen_logits_std": 2.6917052268981934, "gen_loss": 0.28790679574012756, "grad_norm": 0.42706130988485175, "learning_rate": 2.6190315789473684e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9936901777982712, "mean_gen_accuracy": 0.8687306791543961, "mean_token_accuracy": 0.8972917646169662, "num_tokens": 1016883480.0, "sample_num_tokens": 8674.0, "step": 3752, "total_num_tokens": 1016918176.0, "z_loss": 0.0006219822680577636 }, { "copy_logits_max": -5.153309345245361, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.0625, "epoch": 0.7665049782997192, "gen_logits_max": 4.611731052398682, "gen_logits_mean": -15.087939262390137, "gen_logits_min": -26.491531372070312, "gen_logits_std": 2.655064821243286, "gen_loss": 0.25439637899398804, "grad_norm": 0.6281346814850798, "learning_rate": 2.6189052631578948e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9942434430122375, "mean_gen_accuracy": 0.8702205717563629, "mean_token_accuracy": 0.8981179147958755, "num_tokens": 1017145952.0, "sample_num_tokens": 9167.0, "step": 3753, "total_num_tokens": 1017182620.0, "z_loss": 0.0005740912165492773 }, { "copy_logits_max": -2.6255991458892822, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.4375, "epoch": 0.766709216236916, "gen_logits_max": 5.382518768310547, "gen_logits_mean": -12.85174560546875, "gen_logits_min": -23.934253692626953, "gen_logits_std": 2.5957350730895996, "gen_loss": 0.34468328952789307, "grad_norm": 0.4271186728727134, "learning_rate": 2.618778947368421e-05, "loss": 0.3267, "mean_copy_accuracy": 0.9955917447805405, "mean_gen_accuracy": 0.8620662540197372, "mean_token_accuracy": 0.8907973766326904, "num_tokens": 1017402788.0, "sample_num_tokens": 7255.5, "step": 3754, "total_num_tokens": 1017431810.0, "z_loss": 0.0007369730155915022 }, { "copy_logits_max": -3.8283610343933105, "copy_logits_min": -687500032.0, "copy_num_tokens": 714.6875, "epoch": 0.7669134541741128, "gen_logits_max": 4.2178802490234375, "gen_logits_mean": -14.58757495880127, "gen_logits_min": -25.948272705078125, "gen_logits_std": 2.6670517921447754, "gen_loss": 0.2795008718967438, "grad_norm": 0.4479855541578259, "learning_rate": 2.6186526315789477e-05, "loss": 0.3343, "mean_copy_accuracy": 0.9951581507921219, "mean_gen_accuracy": 0.8541241437196732, "mean_token_accuracy": 0.8903812170028687, "num_tokens": 1017665722.0, "sample_num_tokens": 10322.5, "step": 3755, "total_num_tokens": 1017707012.0, "z_loss": 0.0005812409799546003 }, { "copy_logits_max": -7.343862533569336, "copy_logits_min": -750000064.0, "copy_num_tokens": 333.25, "epoch": 0.7671176921113096, "gen_logits_max": 5.484740734100342, "gen_logits_mean": -13.984885215759277, "gen_logits_min": -24.942060470581055, "gen_logits_std": 2.5730578899383545, "gen_loss": 0.3421369791030884, "grad_norm": 0.41307690814304127, "learning_rate": 2.6185263157894738e-05, "loss": 0.3283, "mean_copy_accuracy": 0.994113951921463, "mean_gen_accuracy": 0.8572761714458466, "mean_token_accuracy": 0.8896604180335999, "num_tokens": 1017941319.0, "sample_num_tokens": 7503.25, "step": 3756, "total_num_tokens": 1017971332.0, "z_loss": 0.0006618098705075681 }, { "copy_logits_max": -4.919857025146484, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.125, "epoch": 0.7673219300485065, "gen_logits_max": 4.864663124084473, "gen_logits_mean": -14.524295806884766, "gen_logits_min": -25.774452209472656, "gen_logits_std": 2.6449408531188965, "gen_loss": 0.31745636463165283, "grad_norm": 0.65116743382605, "learning_rate": 2.6184000000000002e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9951144754886627, "mean_gen_accuracy": 0.8594697266817093, "mean_token_accuracy": 0.8926863819360733, "num_tokens": 1018224835.0, "sample_num_tokens": 7781.25, "step": 3757, "total_num_tokens": 1018255960.0, "z_loss": 0.0006219877395778894 }, { "copy_logits_max": -4.7354607582092285, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.75, "epoch": 0.7675261679857034, "gen_logits_max": 5.054227352142334, "gen_logits_mean": -14.626184463500977, "gen_logits_min": -25.577430725097656, "gen_logits_std": 2.6207034587860107, "gen_loss": 0.2926522195339203, "grad_norm": 0.4056847357568297, "learning_rate": 2.6182736842105263e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9933865666389465, "mean_gen_accuracy": 0.8709593117237091, "mean_token_accuracy": 0.9011122733354568, "num_tokens": 1018485663.0, "sample_num_tokens": 8245.25, "step": 3758, "total_num_tokens": 1018518644.0, "z_loss": 0.0005944555741734803 }, { "copy_logits_max": -1.7714825868606567, "copy_logits_min": -750000064.0, "copy_num_tokens": 591.3125, "epoch": 0.7677304059229002, "gen_logits_max": 5.416896820068359, "gen_logits_mean": -12.482999801635742, "gen_logits_min": -24.137598037719727, "gen_logits_std": 2.716719627380371, "gen_loss": 0.3273637294769287, "grad_norm": 0.43382478591678564, "learning_rate": 2.6181473684210527e-05, "loss": 0.3175, "mean_copy_accuracy": 0.994685024023056, "mean_gen_accuracy": 0.8633871674537659, "mean_token_accuracy": 0.8941749781370163, "num_tokens": 1018762275.0, "sample_num_tokens": 9786.25, "step": 3759, "total_num_tokens": 1018801420.0, "z_loss": 0.000695146038196981 }, { "copy_logits_max": -1.966252326965332, "copy_logits_min": -750000064.0, "copy_num_tokens": 664.375, "epoch": 0.767934643860097, "gen_logits_max": 3.99232816696167, "gen_logits_mean": -15.058331489562988, "gen_logits_min": -26.939430236816406, "gen_logits_std": 2.7645392417907715, "gen_loss": 0.31630775332450867, "grad_norm": 0.387091208965281, "learning_rate": 2.6180210526315788e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9962591677904129, "mean_gen_accuracy": 0.8625307977199554, "mean_token_accuracy": 0.8985183089971542, "num_tokens": 1019059962.0, "sample_num_tokens": 9681.5, "step": 3760, "total_num_tokens": 1019098688.0, "z_loss": 0.0007349952356889844 }, { "copy_logits_max": -3.301924228668213, "copy_logits_min": -625000064.0, "copy_num_tokens": 468.3125, "epoch": 0.7681388817972938, "gen_logits_max": 5.209990501403809, "gen_logits_mean": -13.72808837890625, "gen_logits_min": -25.074230194091797, "gen_logits_std": 2.651315689086914, "gen_loss": 0.3064371943473816, "grad_norm": 0.42008176328351526, "learning_rate": 2.6178947368421053e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9948306232690811, "mean_gen_accuracy": 0.8697501420974731, "mean_token_accuracy": 0.8968133926391602, "num_tokens": 1019317717.0, "sample_num_tokens": 8089.25, "step": 3761, "total_num_tokens": 1019350074.0, "z_loss": 0.0006880010478198528 }, { "copy_logits_max": -3.0307021141052246, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.0625, "epoch": 0.7683431197344907, "gen_logits_max": 5.440827369689941, "gen_logits_mean": -13.984248161315918, "gen_logits_min": -24.84425163269043, "gen_logits_std": 2.5991954803466797, "gen_loss": 0.3847183883190155, "grad_norm": 0.48576716298133943, "learning_rate": 2.6177684210526317e-05, "loss": 0.3519, "mean_copy_accuracy": 0.9938614815473557, "mean_gen_accuracy": 0.8525867462158203, "mean_token_accuracy": 0.8849554061889648, "num_tokens": 1019594560.0, "sample_num_tokens": 8363.0, "step": 3762, "total_num_tokens": 1019628012.0, "z_loss": 0.0007249581394717097 }, { "copy_logits_max": -1.8344905376434326, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.8125, "epoch": 0.7685473576716875, "gen_logits_max": 4.449760913848877, "gen_logits_mean": -15.077507019042969, "gen_logits_min": -26.495685577392578, "gen_logits_std": 2.675525188446045, "gen_loss": 0.2907850742340088, "grad_norm": 0.49547198076167115, "learning_rate": 2.617642105263158e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9929907023906708, "mean_gen_accuracy": 0.8656948208808899, "mean_token_accuracy": 0.9003085941076279, "num_tokens": 1019869511.0, "sample_num_tokens": 7400.25, "step": 3763, "total_num_tokens": 1019899112.0, "z_loss": 0.0006598084000870585 }, { "copy_logits_max": -0.5607326030731201, "copy_logits_min": -687500032.0, "copy_num_tokens": 648.5625, "epoch": 0.7687515956088844, "gen_logits_max": 5.306632995605469, "gen_logits_mean": -12.613182067871094, "gen_logits_min": -24.495582580566406, "gen_logits_std": 2.6606528759002686, "gen_loss": 0.31228119134902954, "grad_norm": 0.41919071603891545, "learning_rate": 2.6175157894736845e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9939698278903961, "mean_gen_accuracy": 0.8681772500276566, "mean_token_accuracy": 0.8989780396223068, "num_tokens": 1020146230.0, "sample_num_tokens": 9843.5, "step": 3764, "total_num_tokens": 1020185604.0, "z_loss": 0.0008121749851852655 }, { "copy_logits_max": 0.7402980923652649, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.25, "epoch": 0.7689558335460812, "gen_logits_max": 4.800538539886475, "gen_logits_mean": -13.756159782409668, "gen_logits_min": -25.68517303466797, "gen_logits_std": 2.749786376953125, "gen_loss": 0.3214201331138611, "grad_norm": 0.4122026255437976, "learning_rate": 2.6173894736842106e-05, "loss": 0.3255, "mean_copy_accuracy": 0.9955199509859085, "mean_gen_accuracy": 0.8547193855047226, "mean_token_accuracy": 0.8898058831691742, "num_tokens": 1020433044.0, "sample_num_tokens": 9934.0, "step": 3765, "total_num_tokens": 1020472780.0, "z_loss": 0.0008316428866237402 }, { "copy_logits_max": -2.8593902587890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.375, "epoch": 0.769160071483278, "gen_logits_max": 5.146296501159668, "gen_logits_mean": -14.098527908325195, "gen_logits_min": -26.089702606201172, "gen_logits_std": 2.7488157749176025, "gen_loss": 0.3105217218399048, "grad_norm": 0.3815127379405625, "learning_rate": 2.617263157894737e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9950084239244461, "mean_gen_accuracy": 0.8668392300605774, "mean_token_accuracy": 0.8987607806921005, "num_tokens": 1020726985.0, "sample_num_tokens": 8737.25, "step": 3766, "total_num_tokens": 1020761934.0, "z_loss": 0.0007391757098957896 }, { "copy_logits_max": -1.5272716283798218, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.625, "epoch": 0.7693643094204748, "gen_logits_max": 4.814940452575684, "gen_logits_mean": -14.56833553314209, "gen_logits_min": -26.581314086914062, "gen_logits_std": 2.7822606563568115, "gen_loss": 0.2821062207221985, "grad_norm": 0.41976151151576696, "learning_rate": 2.617136842105263e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9940447807312012, "mean_gen_accuracy": 0.8712344467639923, "mean_token_accuracy": 0.8982700556516647, "num_tokens": 1020984601.0, "sample_num_tokens": 9193.25, "step": 3767, "total_num_tokens": 1021021374.0, "z_loss": 0.0006575846346095204 }, { "copy_logits_max": -2.3740673065185547, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.25, "epoch": 0.7695685473576717, "gen_logits_max": 4.724656581878662, "gen_logits_mean": -14.251592636108398, "gen_logits_min": -25.832595825195312, "gen_logits_std": 2.734083652496338, "gen_loss": 0.28820207715034485, "grad_norm": 0.4868105208555092, "learning_rate": 2.6170105263157896e-05, "loss": 0.3043, "mean_copy_accuracy": 0.9943555295467377, "mean_gen_accuracy": 0.8688579797744751, "mean_token_accuracy": 0.9007825553417206, "num_tokens": 1021258823.0, "sample_num_tokens": 9262.25, "step": 3768, "total_num_tokens": 1021295872.0, "z_loss": 0.0005747524555772543 }, { "copy_logits_max": 0.3807259202003479, "copy_logits_min": -687500032.0, "copy_num_tokens": 305.5625, "epoch": 0.7697727852948685, "gen_logits_max": 5.437429428100586, "gen_logits_mean": -13.978830337524414, "gen_logits_min": -26.155540466308594, "gen_logits_std": 2.756042957305908, "gen_loss": 0.33020806312561035, "grad_norm": 0.4166286143721848, "learning_rate": 2.6168842105263157e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9961400926113129, "mean_gen_accuracy": 0.8693424016237259, "mean_token_accuracy": 0.9021333009004593, "num_tokens": 1021535626.0, "sample_num_tokens": 7178.0, "step": 3769, "total_num_tokens": 1021564338.0, "z_loss": 0.0006539485766552389 }, { "copy_logits_max": 0.064139723777771, "copy_logits_min": -750000000.0, "copy_num_tokens": 203.0, "epoch": 0.7699770232320654, "gen_logits_max": 6.708481788635254, "gen_logits_mean": -12.282264709472656, "gen_logits_min": -23.51413345336914, "gen_logits_std": 2.698030471801758, "gen_loss": 0.34752339124679565, "grad_norm": 0.44219855796710744, "learning_rate": 2.616757894736842e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9939638674259186, "mean_gen_accuracy": 0.8677861988544464, "mean_token_accuracy": 0.8945716321468353, "num_tokens": 1021797456.0, "sample_num_tokens": 6997.0, "step": 3770, "total_num_tokens": 1021825444.0, "z_loss": 0.0006560617475770414 }, { "copy_logits_max": -2.2551310062408447, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.1875, "epoch": 0.7701812611692622, "gen_logits_max": 3.7341678142547607, "gen_logits_mean": -15.536372184753418, "gen_logits_min": -26.876949310302734, "gen_logits_std": 2.7102975845336914, "gen_loss": 0.26301878690719604, "grad_norm": 0.3982708794090355, "learning_rate": 2.6166315789473682e-05, "loss": 0.313, "mean_copy_accuracy": 0.9936780035495758, "mean_gen_accuracy": 0.8676614612340927, "mean_token_accuracy": 0.8958083242177963, "num_tokens": 1022060780.0, "sample_num_tokens": 9605.5, "step": 3771, "total_num_tokens": 1022099202.0, "z_loss": 0.0005021133692935109 }, { "copy_logits_max": 1.6382018327713013, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.125, "epoch": 0.770385499106459, "gen_logits_max": 4.365225791931152, "gen_logits_mean": -13.454334259033203, "gen_logits_min": -25.32415199279785, "gen_logits_std": 2.6879119873046875, "gen_loss": 0.33089593052864075, "grad_norm": 0.4584905610270988, "learning_rate": 2.616505263157895e-05, "loss": 0.3093, "mean_copy_accuracy": 0.9933652877807617, "mean_gen_accuracy": 0.8665182143449783, "mean_token_accuracy": 0.8967503011226654, "num_tokens": 1022328986.0, "sample_num_tokens": 8093.5, "step": 3772, "total_num_tokens": 1022361360.0, "z_loss": 0.0006227614358067513 }, { "copy_logits_max": -1.9583951234817505, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.75, "epoch": 0.7705897370436559, "gen_logits_max": 4.726689338684082, "gen_logits_mean": -15.007818222045898, "gen_logits_min": -26.536766052246094, "gen_logits_std": 2.7034144401550293, "gen_loss": 0.3224613070487976, "grad_norm": 0.41149421452429563, "learning_rate": 2.616378947368421e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9950991570949554, "mean_gen_accuracy": 0.8733485192060471, "mean_token_accuracy": 0.9021504372358322, "num_tokens": 1022589550.0, "sample_num_tokens": 8260.5, "step": 3773, "total_num_tokens": 1022622592.0, "z_loss": 0.0006707442807964981 }, { "copy_logits_max": -0.990681529045105, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.875, "epoch": 0.7707939749808527, "gen_logits_max": 5.087658882141113, "gen_logits_mean": -13.905540466308594, "gen_logits_min": -25.200756072998047, "gen_logits_std": 2.6728017330169678, "gen_loss": 0.3019396960735321, "grad_norm": 0.41282955183779124, "learning_rate": 2.6162526315789475e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9952352344989777, "mean_gen_accuracy": 0.86296546459198, "mean_token_accuracy": 0.8939524292945862, "num_tokens": 1022850936.0, "sample_num_tokens": 8207.5, "step": 3774, "total_num_tokens": 1022883766.0, "z_loss": 0.0006496371934190392 }, { "copy_logits_max": -2.9976229667663574, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.0625, "epoch": 0.7709982129180495, "gen_logits_max": 5.1564202308654785, "gen_logits_mean": -14.261631965637207, "gen_logits_min": -25.961894989013672, "gen_logits_std": 2.6591930389404297, "gen_loss": 0.3589349687099457, "grad_norm": 0.4568113165568862, "learning_rate": 2.6161263157894736e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9934551864862442, "mean_gen_accuracy": 0.8714711666107178, "mean_token_accuracy": 0.9005002826452255, "num_tokens": 1023124941.0, "sample_num_tokens": 8652.75, "step": 3775, "total_num_tokens": 1023159552.0, "z_loss": 0.0006847582408227026 }, { "copy_logits_max": -0.9061230421066284, "copy_logits_min": -750000064.0, "copy_num_tokens": 628.6875, "epoch": 0.7712024508552464, "gen_logits_max": 3.693018913269043, "gen_logits_mean": -14.965838432312012, "gen_logits_min": -26.635547637939453, "gen_logits_std": 2.7046866416931152, "gen_loss": 0.27531954646110535, "grad_norm": 0.40404103265670627, "learning_rate": 2.616e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9948672205209732, "mean_gen_accuracy": 0.8654806911945343, "mean_token_accuracy": 0.8962843269109726, "num_tokens": 1023387289.0, "sample_num_tokens": 8812.75, "step": 3776, "total_num_tokens": 1023422540.0, "z_loss": 0.0006120754405856133 }, { "copy_logits_max": 0.04028940200805664, "copy_logits_min": -687500032.0, "copy_num_tokens": 433.125, "epoch": 0.7714066887924432, "gen_logits_max": 5.071150302886963, "gen_logits_mean": -13.859587669372559, "gen_logits_min": -25.754268646240234, "gen_logits_std": 2.66916561126709, "gen_loss": 0.3361741304397583, "grad_norm": 0.440036589242169, "learning_rate": 2.6158736842105265e-05, "loss": 0.3312, "mean_copy_accuracy": 0.9935961663722992, "mean_gen_accuracy": 0.8577652722597122, "mean_token_accuracy": 0.8903500735759735, "num_tokens": 1023659534.0, "sample_num_tokens": 8823.0, "step": 3777, "total_num_tokens": 1023694826.0, "z_loss": 0.0007017446914687753 }, { "copy_logits_max": -0.4412841200828552, "copy_logits_min": -687500096.0, "copy_num_tokens": 435.5, "epoch": 0.77161092672964, "gen_logits_max": 4.880048751831055, "gen_logits_mean": -13.070788383483887, "gen_logits_min": -24.21015167236328, "gen_logits_std": 2.6293435096740723, "gen_loss": 0.2799605429172516, "grad_norm": 0.4417884379852541, "learning_rate": 2.6157473684210525e-05, "loss": 0.3166, "mean_copy_accuracy": 0.9950441718101501, "mean_gen_accuracy": 0.8674776405096054, "mean_token_accuracy": 0.8953961282968521, "num_tokens": 1023922734.0, "sample_num_tokens": 8290.0, "step": 3778, "total_num_tokens": 1023955894.0, "z_loss": 0.0006872540689073503 }, { "copy_logits_max": 0.07713067531585693, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.875, "epoch": 0.7718151646668369, "gen_logits_max": 4.713565349578857, "gen_logits_mean": -13.513679504394531, "gen_logits_min": -24.728883743286133, "gen_logits_std": 2.6169633865356445, "gen_loss": 0.323225200176239, "grad_norm": 0.3995133712547198, "learning_rate": 2.615621052631579e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9941003918647766, "mean_gen_accuracy": 0.8703538924455643, "mean_token_accuracy": 0.8983120769262314, "num_tokens": 1024193418.0, "sample_num_tokens": 9534.0, "step": 3779, "total_num_tokens": 1024231554.0, "z_loss": 0.0006961974431760609 }, { "copy_logits_max": 0.04134342074394226, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.125, "epoch": 0.7720194026040337, "gen_logits_max": 4.770036697387695, "gen_logits_mean": -14.32303237915039, "gen_logits_min": -25.94091033935547, "gen_logits_std": 2.699288845062256, "gen_loss": 0.3562220335006714, "grad_norm": 0.44173875574810506, "learning_rate": 2.6154947368421054e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9951812624931335, "mean_gen_accuracy": 0.8618147969245911, "mean_token_accuracy": 0.8983272761106491, "num_tokens": 1024483939.0, "sample_num_tokens": 7736.25, "step": 3780, "total_num_tokens": 1024514884.0, "z_loss": 0.000713534071110189 }, { "copy_logits_max": 4.023854732513428, "copy_logits_min": -687500032.0, "copy_num_tokens": 644.3125, "epoch": 0.7722236405412305, "gen_logits_max": 4.685731887817383, "gen_logits_mean": -13.879783630371094, "gen_logits_min": -26.260536193847656, "gen_logits_std": 2.671703338623047, "gen_loss": 0.31431418657302856, "grad_norm": 0.4007827778102815, "learning_rate": 2.615368421052632e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9950261116027832, "mean_gen_accuracy": 0.867111936211586, "mean_token_accuracy": 0.9002883732318878, "num_tokens": 1024770958.0, "sample_num_tokens": 9327.5, "step": 3781, "total_num_tokens": 1024808268.0, "z_loss": 0.0006454632384702563 }, { "copy_logits_max": -2.2371997833251953, "copy_logits_min": -687500032.0, "copy_num_tokens": 245.625, "epoch": 0.7724278784784274, "gen_logits_max": 4.589485168457031, "gen_logits_mean": -15.017692565917969, "gen_logits_min": -26.253707885742188, "gen_logits_std": 2.6510584354400635, "gen_loss": 0.3315109610557556, "grad_norm": 0.41539767017436435, "learning_rate": 2.615242105263158e-05, "loss": 0.3244, "mean_copy_accuracy": 0.994084358215332, "mean_gen_accuracy": 0.8613615185022354, "mean_token_accuracy": 0.8917321413755417, "num_tokens": 1025031457.0, "sample_num_tokens": 5865.25, "step": 3782, "total_num_tokens": 1025054918.0, "z_loss": 0.0006979136378504336 }, { "copy_logits_max": 0.052269577980041504, "copy_logits_min": -750000064.0, "copy_num_tokens": 519.625, "epoch": 0.7726321164156242, "gen_logits_max": 5.39589786529541, "gen_logits_mean": -14.541444778442383, "gen_logits_min": -26.5838623046875, "gen_logits_std": 2.7522361278533936, "gen_loss": 0.28760212659835815, "grad_norm": 0.41707828819228526, "learning_rate": 2.6151157894736844e-05, "loss": 0.3062, "mean_copy_accuracy": 0.9960341453552246, "mean_gen_accuracy": 0.8675535768270493, "mean_token_accuracy": 0.8968866467475891, "num_tokens": 1025303271.0, "sample_num_tokens": 9069.25, "step": 3783, "total_num_tokens": 1025339548.0, "z_loss": 0.0006576504674740136 }, { "copy_logits_max": 0.9748892188072205, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.0, "epoch": 0.772836354352821, "gen_logits_max": 4.734072685241699, "gen_logits_mean": -14.711681365966797, "gen_logits_min": -26.640060424804688, "gen_logits_std": 2.7068724632263184, "gen_loss": 0.28881558775901794, "grad_norm": 0.39486915433334135, "learning_rate": 2.6149894736842105e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9968531280755997, "mean_gen_accuracy": 0.8698748052120209, "mean_token_accuracy": 0.8987279534339905, "num_tokens": 1025563039.0, "sample_num_tokens": 7909.75, "step": 3784, "total_num_tokens": 1025594678.0, "z_loss": 0.0006341119878925383 }, { "copy_logits_max": -1.1787645816802979, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.625, "epoch": 0.7730405922900179, "gen_logits_max": 4.95032262802124, "gen_logits_mean": -14.13754940032959, "gen_logits_min": -25.586275100708008, "gen_logits_std": 2.665269613265991, "gen_loss": 0.3341737985610962, "grad_norm": 0.4187701447949198, "learning_rate": 2.614863157894737e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9960785061120987, "mean_gen_accuracy": 0.8675905019044876, "mean_token_accuracy": 0.8997321724891663, "num_tokens": 1025811190.0, "sample_num_tokens": 7169.5, "step": 3785, "total_num_tokens": 1025839868.0, "z_loss": 0.0007602591067552567 }, { "copy_logits_max": 0.24279558658599854, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.1875, "epoch": 0.7732448302272147, "gen_logits_max": 4.918953895568848, "gen_logits_mean": -14.059885025024414, "gen_logits_min": -26.136857986450195, "gen_logits_std": 2.710014820098877, "gen_loss": 0.2743532657623291, "grad_norm": 0.4344017620374123, "learning_rate": 2.614736842105263e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9939600974321365, "mean_gen_accuracy": 0.8650930970907211, "mean_token_accuracy": 0.8965530544519424, "num_tokens": 1026074582.0, "sample_num_tokens": 9595.0, "step": 3786, "total_num_tokens": 1026112962.0, "z_loss": 0.0006369692273437977 }, { "copy_logits_max": -2.1318655014038086, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.875, "epoch": 0.7734490681644115, "gen_logits_max": 4.69088077545166, "gen_logits_mean": -15.30808162689209, "gen_logits_min": -27.01032257080078, "gen_logits_std": 2.69464111328125, "gen_loss": 0.30247271060943604, "grad_norm": 0.3951262553457305, "learning_rate": 2.6146105263157894e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9956175982952118, "mean_gen_accuracy": 0.8667604178190231, "mean_token_accuracy": 0.9009672701358795, "num_tokens": 1026362098.0, "sample_num_tokens": 7649.5, "step": 3787, "total_num_tokens": 1026392696.0, "z_loss": 0.0006440808647312224 }, { "copy_logits_max": 2.784534454345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.9375, "epoch": 0.7736533061016084, "gen_logits_max": 5.988840103149414, "gen_logits_mean": -12.866222381591797, "gen_logits_min": -25.119354248046875, "gen_logits_std": 2.720322847366333, "gen_loss": 0.27313610911369324, "grad_norm": 0.38400044272638684, "learning_rate": 2.614484210526316e-05, "loss": 0.276, "mean_copy_accuracy": 0.9950593709945679, "mean_gen_accuracy": 0.8833160400390625, "mean_token_accuracy": 0.9086311906576157, "num_tokens": 1026627770.0, "sample_num_tokens": 7949.0, "step": 3788, "total_num_tokens": 1026659566.0, "z_loss": 0.0005727863754145801 }, { "copy_logits_max": 2.335793972015381, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.1875, "epoch": 0.7738575440388052, "gen_logits_max": 5.617500305175781, "gen_logits_mean": -12.641778945922852, "gen_logits_min": -24.669994354248047, "gen_logits_std": 2.7100658416748047, "gen_loss": 0.3238847851753235, "grad_norm": 0.41728979168601177, "learning_rate": 2.6143578947368423e-05, "loss": 0.3286, "mean_copy_accuracy": 0.9942434728145599, "mean_gen_accuracy": 0.8638203293085098, "mean_token_accuracy": 0.89007967710495, "num_tokens": 1026884385.0, "sample_num_tokens": 9033.75, "step": 3789, "total_num_tokens": 1026920520.0, "z_loss": 0.0006598809268325567 }, { "copy_logits_max": -0.4067077040672302, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.75, "epoch": 0.7740617819760021, "gen_logits_max": 5.406323432922363, "gen_logits_mean": -14.659662246704102, "gen_logits_min": -26.67837905883789, "gen_logits_std": 2.6788151264190674, "gen_loss": 0.3155902922153473, "grad_norm": 0.43025253203762553, "learning_rate": 2.6142315789473687e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9964354187250137, "mean_gen_accuracy": 0.8657882213592529, "mean_token_accuracy": 0.8990150392055511, "num_tokens": 1027161147.0, "sample_num_tokens": 7735.75, "step": 3790, "total_num_tokens": 1027192090.0, "z_loss": 0.0006318395026028156 }, { "copy_logits_max": 2.4536192417144775, "copy_logits_min": -625000064.0, "copy_num_tokens": 371.0, "epoch": 0.7742660199131989, "gen_logits_max": 5.351552963256836, "gen_logits_mean": -13.69473648071289, "gen_logits_min": -25.358177185058594, "gen_logits_std": 2.7047293186187744, "gen_loss": 0.30828991532325745, "grad_norm": 0.45488127023595454, "learning_rate": 2.6141052631578948e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9934466034173965, "mean_gen_accuracy": 0.8655668348073959, "mean_token_accuracy": 0.8946758657693863, "num_tokens": 1027418757.0, "sample_num_tokens": 7888.25, "step": 3791, "total_num_tokens": 1027450310.0, "z_loss": 0.0005879372474737465 }, { "copy_logits_max": 0.8225221633911133, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.625, "epoch": 0.7744702578503957, "gen_logits_max": 5.104914665222168, "gen_logits_mean": -14.458354949951172, "gen_logits_min": -26.38277244567871, "gen_logits_std": 2.727663993835449, "gen_loss": 0.33499425649642944, "grad_norm": 0.4146753082052732, "learning_rate": 2.6139789473684212e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9938565045595169, "mean_gen_accuracy": 0.8662067502737045, "mean_token_accuracy": 0.8962713778018951, "num_tokens": 1027684010.0, "sample_num_tokens": 7579.5, "step": 3792, "total_num_tokens": 1027714328.0, "z_loss": 0.0006541787879541516 }, { "copy_logits_max": 0.7430866956710815, "copy_logits_min": -625000000.0, "copy_num_tokens": 329.5625, "epoch": 0.7746744957875925, "gen_logits_max": 5.003120422363281, "gen_logits_mean": -15.207353591918945, "gen_logits_min": -26.498308181762695, "gen_logits_std": 2.6954431533813477, "gen_loss": 0.34825390577316284, "grad_norm": 0.3894672448977964, "learning_rate": 2.6138526315789473e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9956715852022171, "mean_gen_accuracy": 0.8624090999364853, "mean_token_accuracy": 0.8955969959497452, "num_tokens": 1027967287.0, "sample_num_tokens": 7276.75, "step": 3793, "total_num_tokens": 1027996394.0, "z_loss": 0.0007637393428012729 }, { "copy_logits_max": 2.0984294414520264, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.0625, "epoch": 0.7748787337247893, "gen_logits_max": 6.177861213684082, "gen_logits_mean": -13.355189323425293, "gen_logits_min": -25.10072898864746, "gen_logits_std": 2.675663948059082, "gen_loss": 0.33794108033180237, "grad_norm": 0.4500759722433683, "learning_rate": 2.6137263157894738e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9954750239849091, "mean_gen_accuracy": 0.859064444899559, "mean_token_accuracy": 0.8939609527587891, "num_tokens": 1028237379.0, "sample_num_tokens": 7200.75, "step": 3794, "total_num_tokens": 1028266182.0, "z_loss": 0.0007363504264503717 }, { "copy_logits_max": 2.9417953491210938, "copy_logits_min": -625000064.0, "copy_num_tokens": 826.375, "epoch": 0.7750829716619863, "gen_logits_max": 3.6972885131835938, "gen_logits_mean": -15.242295265197754, "gen_logits_min": -27.940120697021484, "gen_logits_std": 2.7929024696350098, "gen_loss": 0.27410706877708435, "grad_norm": 0.3820349984349156, "learning_rate": 2.6136e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9962434470653534, "mean_gen_accuracy": 0.8653130233287811, "mean_token_accuracy": 0.9038967788219452, "num_tokens": 1028540811.0, "sample_num_tokens": 10590.25, "step": 3795, "total_num_tokens": 1028583172.0, "z_loss": 0.0006814098451286554 }, { "copy_logits_max": 0.25079596042633057, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.5, "epoch": 0.7752872095991831, "gen_logits_max": 4.750655174255371, "gen_logits_mean": -15.16508674621582, "gen_logits_min": -26.455415725708008, "gen_logits_std": 2.706705093383789, "gen_loss": 0.29606398940086365, "grad_norm": 0.3987642762977295, "learning_rate": 2.6134736842105266e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9940651059150696, "mean_gen_accuracy": 0.8687108308076859, "mean_token_accuracy": 0.8952054083347321, "num_tokens": 1028810816.0, "sample_num_tokens": 7815.0, "step": 3796, "total_num_tokens": 1028842076.0, "z_loss": 0.0006277449429035187 }, { "copy_logits_max": -1.416712999343872, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.5, "epoch": 0.7754914475363799, "gen_logits_max": 5.227417945861816, "gen_logits_mean": -14.906067848205566, "gen_logits_min": -26.481826782226562, "gen_logits_std": 2.6958069801330566, "gen_loss": 0.3526552617549896, "grad_norm": 0.4562120748117524, "learning_rate": 2.6133473684210527e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9943694025278091, "mean_gen_accuracy": 0.8625523895025253, "mean_token_accuracy": 0.8955958485603333, "num_tokens": 1029078377.0, "sample_num_tokens": 7980.75, "step": 3797, "total_num_tokens": 1029110300.0, "z_loss": 0.0007414452848024666 }, { "copy_logits_max": -0.7338647246360779, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.8125, "epoch": 0.7756956854735767, "gen_logits_max": 5.064711093902588, "gen_logits_mean": -14.416498184204102, "gen_logits_min": -26.002574920654297, "gen_logits_std": 2.716803789138794, "gen_loss": 0.3360958695411682, "grad_norm": 0.42667651809955354, "learning_rate": 2.613221052631579e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9951563030481339, "mean_gen_accuracy": 0.8553583323955536, "mean_token_accuracy": 0.892622247338295, "num_tokens": 1029360319.0, "sample_num_tokens": 9450.25, "step": 3798, "total_num_tokens": 1029398120.0, "z_loss": 0.0007304598693735898 }, { "copy_logits_max": 2.110858917236328, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.3125, "epoch": 0.7758999234107735, "gen_logits_max": 5.413734436035156, "gen_logits_mean": -13.268604278564453, "gen_logits_min": -25.812952041625977, "gen_logits_std": 2.75028133392334, "gen_loss": 0.3029318153858185, "grad_norm": 0.41610028235032037, "learning_rate": 2.6130947368421052e-05, "loss": 0.3149, "mean_copy_accuracy": 0.99433434009552, "mean_gen_accuracy": 0.8701646327972412, "mean_token_accuracy": 0.8966920077800751, "num_tokens": 1029630791.0, "sample_num_tokens": 9375.25, "step": 3799, "total_num_tokens": 1029668292.0, "z_loss": 0.0007208638708107173 }, { "copy_logits_max": -0.6129790544509888, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.9375, "epoch": 0.7761041613479703, "gen_logits_max": 3.8462305068969727, "gen_logits_mean": -15.859395980834961, "gen_logits_min": -27.615747451782227, "gen_logits_std": 2.732400894165039, "gen_loss": 0.29269424080848694, "grad_norm": 0.39691057602980007, "learning_rate": 2.6129684210526317e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9956721514463425, "mean_gen_accuracy": 0.8713011592626572, "mean_token_accuracy": 0.9060150533914566, "num_tokens": 1029923310.0, "sample_num_tokens": 8974.5, "step": 3800, "total_num_tokens": 1029959208.0, "z_loss": 0.0006367303431034088 }, { "copy_logits_max": 4.3516845703125, "copy_logits_min": -750000128.0, "copy_num_tokens": 596.375, "epoch": 0.7763083992851673, "gen_logits_max": 4.910307884216309, "gen_logits_mean": -12.925399780273438, "gen_logits_min": -24.896961212158203, "gen_logits_std": 2.7890095710754395, "gen_loss": 0.29709723591804504, "grad_norm": 0.39097448899947823, "learning_rate": 2.6128421052631578e-05, "loss": 0.293, "mean_copy_accuracy": 0.9947050213813782, "mean_gen_accuracy": 0.8690265864133835, "mean_token_accuracy": 0.9028826653957367, "num_tokens": 1030213694.0, "sample_num_tokens": 8696.5, "step": 3801, "total_num_tokens": 1030248480.0, "z_loss": 0.0006471129599958658 }, { "copy_logits_max": 0.8751225471496582, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.9375, "epoch": 0.7765126372223641, "gen_logits_max": 5.169005870819092, "gen_logits_mean": -13.85659122467041, "gen_logits_min": -25.548107147216797, "gen_logits_std": 2.7388033866882324, "gen_loss": 0.33102473616600037, "grad_norm": 0.4220859890198615, "learning_rate": 2.6127157894736842e-05, "loss": 0.3107, "mean_copy_accuracy": 0.9936342388391495, "mean_gen_accuracy": 0.8660140782594681, "mean_token_accuracy": 0.8967190086841583, "num_tokens": 1030485968.0, "sample_num_tokens": 8554.0, "step": 3802, "total_num_tokens": 1030520184.0, "z_loss": 0.0007135736523196101 }, { "copy_logits_max": -0.28903546929359436, "copy_logits_min": -687500032.0, "copy_num_tokens": 484.8125, "epoch": 0.7767168751595609, "gen_logits_max": 4.467217445373535, "gen_logits_mean": -15.503697395324707, "gen_logits_min": -27.1424617767334, "gen_logits_std": 2.7658495903015137, "gen_loss": 0.3073614537715912, "grad_norm": 0.3864063655324497, "learning_rate": 2.6125894736842106e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9963674247264862, "mean_gen_accuracy": 0.8649125397205353, "mean_token_accuracy": 0.8989037275314331, "num_tokens": 1030772084.0, "sample_num_tokens": 8756.5, "step": 3803, "total_num_tokens": 1030807110.0, "z_loss": 0.0006784008000977337 }, { "copy_logits_max": -3.094878673553467, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.375, "epoch": 0.7769211130967577, "gen_logits_max": 3.6727750301361084, "gen_logits_mean": -16.684823989868164, "gen_logits_min": -28.647655487060547, "gen_logits_std": 2.7763757705688477, "gen_loss": 0.2542310953140259, "grad_norm": 0.3939037534578961, "learning_rate": 2.612463157894737e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9954517036676407, "mean_gen_accuracy": 0.8638050854206085, "mean_token_accuracy": 0.8985675573348999, "num_tokens": 1031041998.0, "sample_num_tokens": 9001.0, "step": 3804, "total_num_tokens": 1031078002.0, "z_loss": 0.000545536691788584 }, { "copy_logits_max": 1.3008322715759277, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.8125, "epoch": 0.7771253510339545, "gen_logits_max": 5.442390441894531, "gen_logits_mean": -13.556146621704102, "gen_logits_min": -25.269624710083008, "gen_logits_std": 2.7378551959991455, "gen_loss": 0.34421247243881226, "grad_norm": 0.39582125336845414, "learning_rate": 2.6123368421052635e-05, "loss": 0.3393, "mean_copy_accuracy": 0.9941135793924332, "mean_gen_accuracy": 0.8642778694629669, "mean_token_accuracy": 0.8870536535978317, "num_tokens": 1031292715.0, "sample_num_tokens": 8901.25, "step": 3805, "total_num_tokens": 1031328320.0, "z_loss": 0.0007017219322733581 }, { "copy_logits_max": 0.2495747208595276, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.3125, "epoch": 0.7773295889711513, "gen_logits_max": 4.997951507568359, "gen_logits_mean": -13.840328216552734, "gen_logits_min": -26.546798706054688, "gen_logits_std": 2.800018787384033, "gen_loss": 0.2870270013809204, "grad_norm": 0.4177741892204672, "learning_rate": 2.6122105263157896e-05, "loss": 0.3086, "mean_copy_accuracy": 0.994800865650177, "mean_gen_accuracy": 0.8627464175224304, "mean_token_accuracy": 0.8967696726322174, "num_tokens": 1031573201.0, "sample_num_tokens": 9066.25, "step": 3806, "total_num_tokens": 1031609466.0, "z_loss": 0.0005832865717820823 }, { "copy_logits_max": -0.4715653955936432, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.125, "epoch": 0.7775338269083483, "gen_logits_max": 5.106557846069336, "gen_logits_mean": -13.082342147827148, "gen_logits_min": -25.474998474121094, "gen_logits_std": 2.763113498687744, "gen_loss": 0.29439765214920044, "grad_norm": 0.4228671422414867, "learning_rate": 2.612084210526316e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9953162521123886, "mean_gen_accuracy": 0.8663567453622818, "mean_token_accuracy": 0.8978393077850342, "num_tokens": 1031836319.0, "sample_num_tokens": 8161.25, "step": 3807, "total_num_tokens": 1031868964.0, "z_loss": 0.0006456952542066574 }, { "copy_logits_max": -1.7765426635742188, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.625, "epoch": 0.7777380648455451, "gen_logits_max": 4.3604254722595215, "gen_logits_mean": -15.812746047973633, "gen_logits_min": -27.434816360473633, "gen_logits_std": 2.764678955078125, "gen_loss": 0.2862226068973541, "grad_norm": 0.42982086248349344, "learning_rate": 2.611957894736842e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9943767488002777, "mean_gen_accuracy": 0.8725092113018036, "mean_token_accuracy": 0.9005010426044464, "num_tokens": 1032096383.0, "sample_num_tokens": 8994.25, "step": 3808, "total_num_tokens": 1032132360.0, "z_loss": 0.0005816521006636322 }, { "copy_logits_max": 1.9297218322753906, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 0.7779423027827419, "gen_logits_max": 5.693622589111328, "gen_logits_mean": -12.993246078491211, "gen_logits_min": -25.346790313720703, "gen_logits_std": 2.7563586235046387, "gen_loss": 0.3155710995197296, "grad_norm": 0.5775721259844442, "learning_rate": 2.6118315789473685e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9950521439313889, "mean_gen_accuracy": 0.8580721020698547, "mean_token_accuracy": 0.8918312340974808, "num_tokens": 1032356768.0, "sample_num_tokens": 9127.0, "step": 3809, "total_num_tokens": 1032393276.0, "z_loss": 0.0007022051140666008 }, { "copy_logits_max": 0.22905918955802917, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.8125, "epoch": 0.7781465407199387, "gen_logits_max": 4.955920696258545, "gen_logits_mean": -14.499086380004883, "gen_logits_min": -26.34319496154785, "gen_logits_std": 2.7647104263305664, "gen_loss": 0.33160918951034546, "grad_norm": 0.413241538543115, "learning_rate": 2.6117052631578946e-05, "loss": 0.3206, "mean_copy_accuracy": 0.9946808516979218, "mean_gen_accuracy": 0.8648377507925034, "mean_token_accuracy": 0.8935463726520538, "num_tokens": 1032622918.0, "sample_num_tokens": 7766.5, "step": 3810, "total_num_tokens": 1032653984.0, "z_loss": 0.0007155756466090679 }, { "copy_logits_max": -1.143821358680725, "copy_logits_min": -625000000.0, "copy_num_tokens": 348.875, "epoch": 0.7783507786571355, "gen_logits_max": 5.288832664489746, "gen_logits_mean": -14.376619338989258, "gen_logits_min": -25.755191802978516, "gen_logits_std": 2.7127387523651123, "gen_loss": 0.3904431462287903, "grad_norm": 0.41646092240035526, "learning_rate": 2.611578947368421e-05, "loss": 0.3174, "mean_copy_accuracy": 0.9951465427875519, "mean_gen_accuracy": 0.8620159476995468, "mean_token_accuracy": 0.89557184278965, "num_tokens": 1032906464.0, "sample_num_tokens": 8030.0, "step": 3811, "total_num_tokens": 1032938584.0, "z_loss": 0.0007975478656589985 }, { "copy_logits_max": -3.712453842163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.6875, "epoch": 0.7785550165943323, "gen_logits_max": 5.103605270385742, "gen_logits_mean": -15.61838150024414, "gen_logits_min": -27.505725860595703, "gen_logits_std": 2.7672064304351807, "gen_loss": 0.3114541471004486, "grad_norm": 0.447027518268417, "learning_rate": 2.6114526315789475e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9945244640111923, "mean_gen_accuracy": 0.8663812130689621, "mean_token_accuracy": 0.8955691605806351, "num_tokens": 1033160927.0, "sample_num_tokens": 7731.25, "step": 3812, "total_num_tokens": 1033191852.0, "z_loss": 0.0006795161752961576 }, { "copy_logits_max": 3.2531204223632812, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.3125, "epoch": 0.7787592545315293, "gen_logits_max": 5.125422477722168, "gen_logits_mean": -13.029452323913574, "gen_logits_min": -25.12457275390625, "gen_logits_std": 2.8014450073242188, "gen_loss": 0.28525838255882263, "grad_norm": 0.40187968011528463, "learning_rate": 2.611326315789474e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9957624971866608, "mean_gen_accuracy": 0.8715513199567795, "mean_token_accuracy": 0.9066914021968842, "num_tokens": 1033466342.0, "sample_num_tokens": 8378.0, "step": 3813, "total_num_tokens": 1033499854.0, "z_loss": 0.0007081924704834819 }, { "copy_logits_max": 3.1501574516296387, "copy_logits_min": -750000064.0, "copy_num_tokens": 521.625, "epoch": 0.7789634924687261, "gen_logits_max": 5.889551162719727, "gen_logits_mean": -12.190521240234375, "gen_logits_min": -24.448650360107422, "gen_logits_std": 2.8008995056152344, "gen_loss": 0.317818820476532, "grad_norm": 0.42940260622567533, "learning_rate": 2.6112e-05, "loss": 0.3053, "mean_copy_accuracy": 0.9951650649309158, "mean_gen_accuracy": 0.865157350897789, "mean_token_accuracy": 0.8983111828565598, "num_tokens": 1033736610.0, "sample_num_tokens": 9342.5, "step": 3814, "total_num_tokens": 1033773980.0, "z_loss": 0.0007994574261829257 }, { "copy_logits_max": -0.566472053527832, "copy_logits_min": -687500032.0, "copy_num_tokens": 334.125, "epoch": 0.7791677304059229, "gen_logits_max": 4.810006618499756, "gen_logits_mean": -15.066289901733398, "gen_logits_min": -27.41372299194336, "gen_logits_std": 2.7847280502319336, "gen_loss": 0.3482317328453064, "grad_norm": 0.44210541658064795, "learning_rate": 2.6110736842105264e-05, "loss": 0.3263, "mean_copy_accuracy": 0.9953220039606094, "mean_gen_accuracy": 0.8608219772577286, "mean_token_accuracy": 0.8921540230512619, "num_tokens": 1033995540.0, "sample_num_tokens": 6612.0, "step": 3815, "total_num_tokens": 1034021988.0, "z_loss": 0.0007510042632929981 }, { "copy_logits_max": -1.8005000352859497, "copy_logits_min": -687500032.0, "copy_num_tokens": 528.375, "epoch": 0.7793719683431197, "gen_logits_max": 4.524583339691162, "gen_logits_mean": -14.658339500427246, "gen_logits_min": -27.000946044921875, "gen_logits_std": 2.8175275325775146, "gen_loss": 0.2862473726272583, "grad_norm": 0.4166021134171608, "learning_rate": 2.610947368421053e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9944472163915634, "mean_gen_accuracy": 0.8708481788635254, "mean_token_accuracy": 0.9020662605762482, "num_tokens": 1034252545.0, "sample_num_tokens": 8447.25, "step": 3816, "total_num_tokens": 1034286334.0, "z_loss": 0.0005885077407583594 }, { "copy_logits_max": 0.013328686356544495, "copy_logits_min": -750000000.0, "copy_num_tokens": 745.375, "epoch": 0.7795762062803165, "gen_logits_max": 3.9760401248931885, "gen_logits_mean": -15.318493843078613, "gen_logits_min": -27.588977813720703, "gen_logits_std": 2.8123860359191895, "gen_loss": 0.2771033048629761, "grad_norm": 0.5346641995692047, "learning_rate": 2.610821052631579e-05, "loss": 0.3149, "mean_copy_accuracy": 0.992895245552063, "mean_gen_accuracy": 0.8637086451053619, "mean_token_accuracy": 0.8970472067594528, "num_tokens": 1034523118.0, "sample_num_tokens": 9945.0, "step": 3817, "total_num_tokens": 1034562898.0, "z_loss": 0.0005799639038741589 }, { "copy_logits_max": -1.055047869682312, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.375, "epoch": 0.7797804442175134, "gen_logits_max": 5.055039405822754, "gen_logits_mean": -15.0973539352417, "gen_logits_min": -27.280485153198242, "gen_logits_std": 2.798482894897461, "gen_loss": 0.29146280884742737, "grad_norm": 0.4373862013960494, "learning_rate": 2.6106947368421054e-05, "loss": 0.296, "mean_copy_accuracy": 0.9952085018157959, "mean_gen_accuracy": 0.8722139000892639, "mean_token_accuracy": 0.9001878201961517, "num_tokens": 1034799551.0, "sample_num_tokens": 7529.75, "step": 3818, "total_num_tokens": 1034829670.0, "z_loss": 0.0006386801251210272 }, { "copy_logits_max": 3.025381565093994, "copy_logits_min": -750000128.0, "copy_num_tokens": 619.1875, "epoch": 0.7799846821547103, "gen_logits_max": 5.079973220825195, "gen_logits_mean": -13.739659309387207, "gen_logits_min": -25.85049057006836, "gen_logits_std": 2.796114444732666, "gen_loss": 0.293098509311676, "grad_norm": 0.4228571634734509, "learning_rate": 2.6105684210526315e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9957824349403381, "mean_gen_accuracy": 0.8596891611814499, "mean_token_accuracy": 0.8932417780160904, "num_tokens": 1035076507.0, "sample_num_tokens": 9868.25, "step": 3819, "total_num_tokens": 1035115980.0, "z_loss": 0.0006794697255827487 }, { "copy_logits_max": 0.42098963260650635, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.75, "epoch": 0.7801889200919071, "gen_logits_max": 4.391140460968018, "gen_logits_mean": -14.85041332244873, "gen_logits_min": -27.12221908569336, "gen_logits_std": 2.818321466445923, "gen_loss": 0.3037973642349243, "grad_norm": 0.4248220000113583, "learning_rate": 2.610442105263158e-05, "loss": 0.3132, "mean_copy_accuracy": 0.9951614439487457, "mean_gen_accuracy": 0.861359179019928, "mean_token_accuracy": 0.897511214017868, "num_tokens": 1035356167.0, "sample_num_tokens": 8933.25, "step": 3820, "total_num_tokens": 1035391900.0, "z_loss": 0.0007022747304290533 }, { "copy_logits_max": -1.9256559610366821, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.5625, "epoch": 0.7803931580291039, "gen_logits_max": 5.995295524597168, "gen_logits_mean": -13.82241439819336, "gen_logits_min": -25.828149795532227, "gen_logits_std": 2.773283004760742, "gen_loss": 0.3319571614265442, "grad_norm": 0.4530806870283877, "learning_rate": 2.6103157894736843e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9941582232713699, "mean_gen_accuracy": 0.8699408620595932, "mean_token_accuracy": 0.8940183520317078, "num_tokens": 1035629765.0, "sample_num_tokens": 7861.75, "step": 3821, "total_num_tokens": 1035661212.0, "z_loss": 0.0006645397515967488 }, { "copy_logits_max": -2.593393087387085, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.875, "epoch": 0.7805973959663007, "gen_logits_max": 4.859016418457031, "gen_logits_mean": -15.34552001953125, "gen_logits_min": -27.117023468017578, "gen_logits_std": 2.766488790512085, "gen_loss": 0.27637240290641785, "grad_norm": 0.4163735717693859, "learning_rate": 2.6101894736842108e-05, "loss": 0.3122, "mean_copy_accuracy": 0.994454488158226, "mean_gen_accuracy": 0.8680332899093628, "mean_token_accuracy": 0.8971459269523621, "num_tokens": 1035874374.0, "sample_num_tokens": 7709.5, "step": 3822, "total_num_tokens": 1035905212.0, "z_loss": 0.0005816256161779165 }, { "copy_logits_max": 2.7153053283691406, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.8125, "epoch": 0.7808016339034975, "gen_logits_max": 4.512823581695557, "gen_logits_mean": -14.166677474975586, "gen_logits_min": -26.694934844970703, "gen_logits_std": 2.811690330505371, "gen_loss": 0.31082797050476074, "grad_norm": 0.4304964566634589, "learning_rate": 2.610063157894737e-05, "loss": 0.3002, "mean_copy_accuracy": 0.994180828332901, "mean_gen_accuracy": 0.8686765879392624, "mean_token_accuracy": 0.900095522403717, "num_tokens": 1036145390.0, "sample_num_tokens": 9105.0, "step": 3823, "total_num_tokens": 1036181810.0, "z_loss": 0.0006967525696381927 }, { "copy_logits_max": 0.04965925216674805, "copy_logits_min": -750000000.0, "copy_num_tokens": 640.125, "epoch": 0.7810058718406944, "gen_logits_max": 4.801401138305664, "gen_logits_mean": -13.369645118713379, "gen_logits_min": -25.476852416992188, "gen_logits_std": 2.7996625900268555, "gen_loss": 0.3219895362854004, "grad_norm": 0.43704483194616195, "learning_rate": 2.6099368421052633e-05, "loss": 0.3119, "mean_copy_accuracy": 0.9959789514541626, "mean_gen_accuracy": 0.8602918684482574, "mean_token_accuracy": 0.8956611752510071, "num_tokens": 1036427718.0, "sample_num_tokens": 10709.5, "step": 3824, "total_num_tokens": 1036470556.0, "z_loss": 0.0006685061380267143 }, { "copy_logits_max": 5.9469499588012695, "copy_logits_min": -687500032.0, "copy_num_tokens": 696.0625, "epoch": 0.7812101097778913, "gen_logits_max": 4.480457305908203, "gen_logits_mean": -13.504465103149414, "gen_logits_min": -25.682815551757812, "gen_logits_std": 2.7830862998962402, "gen_loss": 0.24930131435394287, "grad_norm": 0.41384615254004786, "learning_rate": 2.6098105263157894e-05, "loss": 0.31, "mean_copy_accuracy": 0.9955134987831116, "mean_gen_accuracy": 0.8663964569568634, "mean_token_accuracy": 0.895625501871109, "num_tokens": 1036689696.0, "sample_num_tokens": 9263.0, "step": 3825, "total_num_tokens": 1036726748.0, "z_loss": 0.0005274676368571818 }, { "copy_logits_max": -1.43988037109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.5625, "epoch": 0.7814143477150881, "gen_logits_max": 4.801808834075928, "gen_logits_mean": -15.144097328186035, "gen_logits_min": -26.910594940185547, "gen_logits_std": 2.7652788162231445, "gen_loss": 0.33960387110710144, "grad_norm": 0.42399930460310786, "learning_rate": 2.6096842105263158e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9943960607051849, "mean_gen_accuracy": 0.863070085644722, "mean_token_accuracy": 0.8931841850280762, "num_tokens": 1036956338.0, "sample_num_tokens": 8361.5, "step": 3826, "total_num_tokens": 1036989784.0, "z_loss": 0.0006380752893164754 }, { "copy_logits_max": 1.8853483200073242, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.9375, "epoch": 0.7816185856522849, "gen_logits_max": 5.667360305786133, "gen_logits_mean": -14.436293601989746, "gen_logits_min": -26.269317626953125, "gen_logits_std": 2.7446436882019043, "gen_loss": 0.3763982653617859, "grad_norm": 0.40149262140236025, "learning_rate": 2.609557894736842e-05, "loss": 0.3336, "mean_copy_accuracy": 0.995742991566658, "mean_gen_accuracy": 0.860490620136261, "mean_token_accuracy": 0.8898456245660782, "num_tokens": 1037225742.0, "sample_num_tokens": 7257.0, "step": 3827, "total_num_tokens": 1037254770.0, "z_loss": 0.0007080014329403639 }, { "copy_logits_max": 1.8655738830566406, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.1875, "epoch": 0.7818228235894817, "gen_logits_max": 4.970452308654785, "gen_logits_mean": -14.143542289733887, "gen_logits_min": -26.13199234008789, "gen_logits_std": 2.7691807746887207, "gen_loss": 0.31692785024642944, "grad_norm": 0.4993986225332287, "learning_rate": 2.6094315789473683e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9932263344526291, "mean_gen_accuracy": 0.8697674572467804, "mean_token_accuracy": 0.8986760079860687, "num_tokens": 1037482907.0, "sample_num_tokens": 7397.75, "step": 3828, "total_num_tokens": 1037512498.0, "z_loss": 0.0006166407256387174 }, { "copy_logits_max": -0.03734655678272247, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.5625, "epoch": 0.7820270615266786, "gen_logits_max": 5.362353324890137, "gen_logits_mean": -14.913837432861328, "gen_logits_min": -26.519309997558594, "gen_logits_std": 2.7496180534362793, "gen_loss": 0.3425094187259674, "grad_norm": 0.4078425033644683, "learning_rate": 2.6093052631578948e-05, "loss": 0.3235, "mean_copy_accuracy": 0.9953130185604095, "mean_gen_accuracy": 0.8619348406791687, "mean_token_accuracy": 0.8917754739522934, "num_tokens": 1037742455.0, "sample_num_tokens": 7981.25, "step": 3829, "total_num_tokens": 1037774380.0, "z_loss": 0.000710439111571759 }, { "copy_logits_max": 3.0200140476226807, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.4375, "epoch": 0.7822312994638754, "gen_logits_max": 6.095973014831543, "gen_logits_mean": -12.546829223632812, "gen_logits_min": -25.100963592529297, "gen_logits_std": 2.8190689086914062, "gen_loss": 0.2781761884689331, "grad_norm": 0.4025924173477559, "learning_rate": 2.6091789473684212e-05, "loss": 0.3011, "mean_copy_accuracy": 0.996087059378624, "mean_gen_accuracy": 0.8675128072500229, "mean_token_accuracy": 0.8998953104019165, "num_tokens": 1038006482.0, "sample_num_tokens": 10562.5, "step": 3830, "total_num_tokens": 1038048732.0, "z_loss": 0.000659453624393791 }, { "copy_logits_max": 5.381075859069824, "copy_logits_min": -750000064.0, "copy_num_tokens": 625.375, "epoch": 0.7824355374010723, "gen_logits_max": 5.121764183044434, "gen_logits_mean": -13.126951217651367, "gen_logits_min": -25.337717056274414, "gen_logits_std": 2.832939863204956, "gen_loss": 0.2912612557411194, "grad_norm": 0.45516545254007906, "learning_rate": 2.6090526315789476e-05, "loss": 0.3307, "mean_copy_accuracy": 0.9943236857652664, "mean_gen_accuracy": 0.8586945831775665, "mean_token_accuracy": 0.8903319388628006, "num_tokens": 1038275863.0, "sample_num_tokens": 8776.75, "step": 3831, "total_num_tokens": 1038310970.0, "z_loss": 0.000672902271617204 }, { "copy_logits_max": 2.836900234222412, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.125, "epoch": 0.7826397753382691, "gen_logits_max": 6.0736846923828125, "gen_logits_mean": -13.527976036071777, "gen_logits_min": -25.964231491088867, "gen_logits_std": 2.7912473678588867, "gen_loss": 0.2957566976547241, "grad_norm": 0.38967112303648915, "learning_rate": 2.6089263157894737e-05, "loss": 0.2977, "mean_copy_accuracy": 0.9951860308647156, "mean_gen_accuracy": 0.8739464432001114, "mean_token_accuracy": 0.902100458741188, "num_tokens": 1038540483.0, "sample_num_tokens": 8480.25, "step": 3832, "total_num_tokens": 1038574404.0, "z_loss": 0.000643144128844142 }, { "copy_logits_max": -0.7908471822738647, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.875, "epoch": 0.7828440132754659, "gen_logits_max": 5.891989707946777, "gen_logits_mean": -14.052013397216797, "gen_logits_min": -25.751201629638672, "gen_logits_std": 2.7471330165863037, "gen_loss": 0.32132166624069214, "grad_norm": 0.389153322063165, "learning_rate": 2.6088e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9957944452762604, "mean_gen_accuracy": 0.8671001940965652, "mean_token_accuracy": 0.8960310965776443, "num_tokens": 1038810044.0, "sample_num_tokens": 9316.5, "step": 3833, "total_num_tokens": 1038847310.0, "z_loss": 0.0006860960274934769 }, { "copy_logits_max": 3.071298360824585, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.125, "epoch": 0.7830482512126627, "gen_logits_max": 6.07254695892334, "gen_logits_mean": -13.448018074035645, "gen_logits_min": -25.397579193115234, "gen_logits_std": 2.7968926429748535, "gen_loss": 0.28163066506385803, "grad_norm": 0.40966383186738076, "learning_rate": 2.6086736842105263e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9946861416101456, "mean_gen_accuracy": 0.8691684752702713, "mean_token_accuracy": 0.8990633487701416, "num_tokens": 1039086077.0, "sample_num_tokens": 9257.75, "step": 3834, "total_num_tokens": 1039123108.0, "z_loss": 0.0006166910752654076 }, { "copy_logits_max": 0.49593377113342285, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.6875, "epoch": 0.7832524891498596, "gen_logits_max": 4.876388072967529, "gen_logits_mean": -14.807777404785156, "gen_logits_min": -27.027942657470703, "gen_logits_std": 2.8047022819519043, "gen_loss": 0.27373984456062317, "grad_norm": 0.37447614924730777, "learning_rate": 2.6085473684210527e-05, "loss": 0.2829, "mean_copy_accuracy": 0.9956614524126053, "mean_gen_accuracy": 0.8704476356506348, "mean_token_accuracy": 0.905981257557869, "num_tokens": 1039383085.0, "sample_num_tokens": 8218.75, "step": 3835, "total_num_tokens": 1039415960.0, "z_loss": 0.0006997009040787816 }, { "copy_logits_max": -3.444967746734619, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.4375, "epoch": 0.7834567270870564, "gen_logits_max": 4.43848180770874, "gen_logits_mean": -17.118541717529297, "gen_logits_min": -28.60759162902832, "gen_logits_std": 2.759939670562744, "gen_loss": 0.27926507592201233, "grad_norm": 0.3942107504987642, "learning_rate": 2.6084210526315788e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9955453872680664, "mean_gen_accuracy": 0.8771241456270218, "mean_token_accuracy": 0.9093737304210663, "num_tokens": 1039672499.0, "sample_num_tokens": 7605.75, "step": 3836, "total_num_tokens": 1039702922.0, "z_loss": 0.000620522303506732 }, { "copy_logits_max": 0.040539562702178955, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.4375, "epoch": 0.7836609650242533, "gen_logits_max": 5.080075263977051, "gen_logits_mean": -14.533164978027344, "gen_logits_min": -26.453847885131836, "gen_logits_std": 2.7710394859313965, "gen_loss": 0.32645392417907715, "grad_norm": 0.7624302358351713, "learning_rate": 2.6082947368421055e-05, "loss": 0.3296, "mean_copy_accuracy": 0.995039314031601, "mean_gen_accuracy": 0.857993483543396, "mean_token_accuracy": 0.8894139379262924, "num_tokens": 1039940395.0, "sample_num_tokens": 7379.25, "step": 3837, "total_num_tokens": 1039969912.0, "z_loss": 0.0006864755414426327 }, { "copy_logits_max": -0.09710371494293213, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.375, "epoch": 0.7838652029614501, "gen_logits_max": 5.172699928283691, "gen_logits_mean": -14.843685150146484, "gen_logits_min": -26.78988265991211, "gen_logits_std": 2.7888236045837402, "gen_loss": 0.3219109773635864, "grad_norm": 0.4658131586281897, "learning_rate": 2.6081684210526316e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9941625595092773, "mean_gen_accuracy": 0.8612380176782608, "mean_token_accuracy": 0.8908318132162094, "num_tokens": 1040177589.0, "sample_num_tokens": 7969.75, "step": 3838, "total_num_tokens": 1040209468.0, "z_loss": 0.0006853927625343204 }, { "copy_logits_max": 1.673011064529419, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5625, "epoch": 0.7840694408986469, "gen_logits_max": 5.555948257446289, "gen_logits_mean": -13.604534149169922, "gen_logits_min": -25.734397888183594, "gen_logits_std": 2.772967576980591, "gen_loss": 0.3185257315635681, "grad_norm": 0.3779086148503859, "learning_rate": 2.608042105263158e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9958993196487427, "mean_gen_accuracy": 0.8656595796346664, "mean_token_accuracy": 0.8983164876699448, "num_tokens": 1040451001.0, "sample_num_tokens": 8524.25, "step": 3839, "total_num_tokens": 1040485098.0, "z_loss": 0.0006487664068117738 }, { "copy_logits_max": 3.5470831394195557, "copy_logits_min": -750000064.0, "copy_num_tokens": 510.0, "epoch": 0.7842736788358438, "gen_logits_max": 5.022558212280273, "gen_logits_mean": -13.777669906616211, "gen_logits_min": -26.286239624023438, "gen_logits_std": 2.831714630126953, "gen_loss": 0.32385730743408203, "grad_norm": 0.42634951261650245, "learning_rate": 2.607915789473684e-05, "loss": 0.31, "mean_copy_accuracy": 0.9969730973243713, "mean_gen_accuracy": 0.8580902814865112, "mean_token_accuracy": 0.8983190804719925, "num_tokens": 1040721336.0, "sample_num_tokens": 7718.0, "step": 3840, "total_num_tokens": 1040752208.0, "z_loss": 0.0007610609754920006 }, { "copy_logits_max": 2.294961452484131, "copy_logits_min": -750000064.0, "copy_num_tokens": 397.4375, "epoch": 0.7844779167730406, "gen_logits_max": 6.208807468414307, "gen_logits_mean": -14.224971771240234, "gen_logits_min": -26.38425064086914, "gen_logits_std": 2.8214778900146484, "gen_loss": 0.3241122364997864, "grad_norm": 0.5595980896706934, "learning_rate": 2.6077894736842106e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9934100806713104, "mean_gen_accuracy": 0.8620326519012451, "mean_token_accuracy": 0.8959806710481644, "num_tokens": 1040996788.0, "sample_num_tokens": 7310.0, "step": 3841, "total_num_tokens": 1041026028.0, "z_loss": 0.0007300606230273843 }, { "copy_logits_max": -0.8687624931335449, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.3125, "epoch": 0.7846821547102374, "gen_logits_max": 4.861971855163574, "gen_logits_mean": -15.492013931274414, "gen_logits_min": -26.985153198242188, "gen_logits_std": 2.745806932449341, "gen_loss": 0.2950751781463623, "grad_norm": 0.3721428931576, "learning_rate": 2.6076631578947367e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9954977929592133, "mean_gen_accuracy": 0.8678099513053894, "mean_token_accuracy": 0.900738075375557, "num_tokens": 1041292583.0, "sample_num_tokens": 7243.25, "step": 3842, "total_num_tokens": 1041321556.0, "z_loss": 0.000648530840408057 }, { "copy_logits_max": 1.2687735557556152, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.25, "epoch": 0.7848863926474342, "gen_logits_max": 4.674283981323242, "gen_logits_mean": -14.16020679473877, "gen_logits_min": -25.88836669921875, "gen_logits_std": 2.7402901649475098, "gen_loss": 0.30511122941970825, "grad_norm": 0.4097808712130223, "learning_rate": 2.607536842105263e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9949939101934433, "mean_gen_accuracy": 0.8636289983987808, "mean_token_accuracy": 0.8992042541503906, "num_tokens": 1041558510.0, "sample_num_tokens": 9419.0, "step": 3843, "total_num_tokens": 1041596186.0, "z_loss": 0.0006537525914609432 }, { "copy_logits_max": -1.1818875074386597, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.625, "epoch": 0.7850906305846311, "gen_logits_max": 4.452530860900879, "gen_logits_mean": -15.028935432434082, "gen_logits_min": -26.592405319213867, "gen_logits_std": 2.6728334426879883, "gen_loss": 0.3226471543312073, "grad_norm": 0.4121338717152215, "learning_rate": 2.6074105263157895e-05, "loss": 0.3062, "mean_copy_accuracy": 0.9944060295820236, "mean_gen_accuracy": 0.8705271631479263, "mean_token_accuracy": 0.897947832942009, "num_tokens": 1041822532.0, "sample_num_tokens": 7024.5, "step": 3844, "total_num_tokens": 1041850630.0, "z_loss": 0.0006672527524642646 }, { "copy_logits_max": -2.8437790870666504, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.25, "epoch": 0.785294868521828, "gen_logits_max": 5.296844482421875, "gen_logits_mean": -14.859550476074219, "gen_logits_min": -26.330108642578125, "gen_logits_std": 2.71897292137146, "gen_loss": 0.3498619794845581, "grad_norm": 0.48659142159340824, "learning_rate": 2.607284210526316e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9914913773536682, "mean_gen_accuracy": 0.8689824789762497, "mean_token_accuracy": 0.8954420834779739, "num_tokens": 1042070061.0, "sample_num_tokens": 8086.25, "step": 3845, "total_num_tokens": 1042102406.0, "z_loss": 0.0007087030680850148 }, { "copy_logits_max": 0.8464360237121582, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.5, "epoch": 0.7854991064590248, "gen_logits_max": 4.940197944641113, "gen_logits_mean": -14.35889720916748, "gen_logits_min": -26.07782745361328, "gen_logits_std": 2.7566099166870117, "gen_loss": 0.3334517478942871, "grad_norm": 0.4270507917792542, "learning_rate": 2.6071578947368424e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9942411482334137, "mean_gen_accuracy": 0.8652148544788361, "mean_token_accuracy": 0.8978356420993805, "num_tokens": 1042325145.0, "sample_num_tokens": 9159.25, "step": 3846, "total_num_tokens": 1042361782.0, "z_loss": 0.0006992750568315387 }, { "copy_logits_max": 0.5820703506469727, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.3125, "epoch": 0.7857033443962216, "gen_logits_max": 4.7119975090026855, "gen_logits_mean": -14.614645004272461, "gen_logits_min": -26.58540916442871, "gen_logits_std": 2.7469751834869385, "gen_loss": 0.28309178352355957, "grad_norm": 0.4333883966089897, "learning_rate": 2.6070315789473685e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9946134835481644, "mean_gen_accuracy": 0.8676632046699524, "mean_token_accuracy": 0.8964333385229111, "num_tokens": 1042586541.0, "sample_num_tokens": 9113.75, "step": 3847, "total_num_tokens": 1042622996.0, "z_loss": 0.0006557704182341695 }, { "copy_logits_max": -1.9936891794204712, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.875, "epoch": 0.7859075823334184, "gen_logits_max": 6.29439640045166, "gen_logits_mean": -14.355822563171387, "gen_logits_min": -26.098312377929688, "gen_logits_std": 2.747413158416748, "gen_loss": 0.32401156425476074, "grad_norm": 0.41000617869698114, "learning_rate": 2.606905263157895e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9952095448970795, "mean_gen_accuracy": 0.8666024953126907, "mean_token_accuracy": 0.8953307569026947, "num_tokens": 1042874983.0, "sample_num_tokens": 8077.25, "step": 3848, "total_num_tokens": 1042907292.0, "z_loss": 0.0007241795537993312 }, { "copy_logits_max": -2.132404327392578, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.25, "epoch": 0.7861118202706152, "gen_logits_max": 4.565813064575195, "gen_logits_mean": -14.50440502166748, "gen_logits_min": -26.457870483398438, "gen_logits_std": 2.723433017730713, "gen_loss": 0.29772913455963135, "grad_norm": 0.39545545605781174, "learning_rate": 2.606778947368421e-05, "loss": 0.294, "mean_copy_accuracy": 0.9956363290548325, "mean_gen_accuracy": 0.8653066903352737, "mean_token_accuracy": 0.9026868641376495, "num_tokens": 1043164649.0, "sample_num_tokens": 9291.25, "step": 3849, "total_num_tokens": 1043201814.0, "z_loss": 0.0006797899259254336 }, { "copy_logits_max": -1.3605835437774658, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.75, "epoch": 0.7863160582078121, "gen_logits_max": 4.440410137176514, "gen_logits_mean": -14.487619400024414, "gen_logits_min": -26.503921508789062, "gen_logits_std": 2.7211368083953857, "gen_loss": 0.32883620262145996, "grad_norm": 0.4732487735859094, "learning_rate": 2.6066526315789475e-05, "loss": 0.3276, "mean_copy_accuracy": 0.9951634109020233, "mean_gen_accuracy": 0.8561563640832901, "mean_token_accuracy": 0.8915682435035706, "num_tokens": 1043438482.0, "sample_num_tokens": 7775.5, "step": 3850, "total_num_tokens": 1043469584.0, "z_loss": 0.0007212553173303604 }, { "copy_logits_max": -0.8726102113723755, "copy_logits_min": -750000064.0, "copy_num_tokens": 601.625, "epoch": 0.786520296145009, "gen_logits_max": 4.138916969299316, "gen_logits_mean": -14.856128692626953, "gen_logits_min": -26.442195892333984, "gen_logits_std": 2.7217564582824707, "gen_loss": 0.26005852222442627, "grad_norm": 0.8043184727574046, "learning_rate": 2.6065263157894736e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9950141608715057, "mean_gen_accuracy": 0.8677446395158768, "mean_token_accuracy": 0.9018795043230057, "num_tokens": 1043711340.0, "sample_num_tokens": 9373.5, "step": 3851, "total_num_tokens": 1043748834.0, "z_loss": 0.0006165382801555097 }, { "copy_logits_max": -3.02712345123291, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.8125, "epoch": 0.7867245340822058, "gen_logits_max": 5.171789169311523, "gen_logits_mean": -13.512264251708984, "gen_logits_min": -24.679210662841797, "gen_logits_std": 2.6581649780273438, "gen_loss": 0.31567561626434326, "grad_norm": 0.4339447941833231, "learning_rate": 2.6064e-05, "loss": 0.3141, "mean_copy_accuracy": 0.995921865105629, "mean_gen_accuracy": 0.8584279119968414, "mean_token_accuracy": 0.8979265838861465, "num_tokens": 1043981732.0, "sample_num_tokens": 8441.5, "step": 3852, "total_num_tokens": 1044015498.0, "z_loss": 0.0007030558190308511 }, { "copy_logits_max": 1.0895302295684814, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.3125, "epoch": 0.7869287720194026, "gen_logits_max": 4.871065616607666, "gen_logits_mean": -13.187328338623047, "gen_logits_min": -24.705204010009766, "gen_logits_std": 2.7085988521575928, "gen_loss": 0.29432618618011475, "grad_norm": 0.43365894967903373, "learning_rate": 2.6062736842105264e-05, "loss": 0.3284, "mean_copy_accuracy": 0.9942361861467361, "mean_gen_accuracy": 0.8623254597187042, "mean_token_accuracy": 0.8894502222537994, "num_tokens": 1044262806.0, "sample_num_tokens": 9261.0, "step": 3853, "total_num_tokens": 1044299850.0, "z_loss": 0.0006428713677451015 }, { "copy_logits_max": -3.0500216484069824, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.0625, "epoch": 0.7871330099565994, "gen_logits_max": 4.263338088989258, "gen_logits_mean": -15.75052547454834, "gen_logits_min": -27.31049919128418, "gen_logits_std": 2.724536418914795, "gen_loss": 0.33773237466812134, "grad_norm": 0.4259487287460501, "learning_rate": 2.606147368421053e-05, "loss": 0.3321, "mean_copy_accuracy": 0.9939753860235214, "mean_gen_accuracy": 0.8600083291530609, "mean_token_accuracy": 0.8906556814908981, "num_tokens": 1044544869.0, "sample_num_tokens": 8592.75, "step": 3854, "total_num_tokens": 1044579240.0, "z_loss": 0.0006657385965809226 }, { "copy_logits_max": -2.625648260116577, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.5625, "epoch": 0.7873372478937962, "gen_logits_max": 4.778709411621094, "gen_logits_mean": -14.997539520263672, "gen_logits_min": -26.574220657348633, "gen_logits_std": 2.7458534240722656, "gen_loss": 0.3375795781612396, "grad_norm": 0.4776884303610325, "learning_rate": 2.606021052631579e-05, "loss": 0.3181, "mean_copy_accuracy": 0.991479828953743, "mean_gen_accuracy": 0.8664627522230148, "mean_token_accuracy": 0.8931512236595154, "num_tokens": 1044800541.0, "sample_num_tokens": 7858.75, "step": 3855, "total_num_tokens": 1044831976.0, "z_loss": 0.0006374327349476516 }, { "copy_logits_max": -1.11663818359375, "copy_logits_min": -687500032.0, "copy_num_tokens": 595.0625, "epoch": 0.7875414858309931, "gen_logits_max": 3.641282558441162, "gen_logits_mean": -15.689508438110352, "gen_logits_min": -27.53551483154297, "gen_logits_std": 2.7687973976135254, "gen_loss": 0.2962001860141754, "grad_norm": 0.9058742326827848, "learning_rate": 2.6058947368421054e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9943558722734451, "mean_gen_accuracy": 0.8613855540752411, "mean_token_accuracy": 0.8970935791730881, "num_tokens": 1045072395.0, "sample_num_tokens": 9100.75, "step": 3856, "total_num_tokens": 1045108798.0, "z_loss": 0.0007247571484185755 }, { "copy_logits_max": -2.215381145477295, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.4375, "epoch": 0.78774572376819, "gen_logits_max": 5.1799421310424805, "gen_logits_mean": -13.977193832397461, "gen_logits_min": -25.3491268157959, "gen_logits_std": 2.7295777797698975, "gen_loss": 0.32074615359306335, "grad_norm": 0.46952856988470965, "learning_rate": 2.6057684210526318e-05, "loss": 0.3165, "mean_copy_accuracy": 0.9941414147615433, "mean_gen_accuracy": 0.8668860644102097, "mean_token_accuracy": 0.8942054659128189, "num_tokens": 1045359037.0, "sample_num_tokens": 9210.25, "step": 3857, "total_num_tokens": 1045395878.0, "z_loss": 0.0006957432487979531 }, { "copy_logits_max": -1.526412844657898, "copy_logits_min": -687500032.0, "copy_num_tokens": 660.0, "epoch": 0.7879499617053868, "gen_logits_max": 3.6784191131591797, "gen_logits_mean": -15.196681022644043, "gen_logits_min": -27.560195922851562, "gen_logits_std": 2.7659459114074707, "gen_loss": 0.25847041606903076, "grad_norm": 0.44559033466133663, "learning_rate": 2.605642105263158e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9942956566810608, "mean_gen_accuracy": 0.8736818581819534, "mean_token_accuracy": 0.9078429043292999, "num_tokens": 1045665392.0, "sample_num_tokens": 9612.0, "step": 3858, "total_num_tokens": 1045703840.0, "z_loss": 0.0006109144305810332 }, { "copy_logits_max": -1.497738242149353, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.5, "epoch": 0.7881541996425836, "gen_logits_max": 4.399249076843262, "gen_logits_mean": -15.01643180847168, "gen_logits_min": -27.015275955200195, "gen_logits_std": 2.7667276859283447, "gen_loss": 0.2714451849460602, "grad_norm": 0.5486473333270021, "learning_rate": 2.6055157894736843e-05, "loss": 0.2961, "mean_copy_accuracy": 0.993325874209404, "mean_gen_accuracy": 0.8690147399902344, "mean_token_accuracy": 0.901086688041687, "num_tokens": 1045936634.0, "sample_num_tokens": 8693.5, "step": 3859, "total_num_tokens": 1045971408.0, "z_loss": 0.0005924061988480389 }, { "copy_logits_max": -1.0750685930252075, "copy_logits_min": -687500032.0, "copy_num_tokens": 568.0, "epoch": 0.7883584375797804, "gen_logits_max": 4.539741516113281, "gen_logits_mean": -14.628884315490723, "gen_logits_min": -26.387868881225586, "gen_logits_std": 2.7913265228271484, "gen_loss": 0.32138383388519287, "grad_norm": 0.41540522064776747, "learning_rate": 2.6053894736842104e-05, "loss": 0.3239, "mean_copy_accuracy": 0.9937376230955124, "mean_gen_accuracy": 0.8611312210559845, "mean_token_accuracy": 0.8924262672662735, "num_tokens": 1046203467.0, "sample_num_tokens": 9553.25, "step": 3860, "total_num_tokens": 1046241680.0, "z_loss": 0.0007009110413491726 }, { "copy_logits_max": -3.4593844413757324, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.9375, "epoch": 0.7885626755169772, "gen_logits_max": 4.815432548522949, "gen_logits_mean": -15.414913177490234, "gen_logits_min": -26.884750366210938, "gen_logits_std": 2.738609790802002, "gen_loss": 0.34786564111709595, "grad_norm": 0.3947748887204798, "learning_rate": 2.605263157894737e-05, "loss": 0.3375, "mean_copy_accuracy": 0.9937452077865601, "mean_gen_accuracy": 0.8576014041900635, "mean_token_accuracy": 0.8876651227474213, "num_tokens": 1046457396.0, "sample_num_tokens": 7947.5, "step": 3861, "total_num_tokens": 1046489186.0, "z_loss": 0.0006966302171349525 }, { "copy_logits_max": -0.9296956062316895, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.75, "epoch": 0.7887669134541742, "gen_logits_max": 5.482431411743164, "gen_logits_mean": -14.245129585266113, "gen_logits_min": -26.210651397705078, "gen_logits_std": 2.8245112895965576, "gen_loss": 0.3424103856086731, "grad_norm": 0.5505655202420485, "learning_rate": 2.6051368421052633e-05, "loss": 0.3086, "mean_copy_accuracy": 0.993565246462822, "mean_gen_accuracy": 0.8666258007287979, "mean_token_accuracy": 0.8978267908096313, "num_tokens": 1046721541.0, "sample_num_tokens": 10698.75, "step": 3862, "total_num_tokens": 1046764336.0, "z_loss": 0.0007213849457912147 }, { "copy_logits_max": -2.4045960903167725, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.75, "epoch": 0.788971151391371, "gen_logits_max": 4.525420665740967, "gen_logits_mean": -14.726922988891602, "gen_logits_min": -26.5601749420166, "gen_logits_std": 2.8356099128723145, "gen_loss": 0.26014307141304016, "grad_norm": 0.4372800139394115, "learning_rate": 2.6050105263157897e-05, "loss": 0.3145, "mean_copy_accuracy": 0.992840364575386, "mean_gen_accuracy": 0.8682958036661148, "mean_token_accuracy": 0.896464616060257, "num_tokens": 1046982147.0, "sample_num_tokens": 7953.25, "step": 3863, "total_num_tokens": 1047013960.0, "z_loss": 0.0005837579374201596 }, { "copy_logits_max": -2.2070984840393066, "copy_logits_min": -687500032.0, "copy_num_tokens": 567.5625, "epoch": 0.7891753893285678, "gen_logits_max": 4.692893981933594, "gen_logits_mean": -14.642333984375, "gen_logits_min": -26.88093376159668, "gen_logits_std": 2.8499536514282227, "gen_loss": 0.264944851398468, "grad_norm": 0.39693213780396724, "learning_rate": 2.6048842105263158e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9955148100852966, "mean_gen_accuracy": 0.8708882927894592, "mean_token_accuracy": 0.9011943191289902, "num_tokens": 1047253411.0, "sample_num_tokens": 9621.75, "step": 3864, "total_num_tokens": 1047291898.0, "z_loss": 0.0006137220188975334 }, { "copy_logits_max": -2.067335605621338, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.875, "epoch": 0.7893796272657646, "gen_logits_max": 4.017653465270996, "gen_logits_mean": -15.594901084899902, "gen_logits_min": -28.00530242919922, "gen_logits_std": 2.8026204109191895, "gen_loss": 0.31707075238227844, "grad_norm": 0.4276255240228652, "learning_rate": 2.6047578947368422e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9951386004686356, "mean_gen_accuracy": 0.8629968464374542, "mean_token_accuracy": 0.8972319513559341, "num_tokens": 1047537846.0, "sample_num_tokens": 9314.0, "step": 3865, "total_num_tokens": 1047575102.0, "z_loss": 0.0006616575410589576 }, { "copy_logits_max": -0.2096307873725891, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.0625, "epoch": 0.7895838652029614, "gen_logits_max": 4.400291442871094, "gen_logits_mean": -14.205232620239258, "gen_logits_min": -26.388011932373047, "gen_logits_std": 2.812592029571533, "gen_loss": 0.2687246799468994, "grad_norm": 0.4217363071091573, "learning_rate": 2.6046315789473683e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9941115230321884, "mean_gen_accuracy": 0.870794028043747, "mean_token_accuracy": 0.9025780111551285, "num_tokens": 1047808200.0, "sample_num_tokens": 8287.0, "step": 3866, "total_num_tokens": 1047841348.0, "z_loss": 0.0005814576870761812 }, { "copy_logits_max": -4.296329498291016, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.3125, "epoch": 0.7897881031401582, "gen_logits_max": 4.656753063201904, "gen_logits_mean": -15.898545265197754, "gen_logits_min": -27.42353057861328, "gen_logits_std": 2.7861876487731934, "gen_loss": 0.3003668487071991, "grad_norm": 0.39596859155787356, "learning_rate": 2.6045052631578948e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9943997412919998, "mean_gen_accuracy": 0.8698960691690445, "mean_token_accuracy": 0.8985879570245743, "num_tokens": 1048063757.0, "sample_num_tokens": 7631.75, "step": 3867, "total_num_tokens": 1048094284.0, "z_loss": 0.0006322350818663836 }, { "copy_logits_max": 0.7696926593780518, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.125, "epoch": 0.7899923410773552, "gen_logits_max": 3.59761905670166, "gen_logits_mean": -15.226888656616211, "gen_logits_min": -27.298843383789062, "gen_logits_std": 2.8247969150543213, "gen_loss": 0.303034245967865, "grad_norm": 0.42088907691294214, "learning_rate": 2.604378947368421e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9952709972858429, "mean_gen_accuracy": 0.8613952249288559, "mean_token_accuracy": 0.8992074131965637, "num_tokens": 1048341540.0, "sample_num_tokens": 8954.5, "step": 3868, "total_num_tokens": 1048377358.0, "z_loss": 0.000641650112811476 }, { "copy_logits_max": -3.505877733230591, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.75, "epoch": 0.790196579014552, "gen_logits_max": 4.958943843841553, "gen_logits_mean": -15.128463745117188, "gen_logits_min": -26.97215461730957, "gen_logits_std": 2.76859188079834, "gen_loss": 0.34153586626052856, "grad_norm": 0.46140698736012103, "learning_rate": 2.6042526315789473e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9925230294466019, "mean_gen_accuracy": 0.8498398661613464, "mean_token_accuracy": 0.8850418776273727, "num_tokens": 1048609805.0, "sample_num_tokens": 7689.25, "step": 3869, "total_num_tokens": 1048640562.0, "z_loss": 0.000695578521117568 }, { "copy_logits_max": -2.2178282737731934, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.875, "epoch": 0.7904008169517488, "gen_logits_max": 2.6848058700561523, "gen_logits_mean": -17.235807418823242, "gen_logits_min": -29.111482620239258, "gen_logits_std": 2.798811912536621, "gen_loss": 0.30290016531944275, "grad_norm": 0.39610242285809005, "learning_rate": 2.604126315789474e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9935841113328934, "mean_gen_accuracy": 0.858991265296936, "mean_token_accuracy": 0.8943666517734528, "num_tokens": 1048872928.0, "sample_num_tokens": 9924.0, "step": 3870, "total_num_tokens": 1048912624.0, "z_loss": 0.0006523794727399945 }, { "copy_logits_max": -0.9374333620071411, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.5625, "epoch": 0.7906050548889456, "gen_logits_max": 4.253102779388428, "gen_logits_mean": -14.747421264648438, "gen_logits_min": -26.869110107421875, "gen_logits_std": 2.803572416305542, "gen_loss": 0.3246333599090576, "grad_norm": 0.41229254441252833, "learning_rate": 2.604e-05, "loss": 0.3292, "mean_copy_accuracy": 0.9947966039180756, "mean_gen_accuracy": 0.8599231690168381, "mean_token_accuracy": 0.8919124007225037, "num_tokens": 1049150786.0, "sample_num_tokens": 8568.5, "step": 3871, "total_num_tokens": 1049185060.0, "z_loss": 0.0006801147828809917 }, { "copy_logits_max": -0.7027397155761719, "copy_logits_min": -750000064.0, "copy_num_tokens": 548.3125, "epoch": 0.7908092928261424, "gen_logits_max": 5.0412139892578125, "gen_logits_mean": -13.000974655151367, "gen_logits_min": -25.188949584960938, "gen_logits_std": 2.792917013168335, "gen_loss": 0.30303001403808594, "grad_norm": 0.4164907069942826, "learning_rate": 2.6038736842105266e-05, "loss": 0.2817, "mean_copy_accuracy": 0.994618222117424, "mean_gen_accuracy": 0.8689145147800446, "mean_token_accuracy": 0.9047095775604248, "num_tokens": 1049427738.0, "sample_num_tokens": 8460.5, "step": 3872, "total_num_tokens": 1049461580.0, "z_loss": 0.0007561846869066358 }, { "copy_logits_max": -0.5024151802062988, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.0625, "epoch": 0.7910135307633392, "gen_logits_max": 5.186369895935059, "gen_logits_mean": -14.217208862304688, "gen_logits_min": -26.381633758544922, "gen_logits_std": 2.7979719638824463, "gen_loss": 0.28603196144104004, "grad_norm": 0.3868871263202237, "learning_rate": 2.6037473684210527e-05, "loss": 0.304, "mean_copy_accuracy": 0.9952071011066437, "mean_gen_accuracy": 0.8657785505056381, "mean_token_accuracy": 0.8991102576255798, "num_tokens": 1049691661.0, "sample_num_tokens": 9468.25, "step": 3873, "total_num_tokens": 1049729534.0, "z_loss": 0.0006319971289485693 }, { "copy_logits_max": 0.6018475294113159, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.3125, "epoch": 0.7912177687005362, "gen_logits_max": 5.623169898986816, "gen_logits_mean": -13.581676483154297, "gen_logits_min": -26.441970825195312, "gen_logits_std": 2.7639026641845703, "gen_loss": 0.3234543204307556, "grad_norm": 0.4341506929415936, "learning_rate": 2.603621052631579e-05, "loss": 0.2974, "mean_copy_accuracy": 0.9933390021324158, "mean_gen_accuracy": 0.8711423575878143, "mean_token_accuracy": 0.9022295475006104, "num_tokens": 1049962425.0, "sample_num_tokens": 9325.25, "step": 3874, "total_num_tokens": 1049999726.0, "z_loss": 0.0007309885695576668 }, { "copy_logits_max": -2.1130003929138184, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.8125, "epoch": 0.791422006637733, "gen_logits_max": 4.581336975097656, "gen_logits_mean": -15.530397415161133, "gen_logits_min": -27.41197395324707, "gen_logits_std": 2.781186103820801, "gen_loss": 0.2974960505962372, "grad_norm": 0.41286259370269324, "learning_rate": 2.6034947368421052e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9945439994335175, "mean_gen_accuracy": 0.864050418138504, "mean_token_accuracy": 0.8911020904779434, "num_tokens": 1050222439.0, "sample_num_tokens": 8161.25, "step": 3875, "total_num_tokens": 1050255084.0, "z_loss": 0.0006497549475170672 }, { "copy_logits_max": -0.5931479334831238, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.4375, "epoch": 0.7916262445749298, "gen_logits_max": 5.524396896362305, "gen_logits_mean": -14.53913688659668, "gen_logits_min": -26.907764434814453, "gen_logits_std": 2.7810940742492676, "gen_loss": 0.34251701831817627, "grad_norm": 0.3884121532615084, "learning_rate": 2.6033684210526316e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9939906001091003, "mean_gen_accuracy": 0.8760323375463486, "mean_token_accuracy": 0.9029649198055267, "num_tokens": 1050499232.0, "sample_num_tokens": 8147.0, "step": 3876, "total_num_tokens": 1050531820.0, "z_loss": 0.0007277214899659157 }, { "copy_logits_max": 2.1972944736480713, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.4375, "epoch": 0.7918304825121266, "gen_logits_max": 5.697930335998535, "gen_logits_mean": -12.545233726501465, "gen_logits_min": -25.475648880004883, "gen_logits_std": 2.7867848873138428, "gen_loss": 0.32205116748809814, "grad_norm": 0.4092321242804964, "learning_rate": 2.6032421052631577e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9934226125478745, "mean_gen_accuracy": 0.867622971534729, "mean_token_accuracy": 0.8973241895437241, "num_tokens": 1050759676.0, "sample_num_tokens": 8492.0, "step": 3877, "total_num_tokens": 1050793644.0, "z_loss": 0.0006256089545786381 }, { "copy_logits_max": -0.4852753281593323, "copy_logits_min": -625000064.0, "copy_num_tokens": 672.875, "epoch": 0.7920347204493234, "gen_logits_max": 5.868433952331543, "gen_logits_mean": -13.346193313598633, "gen_logits_min": -26.127552032470703, "gen_logits_std": 2.793095827102661, "gen_loss": 0.3066118359565735, "grad_norm": 0.4019799852570181, "learning_rate": 2.6031157894736845e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9949596226215363, "mean_gen_accuracy": 0.8679164797067642, "mean_token_accuracy": 0.9011074006557465, "num_tokens": 1051057384.0, "sample_num_tokens": 11617.0, "step": 3878, "total_num_tokens": 1051103852.0, "z_loss": 0.0006483905599452555 }, { "copy_logits_max": -3.9081082344055176, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.6875, "epoch": 0.7922389583865203, "gen_logits_max": 5.119422912597656, "gen_logits_mean": -14.797593116760254, "gen_logits_min": -26.923274993896484, "gen_logits_std": 2.7615623474121094, "gen_loss": 0.3028270900249481, "grad_norm": 0.40470400238166315, "learning_rate": 2.6029894736842106e-05, "loss": 0.321, "mean_copy_accuracy": 0.995368167757988, "mean_gen_accuracy": 0.8634227216243744, "mean_token_accuracy": 0.8935564905405045, "num_tokens": 1051329950.0, "sample_num_tokens": 7447.0, "step": 3879, "total_num_tokens": 1051359738.0, "z_loss": 0.0006578305619768798 }, { "copy_logits_max": -1.8716751337051392, "copy_logits_min": -687500032.0, "copy_num_tokens": 281.0, "epoch": 0.7924431963237172, "gen_logits_max": 6.416540145874023, "gen_logits_mean": -11.605077743530273, "gen_logits_min": -23.734760284423828, "gen_logits_std": 2.7025225162506104, "gen_loss": 0.3947000205516815, "grad_norm": 0.43046651562492627, "learning_rate": 2.602863157894737e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9940093010663986, "mean_gen_accuracy": 0.8639307916164398, "mean_token_accuracy": 0.894464373588562, "num_tokens": 1051589537.0, "sample_num_tokens": 7265.75, "step": 3880, "total_num_tokens": 1051618600.0, "z_loss": 0.0007834381540305912 }, { "copy_logits_max": -4.3603620529174805, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.0, "epoch": 0.792647434260914, "gen_logits_max": 5.117179870605469, "gen_logits_mean": -15.188335418701172, "gen_logits_min": -26.513648986816406, "gen_logits_std": 2.70501708984375, "gen_loss": 0.32133859395980835, "grad_norm": 0.39527016955564137, "learning_rate": 2.602736842105263e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9951403886079788, "mean_gen_accuracy": 0.8707326203584671, "mean_token_accuracy": 0.9013227969408035, "num_tokens": 1051862027.0, "sample_num_tokens": 8907.25, "step": 3881, "total_num_tokens": 1051897656.0, "z_loss": 0.000667812128085643 }, { "copy_logits_max": -2.7769968509674072, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.25, "epoch": 0.7928516721981108, "gen_logits_max": 5.741408348083496, "gen_logits_mean": -14.904874801635742, "gen_logits_min": -26.963241577148438, "gen_logits_std": 2.7372472286224365, "gen_loss": 0.3253624141216278, "grad_norm": 0.41219420088824404, "learning_rate": 2.6026105263157895e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9952216893434525, "mean_gen_accuracy": 0.8647235035896301, "mean_token_accuracy": 0.8985038846731186, "num_tokens": 1052141310.0, "sample_num_tokens": 8040.5, "step": 3882, "total_num_tokens": 1052173472.0, "z_loss": 0.0007307495106942952 }, { "copy_logits_max": -0.854960560798645, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.3125, "epoch": 0.7930559101353076, "gen_logits_max": 4.292662143707275, "gen_logits_mean": -14.696325302124023, "gen_logits_min": -26.77252960205078, "gen_logits_std": 2.7907936573028564, "gen_loss": 0.28627488017082214, "grad_norm": 0.41365582510511184, "learning_rate": 2.602484210526316e-05, "loss": 0.3154, "mean_copy_accuracy": 0.9955070167779922, "mean_gen_accuracy": 0.8594142645597458, "mean_token_accuracy": 0.8943610191345215, "num_tokens": 1052413088.0, "sample_num_tokens": 9136.5, "step": 3883, "total_num_tokens": 1052449634.0, "z_loss": 0.000693034497089684 }, { "copy_logits_max": -3.406386375427246, "copy_logits_min": -750000000.0, "copy_num_tokens": 223.75, "epoch": 0.7932601480725044, "gen_logits_max": 5.643854141235352, "gen_logits_mean": -14.864460945129395, "gen_logits_min": -26.42898941040039, "gen_logits_std": 2.752537727355957, "gen_loss": 0.34733760356903076, "grad_norm": 0.47915662202211784, "learning_rate": 2.602357894736842e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9938872754573822, "mean_gen_accuracy": 0.8677243292331696, "mean_token_accuracy": 0.8949078619480133, "num_tokens": 1052670132.0, "sample_num_tokens": 6482.5, "step": 3884, "total_num_tokens": 1052696062.0, "z_loss": 0.0007473385194316506 }, { "copy_logits_max": 0.5168448686599731, "copy_logits_min": -687500032.0, "copy_num_tokens": 538.0, "epoch": 0.7934643860097013, "gen_logits_max": 5.239081382751465, "gen_logits_mean": -13.83432388305664, "gen_logits_min": -26.154603958129883, "gen_logits_std": 2.81247615814209, "gen_loss": 0.2783525586128235, "grad_norm": 0.41501025394977153, "learning_rate": 2.6022315789473685e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9935190230607986, "mean_gen_accuracy": 0.8668645620346069, "mean_token_accuracy": 0.8966898769140244, "num_tokens": 1052933743.0, "sample_num_tokens": 8781.75, "step": 3885, "total_num_tokens": 1052968870.0, "z_loss": 0.000642341619823128 }, { "copy_logits_max": -0.2599795460700989, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.125, "epoch": 0.7936686239468982, "gen_logits_max": 4.509923458099365, "gen_logits_mean": -13.991762161254883, "gen_logits_min": -26.550277709960938, "gen_logits_std": 2.8210177421569824, "gen_loss": 0.2928355634212494, "grad_norm": 0.42218964939504017, "learning_rate": 2.602105263157895e-05, "loss": 0.307, "mean_copy_accuracy": 0.9930287152528763, "mean_gen_accuracy": 0.8654244244098663, "mean_token_accuracy": 0.897857055068016, "num_tokens": 1053207873.0, "sample_num_tokens": 8690.25, "step": 3886, "total_num_tokens": 1053242634.0, "z_loss": 0.000664223509375006 }, { "copy_logits_max": -3.10455060005188, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.5625, "epoch": 0.793872861884095, "gen_logits_max": 5.478731155395508, "gen_logits_mean": -14.336807250976562, "gen_logits_min": -26.23572540283203, "gen_logits_std": 2.7717723846435547, "gen_loss": 0.2992945909500122, "grad_norm": 0.4054542766945078, "learning_rate": 2.6019789473684213e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9964339882135391, "mean_gen_accuracy": 0.8671865612268448, "mean_token_accuracy": 0.9005427658557892, "num_tokens": 1053470407.0, "sample_num_tokens": 8792.25, "step": 3887, "total_num_tokens": 1053505576.0, "z_loss": 0.0006914712721481919 }, { "copy_logits_max": 1.3359230756759644, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5625, "epoch": 0.7940770998212918, "gen_logits_max": 5.535531997680664, "gen_logits_mean": -13.16733169555664, "gen_logits_min": -25.53586196899414, "gen_logits_std": 2.814345598220825, "gen_loss": 0.2993488609790802, "grad_norm": 0.44405776494941335, "learning_rate": 2.6018526315789474e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9946164786815643, "mean_gen_accuracy": 0.8628599792718887, "mean_token_accuracy": 0.8950834721326828, "num_tokens": 1053741251.0, "sample_num_tokens": 7535.75, "step": 3888, "total_num_tokens": 1053771394.0, "z_loss": 0.0006695638876408339 }, { "copy_logits_max": -3.652862071990967, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.875, "epoch": 0.7942813377584886, "gen_logits_max": 5.3896894454956055, "gen_logits_mean": -13.882787704467773, "gen_logits_min": -26.076650619506836, "gen_logits_std": 2.7695136070251465, "gen_loss": 0.30765804648399353, "grad_norm": 0.397107640645682, "learning_rate": 2.601726315789474e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9944688826799393, "mean_gen_accuracy": 0.8708659410476685, "mean_token_accuracy": 0.8996440470218658, "num_tokens": 1054016983.0, "sample_num_tokens": 7402.75, "step": 3889, "total_num_tokens": 1054046594.0, "z_loss": 0.0005883878911845386 }, { "copy_logits_max": -5.0331220626831055, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.25, "epoch": 0.7944855756956855, "gen_logits_max": 4.833309173583984, "gen_logits_mean": -15.152596473693848, "gen_logits_min": -27.152034759521484, "gen_logits_std": 2.7820944786071777, "gen_loss": 0.3036612272262573, "grad_norm": 0.424164380778969, "learning_rate": 2.6016e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9941037446260452, "mean_gen_accuracy": 0.8697082996368408, "mean_token_accuracy": 0.8985331654548645, "num_tokens": 1054285381.0, "sample_num_tokens": 9374.25, "step": 3890, "total_num_tokens": 1054322878.0, "z_loss": 0.0005748760304413736 }, { "copy_logits_max": -0.9045048356056213, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.5, "epoch": 0.7946898136328823, "gen_logits_max": 4.720985412597656, "gen_logits_mean": -13.861433982849121, "gen_logits_min": -26.220901489257812, "gen_logits_std": 2.799290895462036, "gen_loss": 0.31327545642852783, "grad_norm": 0.4286733427980671, "learning_rate": 2.6014736842105264e-05, "loss": 0.3223, "mean_copy_accuracy": 0.9949227720499039, "mean_gen_accuracy": 0.857927143573761, "mean_token_accuracy": 0.8922661691904068, "num_tokens": 1054553378.0, "sample_num_tokens": 8056.5, "step": 3891, "total_num_tokens": 1054585604.0, "z_loss": 0.0005837164935655892 }, { "copy_logits_max": -1.0730810165405273, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.875, "epoch": 0.7948940515700792, "gen_logits_max": 4.8520965576171875, "gen_logits_mean": -14.429483413696289, "gen_logits_min": -26.48801612854004, "gen_logits_std": 2.8000550270080566, "gen_loss": 0.3206188678741455, "grad_norm": 0.46787988894972354, "learning_rate": 2.6013473684210525e-05, "loss": 0.323, "mean_copy_accuracy": 0.9929320961236954, "mean_gen_accuracy": 0.8629118204116821, "mean_token_accuracy": 0.8932062387466431, "num_tokens": 1054808663.0, "sample_num_tokens": 8586.75, "step": 3892, "total_num_tokens": 1054843010.0, "z_loss": 0.0005772329168394208 }, { "copy_logits_max": -2.3972387313842773, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.25, "epoch": 0.795098289507276, "gen_logits_max": 5.455864429473877, "gen_logits_mean": -14.204805374145508, "gen_logits_min": -25.73810577392578, "gen_logits_std": 2.7779111862182617, "gen_loss": 0.2870001196861267, "grad_norm": 0.4307800632323272, "learning_rate": 2.601221052631579e-05, "loss": 0.2966, "mean_copy_accuracy": 0.9944528043270111, "mean_gen_accuracy": 0.8782100528478622, "mean_token_accuracy": 0.9028327018022537, "num_tokens": 1055079833.0, "sample_num_tokens": 9056.75, "step": 3893, "total_num_tokens": 1055116060.0, "z_loss": 0.0005947081372141838 }, { "copy_logits_max": -3.2480380535125732, "copy_logits_min": -687500096.0, "copy_num_tokens": 609.5625, "epoch": 0.7953025274444728, "gen_logits_max": 4.634215354919434, "gen_logits_mean": -14.775068283081055, "gen_logits_min": -26.85673713684082, "gen_logits_std": 2.797036647796631, "gen_loss": 0.28753718733787537, "grad_norm": 0.47479653447796855, "learning_rate": 2.6010947368421053e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9933765530586243, "mean_gen_accuracy": 0.8681237250566483, "mean_token_accuracy": 0.8973930329084396, "num_tokens": 1055339314.0, "sample_num_tokens": 9491.5, "step": 3894, "total_num_tokens": 1055377280.0, "z_loss": 0.0005835698684677482 }, { "copy_logits_max": -2.500168800354004, "copy_logits_min": -687500032.0, "copy_num_tokens": 346.875, "epoch": 0.7955067653816696, "gen_logits_max": 5.220639705657959, "gen_logits_mean": -14.760321617126465, "gen_logits_min": -26.548797607421875, "gen_logits_std": 2.740722417831421, "gen_loss": 0.30373987555503845, "grad_norm": 0.40369024839315415, "learning_rate": 2.6009684210526318e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9952794909477234, "mean_gen_accuracy": 0.8694133460521698, "mean_token_accuracy": 0.8954218178987503, "num_tokens": 1055602395.0, "sample_num_tokens": 8515.75, "step": 3895, "total_num_tokens": 1055636458.0, "z_loss": 0.0005789335118606687 }, { "copy_logits_max": -0.8452197909355164, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.4375, "epoch": 0.7957110033188665, "gen_logits_max": 4.614407539367676, "gen_logits_mean": -14.153645515441895, "gen_logits_min": -26.247310638427734, "gen_logits_std": 2.8052499294281006, "gen_loss": 0.30599087476730347, "grad_norm": 0.42278182209287796, "learning_rate": 2.600842105263158e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9936372339725494, "mean_gen_accuracy": 0.8683812767267227, "mean_token_accuracy": 0.8979471921920776, "num_tokens": 1055864398.0, "sample_num_tokens": 9247.5, "step": 3896, "total_num_tokens": 1055901388.0, "z_loss": 0.0006280624074861407 }, { "copy_logits_max": -2.587038516998291, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.875, "epoch": 0.7959152412560633, "gen_logits_max": 5.210806846618652, "gen_logits_mean": -13.652139663696289, "gen_logits_min": -25.643352508544922, "gen_logits_std": 2.8177051544189453, "gen_loss": 0.2942546606063843, "grad_norm": 0.4538520153173893, "learning_rate": 2.6007157894736843e-05, "loss": 0.3136, "mean_copy_accuracy": 0.9932595193386078, "mean_gen_accuracy": 0.8634251356124878, "mean_token_accuracy": 0.896709755063057, "num_tokens": 1056135628.0, "sample_num_tokens": 8350.5, "step": 3897, "total_num_tokens": 1056169030.0, "z_loss": 0.0006133858114480972 }, { "copy_logits_max": -2.4411978721618652, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.625, "epoch": 0.7961194791932601, "gen_logits_max": 4.508195877075195, "gen_logits_mean": -15.37809944152832, "gen_logits_min": -27.248653411865234, "gen_logits_std": 2.7735586166381836, "gen_loss": 0.33228105306625366, "grad_norm": 0.4239337048321148, "learning_rate": 2.6005894736842107e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9939000308513641, "mean_gen_accuracy": 0.8677344918251038, "mean_token_accuracy": 0.9009713232517242, "num_tokens": 1056407016.0, "sample_num_tokens": 9342.0, "step": 3898, "total_num_tokens": 1056444384.0, "z_loss": 0.0006669910508207977 }, { "copy_logits_max": -0.3216080963611603, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.6875, "epoch": 0.796323717130457, "gen_logits_max": 5.07198429107666, "gen_logits_mean": -13.487741470336914, "gen_logits_min": -26.54707908630371, "gen_logits_std": 2.8160228729248047, "gen_loss": 0.322582870721817, "grad_norm": 0.42948188671776816, "learning_rate": 2.6004631578947368e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9950732737779617, "mean_gen_accuracy": 0.8594054579734802, "mean_token_accuracy": 0.892299085855484, "num_tokens": 1056682709.0, "sample_num_tokens": 9525.25, "step": 3899, "total_num_tokens": 1056720810.0, "z_loss": 0.0007996504427865148 }, { "copy_logits_max": 0.4313505291938782, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.5625, "epoch": 0.7965279550676538, "gen_logits_max": 5.633674621582031, "gen_logits_mean": -13.736490249633789, "gen_logits_min": -25.828474044799805, "gen_logits_std": 2.7898426055908203, "gen_loss": 0.36221015453338623, "grad_norm": 0.4491574664896217, "learning_rate": 2.6003368421052633e-05, "loss": 0.3452, "mean_copy_accuracy": 0.9951375275850296, "mean_gen_accuracy": 0.8502168208360672, "mean_token_accuracy": 0.8883271813392639, "num_tokens": 1056962001.0, "sample_num_tokens": 8082.25, "step": 3900, "total_num_tokens": 1056994330.0, "z_loss": 0.0008581487345509231 }, { "copy_logits_max": -1.6154156923294067, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.1875, "epoch": 0.7967321930048507, "gen_logits_max": 5.30000638961792, "gen_logits_mean": -14.590387344360352, "gen_logits_min": -26.70386505126953, "gen_logits_std": 2.7933669090270996, "gen_loss": 0.3261522650718689, "grad_norm": 0.41575529678245526, "learning_rate": 2.6002105263157893e-05, "loss": 0.3297, "mean_copy_accuracy": 0.9935842156410217, "mean_gen_accuracy": 0.8610008060932159, "mean_token_accuracy": 0.8904902637004852, "num_tokens": 1057220868.0, "sample_num_tokens": 8483.5, "step": 3901, "total_num_tokens": 1057254802.0, "z_loss": 0.0007818315643817186 }, { "copy_logits_max": -1.1009687185287476, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.5, "epoch": 0.7969364309420475, "gen_logits_max": 4.905773162841797, "gen_logits_mean": -13.324098587036133, "gen_logits_min": -25.642112731933594, "gen_logits_std": 2.7617709636688232, "gen_loss": 0.28182700276374817, "grad_norm": 1.034258936659485, "learning_rate": 2.600084210526316e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9941376149654388, "mean_gen_accuracy": 0.8677796125411987, "mean_token_accuracy": 0.8986509591341019, "num_tokens": 1057489591.0, "sample_num_tokens": 8812.75, "step": 3902, "total_num_tokens": 1057524842.0, "z_loss": 0.0006381460116244853 }, { "copy_logits_max": -2.884523391723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.125, "epoch": 0.7971406688792443, "gen_logits_max": 5.482563018798828, "gen_logits_mean": -13.555733680725098, "gen_logits_min": -25.937255859375, "gen_logits_std": 2.7457361221313477, "gen_loss": 0.3348662257194519, "grad_norm": 0.4030921934479121, "learning_rate": 2.5999578947368422e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9943683296442032, "mean_gen_accuracy": 0.8720282912254333, "mean_token_accuracy": 0.898267075419426, "num_tokens": 1057776617.0, "sample_num_tokens": 8404.75, "step": 3903, "total_num_tokens": 1057810236.0, "z_loss": 0.0007172260666266084 }, { "copy_logits_max": -2.297325849533081, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.3125, "epoch": 0.7973449068164411, "gen_logits_max": 5.100858688354492, "gen_logits_mean": -14.002134323120117, "gen_logits_min": -26.4951229095459, "gen_logits_std": 2.8119149208068848, "gen_loss": 0.26898932456970215, "grad_norm": 0.4289234989994649, "learning_rate": 2.5998315789473686e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9938668310642242, "mean_gen_accuracy": 0.8731038570404053, "mean_token_accuracy": 0.9007202386856079, "num_tokens": 1058027584.0, "sample_num_tokens": 7961.0, "step": 3904, "total_num_tokens": 1058059428.0, "z_loss": 0.0006650259019806981 }, { "copy_logits_max": -0.17267680168151855, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.5, "epoch": 0.797549144753638, "gen_logits_max": 4.954146862030029, "gen_logits_mean": -14.36391830444336, "gen_logits_min": -26.46784210205078, "gen_logits_std": 2.7922544479370117, "gen_loss": 0.3239474892616272, "grad_norm": 0.4488637835390103, "learning_rate": 2.5997052631578947e-05, "loss": 0.338, "mean_copy_accuracy": 0.9946076273918152, "mean_gen_accuracy": 0.8576463758945465, "mean_token_accuracy": 0.8895492255687714, "num_tokens": 1058269128.0, "sample_num_tokens": 8610.5, "step": 3905, "total_num_tokens": 1058303570.0, "z_loss": 0.0007195954094640911 }, { "copy_logits_max": -1.0919179916381836, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.4375, "epoch": 0.7977533826908348, "gen_logits_max": 4.62774658203125, "gen_logits_mean": -14.330657958984375, "gen_logits_min": -26.60660171508789, "gen_logits_std": 2.825227975845337, "gen_loss": 0.29709577560424805, "grad_norm": 0.4369360231090119, "learning_rate": 2.599578947368421e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9936614781618118, "mean_gen_accuracy": 0.871191069483757, "mean_token_accuracy": 0.9030117243528366, "num_tokens": 1058544719.0, "sample_num_tokens": 7928.25, "step": 3906, "total_num_tokens": 1058576432.0, "z_loss": 0.0006064216722734272 }, { "copy_logits_max": -4.238718509674072, "copy_logits_min": -687500032.0, "copy_num_tokens": 382.8125, "epoch": 0.7979576206280317, "gen_logits_max": 4.1354265213012695, "gen_logits_mean": -15.685004234313965, "gen_logits_min": -27.534143447875977, "gen_logits_std": 2.7998621463775635, "gen_loss": 0.2640025317668915, "grad_norm": 0.38191276773559724, "learning_rate": 2.5994526315789473e-05, "loss": 0.301, "mean_copy_accuracy": 0.9951391816139221, "mean_gen_accuracy": 0.8679740279912949, "mean_token_accuracy": 0.8991389274597168, "num_tokens": 1058818977.0, "sample_num_tokens": 7850.75, "step": 3907, "total_num_tokens": 1058850380.0, "z_loss": 0.0005457574152387679 }, { "copy_logits_max": -3.365478754043579, "copy_logits_min": -750000064.0, "copy_num_tokens": 593.5625, "epoch": 0.7981618585652285, "gen_logits_max": 4.53325080871582, "gen_logits_mean": -14.348936080932617, "gen_logits_min": -26.517322540283203, "gen_logits_std": 2.774160861968994, "gen_loss": 0.30947181582450867, "grad_norm": 0.36541658110675185, "learning_rate": 2.5993263157894737e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9954870790243149, "mean_gen_accuracy": 0.8630848675966263, "mean_token_accuracy": 0.8996032327413559, "num_tokens": 1059092801.0, "sample_num_tokens": 9011.25, "step": 3908, "total_num_tokens": 1059128846.0, "z_loss": 0.000615912489593029 }, { "copy_logits_max": -2.8699254989624023, "copy_logits_min": -750000064.0, "copy_num_tokens": 537.0625, "epoch": 0.7983660965024253, "gen_logits_max": 4.24352502822876, "gen_logits_mean": -15.829507827758789, "gen_logits_min": -27.630626678466797, "gen_logits_std": 2.778308868408203, "gen_loss": 0.28843116760253906, "grad_norm": 0.37830693092699913, "learning_rate": 2.5991999999999998e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9949058443307877, "mean_gen_accuracy": 0.8706928193569183, "mean_token_accuracy": 0.9007136523723602, "num_tokens": 1059398528.0, "sample_num_tokens": 9354.0, "step": 3909, "total_num_tokens": 1059435944.0, "z_loss": 0.0005998470005579293 }, { "copy_logits_max": -4.867005825042725, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.8125, "epoch": 0.7985703344396221, "gen_logits_max": 4.898064613342285, "gen_logits_mean": -14.966476440429688, "gen_logits_min": -26.746335983276367, "gen_logits_std": 2.7949275970458984, "gen_loss": 0.3010651767253876, "grad_norm": 0.5077303723225802, "learning_rate": 2.5990736842105262e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9956953972578049, "mean_gen_accuracy": 0.8648501038551331, "mean_token_accuracy": 0.8976682871580124, "num_tokens": 1059687971.0, "sample_num_tokens": 8114.75, "step": 3910, "total_num_tokens": 1059720430.0, "z_loss": 0.0006283990805968642 }, { "copy_logits_max": -1.742851734161377, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.4375, "epoch": 0.798774572376819, "gen_logits_max": 4.506160736083984, "gen_logits_mean": -14.686805725097656, "gen_logits_min": -26.906761169433594, "gen_logits_std": 2.803264617919922, "gen_loss": 0.31902995705604553, "grad_norm": 0.3932998603221393, "learning_rate": 2.598947368421053e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9942571818828583, "mean_gen_accuracy": 0.8713379502296448, "mean_token_accuracy": 0.899992898106575, "num_tokens": 1059963076.0, "sample_num_tokens": 8231.5, "step": 3911, "total_num_tokens": 1059996002.0, "z_loss": 0.0006273573962971568 }, { "copy_logits_max": -1.8672664165496826, "copy_logits_min": -750000128.0, "copy_num_tokens": 602.8125, "epoch": 0.7989788103140159, "gen_logits_max": 4.0450239181518555, "gen_logits_mean": -14.648177146911621, "gen_logits_min": -27.062149047851562, "gen_logits_std": 2.8166117668151855, "gen_loss": 0.25852349400520325, "grad_norm": 0.3857221939558331, "learning_rate": 2.598821052631579e-05, "loss": 0.2925, "mean_copy_accuracy": 0.995617151260376, "mean_gen_accuracy": 0.8693388253450394, "mean_token_accuracy": 0.9020767211914062, "num_tokens": 1060227802.0, "sample_num_tokens": 8858.0, "step": 3912, "total_num_tokens": 1060263234.0, "z_loss": 0.0005354784079827368 }, { "copy_logits_max": -1.3271126747131348, "copy_logits_min": -687500032.0, "copy_num_tokens": 653.625, "epoch": 0.7991830482512127, "gen_logits_max": 4.442930221557617, "gen_logits_mean": -13.431022644042969, "gen_logits_min": -25.72503662109375, "gen_logits_std": 2.808114767074585, "gen_loss": 0.2945581078529358, "grad_norm": 0.3854342136324186, "learning_rate": 2.5986947368421055e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9964782297611237, "mean_gen_accuracy": 0.8640166521072388, "mean_token_accuracy": 0.9025651961565018, "num_tokens": 1060517009.0, "sample_num_tokens": 9241.75, "step": 3913, "total_num_tokens": 1060553976.0, "z_loss": 0.0006116771255619824 }, { "copy_logits_max": -2.6572842597961426, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.1875, "epoch": 0.7993872861884095, "gen_logits_max": 4.12445068359375, "gen_logits_mean": -15.046195983886719, "gen_logits_min": -27.603599548339844, "gen_logits_std": 2.7992403507232666, "gen_loss": 0.27874356508255005, "grad_norm": 0.49575531277201496, "learning_rate": 2.5985684210526316e-05, "loss": 0.3189, "mean_copy_accuracy": 0.993442565202713, "mean_gen_accuracy": 0.8659983277320862, "mean_token_accuracy": 0.8938629776239395, "num_tokens": 1060766954.0, "sample_num_tokens": 7074.0, "step": 3914, "total_num_tokens": 1060795250.0, "z_loss": 0.0005536442040465772 }, { "copy_logits_max": 1.782898187637329, "copy_logits_min": -687500032.0, "copy_num_tokens": 526.9375, "epoch": 0.7995915241256063, "gen_logits_max": 4.817895889282227, "gen_logits_mean": -13.87320327758789, "gen_logits_min": -26.208314895629883, "gen_logits_std": 2.808473587036133, "gen_loss": 0.30253326892852783, "grad_norm": 0.42821870055641637, "learning_rate": 2.598442105263158e-05, "loss": 0.3283, "mean_copy_accuracy": 0.9945903271436691, "mean_gen_accuracy": 0.8596161901950836, "mean_token_accuracy": 0.8929071128368378, "num_tokens": 1061017907.0, "sample_num_tokens": 8226.25, "step": 3915, "total_num_tokens": 1061050812.0, "z_loss": 0.0006953802658244967 }, { "copy_logits_max": -3.19309401512146, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.125, "epoch": 0.7997957620628031, "gen_logits_max": 5.694164276123047, "gen_logits_mean": -12.88988208770752, "gen_logits_min": -25.16141128540039, "gen_logits_std": 2.733693838119507, "gen_loss": 0.36962205171585083, "grad_norm": 0.40216442379359524, "learning_rate": 2.598315789473684e-05, "loss": 0.3237, "mean_copy_accuracy": 0.9947167634963989, "mean_gen_accuracy": 0.863179013133049, "mean_token_accuracy": 0.8942181766033173, "num_tokens": 1061283657.0, "sample_num_tokens": 8845.25, "step": 3916, "total_num_tokens": 1061319038.0, "z_loss": 0.0007145958370529115 }, { "copy_logits_max": -1.4235968589782715, "copy_logits_min": -750000064.0, "copy_num_tokens": 469.3125, "epoch": 0.8, "gen_logits_max": 4.913914680480957, "gen_logits_mean": -14.573291778564453, "gen_logits_min": -26.469022750854492, "gen_logits_std": 2.739192485809326, "gen_loss": 0.3532523214817047, "grad_norm": 0.3855341158610749, "learning_rate": 2.5981894736842106e-05, "loss": 0.3144, "mean_copy_accuracy": 0.993887796998024, "mean_gen_accuracy": 0.8683716505765915, "mean_token_accuracy": 0.8960430473089218, "num_tokens": 1061547932.0, "sample_num_tokens": 8810.5, "step": 3917, "total_num_tokens": 1061583174.0, "z_loss": 0.0007103944662958384 }, { "copy_logits_max": -3.2608911991119385, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.3125, "epoch": 0.8002042379371969, "gen_logits_max": 4.545389175415039, "gen_logits_mean": -14.456990242004395, "gen_logits_min": -26.252273559570312, "gen_logits_std": 2.7350616455078125, "gen_loss": 0.33189505338668823, "grad_norm": 0.4057813582009061, "learning_rate": 2.5980631578947366e-05, "loss": 0.322, "mean_copy_accuracy": 0.9945110082626343, "mean_gen_accuracy": 0.8653924465179443, "mean_token_accuracy": 0.8960051536560059, "num_tokens": 1061817949.0, "sample_num_tokens": 8711.25, "step": 3918, "total_num_tokens": 1061852794.0, "z_loss": 0.0006267588469199836 }, { "copy_logits_max": -1.4257351160049438, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.0625, "epoch": 0.8004084758743937, "gen_logits_max": 4.913259506225586, "gen_logits_mean": -13.703133583068848, "gen_logits_min": -25.72775650024414, "gen_logits_std": 2.775730848312378, "gen_loss": 0.31408825516700745, "grad_norm": 0.42279000341359146, "learning_rate": 2.5979368421052634e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9957068115472794, "mean_gen_accuracy": 0.8674424439668655, "mean_token_accuracy": 0.8975166082382202, "num_tokens": 1062085788.0, "sample_num_tokens": 7935.5, "step": 3919, "total_num_tokens": 1062117530.0, "z_loss": 0.0006592906429432333 }, { "copy_logits_max": -3.0567708015441895, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.8125, "epoch": 0.8006127138115905, "gen_logits_max": 5.942751407623291, "gen_logits_mean": -13.382031440734863, "gen_logits_min": -25.13665008544922, "gen_logits_std": 2.7516374588012695, "gen_loss": 0.35070735216140747, "grad_norm": 0.4287016930457681, "learning_rate": 2.5978105263157895e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9933481216430664, "mean_gen_accuracy": 0.867279663681984, "mean_token_accuracy": 0.8988516628742218, "num_tokens": 1062339282.0, "sample_num_tokens": 7239.0, "step": 3920, "total_num_tokens": 1062368238.0, "z_loss": 0.0007310414803214371 }, { "copy_logits_max": -2.500875949859619, "copy_logits_min": -625000064.0, "copy_num_tokens": 592.6875, "epoch": 0.8008169517487873, "gen_logits_max": 4.788841247558594, "gen_logits_mean": -15.081049919128418, "gen_logits_min": -26.908409118652344, "gen_logits_std": 2.768519401550293, "gen_loss": 0.31174153089523315, "grad_norm": 0.37627543930875995, "learning_rate": 2.597684210526316e-05, "loss": 0.2997, "mean_copy_accuracy": 0.9946780949831009, "mean_gen_accuracy": 0.868702694773674, "mean_token_accuracy": 0.9019601345062256, "num_tokens": 1062625399.0, "sample_num_tokens": 10427.75, "step": 3921, "total_num_tokens": 1062667110.0, "z_loss": 0.0007210358744487166 }, { "copy_logits_max": -3.5927774906158447, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.5625, "epoch": 0.8010211896859841, "gen_logits_max": 5.574355602264404, "gen_logits_mean": -12.95273208618164, "gen_logits_min": -24.44927215576172, "gen_logits_std": 2.7303812503814697, "gen_loss": 0.3090679347515106, "grad_norm": 0.4437506694409534, "learning_rate": 2.597557894736842e-05, "loss": 0.3097, "mean_copy_accuracy": 0.994565799832344, "mean_gen_accuracy": 0.8646547347307205, "mean_token_accuracy": 0.8969562649726868, "num_tokens": 1062904270.0, "sample_num_tokens": 8677.5, "step": 3922, "total_num_tokens": 1062938980.0, "z_loss": 0.0005870750173926353 }, { "copy_logits_max": -1.4860210418701172, "copy_logits_min": -687500032.0, "copy_num_tokens": 373.0625, "epoch": 0.801225427623181, "gen_logits_max": 5.106745719909668, "gen_logits_mean": -14.059805870056152, "gen_logits_min": -25.831682205200195, "gen_logits_std": 2.786670207977295, "gen_loss": 0.3069345951080322, "grad_norm": 0.410543037599331, "learning_rate": 2.5974315789473685e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9929294437170029, "mean_gen_accuracy": 0.8635209798812866, "mean_token_accuracy": 0.8919161707162857, "num_tokens": 1063176986.0, "sample_num_tokens": 7425.5, "step": 3923, "total_num_tokens": 1063206688.0, "z_loss": 0.0006500874878838658 }, { "copy_logits_max": -0.9372178912162781, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.9375, "epoch": 0.8014296655603779, "gen_logits_max": 4.9861555099487305, "gen_logits_mean": -14.715195655822754, "gen_logits_min": -26.406234741210938, "gen_logits_std": 2.7838029861450195, "gen_loss": 0.29516565799713135, "grad_norm": 0.4141031139049751, "learning_rate": 2.597305263157895e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9957121759653091, "mean_gen_accuracy": 0.8745053112506866, "mean_token_accuracy": 0.9045647978782654, "num_tokens": 1063452833.0, "sample_num_tokens": 7834.25, "step": 3924, "total_num_tokens": 1063484170.0, "z_loss": 0.0006964891217648983 }, { "copy_logits_max": 0.2619275152683258, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.5625, "epoch": 0.8016339034975747, "gen_logits_max": 4.168331623077393, "gen_logits_mean": -14.675832748413086, "gen_logits_min": -26.591588973999023, "gen_logits_std": 2.76499342918396, "gen_loss": 0.3045286238193512, "grad_norm": 0.4296220393488361, "learning_rate": 2.597178947368421e-05, "loss": 0.3047, "mean_copy_accuracy": 0.9964140951633453, "mean_gen_accuracy": 0.8602704703807831, "mean_token_accuracy": 0.8993673175573349, "num_tokens": 1063707245.0, "sample_num_tokens": 8026.75, "step": 3925, "total_num_tokens": 1063739352.0, "z_loss": 0.0006961204926483333 }, { "copy_logits_max": -1.9226875305175781, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.5, "epoch": 0.8018381414347715, "gen_logits_max": 5.212690353393555, "gen_logits_mean": -14.086105346679688, "gen_logits_min": -26.113330841064453, "gen_logits_std": 2.8150408267974854, "gen_loss": 0.279843270778656, "grad_norm": 0.43813924146391203, "learning_rate": 2.5970526315789474e-05, "loss": 0.3029, "mean_copy_accuracy": 0.994812473654747, "mean_gen_accuracy": 0.8706919848918915, "mean_token_accuracy": 0.8994331806898117, "num_tokens": 1063964345.0, "sample_num_tokens": 7414.75, "step": 3926, "total_num_tokens": 1063994004.0, "z_loss": 0.000592543394304812 }, { "copy_logits_max": -2.92533278465271, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.75, "epoch": 0.8020423793719683, "gen_logits_max": 3.6655123233795166, "gen_logits_mean": -15.87124252319336, "gen_logits_min": -27.76361846923828, "gen_logits_std": 2.802271842956543, "gen_loss": 0.3040228486061096, "grad_norm": 0.5436679141104723, "learning_rate": 2.596926315789474e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9943436533212662, "mean_gen_accuracy": 0.8665809631347656, "mean_token_accuracy": 0.9010873585939407, "num_tokens": 1064246307.0, "sample_num_tokens": 8711.75, "step": 3927, "total_num_tokens": 1064281154.0, "z_loss": 0.00062948017148301 }, { "copy_logits_max": -3.517510175704956, "copy_logits_min": -687500032.0, "copy_num_tokens": 380.0, "epoch": 0.8022466173091651, "gen_logits_max": 5.5421600341796875, "gen_logits_mean": -14.55581283569336, "gen_logits_min": -26.2159423828125, "gen_logits_std": 2.7476863861083984, "gen_loss": 0.3317500352859497, "grad_norm": 0.43508823968338306, "learning_rate": 2.5968000000000003e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9941517263650894, "mean_gen_accuracy": 0.8664358705282211, "mean_token_accuracy": 0.895443394780159, "num_tokens": 1064525181.0, "sample_num_tokens": 8320.25, "step": 3928, "total_num_tokens": 1064558462.0, "z_loss": 0.0006998818716965616 }, { "copy_logits_max": -3.7310280799865723, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.625, "epoch": 0.8024508552463621, "gen_logits_max": 6.336250305175781, "gen_logits_mean": -13.837148666381836, "gen_logits_min": -25.235713958740234, "gen_logits_std": 2.680759906768799, "gen_loss": 0.2849046289920807, "grad_norm": 0.39431745027081566, "learning_rate": 2.5966736842105264e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9959816783666611, "mean_gen_accuracy": 0.8759230375289917, "mean_token_accuracy": 0.9050610810518265, "num_tokens": 1064803513.0, "sample_num_tokens": 6980.75, "step": 3929, "total_num_tokens": 1064831436.0, "z_loss": 0.0006098152953200042 }, { "copy_logits_max": -2.199019432067871, "copy_logits_min": -687500032.0, "copy_num_tokens": 570.4375, "epoch": 0.8026550931835589, "gen_logits_max": 5.895407676696777, "gen_logits_mean": -13.972024917602539, "gen_logits_min": -25.70392608642578, "gen_logits_std": 2.7456088066101074, "gen_loss": 0.27395620942115784, "grad_norm": 0.41944781995895497, "learning_rate": 2.5965473684210528e-05, "loss": 0.3094, "mean_copy_accuracy": 0.9941788613796234, "mean_gen_accuracy": 0.8669904321432114, "mean_token_accuracy": 0.8968095630407333, "num_tokens": 1065064959.0, "sample_num_tokens": 8763.25, "step": 3930, "total_num_tokens": 1065100012.0, "z_loss": 0.0005695526488125324 }, { "copy_logits_max": -2.697441577911377, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.25, "epoch": 0.8028593311207557, "gen_logits_max": 4.4239935874938965, "gen_logits_mean": -15.729759216308594, "gen_logits_min": -28.02922248840332, "gen_logits_std": 2.7942161560058594, "gen_loss": 0.31434494256973267, "grad_norm": 0.4484177952228832, "learning_rate": 2.596421052631579e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9940542429685593, "mean_gen_accuracy": 0.86312635242939, "mean_token_accuracy": 0.8977536857128143, "num_tokens": 1065334697.0, "sample_num_tokens": 8442.25, "step": 3931, "total_num_tokens": 1065368466.0, "z_loss": 0.0006255079060792923 }, { "copy_logits_max": -2.404106616973877, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.1875, "epoch": 0.8030635690579525, "gen_logits_max": 5.461952209472656, "gen_logits_mean": -13.346537590026855, "gen_logits_min": -24.824155807495117, "gen_logits_std": 2.7095327377319336, "gen_loss": 0.3145473301410675, "grad_norm": 0.3805833816337121, "learning_rate": 2.5962947368421053e-05, "loss": 0.298, "mean_copy_accuracy": 0.9954990148544312, "mean_gen_accuracy": 0.867458164691925, "mean_token_accuracy": 0.9010806977748871, "num_tokens": 1065613382.0, "sample_num_tokens": 8903.0, "step": 3932, "total_num_tokens": 1065648994.0, "z_loss": 0.0006900042062625289 }, { "copy_logits_max": -4.383336067199707, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.75, "epoch": 0.8032678069951493, "gen_logits_max": 5.652297019958496, "gen_logits_mean": -14.693320274353027, "gen_logits_min": -26.039695739746094, "gen_logits_std": 2.7527146339416504, "gen_loss": 0.3186314105987549, "grad_norm": 0.41943997438889813, "learning_rate": 2.5961684210526314e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9949570447206497, "mean_gen_accuracy": 0.868397131562233, "mean_token_accuracy": 0.8991170227527618, "num_tokens": 1065895067.0, "sample_num_tokens": 7323.75, "step": 3933, "total_num_tokens": 1065924362.0, "z_loss": 0.0006907698698341846 }, { "copy_logits_max": -4.0654449462890625, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.125, "epoch": 0.8034720449323461, "gen_logits_max": 5.978185176849365, "gen_logits_mean": -13.849400520324707, "gen_logits_min": -25.5159969329834, "gen_logits_std": 2.7665536403656006, "gen_loss": 0.34015730023384094, "grad_norm": 0.42072795590563233, "learning_rate": 2.596042105263158e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9929686188697815, "mean_gen_accuracy": 0.8671886920928955, "mean_token_accuracy": 0.8968485444784164, "num_tokens": 1066158570.0, "sample_num_tokens": 8137.0, "step": 3934, "total_num_tokens": 1066191118.0, "z_loss": 0.0007431927369907498 }, { "copy_logits_max": -0.29632413387298584, "copy_logits_min": -625000000.0, "copy_num_tokens": 715.8125, "epoch": 0.8036762828695431, "gen_logits_max": 4.520026206970215, "gen_logits_mean": -13.6781587600708, "gen_logits_min": -25.767898559570312, "gen_logits_std": 2.757434844970703, "gen_loss": 0.296447217464447, "grad_norm": 0.40215597466642033, "learning_rate": 2.5959157894736843e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9952266663312912, "mean_gen_accuracy": 0.8643880486488342, "mean_token_accuracy": 0.9046439230442047, "num_tokens": 1066446724.0, "sample_num_tokens": 9931.0, "step": 3935, "total_num_tokens": 1066486448.0, "z_loss": 0.0006657382473349571 }, { "copy_logits_max": -2.385200262069702, "copy_logits_min": -687500032.0, "copy_num_tokens": 532.875, "epoch": 0.8038805208067399, "gen_logits_max": 6.003498554229736, "gen_logits_mean": -13.029626846313477, "gen_logits_min": -24.63043975830078, "gen_logits_std": 2.723052978515625, "gen_loss": 0.32161009311676025, "grad_norm": 0.43255346309561604, "learning_rate": 2.5957894736842107e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9923070818185806, "mean_gen_accuracy": 0.8733044266700745, "mean_token_accuracy": 0.901242733001709, "num_tokens": 1066708681.0, "sample_num_tokens": 8912.25, "step": 3936, "total_num_tokens": 1066744330.0, "z_loss": 0.0007243601139634848 }, { "copy_logits_max": -4.723543167114258, "copy_logits_min": -687500032.0, "copy_num_tokens": 354.0, "epoch": 0.8040847587439367, "gen_logits_max": 4.641963005065918, "gen_logits_mean": -15.363326072692871, "gen_logits_min": -26.872793197631836, "gen_logits_std": 2.7514259815216064, "gen_loss": 0.3560689091682434, "grad_norm": 0.3944775567509956, "learning_rate": 2.595663157894737e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9948629438877106, "mean_gen_accuracy": 0.866864949464798, "mean_token_accuracy": 0.8991866558790207, "num_tokens": 1066988104.0, "sample_num_tokens": 8271.5, "step": 3937, "total_num_tokens": 1067021190.0, "z_loss": 0.0007436479208990932 }, { "copy_logits_max": -1.6233539581298828, "copy_logits_min": -750000064.0, "copy_num_tokens": 453.0, "epoch": 0.8042889966811335, "gen_logits_max": 5.2783894538879395, "gen_logits_mean": -12.879755020141602, "gen_logits_min": -24.392742156982422, "gen_logits_std": 2.7333507537841797, "gen_loss": 0.31560859084129333, "grad_norm": 0.37769857973458915, "learning_rate": 2.5955368421052632e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9959991872310638, "mean_gen_accuracy": 0.865795224905014, "mean_token_accuracy": 0.9002863764762878, "num_tokens": 1067262448.0, "sample_num_tokens": 8400.5, "step": 3938, "total_num_tokens": 1067296050.0, "z_loss": 0.0007281063590198755 }, { "copy_logits_max": -1.427690029144287, "copy_logits_min": -625000064.0, "copy_num_tokens": 483.8125, "epoch": 0.8044932346183303, "gen_logits_max": 4.906678676605225, "gen_logits_mean": -13.436017036437988, "gen_logits_min": -25.544841766357422, "gen_logits_std": 2.777705669403076, "gen_loss": 0.27579259872436523, "grad_norm": 0.4595296741963108, "learning_rate": 2.5954105263157897e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9936803728342056, "mean_gen_accuracy": 0.8660416603088379, "mean_token_accuracy": 0.8991722017526627, "num_tokens": 1067517365.0, "sample_num_tokens": 7861.75, "step": 3939, "total_num_tokens": 1067548812.0, "z_loss": 0.0006802580901421607 }, { "copy_logits_max": -4.197569370269775, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.125, "epoch": 0.8046974725555271, "gen_logits_max": 5.94561767578125, "gen_logits_mean": -13.778538703918457, "gen_logits_min": -25.202491760253906, "gen_logits_std": 2.724376678466797, "gen_loss": 0.30693113803863525, "grad_norm": 0.38551443062782276, "learning_rate": 2.5952842105263158e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9948788583278656, "mean_gen_accuracy": 0.8655054867267609, "mean_token_accuracy": 0.8973904848098755, "num_tokens": 1067793345.0, "sample_num_tokens": 8988.75, "step": 3940, "total_num_tokens": 1067829300.0, "z_loss": 0.0006605248199775815 }, { "copy_logits_max": -5.190746307373047, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.4375, "epoch": 0.8049017104927241, "gen_logits_max": 5.666502952575684, "gen_logits_mean": -14.553899765014648, "gen_logits_min": -26.117923736572266, "gen_logits_std": 2.7852139472961426, "gen_loss": 0.30945342779159546, "grad_norm": 0.4168066320219402, "learning_rate": 2.5951578947368422e-05, "loss": 0.315, "mean_copy_accuracy": 0.9944988787174225, "mean_gen_accuracy": 0.8646969348192215, "mean_token_accuracy": 0.8944779187440872, "num_tokens": 1068064756.0, "sample_num_tokens": 7971.0, "step": 3941, "total_num_tokens": 1068096640.0, "z_loss": 0.000675722025334835 }, { "copy_logits_max": -1.793898105621338, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.625, "epoch": 0.8051059484299209, "gen_logits_max": 6.854464530944824, "gen_logits_mean": -13.758182525634766, "gen_logits_min": -25.937030792236328, "gen_logits_std": 2.829536199569702, "gen_loss": 0.3024895489215851, "grad_norm": 0.3897804841779026, "learning_rate": 2.5950315789473683e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9950093328952789, "mean_gen_accuracy": 0.8710198402404785, "mean_token_accuracy": 0.9008779078722, "num_tokens": 1068335391.0, "sample_num_tokens": 7751.75, "step": 3942, "total_num_tokens": 1068366398.0, "z_loss": 0.0007202096749097109 }, { "copy_logits_max": -3.609086036682129, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.875, "epoch": 0.8053101863671177, "gen_logits_max": 5.064807415008545, "gen_logits_mean": -15.158509254455566, "gen_logits_min": -26.645179748535156, "gen_logits_std": 2.742180109024048, "gen_loss": 0.3453517556190491, "grad_norm": 0.45375177189356836, "learning_rate": 2.594905263157895e-05, "loss": 0.341, "mean_copy_accuracy": 0.9940998405218124, "mean_gen_accuracy": 0.8559068292379379, "mean_token_accuracy": 0.8872636258602142, "num_tokens": 1068599586.0, "sample_num_tokens": 9016.5, "step": 3943, "total_num_tokens": 1068635652.0, "z_loss": 0.0008160088327713311 }, { "copy_logits_max": -4.146542549133301, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.5, "epoch": 0.8055144243043145, "gen_logits_max": 5.428761005401611, "gen_logits_mean": -15.090614318847656, "gen_logits_min": -26.34632682800293, "gen_logits_std": 2.7370851039886475, "gen_loss": 0.2869423031806946, "grad_norm": 0.3860611567452869, "learning_rate": 2.594778947368421e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9948412030935287, "mean_gen_accuracy": 0.8621906489133835, "mean_token_accuracy": 0.8962950706481934, "num_tokens": 1068875054.0, "sample_num_tokens": 8759.5, "step": 3944, "total_num_tokens": 1068910092.0, "z_loss": 0.0006963276537135243 }, { "copy_logits_max": -0.4513317942619324, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.125, "epoch": 0.8057186622415113, "gen_logits_max": 5.4137067794799805, "gen_logits_mean": -13.980720520019531, "gen_logits_min": -26.044477462768555, "gen_logits_std": 2.7661993503570557, "gen_loss": 0.36618340015411377, "grad_norm": 0.4614286291217624, "learning_rate": 2.5946526315789476e-05, "loss": 0.33, "mean_copy_accuracy": 0.9940463453531265, "mean_gen_accuracy": 0.8554919511079788, "mean_token_accuracy": 0.8915544003248215, "num_tokens": 1069145416.0, "sample_num_tokens": 7803.5, "step": 3945, "total_num_tokens": 1069176630.0, "z_loss": 0.0008548335172235966 }, { "copy_logits_max": -4.570019721984863, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.5625, "epoch": 0.8059229001787082, "gen_logits_max": 6.274357795715332, "gen_logits_mean": -13.806107521057129, "gen_logits_min": -25.492294311523438, "gen_logits_std": 2.730630874633789, "gen_loss": 0.30959564447402954, "grad_norm": 0.3907484338509089, "learning_rate": 2.5945263157894737e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9956031739711761, "mean_gen_accuracy": 0.8650530427694321, "mean_token_accuracy": 0.8983763605356216, "num_tokens": 1069415233.0, "sample_num_tokens": 7239.75, "step": 3946, "total_num_tokens": 1069444192.0, "z_loss": 0.0007201373227871954 }, { "copy_logits_max": -1.5061612129211426, "copy_logits_min": -750000064.0, "copy_num_tokens": 476.4375, "epoch": 0.8061271381159051, "gen_logits_max": 5.654242515563965, "gen_logits_mean": -14.132381439208984, "gen_logits_min": -26.282817840576172, "gen_logits_std": 2.76082181930542, "gen_loss": 0.3212510645389557, "grad_norm": 0.40224322772332305, "learning_rate": 2.5944e-05, "loss": 0.3055, "mean_copy_accuracy": 0.9962413758039474, "mean_gen_accuracy": 0.8655159324407578, "mean_token_accuracy": 0.8987409621477127, "num_tokens": 1069686980.0, "sample_num_tokens": 8305.0, "step": 3947, "total_num_tokens": 1069720200.0, "z_loss": 0.0007489522686228156 }, { "copy_logits_max": -2.96578311920166, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.3125, "epoch": 0.8063313760531019, "gen_logits_max": 5.82562255859375, "gen_logits_mean": -14.98584270477295, "gen_logits_min": -26.635753631591797, "gen_logits_std": 2.7232041358947754, "gen_loss": 0.35563886165618896, "grad_norm": 0.464346631873754, "learning_rate": 2.5942736842105262e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9935625046491623, "mean_gen_accuracy": 0.865177571773529, "mean_token_accuracy": 0.8979032635688782, "num_tokens": 1069974543.0, "sample_num_tokens": 6728.25, "step": 3948, "total_num_tokens": 1070001456.0, "z_loss": 0.0006725937710143626 }, { "copy_logits_max": -2.861220121383667, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.1875, "epoch": 0.8065356139902987, "gen_logits_max": 4.545377254486084, "gen_logits_mean": -14.677899360656738, "gen_logits_min": -26.51864242553711, "gen_logits_std": 2.741483211517334, "gen_loss": 0.3094479441642761, "grad_norm": 0.3995333013675561, "learning_rate": 2.5941473684210526e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9944567233324051, "mean_gen_accuracy": 0.8666013479232788, "mean_token_accuracy": 0.8971224874258041, "num_tokens": 1070223671.0, "sample_num_tokens": 8864.75, "step": 3949, "total_num_tokens": 1070259130.0, "z_loss": 0.0006216287729330361 }, { "copy_logits_max": -0.7708796262741089, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.5625, "epoch": 0.8067398519274955, "gen_logits_max": 6.315574645996094, "gen_logits_mean": -12.79035758972168, "gen_logits_min": -25.031890869140625, "gen_logits_std": 2.843280076980591, "gen_loss": 0.31258660554885864, "grad_norm": 0.45077443986406646, "learning_rate": 2.5940210526315787e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9944754093885422, "mean_gen_accuracy": 0.8668479323387146, "mean_token_accuracy": 0.8962089568376541, "num_tokens": 1070484854.0, "sample_num_tokens": 8628.5, "step": 3950, "total_num_tokens": 1070519368.0, "z_loss": 0.0006334484205581248 }, { "copy_logits_max": 0.008032381534576416, "copy_logits_min": -750000064.0, "copy_num_tokens": 562.25, "epoch": 0.8069440898646923, "gen_logits_max": 4.159135818481445, "gen_logits_mean": -15.244819641113281, "gen_logits_min": -27.231868743896484, "gen_logits_std": 2.813013792037964, "gen_loss": 0.2970479130744934, "grad_norm": 0.4191262264051833, "learning_rate": 2.5938947368421055e-05, "loss": 0.29, "mean_copy_accuracy": 0.9949977099895477, "mean_gen_accuracy": 0.8717120587825775, "mean_token_accuracy": 0.9040752351284027, "num_tokens": 1070743906.0, "sample_num_tokens": 8351.5, "step": 3951, "total_num_tokens": 1070777312.0, "z_loss": 0.0005768144619651139 }, { "copy_logits_max": -3.9461894035339355, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.25, "epoch": 0.8071483278018892, "gen_logits_max": 4.671877861022949, "gen_logits_mean": -15.02066707611084, "gen_logits_min": -26.383892059326172, "gen_logits_std": 2.7388782501220703, "gen_loss": 0.3325539231300354, "grad_norm": 0.4659294026581478, "learning_rate": 2.593768421052632e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9944490790367126, "mean_gen_accuracy": 0.8629314452409744, "mean_token_accuracy": 0.8936885297298431, "num_tokens": 1071017668.0, "sample_num_tokens": 7268.0, "step": 3952, "total_num_tokens": 1071046740.0, "z_loss": 0.0006295479834079742 }, { "copy_logits_max": 0.12504911422729492, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.0625, "epoch": 0.807352565739086, "gen_logits_max": 5.960210800170898, "gen_logits_mean": -13.138891220092773, "gen_logits_min": -24.903945922851562, "gen_logits_std": 2.821214199066162, "gen_loss": 0.2893264889717102, "grad_norm": 0.4439526174044601, "learning_rate": 2.593642105263158e-05, "loss": 0.331, "mean_copy_accuracy": 0.9947705268859863, "mean_gen_accuracy": 0.8535570353269577, "mean_token_accuracy": 0.8913111388683319, "num_tokens": 1071284605.0, "sample_num_tokens": 9002.25, "step": 3953, "total_num_tokens": 1071320614.0, "z_loss": 0.0005492327036336064 }, { "copy_logits_max": -2.1109275817871094, "copy_logits_min": -750000000.0, "copy_num_tokens": 715.375, "epoch": 0.8075568036762829, "gen_logits_max": 3.7177653312683105, "gen_logits_mean": -15.091699600219727, "gen_logits_min": -26.847238540649414, "gen_logits_std": 2.771697759628296, "gen_loss": 0.26727789640426636, "grad_norm": 0.39274590233143564, "learning_rate": 2.5935157894736844e-05, "loss": 0.3036, "mean_copy_accuracy": 0.995712473988533, "mean_gen_accuracy": 0.8650619387626648, "mean_token_accuracy": 0.8997807055711746, "num_tokens": 1071550252.0, "sample_num_tokens": 10285.0, "step": 3954, "total_num_tokens": 1071591392.0, "z_loss": 0.0005974777741357684 }, { "copy_logits_max": -2.280151844024658, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.6875, "epoch": 0.8077610416134797, "gen_logits_max": 5.141419887542725, "gen_logits_mean": -14.742547988891602, "gen_logits_min": -26.43259048461914, "gen_logits_std": 2.7556240558624268, "gen_loss": 0.301748663187027, "grad_norm": 0.38642497850324437, "learning_rate": 2.5933894736842105e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9959204643964767, "mean_gen_accuracy": 0.863638699054718, "mean_token_accuracy": 0.8989170640707016, "num_tokens": 1071845243.0, "sample_num_tokens": 8736.75, "step": 3955, "total_num_tokens": 1071880190.0, "z_loss": 0.0005987755721434951 }, { "copy_logits_max": -1.1130750179290771, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.6875, "epoch": 0.8079652795506765, "gen_logits_max": 5.6473388671875, "gen_logits_mean": -14.135391235351562, "gen_logits_min": -26.089069366455078, "gen_logits_std": 2.803110361099243, "gen_loss": 0.310032457113266, "grad_norm": 0.41646955133870317, "learning_rate": 2.593263157894737e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9954682290554047, "mean_gen_accuracy": 0.8651246428489685, "mean_token_accuracy": 0.89771568775177, "num_tokens": 1072125918.0, "sample_num_tokens": 8074.0, "step": 3956, "total_num_tokens": 1072158214.0, "z_loss": 0.0006430812645703554 }, { "copy_logits_max": -0.36157119274139404, "copy_logits_min": -687500032.0, "copy_num_tokens": 768.125, "epoch": 0.8081695174878734, "gen_logits_max": 5.265820503234863, "gen_logits_mean": -12.779825210571289, "gen_logits_min": -24.770862579345703, "gen_logits_std": 2.81264066696167, "gen_loss": 0.2720295786857605, "grad_norm": 0.4454088758727209, "learning_rate": 2.593136842105263e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9931025803089142, "mean_gen_accuracy": 0.8744863569736481, "mean_token_accuracy": 0.9033055156469345, "num_tokens": 1072375640.0, "sample_num_tokens": 9976.5, "step": 3957, "total_num_tokens": 1072415546.0, "z_loss": 0.0005865726852789521 }, { "copy_logits_max": -1.2862520217895508, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.625, "epoch": 0.8083737554250702, "gen_logits_max": 5.348916053771973, "gen_logits_mean": -13.605352401733398, "gen_logits_min": -25.57169532775879, "gen_logits_std": 2.7508397102355957, "gen_loss": 0.3359183669090271, "grad_norm": 0.4590064038460684, "learning_rate": 2.5930105263157895e-05, "loss": 0.329, "mean_copy_accuracy": 0.995957151055336, "mean_gen_accuracy": 0.8567878603935242, "mean_token_accuracy": 0.892934113740921, "num_tokens": 1072648174.0, "sample_num_tokens": 8414.0, "step": 3958, "total_num_tokens": 1072681830.0, "z_loss": 0.0006878761923871934 }, { "copy_logits_max": -3.0019216537475586, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.75, "epoch": 0.808577993362267, "gen_logits_max": 5.076354503631592, "gen_logits_mean": -14.167631149291992, "gen_logits_min": -26.59550666809082, "gen_logits_std": 2.7341175079345703, "gen_loss": 0.3291662335395813, "grad_norm": 0.39108360674200476, "learning_rate": 2.592884210526316e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9958478957414627, "mean_gen_accuracy": 0.8666201233863831, "mean_token_accuracy": 0.9005941301584244, "num_tokens": 1072937774.0, "sample_num_tokens": 9967.0, "step": 3959, "total_num_tokens": 1072977642.0, "z_loss": 0.0006773967761546373 }, { "copy_logits_max": -2.7679877281188965, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.375, "epoch": 0.8087822312994639, "gen_logits_max": 5.5135884284973145, "gen_logits_mean": -13.838226318359375, "gen_logits_min": -26.52126121520996, "gen_logits_std": 2.776942729949951, "gen_loss": 0.2853882610797882, "grad_norm": 0.4319234120159128, "learning_rate": 2.5927578947368423e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9939301013946533, "mean_gen_accuracy": 0.8691627234220505, "mean_token_accuracy": 0.897070124745369, "num_tokens": 1073199481.0, "sample_num_tokens": 8162.75, "step": 3960, "total_num_tokens": 1073232132.0, "z_loss": 0.0006161774508655071 }, { "copy_logits_max": -2.7633635997772217, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.25, "epoch": 0.8089864692366607, "gen_logits_max": 5.13931941986084, "gen_logits_mean": -13.667887687683105, "gen_logits_min": -26.7563419342041, "gen_logits_std": 2.8142759799957275, "gen_loss": 0.29821377992630005, "grad_norm": 0.41011430983190555, "learning_rate": 2.5926315789473684e-05, "loss": 0.3154, "mean_copy_accuracy": 0.9952723681926727, "mean_gen_accuracy": 0.8625016808509827, "mean_token_accuracy": 0.8947064727544785, "num_tokens": 1073442911.0, "sample_num_tokens": 7746.75, "step": 3961, "total_num_tokens": 1073473898.0, "z_loss": 0.0006467844941653311 }, { "copy_logits_max": -4.110814094543457, "copy_logits_min": -750000064.0, "copy_num_tokens": 361.25, "epoch": 0.8091907071738575, "gen_logits_max": 5.797553062438965, "gen_logits_mean": -13.915029525756836, "gen_logits_min": -24.92007827758789, "gen_logits_std": 2.686353921890259, "gen_loss": 0.32789528369903564, "grad_norm": 0.4121467774331413, "learning_rate": 2.592505263157895e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9950522482395172, "mean_gen_accuracy": 0.8677191585302353, "mean_token_accuracy": 0.8975308984518051, "num_tokens": 1073704949.0, "sample_num_tokens": 7963.75, "step": 3962, "total_num_tokens": 1073736804.0, "z_loss": 0.0006994499126449227 }, { "copy_logits_max": -2.462754964828491, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.875, "epoch": 0.8093949451110544, "gen_logits_max": 5.314306259155273, "gen_logits_mean": -13.04831600189209, "gen_logits_min": -24.525287628173828, "gen_logits_std": 2.668083906173706, "gen_loss": 0.30909621715545654, "grad_norm": 0.37359460490343677, "learning_rate": 2.592378947368421e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9964608699083328, "mean_gen_accuracy": 0.8674883544445038, "mean_token_accuracy": 0.9037206172943115, "num_tokens": 1073984632.0, "sample_num_tokens": 7908.0, "step": 3963, "total_num_tokens": 1074016264.0, "z_loss": 0.0006412690272554755 }, { "copy_logits_max": -3.9224495887756348, "copy_logits_min": -750000064.0, "copy_num_tokens": 527.4375, "epoch": 0.8095991830482512, "gen_logits_max": 5.06396484375, "gen_logits_mean": -14.61978530883789, "gen_logits_min": -26.35881805419922, "gen_logits_std": 2.7491188049316406, "gen_loss": 0.32829487323760986, "grad_norm": 0.4220444788036296, "learning_rate": 2.5922526315789474e-05, "loss": 0.3302, "mean_copy_accuracy": 0.9954127967357635, "mean_gen_accuracy": 0.8539866060018539, "mean_token_accuracy": 0.8894302994012833, "num_tokens": 1074253546.0, "sample_num_tokens": 8773.5, "step": 3964, "total_num_tokens": 1074288640.0, "z_loss": 0.0006879898719489574 }, { "copy_logits_max": -2.1878485679626465, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.375, "epoch": 0.809803420985448, "gen_logits_max": 5.259787559509277, "gen_logits_mean": -14.122724533081055, "gen_logits_min": -25.405475616455078, "gen_logits_std": 2.6938228607177734, "gen_loss": 0.3272303342819214, "grad_norm": 0.3971789358744172, "learning_rate": 2.5921263157894738e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9941745549440384, "mean_gen_accuracy": 0.8678429275751114, "mean_token_accuracy": 0.9003995209932327, "num_tokens": 1074528636.0, "sample_num_tokens": 9249.5, "step": 3965, "total_num_tokens": 1074565634.0, "z_loss": 0.0006821575807407498 }, { "copy_logits_max": -2.6778337955474854, "copy_logits_min": -750000064.0, "copy_num_tokens": 499.8125, "epoch": 0.8100076589226449, "gen_logits_max": 4.426699161529541, "gen_logits_mean": -14.563469886779785, "gen_logits_min": -25.90776824951172, "gen_logits_std": 2.718212366104126, "gen_loss": 0.2886989116668701, "grad_norm": 0.4126644997620963, "learning_rate": 2.592e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9957273006439209, "mean_gen_accuracy": 0.8617677837610245, "mean_token_accuracy": 0.8935208320617676, "num_tokens": 1074787197.0, "sample_num_tokens": 8098.25, "step": 3966, "total_num_tokens": 1074819590.0, "z_loss": 0.0006323043489828706 }, { "copy_logits_max": -3.3965325355529785, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.6875, "epoch": 0.8102118968598417, "gen_logits_max": 5.027753829956055, "gen_logits_mean": -14.610851287841797, "gen_logits_min": -26.188541412353516, "gen_logits_std": 2.7569751739501953, "gen_loss": 0.304582417011261, "grad_norm": 0.4255537426333912, "learning_rate": 2.5918736842105263e-05, "loss": 0.2926, "mean_copy_accuracy": 0.9944346249103546, "mean_gen_accuracy": 0.870326429605484, "mean_token_accuracy": 0.9014249742031097, "num_tokens": 1075052272.0, "sample_num_tokens": 8491.0, "step": 3967, "total_num_tokens": 1075086236.0, "z_loss": 0.0006405117455869913 }, { "copy_logits_max": -2.4990899562835693, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.1875, "epoch": 0.8104161347970386, "gen_logits_max": 6.249778747558594, "gen_logits_mean": -13.505552291870117, "gen_logits_min": -24.79197883605957, "gen_logits_std": 2.7068986892700195, "gen_loss": 0.32055479288101196, "grad_norm": 0.4175283042971846, "learning_rate": 2.5917473684210528e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9958572238683701, "mean_gen_accuracy": 0.8604683130979538, "mean_token_accuracy": 0.8981520384550095, "num_tokens": 1075308454.0, "sample_num_tokens": 7927.0, "step": 3968, "total_num_tokens": 1075340162.0, "z_loss": 0.0006778653478249907 }, { "copy_logits_max": -0.8164393305778503, "copy_logits_min": -687500032.0, "copy_num_tokens": 591.0, "epoch": 0.8106203727342354, "gen_logits_max": 5.11446475982666, "gen_logits_mean": -14.028362274169922, "gen_logits_min": -25.922622680664062, "gen_logits_std": 2.776366710662842, "gen_loss": 0.2964451313018799, "grad_norm": 0.41218542854093554, "learning_rate": 2.5916210526315792e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9958046227693558, "mean_gen_accuracy": 0.8593399524688721, "mean_token_accuracy": 0.8966230303049088, "num_tokens": 1075588615.0, "sample_num_tokens": 8761.25, "step": 3969, "total_num_tokens": 1075623660.0, "z_loss": 0.0006329350871965289 }, { "copy_logits_max": -1.8606700897216797, "copy_logits_min": -687500032.0, "copy_num_tokens": 344.125, "epoch": 0.8108246106714322, "gen_logits_max": 5.858290195465088, "gen_logits_mean": -13.807109832763672, "gen_logits_min": -25.598522186279297, "gen_logits_std": 2.753483772277832, "gen_loss": 0.36049526929855347, "grad_norm": 0.40052147621638456, "learning_rate": 2.5914947368421053e-05, "loss": 0.3295, "mean_copy_accuracy": 0.9961976110935211, "mean_gen_accuracy": 0.8557548671960831, "mean_token_accuracy": 0.8897125273942947, "num_tokens": 1075855817.0, "sample_num_tokens": 6852.25, "step": 3970, "total_num_tokens": 1075883226.0, "z_loss": 0.00078989053145051 }, { "copy_logits_max": -2.213677167892456, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.75, "epoch": 0.811028848608629, "gen_logits_max": 5.183610916137695, "gen_logits_mean": -13.785795211791992, "gen_logits_min": -25.356430053710938, "gen_logits_std": 2.7383761405944824, "gen_loss": 0.3264596462249756, "grad_norm": 0.4066497251390345, "learning_rate": 2.5913684210526317e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9938155114650726, "mean_gen_accuracy": 0.8622290641069412, "mean_token_accuracy": 0.8930709213018417, "num_tokens": 1076125839.0, "sample_num_tokens": 10224.25, "step": 3971, "total_num_tokens": 1076166736.0, "z_loss": 0.0007116158958524466 }, { "copy_logits_max": -3.110581398010254, "copy_logits_min": -625000064.0, "copy_num_tokens": 418.9375, "epoch": 0.8112330865458259, "gen_logits_max": 5.770571231842041, "gen_logits_mean": -13.726642608642578, "gen_logits_min": -25.307838439941406, "gen_logits_std": 2.7083020210266113, "gen_loss": 0.3176156282424927, "grad_norm": 0.4077113251899871, "learning_rate": 2.5912421052631578e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9946250766515732, "mean_gen_accuracy": 0.8694351315498352, "mean_token_accuracy": 0.8981050103902817, "num_tokens": 1076389446.0, "sample_num_tokens": 7699.0, "step": 3972, "total_num_tokens": 1076420242.0, "z_loss": 0.0006733625195920467 }, { "copy_logits_max": -2.947319984436035, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.5625, "epoch": 0.8114373244830227, "gen_logits_max": 5.207057476043701, "gen_logits_mean": -13.511699676513672, "gen_logits_min": -24.447168350219727, "gen_logits_std": 2.66102933883667, "gen_loss": 0.31282925605773926, "grad_norm": 0.3865098901373931, "learning_rate": 2.5911157894736843e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9955505430698395, "mean_gen_accuracy": 0.8638502210378647, "mean_token_accuracy": 0.8977815359830856, "num_tokens": 1076663063.0, "sample_num_tokens": 9565.25, "step": 3973, "total_num_tokens": 1076701324.0, "z_loss": 0.0006369446055032313 }, { "copy_logits_max": -5.394413948059082, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.0, "epoch": 0.8116415624202196, "gen_logits_max": 5.154037952423096, "gen_logits_mean": -14.904500961303711, "gen_logits_min": -26.292150497436523, "gen_logits_std": 2.7476329803466797, "gen_loss": 0.284305214881897, "grad_norm": 0.356872622723043, "learning_rate": 2.5909894736842103e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9953093379735947, "mean_gen_accuracy": 0.8782963305711746, "mean_token_accuracy": 0.9062958508729935, "num_tokens": 1076966959.0, "sample_num_tokens": 9146.75, "step": 3974, "total_num_tokens": 1077003546.0, "z_loss": 0.0005511172930710018 }, { "copy_logits_max": -4.426548004150391, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.25, "epoch": 0.8118458003574164, "gen_logits_max": 5.560083866119385, "gen_logits_mean": -13.775980949401855, "gen_logits_min": -24.928651809692383, "gen_logits_std": 2.682830810546875, "gen_loss": 0.31599292159080505, "grad_norm": 0.4267563434819898, "learning_rate": 2.5908631578947368e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9937238097190857, "mean_gen_accuracy": 0.869306668639183, "mean_token_accuracy": 0.8985661566257477, "num_tokens": 1077252460.0, "sample_num_tokens": 7344.5, "step": 3975, "total_num_tokens": 1077281838.0, "z_loss": 0.0006749001331627369 }, { "copy_logits_max": -2.609805107116699, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.6875, "epoch": 0.8120500382946132, "gen_logits_max": 4.599297046661377, "gen_logits_mean": -14.937603950500488, "gen_logits_min": -26.270431518554688, "gen_logits_std": 2.7203726768493652, "gen_loss": 0.3127615451812744, "grad_norm": 0.4534841978823722, "learning_rate": 2.5907368421052632e-05, "loss": 0.3528, "mean_copy_accuracy": 0.9938806891441345, "mean_gen_accuracy": 0.8536973893642426, "mean_token_accuracy": 0.8830650597810745, "num_tokens": 1077497310.0, "sample_num_tokens": 9044.5, "step": 3976, "total_num_tokens": 1077533488.0, "z_loss": 0.0006494896952062845 }, { "copy_logits_max": -2.6718597412109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.3125, "epoch": 0.81225427623181, "gen_logits_max": 5.502535820007324, "gen_logits_mean": -13.479270935058594, "gen_logits_min": -25.728038787841797, "gen_logits_std": 2.724716901779175, "gen_loss": 0.3014199733734131, "grad_norm": 0.5823638579159258, "learning_rate": 2.5906105263157896e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9927621930837631, "mean_gen_accuracy": 0.8660546839237213, "mean_token_accuracy": 0.8968743979930878, "num_tokens": 1077749042.0, "sample_num_tokens": 8677.5, "step": 3977, "total_num_tokens": 1077783752.0, "z_loss": 0.0006582141504622996 }, { "copy_logits_max": -3.4855551719665527, "copy_logits_min": -687500032.0, "copy_num_tokens": 556.0, "epoch": 0.8124585141690069, "gen_logits_max": 5.259182929992676, "gen_logits_mean": -13.52244758605957, "gen_logits_min": -25.320354461669922, "gen_logits_std": 2.731441020965576, "gen_loss": 0.33020633459091187, "grad_norm": 0.400196642511574, "learning_rate": 2.590484210526316e-05, "loss": 0.3155, "mean_copy_accuracy": 0.9954890012741089, "mean_gen_accuracy": 0.8640954792499542, "mean_token_accuracy": 0.8969945311546326, "num_tokens": 1078018655.0, "sample_num_tokens": 9204.75, "step": 3978, "total_num_tokens": 1078055474.0, "z_loss": 0.0006538794841617346 }, { "copy_logits_max": -3.8262548446655273, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.1875, "epoch": 0.8126627521062038, "gen_logits_max": 5.066788196563721, "gen_logits_mean": -13.949481964111328, "gen_logits_min": -25.595373153686523, "gen_logits_std": 2.714970827102661, "gen_loss": 0.3392789363861084, "grad_norm": 0.44727957534694024, "learning_rate": 2.590357894736842e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9959147721529007, "mean_gen_accuracy": 0.8641741722822189, "mean_token_accuracy": 0.8974452316761017, "num_tokens": 1078312888.0, "sample_num_tokens": 7241.5, "step": 3979, "total_num_tokens": 1078341854.0, "z_loss": 0.0006944125052541494 }, { "copy_logits_max": -6.6974639892578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.875, "epoch": 0.8128669900434006, "gen_logits_max": 3.8009681701660156, "gen_logits_mean": -16.338855743408203, "gen_logits_min": -28.041288375854492, "gen_logits_std": 2.7790563106536865, "gen_loss": 0.30399584770202637, "grad_norm": 0.4485179662357097, "learning_rate": 2.5902315789473686e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9946553856134415, "mean_gen_accuracy": 0.8708510398864746, "mean_token_accuracy": 0.9023821353912354, "num_tokens": 1078571103.0, "sample_num_tokens": 9608.25, "step": 3980, "total_num_tokens": 1078609536.0, "z_loss": 0.000557411927729845 }, { "copy_logits_max": -4.584231376647949, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.9375, "epoch": 0.8130712279805974, "gen_logits_max": 4.719422340393066, "gen_logits_mean": -14.666060447692871, "gen_logits_min": -26.270057678222656, "gen_logits_std": 2.761619806289673, "gen_loss": 0.31586313247680664, "grad_norm": 0.5255135305933277, "learning_rate": 2.5901052631578947e-05, "loss": 0.3272, "mean_copy_accuracy": 0.9922258853912354, "mean_gen_accuracy": 0.862726479768753, "mean_token_accuracy": 0.8918392956256866, "num_tokens": 1078814494.0, "sample_num_tokens": 9015.5, "step": 3981, "total_num_tokens": 1078850556.0, "z_loss": 0.0006132706184871495 }, { "copy_logits_max": -4.351088047027588, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.1875, "epoch": 0.8132754659177942, "gen_logits_max": 6.351709365844727, "gen_logits_mean": -14.414068222045898, "gen_logits_min": -25.529808044433594, "gen_logits_std": 2.694253444671631, "gen_loss": 0.35651975870132446, "grad_norm": 0.43905445327809295, "learning_rate": 2.589978947368421e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9952856451272964, "mean_gen_accuracy": 0.860445111989975, "mean_token_accuracy": 0.8948651999235153, "num_tokens": 1079107695.0, "sample_num_tokens": 8116.25, "step": 3982, "total_num_tokens": 1079140160.0, "z_loss": 0.0006878242129459977 }, { "copy_logits_max": -5.550211429595947, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5625, "epoch": 0.813479703854991, "gen_logits_max": 5.982480525970459, "gen_logits_mean": -14.295668601989746, "gen_logits_min": -25.487850189208984, "gen_logits_std": 2.6905517578125, "gen_loss": 0.30505111813545227, "grad_norm": 0.430949725874639, "learning_rate": 2.5898526315789472e-05, "loss": 0.3096, "mean_copy_accuracy": 0.9948291778564453, "mean_gen_accuracy": 0.8715372681617737, "mean_token_accuracy": 0.8963346183300018, "num_tokens": 1079372020.0, "sample_num_tokens": 8197.0, "step": 3983, "total_num_tokens": 1079404808.0, "z_loss": 0.0006304155103862286 }, { "copy_logits_max": -3.308830738067627, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.3125, "epoch": 0.813683941792188, "gen_logits_max": 5.171669006347656, "gen_logits_mean": -14.92359733581543, "gen_logits_min": -26.853321075439453, "gen_logits_std": 2.7701470851898193, "gen_loss": 0.30795741081237793, "grad_norm": 0.4028405174963334, "learning_rate": 2.589726315789474e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9950616806745529, "mean_gen_accuracy": 0.8661108016967773, "mean_token_accuracy": 0.8961542695760727, "num_tokens": 1079645902.0, "sample_num_tokens": 7752.0, "step": 3984, "total_num_tokens": 1079676910.0, "z_loss": 0.0007128288270905614 }, { "copy_logits_max": -4.66432523727417, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.4375, "epoch": 0.8138881797293848, "gen_logits_max": 4.951972007751465, "gen_logits_mean": -13.40549373626709, "gen_logits_min": -25.00531768798828, "gen_logits_std": 2.7540550231933594, "gen_loss": 0.31806179881095886, "grad_norm": 0.49164299215912, "learning_rate": 2.5896e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9951033294200897, "mean_gen_accuracy": 0.8638411462306976, "mean_token_accuracy": 0.8975039422512054, "num_tokens": 1079902699.0, "sample_num_tokens": 8331.75, "step": 3985, "total_num_tokens": 1079936026.0, "z_loss": 0.0006816292298026383 }, { "copy_logits_max": -2.4054417610168457, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.9375, "epoch": 0.8140924176665816, "gen_logits_max": 3.753082752227783, "gen_logits_mean": -16.274581909179688, "gen_logits_min": -27.89970588684082, "gen_logits_std": 2.750887393951416, "gen_loss": 0.26251059770584106, "grad_norm": 0.4362558772097986, "learning_rate": 2.5894736842105265e-05, "loss": 0.3094, "mean_copy_accuracy": 0.9940060526132584, "mean_gen_accuracy": 0.8689371198415756, "mean_token_accuracy": 0.8970193415880203, "num_tokens": 1080165920.0, "sample_num_tokens": 9427.0, "step": 3986, "total_num_tokens": 1080203628.0, "z_loss": 0.0005848166183568537 }, { "copy_logits_max": -6.423361778259277, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.75, "epoch": 0.8142966556037784, "gen_logits_max": 5.098834037780762, "gen_logits_mean": -15.207357406616211, "gen_logits_min": -26.577674865722656, "gen_logits_std": 2.751476526260376, "gen_loss": 0.28330326080322266, "grad_norm": 0.3990024850722467, "learning_rate": 2.5893473684210526e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9959207475185394, "mean_gen_accuracy": 0.8752090930938721, "mean_token_accuracy": 0.9016409516334534, "num_tokens": 1080429712.0, "sample_num_tokens": 7669.0, "step": 3987, "total_num_tokens": 1080460388.0, "z_loss": 0.000592061725910753 }, { "copy_logits_max": -4.501087665557861, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5625, "epoch": 0.8145008935409752, "gen_logits_max": 5.15824031829834, "gen_logits_mean": -14.63401985168457, "gen_logits_min": -25.921491622924805, "gen_logits_std": 2.737537145614624, "gen_loss": 0.32209455966949463, "grad_norm": 0.44335878556566527, "learning_rate": 2.589221052631579e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9961590766906738, "mean_gen_accuracy": 0.8662210702896118, "mean_token_accuracy": 0.8968099504709244, "num_tokens": 1080696139.0, "sample_num_tokens": 7771.75, "step": 3988, "total_num_tokens": 1080727226.0, "z_loss": 0.0007174821803346276 }, { "copy_logits_max": -5.542818546295166, "copy_logits_min": -750000000.0, "copy_num_tokens": 611.3125, "epoch": 0.814705131478172, "gen_logits_max": 3.702458620071411, "gen_logits_mean": -16.066898345947266, "gen_logits_min": -27.670211791992188, "gen_logits_std": 2.76491117477417, "gen_loss": 0.2997668981552124, "grad_norm": 0.42739950944924604, "learning_rate": 2.589094736842105e-05, "loss": 0.3167, "mean_copy_accuracy": 0.99408358335495, "mean_gen_accuracy": 0.8681548535823822, "mean_token_accuracy": 0.8963660597801208, "num_tokens": 1080959680.0, "sample_num_tokens": 10294.5, "step": 3989, "total_num_tokens": 1081000858.0, "z_loss": 0.0006171087152324617 }, { "copy_logits_max": -6.168355941772461, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.1875, "epoch": 0.814909369415369, "gen_logits_max": 4.685359954833984, "gen_logits_mean": -14.918817520141602, "gen_logits_min": -26.42936897277832, "gen_logits_std": 2.746096611022949, "gen_loss": 0.2881492078304291, "grad_norm": 0.5542346318637226, "learning_rate": 2.5889684210526316e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9941615164279938, "mean_gen_accuracy": 0.8612327575683594, "mean_token_accuracy": 0.8928873389959335, "num_tokens": 1081223943.0, "sample_num_tokens": 7621.75, "step": 3990, "total_num_tokens": 1081254430.0, "z_loss": 0.0005251801339909434 }, { "copy_logits_max": -5.9495463371276855, "copy_logits_min": -750000064.0, "copy_num_tokens": 357.875, "epoch": 0.8151136073525658, "gen_logits_max": 5.143868923187256, "gen_logits_mean": -13.938665390014648, "gen_logits_min": -25.341529846191406, "gen_logits_std": 2.706984758377075, "gen_loss": 0.28734928369522095, "grad_norm": 0.3707848273336219, "learning_rate": 2.588842105263158e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9964986741542816, "mean_gen_accuracy": 0.86599500477314, "mean_token_accuracy": 0.9036877453327179, "num_tokens": 1081527712.0, "sample_num_tokens": 7508.5, "step": 3991, "total_num_tokens": 1081557746.0, "z_loss": 0.0005622839089483023 }, { "copy_logits_max": -5.138975143432617, "copy_logits_min": -687500032.0, "copy_num_tokens": 561.6875, "epoch": 0.8153178452897626, "gen_logits_max": 5.306864261627197, "gen_logits_mean": -12.704418182373047, "gen_logits_min": -24.151531219482422, "gen_logits_std": 2.7138357162475586, "gen_loss": 0.3139895498752594, "grad_norm": 0.4306849999044521, "learning_rate": 2.5887157894736844e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9960136562585831, "mean_gen_accuracy": 0.867340162396431, "mean_token_accuracy": 0.9029222279787064, "num_tokens": 1081773445.0, "sample_num_tokens": 8160.25, "step": 3992, "total_num_tokens": 1081806086.0, "z_loss": 0.0006068517104722559 }, { "copy_logits_max": -5.816262245178223, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.75, "epoch": 0.8155220832269594, "gen_logits_max": 5.060078144073486, "gen_logits_mean": -14.398467063903809, "gen_logits_min": -26.02544403076172, "gen_logits_std": 2.699995517730713, "gen_loss": 0.34999966621398926, "grad_norm": 0.42900050158400876, "learning_rate": 2.588589473684211e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9959774613380432, "mean_gen_accuracy": 0.8594753742218018, "mean_token_accuracy": 0.8951172679662704, "num_tokens": 1082051492.0, "sample_num_tokens": 8235.5, "step": 3993, "total_num_tokens": 1082084434.0, "z_loss": 0.0006493291584774852 }, { "copy_logits_max": -7.892762184143066, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.1875, "epoch": 0.8157263211641562, "gen_logits_max": 5.114103317260742, "gen_logits_mean": -14.181432723999023, "gen_logits_min": -26.042295455932617, "gen_logits_std": 2.7562270164489746, "gen_loss": 0.279507040977478, "grad_norm": 1.1286587504362666, "learning_rate": 2.588463157894737e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9949043095111847, "mean_gen_accuracy": 0.8654620349407196, "mean_token_accuracy": 0.8954200893640518, "num_tokens": 1082308865.0, "sample_num_tokens": 9352.25, "step": 3994, "total_num_tokens": 1082346274.0, "z_loss": 0.0005350507562980056 }, { "copy_logits_max": -8.377995491027832, "copy_logits_min": -750000000.0, "copy_num_tokens": 284.3125, "epoch": 0.815930559101353, "gen_logits_max": 5.108678817749023, "gen_logits_mean": -14.89515209197998, "gen_logits_min": -25.944568634033203, "gen_logits_std": 2.7029869556427, "gen_loss": 0.29854118824005127, "grad_norm": 0.387027167746609, "learning_rate": 2.5883368421052634e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9948062598705292, "mean_gen_accuracy": 0.8657368719577789, "mean_token_accuracy": 0.8971818387508392, "num_tokens": 1082590809.0, "sample_num_tokens": 7526.25, "step": 3995, "total_num_tokens": 1082620914.0, "z_loss": 0.0005773719167336822 }, { "copy_logits_max": -6.024531841278076, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.4375, "epoch": 0.81613479703855, "gen_logits_max": 4.660085201263428, "gen_logits_mean": -14.512086868286133, "gen_logits_min": -26.085529327392578, "gen_logits_std": 2.7048728466033936, "gen_loss": 0.3143160343170166, "grad_norm": 0.4165273505308335, "learning_rate": 2.5882105263157895e-05, "loss": 0.3068, "mean_copy_accuracy": 0.9947120249271393, "mean_gen_accuracy": 0.8670873492956161, "mean_token_accuracy": 0.8976943790912628, "num_tokens": 1082846596.0, "sample_num_tokens": 8107.0, "step": 3996, "total_num_tokens": 1082879024.0, "z_loss": 0.0006521515315398574 }, { "copy_logits_max": -5.077108383178711, "copy_logits_min": -687500032.0, "copy_num_tokens": 627.1875, "epoch": 0.8163390349757468, "gen_logits_max": 4.287507057189941, "gen_logits_mean": -14.778838157653809, "gen_logits_min": -26.530012130737305, "gen_logits_std": 2.7442281246185303, "gen_loss": 0.2866142988204956, "grad_norm": 0.40291986636687405, "learning_rate": 2.588084210526316e-05, "loss": 0.321, "mean_copy_accuracy": 0.9953974187374115, "mean_gen_accuracy": 0.8548503518104553, "mean_token_accuracy": 0.8935028165578842, "num_tokens": 1083119440.0, "sample_num_tokens": 9027.5, "step": 3997, "total_num_tokens": 1083155550.0, "z_loss": 0.0006769137689843774 }, { "copy_logits_max": -8.105417251586914, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.75, "epoch": 0.8165432729129436, "gen_logits_max": 5.755816459655762, "gen_logits_mean": -14.301891326904297, "gen_logits_min": -25.475727081298828, "gen_logits_std": 2.707538604736328, "gen_loss": 0.29683202505111694, "grad_norm": 0.40129233047108304, "learning_rate": 2.587957894736842e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9945490062236786, "mean_gen_accuracy": 0.8685744553804398, "mean_token_accuracy": 0.8993337154388428, "num_tokens": 1083383815.0, "sample_num_tokens": 7738.25, "step": 3998, "total_num_tokens": 1083414768.0, "z_loss": 0.0006911390810273588 }, { "copy_logits_max": -5.360945701599121, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.8125, "epoch": 0.8167475108501404, "gen_logits_max": 5.440461158752441, "gen_logits_mean": -14.378599166870117, "gen_logits_min": -26.084514617919922, "gen_logits_std": 2.7539336681365967, "gen_loss": 0.3166981339454651, "grad_norm": 0.4347682977119918, "learning_rate": 2.5878315789473684e-05, "loss": 0.3279, "mean_copy_accuracy": 0.9944957941770554, "mean_gen_accuracy": 0.8559211641550064, "mean_token_accuracy": 0.8897789269685745, "num_tokens": 1083658563.0, "sample_num_tokens": 9244.75, "step": 3999, "total_num_tokens": 1083695542.0, "z_loss": 0.000762411393225193 }, { "epoch": 0.8169517487873372, "grad_norm": 0.43278675548366397, "learning_rate": 2.587705263157895e-05, "loss": 0.314, "step": 4000 }, { "epoch": 0.8169517487873372, "eval_copy_logits_max": -5.772634983062744, "eval_copy_logits_min": -77.57469940185547, "eval_gen_logits_max": 4.446339130401611, "eval_gen_logits_mean": -18.61977195739746, "eval_gen_logits_min": -29.350828170776367, "eval_gen_logits_std": 2.702519416809082, "eval_gen_loss": 0.3534870743751526, "eval_loss": 0.3333854377269745, "eval_mean_copy_accuracy": 0.9933538734912872, "eval_mean_gen_accuracy": 0.8691604137420654, "eval_mean_token_accuracy": 0.884998619556427, "eval_num_tokens": 1083939588.0, "eval_runtime": 0.7776, "eval_samples_per_second": 10.288, "eval_steps_per_second": 2.572, "eval_total_num_tokens": 1083939588.0, "eval_z_loss": 0.0007368894875980914, "step": 4000 }, { "copy_logits_max": -5.0159912109375, "copy_logits_min": -750000064.0, "copy_num_tokens": 377.5, "epoch": 0.817155986724534, "gen_logits_max": 6.458138465881348, "gen_logits_mean": -14.402497291564941, "gen_logits_min": -25.824974060058594, "gen_logits_std": 2.6991899013519287, "gen_loss": 0.3189311921596527, "grad_norm": 0.40459932334307486, "learning_rate": 2.5875789473684213e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9955819994211197, "mean_gen_accuracy": 0.8686778247356415, "mean_token_accuracy": 0.9001646041870117, "num_tokens": 247171.0, "sample_num_tokens": 7756.25, "step": 4001, "total_num_tokens": 278196.0, "z_loss": 0.0008010475430637598 }, { "copy_logits_max": -4.601706027984619, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.1875, "epoch": 0.817360224661731, "gen_logits_max": 5.289129257202148, "gen_logits_mean": -14.474949836730957, "gen_logits_min": -25.737628936767578, "gen_logits_std": 2.7212276458740234, "gen_loss": 0.3387242555618286, "grad_norm": 0.40018861969825054, "learning_rate": 2.5874526315789474e-05, "loss": 0.3008, "mean_copy_accuracy": 0.995965451002121, "mean_gen_accuracy": 0.867294192314148, "mean_token_accuracy": 0.9017874896526337, "num_tokens": 514181.0, "sample_num_tokens": 9783.25, "step": 4002, "total_num_tokens": 553314.0, "z_loss": 0.0007304190658032894 }, { "copy_logits_max": -3.6078813076019287, "copy_logits_min": -750000064.0, "copy_num_tokens": 441.8125, "epoch": 0.8175644625989278, "gen_logits_max": 6.063205718994141, "gen_logits_mean": -14.562628746032715, "gen_logits_min": -26.63642120361328, "gen_logits_std": 2.750141143798828, "gen_loss": 0.32515740394592285, "grad_norm": 0.4059703179235101, "learning_rate": 2.5873263157894738e-05, "loss": 0.3281, "mean_copy_accuracy": 0.9942817538976669, "mean_gen_accuracy": 0.8631753027439117, "mean_token_accuracy": 0.8920008987188339, "num_tokens": 799714.0, "sample_num_tokens": 8415.0, "step": 4003, "total_num_tokens": 833374.0, "z_loss": 0.0007572649046778679 }, { "copy_logits_max": -6.6908650398254395, "copy_logits_min": -750000064.0, "copy_num_tokens": 445.9375, "epoch": 0.8177687005361246, "gen_logits_max": 5.220202445983887, "gen_logits_mean": -15.860245704650879, "gen_logits_min": -26.939868927001953, "gen_logits_std": 2.7124600410461426, "gen_loss": 0.30498945713043213, "grad_norm": 0.401881792256596, "learning_rate": 2.5872000000000002e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9952265918254852, "mean_gen_accuracy": 0.8716907650232315, "mean_token_accuracy": 0.9025387167930603, "num_tokens": 1073974.0, "sample_num_tokens": 8640.5, "step": 4004, "total_num_tokens": 1108536.0, "z_loss": 0.0005973127554170787 }, { "copy_logits_max": -5.215207099914551, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.5625, "epoch": 0.8179729384733214, "gen_logits_max": 4.675133228302002, "gen_logits_mean": -14.753358840942383, "gen_logits_min": -26.40562629699707, "gen_logits_std": 2.7309823036193848, "gen_loss": 0.3325355052947998, "grad_norm": 0.39093225232505013, "learning_rate": 2.5870736842105263e-05, "loss": 0.302, "mean_copy_accuracy": 0.9951397180557251, "mean_gen_accuracy": 0.870262861251831, "mean_token_accuracy": 0.9003945738077164, "num_tokens": 1333908.0, "sample_num_tokens": 6683.0, "step": 4005, "total_num_tokens": 1360640.0, "z_loss": 0.00070302898529917 }, { "copy_logits_max": -7.904847145080566, "copy_logits_min": -750000064.0, "copy_num_tokens": 491.375, "epoch": 0.8181771764105182, "gen_logits_max": 5.3227691650390625, "gen_logits_mean": -14.0960054397583, "gen_logits_min": -25.563838958740234, "gen_logits_std": 2.7881484031677246, "gen_loss": 0.2933005690574646, "grad_norm": 0.46711468976011516, "learning_rate": 2.5869473684210528e-05, "loss": 0.3291, "mean_copy_accuracy": 0.9945006370544434, "mean_gen_accuracy": 0.8608398288488388, "mean_token_accuracy": 0.8902200311422348, "num_tokens": 1571361.0, "sample_num_tokens": 8133.25, "step": 4006, "total_num_tokens": 1603894.0, "z_loss": 0.0006511128740385175 }, { "copy_logits_max": -8.079902648925781, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.5, "epoch": 0.818381414347715, "gen_logits_max": 5.345640182495117, "gen_logits_mean": -14.849870681762695, "gen_logits_min": -26.06609344482422, "gen_logits_std": 2.7056522369384766, "gen_loss": 0.3265898823738098, "grad_norm": 0.38115706479741823, "learning_rate": 2.586821052631579e-05, "loss": 0.2952, "mean_copy_accuracy": 0.996071919798851, "mean_gen_accuracy": 0.8706122636795044, "mean_token_accuracy": 0.9028196483850479, "num_tokens": 1851938.0, "sample_num_tokens": 7092.5, "step": 4007, "total_num_tokens": 1880308.0, "z_loss": 0.0006515442510135472 }, { "copy_logits_max": -7.146755695343018, "copy_logits_min": -687500032.0, "copy_num_tokens": 324.875, "epoch": 0.8185856522849119, "gen_logits_max": 6.057436466217041, "gen_logits_mean": -14.814807891845703, "gen_logits_min": -26.437889099121094, "gen_logits_std": 2.7697858810424805, "gen_loss": 0.3246121406555176, "grad_norm": 0.4624184523635141, "learning_rate": 2.5866947368421053e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9929729849100113, "mean_gen_accuracy": 0.8696524351835251, "mean_token_accuracy": 0.9014119207859039, "num_tokens": 2139084.0, "sample_num_tokens": 7956.5, "step": 4008, "total_num_tokens": 2170910.0, "z_loss": 0.0006440632860176265 }, { "copy_logits_max": -5.588245391845703, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.3125, "epoch": 0.8187898902221088, "gen_logits_max": 4.841183185577393, "gen_logits_mean": -15.755661964416504, "gen_logits_min": -27.068838119506836, "gen_logits_std": 2.7555224895477295, "gen_loss": 0.34650474786758423, "grad_norm": 0.4047014812247916, "learning_rate": 2.5865684210526317e-05, "loss": 0.3165, "mean_copy_accuracy": 0.9953993856906891, "mean_gen_accuracy": 0.8623663634061813, "mean_token_accuracy": 0.8950972259044647, "num_tokens": 2410459.0, "sample_num_tokens": 8667.25, "step": 4009, "total_num_tokens": 2445128.0, "z_loss": 0.0007472971919924021 }, { "copy_logits_max": -5.7635884284973145, "copy_logits_min": -625000064.0, "copy_num_tokens": 544.3125, "epoch": 0.8189941281593056, "gen_logits_max": 5.561470031738281, "gen_logits_mean": -14.102437019348145, "gen_logits_min": -26.320594787597656, "gen_logits_std": 2.854306697845459, "gen_loss": 0.28780561685562134, "grad_norm": 0.4129667305915881, "learning_rate": 2.586442105263158e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9947452396154404, "mean_gen_accuracy": 0.8678008019924164, "mean_token_accuracy": 0.9008500576019287, "num_tokens": 2681950.0, "sample_num_tokens": 9400.0, "step": 4010, "total_num_tokens": 2719550.0, "z_loss": 0.0006823556032031775 }, { "copy_logits_max": -7.429375648498535, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.625, "epoch": 0.8191983660965024, "gen_logits_max": 5.179997444152832, "gen_logits_mean": -14.249229431152344, "gen_logits_min": -26.393943786621094, "gen_logits_std": 2.813972234725952, "gen_loss": 0.2927023470401764, "grad_norm": 0.39954338895684655, "learning_rate": 2.5863157894736842e-05, "loss": 0.313, "mean_copy_accuracy": 0.9953722357749939, "mean_gen_accuracy": 0.8631789237260818, "mean_token_accuracy": 0.8960100561380386, "num_tokens": 2958419.0, "sample_num_tokens": 9395.75, "step": 4011, "total_num_tokens": 2996002.0, "z_loss": 0.0007624035933986306 }, { "copy_logits_max": -7.822295188903809, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.25, "epoch": 0.8194026040336992, "gen_logits_max": 5.315191745758057, "gen_logits_mean": -15.08911418914795, "gen_logits_min": -26.831195831298828, "gen_logits_std": 2.7437877655029297, "gen_loss": 0.315201073884964, "grad_norm": 0.4418775596544163, "learning_rate": 2.5861894736842107e-05, "loss": 0.3196, "mean_copy_accuracy": 0.9947477132081985, "mean_gen_accuracy": 0.8607349544763565, "mean_token_accuracy": 0.8942414969205856, "num_tokens": 3233769.0, "sample_num_tokens": 9322.25, "step": 4012, "total_num_tokens": 3271058.0, "z_loss": 0.0007209533359855413 }, { "copy_logits_max": -6.624362945556641, "copy_logits_min": -687500032.0, "copy_num_tokens": 329.625, "epoch": 0.8196068419708961, "gen_logits_max": 4.958168983459473, "gen_logits_mean": -14.914592742919922, "gen_logits_min": -27.205486297607422, "gen_logits_std": 2.8142271041870117, "gen_loss": 0.3458971381187439, "grad_norm": 0.3915306867695235, "learning_rate": 2.5860631578947368e-05, "loss": 0.303, "mean_copy_accuracy": 0.9947283118963242, "mean_gen_accuracy": 0.86886265873909, "mean_token_accuracy": 0.8983815908432007, "num_tokens": 3489108.0, "sample_num_tokens": 7311.5, "step": 4013, "total_num_tokens": 3518354.0, "z_loss": 0.0007304854225367308 }, { "copy_logits_max": -8.412622451782227, "copy_logits_min": -687500032.0, "copy_num_tokens": 207.6875, "epoch": 0.8198110799080929, "gen_logits_max": 5.314606666564941, "gen_logits_mean": -15.273542404174805, "gen_logits_min": -26.961559295654297, "gen_logits_std": 2.765193462371826, "gen_loss": 0.33031871914863586, "grad_norm": 0.3969696400514433, "learning_rate": 2.5859368421052632e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9944201558828354, "mean_gen_accuracy": 0.8682399690151215, "mean_token_accuracy": 0.8981066793203354, "num_tokens": 3765149.0, "sample_num_tokens": 7021.75, "step": 4014, "total_num_tokens": 3793236.0, "z_loss": 0.0006392805371433496 }, { "copy_logits_max": -8.650348663330078, "copy_logits_min": -750000000.0, "copy_num_tokens": 231.0625, "epoch": 0.8200153178452898, "gen_logits_max": 4.8968658447265625, "gen_logits_mean": -16.162460327148438, "gen_logits_min": -27.361379623413086, "gen_logits_std": 2.7701239585876465, "gen_loss": 0.3398107588291168, "grad_norm": 0.4366481719896513, "learning_rate": 2.5858105263157893e-05, "loss": 0.3406, "mean_copy_accuracy": 0.9930922091007233, "mean_gen_accuracy": 0.8614376187324524, "mean_token_accuracy": 0.8859384804964066, "num_tokens": 4013047.0, "sample_num_tokens": 7174.75, "step": 4015, "total_num_tokens": 4041746.0, "z_loss": 0.0006212685839273036 }, { "copy_logits_max": -8.919758796691895, "copy_logits_min": -687500096.0, "copy_num_tokens": 494.625, "epoch": 0.8202195557824866, "gen_logits_max": 4.672449588775635, "gen_logits_mean": -14.852535247802734, "gen_logits_min": -26.651447296142578, "gen_logits_std": 2.7621073722839355, "gen_loss": 0.2946638762950897, "grad_norm": 0.3937955267248073, "learning_rate": 2.5856842105263157e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9945337623357773, "mean_gen_accuracy": 0.873702883720398, "mean_token_accuracy": 0.8991186916828156, "num_tokens": 4267337.0, "sample_num_tokens": 9117.25, "step": 4016, "total_num_tokens": 4303806.0, "z_loss": 0.0006007726769894361 }, { "copy_logits_max": -7.447795867919922, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.75, "epoch": 0.8204237937196834, "gen_logits_max": 5.262718200683594, "gen_logits_mean": -14.436540603637695, "gen_logits_min": -26.211549758911133, "gen_logits_std": 2.8246803283691406, "gen_loss": 0.3244628310203552, "grad_norm": 0.3955572045905982, "learning_rate": 2.585557894736842e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9940766990184784, "mean_gen_accuracy": 0.8720077127218246, "mean_token_accuracy": 0.8982043862342834, "num_tokens": 4536675.0, "sample_num_tokens": 7403.25, "step": 4017, "total_num_tokens": 4566288.0, "z_loss": 0.0006278420914895833 }, { "copy_logits_max": -8.947571754455566, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.5625, "epoch": 0.8206280316568803, "gen_logits_max": 5.851211071014404, "gen_logits_mean": -13.83961296081543, "gen_logits_min": -25.450458526611328, "gen_logits_std": 2.807368516921997, "gen_loss": 0.2947314977645874, "grad_norm": 0.39915986724181723, "learning_rate": 2.5854315789473686e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9950026571750641, "mean_gen_accuracy": 0.8680792599916458, "mean_token_accuracy": 0.8948389142751694, "num_tokens": 4809050.0, "sample_num_tokens": 9372.5, "step": 4018, "total_num_tokens": 4846540.0, "z_loss": 0.0006393040530383587 }, { "copy_logits_max": -5.394831657409668, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.375, "epoch": 0.8208322695940771, "gen_logits_max": 4.300739288330078, "gen_logits_mean": -14.36171817779541, "gen_logits_min": -25.79964256286621, "gen_logits_std": 2.763883590698242, "gen_loss": 0.31450653076171875, "grad_norm": 0.3754529777914051, "learning_rate": 2.585305263157895e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9961865842342377, "mean_gen_accuracy": 0.8629645258188248, "mean_token_accuracy": 0.8984686136245728, "num_tokens": 5097234.0, "sample_num_tokens": 9635.5, "step": 4019, "total_num_tokens": 5135776.0, "z_loss": 0.0006833174265921116 }, { "copy_logits_max": -5.971853256225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.5, "epoch": 0.8210365075312739, "gen_logits_max": 4.058548927307129, "gen_logits_mean": -15.48422622680664, "gen_logits_min": -27.374958038330078, "gen_logits_std": 2.835216522216797, "gen_loss": 0.2945789694786072, "grad_norm": 0.38452721164455306, "learning_rate": 2.585178947368421e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9942931234836578, "mean_gen_accuracy": 0.8738963007926941, "mean_token_accuracy": 0.9023390859365463, "num_tokens": 5359914.0, "sample_num_tokens": 8297.5, "step": 4020, "total_num_tokens": 5393104.0, "z_loss": 0.0006749468739144504 }, { "copy_logits_max": -4.930464744567871, "copy_logits_min": -750000000.0, "copy_num_tokens": 709.625, "epoch": 0.8212407454684708, "gen_logits_max": 3.8013947010040283, "gen_logits_mean": -15.783323287963867, "gen_logits_min": -27.541290283203125, "gen_logits_std": 2.8105921745300293, "gen_loss": 0.30666086077690125, "grad_norm": 0.4289915347820355, "learning_rate": 2.5850526315789475e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9947890788316727, "mean_gen_accuracy": 0.8678428679704666, "mean_token_accuracy": 0.9047280251979828, "num_tokens": 5661851.0, "sample_num_tokens": 10549.25, "step": 4021, "total_num_tokens": 5704048.0, "z_loss": 0.0007592948386445642 }, { "copy_logits_max": -8.948602676391602, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.125, "epoch": 0.8214449834056676, "gen_logits_max": 4.766935348510742, "gen_logits_mean": -14.626696586608887, "gen_logits_min": -26.920785903930664, "gen_logits_std": 2.8006582260131836, "gen_loss": 0.29850220680236816, "grad_norm": 0.41157658230159405, "learning_rate": 2.5849263157894736e-05, "loss": 0.304, "mean_copy_accuracy": 0.996802344918251, "mean_gen_accuracy": 0.8670264184474945, "mean_token_accuracy": 0.8989184349775314, "num_tokens": 5931462.0, "sample_num_tokens": 8571.5, "step": 4022, "total_num_tokens": 5965748.0, "z_loss": 0.0006439895951189101 }, { "copy_logits_max": -7.290058135986328, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.3125, "epoch": 0.8216492213428644, "gen_logits_max": 5.277460098266602, "gen_logits_mean": -14.97056770324707, "gen_logits_min": -26.337358474731445, "gen_logits_std": 2.751864433288574, "gen_loss": 0.39494621753692627, "grad_norm": 0.42076273004960923, "learning_rate": 2.5848e-05, "loss": 0.324, "mean_copy_accuracy": 0.9946548640727997, "mean_gen_accuracy": 0.864581286907196, "mean_token_accuracy": 0.8934505432844162, "num_tokens": 6193156.0, "sample_num_tokens": 7700.0, "step": 4023, "total_num_tokens": 6223956.0, "z_loss": 0.0008117492543533444 }, { "copy_logits_max": -9.020453453063965, "copy_logits_min": -750000000.0, "copy_num_tokens": 718.0625, "epoch": 0.8218534592800613, "gen_logits_max": 5.422319412231445, "gen_logits_mean": -14.221837043762207, "gen_logits_min": -26.39192771911621, "gen_logits_std": 2.841564655303955, "gen_loss": 0.2742459774017334, "grad_norm": 0.41355413354689613, "learning_rate": 2.584673684210526e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9961145669221878, "mean_gen_accuracy": 0.8599577695131302, "mean_token_accuracy": 0.8989560902118683, "num_tokens": 6463794.0, "sample_num_tokens": 10432.0, "step": 4024, "total_num_tokens": 6505522.0, "z_loss": 0.0005977934342809021 }, { "copy_logits_max": -8.411137580871582, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.0625, "epoch": 0.8220576972172581, "gen_logits_max": 5.121140480041504, "gen_logits_mean": -15.596269607543945, "gen_logits_min": -27.259498596191406, "gen_logits_std": 2.772306442260742, "gen_loss": 0.33895882964134216, "grad_norm": 0.4087844721610584, "learning_rate": 2.584547368421053e-05, "loss": 0.3165, "mean_copy_accuracy": 0.9959129840135574, "mean_gen_accuracy": 0.8580988943576813, "mean_token_accuracy": 0.8951847553253174, "num_tokens": 6744741.0, "sample_num_tokens": 7711.75, "step": 4025, "total_num_tokens": 6775588.0, "z_loss": 0.0006759942043572664 }, { "copy_logits_max": -8.064603805541992, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.0625, "epoch": 0.8222619351544549, "gen_logits_max": 6.173784255981445, "gen_logits_mean": -14.158390998840332, "gen_logits_min": -25.63100242614746, "gen_logits_std": 2.806699275970459, "gen_loss": 0.37884658575057983, "grad_norm": 0.4318035470823856, "learning_rate": 2.584421052631579e-05, "loss": 0.3304, "mean_copy_accuracy": 0.9960032105445862, "mean_gen_accuracy": 0.8590066134929657, "mean_token_accuracy": 0.8913966566324234, "num_tokens": 7024019.0, "sample_num_tokens": 7797.25, "step": 4026, "total_num_tokens": 7055208.0, "z_loss": 0.0007990187732502818 }, { "copy_logits_max": -6.8569722175598145, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.625, "epoch": 0.8224661730916518, "gen_logits_max": 5.025725364685059, "gen_logits_mean": -14.662261962890625, "gen_logits_min": -26.383256912231445, "gen_logits_std": 2.8289945125579834, "gen_loss": 0.2841646075248718, "grad_norm": 0.3894545311807396, "learning_rate": 2.5842947368421054e-05, "loss": 0.3142, "mean_copy_accuracy": 0.9960662722587585, "mean_gen_accuracy": 0.861392006278038, "mean_token_accuracy": 0.8956195265054703, "num_tokens": 7295038.0, "sample_num_tokens": 8415.0, "step": 4027, "total_num_tokens": 7328698.0, "z_loss": 0.000604343309532851 }, { "copy_logits_max": -7.820114612579346, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.9375, "epoch": 0.8226704110288486, "gen_logits_max": 5.207938194274902, "gen_logits_mean": -14.599885940551758, "gen_logits_min": -26.398143768310547, "gen_logits_std": 2.8090310096740723, "gen_loss": 0.2713613212108612, "grad_norm": 0.3929056684748653, "learning_rate": 2.5841684210526315e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9943457245826721, "mean_gen_accuracy": 0.8620668649673462, "mean_token_accuracy": 0.8972988128662109, "num_tokens": 7573317.0, "sample_num_tokens": 8259.25, "step": 4028, "total_num_tokens": 7606354.0, "z_loss": 0.0005969745106995106 }, { "copy_logits_max": -7.767088890075684, "copy_logits_min": -687500032.0, "copy_num_tokens": 288.75, "epoch": 0.8228746489660455, "gen_logits_max": 4.726279258728027, "gen_logits_mean": -14.843935012817383, "gen_logits_min": -27.023590087890625, "gen_logits_std": 2.8106517791748047, "gen_loss": 0.2931749224662781, "grad_norm": 0.40667737673017695, "learning_rate": 2.584042105263158e-05, "loss": 0.2874, "mean_copy_accuracy": 0.995021715760231, "mean_gen_accuracy": 0.873787596821785, "mean_token_accuracy": 0.9037239253520966, "num_tokens": 7844927.0, "sample_num_tokens": 6697.75, "step": 4029, "total_num_tokens": 7871718.0, "z_loss": 0.0005831079324707389 }, { "copy_logits_max": -7.946415424346924, "copy_logits_min": -750000064.0, "copy_num_tokens": 428.125, "epoch": 0.8230788869032423, "gen_logits_max": 4.591426849365234, "gen_logits_mean": -15.19755744934082, "gen_logits_min": -26.96923065185547, "gen_logits_std": 2.8173060417175293, "gen_loss": 0.2682150602340698, "grad_norm": 0.38286683945236455, "learning_rate": 2.583915789473684e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9954745024442673, "mean_gen_accuracy": 0.8685413151979446, "mean_token_accuracy": 0.9024235606193542, "num_tokens": 8123701.0, "sample_num_tokens": 7910.25, "step": 4030, "total_num_tokens": 8155342.0, "z_loss": 0.0005737633910030127 }, { "copy_logits_max": -5.995199203491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.875, "epoch": 0.8232831248404391, "gen_logits_max": 4.770813941955566, "gen_logits_mean": -12.951064109802246, "gen_logits_min": -24.85569190979004, "gen_logits_std": 2.7867555618286133, "gen_loss": 0.2854595184326172, "grad_norm": 0.38302941211771546, "learning_rate": 2.5837894736842105e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9966412037611008, "mean_gen_accuracy": 0.8638197183609009, "mean_token_accuracy": 0.9020354002714157, "num_tokens": 8408617.0, "sample_num_tokens": 8871.75, "step": 4031, "total_num_tokens": 8444104.0, "z_loss": 0.0005860802484676242 }, { "copy_logits_max": -5.673803806304932, "copy_logits_min": -750000064.0, "copy_num_tokens": 578.9375, "epoch": 0.8234873627776359, "gen_logits_max": 3.6171998977661133, "gen_logits_mean": -15.211572647094727, "gen_logits_min": -27.759824752807617, "gen_logits_std": 2.8477563858032227, "gen_loss": 0.27456995844841003, "grad_norm": 0.4089267725648035, "learning_rate": 2.583663157894737e-05, "loss": 0.3086, "mean_copy_accuracy": 0.995334804058075, "mean_gen_accuracy": 0.8653818368911743, "mean_token_accuracy": 0.8982443660497665, "num_tokens": 8683460.0, "sample_num_tokens": 8572.0, "step": 4032, "total_num_tokens": 8717748.0, "z_loss": 0.0005507394671440125 }, { "copy_logits_max": -7.769836902618408, "copy_logits_min": -750000000.0, "copy_num_tokens": 655.9375, "epoch": 0.8236916007148328, "gen_logits_max": 4.373699188232422, "gen_logits_mean": -14.422863006591797, "gen_logits_min": -26.018177032470703, "gen_logits_std": 2.769321918487549, "gen_loss": 0.3034818768501282, "grad_norm": 0.3811217633436319, "learning_rate": 2.5835368421052633e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9959000796079636, "mean_gen_accuracy": 0.8653569668531418, "mean_token_accuracy": 0.9003644734621048, "num_tokens": 8967294.0, "sample_num_tokens": 9675.0, "step": 4033, "total_num_tokens": 9005994.0, "z_loss": 0.0005810747388750315 }, { "copy_logits_max": -7.389387130737305, "copy_logits_min": -625000064.0, "copy_num_tokens": 307.5, "epoch": 0.8238958386520296, "gen_logits_max": 5.421047210693359, "gen_logits_mean": -13.993959426879883, "gen_logits_min": -25.38657569885254, "gen_logits_std": 2.7749969959259033, "gen_loss": 0.37276023626327515, "grad_norm": 0.3996201556862985, "learning_rate": 2.5834105263157898e-05, "loss": 0.324, "mean_copy_accuracy": 0.9940032660961151, "mean_gen_accuracy": 0.8648818582296371, "mean_token_accuracy": 0.8922848105430603, "num_tokens": 9232253.0, "sample_num_tokens": 7001.75, "step": 4034, "total_num_tokens": 9260260.0, "z_loss": 0.0006946763605810702 }, { "copy_logits_max": -8.258600234985352, "copy_logits_min": -750000000.0, "copy_num_tokens": 267.25, "epoch": 0.8241000765892265, "gen_logits_max": 5.1763105392456055, "gen_logits_mean": -15.515318870544434, "gen_logits_min": -26.888118743896484, "gen_logits_std": 2.7749624252319336, "gen_loss": 0.35694465041160583, "grad_norm": 0.39970882711271577, "learning_rate": 2.583284210526316e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9950310289859772, "mean_gen_accuracy": 0.8611505478620529, "mean_token_accuracy": 0.8938679397106171, "num_tokens": 9491094.0, "sample_num_tokens": 6772.5, "step": 4035, "total_num_tokens": 9518184.0, "z_loss": 0.0006812590872868896 }, { "copy_logits_max": -4.228476524353027, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.125, "epoch": 0.8243043145264233, "gen_logits_max": 4.369156837463379, "gen_logits_mean": -14.11412239074707, "gen_logits_min": -26.63127326965332, "gen_logits_std": 2.8268070220947266, "gen_loss": 0.32352471351623535, "grad_norm": 0.40212185726832506, "learning_rate": 2.5831578947368423e-05, "loss": 0.2946, "mean_copy_accuracy": 0.995516449213028, "mean_gen_accuracy": 0.8671297878026962, "mean_token_accuracy": 0.9011912047863007, "num_tokens": 9760990.0, "sample_num_tokens": 8725.0, "step": 4036, "total_num_tokens": 9795890.0, "z_loss": 0.000645583204459399 }, { "copy_logits_max": -5.290745258331299, "copy_logits_min": -562500096.0, "copy_num_tokens": 606.9375, "epoch": 0.8245085524636201, "gen_logits_max": 4.5164690017700195, "gen_logits_mean": -15.087583541870117, "gen_logits_min": -27.251665115356445, "gen_logits_std": 2.83673095703125, "gen_loss": 0.31777241826057434, "grad_norm": 0.4617836800820224, "learning_rate": 2.5830315789473684e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9958475530147552, "mean_gen_accuracy": 0.8567821681499481, "mean_token_accuracy": 0.8939577639102936, "num_tokens": 10050149.0, "sample_num_tokens": 9232.25, "step": 4037, "total_num_tokens": 10087078.0, "z_loss": 0.0006596310995519161 }, { "copy_logits_max": -6.44521427154541, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.375, "epoch": 0.8247127904008169, "gen_logits_max": 4.104611396789551, "gen_logits_mean": -15.824766159057617, "gen_logits_min": -27.59364891052246, "gen_logits_std": 2.8211984634399414, "gen_loss": 0.28001153469085693, "grad_norm": 0.37779835016561636, "learning_rate": 2.5829052631578948e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9953356981277466, "mean_gen_accuracy": 0.8707128763198853, "mean_token_accuracy": 0.9005329757928848, "num_tokens": 10324800.0, "sample_num_tokens": 7933.0, "step": 4038, "total_num_tokens": 10356532.0, "z_loss": 0.0005742318462580442 }, { "copy_logits_max": -4.290985107421875, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.5625, "epoch": 0.8249170283380138, "gen_logits_max": 5.316193103790283, "gen_logits_mean": -14.097801208496094, "gen_logits_min": -25.865680694580078, "gen_logits_std": 2.7792723178863525, "gen_loss": 0.39072340726852417, "grad_norm": 0.4026785914327838, "learning_rate": 2.582778947368421e-05, "loss": 0.3213, "mean_copy_accuracy": 0.9949261844158173, "mean_gen_accuracy": 0.8640912473201752, "mean_token_accuracy": 0.8961728066205978, "num_tokens": 10595312.0, "sample_num_tokens": 9708.0, "step": 4039, "total_num_tokens": 10634144.0, "z_loss": 0.0007745462935417891 }, { "copy_logits_max": -5.368243217468262, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.0, "epoch": 0.8251212662752107, "gen_logits_max": 4.958725929260254, "gen_logits_mean": -13.890031814575195, "gen_logits_min": -25.589767456054688, "gen_logits_std": 2.7901506423950195, "gen_loss": 0.32153263688087463, "grad_norm": 0.42778470400149105, "learning_rate": 2.5826526315789473e-05, "loss": 0.3291, "mean_copy_accuracy": 0.9949856251478195, "mean_gen_accuracy": 0.8608180284500122, "mean_token_accuracy": 0.8936198949813843, "num_tokens": 10863052.0, "sample_num_tokens": 8004.5, "step": 4040, "total_num_tokens": 10895070.0, "z_loss": 0.0006532891420647502 }, { "copy_logits_max": -8.373503684997559, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.875, "epoch": 0.8253255042124075, "gen_logits_max": 5.434924125671387, "gen_logits_mean": -14.037324905395508, "gen_logits_min": -25.914085388183594, "gen_logits_std": 2.778961658477783, "gen_loss": 0.3035891652107239, "grad_norm": 0.3897461548609837, "learning_rate": 2.5825263157894738e-05, "loss": 0.301, "mean_copy_accuracy": 0.9954680353403091, "mean_gen_accuracy": 0.8662988841533661, "mean_token_accuracy": 0.8996639102697372, "num_tokens": 11149087.0, "sample_num_tokens": 7868.75, "step": 4041, "total_num_tokens": 11180562.0, "z_loss": 0.0005751901189796627 }, { "copy_logits_max": -4.623764514923096, "copy_logits_min": -687500032.0, "copy_num_tokens": 691.4375, "epoch": 0.8255297421496043, "gen_logits_max": 4.882911205291748, "gen_logits_mean": -14.149538040161133, "gen_logits_min": -25.922897338867188, "gen_logits_std": 2.8171873092651367, "gen_loss": 0.2794637680053711, "grad_norm": 0.3922815023032862, "learning_rate": 2.5824000000000002e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9955668896436691, "mean_gen_accuracy": 0.8748866766691208, "mean_token_accuracy": 0.9042472243309021, "num_tokens": 11401152.0, "sample_num_tokens": 10301.0, "step": 4042, "total_num_tokens": 11442356.0, "z_loss": 0.0005937496898695827 }, { "copy_logits_max": -8.308399200439453, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.3125, "epoch": 0.8257339800868011, "gen_logits_max": 4.368137359619141, "gen_logits_mean": -14.593545913696289, "gen_logits_min": -26.356666564941406, "gen_logits_std": 2.822056770324707, "gen_loss": 0.2589939832687378, "grad_norm": 0.43723056701556695, "learning_rate": 2.5822736842105263e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9939357936382294, "mean_gen_accuracy": 0.8681145459413528, "mean_token_accuracy": 0.9000936895608902, "num_tokens": 11678155.0, "sample_num_tokens": 9248.25, "step": 4043, "total_num_tokens": 11715148.0, "z_loss": 0.0005224795895628631 }, { "copy_logits_max": -5.35767936706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 280.75, "epoch": 0.8259382180239979, "gen_logits_max": 5.500736236572266, "gen_logits_mean": -14.88107681274414, "gen_logits_min": -26.536394119262695, "gen_logits_std": 2.768765449523926, "gen_loss": 0.37200456857681274, "grad_norm": 0.4268404808653295, "learning_rate": 2.5821473684210527e-05, "loss": 0.3317, "mean_copy_accuracy": 0.9954556077718735, "mean_gen_accuracy": 0.8590646535158157, "mean_token_accuracy": 0.889936164021492, "num_tokens": 11935367.0, "sample_num_tokens": 7114.25, "step": 4044, "total_num_tokens": 11963824.0, "z_loss": 0.0007012228015810251 }, { "copy_logits_max": -6.256779670715332, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 0.8261424559611948, "gen_logits_max": 4.909297466278076, "gen_logits_mean": -14.523736953735352, "gen_logits_min": -26.33667755126953, "gen_logits_std": 2.7547357082366943, "gen_loss": 0.3301108777523041, "grad_norm": 0.513599914527105, "learning_rate": 2.582021052631579e-05, "loss": 0.3173, "mean_copy_accuracy": 0.9956395626068115, "mean_gen_accuracy": 0.8584559261798859, "mean_token_accuracy": 0.8937067240476608, "num_tokens": 12188815.0, "sample_num_tokens": 7786.75, "step": 4045, "total_num_tokens": 12219962.0, "z_loss": 0.0006770705804228783 }, { "copy_logits_max": -7.734057903289795, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.1875, "epoch": 0.8263466938983917, "gen_logits_max": 6.276584625244141, "gen_logits_mean": -14.423471450805664, "gen_logits_min": -25.72288703918457, "gen_logits_std": 2.750049352645874, "gen_loss": 0.35424765944480896, "grad_norm": 0.45290169457769963, "learning_rate": 2.5818947368421053e-05, "loss": 0.326, "mean_copy_accuracy": 0.9939784407615662, "mean_gen_accuracy": 0.8618970960378647, "mean_token_accuracy": 0.8922891914844513, "num_tokens": 12436712.0, "sample_num_tokens": 8484.0, "step": 4046, "total_num_tokens": 12470648.0, "z_loss": 0.0006821264396421611 }, { "copy_logits_max": -7.847929954528809, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.625, "epoch": 0.8265509318355885, "gen_logits_max": 4.44840669631958, "gen_logits_mean": -15.079142570495605, "gen_logits_min": -26.20899200439453, "gen_logits_std": 2.685781955718994, "gen_loss": 0.33873051404953003, "grad_norm": 0.40395722053642275, "learning_rate": 2.5817684210526317e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9957334399223328, "mean_gen_accuracy": 0.8626971691846848, "mean_token_accuracy": 0.8949110060930252, "num_tokens": 12705745.0, "sample_num_tokens": 8144.25, "step": 4047, "total_num_tokens": 12738322.0, "z_loss": 0.0006705967243760824 }, { "copy_logits_max": -7.343727111816406, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.625, "epoch": 0.8267551697727853, "gen_logits_max": 5.616079807281494, "gen_logits_mean": -13.527259826660156, "gen_logits_min": -24.830833435058594, "gen_logits_std": 2.706850528717041, "gen_loss": 0.28521400690078735, "grad_norm": 0.46013484591326453, "learning_rate": 2.5816421052631578e-05, "loss": 0.298, "mean_copy_accuracy": 0.9943251758813858, "mean_gen_accuracy": 0.8687977641820908, "mean_token_accuracy": 0.900734469294548, "num_tokens": 12983041.0, "sample_num_tokens": 10232.75, "step": 4048, "total_num_tokens": 13023972.0, "z_loss": 0.0006358175305649638 }, { "copy_logits_max": -5.805167198181152, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.875, "epoch": 0.8269594077099821, "gen_logits_max": 5.000547409057617, "gen_logits_mean": -15.091312408447266, "gen_logits_min": -26.48004913330078, "gen_logits_std": 2.7196478843688965, "gen_loss": 0.29713621735572815, "grad_norm": 0.3709155489951235, "learning_rate": 2.5815157894736846e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9960606843233109, "mean_gen_accuracy": 0.8677779883146286, "mean_token_accuracy": 0.9008488655090332, "num_tokens": 13265233.0, "sample_num_tokens": 7998.25, "step": 4049, "total_num_tokens": 13297226.0, "z_loss": 0.0006790061597712338 }, { "copy_logits_max": -4.910109996795654, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.25, "epoch": 0.8271636456471789, "gen_logits_max": 4.969703197479248, "gen_logits_mean": -15.61789321899414, "gen_logits_min": -27.07452392578125, "gen_logits_std": 2.780735969543457, "gen_loss": 0.3033505082130432, "grad_norm": 0.46282930452062515, "learning_rate": 2.5813894736842106e-05, "loss": 0.3367, "mean_copy_accuracy": 0.9956759065389633, "mean_gen_accuracy": 0.8579404652118683, "mean_token_accuracy": 0.8900785744190216, "num_tokens": 13518178.0, "sample_num_tokens": 9014.5, "step": 4050, "total_num_tokens": 13554236.0, "z_loss": 0.0007486274116672575 }, { "copy_logits_max": -4.321706295013428, "copy_logits_min": -750000000.0, "copy_num_tokens": 659.3125, "epoch": 0.8273678835843759, "gen_logits_max": 5.405709266662598, "gen_logits_mean": -14.199949264526367, "gen_logits_min": -25.968719482421875, "gen_logits_std": 2.7510523796081543, "gen_loss": 0.3082965910434723, "grad_norm": 0.39974665052900166, "learning_rate": 2.581263157894737e-05, "loss": 0.3164, "mean_copy_accuracy": 0.9959548264741898, "mean_gen_accuracy": 0.8638238310813904, "mean_token_accuracy": 0.8965270072221756, "num_tokens": 13779943.0, "sample_num_tokens": 10514.75, "step": 4051, "total_num_tokens": 13822002.0, "z_loss": 0.000807166681624949 }, { "copy_logits_max": -5.5470075607299805, "copy_logits_min": -750000000.0, "copy_num_tokens": 284.125, "epoch": 0.8275721215215727, "gen_logits_max": 6.445734024047852, "gen_logits_mean": -14.456367492675781, "gen_logits_min": -25.498733520507812, "gen_logits_std": 2.7061069011688232, "gen_loss": 0.33477312326431274, "grad_norm": 0.41813710741727833, "learning_rate": 2.581136842105263e-05, "loss": 0.3137, "mean_copy_accuracy": 0.9933705180883408, "mean_gen_accuracy": 0.8685020804405212, "mean_token_accuracy": 0.8962129950523376, "num_tokens": 14042953.0, "sample_num_tokens": 6721.75, "step": 4052, "total_num_tokens": 14069840.0, "z_loss": 0.0007104496471583843 }, { "copy_logits_max": -6.55328893661499, "copy_logits_min": -687500032.0, "copy_num_tokens": 511.9375, "epoch": 0.8277763594587695, "gen_logits_max": 5.735049724578857, "gen_logits_mean": -14.567482948303223, "gen_logits_min": -25.61621856689453, "gen_logits_std": 2.6931633949279785, "gen_loss": 0.30518993735313416, "grad_norm": 0.41843690268305056, "learning_rate": 2.5810105263157896e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9953922033309937, "mean_gen_accuracy": 0.8637814819812775, "mean_token_accuracy": 0.8950976729393005, "num_tokens": 14301508.0, "sample_num_tokens": 9583.0, "step": 4053, "total_num_tokens": 14339840.0, "z_loss": 0.0006508749211207032 }, { "copy_logits_max": -5.904538631439209, "copy_logits_min": -687500032.0, "copy_num_tokens": 594.875, "epoch": 0.8279805973959663, "gen_logits_max": 4.3509297370910645, "gen_logits_mean": -15.74084758758545, "gen_logits_min": -27.165874481201172, "gen_logits_std": 2.7559454441070557, "gen_loss": 0.27184587717056274, "grad_norm": 0.43308748866823965, "learning_rate": 2.5808842105263157e-05, "loss": 0.3142, "mean_copy_accuracy": 0.9953309148550034, "mean_gen_accuracy": 0.8668366968631744, "mean_token_accuracy": 0.8959236890077591, "num_tokens": 14557906.0, "sample_num_tokens": 8482.5, "step": 4054, "total_num_tokens": 14591836.0, "z_loss": 0.0006416958058252931 }, { "copy_logits_max": -7.8149824142456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.75, "epoch": 0.8281848353331631, "gen_logits_max": 4.89725923538208, "gen_logits_mean": -15.977603912353516, "gen_logits_min": -26.742219924926758, "gen_logits_std": 2.696983575820923, "gen_loss": 0.3244812488555908, "grad_norm": 0.43986664601398295, "learning_rate": 2.580757894736842e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9958376735448837, "mean_gen_accuracy": 0.8660489618778229, "mean_token_accuracy": 0.8956145495176315, "num_tokens": 14826747.0, "sample_num_tokens": 8061.75, "step": 4055, "total_num_tokens": 14858994.0, "z_loss": 0.0007288077031262219 }, { "copy_logits_max": -6.70936918258667, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.0, "epoch": 0.8283890732703599, "gen_logits_max": 4.96057653427124, "gen_logits_mean": -15.334548950195312, "gen_logits_min": -26.35357666015625, "gen_logits_std": 2.723086357116699, "gen_loss": 0.3216775059700012, "grad_norm": 0.4278158837308749, "learning_rate": 2.5806315789473682e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9955828636884689, "mean_gen_accuracy": 0.868359312415123, "mean_token_accuracy": 0.904803678393364, "num_tokens": 15115962.0, "sample_num_tokens": 7015.5, "step": 4056, "total_num_tokens": 15144024.0, "z_loss": 0.0007010565605014563 }, { "copy_logits_max": -8.121480941772461, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.375, "epoch": 0.8285933112075567, "gen_logits_max": 6.382078170776367, "gen_logits_mean": -13.15008544921875, "gen_logits_min": -24.05185890197754, "gen_logits_std": 2.6659815311431885, "gen_loss": 0.3187911808490753, "grad_norm": 0.40739076885444914, "learning_rate": 2.5805052631578946e-05, "loss": 0.313, "mean_copy_accuracy": 0.9955625385046005, "mean_gen_accuracy": 0.8605024963617325, "mean_token_accuracy": 0.8948858231306076, "num_tokens": 15391768.0, "sample_num_tokens": 8352.5, "step": 4057, "total_num_tokens": 15425178.0, "z_loss": 0.0006870675715617836 }, { "copy_logits_max": -5.507085800170898, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.25, "epoch": 0.8287975491447537, "gen_logits_max": 5.094067573547363, "gen_logits_mean": -14.52120304107666, "gen_logits_min": -25.968246459960938, "gen_logits_std": 2.712981700897217, "gen_loss": 0.3431380093097687, "grad_norm": 0.46737520605721733, "learning_rate": 2.5803789473684214e-05, "loss": 0.336, "mean_copy_accuracy": 0.995147630572319, "mean_gen_accuracy": 0.8585891574621201, "mean_token_accuracy": 0.8897797912359238, "num_tokens": 15631586.0, "sample_num_tokens": 8116.5, "step": 4058, "total_num_tokens": 15664052.0, "z_loss": 0.0007653653156012297 }, { "copy_logits_max": -5.415563583374023, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.75, "epoch": 0.8290017870819505, "gen_logits_max": 5.045605659484863, "gen_logits_mean": -14.387483596801758, "gen_logits_min": -25.785133361816406, "gen_logits_std": 2.7314748764038086, "gen_loss": 0.31280791759490967, "grad_norm": 0.39183459427397443, "learning_rate": 2.5802526315789475e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9953156858682632, "mean_gen_accuracy": 0.8736599087715149, "mean_token_accuracy": 0.9042980819940567, "num_tokens": 15886613.0, "sample_num_tokens": 8910.25, "step": 4059, "total_num_tokens": 15922254.0, "z_loss": 0.0006896094419062138 }, { "copy_logits_max": -8.586019515991211, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.3125, "epoch": 0.8292060250191473, "gen_logits_max": 4.922085762023926, "gen_logits_mean": -16.843677520751953, "gen_logits_min": -27.640907287597656, "gen_logits_std": 2.7122559547424316, "gen_loss": 0.3110939562320709, "grad_norm": 0.37432612068001087, "learning_rate": 2.580126315789474e-05, "loss": 0.2946, "mean_copy_accuracy": 0.9948148131370544, "mean_gen_accuracy": 0.8703014999628067, "mean_token_accuracy": 0.9027013331651688, "num_tokens": 16159212.0, "sample_num_tokens": 7460.5, "step": 4060, "total_num_tokens": 16189054.0, "z_loss": 0.0006132536800578237 }, { "copy_logits_max": -6.249553680419922, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.875, "epoch": 0.8294102629563441, "gen_logits_max": 5.971891403198242, "gen_logits_mean": -13.789896011352539, "gen_logits_min": -25.387535095214844, "gen_logits_std": 2.744966745376587, "gen_loss": 0.33378276228904724, "grad_norm": 0.41758113965570937, "learning_rate": 2.58e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9956542402505875, "mean_gen_accuracy": 0.8647678196430206, "mean_token_accuracy": 0.898904025554657, "num_tokens": 16440895.0, "sample_num_tokens": 8631.75, "step": 4061, "total_num_tokens": 16475422.0, "z_loss": 0.0006897314451634884 }, { "copy_logits_max": -5.019955635070801, "copy_logits_min": -625000064.0, "copy_num_tokens": 645.625, "epoch": 0.8296145008935409, "gen_logits_max": 4.817570686340332, "gen_logits_mean": -14.568963050842285, "gen_logits_min": -26.082626342773438, "gen_logits_std": 2.7336325645446777, "gen_loss": 0.3171229660511017, "grad_norm": 0.40044620138485115, "learning_rate": 2.5798736842105265e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9956679493188858, "mean_gen_accuracy": 0.8601267784833908, "mean_token_accuracy": 0.8957990258932114, "num_tokens": 16725120.0, "sample_num_tokens": 9580.5, "step": 4062, "total_num_tokens": 16763442.0, "z_loss": 0.000635708449408412 }, { "copy_logits_max": -7.191444396972656, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.875, "epoch": 0.8298187388307378, "gen_logits_max": 6.339015483856201, "gen_logits_mean": -13.95187759399414, "gen_logits_min": -25.09192657470703, "gen_logits_std": 2.680657386779785, "gen_loss": 0.34932345151901245, "grad_norm": 0.40947806870772424, "learning_rate": 2.5797473684210526e-05, "loss": 0.3314, "mean_copy_accuracy": 0.9941758364439011, "mean_gen_accuracy": 0.8646267652511597, "mean_token_accuracy": 0.8899395614862442, "num_tokens": 16985623.0, "sample_num_tokens": 7466.25, "step": 4063, "total_num_tokens": 17015488.0, "z_loss": 0.0007178448140621185 }, { "copy_logits_max": -5.9313063621521, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.875, "epoch": 0.8300229767679347, "gen_logits_max": 3.976055145263672, "gen_logits_mean": -14.86451530456543, "gen_logits_min": -26.59022331237793, "gen_logits_std": 2.755009174346924, "gen_loss": 0.3012271523475647, "grad_norm": 0.4088468938611108, "learning_rate": 2.579621052631579e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9959405809640884, "mean_gen_accuracy": 0.8637389838695526, "mean_token_accuracy": 0.9016843140125275, "num_tokens": 17256430.0, "sample_num_tokens": 8500.0, "step": 4064, "total_num_tokens": 17290430.0, "z_loss": 0.0006430830108001828 }, { "copy_logits_max": -5.203572750091553, "copy_logits_min": -625000064.0, "copy_num_tokens": 580.9375, "epoch": 0.8302272147051315, "gen_logits_max": 6.9071125984191895, "gen_logits_mean": -12.972251892089844, "gen_logits_min": -24.76337432861328, "gen_logits_std": 2.718118667602539, "gen_loss": 0.3024434447288513, "grad_norm": 0.40408734402290625, "learning_rate": 2.579494736842105e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9962936043739319, "mean_gen_accuracy": 0.8623660653829575, "mean_token_accuracy": 0.9016938805580139, "num_tokens": 17537841.0, "sample_num_tokens": 9276.75, "step": 4065, "total_num_tokens": 17574948.0, "z_loss": 0.0006577159510925412 }, { "copy_logits_max": -7.320871353149414, "copy_logits_min": -750000064.0, "copy_num_tokens": 525.3125, "epoch": 0.8304314526423283, "gen_logits_max": 4.972662448883057, "gen_logits_mean": -14.859899520874023, "gen_logits_min": -26.55754280090332, "gen_logits_std": 2.784492254257202, "gen_loss": 0.29382678866386414, "grad_norm": 0.3794273686505725, "learning_rate": 2.579368421052632e-05, "loss": 0.2776, "mean_copy_accuracy": 0.995180293917656, "mean_gen_accuracy": 0.8784826248884201, "mean_token_accuracy": 0.9075212627649307, "num_tokens": 17820918.0, "sample_num_tokens": 8987.5, "step": 4066, "total_num_tokens": 17856868.0, "z_loss": 0.0005965229938738048 }, { "copy_logits_max": -6.516576290130615, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.5, "epoch": 0.8306356905795251, "gen_logits_max": 4.585844993591309, "gen_logits_mean": -16.156280517578125, "gen_logits_min": -27.560081481933594, "gen_logits_std": 2.7753870487213135, "gen_loss": 0.27991142868995667, "grad_norm": 0.4146623376960588, "learning_rate": 2.579242105263158e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9944683313369751, "mean_gen_accuracy": 0.8713746964931488, "mean_token_accuracy": 0.9026453793048859, "num_tokens": 18095504.0, "sample_num_tokens": 8748.0, "step": 4067, "total_num_tokens": 18130496.0, "z_loss": 0.0005707830423489213 }, { "copy_logits_max": -6.5416131019592285, "copy_logits_min": -750000064.0, "copy_num_tokens": 514.6875, "epoch": 0.830839928516722, "gen_logits_max": 4.250051021575928, "gen_logits_mean": -16.220129013061523, "gen_logits_min": -27.88287353515625, "gen_logits_std": 2.8351821899414062, "gen_loss": 0.320083886384964, "grad_norm": 0.3907780769663081, "learning_rate": 2.5791157894736844e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9952645301818848, "mean_gen_accuracy": 0.8613697588443756, "mean_token_accuracy": 0.8966624885797501, "num_tokens": 18379408.0, "sample_num_tokens": 8972.5, "step": 4068, "total_num_tokens": 18415298.0, "z_loss": 0.0006356599042192101 }, { "copy_logits_max": -4.733334541320801, "copy_logits_min": -625000064.0, "copy_num_tokens": 611.3125, "epoch": 0.8310441664539188, "gen_logits_max": 4.88348388671875, "gen_logits_mean": -15.092865943908691, "gen_logits_min": -27.225475311279297, "gen_logits_std": 2.8445043563842773, "gen_loss": 0.3335645794868469, "grad_norm": 0.3760412757560617, "learning_rate": 2.5789894736842105e-05, "loss": 0.3168, "mean_copy_accuracy": 0.9949060678482056, "mean_gen_accuracy": 0.8645581901073456, "mean_token_accuracy": 0.8949727267026901, "num_tokens": 18677302.0, "sample_num_tokens": 9620.5, "step": 4069, "total_num_tokens": 18715784.0, "z_loss": 0.0007390195387415588 }, { "copy_logits_max": -6.901995658874512, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.9375, "epoch": 0.8312484043911157, "gen_logits_max": 6.473659515380859, "gen_logits_mean": -14.259578704833984, "gen_logits_min": -25.751968383789062, "gen_logits_std": 2.7400078773498535, "gen_loss": 0.30761075019836426, "grad_norm": 0.39841397281332613, "learning_rate": 2.578863157894737e-05, "loss": 0.311, "mean_copy_accuracy": 0.9954090565443039, "mean_gen_accuracy": 0.8665026128292084, "mean_token_accuracy": 0.896694004535675, "num_tokens": 18938489.0, "sample_num_tokens": 8125.75, "step": 4070, "total_num_tokens": 18970992.0, "z_loss": 0.0006858768174424767 }, { "copy_logits_max": -6.674063682556152, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.25, "epoch": 0.8314526423283125, "gen_logits_max": 5.470636367797852, "gen_logits_mean": -13.407419204711914, "gen_logits_min": -24.787879943847656, "gen_logits_std": 2.7636752128601074, "gen_loss": 0.3234966993331909, "grad_norm": 0.413051113033524, "learning_rate": 2.578736842105263e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9953630864620209, "mean_gen_accuracy": 0.8679953366518021, "mean_token_accuracy": 0.8990748226642609, "num_tokens": 19214108.0, "sample_num_tokens": 8185.5, "step": 4071, "total_num_tokens": 19246850.0, "z_loss": 0.0006456429837271571 }, { "copy_logits_max": -7.185365200042725, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.8125, "epoch": 0.8316568802655093, "gen_logits_max": 4.844750881195068, "gen_logits_mean": -14.776152610778809, "gen_logits_min": -26.30747413635254, "gen_logits_std": 2.7703709602355957, "gen_loss": 0.2972361743450165, "grad_norm": 0.3902780106595853, "learning_rate": 2.5786105263157894e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9952058494091034, "mean_gen_accuracy": 0.8650956600904465, "mean_token_accuracy": 0.8986109793186188, "num_tokens": 19489008.0, "sample_num_tokens": 8804.0, "step": 4072, "total_num_tokens": 19524224.0, "z_loss": 0.0005659430753439665 }, { "copy_logits_max": -5.8321332931518555, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.875, "epoch": 0.8318611182027061, "gen_logits_max": 6.106679439544678, "gen_logits_mean": -13.776422500610352, "gen_logits_min": -25.27248764038086, "gen_logits_std": 2.7751221656799316, "gen_loss": 0.347505122423172, "grad_norm": 0.40662092923524307, "learning_rate": 2.578484210526316e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9948637783527374, "mean_gen_accuracy": 0.8632455766201019, "mean_token_accuracy": 0.8988100290298462, "num_tokens": 19774970.0, "sample_num_tokens": 7793.0, "step": 4073, "total_num_tokens": 19806142.0, "z_loss": 0.0006989246467128396 }, { "copy_logits_max": -6.646352291107178, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.8125, "epoch": 0.832065356139903, "gen_logits_max": 5.157622337341309, "gen_logits_mean": -13.67525863647461, "gen_logits_min": -25.277462005615234, "gen_logits_std": 2.7825961112976074, "gen_loss": 0.30745992064476013, "grad_norm": 0.39350421106671574, "learning_rate": 2.5783578947368423e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9939956516027451, "mean_gen_accuracy": 0.8705544322729111, "mean_token_accuracy": 0.8995191156864166, "num_tokens": 20045687.0, "sample_num_tokens": 8482.75, "step": 4074, "total_num_tokens": 20079618.0, "z_loss": 0.000615114055108279 }, { "copy_logits_max": -5.399114608764648, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.3125, "epoch": 0.8322695940770998, "gen_logits_max": 5.328151226043701, "gen_logits_mean": -13.587258338928223, "gen_logits_min": -25.431568145751953, "gen_logits_std": 2.755249261856079, "gen_loss": 0.3330742120742798, "grad_norm": 0.45624745703837233, "learning_rate": 2.5782315789473687e-05, "loss": 0.3481, "mean_copy_accuracy": 0.9959293454885483, "mean_gen_accuracy": 0.8532534688711166, "mean_token_accuracy": 0.8841997981071472, "num_tokens": 20311250.0, "sample_num_tokens": 7922.5, "step": 4075, "total_num_tokens": 20342940.0, "z_loss": 0.0006402496946975589 }, { "copy_logits_max": -5.958573818206787, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.375, "epoch": 0.8324738320142967, "gen_logits_max": 4.777410507202148, "gen_logits_mean": -14.743463516235352, "gen_logits_min": -26.451004028320312, "gen_logits_std": 2.7822747230529785, "gen_loss": 0.3300042152404785, "grad_norm": 0.40672058232104835, "learning_rate": 2.5781052631578948e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9954551309347153, "mean_gen_accuracy": 0.861587181687355, "mean_token_accuracy": 0.8907694667577744, "num_tokens": 20575609.0, "sample_num_tokens": 7449.25, "step": 4076, "total_num_tokens": 20605406.0, "z_loss": 0.0006865221075713634 }, { "copy_logits_max": -5.004483222961426, "copy_logits_min": -750000000.0, "copy_num_tokens": 672.5, "epoch": 0.8326780699514935, "gen_logits_max": 4.432606220245361, "gen_logits_mean": -15.08328628540039, "gen_logits_min": -26.73438262939453, "gen_logits_std": 2.7898573875427246, "gen_loss": 0.2626829743385315, "grad_norm": 0.37480268200345596, "learning_rate": 2.5779789473684212e-05, "loss": 0.2939, "mean_copy_accuracy": 0.995827853679657, "mean_gen_accuracy": 0.8653511554002762, "mean_token_accuracy": 0.9025245904922485, "num_tokens": 20843353.0, "sample_num_tokens": 9754.25, "step": 4077, "total_num_tokens": 20882370.0, "z_loss": 0.0006018716958351433 }, { "copy_logits_max": -4.883293151855469, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.1875, "epoch": 0.8328823078886903, "gen_logits_max": 5.08161735534668, "gen_logits_mean": -14.012125015258789, "gen_logits_min": -26.432790756225586, "gen_logits_std": 2.810736894607544, "gen_loss": 0.34172993898391724, "grad_norm": 0.3856003598087107, "learning_rate": 2.5778526315789473e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9960341900587082, "mean_gen_accuracy": 0.869130328297615, "mean_token_accuracy": 0.9022878110408783, "num_tokens": 21139067.0, "sample_num_tokens": 7492.25, "step": 4078, "total_num_tokens": 21169036.0, "z_loss": 0.0007414341089315712 }, { "copy_logits_max": -6.111490249633789, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.125, "epoch": 0.8330865458258871, "gen_logits_max": 5.0674614906311035, "gen_logits_mean": -15.577037811279297, "gen_logits_min": -27.42559242248535, "gen_logits_std": 2.788515567779541, "gen_loss": 0.35929805040359497, "grad_norm": 0.40657084126210125, "learning_rate": 2.5777263157894738e-05, "loss": 0.326, "mean_copy_accuracy": 0.9954183548688889, "mean_gen_accuracy": 0.8608765751123428, "mean_token_accuracy": 0.890898808836937, "num_tokens": 21399190.0, "sample_num_tokens": 8218.0, "step": 4079, "total_num_tokens": 21432062.0, "z_loss": 0.0007001639460213482 }, { "copy_logits_max": -3.1463639736175537, "copy_logits_min": -687500032.0, "copy_num_tokens": 598.1875, "epoch": 0.833290783763084, "gen_logits_max": 3.5396342277526855, "gen_logits_mean": -15.285916328430176, "gen_logits_min": -27.32464599609375, "gen_logits_std": 2.8185625076293945, "gen_loss": 0.2959418296813965, "grad_norm": 0.42595707666890636, "learning_rate": 2.5776e-05, "loss": 0.3076, "mean_copy_accuracy": 0.9959504753351212, "mean_gen_accuracy": 0.8648914992809296, "mean_token_accuracy": 0.9000949412584305, "num_tokens": 21679477.0, "sample_num_tokens": 8432.75, "step": 4080, "total_num_tokens": 21713208.0, "z_loss": 0.0006810048362240195 }, { "copy_logits_max": -4.899800777435303, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.75, "epoch": 0.8334950217002808, "gen_logits_max": 4.242259502410889, "gen_logits_mean": -14.837908744812012, "gen_logits_min": -26.5584716796875, "gen_logits_std": 2.7705655097961426, "gen_loss": 0.2543947100639343, "grad_norm": 0.3934024333423715, "learning_rate": 2.5774736842105263e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9961516410112381, "mean_gen_accuracy": 0.8686519861221313, "mean_token_accuracy": 0.9035236686468124, "num_tokens": 21950761.0, "sample_num_tokens": 10195.75, "step": 4081, "total_num_tokens": 21991544.0, "z_loss": 0.0006060382584109902 }, { "copy_logits_max": -5.819320201873779, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.75, "epoch": 0.8336992596374777, "gen_logits_max": 5.34135627746582, "gen_logits_mean": -13.551528930664062, "gen_logits_min": -25.261457443237305, "gen_logits_std": 2.757535934448242, "gen_loss": 0.33938631415367126, "grad_norm": 0.5248798013782259, "learning_rate": 2.5773473684210527e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9956631660461426, "mean_gen_accuracy": 0.8625745326280594, "mean_token_accuracy": 0.895841121673584, "num_tokens": 22249328.0, "sample_num_tokens": 7456.5, "step": 4082, "total_num_tokens": 22279154.0, "z_loss": 0.0006661989027634263 }, { "copy_logits_max": -6.423762798309326, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.875, "epoch": 0.8339034975746745, "gen_logits_max": 3.9116320610046387, "gen_logits_mean": -15.969036102294922, "gen_logits_min": -27.767030715942383, "gen_logits_std": 2.8087875843048096, "gen_loss": 0.27824968099594116, "grad_norm": 0.3930683862118251, "learning_rate": 2.577221052631579e-05, "loss": 0.3082, "mean_copy_accuracy": 0.9960713386535645, "mean_gen_accuracy": 0.8624101877212524, "mean_token_accuracy": 0.8957479745149612, "num_tokens": 22508698.0, "sample_num_tokens": 8458.0, "step": 4083, "total_num_tokens": 22542530.0, "z_loss": 0.0005365880206227303 }, { "copy_logits_max": -7.4845685958862305, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.375, "epoch": 0.8341077355118713, "gen_logits_max": 5.375120162963867, "gen_logits_mean": -13.915066719055176, "gen_logits_min": -25.704404830932617, "gen_logits_std": 2.7719316482543945, "gen_loss": 0.302359938621521, "grad_norm": 0.3934584240637448, "learning_rate": 2.5770947368421052e-05, "loss": 0.3045, "mean_copy_accuracy": 0.995822548866272, "mean_gen_accuracy": 0.8666650801897049, "mean_token_accuracy": 0.9020094126462936, "num_tokens": 22771999.0, "sample_num_tokens": 7672.75, "step": 4084, "total_num_tokens": 22802690.0, "z_loss": 0.0006123340572230518 }, { "copy_logits_max": -8.94887924194336, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.5, "epoch": 0.8343119734490682, "gen_logits_max": 4.2205119132995605, "gen_logits_mean": -15.703231811523438, "gen_logits_min": -27.475168228149414, "gen_logits_std": 2.8066916465759277, "gen_loss": 0.27771076560020447, "grad_norm": 0.3766866614028172, "learning_rate": 2.5769684210526317e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9959404617547989, "mean_gen_accuracy": 0.8753147572278976, "mean_token_accuracy": 0.9077802896499634, "num_tokens": 23072002.0, "sample_num_tokens": 9536.5, "step": 4085, "total_num_tokens": 23110148.0, "z_loss": 0.0005323275690898299 }, { "copy_logits_max": -6.359349250793457, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.25, "epoch": 0.834516211386265, "gen_logits_max": 3.91123628616333, "gen_logits_mean": -15.318851470947266, "gen_logits_min": -26.99600601196289, "gen_logits_std": 2.808269739151001, "gen_loss": 0.26096534729003906, "grad_norm": 0.39924064797516806, "learning_rate": 2.576842105263158e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9951367676258087, "mean_gen_accuracy": 0.8663156032562256, "mean_token_accuracy": 0.8989754915237427, "num_tokens": 23338470.0, "sample_num_tokens": 8546.0, "step": 4086, "total_num_tokens": 23372654.0, "z_loss": 0.0005587564082816243 }, { "copy_logits_max": -7.807868480682373, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.4375, "epoch": 0.8347204493234618, "gen_logits_max": 4.546714782714844, "gen_logits_mean": -14.53907585144043, "gen_logits_min": -26.40867042541504, "gen_logits_std": 2.7984347343444824, "gen_loss": 0.2893393635749817, "grad_norm": 0.3967443052434485, "learning_rate": 2.5767157894736842e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9958045929670334, "mean_gen_accuracy": 0.8666501045227051, "mean_token_accuracy": 0.8990149796009064, "num_tokens": 23608299.0, "sample_num_tokens": 7014.25, "step": 4087, "total_num_tokens": 23636356.0, "z_loss": 0.0005786798428744078 }, { "copy_logits_max": -8.962023735046387, "copy_logits_min": -625000064.0, "copy_num_tokens": 374.125, "epoch": 0.8349246872606587, "gen_logits_max": 5.5014448165893555, "gen_logits_mean": -14.077800750732422, "gen_logits_min": -26.059444427490234, "gen_logits_std": 2.7687559127807617, "gen_loss": 0.30693838000297546, "grad_norm": 0.4203320203222666, "learning_rate": 2.5765894736842106e-05, "loss": 0.3189, "mean_copy_accuracy": 0.994982898235321, "mean_gen_accuracy": 0.8675389140844345, "mean_token_accuracy": 0.8932007849216461, "num_tokens": 23854915.0, "sample_num_tokens": 7717.25, "step": 4088, "total_num_tokens": 23885784.0, "z_loss": 0.000568175339139998 }, { "copy_logits_max": -6.992725849151611, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.4375, "epoch": 0.8351289251978555, "gen_logits_max": 3.907726764678955, "gen_logits_mean": -15.986729621887207, "gen_logits_min": -27.5228214263916, "gen_logits_std": 2.766911745071411, "gen_loss": 0.32004886865615845, "grad_norm": 0.37487673622742645, "learning_rate": 2.5764631578947367e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9947165697813034, "mean_gen_accuracy": 0.8674430400133133, "mean_token_accuracy": 0.8984982222318649, "num_tokens": 24120010.0, "sample_num_tokens": 8378.0, "step": 4089, "total_num_tokens": 24153522.0, "z_loss": 0.0006050476804375648 }, { "copy_logits_max": -4.963393211364746, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.375, "epoch": 0.8353331631350523, "gen_logits_max": 4.661060333251953, "gen_logits_mean": -14.198287963867188, "gen_logits_min": -25.82166862487793, "gen_logits_std": 2.76369571685791, "gen_loss": 0.28860628604888916, "grad_norm": 0.45579593621966535, "learning_rate": 2.5763368421052635e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9950670152902603, "mean_gen_accuracy": 0.8667261153459549, "mean_token_accuracy": 0.8969800472259521, "num_tokens": 24386632.0, "sample_num_tokens": 7750.0, "step": 4090, "total_num_tokens": 24417632.0, "z_loss": 0.0005652376567013562 }, { "copy_logits_max": -9.413722038269043, "copy_logits_min": -750000064.0, "copy_num_tokens": 340.75, "epoch": 0.8355374010722492, "gen_logits_max": 5.761386871337891, "gen_logits_mean": -14.92868423461914, "gen_logits_min": -26.30602264404297, "gen_logits_std": 2.7499947547912598, "gen_loss": 0.29832935333251953, "grad_norm": 0.4746796457394165, "learning_rate": 2.5762105263157896e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9952593892812729, "mean_gen_accuracy": 0.8700605630874634, "mean_token_accuracy": 0.896298736333847, "num_tokens": 24637233.0, "sample_num_tokens": 8480.75, "step": 4091, "total_num_tokens": 24671156.0, "z_loss": 0.0005840889643877745 }, { "copy_logits_max": -7.437196731567383, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.125, "epoch": 0.835741639009446, "gen_logits_max": 4.611639022827148, "gen_logits_mean": -14.975399017333984, "gen_logits_min": -26.73542022705078, "gen_logits_std": 2.7826437950134277, "gen_loss": 0.29585111141204834, "grad_norm": 0.3882939848770283, "learning_rate": 2.576084210526316e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9945449978113174, "mean_gen_accuracy": 0.875939130783081, "mean_token_accuracy": 0.903398796916008, "num_tokens": 24896252.0, "sample_num_tokens": 7272.5, "step": 4092, "total_num_tokens": 24925342.0, "z_loss": 0.0006219244096428156 }, { "copy_logits_max": -7.267642498016357, "copy_logits_min": -750000000.0, "copy_num_tokens": 738.9375, "epoch": 0.8359458769466428, "gen_logits_max": 4.295444011688232, "gen_logits_mean": -14.06275463104248, "gen_logits_min": -26.451406478881836, "gen_logits_std": 2.8168435096740723, "gen_loss": 0.27165961265563965, "grad_norm": 0.3916996824145201, "learning_rate": 2.575957894736842e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9941522777080536, "mean_gen_accuracy": 0.8704923093318939, "mean_token_accuracy": 0.9016474038362503, "num_tokens": 25158562.0, "sample_num_tokens": 10324.5, "step": 4093, "total_num_tokens": 25199860.0, "z_loss": 0.0005724088987335563 }, { "copy_logits_max": -7.704504489898682, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.0, "epoch": 0.8361501148838397, "gen_logits_max": 5.794787406921387, "gen_logits_mean": -14.11727523803711, "gen_logits_min": -25.587833404541016, "gen_logits_std": 2.74861216545105, "gen_loss": 0.3127761483192444, "grad_norm": 0.560206046010669, "learning_rate": 2.5758315789473685e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9960027635097504, "mean_gen_accuracy": 0.8699688166379929, "mean_token_accuracy": 0.9014100432395935, "num_tokens": 25416634.0, "sample_num_tokens": 8206.5, "step": 4094, "total_num_tokens": 25449460.0, "z_loss": 0.0006812131032347679 }, { "copy_logits_max": -6.0691728591918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.875, "epoch": 0.8363543528210365, "gen_logits_max": 4.738800048828125, "gen_logits_mean": -15.272053718566895, "gen_logits_min": -27.0181827545166, "gen_logits_std": 2.7642970085144043, "gen_loss": 0.3334735929965973, "grad_norm": 0.4445519348701012, "learning_rate": 2.5757052631578946e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9939574748277664, "mean_gen_accuracy": 0.8658999353647232, "mean_token_accuracy": 0.8972917795181274, "num_tokens": 25679084.0, "sample_num_tokens": 6740.0, "step": 4095, "total_num_tokens": 25706044.0, "z_loss": 0.0006320343818515539 }, { "copy_logits_max": -7.477301597595215, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.875, "epoch": 0.8365585907582334, "gen_logits_max": 5.357157230377197, "gen_logits_mean": -14.581107139587402, "gen_logits_min": -26.107149124145508, "gen_logits_std": 2.7660255432128906, "gen_loss": 0.3064834475517273, "grad_norm": 0.392785133668622, "learning_rate": 2.575578947368421e-05, "loss": 0.3166, "mean_copy_accuracy": 0.9953896254301071, "mean_gen_accuracy": 0.8662685006856918, "mean_token_accuracy": 0.8954468816518784, "num_tokens": 25938967.0, "sample_num_tokens": 9699.75, "step": 4096, "total_num_tokens": 25977766.0, "z_loss": 0.0005795547040179372 }, { "copy_logits_max": -6.556138038635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.75, "epoch": 0.8367628286954302, "gen_logits_max": 4.000907897949219, "gen_logits_mean": -15.069061279296875, "gen_logits_min": -26.84701156616211, "gen_logits_std": 2.795336961746216, "gen_loss": 0.3161412477493286, "grad_norm": 0.4409412551061415, "learning_rate": 2.575452631578947e-05, "loss": 0.3145, "mean_copy_accuracy": 0.9931048005819321, "mean_gen_accuracy": 0.8645453900098801, "mean_token_accuracy": 0.894893079996109, "num_tokens": 26198635.0, "sample_num_tokens": 8405.25, "step": 4097, "total_num_tokens": 26232256.0, "z_loss": 0.0005695960717275739 }, { "copy_logits_max": -5.441990375518799, "copy_logits_min": -625000064.0, "copy_num_tokens": 631.8125, "epoch": 0.836967066632627, "gen_logits_max": 3.754391670227051, "gen_logits_mean": -15.161935806274414, "gen_logits_min": -26.776264190673828, "gen_logits_std": 2.769383430480957, "gen_loss": 0.3182832598686218, "grad_norm": 0.407297261233481, "learning_rate": 2.575326315789474e-05, "loss": 0.313, "mean_copy_accuracy": 0.9959090203046799, "mean_gen_accuracy": 0.8670927435159683, "mean_token_accuracy": 0.8957649767398834, "num_tokens": 26447180.0, "sample_num_tokens": 10139.5, "step": 4098, "total_num_tokens": 26487738.0, "z_loss": 0.0006697421777062118 }, { "copy_logits_max": -5.449253082275391, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.0625, "epoch": 0.8371713045698238, "gen_logits_max": 4.340685844421387, "gen_logits_mean": -15.944941520690918, "gen_logits_min": -27.548267364501953, "gen_logits_std": 2.795167922973633, "gen_loss": 0.36003172397613525, "grad_norm": 0.43360200306422075, "learning_rate": 2.5752000000000003e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9957727342844009, "mean_gen_accuracy": 0.8621906340122223, "mean_token_accuracy": 0.897669792175293, "num_tokens": 26722165.0, "sample_num_tokens": 6803.75, "step": 4099, "total_num_tokens": 26749380.0, "z_loss": 0.000685444800183177 }, { "copy_logits_max": -5.524386405944824, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.75, "epoch": 0.8373755425070207, "gen_logits_max": 4.794632434844971, "gen_logits_mean": -14.151857376098633, "gen_logits_min": -26.83470916748047, "gen_logits_std": 2.7774851322174072, "gen_loss": 0.3180055618286133, "grad_norm": 0.401732677242338, "learning_rate": 2.5750736842105264e-05, "loss": 0.3353, "mean_copy_accuracy": 0.9955763518810272, "mean_gen_accuracy": 0.8584299236536026, "mean_token_accuracy": 0.8892493844032288, "num_tokens": 26983925.0, "sample_num_tokens": 8423.25, "step": 4100, "total_num_tokens": 27017618.0, "z_loss": 0.0006024684407748282 }, { "copy_logits_max": -5.405805587768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.6875, "epoch": 0.8375797804442175, "gen_logits_max": 3.834636688232422, "gen_logits_mean": -15.206823348999023, "gen_logits_min": -26.986270904541016, "gen_logits_std": 2.7979209423065186, "gen_loss": 0.31768807768821716, "grad_norm": 0.45099947251691574, "learning_rate": 2.574947368421053e-05, "loss": 0.318, "mean_copy_accuracy": 0.9950079917907715, "mean_gen_accuracy": 0.856402263045311, "mean_token_accuracy": 0.8958685845136642, "num_tokens": 27254385.0, "sample_num_tokens": 8807.25, "step": 4101, "total_num_tokens": 27289614.0, "z_loss": 0.000613540003541857 }, { "copy_logits_max": -4.337687969207764, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.25, "epoch": 0.8377840183814144, "gen_logits_max": 4.337882995605469, "gen_logits_mean": -15.243810653686523, "gen_logits_min": -27.33542251586914, "gen_logits_std": 2.7999916076660156, "gen_loss": 0.3198130130767822, "grad_norm": 0.45140179704593036, "learning_rate": 2.574821052631579e-05, "loss": 0.319, "mean_copy_accuracy": 0.9947402477264404, "mean_gen_accuracy": 0.8586605340242386, "mean_token_accuracy": 0.8939918428659439, "num_tokens": 27529217.0, "sample_num_tokens": 9238.25, "step": 4102, "total_num_tokens": 27566170.0, "z_loss": 0.0006445899489335716 }, { "copy_logits_max": -2.9667797088623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.0, "epoch": 0.8379882563186112, "gen_logits_max": 4.840333938598633, "gen_logits_mean": -14.277992248535156, "gen_logits_min": -26.280818939208984, "gen_logits_std": 2.7597782611846924, "gen_loss": 0.2980875074863434, "grad_norm": 0.4102427999970942, "learning_rate": 2.5746947368421054e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9946914464235306, "mean_gen_accuracy": 0.8722975105047226, "mean_token_accuracy": 0.9001195132732391, "num_tokens": 27815343.0, "sample_num_tokens": 7663.25, "step": 4103, "total_num_tokens": 27845996.0, "z_loss": 0.0006966508226469159 }, { "copy_logits_max": -5.014248371124268, "copy_logits_min": -750000064.0, "copy_num_tokens": 286.25, "epoch": 0.838192494255808, "gen_logits_max": 5.0394110679626465, "gen_logits_mean": -13.949040412902832, "gen_logits_min": -25.337604522705078, "gen_logits_std": 2.7108514308929443, "gen_loss": 0.33802706003189087, "grad_norm": 0.42967909189426085, "learning_rate": 2.5745684210526315e-05, "loss": 0.332, "mean_copy_accuracy": 0.9929088205099106, "mean_gen_accuracy": 0.8603779673576355, "mean_token_accuracy": 0.8881950974464417, "num_tokens": 28074728.0, "sample_num_tokens": 6951.5, "step": 4104, "total_num_tokens": 28102534.0, "z_loss": 0.0006684048566967249 }, { "copy_logits_max": -6.737704277038574, "copy_logits_min": -750000000.0, "copy_num_tokens": 225.5, "epoch": 0.8383967321930048, "gen_logits_max": 5.7917280197143555, "gen_logits_mean": -14.361959457397461, "gen_logits_min": -25.60424041748047, "gen_logits_std": 2.694150924682617, "gen_loss": 0.28947702050209045, "grad_norm": 0.3885062127057926, "learning_rate": 2.574442105263158e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9948441535234451, "mean_gen_accuracy": 0.8694356083869934, "mean_token_accuracy": 0.8981800377368927, "num_tokens": 28340753.0, "sample_num_tokens": 6488.25, "step": 4105, "total_num_tokens": 28366706.0, "z_loss": 0.0005810903385281563 }, { "copy_logits_max": -7.078888893127441, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.4375, "epoch": 0.8386009701302017, "gen_logits_max": 4.7167253494262695, "gen_logits_mean": -15.628091812133789, "gen_logits_min": -26.777997970581055, "gen_logits_std": 2.6813693046569824, "gen_loss": 0.3404962420463562, "grad_norm": 0.4439149739723983, "learning_rate": 2.574315789473684e-05, "loss": 0.3158, "mean_copy_accuracy": 0.9937665164470673, "mean_gen_accuracy": 0.8681227713823318, "mean_token_accuracy": 0.8959702402353287, "num_tokens": 28567257.0, "sample_num_tokens": 7655.25, "step": 4106, "total_num_tokens": 28597878.0, "z_loss": 0.000705107522662729 }, { "copy_logits_max": -4.41245174407959, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.9375, "epoch": 0.8388052080673986, "gen_logits_max": 4.709683895111084, "gen_logits_mean": -14.198269844055176, "gen_logits_min": -25.594623565673828, "gen_logits_std": 2.7172207832336426, "gen_loss": 0.31383728981018066, "grad_norm": 0.3783810427453129, "learning_rate": 2.5741894736842108e-05, "loss": 0.307, "mean_copy_accuracy": 0.9941771030426025, "mean_gen_accuracy": 0.8663661181926727, "mean_token_accuracy": 0.8956183642148972, "num_tokens": 28834261.0, "sample_num_tokens": 8078.75, "step": 4107, "total_num_tokens": 28866576.0, "z_loss": 0.0006685618427582085 }, { "copy_logits_max": -6.061387062072754, "copy_logits_min": -687500032.0, "copy_num_tokens": 422.375, "epoch": 0.8390094460045954, "gen_logits_max": 5.152779579162598, "gen_logits_mean": -13.774576187133789, "gen_logits_min": -25.281810760498047, "gen_logits_std": 2.7029178142547607, "gen_loss": 0.35064810514450073, "grad_norm": 0.37846336843750067, "learning_rate": 2.574063157894737e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9943094104528427, "mean_gen_accuracy": 0.8647670149803162, "mean_token_accuracy": 0.8947819620370865, "num_tokens": 29109024.0, "sample_num_tokens": 8259.5, "step": 4108, "total_num_tokens": 29142062.0, "z_loss": 0.0007728753262199461 }, { "copy_logits_max": -6.4478864669799805, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.6875, "epoch": 0.8392136839417922, "gen_logits_max": 6.325619697570801, "gen_logits_mean": -14.131872177124023, "gen_logits_min": -25.593385696411133, "gen_logits_std": 2.745955228805542, "gen_loss": 0.3268465995788574, "grad_norm": 0.42705877780190277, "learning_rate": 2.5739368421052633e-05, "loss": 0.315, "mean_copy_accuracy": 0.9934836179018021, "mean_gen_accuracy": 0.8665882349014282, "mean_token_accuracy": 0.8962213695049286, "num_tokens": 29378223.0, "sample_num_tokens": 7477.25, "step": 4109, "total_num_tokens": 29408132.0, "z_loss": 0.0006959144957363605 }, { "copy_logits_max": -6.251936435699463, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.5625, "epoch": 0.839417921878989, "gen_logits_max": 5.410151481628418, "gen_logits_mean": -14.501397132873535, "gen_logits_min": -25.533348083496094, "gen_logits_std": 2.714815616607666, "gen_loss": 0.3490464687347412, "grad_norm": 0.398212723377496, "learning_rate": 2.5738105263157894e-05, "loss": 0.3051, "mean_copy_accuracy": 0.9959367215633392, "mean_gen_accuracy": 0.8668407797813416, "mean_token_accuracy": 0.8980846852064133, "num_tokens": 29646987.0, "sample_num_tokens": 7144.75, "step": 4110, "total_num_tokens": 29675566.0, "z_loss": 0.0007561126258224249 }, { "copy_logits_max": -6.08980131149292, "copy_logits_min": -750000000.0, "copy_num_tokens": 279.8125, "epoch": 0.8396221598161858, "gen_logits_max": 5.447054386138916, "gen_logits_mean": -14.626482009887695, "gen_logits_min": -25.479291915893555, "gen_logits_std": 2.6830501556396484, "gen_loss": 0.3723793029785156, "grad_norm": 0.3828203394300611, "learning_rate": 2.5736842105263158e-05, "loss": 0.3212, "mean_copy_accuracy": 0.9963951408863068, "mean_gen_accuracy": 0.8605419993400574, "mean_token_accuracy": 0.8943590968847275, "num_tokens": 29929929.0, "sample_num_tokens": 7767.75, "step": 4111, "total_num_tokens": 29961000.0, "z_loss": 0.0007519861683249474 }, { "copy_logits_max": -6.856215476989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.125, "epoch": 0.8398263977533826, "gen_logits_max": 4.9921417236328125, "gen_logits_mean": -14.178979873657227, "gen_logits_min": -25.317424774169922, "gen_logits_std": 2.7196197509765625, "gen_loss": 0.3236445188522339, "grad_norm": 0.4154649334106002, "learning_rate": 2.5735578947368423e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9939195066690445, "mean_gen_accuracy": 0.8701288849115372, "mean_token_accuracy": 0.9016852378845215, "num_tokens": 30230263.0, "sample_num_tokens": 8480.75, "step": 4112, "total_num_tokens": 30264186.0, "z_loss": 0.0006098649464547634 }, { "copy_logits_max": -8.001896858215332, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.75, "epoch": 0.8400306356905796, "gen_logits_max": 5.918031692504883, "gen_logits_mean": -12.769649505615234, "gen_logits_min": -24.57056999206543, "gen_logits_std": 2.7354495525360107, "gen_loss": 0.29563194513320923, "grad_norm": 0.39613890414617486, "learning_rate": 2.5734315789473684e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9936874359846115, "mean_gen_accuracy": 0.8675063401460648, "mean_token_accuracy": 0.8972917646169662, "num_tokens": 30515115.0, "sample_num_tokens": 9894.75, "step": 4113, "total_num_tokens": 30554694.0, "z_loss": 0.0005906032747589052 }, { "copy_logits_max": -6.2219133377075195, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.5625, "epoch": 0.8402348736277764, "gen_logits_max": 4.286576271057129, "gen_logits_mean": -14.911776542663574, "gen_logits_min": -26.645862579345703, "gen_logits_std": 2.7773447036743164, "gen_loss": 0.2663741409778595, "grad_norm": 0.39391142012478225, "learning_rate": 2.5733052631578948e-05, "loss": 0.311, "mean_copy_accuracy": 0.9957717508077621, "mean_gen_accuracy": 0.8655694723129272, "mean_token_accuracy": 0.89499631524086, "num_tokens": 30795545.0, "sample_num_tokens": 8910.75, "step": 4114, "total_num_tokens": 30831188.0, "z_loss": 0.000559195876121521 }, { "copy_logits_max": -6.097386360168457, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.0, "epoch": 0.8404391115649732, "gen_logits_max": 5.637876510620117, "gen_logits_mean": -13.267630577087402, "gen_logits_min": -24.29718780517578, "gen_logits_std": 2.661548137664795, "gen_loss": 0.3292565941810608, "grad_norm": 0.38541121724997573, "learning_rate": 2.5731789473684212e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9960640221834183, "mean_gen_accuracy": 0.8711920231580734, "mean_token_accuracy": 0.904658854007721, "num_tokens": 31085055.0, "sample_num_tokens": 7739.25, "step": 4115, "total_num_tokens": 31116012.0, "z_loss": 0.0005842847749590874 }, { "copy_logits_max": -6.840821266174316, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.75, "epoch": 0.84064334950217, "gen_logits_max": 4.581242561340332, "gen_logits_mean": -15.821249008178711, "gen_logits_min": -27.079818725585938, "gen_logits_std": 2.751359701156616, "gen_loss": 0.3451039791107178, "grad_norm": 0.3979833747013915, "learning_rate": 2.5730526315789476e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9956896603107452, "mean_gen_accuracy": 0.8597692996263504, "mean_token_accuracy": 0.8900435119867325, "num_tokens": 31341258.0, "sample_num_tokens": 8276.0, "step": 4116, "total_num_tokens": 31374362.0, "z_loss": 0.00068213592749089 }, { "copy_logits_max": -5.7291693687438965, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.0625, "epoch": 0.8408475874393668, "gen_logits_max": 4.065213680267334, "gen_logits_mean": -15.255228042602539, "gen_logits_min": -26.502717971801758, "gen_logits_std": 2.727041006088257, "gen_loss": 0.3366440534591675, "grad_norm": 0.37349350516423335, "learning_rate": 2.5729263157894737e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9952453970909119, "mean_gen_accuracy": 0.8656226396560669, "mean_token_accuracy": 0.8999939113855362, "num_tokens": 31623003.0, "sample_num_tokens": 7763.75, "step": 4117, "total_num_tokens": 31654058.0, "z_loss": 0.0006048502982594073 }, { "copy_logits_max": -4.914416313171387, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.5625, "epoch": 0.8410518253765636, "gen_logits_max": 5.666320323944092, "gen_logits_mean": -12.790163040161133, "gen_logits_min": -24.705284118652344, "gen_logits_std": 2.7598392963409424, "gen_loss": 0.3577985167503357, "grad_norm": 0.4165197736430965, "learning_rate": 2.5728e-05, "loss": 0.3245, "mean_copy_accuracy": 0.9943396896123886, "mean_gen_accuracy": 0.8618778884410858, "mean_token_accuracy": 0.8925866782665253, "num_tokens": 31882173.0, "sample_num_tokens": 10430.75, "step": 4118, "total_num_tokens": 31923896.0, "z_loss": 0.0007261153077706695 }, { "copy_logits_max": -6.700666427612305, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.75, "epoch": 0.8412560633137606, "gen_logits_max": 6.217551231384277, "gen_logits_mean": -13.234230041503906, "gen_logits_min": -25.49057388305664, "gen_logits_std": 2.7917914390563965, "gen_loss": 0.3005641996860504, "grad_norm": 0.3867779772348626, "learning_rate": 2.5726736842105263e-05, "loss": 0.3014, "mean_copy_accuracy": 0.995476707816124, "mean_gen_accuracy": 0.8747107237577438, "mean_token_accuracy": 0.9003323465585709, "num_tokens": 32138218.0, "sample_num_tokens": 6614.5, "step": 4119, "total_num_tokens": 32164676.0, "z_loss": 0.0006276504136621952 }, { "copy_logits_max": -5.411727428436279, "copy_logits_min": -750000064.0, "copy_num_tokens": 393.5625, "epoch": 0.8414603012509574, "gen_logits_max": 4.865238189697266, "gen_logits_mean": -14.885751724243164, "gen_logits_min": -26.43671417236328, "gen_logits_std": 2.7498934268951416, "gen_loss": 0.3454171419143677, "grad_norm": 0.39627679872971733, "learning_rate": 2.5725473684210527e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9952438324689865, "mean_gen_accuracy": 0.8666441291570663, "mean_token_accuracy": 0.894081175327301, "num_tokens": 32385510.0, "sample_num_tokens": 8007.5, "step": 4120, "total_num_tokens": 32417540.0, "z_loss": 0.0006678919889964163 }, { "copy_logits_max": -5.129533767700195, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.875, "epoch": 0.8416645391881542, "gen_logits_max": 5.931112766265869, "gen_logits_mean": -13.405506134033203, "gen_logits_min": -25.109222412109375, "gen_logits_std": 2.7678310871124268, "gen_loss": 0.3472767174243927, "grad_norm": 0.413637843065731, "learning_rate": 2.5724210526315788e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9951686263084412, "mean_gen_accuracy": 0.8589964061975479, "mean_token_accuracy": 0.8952227532863617, "num_tokens": 32653678.0, "sample_num_tokens": 7756.5, "step": 4121, "total_num_tokens": 32684704.0, "z_loss": 0.0006715698400512338 }, { "copy_logits_max": -1.5456748008728027, "copy_logits_min": -687500032.0, "copy_num_tokens": 703.5625, "epoch": 0.841868777125351, "gen_logits_max": 3.1835405826568604, "gen_logits_mean": -15.357281684875488, "gen_logits_min": -27.02105712890625, "gen_logits_std": 2.774782180786133, "gen_loss": 0.2476440668106079, "grad_norm": 0.3787700203735449, "learning_rate": 2.5722947368421052e-05, "loss": 0.3098, "mean_copy_accuracy": 0.996136024594307, "mean_gen_accuracy": 0.8594957143068314, "mean_token_accuracy": 0.8967933356761932, "num_tokens": 32917862.0, "sample_num_tokens": 9199.5, "step": 4122, "total_num_tokens": 32954660.0, "z_loss": 0.0006285838317126036 }, { "copy_logits_max": -3.443272590637207, "copy_logits_min": -625000064.0, "copy_num_tokens": 498.0625, "epoch": 0.8420730150625478, "gen_logits_max": 5.377443313598633, "gen_logits_mean": -13.397329330444336, "gen_logits_min": -25.434932708740234, "gen_logits_std": 2.722806453704834, "gen_loss": 0.28214454650878906, "grad_norm": 0.38634337729091683, "learning_rate": 2.5721684210526316e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9961783438920975, "mean_gen_accuracy": 0.8680587857961655, "mean_token_accuracy": 0.9035209864377975, "num_tokens": 33200697.0, "sample_num_tokens": 8823.75, "step": 4123, "total_num_tokens": 33235992.0, "z_loss": 0.0006422463338822126 }, { "copy_logits_max": -3.8341970443725586, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.625, "epoch": 0.8422772529997447, "gen_logits_max": 5.388997554779053, "gen_logits_mean": -12.909146308898926, "gen_logits_min": -24.1623592376709, "gen_logits_std": 2.6805083751678467, "gen_loss": 0.3098023533821106, "grad_norm": 0.4210079443034371, "learning_rate": 2.572042105263158e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9950888305902481, "mean_gen_accuracy": 0.8651695251464844, "mean_token_accuracy": 0.8970575630664825, "num_tokens": 33467952.0, "sample_num_tokens": 7912.5, "step": 4124, "total_num_tokens": 33499602.0, "z_loss": 0.000763939693570137 }, { "copy_logits_max": -7.903873920440674, "copy_logits_min": -750000064.0, "copy_num_tokens": 274.875, "epoch": 0.8424814909369416, "gen_logits_max": 6.073901176452637, "gen_logits_mean": -14.352746963500977, "gen_logits_min": -25.931577682495117, "gen_logits_std": 2.7647433280944824, "gen_loss": 0.3110784590244293, "grad_norm": 0.3782515524020298, "learning_rate": 2.5719157894736842e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9945941716432571, "mean_gen_accuracy": 0.8780550807714462, "mean_token_accuracy": 0.9041921198368073, "num_tokens": 33748951.0, "sample_num_tokens": 7641.25, "step": 4125, "total_num_tokens": 33779516.0, "z_loss": 0.0006459594005718827 }, { "copy_logits_max": -2.8959450721740723, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.5625, "epoch": 0.8426857288741384, "gen_logits_max": 5.1958208084106445, "gen_logits_mean": -13.63405990600586, "gen_logits_min": -25.469202041625977, "gen_logits_std": 2.7874374389648438, "gen_loss": 0.2903085947036743, "grad_norm": 0.392625602874994, "learning_rate": 2.5717894736842106e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9944958388805389, "mean_gen_accuracy": 0.8650594800710678, "mean_token_accuracy": 0.8963903337717056, "num_tokens": 34027331.0, "sample_num_tokens": 8661.25, "step": 4126, "total_num_tokens": 34061976.0, "z_loss": 0.0006636205944232643 }, { "copy_logits_max": -3.411684989929199, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.5, "epoch": 0.8428899668113352, "gen_logits_max": 4.934460639953613, "gen_logits_mean": -14.237252235412598, "gen_logits_min": -26.125213623046875, "gen_logits_std": 2.797969102859497, "gen_loss": 0.3058246374130249, "grad_norm": 0.44776880584832174, "learning_rate": 2.571663157894737e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9962199926376343, "mean_gen_accuracy": 0.8600657731294632, "mean_token_accuracy": 0.8983431309461594, "num_tokens": 34288339.0, "sample_num_tokens": 8927.25, "step": 4127, "total_num_tokens": 34324048.0, "z_loss": 0.0006888886564411223 }, { "copy_logits_max": -4.5095109939575195, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.375, "epoch": 0.843094204748532, "gen_logits_max": 4.400157928466797, "gen_logits_mean": -15.044918060302734, "gen_logits_min": -27.010581970214844, "gen_logits_std": 2.7751035690307617, "gen_loss": 0.28205662965774536, "grad_norm": 0.3950841006104225, "learning_rate": 2.571536842105263e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9960709661245346, "mean_gen_accuracy": 0.864142045378685, "mean_token_accuracy": 0.8970830589532852, "num_tokens": 34593953.0, "sample_num_tokens": 9186.25, "step": 4128, "total_num_tokens": 34630698.0, "z_loss": 0.0006018917192704976 }, { "copy_logits_max": -4.075996398925781, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.1875, "epoch": 0.8432984426857288, "gen_logits_max": 5.444840908050537, "gen_logits_mean": -14.42497444152832, "gen_logits_min": -25.643829345703125, "gen_logits_std": 2.7120604515075684, "gen_loss": 0.3192688822746277, "grad_norm": 0.4503786259347571, "learning_rate": 2.5714105263157896e-05, "loss": 0.3314, "mean_copy_accuracy": 0.992809846997261, "mean_gen_accuracy": 0.8621933311223984, "mean_token_accuracy": 0.8903688937425613, "num_tokens": 34856132.0, "sample_num_tokens": 8722.0, "step": 4129, "total_num_tokens": 34891020.0, "z_loss": 0.0006736229406669736 }, { "copy_logits_max": -3.856739044189453, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.6875, "epoch": 0.8435026806229257, "gen_logits_max": 5.470083713531494, "gen_logits_mean": -14.114703178405762, "gen_logits_min": -25.462051391601562, "gen_logits_std": 2.7134439945220947, "gen_loss": 0.3320823311805725, "grad_norm": 0.4495378325981112, "learning_rate": 2.5712842105263156e-05, "loss": 0.3389, "mean_copy_accuracy": 0.9949430525302887, "mean_gen_accuracy": 0.8571827709674835, "mean_token_accuracy": 0.8893323093652725, "num_tokens": 35124667.0, "sample_num_tokens": 6834.25, "step": 4130, "total_num_tokens": 35152004.0, "z_loss": 0.0006653806776739657 }, { "copy_logits_max": -5.869357109069824, "copy_logits_min": -750000000.0, "copy_num_tokens": 247.5625, "epoch": 0.8437069185601226, "gen_logits_max": 4.706732273101807, "gen_logits_mean": -15.878540992736816, "gen_logits_min": -27.305355072021484, "gen_logits_std": 2.7636423110961914, "gen_loss": 0.31230825185775757, "grad_norm": 0.4088544229719956, "learning_rate": 2.5711578947368424e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9958451241254807, "mean_gen_accuracy": 0.8665475100278854, "mean_token_accuracy": 0.8987013548612595, "num_tokens": 35393279.0, "sample_num_tokens": 7149.25, "step": 4131, "total_num_tokens": 35421876.0, "z_loss": 0.0005969314370304346 }, { "copy_logits_max": -4.518425941467285, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.1875, "epoch": 0.8439111564973194, "gen_logits_max": 5.309966087341309, "gen_logits_mean": -14.061060905456543, "gen_logits_min": -25.696393966674805, "gen_logits_std": 2.7844998836517334, "gen_loss": 0.35878250002861023, "grad_norm": 0.41594251941809274, "learning_rate": 2.5710315789473685e-05, "loss": 0.3268, "mean_copy_accuracy": 0.9947910755872726, "mean_gen_accuracy": 0.8640412986278534, "mean_token_accuracy": 0.8910604119300842, "num_tokens": 35648869.0, "sample_num_tokens": 7410.75, "step": 4132, "total_num_tokens": 35678512.0, "z_loss": 0.0007157115614973009 }, { "copy_logits_max": -2.5354151725769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 633.375, "epoch": 0.8441153944345162, "gen_logits_max": 3.951925039291382, "gen_logits_mean": -14.65484619140625, "gen_logits_min": -26.57568359375, "gen_logits_std": 2.82053804397583, "gen_loss": 0.2551423907279968, "grad_norm": 0.4133632322584803, "learning_rate": 2.570905263157895e-05, "loss": 0.298, "mean_copy_accuracy": 0.9937023520469666, "mean_gen_accuracy": 0.8686378300189972, "mean_token_accuracy": 0.9005754888057709, "num_tokens": 35915270.0, "sample_num_tokens": 9269.0, "step": 4133, "total_num_tokens": 35952346.0, "z_loss": 0.0005497246747836471 }, { "copy_logits_max": 1.2131545543670654, "copy_logits_min": -750000064.0, "copy_num_tokens": 558.875, "epoch": 0.844319632371713, "gen_logits_max": 4.0526838302612305, "gen_logits_mean": -14.55013370513916, "gen_logits_min": -26.962779998779297, "gen_logits_std": 2.7734012603759766, "gen_loss": 0.2855515778064728, "grad_norm": 0.42021746952868394, "learning_rate": 2.570778947368421e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9949259757995605, "mean_gen_accuracy": 0.8631703704595566, "mean_token_accuracy": 0.9022724032402039, "num_tokens": 36196282.0, "sample_num_tokens": 8566.0, "step": 4134, "total_num_tokens": 36230546.0, "z_loss": 0.0007029077969491482 }, { "copy_logits_max": -1.4858267307281494, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.875, "epoch": 0.8445238703089099, "gen_logits_max": 5.186455726623535, "gen_logits_mean": -13.141300201416016, "gen_logits_min": -25.07498550415039, "gen_logits_std": 2.736450672149658, "gen_loss": 0.3234761655330658, "grad_norm": 0.4306487088141676, "learning_rate": 2.5706526315789475e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9951757490634918, "mean_gen_accuracy": 0.8661602586507797, "mean_token_accuracy": 0.8951159715652466, "num_tokens": 36445063.0, "sample_num_tokens": 8645.75, "step": 4135, "total_num_tokens": 36479646.0, "z_loss": 0.0007161711109802127 }, { "copy_logits_max": 2.2970924377441406, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.4375, "epoch": 0.8447281082461067, "gen_logits_max": 3.988920211791992, "gen_logits_mean": -13.894554138183594, "gen_logits_min": -25.83138084411621, "gen_logits_std": 2.744213581085205, "gen_loss": 0.31112760305404663, "grad_norm": 0.450933105174337, "learning_rate": 2.5705263157894736e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9942218065261841, "mean_gen_accuracy": 0.8541489392518997, "mean_token_accuracy": 0.8913398534059525, "num_tokens": 36714851.0, "sample_num_tokens": 9060.75, "step": 4136, "total_num_tokens": 36751094.0, "z_loss": 0.0007996297208592296 }, { "copy_logits_max": -3.8553833961486816, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.5, "epoch": 0.8449323461833036, "gen_logits_max": 4.112771987915039, "gen_logits_mean": -16.257217407226562, "gen_logits_min": -27.23749542236328, "gen_logits_std": 2.7329399585723877, "gen_loss": 0.3088640570640564, "grad_norm": 0.43707277813357964, "learning_rate": 2.5704e-05, "loss": 0.3185, "mean_copy_accuracy": 0.9941716641187668, "mean_gen_accuracy": 0.8654399812221527, "mean_token_accuracy": 0.8944554626941681, "num_tokens": 36966341.0, "sample_num_tokens": 8840.75, "step": 4137, "total_num_tokens": 37001704.0, "z_loss": 0.0005854809423908591 }, { "copy_logits_max": -3.570720911026001, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.5, "epoch": 0.8451365841205004, "gen_logits_max": 4.56169319152832, "gen_logits_mean": -14.795297622680664, "gen_logits_min": -25.91666603088379, "gen_logits_std": 2.6944706439971924, "gen_loss": 0.322868287563324, "grad_norm": 0.4282752793167888, "learning_rate": 2.570273684210526e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9952250570058823, "mean_gen_accuracy": 0.867519274353981, "mean_token_accuracy": 0.8990519493818283, "num_tokens": 37244956.0, "sample_num_tokens": 8421.0, "step": 4138, "total_num_tokens": 37278640.0, "z_loss": 0.0006514341803267598 }, { "copy_logits_max": -4.076216220855713, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.4375, "epoch": 0.8453408220576972, "gen_logits_max": 4.5865607261657715, "gen_logits_mean": -15.89031982421875, "gen_logits_min": -27.098289489746094, "gen_logits_std": 2.7520318031311035, "gen_loss": 0.32199424505233765, "grad_norm": 0.4068228662402914, "learning_rate": 2.570147368421053e-05, "loss": 0.3136, "mean_copy_accuracy": 0.9935007840394974, "mean_gen_accuracy": 0.8706892430782318, "mean_token_accuracy": 0.897512897849083, "num_tokens": 37517181.0, "sample_num_tokens": 7937.25, "step": 4139, "total_num_tokens": 37548930.0, "z_loss": 0.0006403615698218346 }, { "copy_logits_max": -1.306505560874939, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.75, "epoch": 0.845545059994894, "gen_logits_max": 5.4067702293396, "gen_logits_mean": -12.989109992980957, "gen_logits_min": -24.526273727416992, "gen_logits_std": 2.6618781089782715, "gen_loss": 0.29555439949035645, "grad_norm": 0.4193410070130582, "learning_rate": 2.5700210526315793e-05, "loss": 0.3127, "mean_copy_accuracy": 0.9954477250576019, "mean_gen_accuracy": 0.8612340986728668, "mean_token_accuracy": 0.8939076215028763, "num_tokens": 37775966.0, "sample_num_tokens": 7311.0, "step": 4140, "total_num_tokens": 37805210.0, "z_loss": 0.0005929064354859293 }, { "copy_logits_max": -2.016000986099243, "copy_logits_min": -750000000.0, "copy_num_tokens": 735.0625, "epoch": 0.8457492979320909, "gen_logits_max": 5.474562644958496, "gen_logits_mean": -12.512908935546875, "gen_logits_min": -24.461896896362305, "gen_logits_std": 2.7555036544799805, "gen_loss": 0.250177264213562, "grad_norm": 0.393563756606061, "learning_rate": 2.5698947368421054e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9961588978767395, "mean_gen_accuracy": 0.8676862418651581, "mean_token_accuracy": 0.9088920801877975, "num_tokens": 38083061.0, "sample_num_tokens": 9846.25, "step": 4141, "total_num_tokens": 38122446.0, "z_loss": 0.0005586352199316025 }, { "copy_logits_max": -4.507203578948975, "copy_logits_min": -625000064.0, "copy_num_tokens": 466.6875, "epoch": 0.8459535358692877, "gen_logits_max": 3.8658316135406494, "gen_logits_mean": -16.200042724609375, "gen_logits_min": -27.738632202148438, "gen_logits_std": 2.769380569458008, "gen_loss": 0.31260162591934204, "grad_norm": 0.4378646645957734, "learning_rate": 2.5697684210526318e-05, "loss": 0.3075, "mean_copy_accuracy": 0.993201270699501, "mean_gen_accuracy": 0.8683996647596359, "mean_token_accuracy": 0.8966133147478104, "num_tokens": 38358495.0, "sample_num_tokens": 7956.75, "step": 4142, "total_num_tokens": 38390322.0, "z_loss": 0.0006459635915234685 }, { "copy_logits_max": -3.8923420906066895, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.8125, "epoch": 0.8461577738064846, "gen_logits_max": 4.500153064727783, "gen_logits_mean": -14.14084243774414, "gen_logits_min": -25.723987579345703, "gen_logits_std": 2.714296817779541, "gen_loss": 0.2840843200683594, "grad_norm": 0.39892517170268704, "learning_rate": 2.569642105263158e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9956028014421463, "mean_gen_accuracy": 0.8668574839830399, "mean_token_accuracy": 0.900405079126358, "num_tokens": 38623690.0, "sample_num_tokens": 9800.5, "step": 4143, "total_num_tokens": 38662892.0, "z_loss": 0.0006503101321868598 }, { "copy_logits_max": -4.240481853485107, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 0.8463620117436814, "gen_logits_max": 6.1536712646484375, "gen_logits_mean": -12.76984691619873, "gen_logits_min": -24.399219512939453, "gen_logits_std": 2.707447052001953, "gen_loss": 0.312702476978302, "grad_norm": 0.44222399493188996, "learning_rate": 2.5695157894736843e-05, "loss": 0.318, "mean_copy_accuracy": 0.995145320892334, "mean_gen_accuracy": 0.8655484616756439, "mean_token_accuracy": 0.893784373998642, "num_tokens": 38878332.0, "sample_num_tokens": 7952.0, "step": 4144, "total_num_tokens": 38910140.0, "z_loss": 0.0006298056687228382 }, { "copy_logits_max": -4.93382453918457, "copy_logits_min": -687500032.0, "copy_num_tokens": 296.0, "epoch": 0.8465662496808782, "gen_logits_max": 5.25163459777832, "gen_logits_mean": -13.341421127319336, "gen_logits_min": -24.174278259277344, "gen_logits_std": 2.649989128112793, "gen_loss": 0.2962024509906769, "grad_norm": 0.4274257537927308, "learning_rate": 2.5693894736842104e-05, "loss": 0.307, "mean_copy_accuracy": 0.993840292096138, "mean_gen_accuracy": 0.8702038526535034, "mean_token_accuracy": 0.8976435661315918, "num_tokens": 39142110.0, "sample_num_tokens": 6940.5, "step": 4145, "total_num_tokens": 39169872.0, "z_loss": 0.0005785248940810561 }, { "copy_logits_max": -2.034759521484375, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.5, "epoch": 0.846770487618075, "gen_logits_max": 4.7587409019470215, "gen_logits_mean": -14.689716339111328, "gen_logits_min": -26.295597076416016, "gen_logits_std": 2.7461230754852295, "gen_loss": 0.36559462547302246, "grad_norm": 0.4155166911269846, "learning_rate": 2.569263157894737e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9948283433914185, "mean_gen_accuracy": 0.8630449026823044, "mean_token_accuracy": 0.8970263600349426, "num_tokens": 39414968.0, "sample_num_tokens": 7456.0, "step": 4146, "total_num_tokens": 39444792.0, "z_loss": 0.0007109347498044372 }, { "copy_logits_max": -4.212652206420898, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.1875, "epoch": 0.8469747255552719, "gen_logits_max": 5.29545783996582, "gen_logits_mean": -14.546314239501953, "gen_logits_min": -26.024057388305664, "gen_logits_std": 2.7137913703918457, "gen_loss": 0.3370351195335388, "grad_norm": 0.4404519158622616, "learning_rate": 2.5691368421052633e-05, "loss": 0.3397, "mean_copy_accuracy": 0.9928701221942902, "mean_gen_accuracy": 0.8586201965808868, "mean_token_accuracy": 0.8879711180925369, "num_tokens": 39668294.0, "sample_num_tokens": 7795.5, "step": 4147, "total_num_tokens": 39699476.0, "z_loss": 0.0007116005290299654 }, { "copy_logits_max": -5.2923102378845215, "copy_logits_min": -750000128.0, "copy_num_tokens": 312.4375, "epoch": 0.8471789634924687, "gen_logits_max": 5.174544334411621, "gen_logits_mean": -14.937507629394531, "gen_logits_min": -25.763477325439453, "gen_logits_std": 2.6705870628356934, "gen_loss": 0.35722532868385315, "grad_norm": 0.39180918270529047, "learning_rate": 2.5690105263157897e-05, "loss": 0.3276, "mean_copy_accuracy": 0.995111882686615, "mean_gen_accuracy": 0.8629613667726517, "mean_token_accuracy": 0.892033651471138, "num_tokens": 39931321.0, "sample_num_tokens": 7817.75, "step": 4148, "total_num_tokens": 39962592.0, "z_loss": 0.0007223073625937104 }, { "copy_logits_max": -4.269376754760742, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.375, "epoch": 0.8473832014296656, "gen_logits_max": 4.898008346557617, "gen_logits_mean": -15.712747573852539, "gen_logits_min": -26.94900131225586, "gen_logits_std": 2.7532660961151123, "gen_loss": 0.33961647748947144, "grad_norm": 0.4113813547265526, "learning_rate": 2.5688842105263158e-05, "loss": 0.3235, "mean_copy_accuracy": 0.9948218464851379, "mean_gen_accuracy": 0.8641743659973145, "mean_token_accuracy": 0.8930111974477768, "num_tokens": 40193924.0, "sample_num_tokens": 7956.5, "step": 4149, "total_num_tokens": 40225750.0, "z_loss": 0.0007491641445085406 }, { "copy_logits_max": -4.1523661613464355, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.0, "epoch": 0.8475874393668624, "gen_logits_max": 5.534618377685547, "gen_logits_mean": -14.09560489654541, "gen_logits_min": -25.42540740966797, "gen_logits_std": 2.753115177154541, "gen_loss": 0.31684762239456177, "grad_norm": 0.42913965740061094, "learning_rate": 2.5687578947368422e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9930729866027832, "mean_gen_accuracy": 0.8648837357759476, "mean_token_accuracy": 0.8968715667724609, "num_tokens": 40444175.0, "sample_num_tokens": 8229.75, "step": 4150, "total_num_tokens": 40477094.0, "z_loss": 0.0006437689298763871 }, { "copy_logits_max": -5.58683443069458, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.6875, "epoch": 0.8477916773040592, "gen_logits_max": 6.36944580078125, "gen_logits_mean": -14.237312316894531, "gen_logits_min": -25.562498092651367, "gen_logits_std": 2.736478805541992, "gen_loss": 0.2883257269859314, "grad_norm": 0.40099217716058416, "learning_rate": 2.5686315789473683e-05, "loss": 0.299, "mean_copy_accuracy": 0.9936240017414093, "mean_gen_accuracy": 0.8732022941112518, "mean_token_accuracy": 0.9016727805137634, "num_tokens": 40699877.0, "sample_num_tokens": 7459.75, "step": 4151, "total_num_tokens": 40729716.0, "z_loss": 0.0006087750662118196 }, { "copy_logits_max": -5.871224880218506, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.25, "epoch": 0.8479959152412561, "gen_logits_max": 4.2247772216796875, "gen_logits_mean": -15.890691757202148, "gen_logits_min": -27.3831787109375, "gen_logits_std": 2.8076834678649902, "gen_loss": 0.27323782444000244, "grad_norm": 0.39636394520044205, "learning_rate": 2.5685052631578948e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9948340356349945, "mean_gen_accuracy": 0.8688160181045532, "mean_token_accuracy": 0.8974929004907608, "num_tokens": 40959058.0, "sample_num_tokens": 9333.0, "step": 4152, "total_num_tokens": 40996390.0, "z_loss": 0.0005963494768366218 }, { "copy_logits_max": -5.959579944610596, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.6875, "epoch": 0.8482001531784529, "gen_logits_max": 5.245610237121582, "gen_logits_mean": -14.494319915771484, "gen_logits_min": -26.32286262512207, "gen_logits_std": 2.7988038063049316, "gen_loss": 0.3029549717903137, "grad_norm": 0.3798011539646595, "learning_rate": 2.5683789473684212e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9944620430469513, "mean_gen_accuracy": 0.8645294457674026, "mean_token_accuracy": 0.8982936590909958, "num_tokens": 41231501.0, "sample_num_tokens": 8192.25, "step": 4153, "total_num_tokens": 41264270.0, "z_loss": 0.0006525702774524689 }, { "copy_logits_max": -5.131258964538574, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.25, "epoch": 0.8484043911156497, "gen_logits_max": 4.034728050231934, "gen_logits_mean": -15.191518783569336, "gen_logits_min": -26.952499389648438, "gen_logits_std": 2.818998336791992, "gen_loss": 0.276608943939209, "grad_norm": 0.36430542709843294, "learning_rate": 2.5682526315789473e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9964570552110672, "mean_gen_accuracy": 0.8670430481433868, "mean_token_accuracy": 0.9027918875217438, "num_tokens": 41512676.0, "sample_num_tokens": 9592.5, "step": 4154, "total_num_tokens": 41551046.0, "z_loss": 0.0006187280523590744 }, { "copy_logits_max": -3.0694336891174316, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.0625, "epoch": 0.8486086290528466, "gen_logits_max": 4.393409729003906, "gen_logits_mean": -14.704665184020996, "gen_logits_min": -26.62778663635254, "gen_logits_std": 2.8177363872528076, "gen_loss": 0.31181058287620544, "grad_norm": 0.42152110763536077, "learning_rate": 2.5681263157894737e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9949173927307129, "mean_gen_accuracy": 0.8564661741256714, "mean_token_accuracy": 0.8931482434272766, "num_tokens": 41794362.0, "sample_num_tokens": 8412.5, "step": 4155, "total_num_tokens": 41828012.0, "z_loss": 0.0006407109904102981 }, { "copy_logits_max": -3.2449562549591064, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.125, "epoch": 0.8488128669900434, "gen_logits_max": 5.842591762542725, "gen_logits_mean": -13.586978912353516, "gen_logits_min": -25.276268005371094, "gen_logits_std": 2.7380475997924805, "gen_loss": 0.34714245796203613, "grad_norm": 0.5037213524236034, "learning_rate": 2.568e-05, "loss": 0.3092, "mean_copy_accuracy": 0.99440598487854, "mean_gen_accuracy": 0.8675021827220917, "mean_token_accuracy": 0.8969095945358276, "num_tokens": 42057521.0, "sample_num_tokens": 9120.75, "step": 4156, "total_num_tokens": 42094004.0, "z_loss": 0.0006801512790843844 }, { "copy_logits_max": -6.204301357269287, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.375, "epoch": 0.8490171049272403, "gen_logits_max": 5.44786262512207, "gen_logits_mean": -14.988001823425293, "gen_logits_min": -27.210187911987305, "gen_logits_std": 2.8298325538635254, "gen_loss": 0.2957227826118469, "grad_norm": 0.3827679243690535, "learning_rate": 2.5678736842105266e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9958106279373169, "mean_gen_accuracy": 0.8714500516653061, "mean_token_accuracy": 0.9015790075063705, "num_tokens": 42329289.0, "sample_num_tokens": 8381.25, "step": 4157, "total_num_tokens": 42362814.0, "z_loss": 0.0005761650972999632 }, { "copy_logits_max": -4.12421178817749, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.625, "epoch": 0.8492213428644371, "gen_logits_max": 6.656193733215332, "gen_logits_mean": -14.337100982666016, "gen_logits_min": -26.484386444091797, "gen_logits_std": 2.803502082824707, "gen_loss": 0.3159828186035156, "grad_norm": 0.39287234146475913, "learning_rate": 2.5677473684210527e-05, "loss": 0.315, "mean_copy_accuracy": 0.9958244413137436, "mean_gen_accuracy": 0.8618720769882202, "mean_token_accuracy": 0.8967865407466888, "num_tokens": 42618835.0, "sample_num_tokens": 7810.75, "step": 4158, "total_num_tokens": 42650078.0, "z_loss": 0.0007120365626178682 }, { "copy_logits_max": -2.194216251373291, "copy_logits_min": -750000000.0, "copy_num_tokens": 702.375, "epoch": 0.8494255808016339, "gen_logits_max": 6.255265235900879, "gen_logits_mean": -13.429014205932617, "gen_logits_min": -25.852649688720703, "gen_logits_std": 2.819815158843994, "gen_loss": 0.277283251285553, "grad_norm": 0.4022105640878616, "learning_rate": 2.567621052631579e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9951534867286682, "mean_gen_accuracy": 0.8711655139923096, "mean_token_accuracy": 0.9024372696876526, "num_tokens": 42891771.0, "sample_num_tokens": 9915.25, "step": 4159, "total_num_tokens": 42931432.0, "z_loss": 0.0006890658987686038 }, { "copy_logits_max": -1.697590947151184, "copy_logits_min": -687500032.0, "copy_num_tokens": 645.5625, "epoch": 0.8496298187388307, "gen_logits_max": 4.066815376281738, "gen_logits_mean": -15.416455268859863, "gen_logits_min": -27.411205291748047, "gen_logits_std": 2.8189804553985596, "gen_loss": 0.28741002082824707, "grad_norm": 0.4133810038474701, "learning_rate": 2.5674947368421052e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9953743666410446, "mean_gen_accuracy": 0.8726349323987961, "mean_token_accuracy": 0.9077965766191483, "num_tokens": 43163007.0, "sample_num_tokens": 9397.25, "step": 4160, "total_num_tokens": 43200596.0, "z_loss": 0.0006734051858074963 }, { "copy_logits_max": -4.673860549926758, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.25, "epoch": 0.8498340566760276, "gen_logits_max": 5.746791839599609, "gen_logits_mean": -14.352563858032227, "gen_logits_min": -26.354312896728516, "gen_logits_std": 2.828235149383545, "gen_loss": 0.3356841504573822, "grad_norm": 0.43199639776686594, "learning_rate": 2.5673684210526316e-05, "loss": 0.3383, "mean_copy_accuracy": 0.9933971166610718, "mean_gen_accuracy": 0.8601944297552109, "mean_token_accuracy": 0.8897293955087662, "num_tokens": 43431057.0, "sample_num_tokens": 6993.75, "step": 4161, "total_num_tokens": 43459032.0, "z_loss": 0.0007646074518561363 }, { "copy_logits_max": -3.7841272354125977, "copy_logits_min": -750000000.0, "copy_num_tokens": 695.1875, "epoch": 0.8500382946132244, "gen_logits_max": 4.298247814178467, "gen_logits_mean": -15.19464111328125, "gen_logits_min": -27.141422271728516, "gen_logits_std": 2.7980096340179443, "gen_loss": 0.2842142581939697, "grad_norm": 0.3828936866332311, "learning_rate": 2.5672421052631577e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9951337575912476, "mean_gen_accuracy": 0.8743937611579895, "mean_token_accuracy": 0.9052005708217621, "num_tokens": 43686664.0, "sample_num_tokens": 10154.5, "step": 4162, "total_num_tokens": 43727282.0, "z_loss": 0.0006548819947056472 }, { "copy_logits_max": -3.859661340713501, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.75, "epoch": 0.8502425325504213, "gen_logits_max": 4.509788990020752, "gen_logits_mean": -15.82567310333252, "gen_logits_min": -27.397850036621094, "gen_logits_std": 2.7736754417419434, "gen_loss": 0.30959445238113403, "grad_norm": 0.38414992173953333, "learning_rate": 2.567115789473684e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9948592633008957, "mean_gen_accuracy": 0.8717426061630249, "mean_token_accuracy": 0.8997538387775421, "num_tokens": 43975808.0, "sample_num_tokens": 7487.5, "step": 4163, "total_num_tokens": 44005758.0, "z_loss": 0.0006398976547643542 }, { "copy_logits_max": -3.0799944400787354, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.8125, "epoch": 0.8504467704876181, "gen_logits_max": 5.031702518463135, "gen_logits_mean": -14.779647827148438, "gen_logits_min": -26.98735809326172, "gen_logits_std": 2.82550311088562, "gen_loss": 0.2991878390312195, "grad_norm": 0.3989780435322234, "learning_rate": 2.5669894736842106e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9953934401273727, "mean_gen_accuracy": 0.8686607480049133, "mean_token_accuracy": 0.9011552631855011, "num_tokens": 44254311.0, "sample_num_tokens": 8396.75, "step": 4164, "total_num_tokens": 44287898.0, "z_loss": 0.0006994319846853614 }, { "copy_logits_max": -3.8125011920928955, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.75, "epoch": 0.8506510084248149, "gen_logits_max": 4.99555778503418, "gen_logits_mean": -14.444047927856445, "gen_logits_min": -26.628889083862305, "gen_logits_std": 2.8343658447265625, "gen_loss": 0.274397075176239, "grad_norm": 0.3895324473840545, "learning_rate": 2.566863157894737e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9955416172742844, "mean_gen_accuracy": 0.8743386268615723, "mean_token_accuracy": 0.9041547179222107, "num_tokens": 44528358.0, "sample_num_tokens": 7559.0, "step": 4165, "total_num_tokens": 44558594.0, "z_loss": 0.0005663700867444277 }, { "copy_logits_max": -2.8559508323669434, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.375, "epoch": 0.8508552463620117, "gen_logits_max": 5.558794975280762, "gen_logits_mean": -13.79063606262207, "gen_logits_min": -25.751880645751953, "gen_logits_std": 2.7935261726379395, "gen_loss": 0.30048370361328125, "grad_norm": 0.40923316849623087, "learning_rate": 2.5667368421052634e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9962979555130005, "mean_gen_accuracy": 0.8614873737096786, "mean_token_accuracy": 0.8958015590906143, "num_tokens": 44792921.0, "sample_num_tokens": 9453.25, "step": 4166, "total_num_tokens": 44830734.0, "z_loss": 0.00060187594499439 }, { "copy_logits_max": -4.459003448486328, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.0, "epoch": 0.8510594842992085, "gen_logits_max": 4.238564968109131, "gen_logits_mean": -15.175471305847168, "gen_logits_min": -26.981287002563477, "gen_logits_std": 2.803927421569824, "gen_loss": 0.32417941093444824, "grad_norm": 0.37094710289638455, "learning_rate": 2.5666105263157895e-05, "loss": 0.306, "mean_copy_accuracy": 0.9956775605678558, "mean_gen_accuracy": 0.8678391128778458, "mean_token_accuracy": 0.8998866528272629, "num_tokens": 45089470.0, "sample_num_tokens": 8233.5, "step": 4167, "total_num_tokens": 45122404.0, "z_loss": 0.0006345752626657486 }, { "copy_logits_max": -1.5249600410461426, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.75, "epoch": 0.8512637222364055, "gen_logits_max": 5.210391044616699, "gen_logits_mean": -13.5001802444458, "gen_logits_min": -25.78069305419922, "gen_logits_std": 2.793987989425659, "gen_loss": 0.31904441118240356, "grad_norm": 0.40504402417739044, "learning_rate": 2.566484210526316e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9959830492734909, "mean_gen_accuracy": 0.8626230806112289, "mean_token_accuracy": 0.8989766091108322, "num_tokens": 45344767.0, "sample_num_tokens": 8165.25, "step": 4168, "total_num_tokens": 45377428.0, "z_loss": 0.0006824190495535731 }, { "copy_logits_max": -4.477962970733643, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.875, "epoch": 0.8514679601736023, "gen_logits_max": 5.128982067108154, "gen_logits_mean": -15.419078826904297, "gen_logits_min": -26.679855346679688, "gen_logits_std": 2.7779321670532227, "gen_loss": 0.35715898871421814, "grad_norm": 0.4140051192019404, "learning_rate": 2.566357894736842e-05, "loss": 0.3176, "mean_copy_accuracy": 0.9955995231866837, "mean_gen_accuracy": 0.8634001612663269, "mean_token_accuracy": 0.8933486491441727, "num_tokens": 45610571.0, "sample_num_tokens": 7577.75, "step": 4169, "total_num_tokens": 45640882.0, "z_loss": 0.0007058007176965475 }, { "copy_logits_max": -2.6566436290740967, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.25, "epoch": 0.8516721981107991, "gen_logits_max": 4.886848449707031, "gen_logits_mean": -14.591304779052734, "gen_logits_min": -26.473529815673828, "gen_logits_std": 2.806147813796997, "gen_loss": 0.3427918255329132, "grad_norm": 0.391299896261685, "learning_rate": 2.5662315789473685e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9959679394960403, "mean_gen_accuracy": 0.8677951544523239, "mean_token_accuracy": 0.8946844637393951, "num_tokens": 45860194.0, "sample_num_tokens": 8296.5, "step": 4170, "total_num_tokens": 45893380.0, "z_loss": 0.0007061810465529561 }, { "copy_logits_max": -3.173198699951172, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.0625, "epoch": 0.8518764360479959, "gen_logits_max": 4.51140022277832, "gen_logits_mean": -14.234491348266602, "gen_logits_min": -26.59273338317871, "gen_logits_std": 2.8237645626068115, "gen_loss": 0.2809486985206604, "grad_norm": 0.3849378200052109, "learning_rate": 2.5661052631578946e-05, "loss": 0.302, "mean_copy_accuracy": 0.995718389749527, "mean_gen_accuracy": 0.8665478080511093, "mean_token_accuracy": 0.8987135291099548, "num_tokens": 46133477.0, "sample_num_tokens": 8272.25, "step": 4171, "total_num_tokens": 46166566.0, "z_loss": 0.0005979068810120225 }, { "copy_logits_max": -3.794041633605957, "copy_logits_min": -750000064.0, "copy_num_tokens": 334.3125, "epoch": 0.8520806739851927, "gen_logits_max": 5.215171813964844, "gen_logits_mean": -14.422211647033691, "gen_logits_min": -25.90264129638672, "gen_logits_std": 2.764260768890381, "gen_loss": 0.30240511894226074, "grad_norm": 0.4181476111516524, "learning_rate": 2.5659789473684214e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9943917542695999, "mean_gen_accuracy": 0.8728936463594437, "mean_token_accuracy": 0.900021493434906, "num_tokens": 46405574.0, "sample_num_tokens": 7405.5, "step": 4172, "total_num_tokens": 46435196.0, "z_loss": 0.0006430186331272125 }, { "copy_logits_max": -4.163693428039551, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.625, "epoch": 0.8522849119223895, "gen_logits_max": 4.227259635925293, "gen_logits_mean": -16.14595603942871, "gen_logits_min": -28.02005386352539, "gen_logits_std": 2.8069028854370117, "gen_loss": 0.3020807206630707, "grad_norm": 0.3966283522662926, "learning_rate": 2.5658526315789474e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9944593757390976, "mean_gen_accuracy": 0.8701756149530411, "mean_token_accuracy": 0.900770977139473, "num_tokens": 46663237.0, "sample_num_tokens": 7524.25, "step": 4173, "total_num_tokens": 46693334.0, "z_loss": 0.0006224808166734874 }, { "copy_logits_max": -3.7523958683013916, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.6875, "epoch": 0.8524891498595865, "gen_logits_max": 5.2915472984313965, "gen_logits_mean": -12.929878234863281, "gen_logits_min": -25.66445541381836, "gen_logits_std": 2.83469820022583, "gen_loss": 0.34378668665885925, "grad_norm": 0.3809860956533828, "learning_rate": 2.565726315789474e-05, "loss": 0.2969, "mean_copy_accuracy": 0.9971218556165695, "mean_gen_accuracy": 0.8636989146471024, "mean_token_accuracy": 0.9013580083847046, "num_tokens": 46941145.0, "sample_num_tokens": 9497.25, "step": 4174, "total_num_tokens": 46979134.0, "z_loss": 0.000818040978629142 }, { "copy_logits_max": -2.9349265098571777, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.125, "epoch": 0.8526933877967833, "gen_logits_max": 4.284914016723633, "gen_logits_mean": -15.556350708007812, "gen_logits_min": -28.46961212158203, "gen_logits_std": 2.8592309951782227, "gen_loss": 0.3316447138786316, "grad_norm": 0.39094516622263453, "learning_rate": 2.5656e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9953110069036484, "mean_gen_accuracy": 0.8619948476552963, "mean_token_accuracy": 0.9000186175107956, "num_tokens": 47212926.0, "sample_num_tokens": 7178.0, "step": 4175, "total_num_tokens": 47241638.0, "z_loss": 0.0007097629131749272 }, { "copy_logits_max": -6.167540073394775, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.0625, "epoch": 0.8528976257339801, "gen_logits_max": 4.537285327911377, "gen_logits_mean": -15.37685775756836, "gen_logits_min": -28.273723602294922, "gen_logits_std": 2.8656296730041504, "gen_loss": 0.2972303032875061, "grad_norm": 0.37282137598852544, "learning_rate": 2.5654736842105264e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9954382032155991, "mean_gen_accuracy": 0.8693925738334656, "mean_token_accuracy": 0.8982734233140945, "num_tokens": 47485398.0, "sample_num_tokens": 9506.5, "step": 4176, "total_num_tokens": 47523424.0, "z_loss": 0.0006271653692238033 }, { "copy_logits_max": -7.603265762329102, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.0625, "epoch": 0.8531018636711769, "gen_logits_max": 5.147726535797119, "gen_logits_mean": -14.444950103759766, "gen_logits_min": -26.697101593017578, "gen_logits_std": 2.867039680480957, "gen_loss": 0.2625887393951416, "grad_norm": 0.41373860208657315, "learning_rate": 2.5653473684210525e-05, "loss": 0.2962, "mean_copy_accuracy": 0.994624063372612, "mean_gen_accuracy": 0.8726359009742737, "mean_token_accuracy": 0.9012726098299026, "num_tokens": 47748094.0, "sample_num_tokens": 9465.5, "step": 4177, "total_num_tokens": 47785956.0, "z_loss": 0.0005492179188877344 }, { "copy_logits_max": -6.685432434082031, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.9375, "epoch": 0.8533061016083737, "gen_logits_max": 6.475345611572266, "gen_logits_mean": -13.413305282592773, "gen_logits_min": -25.3939208984375, "gen_logits_std": 2.8067331314086914, "gen_loss": 0.3167318105697632, "grad_norm": 0.4525263247933069, "learning_rate": 2.565221052631579e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9935709983110428, "mean_gen_accuracy": 0.8627063184976578, "mean_token_accuracy": 0.8897691369056702, "num_tokens": 47988584.0, "sample_num_tokens": 6680.5, "step": 4178, "total_num_tokens": 48015306.0, "z_loss": 0.0006592944846488535 }, { "copy_logits_max": -6.042329788208008, "copy_logits_min": -750000000.0, "copy_num_tokens": 832.8125, "epoch": 0.8535103395455705, "gen_logits_max": 4.194622993469238, "gen_logits_mean": -14.723909378051758, "gen_logits_min": -26.991769790649414, "gen_logits_std": 2.8790531158447266, "gen_loss": 0.2541348934173584, "grad_norm": 0.42066975945649043, "learning_rate": 2.5650947368421054e-05, "loss": 0.321, "mean_copy_accuracy": 0.9947947859764099, "mean_gen_accuracy": 0.864788144826889, "mean_token_accuracy": 0.8940874934196472, "num_tokens": 48238840.0, "sample_num_tokens": 10496.0, "step": 4179, "total_num_tokens": 48280824.0, "z_loss": 0.0005229001399129629 }, { "copy_logits_max": -5.484431266784668, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.375, "epoch": 0.8537145774827675, "gen_logits_max": 4.418239593505859, "gen_logits_mean": -15.417448043823242, "gen_logits_min": -27.597171783447266, "gen_logits_std": 2.8399438858032227, "gen_loss": 0.2878187894821167, "grad_norm": 0.42470884615769305, "learning_rate": 2.5649684210526318e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9945948272943497, "mean_gen_accuracy": 0.8615866601467133, "mean_token_accuracy": 0.8974725604057312, "num_tokens": 48508971.0, "sample_num_tokens": 8823.25, "step": 4180, "total_num_tokens": 48544264.0, "z_loss": 0.0006258300272747874 }, { "copy_logits_max": -8.2284574508667, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.25, "epoch": 0.8539188154199643, "gen_logits_max": 5.322484016418457, "gen_logits_mean": -14.423355102539062, "gen_logits_min": -26.386215209960938, "gen_logits_std": 2.8305740356445312, "gen_loss": 0.30665844678878784, "grad_norm": 0.39903743766087313, "learning_rate": 2.5648421052631582e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9942739754915237, "mean_gen_accuracy": 0.8723350167274475, "mean_token_accuracy": 0.8994449526071548, "num_tokens": 48764177.0, "sample_num_tokens": 9112.75, "step": 4181, "total_num_tokens": 48800628.0, "z_loss": 0.0006673307507298887 }, { "copy_logits_max": -7.054672718048096, "copy_logits_min": -750000064.0, "copy_num_tokens": 456.25, "epoch": 0.8541230533571611, "gen_logits_max": 6.012142181396484, "gen_logits_mean": -13.57784652709961, "gen_logits_min": -26.11127281188965, "gen_logits_std": 2.8427584171295166, "gen_loss": 0.29503583908081055, "grad_norm": 0.4030397305409169, "learning_rate": 2.5647157894736843e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9951232969760895, "mean_gen_accuracy": 0.8681234270334244, "mean_token_accuracy": 0.9015970677137375, "num_tokens": 49026119.0, "sample_num_tokens": 7619.75, "step": 4182, "total_num_tokens": 49056598.0, "z_loss": 0.0006545870564877987 }, { "copy_logits_max": -4.857233047485352, "copy_logits_min": -750000000.0, "copy_num_tokens": 808.5625, "epoch": 0.8543272912943579, "gen_logits_max": 4.7321295738220215, "gen_logits_mean": -14.507583618164062, "gen_logits_min": -27.2034969329834, "gen_logits_std": 2.8765478134155273, "gen_loss": 0.28135859966278076, "grad_norm": 0.3898254941374105, "learning_rate": 2.5645894736842107e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9958641082048416, "mean_gen_accuracy": 0.8661443144083023, "mean_token_accuracy": 0.9033093750476837, "num_tokens": 49308798.0, "sample_num_tokens": 10042.0, "step": 4183, "total_num_tokens": 49348966.0, "z_loss": 0.0005914060166105628 }, { "copy_logits_max": -7.424205780029297, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.3125, "epoch": 0.8545315292315547, "gen_logits_max": 7.314997673034668, "gen_logits_mean": -12.324873924255371, "gen_logits_min": -24.51262855529785, "gen_logits_std": 2.792268753051758, "gen_loss": 0.24911858141422272, "grad_norm": 0.4280120042321584, "learning_rate": 2.5644631578947368e-05, "loss": 0.3146, "mean_copy_accuracy": 0.993378221988678, "mean_gen_accuracy": 0.862775444984436, "mean_token_accuracy": 0.8951352089643478, "num_tokens": 49602488.0, "sample_num_tokens": 8726.0, "step": 4184, "total_num_tokens": 49637392.0, "z_loss": 0.0005479631945490837 }, { "copy_logits_max": -5.96296501159668, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.0625, "epoch": 0.8547357671687515, "gen_logits_max": 4.279231071472168, "gen_logits_mean": -15.902223587036133, "gen_logits_min": -27.470529556274414, "gen_logits_std": 2.789780855178833, "gen_loss": 0.286281943321228, "grad_norm": 0.37497527170981326, "learning_rate": 2.5643368421052633e-05, "loss": 0.314, "mean_copy_accuracy": 0.9962820112705231, "mean_gen_accuracy": 0.8584450036287308, "mean_token_accuracy": 0.8954306393861771, "num_tokens": 49883644.0, "sample_num_tokens": 8741.5, "step": 4185, "total_num_tokens": 49918610.0, "z_loss": 0.0006251487648114562 }, { "copy_logits_max": -6.297921180725098, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.0625, "epoch": 0.8549400051059485, "gen_logits_max": 4.200831413269043, "gen_logits_mean": -16.01012420654297, "gen_logits_min": -27.844881057739258, "gen_logits_std": 2.7967758178710938, "gen_loss": 0.2950349748134613, "grad_norm": 0.38248363550899495, "learning_rate": 2.5642105263157894e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9960256963968277, "mean_gen_accuracy": 0.8624843209981918, "mean_token_accuracy": 0.8952082246541977, "num_tokens": 50160221.0, "sample_num_tokens": 8310.75, "step": 4186, "total_num_tokens": 50193464.0, "z_loss": 0.0006005497998557985 }, { "copy_logits_max": -6.389963150024414, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.1875, "epoch": 0.8551442430431453, "gen_logits_max": 5.299689292907715, "gen_logits_mean": -14.394713401794434, "gen_logits_min": -26.100387573242188, "gen_logits_std": 2.822132110595703, "gen_loss": 0.3145960569381714, "grad_norm": 0.37916138642250014, "learning_rate": 2.5640842105263158e-05, "loss": 0.3142, "mean_copy_accuracy": 0.9939993470907211, "mean_gen_accuracy": 0.8674644529819489, "mean_token_accuracy": 0.8940570503473282, "num_tokens": 50427286.0, "sample_num_tokens": 9190.0, "step": 4187, "total_num_tokens": 50464046.0, "z_loss": 0.0006511349347420037 }, { "copy_logits_max": -7.899146556854248, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.3125, "epoch": 0.8553484809803421, "gen_logits_max": 5.8810224533081055, "gen_logits_mean": -14.171253204345703, "gen_logits_min": -25.416784286499023, "gen_logits_std": 2.7863497734069824, "gen_loss": 0.3059171438217163, "grad_norm": 0.40459893200985153, "learning_rate": 2.5639578947368422e-05, "loss": 0.3289, "mean_copy_accuracy": 0.994912326335907, "mean_gen_accuracy": 0.8626656830310822, "mean_token_accuracy": 0.8899009227752686, "num_tokens": 50674711.0, "sample_num_tokens": 8003.75, "step": 4188, "total_num_tokens": 50706726.0, "z_loss": 0.0005893689813092351 }, { "copy_logits_max": -3.18145751953125, "copy_logits_min": -687500032.0, "copy_num_tokens": 723.5625, "epoch": 0.8555527189175389, "gen_logits_max": 3.8056583404541016, "gen_logits_mean": -16.454280853271484, "gen_logits_min": -28.591188430786133, "gen_logits_std": 2.8652830123901367, "gen_loss": 0.2954752445220947, "grad_norm": 0.4075109036208623, "learning_rate": 2.5638315789473686e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9962891489267349, "mean_gen_accuracy": 0.8669113218784332, "mean_token_accuracy": 0.9008269608020782, "num_tokens": 50934433.0, "sample_num_tokens": 9782.75, "step": 4189, "total_num_tokens": 50973564.0, "z_loss": 0.0006388171459548175 }, { "copy_logits_max": -6.422256946563721, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.4375, "epoch": 0.8557569568547357, "gen_logits_max": 5.067523002624512, "gen_logits_mean": -14.746952056884766, "gen_logits_min": -27.03874969482422, "gen_logits_std": 2.867435932159424, "gen_loss": 0.2687215507030487, "grad_norm": 0.37318247043970215, "learning_rate": 2.5637052631578947e-05, "loss": 0.2692, "mean_copy_accuracy": 0.996028408408165, "mean_gen_accuracy": 0.877803772687912, "mean_token_accuracy": 0.9094168990850449, "num_tokens": 51223320.0, "sample_num_tokens": 9155.5, "step": 4190, "total_num_tokens": 51259942.0, "z_loss": 0.0005651534302160144 }, { "copy_logits_max": -6.4160685539245605, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.75, "epoch": 0.8559611947919326, "gen_logits_max": 5.792148590087891, "gen_logits_mean": -13.649246215820312, "gen_logits_min": -25.358642578125, "gen_logits_std": 2.84277081489563, "gen_loss": 0.3343754708766937, "grad_norm": 0.4003655069353451, "learning_rate": 2.5635789473684212e-05, "loss": 0.3279, "mean_copy_accuracy": 0.9949255585670471, "mean_gen_accuracy": 0.8618784844875336, "mean_token_accuracy": 0.8932349979877472, "num_tokens": 51491306.0, "sample_num_tokens": 9697.5, "step": 4191, "total_num_tokens": 51530096.0, "z_loss": 0.000672111171297729 }, { "copy_logits_max": -5.603057384490967, "copy_logits_min": -750000000.0, "copy_num_tokens": 222.5625, "epoch": 0.8561654327291295, "gen_logits_max": 5.889903545379639, "gen_logits_mean": -14.06799030303955, "gen_logits_min": -25.901500701904297, "gen_logits_std": 2.8279342651367188, "gen_loss": 0.29672378301620483, "grad_norm": 0.40645012180830836, "learning_rate": 2.5634526315789473e-05, "loss": 0.3331, "mean_copy_accuracy": 0.9944330602884293, "mean_gen_accuracy": 0.8657293319702148, "mean_token_accuracy": 0.8880808353424072, "num_tokens": 51757319.0, "sample_num_tokens": 7118.25, "step": 4192, "total_num_tokens": 51785792.0, "z_loss": 0.0005598660791292787 }, { "copy_logits_max": -4.5894951820373535, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.3125, "epoch": 0.8563696706663263, "gen_logits_max": 4.617021560668945, "gen_logits_mean": -15.308704376220703, "gen_logits_min": -26.981887817382812, "gen_logits_std": 2.8062305450439453, "gen_loss": 0.30891579389572144, "grad_norm": 0.4000552385976364, "learning_rate": 2.5633263157894737e-05, "loss": 0.316, "mean_copy_accuracy": 0.9947451949119568, "mean_gen_accuracy": 0.8645722270011902, "mean_token_accuracy": 0.8952598571777344, "num_tokens": 52028633.0, "sample_num_tokens": 8468.75, "step": 4193, "total_num_tokens": 52062508.0, "z_loss": 0.0006077298312447965 }, { "copy_logits_max": -5.6613335609436035, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.375, "epoch": 0.8565739086035231, "gen_logits_max": 5.478811264038086, "gen_logits_mean": -13.902640342712402, "gen_logits_min": -26.054794311523438, "gen_logits_std": 2.8549416065216064, "gen_loss": 0.31912514567375183, "grad_norm": 0.3800272283776695, "learning_rate": 2.5632e-05, "loss": 0.3133, "mean_copy_accuracy": 0.9934167414903641, "mean_gen_accuracy": 0.8737158477306366, "mean_token_accuracy": 0.8954846858978271, "num_tokens": 52282704.0, "sample_num_tokens": 7927.5, "step": 4194, "total_num_tokens": 52314414.0, "z_loss": 0.0005550750647671521 }, { "copy_logits_max": -5.251995086669922, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.1875, "epoch": 0.8567781465407199, "gen_logits_max": 6.577853679656982, "gen_logits_mean": -12.93029499053955, "gen_logits_min": -24.29416275024414, "gen_logits_std": 2.786832094192505, "gen_loss": 0.335066556930542, "grad_norm": 0.4259914003759757, "learning_rate": 2.5630736842105262e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9962937086820602, "mean_gen_accuracy": 0.8672108799219131, "mean_token_accuracy": 0.9042095243930817, "num_tokens": 52561024.0, "sample_num_tokens": 8825.0, "step": 4195, "total_num_tokens": 52596324.0, "z_loss": 0.0005849473527632654 }, { "copy_logits_max": -3.083634376525879, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.0625, "epoch": 0.8569823844779167, "gen_logits_max": 5.053707122802734, "gen_logits_mean": -14.461041450500488, "gen_logits_min": -26.742332458496094, "gen_logits_std": 2.8797967433929443, "gen_loss": 0.30958545207977295, "grad_norm": 0.651090148797471, "learning_rate": 2.5629473684210526e-05, "loss": 0.3139, "mean_copy_accuracy": 0.9967915862798691, "mean_gen_accuracy": 0.8626997321844101, "mean_token_accuracy": 0.8971170037984848, "num_tokens": 52835866.0, "sample_num_tokens": 8455.5, "step": 4196, "total_num_tokens": 52869688.0, "z_loss": 0.0006600353517569602 }, { "copy_logits_max": -4.7453460693359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 276.125, "epoch": 0.8571866224151136, "gen_logits_max": 6.137246608734131, "gen_logits_mean": -15.46186637878418, "gen_logits_min": -26.737401962280273, "gen_logits_std": 2.831162452697754, "gen_loss": 0.3314177691936493, "grad_norm": 0.37983434449046055, "learning_rate": 2.562821052631579e-05, "loss": 0.308, "mean_copy_accuracy": 0.9954645037651062, "mean_gen_accuracy": 0.8705069720745087, "mean_token_accuracy": 0.8973764330148697, "num_tokens": 53105297.0, "sample_num_tokens": 7805.25, "step": 4197, "total_num_tokens": 53136518.0, "z_loss": 0.0006429270142689347 }, { "copy_logits_max": -0.5737329721450806, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.9375, "epoch": 0.8573908603523105, "gen_logits_max": 5.1265387535095215, "gen_logits_mean": -13.881244659423828, "gen_logits_min": -25.960521697998047, "gen_logits_std": 2.852078437805176, "gen_loss": 0.30744045972824097, "grad_norm": 0.4287758959143473, "learning_rate": 2.5626947368421055e-05, "loss": 0.3218, "mean_copy_accuracy": 0.9953549355268478, "mean_gen_accuracy": 0.8594277650117874, "mean_token_accuracy": 0.8949426710605621, "num_tokens": 53378645.0, "sample_num_tokens": 8656.75, "step": 4198, "total_num_tokens": 53413272.0, "z_loss": 0.0007499516941606998 }, { "copy_logits_max": -2.3507890701293945, "copy_logits_min": -750000064.0, "copy_num_tokens": 358.125, "epoch": 0.8575950982895073, "gen_logits_max": 6.117931365966797, "gen_logits_mean": -13.493728637695312, "gen_logits_min": -25.19379425048828, "gen_logits_std": 2.804931640625, "gen_loss": 0.31046974658966064, "grad_norm": 0.4135747753935675, "learning_rate": 2.5625684210526316e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9942371696233749, "mean_gen_accuracy": 0.8702796548604965, "mean_token_accuracy": 0.9004745334386826, "num_tokens": 53641666.0, "sample_num_tokens": 7694.0, "step": 4199, "total_num_tokens": 53672442.0, "z_loss": 0.0007219776161946356 }, { "copy_logits_max": -1.6537690162658691, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.5, "epoch": 0.8577993362267041, "gen_logits_max": 5.970496654510498, "gen_logits_mean": -12.615571975708008, "gen_logits_min": -24.861202239990234, "gen_logits_std": 2.819276809692383, "gen_loss": 0.2867163121700287, "grad_norm": 0.41504782306875415, "learning_rate": 2.562442105263158e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9955058693885803, "mean_gen_accuracy": 0.8683784455060959, "mean_token_accuracy": 0.9005292356014252, "num_tokens": 53906416.0, "sample_num_tokens": 8667.5, "step": 4200, "total_num_tokens": 53941086.0, "z_loss": 0.0006594958249479532 }, { "copy_logits_max": -3.4115500450134277, "copy_logits_min": -750000128.0, "copy_num_tokens": 543.9375, "epoch": 0.8580035741639009, "gen_logits_max": 4.591391563415527, "gen_logits_mean": -14.452425003051758, "gen_logits_min": -26.96197509765625, "gen_logits_std": 2.8091089725494385, "gen_loss": 0.2767742872238159, "grad_norm": 0.393016836513993, "learning_rate": 2.562315789473684e-05, "loss": 0.307, "mean_copy_accuracy": 0.9951691925525665, "mean_gen_accuracy": 0.8647980839014053, "mean_token_accuracy": 0.8993536829948425, "num_tokens": 54195924.0, "sample_num_tokens": 8883.5, "step": 4201, "total_num_tokens": 54231458.0, "z_loss": 0.0006053575198166072 }, { "copy_logits_max": -2.332858085632324, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.875, "epoch": 0.8582078121010978, "gen_logits_max": 6.133648872375488, "gen_logits_mean": -13.764854431152344, "gen_logits_min": -25.820392608642578, "gen_logits_std": 2.8245255947113037, "gen_loss": 0.3058244585990906, "grad_norm": 0.3964046946603822, "learning_rate": 2.5621894736842106e-05, "loss": 0.3091, "mean_copy_accuracy": 0.995657742023468, "mean_gen_accuracy": 0.8638010323047638, "mean_token_accuracy": 0.9003987014293671, "num_tokens": 54475559.0, "sample_num_tokens": 8341.25, "step": 4202, "total_num_tokens": 54508924.0, "z_loss": 0.0006684705149382353 }, { "copy_logits_max": -5.526039123535156, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.6875, "epoch": 0.8584120500382946, "gen_logits_max": 5.62062931060791, "gen_logits_mean": -14.490320205688477, "gen_logits_min": -26.51634979248047, "gen_logits_std": 2.8334317207336426, "gen_loss": 0.2870549261569977, "grad_norm": 0.39372363693562457, "learning_rate": 2.5620631578947366e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9952987432479858, "mean_gen_accuracy": 0.8702643811702728, "mean_token_accuracy": 0.9025118499994278, "num_tokens": 54758294.0, "sample_num_tokens": 9618.0, "step": 4203, "total_num_tokens": 54796766.0, "z_loss": 0.0006523338379338384 }, { "copy_logits_max": -5.606818675994873, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.125, "epoch": 0.8586162879754915, "gen_logits_max": 5.895695209503174, "gen_logits_mean": -14.071477890014648, "gen_logits_min": -26.03438377380371, "gen_logits_std": 2.828035354614258, "gen_loss": 0.2943135201931, "grad_norm": 0.37362523208929754, "learning_rate": 2.561936842105263e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9961423426866531, "mean_gen_accuracy": 0.8642618358135223, "mean_token_accuracy": 0.901097297668457, "num_tokens": 55043500.0, "sample_num_tokens": 8174.5, "step": 4204, "total_num_tokens": 55076198.0, "z_loss": 0.0006302370456978679 }, { "copy_logits_max": -4.839352607727051, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.25, "epoch": 0.8588205259126883, "gen_logits_max": 5.3921918869018555, "gen_logits_mean": -14.788718223571777, "gen_logits_min": -26.8326416015625, "gen_logits_std": 2.825299024581909, "gen_loss": 0.3346588909626007, "grad_norm": 0.4026988141784087, "learning_rate": 2.5618105263157895e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9947277009487152, "mean_gen_accuracy": 0.8634652346372604, "mean_token_accuracy": 0.8982063978910446, "num_tokens": 55300557.0, "sample_num_tokens": 8005.25, "step": 4205, "total_num_tokens": 55332578.0, "z_loss": 0.0007188315503299236 }, { "copy_logits_max": -3.55318546295166, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.6875, "epoch": 0.8590247638498851, "gen_logits_max": 5.219108581542969, "gen_logits_mean": -13.890743255615234, "gen_logits_min": -26.067983627319336, "gen_logits_std": 2.848928451538086, "gen_loss": 0.27101194858551025, "grad_norm": 0.40343682451782575, "learning_rate": 2.561684210526316e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9941941648721695, "mean_gen_accuracy": 0.8792823106050491, "mean_token_accuracy": 0.9034298062324524, "num_tokens": 55577230.0, "sample_num_tokens": 8872.5, "step": 4206, "total_num_tokens": 55612720.0, "z_loss": 0.0005715532461181283 }, { "copy_logits_max": -4.63729190826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.375, "epoch": 0.859229001787082, "gen_logits_max": 4.7892961502075195, "gen_logits_mean": -14.900923728942871, "gen_logits_min": -26.883785247802734, "gen_logits_std": 2.82930850982666, "gen_loss": 0.28438514471054077, "grad_norm": 0.4282255519283037, "learning_rate": 2.5615578947368424e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9942808002233505, "mean_gen_accuracy": 0.8705086708068848, "mean_token_accuracy": 0.9004151672124863, "num_tokens": 55846582.0, "sample_num_tokens": 8725.0, "step": 4207, "total_num_tokens": 55881482.0, "z_loss": 0.0006079147569835186 }, { "copy_logits_max": -4.153413772583008, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.4375, "epoch": 0.8594332397242788, "gen_logits_max": 4.8253021240234375, "gen_logits_mean": -16.001375198364258, "gen_logits_min": -27.885583877563477, "gen_logits_std": 2.812692642211914, "gen_loss": 0.3105330467224121, "grad_norm": 0.4688892644335186, "learning_rate": 2.5614315789473685e-05, "loss": 0.3417, "mean_copy_accuracy": 0.995772510766983, "mean_gen_accuracy": 0.8515813648700714, "mean_token_accuracy": 0.8883892744779587, "num_tokens": 56124605.0, "sample_num_tokens": 8211.75, "step": 4208, "total_num_tokens": 56157452.0, "z_loss": 0.0007039642659947276 }, { "copy_logits_max": -2.3270015716552734, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.4375, "epoch": 0.8596374776614756, "gen_logits_max": 6.08420467376709, "gen_logits_mean": -14.25955581665039, "gen_logits_min": -26.24544906616211, "gen_logits_std": 2.8196825981140137, "gen_loss": 0.32856136560440063, "grad_norm": 0.43677663765213903, "learning_rate": 2.561305263157895e-05, "loss": 0.3053, "mean_copy_accuracy": 0.9955075681209564, "mean_gen_accuracy": 0.864050880074501, "mean_token_accuracy": 0.8992619961500168, "num_tokens": 56404489.0, "sample_num_tokens": 7575.25, "step": 4209, "total_num_tokens": 56434790.0, "z_loss": 0.000755566987209022 }, { "copy_logits_max": -3.402754306793213, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.3125, "epoch": 0.8598417155986725, "gen_logits_max": 4.262632846832275, "gen_logits_mean": -15.365163803100586, "gen_logits_min": -27.258174896240234, "gen_logits_std": 2.836977005004883, "gen_loss": 0.30871057510375977, "grad_norm": 0.4526748172541692, "learning_rate": 2.561178947368421e-05, "loss": 0.3235, "mean_copy_accuracy": 0.995101273059845, "mean_gen_accuracy": 0.8589594662189484, "mean_token_accuracy": 0.8909210711717606, "num_tokens": 56657086.0, "sample_num_tokens": 8278.5, "step": 4210, "total_num_tokens": 56690200.0, "z_loss": 0.0006405056337825954 }, { "copy_logits_max": -3.416774034500122, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.3125, "epoch": 0.8600459535358693, "gen_logits_max": 4.731387138366699, "gen_logits_mean": -14.986462593078613, "gen_logits_min": -26.990047454833984, "gen_logits_std": 2.8371143341064453, "gen_loss": 0.31093114614486694, "grad_norm": 0.4602301577063968, "learning_rate": 2.5610526315789474e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9941764324903488, "mean_gen_accuracy": 0.8651903420686722, "mean_token_accuracy": 0.8962727189064026, "num_tokens": 56916263.0, "sample_num_tokens": 8332.25, "step": 4211, "total_num_tokens": 56949592.0, "z_loss": 0.0006289392476901412 }, { "copy_logits_max": -2.7694907188415527, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.375, "epoch": 0.8602501914730661, "gen_logits_max": 4.669836521148682, "gen_logits_mean": -15.203107833862305, "gen_logits_min": -26.839183807373047, "gen_logits_std": 2.8065929412841797, "gen_loss": 0.3521663546562195, "grad_norm": 0.4399669570380001, "learning_rate": 2.5609263157894735e-05, "loss": 0.3324, "mean_copy_accuracy": 0.9942445009946823, "mean_gen_accuracy": 0.8573333472013474, "mean_token_accuracy": 0.8910389840602875, "num_tokens": 57186701.0, "sample_num_tokens": 8413.25, "step": 4212, "total_num_tokens": 57220354.0, "z_loss": 0.0006764784920960665 }, { "copy_logits_max": -0.48800837993621826, "copy_logits_min": -750000000.0, "copy_num_tokens": 632.0, "epoch": 0.860454429410263, "gen_logits_max": 4.975824356079102, "gen_logits_mean": -13.864875793457031, "gen_logits_min": -26.03842544555664, "gen_logits_std": 2.838878631591797, "gen_loss": 0.3192712664604187, "grad_norm": 0.4231317425282934, "learning_rate": 2.5608000000000003e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9953083693981171, "mean_gen_accuracy": 0.86774642765522, "mean_token_accuracy": 0.8994921743869781, "num_tokens": 57470994.0, "sample_num_tokens": 10015.0, "step": 4213, "total_num_tokens": 57511054.0, "z_loss": 0.0006625335663557053 }, { "copy_logits_max": -2.845926284790039, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.3125, "epoch": 0.8606586673474598, "gen_logits_max": 3.445607900619507, "gen_logits_mean": -16.44335174560547, "gen_logits_min": -27.97954559326172, "gen_logits_std": 2.8358006477355957, "gen_loss": 0.2922918200492859, "grad_norm": 0.4242415452765049, "learning_rate": 2.5606736842105264e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9960555285215378, "mean_gen_accuracy": 0.8738422989845276, "mean_token_accuracy": 0.9038112908601761, "num_tokens": 57754413.0, "sample_num_tokens": 10136.75, "step": 4214, "total_num_tokens": 57794960.0, "z_loss": 0.0005661917966790497 }, { "copy_logits_max": -2.585400104522705, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.5, "epoch": 0.8608629052846566, "gen_logits_max": 5.101503372192383, "gen_logits_mean": -14.602251052856445, "gen_logits_min": -26.70785140991211, "gen_logits_std": 2.8292198181152344, "gen_loss": 0.30480819940567017, "grad_norm": 0.4148275932145084, "learning_rate": 2.5605473684210528e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9944596737623215, "mean_gen_accuracy": 0.8611575812101364, "mean_token_accuracy": 0.8921483755111694, "num_tokens": 58019483.0, "sample_num_tokens": 8585.25, "step": 4215, "total_num_tokens": 58053824.0, "z_loss": 0.0005918731912970543 }, { "copy_logits_max": -4.496154308319092, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.9375, "epoch": 0.8610671432218534, "gen_logits_max": 4.983706474304199, "gen_logits_mean": -14.028728485107422, "gen_logits_min": -25.81934356689453, "gen_logits_std": 2.8425803184509277, "gen_loss": 0.2626185715198517, "grad_norm": 0.468335056005214, "learning_rate": 2.560421052631579e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9948610961437225, "mean_gen_accuracy": 0.8622984290122986, "mean_token_accuracy": 0.8971998691558838, "num_tokens": 58293084.0, "sample_num_tokens": 9272.5, "step": 4216, "total_num_tokens": 58330174.0, "z_loss": 0.0005081656854599714 }, { "copy_logits_max": -1.1822731494903564, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.6875, "epoch": 0.8612713811590503, "gen_logits_max": 4.558856010437012, "gen_logits_mean": -14.332691192626953, "gen_logits_min": -26.44904136657715, "gen_logits_std": 2.863110065460205, "gen_loss": 0.3207348585128784, "grad_norm": 0.41144954594290184, "learning_rate": 2.5602947368421053e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9955796152353287, "mean_gen_accuracy": 0.8568753004074097, "mean_token_accuracy": 0.8899521827697754, "num_tokens": 58560884.0, "sample_num_tokens": 8544.5, "step": 4217, "total_num_tokens": 58595062.0, "z_loss": 0.0006203673547133803 }, { "copy_logits_max": -3.434598922729492, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.8125, "epoch": 0.8614756190962471, "gen_logits_max": 4.024767875671387, "gen_logits_mean": -15.742409706115723, "gen_logits_min": -27.294681549072266, "gen_logits_std": 2.8320858478546143, "gen_loss": 0.2993945777416229, "grad_norm": 0.40669815749638144, "learning_rate": 2.5601684210526314e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9963490217924118, "mean_gen_accuracy": 0.8672426640987396, "mean_token_accuracy": 0.9013701528310776, "num_tokens": 58839913.0, "sample_num_tokens": 8313.75, "step": 4218, "total_num_tokens": 58873168.0, "z_loss": 0.0005903173005208373 }, { "copy_logits_max": -1.6187323331832886, "copy_logits_min": -750000064.0, "copy_num_tokens": 389.9375, "epoch": 0.861679857033444, "gen_logits_max": 4.743868827819824, "gen_logits_mean": -15.017956733703613, "gen_logits_min": -27.453784942626953, "gen_logits_std": 2.8346309661865234, "gen_loss": 0.3084661662578583, "grad_norm": 0.43890182900140434, "learning_rate": 2.560042105263158e-05, "loss": 0.3218, "mean_copy_accuracy": 0.9958230555057526, "mean_gen_accuracy": 0.8610920310020447, "mean_token_accuracy": 0.8960841000080109, "num_tokens": 59114358.0, "sample_num_tokens": 7097.5, "step": 4219, "total_num_tokens": 59142748.0, "z_loss": 0.000643631094135344 }, { "copy_logits_max": 1.2625423669815063, "copy_logits_min": -562500096.0, "copy_num_tokens": 644.4375, "epoch": 0.8618840949706408, "gen_logits_max": 4.811830997467041, "gen_logits_mean": -13.328287124633789, "gen_logits_min": -25.597572326660156, "gen_logits_std": 2.850869655609131, "gen_loss": 0.288563996553421, "grad_norm": 0.3628114252619765, "learning_rate": 2.5599157894736843e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9961805641651154, "mean_gen_accuracy": 0.8741739988327026, "mean_token_accuracy": 0.9075736403465271, "num_tokens": 59402759.0, "sample_num_tokens": 9854.25, "step": 4220, "total_num_tokens": 59442176.0, "z_loss": 0.0006581267807632685 }, { "copy_logits_max": -1.5904220342636108, "copy_logits_min": -687500032.0, "copy_num_tokens": 493.875, "epoch": 0.8620883329078376, "gen_logits_max": 4.286675453186035, "gen_logits_mean": -14.864238739013672, "gen_logits_min": -27.29310417175293, "gen_logits_std": 2.8401031494140625, "gen_loss": 0.30930060148239136, "grad_norm": 0.424519098456408, "learning_rate": 2.5597894736842107e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9949305206537247, "mean_gen_accuracy": 0.8620207160711288, "mean_token_accuracy": 0.8996906131505966, "num_tokens": 59676881.0, "sample_num_tokens": 7858.75, "step": 4221, "total_num_tokens": 59708316.0, "z_loss": 0.0006644302047789097 }, { "copy_logits_max": -4.799681663513184, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.9375, "epoch": 0.8622925708450344, "gen_logits_max": 5.090685844421387, "gen_logits_mean": -15.100083351135254, "gen_logits_min": -26.863189697265625, "gen_logits_std": 2.8268368244171143, "gen_loss": 0.3182617127895355, "grad_norm": 0.40410917706984745, "learning_rate": 2.559663157894737e-05, "loss": 0.3174, "mean_copy_accuracy": 0.9952253997325897, "mean_gen_accuracy": 0.8638771921396255, "mean_token_accuracy": 0.8965442180633545, "num_tokens": 59961702.0, "sample_num_tokens": 8361.0, "step": 4222, "total_num_tokens": 59995146.0, "z_loss": 0.0006557961460202932 }, { "copy_logits_max": -0.320797324180603, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.625, "epoch": 0.8624968087822313, "gen_logits_max": 4.7097859382629395, "gen_logits_mean": -15.29574203491211, "gen_logits_min": -27.469112396240234, "gen_logits_std": 2.837160110473633, "gen_loss": 0.31897181272506714, "grad_norm": 0.4664552093129146, "learning_rate": 2.5595368421052632e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9942785948514938, "mean_gen_accuracy": 0.8627931028604507, "mean_token_accuracy": 0.8956375420093536, "num_tokens": 60214582.0, "sample_num_tokens": 8434.0, "step": 4223, "total_num_tokens": 60248318.0, "z_loss": 0.0006731204339303076 }, { "copy_logits_max": -2.5621860027313232, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.75, "epoch": 0.8627010467194282, "gen_logits_max": 5.621547698974609, "gen_logits_mean": -14.236807823181152, "gen_logits_min": -26.722171783447266, "gen_logits_std": 2.857968807220459, "gen_loss": 0.29782235622406006, "grad_norm": 0.38842053090193557, "learning_rate": 2.5594105263157897e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9946061819791794, "mean_gen_accuracy": 0.8743426948785782, "mean_token_accuracy": 0.9019962400197983, "num_tokens": 60474738.0, "sample_num_tokens": 7175.5, "step": 4224, "total_num_tokens": 60503440.0, "z_loss": 0.0005950183258391917 }, { "copy_logits_max": -3.182910919189453, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.625, "epoch": 0.862905284656625, "gen_logits_max": 4.190699100494385, "gen_logits_mean": -15.489740371704102, "gen_logits_min": -27.624277114868164, "gen_logits_std": 2.833859920501709, "gen_loss": 0.3168429732322693, "grad_norm": 0.5729518061904078, "learning_rate": 2.5592842105263158e-05, "loss": 0.3315, "mean_copy_accuracy": 0.9945733100175858, "mean_gen_accuracy": 0.8564905077219009, "mean_token_accuracy": 0.89015993475914, "num_tokens": 60750176.0, "sample_num_tokens": 8638.5, "step": 4225, "total_num_tokens": 60784730.0, "z_loss": 0.0006617432809434831 }, { "copy_logits_max": -2.9873275756835938, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.9375, "epoch": 0.8631095225938218, "gen_logits_max": 5.128022193908691, "gen_logits_mean": -13.964232444763184, "gen_logits_min": -26.266569137573242, "gen_logits_std": 2.8094935417175293, "gen_loss": 0.2917354106903076, "grad_norm": 0.43221802699219086, "learning_rate": 2.5591578947368422e-05, "loss": 0.297, "mean_copy_accuracy": 0.996050164103508, "mean_gen_accuracy": 0.8668418228626251, "mean_token_accuracy": 0.899285614490509, "num_tokens": 61018067.0, "sample_num_tokens": 8123.25, "step": 4226, "total_num_tokens": 61050560.0, "z_loss": 0.0006163232610560954 }, { "copy_logits_max": -2.394643783569336, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.625, "epoch": 0.8633137605310186, "gen_logits_max": 5.320566177368164, "gen_logits_mean": -13.704164505004883, "gen_logits_min": -26.349313735961914, "gen_logits_std": 2.8036723136901855, "gen_loss": 0.338483989238739, "grad_norm": 0.4174125604481665, "learning_rate": 2.5590315789473683e-05, "loss": 0.3016, "mean_copy_accuracy": 0.99641153216362, "mean_gen_accuracy": 0.8661307394504547, "mean_token_accuracy": 0.9006179869174957, "num_tokens": 61280899.0, "sample_num_tokens": 8593.75, "step": 4227, "total_num_tokens": 61315274.0, "z_loss": 0.0008521189447492361 }, { "copy_logits_max": -4.575351238250732, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.1875, "epoch": 0.8635179984682154, "gen_logits_max": 4.415503978729248, "gen_logits_mean": -15.719392776489258, "gen_logits_min": -27.714649200439453, "gen_logits_std": 2.817192554473877, "gen_loss": 0.31304001808166504, "grad_norm": 0.46204464864180445, "learning_rate": 2.5589052631578947e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9950049072504044, "mean_gen_accuracy": 0.8685382008552551, "mean_token_accuracy": 0.9015205949544907, "num_tokens": 61555862.0, "sample_num_tokens": 7407.5, "step": 4228, "total_num_tokens": 61585492.0, "z_loss": 0.0006306882714852691 }, { "copy_logits_max": -2.2164087295532227, "copy_logits_min": -687500032.0, "copy_num_tokens": 304.875, "epoch": 0.8637222364054123, "gen_logits_max": 6.043499946594238, "gen_logits_mean": -13.145437240600586, "gen_logits_min": -25.77039909362793, "gen_logits_std": 2.760817527770996, "gen_loss": 0.33379849791526794, "grad_norm": 0.41187989689694926, "learning_rate": 2.558778947368421e-05, "loss": 0.3326, "mean_copy_accuracy": 0.9941159337759018, "mean_gen_accuracy": 0.8603430092334747, "mean_token_accuracy": 0.8903776109218597, "num_tokens": 61810172.0, "sample_num_tokens": 6433.0, "step": 4229, "total_num_tokens": 61835904.0, "z_loss": 0.0008246225770562887 }, { "copy_logits_max": 0.1407243013381958, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.0, "epoch": 0.8639264743426092, "gen_logits_max": 4.247321128845215, "gen_logits_mean": -15.05870246887207, "gen_logits_min": -27.153541564941406, "gen_logits_std": 2.837174892425537, "gen_loss": 0.33171528577804565, "grad_norm": 0.5661395950745383, "learning_rate": 2.5586526315789476e-05, "loss": 0.303, "mean_copy_accuracy": 0.9953136444091797, "mean_gen_accuracy": 0.8671054095029831, "mean_token_accuracy": 0.8987181037664413, "num_tokens": 62082438.0, "sample_num_tokens": 7857.5, "step": 4230, "total_num_tokens": 62113868.0, "z_loss": 0.0009083333425223827 }, { "copy_logits_max": -0.3248894512653351, "copy_logits_min": -687500032.0, "copy_num_tokens": 433.1875, "epoch": 0.864130712279806, "gen_logits_max": 4.5956315994262695, "gen_logits_mean": -15.472221374511719, "gen_logits_min": -27.17702865600586, "gen_logits_std": 2.8183326721191406, "gen_loss": 0.2879430949687958, "grad_norm": 0.3785014300090555, "learning_rate": 2.5585263157894737e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9949306100606918, "mean_gen_accuracy": 0.871195062994957, "mean_token_accuracy": 0.9027924239635468, "num_tokens": 62362570.0, "sample_num_tokens": 8087.0, "step": 4231, "total_num_tokens": 62394918.0, "z_loss": 0.0008421640377491713 }, { "copy_logits_max": -0.3265053927898407, "copy_logits_min": -625000064.0, "copy_num_tokens": 518.25, "epoch": 0.8643349502170028, "gen_logits_max": 4.702664375305176, "gen_logits_mean": -14.073318481445312, "gen_logits_min": -26.473920822143555, "gen_logits_std": 2.8311290740966797, "gen_loss": 0.2908359169960022, "grad_norm": 0.39517381067405505, "learning_rate": 2.5584e-05, "loss": 0.2943, "mean_copy_accuracy": 0.996680736541748, "mean_gen_accuracy": 0.8649361431598663, "mean_token_accuracy": 0.9022188037633896, "num_tokens": 62645796.0, "sample_num_tokens": 8426.0, "step": 4232, "total_num_tokens": 62679500.0, "z_loss": 0.0009541952167637646 }, { "copy_logits_max": -1.7135562896728516, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.5, "epoch": 0.8645391881541996, "gen_logits_max": 4.730853080749512, "gen_logits_mean": -14.819292068481445, "gen_logits_min": -26.857200622558594, "gen_logits_std": 2.818495035171509, "gen_loss": 0.28857165575027466, "grad_norm": 0.39845319365507714, "learning_rate": 2.5582736842105265e-05, "loss": 0.3036, "mean_copy_accuracy": 0.994651734828949, "mean_gen_accuracy": 0.871051236987114, "mean_token_accuracy": 0.8994637131690979, "num_tokens": 62896378.0, "sample_num_tokens": 8072.0, "step": 4233, "total_num_tokens": 62928666.0, "z_loss": 0.0007156261708587408 }, { "copy_logits_max": 0.47838544845581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.875, "epoch": 0.8647434260913964, "gen_logits_max": 3.3312573432922363, "gen_logits_mean": -15.298772811889648, "gen_logits_min": -27.747879028320312, "gen_logits_std": 2.852069854736328, "gen_loss": 0.25944435596466064, "grad_norm": 0.5113662862137558, "learning_rate": 2.5581473684210526e-05, "loss": 0.302, "mean_copy_accuracy": 0.9943203330039978, "mean_gen_accuracy": 0.8663453906774521, "mean_token_accuracy": 0.8997243046760559, "num_tokens": 63167458.0, "sample_num_tokens": 9442.5, "step": 4234, "total_num_tokens": 63205228.0, "z_loss": 0.0006897816201671958 }, { "copy_logits_max": -4.015292644500732, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.125, "epoch": 0.8649476640285934, "gen_logits_max": 5.374277114868164, "gen_logits_mean": -13.007400512695312, "gen_logits_min": -24.985261917114258, "gen_logits_std": 2.7652645111083984, "gen_loss": 0.3030504584312439, "grad_norm": 0.43420242532757986, "learning_rate": 2.558021052631579e-05, "loss": 0.2995, "mean_copy_accuracy": 0.9944563955068588, "mean_gen_accuracy": 0.8703432679176331, "mean_token_accuracy": 0.9010226875543594, "num_tokens": 63458431.0, "sample_num_tokens": 8305.75, "step": 4235, "total_num_tokens": 63491654.0, "z_loss": 0.0006485239719040692 }, { "copy_logits_max": -1.9371843338012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.1875, "epoch": 0.8651519019657902, "gen_logits_max": 3.560772180557251, "gen_logits_mean": -14.89829158782959, "gen_logits_min": -26.984813690185547, "gen_logits_std": 2.8255951404571533, "gen_loss": 0.3032717704772949, "grad_norm": 0.4228691206913318, "learning_rate": 2.557894736842105e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9954822361469269, "mean_gen_accuracy": 0.8623845130205154, "mean_token_accuracy": 0.897834300994873, "num_tokens": 63732692.0, "sample_num_tokens": 9054.0, "step": 4236, "total_num_tokens": 63768908.0, "z_loss": 0.0006793525535613298 }, { "copy_logits_max": -3.554675340652466, "copy_logits_min": -750000064.0, "copy_num_tokens": 440.0, "epoch": 0.865356139902987, "gen_logits_max": 4.542511463165283, "gen_logits_mean": -14.176464080810547, "gen_logits_min": -26.937671661376953, "gen_logits_std": 2.8590707778930664, "gen_loss": 0.3058643937110901, "grad_norm": 0.41542066149678175, "learning_rate": 2.557768421052632e-05, "loss": 0.3237, "mean_copy_accuracy": 0.9949967414140701, "mean_gen_accuracy": 0.8618125468492508, "mean_token_accuracy": 0.8937796950340271, "num_tokens": 64008521.0, "sample_num_tokens": 7679.75, "step": 4237, "total_num_tokens": 64039240.0, "z_loss": 0.0007277765544131398 }, { "copy_logits_max": -4.424324989318848, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.5, "epoch": 0.8655603778401838, "gen_logits_max": 4.198132514953613, "gen_logits_mean": -14.469069480895996, "gen_logits_min": -27.1878662109375, "gen_logits_std": 2.880936622619629, "gen_loss": 0.26166611909866333, "grad_norm": 0.38093349735239984, "learning_rate": 2.557642105263158e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9962840527296066, "mean_gen_accuracy": 0.8669420629739761, "mean_token_accuracy": 0.9018715918064117, "num_tokens": 64295157.0, "sample_num_tokens": 7715.75, "step": 4238, "total_num_tokens": 64326020.0, "z_loss": 0.0005666202632710338 }, { "copy_logits_max": -6.835798263549805, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.5625, "epoch": 0.8657646157773806, "gen_logits_max": 4.478464126586914, "gen_logits_mean": -15.102714538574219, "gen_logits_min": -26.86309814453125, "gen_logits_std": 2.82179594039917, "gen_loss": 0.3059779405593872, "grad_norm": 0.42433815053118057, "learning_rate": 2.5575157894736844e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9936452955007553, "mean_gen_accuracy": 0.872249498963356, "mean_token_accuracy": 0.9015688300132751, "num_tokens": 64561210.0, "sample_num_tokens": 8254.5, "step": 4239, "total_num_tokens": 64594228.0, "z_loss": 0.0005850490415468812 }, { "copy_logits_max": -2.8057303428649902, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.25, "epoch": 0.8659688537145774, "gen_logits_max": 5.077394008636475, "gen_logits_mean": -13.734113693237305, "gen_logits_min": -26.04058074951172, "gen_logits_std": 2.843513011932373, "gen_loss": 0.3565465807914734, "grad_norm": 0.41362038399997675, "learning_rate": 2.5573894736842105e-05, "loss": 0.3193, "mean_copy_accuracy": 0.9947263300418854, "mean_gen_accuracy": 0.8577665537595749, "mean_token_accuracy": 0.8916924446821213, "num_tokens": 64837145.0, "sample_num_tokens": 8615.25, "step": 4240, "total_num_tokens": 64871606.0, "z_loss": 0.000709861982613802 }, { "copy_logits_max": -5.517562389373779, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.25, "epoch": 0.8661730916517744, "gen_logits_max": 3.7652504444122314, "gen_logits_mean": -15.099712371826172, "gen_logits_min": -27.793909072875977, "gen_logits_std": 2.864863872528076, "gen_loss": 0.2838340997695923, "grad_norm": 0.38969276727860397, "learning_rate": 2.557263157894737e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9948477298021317, "mean_gen_accuracy": 0.8734432905912399, "mean_token_accuracy": 0.9030872732400894, "num_tokens": 65095884.0, "sample_num_tokens": 8235.0, "step": 4241, "total_num_tokens": 65128824.0, "z_loss": 0.0006119117024354637 }, { "copy_logits_max": -4.25464391708374, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.5625, "epoch": 0.8663773295889712, "gen_logits_max": 4.233222961425781, "gen_logits_mean": -14.65188217163086, "gen_logits_min": -27.102685928344727, "gen_logits_std": 2.8701889514923096, "gen_loss": 0.27075284719467163, "grad_norm": 0.38984699002338125, "learning_rate": 2.557136842105263e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9950270503759384, "mean_gen_accuracy": 0.867654025554657, "mean_token_accuracy": 0.9018906652927399, "num_tokens": 65382889.0, "sample_num_tokens": 8725.75, "step": 4242, "total_num_tokens": 65417792.0, "z_loss": 0.000668608583509922 }, { "copy_logits_max": -2.2097432613372803, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.75, "epoch": 0.866581567526168, "gen_logits_max": 5.345682144165039, "gen_logits_mean": -13.709953308105469, "gen_logits_min": -25.57990264892578, "gen_logits_std": 2.8332903385162354, "gen_loss": 0.3141093850135803, "grad_norm": 0.40028038396185395, "learning_rate": 2.5570105263157895e-05, "loss": 0.3156, "mean_copy_accuracy": 0.995007112622261, "mean_gen_accuracy": 0.8598729968070984, "mean_token_accuracy": 0.8960232436656952, "num_tokens": 65651708.0, "sample_num_tokens": 8713.5, "step": 4243, "total_num_tokens": 65686562.0, "z_loss": 0.0007800161256454885 }, { "copy_logits_max": -4.536598205566406, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.8125, "epoch": 0.8667858054633648, "gen_logits_max": 5.031454563140869, "gen_logits_mean": -15.041528701782227, "gen_logits_min": -26.835418701171875, "gen_logits_std": 2.8391075134277344, "gen_loss": 0.28682345151901245, "grad_norm": 0.39817253971508887, "learning_rate": 2.5568842105263156e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9942950755357742, "mean_gen_accuracy": 0.8703942745923996, "mean_token_accuracy": 0.905293881893158, "num_tokens": 65915762.0, "sample_num_tokens": 8133.5, "step": 4244, "total_num_tokens": 65948296.0, "z_loss": 0.0006245103431865573 }, { "copy_logits_max": -3.0426995754241943, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.6875, "epoch": 0.8669900434005616, "gen_logits_max": 3.9519004821777344, "gen_logits_mean": -15.549860954284668, "gen_logits_min": -27.565757751464844, "gen_logits_std": 2.9054501056671143, "gen_loss": 0.24861901998519897, "grad_norm": 0.40941808976984384, "learning_rate": 2.5567578947368424e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9949043840169907, "mean_gen_accuracy": 0.8735053986310959, "mean_token_accuracy": 0.9048685878515244, "num_tokens": 66185442.0, "sample_num_tokens": 8199.0, "step": 4245, "total_num_tokens": 66218238.0, "z_loss": 0.0005395868793129921 }, { "copy_logits_max": -1.6544969081878662, "copy_logits_min": -750000064.0, "copy_num_tokens": 380.25, "epoch": 0.8671942813377584, "gen_logits_max": 6.662683486938477, "gen_logits_mean": -13.488229751586914, "gen_logits_min": -25.700284957885742, "gen_logits_std": 2.8686888217926025, "gen_loss": 0.3377642035484314, "grad_norm": 0.42397458461749965, "learning_rate": 2.5566315789473684e-05, "loss": 0.3208, "mean_copy_accuracy": 0.9947458952665329, "mean_gen_accuracy": 0.858521893620491, "mean_token_accuracy": 0.8930905610322952, "num_tokens": 66436475.0, "sample_num_tokens": 6761.75, "step": 4246, "total_num_tokens": 66463522.0, "z_loss": 0.0007036416209302843 }, { "copy_logits_max": -4.74373722076416, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.5, "epoch": 0.8673985192749554, "gen_logits_max": 6.285998344421387, "gen_logits_mean": -15.159713745117188, "gen_logits_min": -27.424331665039062, "gen_logits_std": 2.81815767288208, "gen_loss": 0.3641907274723053, "grad_norm": 0.39662321618639845, "learning_rate": 2.556505263157895e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9945908933877945, "mean_gen_accuracy": 0.8663046956062317, "mean_token_accuracy": 0.8979502618312836, "num_tokens": 66706894.0, "sample_num_tokens": 8033.5, "step": 4247, "total_num_tokens": 66739028.0, "z_loss": 0.0007554692565463483 }, { "copy_logits_max": -3.2225189208984375, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.75, "epoch": 0.8676027572121522, "gen_logits_max": 5.298071384429932, "gen_logits_mean": -14.495922088623047, "gen_logits_min": -27.111804962158203, "gen_logits_std": 2.9110565185546875, "gen_loss": 0.27577221393585205, "grad_norm": 0.3930586537816613, "learning_rate": 2.5563789473684213e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9950025975704193, "mean_gen_accuracy": 0.8744056224822998, "mean_token_accuracy": 0.9026409089565277, "num_tokens": 66941923.0, "sample_num_tokens": 8580.75, "step": 4248, "total_num_tokens": 66976246.0, "z_loss": 0.0006111292750574648 }, { "copy_logits_max": -1.9001827239990234, "copy_logits_min": -687500032.0, "copy_num_tokens": 664.3125, "epoch": 0.867806995149349, "gen_logits_max": 4.022171497344971, "gen_logits_mean": -15.153726577758789, "gen_logits_min": -27.70186996459961, "gen_logits_std": 2.9117157459259033, "gen_loss": 0.25257524847984314, "grad_norm": 0.42134635899043243, "learning_rate": 2.5562526315789474e-05, "loss": 0.3107, "mean_copy_accuracy": 0.9944713115692139, "mean_gen_accuracy": 0.8624932169914246, "mean_token_accuracy": 0.8976270854473114, "num_tokens": 67208157.0, "sample_num_tokens": 9138.75, "step": 4249, "total_num_tokens": 67244712.0, "z_loss": 0.0005584637983702123 }, { "copy_logits_max": -3.8257999420166016, "copy_logits_min": -750000064.0, "copy_num_tokens": 294.8125, "epoch": 0.8680112330865458, "gen_logits_max": 5.272335052490234, "gen_logits_mean": -15.417118072509766, "gen_logits_min": -27.06011962890625, "gen_logits_std": 2.8339738845825195, "gen_loss": 0.32734090089797974, "grad_norm": 0.37176403469418856, "learning_rate": 2.5561263157894738e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9952121078968048, "mean_gen_accuracy": 0.867523193359375, "mean_token_accuracy": 0.9002572596073151, "num_tokens": 67485167.0, "sample_num_tokens": 7266.75, "step": 4250, "total_num_tokens": 67514234.0, "z_loss": 0.0006450712098740041 }, { "copy_logits_max": -0.7743728756904602, "copy_logits_min": -687500032.0, "copy_num_tokens": 881.25, "epoch": 0.8682154710237426, "gen_logits_max": 5.246755599975586, "gen_logits_mean": -13.970499038696289, "gen_logits_min": -26.631683349609375, "gen_logits_std": 2.916184902191162, "gen_loss": 0.2725217640399933, "grad_norm": 0.39248131735832675, "learning_rate": 2.556e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9964737147092819, "mean_gen_accuracy": 0.8669076859951019, "mean_token_accuracy": 0.9036246985197067, "num_tokens": 67759402.0, "sample_num_tokens": 10782.5, "step": 4251, "total_num_tokens": 67802532.0, "z_loss": 0.0006833677762188017 }, { "copy_logits_max": -0.8028988838195801, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.875, "epoch": 0.8684197089609395, "gen_logits_max": 4.400760650634766, "gen_logits_mean": -14.991806030273438, "gen_logits_min": -27.34709930419922, "gen_logits_std": 2.8929638862609863, "gen_loss": 0.2713853120803833, "grad_norm": 0.383546735945967, "learning_rate": 2.5558736842105264e-05, "loss": 0.3056, "mean_copy_accuracy": 0.995298758149147, "mean_gen_accuracy": 0.8649319112300873, "mean_token_accuracy": 0.8986070901155472, "num_tokens": 68048829.0, "sample_num_tokens": 8156.25, "step": 4252, "total_num_tokens": 68081454.0, "z_loss": 0.0007007803069427609 }, { "copy_logits_max": -3.9805362224578857, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.125, "epoch": 0.8686239468981364, "gen_logits_max": 5.109748840332031, "gen_logits_mean": -14.868621826171875, "gen_logits_min": -26.58464813232422, "gen_logits_std": 2.8788042068481445, "gen_loss": 0.29125452041625977, "grad_norm": 0.40371964480919237, "learning_rate": 2.5557473684210524e-05, "loss": 0.3114, "mean_copy_accuracy": 0.9961380511522293, "mean_gen_accuracy": 0.8631273359060287, "mean_token_accuracy": 0.8961775153875351, "num_tokens": 68313254.0, "sample_num_tokens": 7737.0, "step": 4253, "total_num_tokens": 68344202.0, "z_loss": 0.0006636456237174571 }, { "copy_logits_max": -3.0374014377593994, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.8125, "epoch": 0.8688281848353332, "gen_logits_max": 4.336978435516357, "gen_logits_mean": -15.82506275177002, "gen_logits_min": -27.610830307006836, "gen_logits_std": 2.838521957397461, "gen_loss": 0.3195202648639679, "grad_norm": 0.4055184171032619, "learning_rate": 2.5556210526315792e-05, "loss": 0.3233, "mean_copy_accuracy": 0.9952331930398941, "mean_gen_accuracy": 0.8587579429149628, "mean_token_accuracy": 0.8925407081842422, "num_tokens": 68581059.0, "sample_num_tokens": 8275.75, "step": 4254, "total_num_tokens": 68614162.0, "z_loss": 0.0006473950925283134 }, { "copy_logits_max": -3.0401175022125244, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.0, "epoch": 0.86903242277253, "gen_logits_max": 6.02033805847168, "gen_logits_mean": -12.951897621154785, "gen_logits_min": -25.129405975341797, "gen_logits_std": 2.899714946746826, "gen_loss": 0.3026887774467468, "grad_norm": 0.3833577438627259, "learning_rate": 2.5554947368421053e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9952974766492844, "mean_gen_accuracy": 0.8774882256984711, "mean_token_accuracy": 0.9077527523040771, "num_tokens": 68845031.0, "sample_num_tokens": 7963.25, "step": 4255, "total_num_tokens": 68876884.0, "z_loss": 0.0006551743717864156 }, { "copy_logits_max": -2.9251365661621094, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.75, "epoch": 0.8692366607097268, "gen_logits_max": 3.829247236251831, "gen_logits_mean": -15.88267707824707, "gen_logits_min": -27.98195457458496, "gen_logits_std": 2.8651418685913086, "gen_loss": 0.3121298551559448, "grad_norm": 0.37652864573705797, "learning_rate": 2.5553684210526317e-05, "loss": 0.3083, "mean_copy_accuracy": 0.9949917495250702, "mean_gen_accuracy": 0.8643710911273956, "mean_token_accuracy": 0.897504135966301, "num_tokens": 69123969.0, "sample_num_tokens": 8635.25, "step": 4256, "total_num_tokens": 69158510.0, "z_loss": 0.0006289087468758225 }, { "copy_logits_max": -1.9638394117355347, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.625, "epoch": 0.8694408986469236, "gen_logits_max": 5.240758895874023, "gen_logits_mean": -14.1907958984375, "gen_logits_min": -26.310138702392578, "gen_logits_std": 2.8777689933776855, "gen_loss": 0.32073479890823364, "grad_norm": 0.41848182701596337, "learning_rate": 2.555242105263158e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9955014735460281, "mean_gen_accuracy": 0.8657317608594894, "mean_token_accuracy": 0.8992997854948044, "num_tokens": 69413567.0, "sample_num_tokens": 8194.75, "step": 4257, "total_num_tokens": 69446346.0, "z_loss": 0.0006777287926524878 }, { "copy_logits_max": -3.278024673461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.8125, "epoch": 0.8696451365841205, "gen_logits_max": 3.784006118774414, "gen_logits_mean": -15.790617942810059, "gen_logits_min": -27.91928482055664, "gen_logits_std": 2.867260456085205, "gen_loss": 0.3432680368423462, "grad_norm": 0.39994077895467356, "learning_rate": 2.5551157894736843e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9968904703855515, "mean_gen_accuracy": 0.8655025810003281, "mean_token_accuracy": 0.9008072316646576, "num_tokens": 69708176.0, "sample_num_tokens": 9722.0, "step": 4258, "total_num_tokens": 69747064.0, "z_loss": 0.0005772372824139893 }, { "copy_logits_max": -1.6043081283569336, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.5625, "epoch": 0.8698493745213174, "gen_logits_max": 4.250311851501465, "gen_logits_mean": -13.856134414672852, "gen_logits_min": -25.769620895385742, "gen_logits_std": 2.870227098464966, "gen_loss": 0.32419851422309875, "grad_norm": 0.5727574390767249, "learning_rate": 2.5549894736842104e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9938709735870361, "mean_gen_accuracy": 0.8611784726381302, "mean_token_accuracy": 0.8952018767595291, "num_tokens": 69962408.0, "sample_num_tokens": 8645.0, "step": 4259, "total_num_tokens": 69996988.0, "z_loss": 0.0005773601587861776 }, { "copy_logits_max": -2.5930309295654297, "copy_logits_min": -687500032.0, "copy_num_tokens": 408.4375, "epoch": 0.8700536124585142, "gen_logits_max": 4.957684516906738, "gen_logits_mean": -14.699651718139648, "gen_logits_min": -26.66237449645996, "gen_logits_std": 2.834987163543701, "gen_loss": 0.3404436707496643, "grad_norm": 0.4397555697802971, "learning_rate": 2.5548631578947368e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9955903440713882, "mean_gen_accuracy": 0.8620728999376297, "mean_token_accuracy": 0.8923467099666595, "num_tokens": 70216052.0, "sample_num_tokens": 7937.5, "step": 4260, "total_num_tokens": 70247802.0, "z_loss": 0.0008062297129072249 }, { "copy_logits_max": 2.2835285663604736, "copy_logits_min": -687500032.0, "copy_num_tokens": 646.875, "epoch": 0.870257850395711, "gen_logits_max": 3.916067600250244, "gen_logits_mean": -14.85522174835205, "gen_logits_min": -26.87425994873047, "gen_logits_std": 2.871283531188965, "gen_loss": 0.32023507356643677, "grad_norm": 0.4392734028881421, "learning_rate": 2.5547368421052632e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9957034885883331, "mean_gen_accuracy": 0.8597886264324188, "mean_token_accuracy": 0.8959339410066605, "num_tokens": 70510393.0, "sample_num_tokens": 8967.75, "step": 4261, "total_num_tokens": 70546264.0, "z_loss": 0.0011380075011402369 }, { "copy_logits_max": 0.019352376461029053, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.8125, "epoch": 0.8704620883329078, "gen_logits_max": 5.013941764831543, "gen_logits_mean": -13.478019714355469, "gen_logits_min": -25.666454315185547, "gen_logits_std": 2.8723416328430176, "gen_loss": 0.2936297655105591, "grad_norm": 0.4252738326375723, "learning_rate": 2.5546105263157896e-05, "loss": 0.3277, "mean_copy_accuracy": 0.9947157502174377, "mean_gen_accuracy": 0.8584533929824829, "mean_token_accuracy": 0.8915348649024963, "num_tokens": 70769516.0, "sample_num_tokens": 8682.0, "step": 4262, "total_num_tokens": 70804244.0, "z_loss": 0.0010822622571140528 }, { "copy_logits_max": -4.3659772872924805, "copy_logits_min": -687500032.0, "copy_num_tokens": 599.875, "epoch": 0.8706663262701047, "gen_logits_max": 4.245207786560059, "gen_logits_mean": -15.702866554260254, "gen_logits_min": -27.56165313720703, "gen_logits_std": 2.815821647644043, "gen_loss": 0.27828526496887207, "grad_norm": 0.48405700505929994, "learning_rate": 2.554484210526316e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9929549247026443, "mean_gen_accuracy": 0.8797696828842163, "mean_token_accuracy": 0.911530002951622, "num_tokens": 71048560.0, "sample_num_tokens": 9448.0, "step": 4263, "total_num_tokens": 71086352.0, "z_loss": 0.0009031769004650414 }, { "copy_logits_max": -1.7051966190338135, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.6875, "epoch": 0.8708705642073015, "gen_logits_max": 4.384102821350098, "gen_logits_mean": -15.028729438781738, "gen_logits_min": -26.95197296142578, "gen_logits_std": 2.854306697845459, "gen_loss": 0.30924755334854126, "grad_norm": 0.39148186889155334, "learning_rate": 2.5543578947368422e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9951681345701218, "mean_gen_accuracy": 0.8683914989233017, "mean_token_accuracy": 0.9029012024402618, "num_tokens": 71338236.0, "sample_num_tokens": 7603.5, "step": 4264, "total_num_tokens": 71368650.0, "z_loss": 0.0008540985290892422 }, { "copy_logits_max": -2.042494773864746, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.8125, "epoch": 0.8710748021444984, "gen_logits_max": 4.62811279296875, "gen_logits_mean": -15.682032585144043, "gen_logits_min": -27.91132354736328, "gen_logits_std": 2.8534271717071533, "gen_loss": 0.29674938321113586, "grad_norm": 0.4066435164978175, "learning_rate": 2.5542315789473686e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9952527284622192, "mean_gen_accuracy": 0.8681035488843918, "mean_token_accuracy": 0.8962274342775345, "num_tokens": 71619936.0, "sample_num_tokens": 8820.0, "step": 4265, "total_num_tokens": 71655216.0, "z_loss": 0.0007358167204074562 }, { "copy_logits_max": -3.7856698036193848, "copy_logits_min": -750000064.0, "copy_num_tokens": 471.75, "epoch": 0.8712790400816952, "gen_logits_max": 5.229465007781982, "gen_logits_mean": -14.607112884521484, "gen_logits_min": -26.492700576782227, "gen_logits_std": 2.814582586288452, "gen_loss": 0.28718462586402893, "grad_norm": 0.43251259925817215, "learning_rate": 2.5541052631578947e-05, "loss": 0.3146, "mean_copy_accuracy": 0.996233269572258, "mean_gen_accuracy": 0.8605342507362366, "mean_token_accuracy": 0.8954657763242722, "num_tokens": 71901565.0, "sample_num_tokens": 8668.75, "step": 4266, "total_num_tokens": 71936240.0, "z_loss": 0.0006635803729295731 }, { "copy_logits_max": -2.936616897583008, "copy_logits_min": -750000064.0, "copy_num_tokens": 397.5, "epoch": 0.871483278018892, "gen_logits_max": 4.593961715698242, "gen_logits_mean": -14.722185134887695, "gen_logits_min": -26.235637664794922, "gen_logits_std": 2.830583095550537, "gen_loss": 0.2861511707305908, "grad_norm": 0.3847531811381865, "learning_rate": 2.553978947368421e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9941474050283432, "mean_gen_accuracy": 0.8780397921800613, "mean_token_accuracy": 0.9041756391525269, "num_tokens": 72173684.0, "sample_num_tokens": 7975.0, "step": 4267, "total_num_tokens": 72205584.0, "z_loss": 0.0005588503554463387 }, { "copy_logits_max": -4.3329057693481445, "copy_logits_min": -750000000.0, "copy_num_tokens": 669.75, "epoch": 0.8716875159560888, "gen_logits_max": 3.734419345855713, "gen_logits_mean": -14.461921691894531, "gen_logits_min": -26.51959991455078, "gen_logits_std": 2.8638696670532227, "gen_loss": 0.26619061827659607, "grad_norm": 0.4478763647594685, "learning_rate": 2.5538526315789472e-05, "loss": 0.298, "mean_copy_accuracy": 0.9950228631496429, "mean_gen_accuracy": 0.8660799711942673, "mean_token_accuracy": 0.9059538841247559, "num_tokens": 72463805.0, "sample_num_tokens": 8889.75, "step": 4268, "total_num_tokens": 72499364.0, "z_loss": 0.00054254534188658 }, { "copy_logits_max": -2.3972055912017822, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 0.8718917538932857, "gen_logits_max": 4.494253158569336, "gen_logits_mean": -14.500692367553711, "gen_logits_min": -26.347089767456055, "gen_logits_std": 2.8419647216796875, "gen_loss": 0.3154945373535156, "grad_norm": 0.39169114471972216, "learning_rate": 2.5537263157894736e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9947959929704666, "mean_gen_accuracy": 0.875178873538971, "mean_token_accuracy": 0.903934121131897, "num_tokens": 72725118.0, "sample_num_tokens": 7565.0, "step": 4269, "total_num_tokens": 72755378.0, "z_loss": 0.0006630137213505805 }, { "copy_logits_max": -1.4496538639068604, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.875, "epoch": 0.8720959918304825, "gen_logits_max": 4.937122344970703, "gen_logits_mean": -14.247025489807129, "gen_logits_min": -26.302001953125, "gen_logits_std": 2.8802924156188965, "gen_loss": 0.28306806087493896, "grad_norm": 0.4195937742441199, "learning_rate": 2.5536e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9956833869218826, "mean_gen_accuracy": 0.8625018298625946, "mean_token_accuracy": 0.8973364979028702, "num_tokens": 72987889.0, "sample_num_tokens": 9623.75, "step": 4270, "total_num_tokens": 73026384.0, "z_loss": 0.0006718498189002275 }, { "copy_logits_max": -1.9830079078674316, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.6875, "epoch": 0.8723002297676793, "gen_logits_max": 5.686676979064941, "gen_logits_mean": -13.789947509765625, "gen_logits_min": -25.569110870361328, "gen_logits_std": 2.8319621086120605, "gen_loss": 0.3506985008716583, "grad_norm": 0.4114549153901398, "learning_rate": 2.5534736842105265e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9940081685781479, "mean_gen_accuracy": 0.8704333603382111, "mean_token_accuracy": 0.8969894200563431, "num_tokens": 73259246.0, "sample_num_tokens": 7298.5, "step": 4271, "total_num_tokens": 73288440.0, "z_loss": 0.0007387925870716572 }, { "copy_logits_max": -4.401716232299805, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.5625, "epoch": 0.8725044677048762, "gen_logits_max": 4.436250686645508, "gen_logits_mean": -15.557655334472656, "gen_logits_min": -26.991106033325195, "gen_logits_std": 2.8367605209350586, "gen_loss": 0.31816643476486206, "grad_norm": 0.4260053743837564, "learning_rate": 2.5533473684210526e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9942701309919357, "mean_gen_accuracy": 0.8662025183439255, "mean_token_accuracy": 0.8957248479127884, "num_tokens": 73535789.0, "sample_num_tokens": 8595.75, "step": 4272, "total_num_tokens": 73570172.0, "z_loss": 0.0006664575776085258 }, { "copy_logits_max": -1.4888145923614502, "copy_logits_min": -625000064.0, "copy_num_tokens": 602.6875, "epoch": 0.872708705642073, "gen_logits_max": 4.498601913452148, "gen_logits_mean": -14.6799898147583, "gen_logits_min": -26.73328399658203, "gen_logits_std": 2.8954460620880127, "gen_loss": 0.27290797233581543, "grad_norm": 0.4180816673333229, "learning_rate": 2.553221052631579e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9950575977563858, "mean_gen_accuracy": 0.8682712614536285, "mean_token_accuracy": 0.9003933668136597, "num_tokens": 73795776.0, "sample_num_tokens": 8842.0, "step": 4273, "total_num_tokens": 73831144.0, "z_loss": 0.0005837921053171158 }, { "copy_logits_max": -3.0404727458953857, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.25, "epoch": 0.8729129435792699, "gen_logits_max": 4.663071632385254, "gen_logits_mean": -15.751382827758789, "gen_logits_min": -27.805883407592773, "gen_logits_std": 2.8535492420196533, "gen_loss": 0.2908504009246826, "grad_norm": 0.3670653832084845, "learning_rate": 2.5530947368421055e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9958397895097733, "mean_gen_accuracy": 0.8632468283176422, "mean_token_accuracy": 0.8930182754993439, "num_tokens": 74077416.0, "sample_num_tokens": 8903.5, "step": 4274, "total_num_tokens": 74113030.0, "z_loss": 0.0006825965829193592 }, { "copy_logits_max": -1.4165599346160889, "copy_logits_min": -687500032.0, "copy_num_tokens": 367.9375, "epoch": 0.8731171815164667, "gen_logits_max": 5.989694595336914, "gen_logits_mean": -13.259939193725586, "gen_logits_min": -25.17945098876953, "gen_logits_std": 2.8304548263549805, "gen_loss": 0.30025264620780945, "grad_norm": 0.44263621000311865, "learning_rate": 2.5529684210526316e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9960417151451111, "mean_gen_accuracy": 0.865423247218132, "mean_token_accuracy": 0.8955211788415909, "num_tokens": 74356712.0, "sample_num_tokens": 8228.0, "step": 4275, "total_num_tokens": 74389624.0, "z_loss": 0.0006951391696929932 }, { "copy_logits_max": -2.214902639389038, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.0625, "epoch": 0.8733214194536635, "gen_logits_max": 5.1682329177856445, "gen_logits_mean": -13.853126525878906, "gen_logits_min": -25.812992095947266, "gen_logits_std": 2.8598480224609375, "gen_loss": 0.28117501735687256, "grad_norm": 0.4108108924472788, "learning_rate": 2.552842105263158e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9956171363592148, "mean_gen_accuracy": 0.8587357103824615, "mean_token_accuracy": 0.894531637430191, "num_tokens": 74620690.0, "sample_num_tokens": 7759.5, "step": 4276, "total_num_tokens": 74651728.0, "z_loss": 0.0006319028907455504 }, { "copy_logits_max": 0.10877633094787598, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.875, "epoch": 0.8735256573908603, "gen_logits_max": 4.6467719078063965, "gen_logits_mean": -14.34821891784668, "gen_logits_min": -26.565689086914062, "gen_logits_std": 2.8576717376708984, "gen_loss": 0.29456061124801636, "grad_norm": 0.409597454800513, "learning_rate": 2.552715789473684e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9944793432950974, "mean_gen_accuracy": 0.8762810081243515, "mean_token_accuracy": 0.9079498052597046, "num_tokens": 74871716.0, "sample_num_tokens": 7835.0, "step": 4277, "total_num_tokens": 74903056.0, "z_loss": 0.0006851401994936168 }, { "copy_logits_max": 0.30301740765571594, "copy_logits_min": -687500032.0, "copy_num_tokens": 332.5625, "epoch": 0.8737298953280572, "gen_logits_max": 6.1083221435546875, "gen_logits_mean": -13.60930061340332, "gen_logits_min": -25.701492309570312, "gen_logits_std": 2.843975305557251, "gen_loss": 0.32638031244277954, "grad_norm": 0.42266214473616265, "learning_rate": 2.552589473684211e-05, "loss": 0.3207, "mean_copy_accuracy": 0.992477297782898, "mean_gen_accuracy": 0.8667532205581665, "mean_token_accuracy": 0.8940056711435318, "num_tokens": 75135001.0, "sample_num_tokens": 7752.75, "step": 4278, "total_num_tokens": 75166012.0, "z_loss": 0.0006265277042984962 }, { "copy_logits_max": 1.0521492958068848, "copy_logits_min": -750000000.0, "copy_num_tokens": 748.375, "epoch": 0.873934133265254, "gen_logits_max": 4.989760398864746, "gen_logits_mean": -13.86192798614502, "gen_logits_min": -26.817073822021484, "gen_logits_std": 2.865640640258789, "gen_loss": 0.29137754440307617, "grad_norm": 0.386265103886347, "learning_rate": 2.552463157894737e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9961012452840805, "mean_gen_accuracy": 0.8625137060880661, "mean_token_accuracy": 0.8961816281080246, "num_tokens": 75404141.0, "sample_num_tokens": 10047.75, "step": 4279, "total_num_tokens": 75444332.0, "z_loss": 0.0007224393193610013 }, { "copy_logits_max": 0.5748104453086853, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.9375, "epoch": 0.8741383712024509, "gen_logits_max": 5.064436435699463, "gen_logits_mean": -14.327841758728027, "gen_logits_min": -26.412742614746094, "gen_logits_std": 2.8616929054260254, "gen_loss": 0.3175138533115387, "grad_norm": 0.42485621286454667, "learning_rate": 2.5523368421052634e-05, "loss": 0.3309, "mean_copy_accuracy": 0.9950005561113358, "mean_gen_accuracy": 0.8582648634910583, "mean_token_accuracy": 0.8909687250852585, "num_tokens": 75678582.0, "sample_num_tokens": 7978.5, "step": 4280, "total_num_tokens": 75710496.0, "z_loss": 0.0007751364028081298 }, { "copy_logits_max": -1.1881362199783325, "copy_logits_min": -750000064.0, "copy_num_tokens": 563.4375, "epoch": 0.8743426091396477, "gen_logits_max": 4.5987019538879395, "gen_logits_mean": -14.190165519714355, "gen_logits_min": -26.531505584716797, "gen_logits_std": 2.8313703536987305, "gen_loss": 0.27123725414276123, "grad_norm": 0.3708438048776261, "learning_rate": 2.5522105263157895e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9951656758785248, "mean_gen_accuracy": 0.8663365244865417, "mean_token_accuracy": 0.8991549462080002, "num_tokens": 75967908.0, "sample_num_tokens": 9055.0, "step": 4281, "total_num_tokens": 76004128.0, "z_loss": 0.0006770931067876518 }, { "copy_logits_max": -5.510397911071777, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.375, "epoch": 0.8745468470768445, "gen_logits_max": 4.702206134796143, "gen_logits_mean": -15.149284362792969, "gen_logits_min": -26.913753509521484, "gen_logits_std": 2.8472957611083984, "gen_loss": 0.3042992651462555, "grad_norm": 0.4501373203102115, "learning_rate": 2.552084210526316e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9948690086603165, "mean_gen_accuracy": 0.8601644337177277, "mean_token_accuracy": 0.8940368145704269, "num_tokens": 76234472.0, "sample_num_tokens": 8329.5, "step": 4282, "total_num_tokens": 76267790.0, "z_loss": 0.0006366877350956202 }, { "copy_logits_max": -2.627275228500366, "copy_logits_min": -750000000.0, "copy_num_tokens": 714.6875, "epoch": 0.8747510850140413, "gen_logits_max": 4.094651699066162, "gen_logits_mean": -15.374954223632812, "gen_logits_min": -27.41136360168457, "gen_logits_std": 2.878896713256836, "gen_loss": 0.27255749702453613, "grad_norm": 0.40735644153517636, "learning_rate": 2.551957894736842e-05, "loss": 0.2946, "mean_copy_accuracy": 0.9952804148197174, "mean_gen_accuracy": 0.8716709166765213, "mean_token_accuracy": 0.9032887518405914, "num_tokens": 76505220.0, "sample_num_tokens": 10486.5, "step": 4283, "total_num_tokens": 76547166.0, "z_loss": 0.000630886061117053 }, { "copy_logits_max": -4.169794082641602, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.3125, "epoch": 0.8749553229512382, "gen_logits_max": 3.784270763397217, "gen_logits_mean": -15.557684898376465, "gen_logits_min": -27.84765625, "gen_logits_std": 2.859999895095825, "gen_loss": 0.30914008617401123, "grad_norm": 0.3952706533703964, "learning_rate": 2.5518315789473684e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9956713765859604, "mean_gen_accuracy": 0.8628715574741364, "mean_token_accuracy": 0.8921855539083481, "num_tokens": 76772830.0, "sample_num_tokens": 8313.5, "step": 4284, "total_num_tokens": 76806084.0, "z_loss": 0.0006154085276648402 }, { "copy_logits_max": -4.283328056335449, "copy_logits_min": -687500032.0, "copy_num_tokens": 353.3125, "epoch": 0.875159560888435, "gen_logits_max": 4.7382659912109375, "gen_logits_mean": -15.004350662231445, "gen_logits_min": -27.252349853515625, "gen_logits_std": 2.8539562225341797, "gen_loss": 0.3179665207862854, "grad_norm": 0.39691179234561835, "learning_rate": 2.5517052631578945e-05, "loss": 0.3036, "mean_copy_accuracy": 0.995776817202568, "mean_gen_accuracy": 0.8654259592294693, "mean_token_accuracy": 0.8986896425485611, "num_tokens": 77054232.0, "sample_num_tokens": 7475.5, "step": 4285, "total_num_tokens": 77084134.0, "z_loss": 0.0006262515089474618 }, { "copy_logits_max": -2.6563339233398438, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.0, "epoch": 0.8753637988256319, "gen_logits_max": 3.63427734375, "gen_logits_mean": -15.81069564819336, "gen_logits_min": -27.614967346191406, "gen_logits_std": 2.855466604232788, "gen_loss": 0.28889402747154236, "grad_norm": 0.3710305496914764, "learning_rate": 2.5515789473684213e-05, "loss": 0.3044, "mean_copy_accuracy": 0.99534472823143, "mean_gen_accuracy": 0.8636448532342911, "mean_token_accuracy": 0.8985254317522049, "num_tokens": 77296576.0, "sample_num_tokens": 8315.5, "step": 4286, "total_num_tokens": 77329838.0, "z_loss": 0.0005822839448228478 }, { "copy_logits_max": -3.4121532440185547, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.3125, "epoch": 0.8755680367628287, "gen_logits_max": 4.795764923095703, "gen_logits_mean": -14.91933536529541, "gen_logits_min": -27.10933494567871, "gen_logits_std": 2.845487117767334, "gen_loss": 0.28026461601257324, "grad_norm": 0.36621895071117605, "learning_rate": 2.5514526315789477e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9954968839883804, "mean_gen_accuracy": 0.867257371544838, "mean_token_accuracy": 0.8994353711605072, "num_tokens": 77573868.0, "sample_num_tokens": 8654.5, "step": 4287, "total_num_tokens": 77608486.0, "z_loss": 0.000617670826613903 }, { "copy_logits_max": -4.481868267059326, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.125, "epoch": 0.8757722747000255, "gen_logits_max": 4.551885604858398, "gen_logits_mean": -14.20229721069336, "gen_logits_min": -26.178844451904297, "gen_logits_std": 2.8215599060058594, "gen_loss": 0.2751171588897705, "grad_norm": 0.4042517175130989, "learning_rate": 2.5513263157894738e-05, "loss": 0.289, "mean_copy_accuracy": 0.995573878288269, "mean_gen_accuracy": 0.867972195148468, "mean_token_accuracy": 0.9036250412464142, "num_tokens": 77864679.0, "sample_num_tokens": 8271.25, "step": 4288, "total_num_tokens": 77897764.0, "z_loss": 0.0005883401026949286 }, { "copy_logits_max": -3.9432926177978516, "copy_logits_min": -750000000.0, "copy_num_tokens": 725.6875, "epoch": 0.8759765126372223, "gen_logits_max": 3.895122528076172, "gen_logits_mean": -15.093551635742188, "gen_logits_min": -27.4731388092041, "gen_logits_std": 2.9026336669921875, "gen_loss": 0.25106921792030334, "grad_norm": 0.4197059638835831, "learning_rate": 2.5512000000000002e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9954165518283844, "mean_gen_accuracy": 0.8670570403337479, "mean_token_accuracy": 0.8974801450967789, "num_tokens": 78116595.0, "sample_num_tokens": 10800.25, "step": 4289, "total_num_tokens": 78159796.0, "z_loss": 0.0005096154636703432 }, { "copy_logits_max": -3.6089797019958496, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.8125, "epoch": 0.8761807505744192, "gen_logits_max": 3.8258705139160156, "gen_logits_mean": -14.95445442199707, "gen_logits_min": -26.872095108032227, "gen_logits_std": 2.8503944873809814, "gen_loss": 0.31466206908226013, "grad_norm": 0.3978293467151055, "learning_rate": 2.5510736842105263e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9948879480361938, "mean_gen_accuracy": 0.8620134145021439, "mean_token_accuracy": 0.8968369513750076, "num_tokens": 78409325.0, "sample_num_tokens": 9354.25, "step": 4290, "total_num_tokens": 78446742.0, "z_loss": 0.0006219755159690976 }, { "copy_logits_max": -4.124099254608154, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.4375, "epoch": 0.8763849885116161, "gen_logits_max": 4.954775810241699, "gen_logits_mean": -15.180846214294434, "gen_logits_min": -27.20525360107422, "gen_logits_std": 2.827054023742676, "gen_loss": 0.3467271029949188, "grad_norm": 0.4139101091648364, "learning_rate": 2.5509473684210528e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9944424480199814, "mean_gen_accuracy": 0.8633937835693359, "mean_token_accuracy": 0.8920432925224304, "num_tokens": 78683016.0, "sample_num_tokens": 7144.0, "step": 4291, "total_num_tokens": 78711592.0, "z_loss": 0.0006903181201778352 }, { "copy_logits_max": -5.674530029296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.5625, "epoch": 0.8765892264488129, "gen_logits_max": 4.311970233917236, "gen_logits_mean": -15.610157012939453, "gen_logits_min": -27.114765167236328, "gen_logits_std": 2.815314769744873, "gen_loss": 0.3104999363422394, "grad_norm": 0.42219698846430515, "learning_rate": 2.550821052631579e-05, "loss": 0.3076, "mean_copy_accuracy": 0.9942229390144348, "mean_gen_accuracy": 0.868282824754715, "mean_token_accuracy": 0.8962560743093491, "num_tokens": 78926266.0, "sample_num_tokens": 8193.0, "step": 4292, "total_num_tokens": 78959038.0, "z_loss": 0.0006030682125128806 }, { "copy_logits_max": -5.719244003295898, "copy_logits_min": -750000064.0, "copy_num_tokens": 492.9375, "epoch": 0.8767934643860097, "gen_logits_max": 4.702627658843994, "gen_logits_mean": -14.402435302734375, "gen_logits_min": -26.147178649902344, "gen_logits_std": 2.8534884452819824, "gen_loss": 0.29550468921661377, "grad_norm": 0.4142107319364844, "learning_rate": 2.5506947368421053e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9952224493026733, "mean_gen_accuracy": 0.8666133731603622, "mean_token_accuracy": 0.8990635126829147, "num_tokens": 79202257.0, "sample_num_tokens": 8155.75, "step": 4293, "total_num_tokens": 79234880.0, "z_loss": 0.0005599507130682468 }, { "copy_logits_max": -4.170197010040283, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.875, "epoch": 0.8769977023232065, "gen_logits_max": 4.7135009765625, "gen_logits_mean": -13.588238716125488, "gen_logits_min": -26.18402099609375, "gen_logits_std": 2.872772455215454, "gen_loss": 0.28727957606315613, "grad_norm": 0.41323812707437596, "learning_rate": 2.5505684210526317e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9956849366426468, "mean_gen_accuracy": 0.872763603925705, "mean_token_accuracy": 0.9039361476898193, "num_tokens": 79452905.0, "sample_num_tokens": 8566.25, "step": 4294, "total_num_tokens": 79487170.0, "z_loss": 0.00058782659471035 }, { "copy_logits_max": -2.926860809326172, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.9375, "epoch": 0.8772019402604033, "gen_logits_max": 4.727534770965576, "gen_logits_mean": -13.197761535644531, "gen_logits_min": -25.185758590698242, "gen_logits_std": 2.8364243507385254, "gen_loss": 0.3031724691390991, "grad_norm": 0.41432972707344523, "learning_rate": 2.550442105263158e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9957771301269531, "mean_gen_accuracy": 0.8690614551305771, "mean_token_accuracy": 0.9019380062818527, "num_tokens": 79721816.0, "sample_num_tokens": 7760.5, "step": 4295, "total_num_tokens": 79752858.0, "z_loss": 0.0006213996093720198 }, { "copy_logits_max": -4.830770492553711, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.1875, "epoch": 0.8774061781976003, "gen_logits_max": 3.552929162979126, "gen_logits_mean": -16.01559066772461, "gen_logits_min": -28.004192352294922, "gen_logits_std": 2.853822946548462, "gen_loss": 0.2728899121284485, "grad_norm": 0.3971757775761653, "learning_rate": 2.5503157894736842e-05, "loss": 0.2924, "mean_copy_accuracy": 0.9951054453849792, "mean_gen_accuracy": 0.8682517856359482, "mean_token_accuracy": 0.9005234390497208, "num_tokens": 80001454.0, "sample_num_tokens": 8953.0, "step": 4296, "total_num_tokens": 80037266.0, "z_loss": 0.0005597923882305622 }, { "copy_logits_max": -0.6502537727355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.25, "epoch": 0.8776104161347971, "gen_logits_max": 3.691478729248047, "gen_logits_mean": -15.164525985717773, "gen_logits_min": -27.51347541809082, "gen_logits_std": 2.885219097137451, "gen_loss": 0.2729978561401367, "grad_norm": 0.3952426203799632, "learning_rate": 2.5501894736842107e-05, "loss": 0.3221, "mean_copy_accuracy": 0.995366245508194, "mean_gen_accuracy": 0.8550252318382263, "mean_token_accuracy": 0.8919420540332794, "num_tokens": 80294122.0, "sample_num_tokens": 8999.5, "step": 4297, "total_num_tokens": 80330120.0, "z_loss": 0.0005986186442896724 }, { "copy_logits_max": -0.4245166778564453, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.9375, "epoch": 0.8778146540719939, "gen_logits_max": 6.142571926116943, "gen_logits_mean": -12.947205543518066, "gen_logits_min": -24.97560691833496, "gen_logits_std": 2.8112683296203613, "gen_loss": 0.29071688652038574, "grad_norm": 0.4230001840386242, "learning_rate": 2.5500631578947368e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9955637156963348, "mean_gen_accuracy": 0.8675220012664795, "mean_token_accuracy": 0.9000708758831024, "num_tokens": 80574202.0, "sample_num_tokens": 7666.5, "step": 4298, "total_num_tokens": 80604868.0, "z_loss": 0.0006595982122235 }, { "copy_logits_max": -1.4677106142044067, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.625, "epoch": 0.8780188920091907, "gen_logits_max": 5.212247371673584, "gen_logits_mean": -14.450304985046387, "gen_logits_min": -26.432689666748047, "gen_logits_std": 2.8437509536743164, "gen_loss": 0.28762680292129517, "grad_norm": 0.3873686202679029, "learning_rate": 2.5499368421052632e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9965257197618484, "mean_gen_accuracy": 0.8694862723350525, "mean_token_accuracy": 0.9009575992822647, "num_tokens": 80828160.0, "sample_num_tokens": 8414.0, "step": 4299, "total_num_tokens": 80861816.0, "z_loss": 0.0006793338689021766 }, { "copy_logits_max": -4.9159955978393555, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.3125, "epoch": 0.8782231299463875, "gen_logits_max": 4.994966506958008, "gen_logits_mean": -14.180583953857422, "gen_logits_min": -26.253969192504883, "gen_logits_std": 2.8460874557495117, "gen_loss": 0.2815767526626587, "grad_norm": 0.35073491317419353, "learning_rate": 2.5498105263157896e-05, "loss": 0.2874, "mean_copy_accuracy": 0.995759442448616, "mean_gen_accuracy": 0.8752814829349518, "mean_token_accuracy": 0.9035608023405075, "num_tokens": 81101624.0, "sample_num_tokens": 8321.0, "step": 4300, "total_num_tokens": 81134908.0, "z_loss": 0.0006915213889442384 }, { "copy_logits_max": -5.018477439880371, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.25, "epoch": 0.8784273678835843, "gen_logits_max": 4.381701469421387, "gen_logits_mean": -14.902628898620605, "gen_logits_min": -27.001190185546875, "gen_logits_std": 2.852539300918579, "gen_loss": 0.2480628490447998, "grad_norm": 0.4352683807022973, "learning_rate": 2.5496842105263157e-05, "loss": 0.2903, "mean_copy_accuracy": 0.994926244020462, "mean_gen_accuracy": 0.8741065859794617, "mean_token_accuracy": 0.9047780781984329, "num_tokens": 81390071.0, "sample_num_tokens": 10182.75, "step": 4301, "total_num_tokens": 81430802.0, "z_loss": 0.0006614961894229054 }, { "copy_logits_max": -0.886894702911377, "copy_logits_min": -750000064.0, "copy_num_tokens": 577.5625, "epoch": 0.8786316058207813, "gen_logits_max": 5.083949089050293, "gen_logits_mean": -13.954460144042969, "gen_logits_min": -26.21118927001953, "gen_logits_std": 2.838961124420166, "gen_loss": 0.2540745735168457, "grad_norm": 0.4119657050235981, "learning_rate": 2.549557894736842e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9949630796909332, "mean_gen_accuracy": 0.8777046650648117, "mean_token_accuracy": 0.9088670164346695, "num_tokens": 81695961.0, "sample_num_tokens": 9705.75, "step": 4302, "total_num_tokens": 81734784.0, "z_loss": 0.0005925819277763367 }, { "copy_logits_max": -2.993565559387207, "copy_logits_min": -750000000.0, "copy_num_tokens": 627.125, "epoch": 0.8788358437579781, "gen_logits_max": 4.622389793395996, "gen_logits_mean": -13.617189407348633, "gen_logits_min": -25.657438278198242, "gen_logits_std": 2.7840278148651123, "gen_loss": 0.2700865864753723, "grad_norm": 0.4081148343979204, "learning_rate": 2.5494315789473686e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9958246648311615, "mean_gen_accuracy": 0.8602053076028824, "mean_token_accuracy": 0.8952190428972244, "num_tokens": 81972236.0, "sample_num_tokens": 8422.0, "step": 4303, "total_num_tokens": 82005924.0, "z_loss": 0.000598891987465322 }, { "copy_logits_max": -1.5995795726776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.4375, "epoch": 0.8790400816951749, "gen_logits_max": 6.011857986450195, "gen_logits_mean": -12.881108283996582, "gen_logits_min": -24.692466735839844, "gen_logits_std": 2.792929172515869, "gen_loss": 0.2718973755836487, "grad_norm": 0.38928041256892865, "learning_rate": 2.549305263157895e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9954650551080704, "mean_gen_accuracy": 0.8717180043458939, "mean_token_accuracy": 0.9039360731840134, "num_tokens": 82244545.0, "sample_num_tokens": 8504.75, "step": 4304, "total_num_tokens": 82278564.0, "z_loss": 0.0005853003822267056 }, { "copy_logits_max": -2.4174861907958984, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.4375, "epoch": 0.8792443196323717, "gen_logits_max": 5.1626176834106445, "gen_logits_mean": -14.796661376953125, "gen_logits_min": -26.04607391357422, "gen_logits_std": 2.7829039096832275, "gen_loss": 0.33531031012535095, "grad_norm": 0.4032279568417936, "learning_rate": 2.549178947368421e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9947881102561951, "mean_gen_accuracy": 0.8712646514177322, "mean_token_accuracy": 0.8996567726135254, "num_tokens": 82518221.0, "sample_num_tokens": 9003.25, "step": 4305, "total_num_tokens": 82554234.0, "z_loss": 0.0006885076873004436 }, { "copy_logits_max": -5.357793807983398, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.5625, "epoch": 0.8794485575695685, "gen_logits_max": 4.935091018676758, "gen_logits_mean": -14.222637176513672, "gen_logits_min": -25.585159301757812, "gen_logits_std": 2.7492659091949463, "gen_loss": 0.2831820845603943, "grad_norm": 0.379103961526688, "learning_rate": 2.5490526315789475e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9955424666404724, "mean_gen_accuracy": 0.8675760179758072, "mean_token_accuracy": 0.9007745832204819, "num_tokens": 82807820.0, "sample_num_tokens": 8497.0, "step": 4306, "total_num_tokens": 82841808.0, "z_loss": 0.0006464675534516573 }, { "copy_logits_max": -3.76757550239563, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.0, "epoch": 0.8796527955067653, "gen_logits_max": 4.50258731842041, "gen_logits_mean": -13.80481243133545, "gen_logits_min": -26.066896438598633, "gen_logits_std": 2.7995619773864746, "gen_loss": 0.29555732011795044, "grad_norm": 0.3979335249234773, "learning_rate": 2.5489263157894736e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9950853884220123, "mean_gen_accuracy": 0.8707698732614517, "mean_token_accuracy": 0.9022985547780991, "num_tokens": 83076334.0, "sample_num_tokens": 8297.0, "step": 4307, "total_num_tokens": 83109522.0, "z_loss": 0.0005832197493873537 }, { "copy_logits_max": -3.650857448577881, "copy_logits_min": -687500096.0, "copy_num_tokens": 518.9375, "epoch": 0.8798570334439623, "gen_logits_max": 4.249142169952393, "gen_logits_mean": -14.15931224822998, "gen_logits_min": -25.74663734436035, "gen_logits_std": 2.7581188678741455, "gen_loss": 0.3373106122016907, "grad_norm": 0.37425249881850703, "learning_rate": 2.5488e-05, "loss": 0.304, "mean_copy_accuracy": 0.9946821331977844, "mean_gen_accuracy": 0.8654231131076813, "mean_token_accuracy": 0.8975687474012375, "num_tokens": 83348546.0, "sample_num_tokens": 8740.0, "step": 4308, "total_num_tokens": 83383506.0, "z_loss": 0.0006596508319489658 }, { "copy_logits_max": -4.483901023864746, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.1875, "epoch": 0.8800612713811591, "gen_logits_max": 4.590947151184082, "gen_logits_mean": -15.069270133972168, "gen_logits_min": -26.39044761657715, "gen_logits_std": 2.7932276725769043, "gen_loss": 0.33343496918678284, "grad_norm": 0.44599546133161677, "learning_rate": 2.548673684210526e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9960105270147324, "mean_gen_accuracy": 0.8697133958339691, "mean_token_accuracy": 0.8985895663499832, "num_tokens": 83636125.0, "sample_num_tokens": 8125.75, "step": 4309, "total_num_tokens": 83668628.0, "z_loss": 0.0006484955083578825 }, { "copy_logits_max": -4.331395149230957, "copy_logits_min": -687499968.0, "copy_num_tokens": 521.75, "epoch": 0.8802655093183559, "gen_logits_max": 5.1103515625, "gen_logits_mean": -14.575106620788574, "gen_logits_min": -26.140424728393555, "gen_logits_std": 2.7936975955963135, "gen_loss": 0.3017536401748657, "grad_norm": 0.38059828014362446, "learning_rate": 2.5485473684210526e-05, "loss": 0.2974, "mean_copy_accuracy": 0.9953401684761047, "mean_gen_accuracy": 0.8687243163585663, "mean_token_accuracy": 0.9008777439594269, "num_tokens": 83942732.0, "sample_num_tokens": 9627.0, "step": 4310, "total_num_tokens": 83981240.0, "z_loss": 0.0005653901025652885 }, { "copy_logits_max": -5.94769287109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.75, "epoch": 0.8804697472555527, "gen_logits_max": 4.690218448638916, "gen_logits_mean": -14.123732566833496, "gen_logits_min": -25.74936294555664, "gen_logits_std": 2.799981117248535, "gen_loss": 0.2984837293624878, "grad_norm": 0.4584818690826077, "learning_rate": 2.548421052631579e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9939840883016586, "mean_gen_accuracy": 0.8684180080890656, "mean_token_accuracy": 0.9006030261516571, "num_tokens": 84229371.0, "sample_num_tokens": 8230.75, "step": 4311, "total_num_tokens": 84262294.0, "z_loss": 0.0005938233807682991 }, { "copy_logits_max": -6.058976173400879, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.1875, "epoch": 0.8806739851927495, "gen_logits_max": 5.441224575042725, "gen_logits_mean": -13.716211318969727, "gen_logits_min": -25.18214225769043, "gen_logits_std": 2.757157325744629, "gen_loss": 0.333811491727829, "grad_norm": 0.42163798501065824, "learning_rate": 2.5482947368421054e-05, "loss": 0.3288, "mean_copy_accuracy": 0.9939537197351456, "mean_gen_accuracy": 0.8630328476428986, "mean_token_accuracy": 0.8928681015968323, "num_tokens": 84490200.0, "sample_num_tokens": 8065.0, "step": 4312, "total_num_tokens": 84522460.0, "z_loss": 0.0006479002186097205 }, { "copy_logits_max": -3.924741506576538, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.375, "epoch": 0.8808782231299463, "gen_logits_max": 4.0277557373046875, "gen_logits_mean": -15.047307968139648, "gen_logits_min": -26.081968307495117, "gen_logits_std": 2.7393012046813965, "gen_loss": 0.28279268741607666, "grad_norm": 0.3713113016499049, "learning_rate": 2.5481684210526315e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9969066083431244, "mean_gen_accuracy": 0.8622930347919464, "mean_token_accuracy": 0.8951542377471924, "num_tokens": 84752150.0, "sample_num_tokens": 9814.0, "step": 4313, "total_num_tokens": 84791406.0, "z_loss": 0.0005802961532026529 }, { "copy_logits_max": -3.832547187805176, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.375, "epoch": 0.8810824610671433, "gen_logits_max": 4.862208843231201, "gen_logits_mean": -13.952327728271484, "gen_logits_min": -25.961666107177734, "gen_logits_std": 2.8446340560913086, "gen_loss": 0.3106958568096161, "grad_norm": 0.3814801728692255, "learning_rate": 2.548042105263158e-05, "loss": 0.3021, "mean_copy_accuracy": 0.994472935795784, "mean_gen_accuracy": 0.868256151676178, "mean_token_accuracy": 0.8998151570558548, "num_tokens": 85037974.0, "sample_num_tokens": 8818.0, "step": 4314, "total_num_tokens": 85073246.0, "z_loss": 0.000744003162253648 }, { "copy_logits_max": -3.7113709449768066, "copy_logits_min": -750000000.0, "copy_num_tokens": 663.0625, "epoch": 0.8812866990043401, "gen_logits_max": 4.43812370300293, "gen_logits_mean": -13.678474426269531, "gen_logits_min": -25.331722259521484, "gen_logits_std": 2.770570755004883, "gen_loss": 0.26881909370422363, "grad_norm": 0.42847658159174024, "learning_rate": 2.5479157894736844e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9940595328807831, "mean_gen_accuracy": 0.868764266371727, "mean_token_accuracy": 0.8999678194522858, "num_tokens": 85304827.0, "sample_num_tokens": 9466.25, "step": 4315, "total_num_tokens": 85342692.0, "z_loss": 0.0007405258947983384 }, { "copy_logits_max": -3.8726894855499268, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.75, "epoch": 0.8814909369415369, "gen_logits_max": 4.585704326629639, "gen_logits_mean": -14.80989933013916, "gen_logits_min": -26.436607360839844, "gen_logits_std": 2.79776668548584, "gen_loss": 0.33165886998176575, "grad_norm": 0.3988503927845021, "learning_rate": 2.5477894736842105e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9933556169271469, "mean_gen_accuracy": 0.861953392624855, "mean_token_accuracy": 0.8919177353382111, "num_tokens": 85571056.0, "sample_num_tokens": 7864.0, "step": 4316, "total_num_tokens": 85602512.0, "z_loss": 0.0007099464419297874 }, { "copy_logits_max": -2.909026861190796, "copy_logits_min": -687500032.0, "copy_num_tokens": 633.375, "epoch": 0.8816951748787337, "gen_logits_max": 3.9108028411865234, "gen_logits_mean": -14.219276428222656, "gen_logits_min": -26.06216049194336, "gen_logits_std": 2.793832540512085, "gen_loss": 0.3013902008533478, "grad_norm": 0.4140318236285923, "learning_rate": 2.547663157894737e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9943237453699112, "mean_gen_accuracy": 0.8608594685792923, "mean_token_accuracy": 0.8959392160177231, "num_tokens": 85859445.0, "sample_num_tokens": 9090.25, "step": 4317, "total_num_tokens": 85895806.0, "z_loss": 0.0007040099008008838 }, { "copy_logits_max": -4.177321434020996, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.625, "epoch": 0.8818994128159305, "gen_logits_max": 4.60334587097168, "gen_logits_mean": -15.773786544799805, "gen_logits_min": -27.22603416442871, "gen_logits_std": 2.795172691345215, "gen_loss": 0.3039545714855194, "grad_norm": 0.405374835913161, "learning_rate": 2.547536842105263e-05, "loss": 0.3213, "mean_copy_accuracy": 0.995176300406456, "mean_gen_accuracy": 0.8596131503582001, "mean_token_accuracy": 0.8938773572444916, "num_tokens": 86159553.0, "sample_num_tokens": 8038.75, "step": 4318, "total_num_tokens": 86191708.0, "z_loss": 0.000641626538708806 }, { "copy_logits_max": -2.571368932723999, "copy_logits_min": -687500032.0, "copy_num_tokens": 559.875, "epoch": 0.8821036507531274, "gen_logits_max": 3.7110722064971924, "gen_logits_mean": -15.419245719909668, "gen_logits_min": -27.383283615112305, "gen_logits_std": 2.82975435256958, "gen_loss": 0.29720035195350647, "grad_norm": 0.39494426738230193, "learning_rate": 2.5474105263157898e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9954970479011536, "mean_gen_accuracy": 0.865394800901413, "mean_token_accuracy": 0.8966771960258484, "num_tokens": 86427526.0, "sample_num_tokens": 9033.0, "step": 4319, "total_num_tokens": 86463658.0, "z_loss": 0.0005846993299201131 }, { "copy_logits_max": -4.106369495391846, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.875, "epoch": 0.8823078886903243, "gen_logits_max": 4.847328186035156, "gen_logits_mean": -15.86733627319336, "gen_logits_min": -27.447017669677734, "gen_logits_std": 2.8214964866638184, "gen_loss": 0.3403654098510742, "grad_norm": 0.42354643033137435, "learning_rate": 2.547284210526316e-05, "loss": 0.3372, "mean_copy_accuracy": 0.9950381219387054, "mean_gen_accuracy": 0.8562031686306, "mean_token_accuracy": 0.8880467861890793, "num_tokens": 86690761.0, "sample_num_tokens": 7043.25, "step": 4320, "total_num_tokens": 86718934.0, "z_loss": 0.0006057899445295334 }, { "copy_logits_max": -5.1052565574646, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.3125, "epoch": 0.8825121266275211, "gen_logits_max": 4.605112075805664, "gen_logits_mean": -15.108809471130371, "gen_logits_min": -27.028621673583984, "gen_logits_std": 2.861273765563965, "gen_loss": 0.3510996997356415, "grad_norm": 0.48276867402697904, "learning_rate": 2.5471578947368423e-05, "loss": 0.3323, "mean_copy_accuracy": 0.9954138398170471, "mean_gen_accuracy": 0.86262047290802, "mean_token_accuracy": 0.8921753019094467, "num_tokens": 86953216.0, "sample_num_tokens": 9355.5, "step": 4321, "total_num_tokens": 86990638.0, "z_loss": 0.0006303419359028339 }, { "copy_logits_max": -3.200092315673828, "copy_logits_min": -750000000.0, "copy_num_tokens": 719.1875, "epoch": 0.8827163645647179, "gen_logits_max": 4.496214866638184, "gen_logits_mean": -13.29170036315918, "gen_logits_min": -25.09561538696289, "gen_logits_std": 2.8397364616394043, "gen_loss": 0.2551855444908142, "grad_norm": 0.3754372725181913, "learning_rate": 2.5470315789473684e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9948954731225967, "mean_gen_accuracy": 0.8678439855575562, "mean_token_accuracy": 0.9020727276802063, "num_tokens": 87260421.0, "sample_num_tokens": 10247.75, "step": 4322, "total_num_tokens": 87301412.0, "z_loss": 0.0004856764280702919 }, { "copy_logits_max": -4.300162315368652, "copy_logits_min": -687500032.0, "copy_num_tokens": 722.5625, "epoch": 0.8829206025019147, "gen_logits_max": 3.614724636077881, "gen_logits_mean": -14.953107833862305, "gen_logits_min": -26.956281661987305, "gen_logits_std": 2.8403334617614746, "gen_loss": 0.2731967270374298, "grad_norm": 0.39174160017726384, "learning_rate": 2.546905263157895e-05, "loss": 0.303, "mean_copy_accuracy": 0.9957842826843262, "mean_gen_accuracy": 0.8666151016950607, "mean_token_accuracy": 0.9016015827655792, "num_tokens": 87544504.0, "sample_num_tokens": 9603.0, "step": 4323, "total_num_tokens": 87582916.0, "z_loss": 0.0005929957842454314 }, { "copy_logits_max": -4.479614734649658, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.0, "epoch": 0.8831248404391115, "gen_logits_max": 4.065231800079346, "gen_logits_mean": -16.011505126953125, "gen_logits_min": -27.648019790649414, "gen_logits_std": 2.8225412368774414, "gen_loss": 0.3140765428543091, "grad_norm": 0.4032530721258966, "learning_rate": 2.546778947368421e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9953083097934723, "mean_gen_accuracy": 0.8657627999782562, "mean_token_accuracy": 0.8999921977519989, "num_tokens": 87797247.0, "sample_num_tokens": 7278.25, "step": 4324, "total_num_tokens": 87826360.0, "z_loss": 0.0006193023291416466 }, { "copy_logits_max": -3.395843029022217, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.5625, "epoch": 0.8833290783763084, "gen_logits_max": 4.013401508331299, "gen_logits_mean": -14.696380615234375, "gen_logits_min": -27.037818908691406, "gen_logits_std": 2.8685946464538574, "gen_loss": 0.2797123193740845, "grad_norm": 0.37217098201915827, "learning_rate": 2.5466526315789474e-05, "loss": 0.3069, "mean_copy_accuracy": 0.996620774269104, "mean_gen_accuracy": 0.8629100769758224, "mean_token_accuracy": 0.8979803323745728, "num_tokens": 88073563.0, "sample_num_tokens": 10105.25, "step": 4325, "total_num_tokens": 88113984.0, "z_loss": 0.0005746185779571533 }, { "copy_logits_max": -5.660227298736572, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.875, "epoch": 0.8835333163135052, "gen_logits_max": 5.119903564453125, "gen_logits_mean": -13.208840370178223, "gen_logits_min": -24.49430274963379, "gen_logits_std": 2.7125465869903564, "gen_loss": 0.2994931638240814, "grad_norm": 0.369507862294305, "learning_rate": 2.5465263157894734e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9951966404914856, "mean_gen_accuracy": 0.8646753430366516, "mean_token_accuracy": 0.901406928896904, "num_tokens": 88356805.0, "sample_num_tokens": 8879.25, "step": 4326, "total_num_tokens": 88392322.0, "z_loss": 0.0006023551104590297 }, { "copy_logits_max": -5.673229217529297, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.8125, "epoch": 0.8837375542507021, "gen_logits_max": 5.003824234008789, "gen_logits_mean": -14.948617935180664, "gen_logits_min": -26.16769790649414, "gen_logits_std": 2.7728729248046875, "gen_loss": 0.29266357421875, "grad_norm": 0.4044905316596879, "learning_rate": 2.5464000000000002e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9946692138910294, "mean_gen_accuracy": 0.8720723092556, "mean_token_accuracy": 0.9036537110805511, "num_tokens": 88630012.0, "sample_num_tokens": 8576.5, "step": 4327, "total_num_tokens": 88664318.0, "z_loss": 0.0005350555293262005 }, { "copy_logits_max": -7.061645030975342, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.9375, "epoch": 0.8839417921878989, "gen_logits_max": 4.511740684509277, "gen_logits_mean": -15.014604568481445, "gen_logits_min": -26.74545669555664, "gen_logits_std": 2.779305934906006, "gen_loss": 0.27679872512817383, "grad_norm": 0.40612777822866697, "learning_rate": 2.5462736842105266e-05, "loss": 0.316, "mean_copy_accuracy": 0.9956726729869843, "mean_gen_accuracy": 0.8610350638628006, "mean_token_accuracy": 0.8938233852386475, "num_tokens": 88894316.0, "sample_num_tokens": 10629.0, "step": 4328, "total_num_tokens": 88936832.0, "z_loss": 0.0006018370622768998 }, { "copy_logits_max": -5.007115840911865, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.6875, "epoch": 0.8841460301250957, "gen_logits_max": 4.467228889465332, "gen_logits_mean": -13.687444686889648, "gen_logits_min": -26.21346664428711, "gen_logits_std": 2.815614700317383, "gen_loss": 0.3087614178657532, "grad_norm": 0.4148015117887421, "learning_rate": 2.5461473684210527e-05, "loss": 0.299, "mean_copy_accuracy": 0.9959443360567093, "mean_gen_accuracy": 0.8689483255147934, "mean_token_accuracy": 0.9020081907510757, "num_tokens": 89169080.0, "sample_num_tokens": 8931.5, "step": 4329, "total_num_tokens": 89204806.0, "z_loss": 0.0006472049281001091 }, { "copy_logits_max": -7.859278202056885, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.6875, "epoch": 0.8843502680622926, "gen_logits_max": 4.526816368103027, "gen_logits_mean": -14.632866859436035, "gen_logits_min": -26.595191955566406, "gen_logits_std": 2.8419244289398193, "gen_loss": 0.2984299063682556, "grad_norm": 0.37654481650324106, "learning_rate": 2.5460210526315792e-05, "loss": 0.3192, "mean_copy_accuracy": 0.9947958290576935, "mean_gen_accuracy": 0.8641345202922821, "mean_token_accuracy": 0.8929077386856079, "num_tokens": 89441829.0, "sample_num_tokens": 8590.25, "step": 4330, "total_num_tokens": 89476190.0, "z_loss": 0.0005971207283437252 }, { "copy_logits_max": -6.119592666625977, "copy_logits_min": -687500032.0, "copy_num_tokens": 459.6875, "epoch": 0.8845545059994894, "gen_logits_max": 3.8386383056640625, "gen_logits_mean": -15.45562744140625, "gen_logits_min": -27.20468521118164, "gen_logits_std": 2.849423885345459, "gen_loss": 0.28436148166656494, "grad_norm": 0.3959955096273523, "learning_rate": 2.5458947368421053e-05, "loss": 0.3193, "mean_copy_accuracy": 0.995425820350647, "mean_gen_accuracy": 0.8596090376377106, "mean_token_accuracy": 0.8928510397672653, "num_tokens": 89703402.0, "sample_num_tokens": 8193.5, "step": 4331, "total_num_tokens": 89736176.0, "z_loss": 0.0005386651027947664 }, { "copy_logits_max": -5.5592145919799805, "copy_logits_min": -625000064.0, "copy_num_tokens": 537.75, "epoch": 0.8847587439366862, "gen_logits_max": 3.6280903816223145, "gen_logits_mean": -15.783226013183594, "gen_logits_min": -27.683908462524414, "gen_logits_std": 2.8232741355895996, "gen_loss": 0.3240320086479187, "grad_norm": 0.38937002204051424, "learning_rate": 2.5457684210526317e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9958573132753372, "mean_gen_accuracy": 0.8618487864732742, "mean_token_accuracy": 0.8964760899543762, "num_tokens": 89973645.0, "sample_num_tokens": 8853.75, "step": 4332, "total_num_tokens": 90009060.0, "z_loss": 0.0006207178230397403 }, { "copy_logits_max": -6.955251693725586, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.3125, "epoch": 0.8849629818738831, "gen_logits_max": 5.692896842956543, "gen_logits_mean": -13.920339584350586, "gen_logits_min": -25.9400634765625, "gen_logits_std": 2.749446153640747, "gen_loss": 0.3610568940639496, "grad_norm": 0.4262318164527612, "learning_rate": 2.5456421052631578e-05, "loss": 0.329, "mean_copy_accuracy": 0.9954208731651306, "mean_gen_accuracy": 0.8622128069400787, "mean_token_accuracy": 0.8906795382499695, "num_tokens": 90235473.0, "sample_num_tokens": 7963.75, "step": 4333, "total_num_tokens": 90267328.0, "z_loss": 0.0007109116995707154 }, { "copy_logits_max": -5.880141258239746, "copy_logits_min": -750000000.0, "copy_num_tokens": 727.4375, "epoch": 0.8851672198110799, "gen_logits_max": 5.2472333908081055, "gen_logits_mean": -13.092041015625, "gen_logits_min": -24.860668182373047, "gen_logits_std": 2.791256904602051, "gen_loss": 0.3125685453414917, "grad_norm": 0.40364995036204593, "learning_rate": 2.5455157894736842e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9953036457300186, "mean_gen_accuracy": 0.8634855896234512, "mean_token_accuracy": 0.898265540599823, "num_tokens": 90497926.0, "sample_num_tokens": 10342.5, "step": 4334, "total_num_tokens": 90539296.0, "z_loss": 0.0006807834142819047 }, { "copy_logits_max": -5.1407880783081055, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.25, "epoch": 0.8853714577482767, "gen_logits_max": 4.712259769439697, "gen_logits_mean": -13.84396743774414, "gen_logits_min": -26.147253036499023, "gen_logits_std": 2.8084912300109863, "gen_loss": 0.32424527406692505, "grad_norm": 0.39760077422418877, "learning_rate": 2.5453894736842107e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9959553182125092, "mean_gen_accuracy": 0.8627143800258636, "mean_token_accuracy": 0.8979999423027039, "num_tokens": 90778252.0, "sample_num_tokens": 8196.0, "step": 4335, "total_num_tokens": 90811036.0, "z_loss": 0.0006856335094198585 }, { "copy_logits_max": -5.333825588226318, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.125, "epoch": 0.8855756956854736, "gen_logits_max": 4.330029487609863, "gen_logits_mean": -15.88316535949707, "gen_logits_min": -27.42361831665039, "gen_logits_std": 2.8178396224975586, "gen_loss": 0.30537399649620056, "grad_norm": 0.4323847058772437, "learning_rate": 2.545263157894737e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9946268498897552, "mean_gen_accuracy": 0.8649148792028427, "mean_token_accuracy": 0.8942287415266037, "num_tokens": 91032601.0, "sample_num_tokens": 8747.75, "step": 4336, "total_num_tokens": 91067592.0, "z_loss": 0.0006968920351937413 }, { "copy_logits_max": -4.271357536315918, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.0, "epoch": 0.8857799336226704, "gen_logits_max": 4.600130081176758, "gen_logits_mean": -14.125407218933105, "gen_logits_min": -26.366043090820312, "gen_logits_std": 2.83390474319458, "gen_loss": 0.2927534580230713, "grad_norm": 0.3833982995152041, "learning_rate": 2.5451368421052632e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9966252893209457, "mean_gen_accuracy": 0.8672615438699722, "mean_token_accuracy": 0.9017622023820877, "num_tokens": 91308862.0, "sample_num_tokens": 8439.0, "step": 4337, "total_num_tokens": 91342618.0, "z_loss": 0.0006715147173963487 }, { "copy_logits_max": -7.448231220245361, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.6875, "epoch": 0.8859841715598672, "gen_logits_max": 4.694976806640625, "gen_logits_mean": -16.23716163635254, "gen_logits_min": -27.545259475708008, "gen_logits_std": 2.788968563079834, "gen_loss": 0.31706124544143677, "grad_norm": 0.38483336069175483, "learning_rate": 2.5450105263157896e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9960525184869766, "mean_gen_accuracy": 0.8625351637601852, "mean_token_accuracy": 0.8953412771224976, "num_tokens": 91567014.0, "sample_num_tokens": 7601.0, "step": 4338, "total_num_tokens": 91597418.0, "z_loss": 0.0006354835350066423 }, { "copy_logits_max": -8.275333404541016, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.8125, "epoch": 0.8861884094970641, "gen_logits_max": 5.331084251403809, "gen_logits_mean": -14.071979522705078, "gen_logits_min": -25.730953216552734, "gen_logits_std": 2.793268918991089, "gen_loss": 0.2910008430480957, "grad_norm": 0.43481433180570256, "learning_rate": 2.5448842105263157e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9943174123764038, "mean_gen_accuracy": 0.8665427267551422, "mean_token_accuracy": 0.8969547748565674, "num_tokens": 91832125.0, "sample_num_tokens": 8830.25, "step": 4339, "total_num_tokens": 91867446.0, "z_loss": 0.0006036031991243362 }, { "copy_logits_max": -6.698888778686523, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.4375, "epoch": 0.8863926474342609, "gen_logits_max": 5.468669891357422, "gen_logits_mean": -13.654643058776855, "gen_logits_min": -25.477495193481445, "gen_logits_std": 2.8084957599639893, "gen_loss": 0.2926880419254303, "grad_norm": 0.36867237161632166, "learning_rate": 2.544757894736842e-05, "loss": 0.273, "mean_copy_accuracy": 0.9951418191194534, "mean_gen_accuracy": 0.8749045878648758, "mean_token_accuracy": 0.9086811989545822, "num_tokens": 92125230.0, "sample_num_tokens": 8849.0, "step": 4340, "total_num_tokens": 92160626.0, "z_loss": 0.0006357698002830148 }, { "copy_logits_max": -6.650976181030273, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.1875, "epoch": 0.8865968853714578, "gen_logits_max": 5.808271408081055, "gen_logits_mean": -13.35822582244873, "gen_logits_min": -25.28876304626465, "gen_logits_std": 2.8338780403137207, "gen_loss": 0.30748048424720764, "grad_norm": 0.366022352429342, "learning_rate": 2.5446315789473686e-05, "loss": 0.2981, "mean_copy_accuracy": 0.996043249964714, "mean_gen_accuracy": 0.8682546019554138, "mean_token_accuracy": 0.9000295549631119, "num_tokens": 92397953.0, "sample_num_tokens": 7152.25, "step": 4341, "total_num_tokens": 92426562.0, "z_loss": 0.0007205643923953176 }, { "copy_logits_max": -5.163078308105469, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.1875, "epoch": 0.8868011233086546, "gen_logits_max": 4.396685600280762, "gen_logits_mean": -15.376705169677734, "gen_logits_min": -27.314308166503906, "gen_logits_std": 2.8315470218658447, "gen_loss": 0.29520973563194275, "grad_norm": 0.37357404029805447, "learning_rate": 2.5445052631578947e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9973894655704498, "mean_gen_accuracy": 0.8734844923019409, "mean_token_accuracy": 0.9095034748315811, "num_tokens": 92688767.0, "sample_num_tokens": 9135.75, "step": 4342, "total_num_tokens": 92725310.0, "z_loss": 0.0006246243137866259 }, { "copy_logits_max": -7.085435390472412, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.5625, "epoch": 0.8870053612458514, "gen_logits_max": 6.285591125488281, "gen_logits_mean": -13.890334129333496, "gen_logits_min": -25.586050033569336, "gen_logits_std": 2.829681873321533, "gen_loss": 0.28285276889801025, "grad_norm": 0.4043379608474565, "learning_rate": 2.544378947368421e-05, "loss": 0.29, "mean_copy_accuracy": 0.9960286021232605, "mean_gen_accuracy": 0.8725768774747849, "mean_token_accuracy": 0.9043162912130356, "num_tokens": 92968146.0, "sample_num_tokens": 8151.5, "step": 4343, "total_num_tokens": 93000752.0, "z_loss": 0.0006205491372384131 }, { "copy_logits_max": -7.697244644165039, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.75, "epoch": 0.8872095991830482, "gen_logits_max": 5.814091205596924, "gen_logits_mean": -14.353484153747559, "gen_logits_min": -26.24359130859375, "gen_logits_std": 2.8291540145874023, "gen_loss": 0.2876249849796295, "grad_norm": 0.35305476388644313, "learning_rate": 2.5442526315789475e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9965856075286865, "mean_gen_accuracy": 0.8764588981866837, "mean_token_accuracy": 0.9061961621046066, "num_tokens": 93255010.0, "sample_num_tokens": 8485.0, "step": 4344, "total_num_tokens": 93288950.0, "z_loss": 0.0005589717766270041 }, { "copy_logits_max": -7.338667869567871, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.125, "epoch": 0.8874138371202451, "gen_logits_max": 6.042829990386963, "gen_logits_mean": -13.7210693359375, "gen_logits_min": -25.535442352294922, "gen_logits_std": 2.8479435443878174, "gen_loss": 0.29634231328964233, "grad_norm": 0.43067929616451167, "learning_rate": 2.544126315789474e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9955427348613739, "mean_gen_accuracy": 0.8724726140499115, "mean_token_accuracy": 0.8999464362859726, "num_tokens": 93502581.0, "sample_num_tokens": 6994.75, "step": 4345, "total_num_tokens": 93530560.0, "z_loss": 0.0005670847604051232 }, { "copy_logits_max": -6.887331962585449, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.875, "epoch": 0.887618075057442, "gen_logits_max": 5.06829833984375, "gen_logits_mean": -15.539785385131836, "gen_logits_min": -27.283748626708984, "gen_logits_std": 2.837644100189209, "gen_loss": 0.32938647270202637, "grad_norm": 0.39298972944595373, "learning_rate": 2.544e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9959097802639008, "mean_gen_accuracy": 0.8636096268892288, "mean_token_accuracy": 0.8944827914237976, "num_tokens": 93760162.0, "sample_num_tokens": 7533.5, "step": 4346, "total_num_tokens": 93790296.0, "z_loss": 0.0005991741782054305 }, { "copy_logits_max": -6.139350891113281, "copy_logits_min": -750000064.0, "copy_num_tokens": 650.5625, "epoch": 0.8878223129946388, "gen_logits_max": 4.685441970825195, "gen_logits_mean": -14.444452285766602, "gen_logits_min": -26.7034969329834, "gen_logits_std": 2.8619868755340576, "gen_loss": 0.2597082853317261, "grad_norm": 0.4072129918352271, "learning_rate": 2.5438736842105265e-05, "loss": 0.297, "mean_copy_accuracy": 0.9960533976554871, "mean_gen_accuracy": 0.8681013733148575, "mean_token_accuracy": 0.9020961374044418, "num_tokens": 94042557.0, "sample_num_tokens": 9751.75, "step": 4347, "total_num_tokens": 94081564.0, "z_loss": 0.000593540258705616 }, { "copy_logits_max": -8.480289459228516, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.875, "epoch": 0.8880265509318356, "gen_logits_max": 6.906414031982422, "gen_logits_mean": -12.777719497680664, "gen_logits_min": -24.247379302978516, "gen_logits_std": 2.7065796852111816, "gen_loss": 0.31383687257766724, "grad_norm": 0.3771763249020419, "learning_rate": 2.5437473684210526e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9960525333881378, "mean_gen_accuracy": 0.867949590086937, "mean_token_accuracy": 0.8982950001955032, "num_tokens": 94326130.0, "sample_num_tokens": 7681.5, "step": 4348, "total_num_tokens": 94356856.0, "z_loss": 0.0006012177909724414 }, { "copy_logits_max": -5.659117221832275, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.75, "epoch": 0.8882307888690324, "gen_logits_max": 5.3302001953125, "gen_logits_mean": -14.060612678527832, "gen_logits_min": -25.63883399963379, "gen_logits_std": 2.7876932621002197, "gen_loss": 0.33314090967178345, "grad_norm": 0.4247722936915192, "learning_rate": 2.543621052631579e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9949020743370056, "mean_gen_accuracy": 0.8588393181562424, "mean_token_accuracy": 0.8924577087163925, "num_tokens": 94593575.0, "sample_num_tokens": 10276.25, "step": 4349, "total_num_tokens": 94634680.0, "z_loss": 0.0007196404039859772 }, { "copy_logits_max": -6.45622444152832, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.125, "epoch": 0.8884350268062292, "gen_logits_max": 4.542616844177246, "gen_logits_mean": -13.983780860900879, "gen_logits_min": -26.468395233154297, "gen_logits_std": 2.81880259513855, "gen_loss": 0.30138787627220154, "grad_norm": 0.3670742586122651, "learning_rate": 2.543494736842105e-05, "loss": 0.3039, "mean_copy_accuracy": 0.995645210146904, "mean_gen_accuracy": 0.8653455972671509, "mean_token_accuracy": 0.899592712521553, "num_tokens": 94880716.0, "sample_num_tokens": 9114.5, "step": 4350, "total_num_tokens": 94917174.0, "z_loss": 0.0006120482576079667 }, { "copy_logits_max": -5.851779460906982, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.0625, "epoch": 0.8886392647434261, "gen_logits_max": 5.984930038452148, "gen_logits_mean": -13.18069076538086, "gen_logits_min": -25.110260009765625, "gen_logits_std": 2.763461112976074, "gen_loss": 0.34071454405784607, "grad_norm": 0.4528807570924529, "learning_rate": 2.5433684210526315e-05, "loss": 0.3218, "mean_copy_accuracy": 0.9945648312568665, "mean_gen_accuracy": 0.8649250417947769, "mean_token_accuracy": 0.8957617431879044, "num_tokens": 95160027.0, "sample_num_tokens": 8172.25, "step": 4351, "total_num_tokens": 95192716.0, "z_loss": 0.0006874398677609861 }, { "copy_logits_max": -3.7252745628356934, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.5625, "epoch": 0.888843502680623, "gen_logits_max": 4.643506050109863, "gen_logits_mean": -14.59664535522461, "gen_logits_min": -26.218482971191406, "gen_logits_std": 2.7927114963531494, "gen_loss": 0.3572281301021576, "grad_norm": 0.39335738124496283, "learning_rate": 2.543242105263158e-05, "loss": 0.3053, "mean_copy_accuracy": 0.9953006953001022, "mean_gen_accuracy": 0.8692068010568619, "mean_token_accuracy": 0.8996264040470123, "num_tokens": 95407489.0, "sample_num_tokens": 8428.75, "step": 4352, "total_num_tokens": 95441204.0, "z_loss": 0.0007107190322130919 }, { "copy_logits_max": -8.487665176391602, "copy_logits_min": -750000064.0, "copy_num_tokens": 187.375, "epoch": 0.8890477406178198, "gen_logits_max": 5.190083026885986, "gen_logits_mean": -16.82284164428711, "gen_logits_min": -27.76260757446289, "gen_logits_std": 2.792356491088867, "gen_loss": 0.31471070647239685, "grad_norm": 0.46376675469174683, "learning_rate": 2.5431157894736844e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9952310472726822, "mean_gen_accuracy": 0.8617524057626724, "mean_token_accuracy": 0.8921632617712021, "num_tokens": 95675827.0, "sample_num_tokens": 6958.25, "step": 4353, "total_num_tokens": 95703660.0, "z_loss": 0.000602906453423202 }, { "copy_logits_max": -4.880284309387207, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.375, "epoch": 0.8892519785550166, "gen_logits_max": 5.445374965667725, "gen_logits_mean": -13.745050430297852, "gen_logits_min": -25.533082962036133, "gen_logits_std": 2.8101022243499756, "gen_loss": 0.32320985198020935, "grad_norm": 0.5275980090087328, "learning_rate": 2.5429894736842108e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9958688765764236, "mean_gen_accuracy": 0.869790330529213, "mean_token_accuracy": 0.9005127996206284, "num_tokens": 95944777.0, "sample_num_tokens": 8230.75, "step": 4354, "total_num_tokens": 95977700.0, "z_loss": 0.0006189968553371727 }, { "copy_logits_max": -5.500455856323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.8125, "epoch": 0.8894562164922134, "gen_logits_max": 6.213430404663086, "gen_logits_mean": -13.506890296936035, "gen_logits_min": -25.196006774902344, "gen_logits_std": 2.803293466567993, "gen_loss": 0.3333342373371124, "grad_norm": 0.4257470393319146, "learning_rate": 2.542863157894737e-05, "loss": 0.305, "mean_copy_accuracy": 0.9953638315200806, "mean_gen_accuracy": 0.8619909882545471, "mean_token_accuracy": 0.8964485973119736, "num_tokens": 96230156.0, "sample_num_tokens": 7989.0, "step": 4355, "total_num_tokens": 96262112.0, "z_loss": 0.0006564410869032145 }, { "copy_logits_max": -6.540835857391357, "copy_logits_min": -750000064.0, "copy_num_tokens": 387.75, "epoch": 0.8896604544294102, "gen_logits_max": 5.599976539611816, "gen_logits_mean": -14.052764892578125, "gen_logits_min": -25.80419158935547, "gen_logits_std": 2.8121445178985596, "gen_loss": 0.3078122138977051, "grad_norm": 0.48411058424239056, "learning_rate": 2.5427368421052633e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9945279806852341, "mean_gen_accuracy": 0.8604287654161453, "mean_token_accuracy": 0.8950157314538956, "num_tokens": 96483079.0, "sample_num_tokens": 7579.25, "step": 4356, "total_num_tokens": 96513396.0, "z_loss": 0.0005649524973705411 }, { "copy_logits_max": -5.891620635986328, "copy_logits_min": -687500032.0, "copy_num_tokens": 563.6875, "epoch": 0.8898646923666071, "gen_logits_max": 5.06685733795166, "gen_logits_mean": -14.825054168701172, "gen_logits_min": -26.461681365966797, "gen_logits_std": 2.832449436187744, "gen_loss": 0.2955535054206848, "grad_norm": 0.3994262584126787, "learning_rate": 2.5426105263157894e-05, "loss": 0.298, "mean_copy_accuracy": 0.9964629262685776, "mean_gen_accuracy": 0.8690748512744904, "mean_token_accuracy": 0.9032687693834305, "num_tokens": 96735845.0, "sample_num_tokens": 7982.75, "step": 4357, "total_num_tokens": 96767776.0, "z_loss": 0.0005434144986793399 }, { "copy_logits_max": -4.213076591491699, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.6875, "epoch": 0.890068930303804, "gen_logits_max": 5.7195844650268555, "gen_logits_mean": -13.989425659179688, "gen_logits_min": -25.641185760498047, "gen_logits_std": 2.83376145362854, "gen_loss": 0.29551225900650024, "grad_norm": 0.43004131833700093, "learning_rate": 2.542484210526316e-05, "loss": 0.3169, "mean_copy_accuracy": 0.9955116808414459, "mean_gen_accuracy": 0.8626814782619476, "mean_token_accuracy": 0.8950805068016052, "num_tokens": 97014258.0, "sample_num_tokens": 9237.5, "step": 4358, "total_num_tokens": 97051208.0, "z_loss": 0.0006182822398841381 }, { "copy_logits_max": -2.3154313564300537, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.8125, "epoch": 0.8902731682410008, "gen_logits_max": 4.789732933044434, "gen_logits_mean": -14.665949821472168, "gen_logits_min": -26.887235641479492, "gen_logits_std": 2.8659768104553223, "gen_loss": 0.33392614126205444, "grad_norm": 0.40394349205872077, "learning_rate": 2.542357894736842e-05, "loss": 0.3272, "mean_copy_accuracy": 0.995478168129921, "mean_gen_accuracy": 0.858644887804985, "mean_token_accuracy": 0.891932412981987, "num_tokens": 97294392.0, "sample_num_tokens": 8949.5, "step": 4359, "total_num_tokens": 97330190.0, "z_loss": 0.0007580713136121631 }, { "copy_logits_max": -3.1855034828186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.875, "epoch": 0.8904774061781976, "gen_logits_max": 4.369861602783203, "gen_logits_mean": -15.363556861877441, "gen_logits_min": -27.29245376586914, "gen_logits_std": 2.853286027908325, "gen_loss": 0.2684447169303894, "grad_norm": 0.3744546717863176, "learning_rate": 2.5422315789473687e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9953492134809494, "mean_gen_accuracy": 0.8765827268362045, "mean_token_accuracy": 0.9084581136703491, "num_tokens": 97589801.0, "sample_num_tokens": 9676.75, "step": 4360, "total_num_tokens": 97628508.0, "z_loss": 0.0005712711717933416 }, { "copy_logits_max": -4.702472686767578, "copy_logits_min": -687500032.0, "copy_num_tokens": 431.75, "epoch": 0.8906816441153944, "gen_logits_max": 4.4795050621032715, "gen_logits_mean": -16.339181900024414, "gen_logits_min": -28.062171936035156, "gen_logits_std": 2.8505563735961914, "gen_loss": 0.3341045379638672, "grad_norm": 0.38088786558706933, "learning_rate": 2.5421052631578948e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9962017685174942, "mean_gen_accuracy": 0.8669421523809433, "mean_token_accuracy": 0.8980110734701157, "num_tokens": 97876355.0, "sample_num_tokens": 8265.25, "step": 4361, "total_num_tokens": 97909416.0, "z_loss": 0.0006433121743611991 }, { "copy_logits_max": -4.447390079498291, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.375, "epoch": 0.8908858820525912, "gen_logits_max": 5.692902565002441, "gen_logits_mean": -14.579999923706055, "gen_logits_min": -26.2495059967041, "gen_logits_std": 2.8434982299804688, "gen_loss": 0.29634225368499756, "grad_norm": 0.4164477470711592, "learning_rate": 2.5419789473684212e-05, "loss": 0.301, "mean_copy_accuracy": 0.9960751533508301, "mean_gen_accuracy": 0.8687066584825516, "mean_token_accuracy": 0.8987606018781662, "num_tokens": 98137124.0, "sample_num_tokens": 8343.5, "step": 4362, "total_num_tokens": 98170498.0, "z_loss": 0.0005557579570449889 }, { "copy_logits_max": -3.9083340167999268, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.375, "epoch": 0.8910901199897882, "gen_logits_max": 5.157283306121826, "gen_logits_mean": -14.667110443115234, "gen_logits_min": -26.820390701293945, "gen_logits_std": 2.9006831645965576, "gen_loss": 0.2658662796020508, "grad_norm": 0.4075365005822027, "learning_rate": 2.5418526315789473e-05, "loss": 0.3055, "mean_copy_accuracy": 0.9957037270069122, "mean_gen_accuracy": 0.8677384257316589, "mean_token_accuracy": 0.8991184085607529, "num_tokens": 98407158.0, "sample_num_tokens": 8383.0, "step": 4363, "total_num_tokens": 98440690.0, "z_loss": 0.0005246040527708828 }, { "copy_logits_max": -4.003194808959961, "copy_logits_min": -625000064.0, "copy_num_tokens": 687.0, "epoch": 0.891294357926985, "gen_logits_max": 5.443731307983398, "gen_logits_mean": -13.41480827331543, "gen_logits_min": -25.60022735595703, "gen_logits_std": 2.8579635620117188, "gen_loss": 0.26731717586517334, "grad_norm": 0.3847795206276504, "learning_rate": 2.5417263157894738e-05, "loss": 0.3015, "mean_copy_accuracy": 0.9958323091268539, "mean_gen_accuracy": 0.8686854243278503, "mean_token_accuracy": 0.8980650454759598, "num_tokens": 98668703.0, "sample_num_tokens": 10341.25, "step": 4364, "total_num_tokens": 98710068.0, "z_loss": 0.0004908178234472871 }, { "copy_logits_max": -2.9640250205993652, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.625, "epoch": 0.8914985958641818, "gen_logits_max": 5.790614604949951, "gen_logits_mean": -13.948088645935059, "gen_logits_min": -26.112613677978516, "gen_logits_std": 2.8713159561157227, "gen_loss": 0.3128790259361267, "grad_norm": 0.4493670265262502, "learning_rate": 2.5416e-05, "loss": 0.3355, "mean_copy_accuracy": 0.9942053109407425, "mean_gen_accuracy": 0.85688616335392, "mean_token_accuracy": 0.8868140429258347, "num_tokens": 98918312.0, "sample_num_tokens": 8050.0, "step": 4365, "total_num_tokens": 98950512.0, "z_loss": 0.0005813892930746078 }, { "copy_logits_max": -4.3001604080200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 643.3125, "epoch": 0.8917028338013786, "gen_logits_max": 3.783590793609619, "gen_logits_mean": -16.155797958374023, "gen_logits_min": -28.041156768798828, "gen_logits_std": 2.8642003536224365, "gen_loss": 0.2805446982383728, "grad_norm": 0.4396635992771208, "learning_rate": 2.5414736842105263e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9967089891433716, "mean_gen_accuracy": 0.8657871931791306, "mean_token_accuracy": 0.900106355547905, "num_tokens": 99178447.0, "sample_num_tokens": 9067.25, "step": 4366, "total_num_tokens": 99214716.0, "z_loss": 0.0005423722905106843 }, { "copy_logits_max": -1.9292412996292114, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.9375, "epoch": 0.8919070717385754, "gen_logits_max": 5.514442443847656, "gen_logits_mean": -14.135747909545898, "gen_logits_min": -26.381118774414062, "gen_logits_std": 2.8754525184631348, "gen_loss": 0.33160996437072754, "grad_norm": 0.4604679005468134, "learning_rate": 2.5413473684210524e-05, "loss": 0.3076, "mean_copy_accuracy": 0.9960514903068542, "mean_gen_accuracy": 0.867683932185173, "mean_token_accuracy": 0.899310752749443, "num_tokens": 99451918.0, "sample_num_tokens": 8504.5, "step": 4367, "total_num_tokens": 99485936.0, "z_loss": 0.0006455894908867776 }, { "copy_logits_max": -4.637218475341797, "copy_logits_min": -750000064.0, "copy_num_tokens": 502.375, "epoch": 0.8921113096757722, "gen_logits_max": 5.101470947265625, "gen_logits_mean": -14.990036964416504, "gen_logits_min": -27.08645248413086, "gen_logits_std": 2.86893892288208, "gen_loss": 0.28377532958984375, "grad_norm": 0.40336178902268927, "learning_rate": 2.541221052631579e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9940209984779358, "mean_gen_accuracy": 0.8687648922204971, "mean_token_accuracy": 0.9007170349359512, "num_tokens": 99736195.0, "sample_num_tokens": 8467.25, "step": 4368, "total_num_tokens": 99770064.0, "z_loss": 0.000582049717195332 }, { "copy_logits_max": -4.498219966888428, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.0625, "epoch": 0.8923155476129692, "gen_logits_max": 6.150364398956299, "gen_logits_mean": -13.956966400146484, "gen_logits_min": -26.322669982910156, "gen_logits_std": 2.8802554607391357, "gen_loss": 0.29294732213020325, "grad_norm": 0.5421643156457013, "learning_rate": 2.5410947368421056e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9949922412633896, "mean_gen_accuracy": 0.8706052005290985, "mean_token_accuracy": 0.9060569405555725, "num_tokens": 100011280.0, "sample_num_tokens": 8494.5, "step": 4369, "total_num_tokens": 100045258.0, "z_loss": 0.0005468277959153056 }, { "copy_logits_max": -2.6584510803222656, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.625, "epoch": 0.892519785550166, "gen_logits_max": 6.621150970458984, "gen_logits_mean": -13.312567710876465, "gen_logits_min": -25.152225494384766, "gen_logits_std": 2.808884620666504, "gen_loss": 0.3086615204811096, "grad_norm": 0.38732155807489344, "learning_rate": 2.5409684210526317e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9945938587188721, "mean_gen_accuracy": 0.867774173617363, "mean_token_accuracy": 0.8991113752126694, "num_tokens": 100285841.0, "sample_num_tokens": 9051.25, "step": 4370, "total_num_tokens": 100322046.0, "z_loss": 0.0006395822856575251 }, { "copy_logits_max": -1.2758210897445679, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.875, "epoch": 0.8927240234873628, "gen_logits_max": 5.319087028503418, "gen_logits_mean": -14.32519817352295, "gen_logits_min": -26.32610321044922, "gen_logits_std": 2.849543571472168, "gen_loss": 0.3265116214752197, "grad_norm": 0.45344606653107306, "learning_rate": 2.540842105263158e-05, "loss": 0.3442, "mean_copy_accuracy": 0.9952572584152222, "mean_gen_accuracy": 0.8572524785995483, "mean_token_accuracy": 0.8868928700685501, "num_tokens": 100554176.0, "sample_num_tokens": 7742.5, "step": 4371, "total_num_tokens": 100585146.0, "z_loss": 0.000836022081784904 }, { "copy_logits_max": -2.3453099727630615, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.375, "epoch": 0.8929282614245596, "gen_logits_max": 5.926950454711914, "gen_logits_mean": -14.191091537475586, "gen_logits_min": -25.9101619720459, "gen_logits_std": 2.839540481567383, "gen_loss": 0.32364603877067566, "grad_norm": 0.40658386286738374, "learning_rate": 2.5407157894736842e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9944083094596863, "mean_gen_accuracy": 0.8646661043167114, "mean_token_accuracy": 0.894297182559967, "num_tokens": 100807868.0, "sample_num_tokens": 7758.0, "step": 4372, "total_num_tokens": 100838900.0, "z_loss": 0.0008076293743215501 }, { "copy_logits_max": 0.82027268409729, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.125, "epoch": 0.8931324993617564, "gen_logits_max": 6.2496795654296875, "gen_logits_mean": -12.999164581298828, "gen_logits_min": -25.079133987426758, "gen_logits_std": 2.852445602416992, "gen_loss": 0.3036706745624542, "grad_norm": 0.40928437014196606, "learning_rate": 2.5405894736842106e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9943237900733948, "mean_gen_accuracy": 0.8658562451601028, "mean_token_accuracy": 0.8960092067718506, "num_tokens": 101067228.0, "sample_num_tokens": 8065.0, "step": 4373, "total_num_tokens": 101099488.0, "z_loss": 0.0007433975115418434 }, { "copy_logits_max": -2.0519819259643555, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.1875, "epoch": 0.8933367372989532, "gen_logits_max": 6.119511604309082, "gen_logits_mean": -13.79140853881836, "gen_logits_min": -25.327913284301758, "gen_logits_std": 2.7881274223327637, "gen_loss": 0.3261176347732544, "grad_norm": 0.41649450791538184, "learning_rate": 2.5404631578947367e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9952452033758163, "mean_gen_accuracy": 0.8576959073543549, "mean_token_accuracy": 0.8963587284088135, "num_tokens": 101339643.0, "sample_num_tokens": 6975.75, "step": 4374, "total_num_tokens": 101367546.0, "z_loss": 0.0007488739211112261 }, { "copy_logits_max": -2.4341564178466797, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.0625, "epoch": 0.8935409752361502, "gen_logits_max": 5.394284725189209, "gen_logits_mean": -14.619342803955078, "gen_logits_min": -26.52840232849121, "gen_logits_std": 2.8433139324188232, "gen_loss": 0.3117601275444031, "grad_norm": 0.42707534648525325, "learning_rate": 2.540336842105263e-05, "loss": 0.3265, "mean_copy_accuracy": 0.9935805797576904, "mean_gen_accuracy": 0.8620039522647858, "mean_token_accuracy": 0.8931973725557327, "num_tokens": 101598370.0, "sample_num_tokens": 7804.5, "step": 4375, "total_num_tokens": 101629588.0, "z_loss": 0.0007764175534248352 }, { "copy_logits_max": 0.6899384260177612, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.0625, "epoch": 0.893745213173347, "gen_logits_max": 4.137745380401611, "gen_logits_mean": -15.336745262145996, "gen_logits_min": -27.77603530883789, "gen_logits_std": 2.873806953430176, "gen_loss": 0.29206568002700806, "grad_norm": 0.441607077286373, "learning_rate": 2.5402105263157896e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9954169392585754, "mean_gen_accuracy": 0.8686164319515228, "mean_token_accuracy": 0.9013798981904984, "num_tokens": 101866539.0, "sample_num_tokens": 9130.75, "step": 4376, "total_num_tokens": 101903062.0, "z_loss": 0.0009191028657369316 }, { "copy_logits_max": 0.9482859373092651, "copy_logits_min": -750000000.0, "copy_num_tokens": 739.625, "epoch": 0.8939494511105438, "gen_logits_max": 4.131124496459961, "gen_logits_mean": -14.57783031463623, "gen_logits_min": -26.83338165283203, "gen_logits_std": 2.881732225418091, "gen_loss": 0.2906230390071869, "grad_norm": 0.38803735882724194, "learning_rate": 2.540084210526316e-05, "loss": 0.308, "mean_copy_accuracy": 0.995671957731247, "mean_gen_accuracy": 0.8621596992015839, "mean_token_accuracy": 0.8977151811122894, "num_tokens": 102155596.0, "sample_num_tokens": 10334.0, "step": 4377, "total_num_tokens": 102196932.0, "z_loss": 0.0008305815863423049 }, { "copy_logits_max": -0.31981831789016724, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.75, "epoch": 0.8941536890477406, "gen_logits_max": 5.254060745239258, "gen_logits_mean": -14.280073165893555, "gen_logits_min": -26.284820556640625, "gen_logits_std": 2.849609851837158, "gen_loss": 0.29233235120773315, "grad_norm": 0.3888219961518932, "learning_rate": 2.539957894736842e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9947683215141296, "mean_gen_accuracy": 0.8695098459720612, "mean_token_accuracy": 0.8979062885046005, "num_tokens": 102433039.0, "sample_num_tokens": 9827.75, "step": 4378, "total_num_tokens": 102472350.0, "z_loss": 0.000713283137883991 }, { "copy_logits_max": -1.8044278621673584, "copy_logits_min": -687500032.0, "copy_num_tokens": 365.375, "epoch": 0.8943579269849374, "gen_logits_max": 5.150501251220703, "gen_logits_mean": -14.561546325683594, "gen_logits_min": -26.743717193603516, "gen_logits_std": 2.8472204208374023, "gen_loss": 0.3058544397354126, "grad_norm": 0.5099068313628339, "learning_rate": 2.5398315789473685e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9919068515300751, "mean_gen_accuracy": 0.8613962978124619, "mean_token_accuracy": 0.8919487744569778, "num_tokens": 102693889.0, "sample_num_tokens": 7488.25, "step": 4379, "total_num_tokens": 102723842.0, "z_loss": 0.0007081283256411552 }, { "copy_logits_max": -1.185076117515564, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.875, "epoch": 0.8945621649221343, "gen_logits_max": 5.224366188049316, "gen_logits_mean": -13.785772323608398, "gen_logits_min": -26.18661117553711, "gen_logits_std": 2.890374183654785, "gen_loss": 0.24918216466903687, "grad_norm": 0.41947359382835553, "learning_rate": 2.5397052631578946e-05, "loss": 0.3093, "mean_copy_accuracy": 0.9959762096405029, "mean_gen_accuracy": 0.8597962260246277, "mean_token_accuracy": 0.8972246497869492, "num_tokens": 102948777.0, "sample_num_tokens": 7884.25, "step": 4380, "total_num_tokens": 102980314.0, "z_loss": 0.0005450897151604295 }, { "copy_logits_max": -0.9083348512649536, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.8125, "epoch": 0.8947664028593311, "gen_logits_max": 5.245839595794678, "gen_logits_mean": -14.3841552734375, "gen_logits_min": -26.401344299316406, "gen_logits_std": 2.8653600215911865, "gen_loss": 0.3376618027687073, "grad_norm": 0.46218524467498856, "learning_rate": 2.539578947368421e-05, "loss": 0.3035, "mean_copy_accuracy": 0.9956620335578918, "mean_gen_accuracy": 0.8641571402549744, "mean_token_accuracy": 0.9002247899770737, "num_tokens": 103221159.0, "sample_num_tokens": 9015.75, "step": 4381, "total_num_tokens": 103257222.0, "z_loss": 0.0007373315747827291 }, { "copy_logits_max": -1.0068285465240479, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.9375, "epoch": 0.894970640796528, "gen_logits_max": 5.576447486877441, "gen_logits_mean": -14.046449661254883, "gen_logits_min": -26.331218719482422, "gen_logits_std": 2.8758435249328613, "gen_loss": 0.2807874083518982, "grad_norm": 0.45647469718709, "learning_rate": 2.5394526315789475e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9948734790086746, "mean_gen_accuracy": 0.8658476024866104, "mean_token_accuracy": 0.9016298204660416, "num_tokens": 103494941.0, "sample_num_tokens": 7686.25, "step": 4382, "total_num_tokens": 103525686.0, "z_loss": 0.0005750677082687616 }, { "copy_logits_max": -2.724384307861328, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.625, "epoch": 0.8951748787337248, "gen_logits_max": 4.059194564819336, "gen_logits_mean": -15.609524726867676, "gen_logits_min": -28.115997314453125, "gen_logits_std": 2.8824527263641357, "gen_loss": 0.2767419219017029, "grad_norm": 0.4298246426308585, "learning_rate": 2.5393263157894736e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9938025325536728, "mean_gen_accuracy": 0.869241327047348, "mean_token_accuracy": 0.8987059891223907, "num_tokens": 103749647.0, "sample_num_tokens": 8371.75, "step": 4383, "total_num_tokens": 103783134.0, "z_loss": 0.000526332063600421 }, { "copy_logits_max": -3.106966495513916, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.25, "epoch": 0.8953791166709216, "gen_logits_max": 5.92510986328125, "gen_logits_mean": -13.563507080078125, "gen_logits_min": -25.82378387451172, "gen_logits_std": 2.871852159500122, "gen_loss": 0.26576006412506104, "grad_norm": 0.4604115435890352, "learning_rate": 2.5392000000000004e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9940488189458847, "mean_gen_accuracy": 0.8633426278829575, "mean_token_accuracy": 0.8973992764949799, "num_tokens": 104029292.0, "sample_num_tokens": 8211.0, "step": 4384, "total_num_tokens": 104062136.0, "z_loss": 0.0005149880307726562 }, { "copy_logits_max": -2.659372568130493, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.625, "epoch": 0.8955833546081184, "gen_logits_max": 4.818027496337891, "gen_logits_mean": -15.304170608520508, "gen_logits_min": -27.552013397216797, "gen_logits_std": 2.8454227447509766, "gen_loss": 0.35444945096969604, "grad_norm": 0.4431744540006109, "learning_rate": 2.5390736842105264e-05, "loss": 0.3293, "mean_copy_accuracy": 0.9955627173185349, "mean_gen_accuracy": 0.8615415394306183, "mean_token_accuracy": 0.893102377653122, "num_tokens": 104282370.0, "sample_num_tokens": 8512.0, "step": 4385, "total_num_tokens": 104316418.0, "z_loss": 0.000652558053843677 }, { "copy_logits_max": 0.07451958954334259, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.8125, "epoch": 0.8957875925453153, "gen_logits_max": 4.529277801513672, "gen_logits_mean": -15.300689697265625, "gen_logits_min": -27.092151641845703, "gen_logits_std": 2.841449737548828, "gen_loss": 0.34535515308380127, "grad_norm": 0.5012906748562915, "learning_rate": 2.538947368421053e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9937385767698288, "mean_gen_accuracy": 0.8621797263622284, "mean_token_accuracy": 0.8910926878452301, "num_tokens": 104530502.0, "sample_num_tokens": 7781.0, "step": 4386, "total_num_tokens": 104561626.0, "z_loss": 0.0007246839813888073 }, { "copy_logits_max": -1.234043836593628, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5625, "epoch": 0.8959918304825121, "gen_logits_max": 5.193478584289551, "gen_logits_mean": -14.83912181854248, "gen_logits_min": -26.95404052734375, "gen_logits_std": 2.8536481857299805, "gen_loss": 0.33289381861686707, "grad_norm": 0.36958090790545783, "learning_rate": 2.538821052631579e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9943440854549408, "mean_gen_accuracy": 0.871973305940628, "mean_token_accuracy": 0.9022278934717178, "num_tokens": 104802920.0, "sample_num_tokens": 8578.0, "step": 4387, "total_num_tokens": 104837232.0, "z_loss": 0.0005783963715657592 }, { "copy_logits_max": -3.0351815223693848, "copy_logits_min": -750000064.0, "copy_num_tokens": 363.0, "epoch": 0.896196068419709, "gen_logits_max": 4.88311767578125, "gen_logits_mean": -15.287652969360352, "gen_logits_min": -27.180095672607422, "gen_logits_std": 2.8708062171936035, "gen_loss": 0.3367016911506653, "grad_norm": 0.4809147532005662, "learning_rate": 2.5386947368421054e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9930996000766754, "mean_gen_accuracy": 0.864383190870285, "mean_token_accuracy": 0.8951051980257034, "num_tokens": 105092709.0, "sample_num_tokens": 8090.75, "step": 4388, "total_num_tokens": 105125072.0, "z_loss": 0.00059903459623456 }, { "copy_logits_max": -0.9798151850700378, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.1875, "epoch": 0.8964003063569058, "gen_logits_max": 5.8343963623046875, "gen_logits_mean": -13.186993598937988, "gen_logits_min": -25.741273880004883, "gen_logits_std": 2.826026201248169, "gen_loss": 0.31977909803390503, "grad_norm": 0.3889770989496383, "learning_rate": 2.5385684210526315e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9957377314567566, "mean_gen_accuracy": 0.8709746301174164, "mean_token_accuracy": 0.9026021510362625, "num_tokens": 105382806.0, "sample_num_tokens": 9563.5, "step": 4389, "total_num_tokens": 105421060.0, "z_loss": 0.0006197431357577443 }, { "copy_logits_max": -0.9596882462501526, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.5, "epoch": 0.8966045442941026, "gen_logits_max": 5.273665904998779, "gen_logits_mean": -14.350992202758789, "gen_logits_min": -26.133209228515625, "gen_logits_std": 2.831681251525879, "gen_loss": 0.34084218740463257, "grad_norm": 0.4405455612370794, "learning_rate": 2.538442105263158e-05, "loss": 0.3137, "mean_copy_accuracy": 0.9959562420845032, "mean_gen_accuracy": 0.8685767501592636, "mean_token_accuracy": 0.8966020196676254, "num_tokens": 105631033.0, "sample_num_tokens": 7991.75, "step": 4390, "total_num_tokens": 105663000.0, "z_loss": 0.0006866217590868473 }, { "copy_logits_max": 0.5880005955696106, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.625, "epoch": 0.8968087822312995, "gen_logits_max": 4.063276290893555, "gen_logits_mean": -14.970379829406738, "gen_logits_min": -27.113765716552734, "gen_logits_std": 2.872519016265869, "gen_loss": 0.2777978479862213, "grad_norm": 0.45438192301192554, "learning_rate": 2.538315789473684e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9955090433359146, "mean_gen_accuracy": 0.869858980178833, "mean_token_accuracy": 0.9085454344749451, "num_tokens": 105912133.0, "sample_num_tokens": 8745.25, "step": 4391, "total_num_tokens": 105947114.0, "z_loss": 0.0006338543607853353 }, { "copy_logits_max": -2.8580868244171143, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.25, "epoch": 0.8970130201684963, "gen_logits_max": 5.606456756591797, "gen_logits_mean": -13.523153305053711, "gen_logits_min": -26.161754608154297, "gen_logits_std": 2.8633604049682617, "gen_loss": 0.3129124045372009, "grad_norm": 0.3813991621780672, "learning_rate": 2.5381894736842104e-05, "loss": 0.3267, "mean_copy_accuracy": 0.9947516471147537, "mean_gen_accuracy": 0.8574856370687485, "mean_token_accuracy": 0.8923586905002594, "num_tokens": 106185888.0, "sample_num_tokens": 7727.0, "step": 4392, "total_num_tokens": 106216796.0, "z_loss": 0.0006855828687548637 }, { "copy_logits_max": -3.8928122520446777, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.1875, "epoch": 0.8972172581056931, "gen_logits_max": 5.7041168212890625, "gen_logits_mean": -12.709901809692383, "gen_logits_min": -24.123565673828125, "gen_logits_std": 2.7550642490386963, "gen_loss": 0.30802008509635925, "grad_norm": 0.5089175959047594, "learning_rate": 2.538063157894737e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9924716651439667, "mean_gen_accuracy": 0.8729424327611923, "mean_token_accuracy": 0.9035926312208176, "num_tokens": 106451516.0, "sample_num_tokens": 8941.0, "step": 4393, "total_num_tokens": 106487280.0, "z_loss": 0.000584967783652246 }, { "copy_logits_max": -3.185844898223877, "copy_logits_min": -625000064.0, "copy_num_tokens": 749.6875, "epoch": 0.89742149604289, "gen_logits_max": 4.052114963531494, "gen_logits_mean": -15.750223159790039, "gen_logits_min": -27.721080780029297, "gen_logits_std": 2.8587214946746826, "gen_loss": 0.2800945043563843, "grad_norm": 0.38816504262932555, "learning_rate": 2.5379368421052633e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9947998076677322, "mean_gen_accuracy": 0.8670245707035065, "mean_token_accuracy": 0.9020061939954758, "num_tokens": 106728320.0, "sample_num_tokens": 10657.5, "step": 4394, "total_num_tokens": 106770950.0, "z_loss": 0.0005953429499641061 }, { "copy_logits_max": -3.0736687183380127, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.6875, "epoch": 0.8976257339800868, "gen_logits_max": 4.332446098327637, "gen_logits_mean": -15.474386215209961, "gen_logits_min": -27.114368438720703, "gen_logits_std": 2.844475269317627, "gen_loss": 0.30503618717193604, "grad_norm": 0.47833464700669714, "learning_rate": 2.5378105263157897e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9948101490736008, "mean_gen_accuracy": 0.8624279052019119, "mean_token_accuracy": 0.8905865252017975, "num_tokens": 106995145.0, "sample_num_tokens": 7814.25, "step": 4395, "total_num_tokens": 107026402.0, "z_loss": 0.0006566765950992703 }, { "copy_logits_max": -0.5699722766876221, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.375, "epoch": 0.8978299719172836, "gen_logits_max": 5.278110504150391, "gen_logits_mean": -12.609903335571289, "gen_logits_min": -24.65598487854004, "gen_logits_std": 2.841944456100464, "gen_loss": 0.2646946907043457, "grad_norm": 0.4420326829135414, "learning_rate": 2.537684210526316e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9952544271945953, "mean_gen_accuracy": 0.8742037713527679, "mean_token_accuracy": 0.903159573674202, "num_tokens": 107260452.0, "sample_num_tokens": 9887.5, "step": 4396, "total_num_tokens": 107300002.0, "z_loss": 0.0006461490411311388 }, { "copy_logits_max": -0.6901175379753113, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.0, "epoch": 0.8980342098544805, "gen_logits_max": 6.515555381774902, "gen_logits_mean": -12.657001495361328, "gen_logits_min": -24.572813034057617, "gen_logits_std": 2.817995071411133, "gen_loss": 0.31603795289993286, "grad_norm": 0.42074912279142535, "learning_rate": 2.5375578947368423e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9953087717294693, "mean_gen_accuracy": 0.8651684522628784, "mean_token_accuracy": 0.8928633779287338, "num_tokens": 107540117.0, "sample_num_tokens": 7820.25, "step": 4397, "total_num_tokens": 107571398.0, "z_loss": 0.0006222032243385911 }, { "copy_logits_max": 0.6554654836654663, "copy_logits_min": -750000128.0, "copy_num_tokens": 408.0625, "epoch": 0.8982384477916773, "gen_logits_max": 6.565612316131592, "gen_logits_mean": -12.60936164855957, "gen_logits_min": -24.77076530456543, "gen_logits_std": 2.8443825244903564, "gen_loss": 0.3335070013999939, "grad_norm": 0.4393407123356428, "learning_rate": 2.5374315789473684e-05, "loss": 0.322, "mean_copy_accuracy": 0.9935212135314941, "mean_gen_accuracy": 0.8625165969133377, "mean_token_accuracy": 0.8940230309963226, "num_tokens": 107792895.0, "sample_num_tokens": 7888.25, "step": 4398, "total_num_tokens": 107824448.0, "z_loss": 0.0007151247700676322 }, { "copy_logits_max": 1.127759575843811, "copy_logits_min": -687500032.0, "copy_num_tokens": 694.3125, "epoch": 0.8984426857288741, "gen_logits_max": 4.614513397216797, "gen_logits_mean": -13.486927032470703, "gen_logits_min": -26.201528549194336, "gen_logits_std": 2.8717398643493652, "gen_loss": 0.23960280418395996, "grad_norm": 0.4369555169843101, "learning_rate": 2.5373052631578948e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9949556887149811, "mean_gen_accuracy": 0.8711528480052948, "mean_token_accuracy": 0.9051576405763626, "num_tokens": 108054749.0, "sample_num_tokens": 9159.75, "step": 4399, "total_num_tokens": 108091388.0, "z_loss": 0.0007019881159067154 }, { "copy_logits_max": 1.1360546350479126, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.5, "epoch": 0.898646923666071, "gen_logits_max": 5.136290550231934, "gen_logits_mean": -14.441996574401855, "gen_logits_min": -26.65694808959961, "gen_logits_std": 2.8566644191741943, "gen_loss": 0.3580188751220703, "grad_norm": 0.4316465011126699, "learning_rate": 2.537178947368421e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9958314746618271, "mean_gen_accuracy": 0.8592799454927444, "mean_token_accuracy": 0.896009236574173, "num_tokens": 108340288.0, "sample_num_tokens": 7712.5, "step": 4400, "total_num_tokens": 108371138.0, "z_loss": 0.0009356423979625106 }, { "copy_logits_max": 0.6459918022155762, "copy_logits_min": -500000064.0, "copy_num_tokens": 426.125, "epoch": 0.8988511616032678, "gen_logits_max": 4.076258659362793, "gen_logits_mean": -16.335222244262695, "gen_logits_min": -28.36480712890625, "gen_logits_std": 2.876508951187134, "gen_loss": 0.3064647912979126, "grad_norm": 0.4306136488123091, "learning_rate": 2.5370526315789477e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9963349103927612, "mean_gen_accuracy": 0.8613062649965286, "mean_token_accuracy": 0.8930805027484894, "num_tokens": 108589183.0, "sample_num_tokens": 7592.25, "step": 4401, "total_num_tokens": 108619552.0, "z_loss": 0.0009192551369778812 }, { "copy_logits_max": -1.0647693872451782, "copy_logits_min": -625000064.0, "copy_num_tokens": 656.4375, "epoch": 0.8990553995404647, "gen_logits_max": 4.518500328063965, "gen_logits_mean": -14.797661781311035, "gen_logits_min": -27.040260314941406, "gen_logits_std": 2.8494744300842285, "gen_loss": 0.2895006537437439, "grad_norm": 0.38804569722258964, "learning_rate": 2.5369263157894737e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9968458712100983, "mean_gen_accuracy": 0.8674853295087814, "mean_token_accuracy": 0.9036500304937363, "num_tokens": 108883583.0, "sample_num_tokens": 10763.25, "step": 4402, "total_num_tokens": 108926636.0, "z_loss": 0.0007991429883986712 }, { "copy_logits_max": -0.8385534286499023, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.0, "epoch": 0.8992596374776615, "gen_logits_max": 5.809316158294678, "gen_logits_mean": -13.317146301269531, "gen_logits_min": -25.106489181518555, "gen_logits_std": 2.8289098739624023, "gen_loss": 0.33914801478385925, "grad_norm": 0.4131923822443037, "learning_rate": 2.5368000000000002e-05, "loss": 0.3339, "mean_copy_accuracy": 0.9945598542690277, "mean_gen_accuracy": 0.8586277365684509, "mean_token_accuracy": 0.88882976770401, "num_tokens": 109165794.0, "sample_num_tokens": 8461.5, "step": 4403, "total_num_tokens": 109199640.0, "z_loss": 0.0007078043418005109 }, { "copy_logits_max": -1.489945650100708, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.5625, "epoch": 0.8994638754148583, "gen_logits_max": 4.346030235290527, "gen_logits_mean": -14.408041000366211, "gen_logits_min": -26.381528854370117, "gen_logits_std": 2.8355391025543213, "gen_loss": 0.30862265825271606, "grad_norm": 0.43110939801074, "learning_rate": 2.5366736842105263e-05, "loss": 0.3282, "mean_copy_accuracy": 0.9948996156454086, "mean_gen_accuracy": 0.8535759896039963, "mean_token_accuracy": 0.8924195021390915, "num_tokens": 109439638.0, "sample_num_tokens": 8883.5, "step": 4404, "total_num_tokens": 109475172.0, "z_loss": 0.0006206307443790138 }, { "copy_logits_max": -1.5974617004394531, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.6875, "epoch": 0.8996681133520551, "gen_logits_max": 6.019345283508301, "gen_logits_mean": -13.040240287780762, "gen_logits_min": -24.94425392150879, "gen_logits_std": 2.8389620780944824, "gen_loss": 0.292735755443573, "grad_norm": 0.5519719662300924, "learning_rate": 2.5365473684210527e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9964960217475891, "mean_gen_accuracy": 0.8610985726118088, "mean_token_accuracy": 0.8905394375324249, "num_tokens": 109691876.0, "sample_num_tokens": 8820.0, "step": 4405, "total_num_tokens": 109727156.0, "z_loss": 0.0005742970388382673 }, { "copy_logits_max": -0.28651899099349976, "copy_logits_min": -750000000.0, "copy_num_tokens": 611.625, "epoch": 0.899872351289252, "gen_logits_max": 5.300394058227539, "gen_logits_mean": -13.552156448364258, "gen_logits_min": -25.612857818603516, "gen_logits_std": 2.8659231662750244, "gen_loss": 0.29978930950164795, "grad_norm": 0.42611083575101727, "learning_rate": 2.5364210526315788e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9960120171308517, "mean_gen_accuracy": 0.8661153167486191, "mean_token_accuracy": 0.9000641852617264, "num_tokens": 109959625.0, "sample_num_tokens": 10014.25, "step": 4406, "total_num_tokens": 109999682.0, "z_loss": 0.0006418541306629777 }, { "copy_logits_max": -1.9363096952438354, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.375, "epoch": 0.9000765892264488, "gen_logits_max": 5.595630645751953, "gen_logits_mean": -14.923446655273438, "gen_logits_min": -26.20199966430664, "gen_logits_std": 2.8303632736206055, "gen_loss": 0.34281376004219055, "grad_norm": 0.41502066691231393, "learning_rate": 2.5362947368421052e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9934809356927872, "mean_gen_accuracy": 0.8714911639690399, "mean_token_accuracy": 0.8980790227651596, "num_tokens": 110215771.0, "sample_num_tokens": 7859.25, "step": 4407, "total_num_tokens": 110247208.0, "z_loss": 0.0006607023533433676 }, { "copy_logits_max": -2.887803554534912, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.3125, "epoch": 0.9002808271636457, "gen_logits_max": 4.587426662445068, "gen_logits_mean": -14.638570785522461, "gen_logits_min": -26.396995544433594, "gen_logits_std": 2.835507869720459, "gen_loss": 0.3206290006637573, "grad_norm": 0.6120874022044258, "learning_rate": 2.5361684210526317e-05, "loss": 0.3242, "mean_copy_accuracy": 0.993447557091713, "mean_gen_accuracy": 0.860610768198967, "mean_token_accuracy": 0.8921273499727249, "num_tokens": 110493392.0, "sample_num_tokens": 7840.0, "step": 4408, "total_num_tokens": 110524752.0, "z_loss": 0.0006026974879205227 }, { "copy_logits_max": -1.7340281009674072, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.1875, "epoch": 0.9004850651008425, "gen_logits_max": 4.418455600738525, "gen_logits_mean": -15.758211135864258, "gen_logits_min": -27.440425872802734, "gen_logits_std": 2.8659262657165527, "gen_loss": 0.2836175858974457, "grad_norm": 0.3888565794184294, "learning_rate": 2.536042105263158e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9952235221862793, "mean_gen_accuracy": 0.865316316485405, "mean_token_accuracy": 0.9005445688962936, "num_tokens": 110773792.0, "sample_num_tokens": 8425.0, "step": 4409, "total_num_tokens": 110807492.0, "z_loss": 0.0006437906413339078 }, { "copy_logits_max": -2.4734272956848145, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.3125, "epoch": 0.9006893030380393, "gen_logits_max": 5.285840034484863, "gen_logits_mean": -15.585508346557617, "gen_logits_min": -26.779064178466797, "gen_logits_std": 2.814274787902832, "gen_loss": 0.337130606174469, "grad_norm": 0.43208010922220064, "learning_rate": 2.5359157894736845e-05, "loss": 0.309, "mean_copy_accuracy": 0.9956167489290237, "mean_gen_accuracy": 0.8683571517467499, "mean_token_accuracy": 0.8956107795238495, "num_tokens": 111031015.0, "sample_num_tokens": 6726.25, "step": 4410, "total_num_tokens": 111057920.0, "z_loss": 0.0008157002739608288 }, { "copy_logits_max": 2.3936104774475098, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.6875, "epoch": 0.9008935409752361, "gen_logits_max": 5.0042924880981445, "gen_logits_mean": -14.863378524780273, "gen_logits_min": -26.792335510253906, "gen_logits_std": 2.846653938293457, "gen_loss": 0.29271817207336426, "grad_norm": 0.46472365099996144, "learning_rate": 2.5357894736842106e-05, "loss": 0.283, "mean_copy_accuracy": 0.9948844313621521, "mean_gen_accuracy": 0.8751060515642166, "mean_token_accuracy": 0.9056339114904404, "num_tokens": 111283822.0, "sample_num_tokens": 7705.5, "step": 4411, "total_num_tokens": 111314644.0, "z_loss": 0.0011392750311642885 }, { "copy_logits_max": -0.5231795310974121, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.625, "epoch": 0.901097778912433, "gen_logits_max": 5.014041900634766, "gen_logits_mean": -14.760637283325195, "gen_logits_min": -26.01956558227539, "gen_logits_std": 2.817727565765381, "gen_loss": 0.33201050758361816, "grad_norm": 0.40388670494984835, "learning_rate": 2.535663157894737e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9953277856111526, "mean_gen_accuracy": 0.8674246519804001, "mean_token_accuracy": 0.8990227431058884, "num_tokens": 111545536.0, "sample_num_tokens": 9533.0, "step": 4412, "total_num_tokens": 111583668.0, "z_loss": 0.0010980353690683842 }, { "copy_logits_max": 1.2939320802688599, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.9375, "epoch": 0.9013020168496299, "gen_logits_max": 4.546077728271484, "gen_logits_mean": -14.22636890411377, "gen_logits_min": -26.39447784423828, "gen_logits_std": 2.854996681213379, "gen_loss": 0.28879302740097046, "grad_norm": 0.3592848072954595, "learning_rate": 2.535536842105263e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9963343292474747, "mean_gen_accuracy": 0.8749378770589828, "mean_token_accuracy": 0.9064951986074448, "num_tokens": 111833113.0, "sample_num_tokens": 8975.75, "step": 4413, "total_num_tokens": 111869016.0, "z_loss": 0.0011158757843077183 }, { "copy_logits_max": 0.9334723949432373, "copy_logits_min": -750000128.0, "copy_num_tokens": 481.5, "epoch": 0.9015062547868267, "gen_logits_max": 4.852880954742432, "gen_logits_mean": -13.837068557739258, "gen_logits_min": -26.188690185546875, "gen_logits_std": 2.8819327354431152, "gen_loss": 0.2610326111316681, "grad_norm": 0.37882016389261586, "learning_rate": 2.5354105263157896e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9949984848499298, "mean_gen_accuracy": 0.8716561198234558, "mean_token_accuracy": 0.9030772298574448, "num_tokens": 112104077.0, "sample_num_tokens": 7894.25, "step": 4414, "total_num_tokens": 112135654.0, "z_loss": 0.0007794694975018501 }, { "copy_logits_max": -1.393282413482666, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.625, "epoch": 0.9017104927240235, "gen_logits_max": 4.358755588531494, "gen_logits_mean": -15.253945350646973, "gen_logits_min": -27.079477310180664, "gen_logits_std": 2.8326759338378906, "gen_loss": 0.2808160185813904, "grad_norm": 0.44595266274449785, "learning_rate": 2.5352842105263157e-05, "loss": 0.2765, "mean_copy_accuracy": 0.994574710726738, "mean_gen_accuracy": 0.8770939111709595, "mean_token_accuracy": 0.9082353264093399, "num_tokens": 112382485.0, "sample_num_tokens": 8145.25, "step": 4415, "total_num_tokens": 112415066.0, "z_loss": 0.0006599226035177708 }, { "copy_logits_max": -1.7688002586364746, "copy_logits_min": -687500032.0, "copy_num_tokens": 428.125, "epoch": 0.9019147306612203, "gen_logits_max": 4.316174030303955, "gen_logits_mean": -15.942086219787598, "gen_logits_min": -27.839418411254883, "gen_logits_std": 2.8392691612243652, "gen_loss": 0.3124629855155945, "grad_norm": 0.3919120673720481, "learning_rate": 2.535157894736842e-05, "loss": 0.2988, "mean_copy_accuracy": 0.995895579457283, "mean_gen_accuracy": 0.8724792450666428, "mean_token_accuracy": 0.9015199393033981, "num_tokens": 112644874.0, "sample_num_tokens": 8817.0, "step": 4416, "total_num_tokens": 112680142.0, "z_loss": 0.0006705236155539751 }, { "copy_logits_max": -3.8583984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.4375, "epoch": 0.9021189685984171, "gen_logits_max": 4.77589225769043, "gen_logits_mean": -14.931196212768555, "gen_logits_min": -26.629568099975586, "gen_logits_std": 2.8175344467163086, "gen_loss": 0.30207186937332153, "grad_norm": 0.40437015754847244, "learning_rate": 2.5350315789473685e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9952346086502075, "mean_gen_accuracy": 0.8686194717884064, "mean_token_accuracy": 0.9007698744535446, "num_tokens": 112939990.0, "sample_num_tokens": 7269.0, "step": 4417, "total_num_tokens": 112969066.0, "z_loss": 0.0006236507324501872 }, { "copy_logits_max": -2.2409372329711914, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.75, "epoch": 0.902323206535614, "gen_logits_max": 5.18026065826416, "gen_logits_mean": -13.591756820678711, "gen_logits_min": -25.59838104248047, "gen_logits_std": 2.8576059341430664, "gen_loss": 0.3049584925174713, "grad_norm": 0.3827359675928403, "learning_rate": 2.534905263157895e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9968500286340714, "mean_gen_accuracy": 0.8678493350744247, "mean_token_accuracy": 0.8991921544075012, "num_tokens": 113207020.0, "sample_num_tokens": 8138.0, "step": 4418, "total_num_tokens": 113239572.0, "z_loss": 0.000596996396780014 }, { "copy_logits_max": -1.0091335773468018, "copy_logits_min": -687500032.0, "copy_num_tokens": 473.3125, "epoch": 0.9025274444728109, "gen_logits_max": 4.306599140167236, "gen_logits_mean": -15.330739974975586, "gen_logits_min": -27.06878662109375, "gen_logits_std": 2.8710360527038574, "gen_loss": 0.30279478430747986, "grad_norm": 0.4001921465898785, "learning_rate": 2.534778947368421e-05, "loss": 0.3364, "mean_copy_accuracy": 0.9942387044429779, "mean_gen_accuracy": 0.8592113554477692, "mean_token_accuracy": 0.8879104405641556, "num_tokens": 113471241.0, "sample_num_tokens": 8969.25, "step": 4419, "total_num_tokens": 113507118.0, "z_loss": 0.0006914014229550958 }, { "copy_logits_max": -0.8596031665802002, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.625, "epoch": 0.9027316824100077, "gen_logits_max": 5.297788619995117, "gen_logits_mean": -13.957500457763672, "gen_logits_min": -25.87639808654785, "gen_logits_std": 2.830167293548584, "gen_loss": 0.3119243383407593, "grad_norm": 0.41474995036561807, "learning_rate": 2.5346526315789475e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9955593347549438, "mean_gen_accuracy": 0.8637741953134537, "mean_token_accuracy": 0.8932333886623383, "num_tokens": 113743181.0, "sample_num_tokens": 7612.75, "step": 4420, "total_num_tokens": 113773632.0, "z_loss": 0.000743247102946043 }, { "copy_logits_max": -3.5455007553100586, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.625, "epoch": 0.9029359203472045, "gen_logits_max": 5.2050981521606445, "gen_logits_mean": -15.019247055053711, "gen_logits_min": -26.559480667114258, "gen_logits_std": 2.811168670654297, "gen_loss": 0.3513278663158417, "grad_norm": 0.46910512667129606, "learning_rate": 2.534526315789474e-05, "loss": 0.3211, "mean_copy_accuracy": 0.9922382831573486, "mean_gen_accuracy": 0.8645142465829849, "mean_token_accuracy": 0.8918906897306442, "num_tokens": 113985466.0, "sample_num_tokens": 8705.5, "step": 4421, "total_num_tokens": 114020288.0, "z_loss": 0.0007146022981032729 }, { "copy_logits_max": 0.44084301590919495, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.375, "epoch": 0.9031401582844013, "gen_logits_max": 4.419320106506348, "gen_logits_mean": -14.92190933227539, "gen_logits_min": -27.188560485839844, "gen_logits_std": 2.8763394355773926, "gen_loss": 0.27194544672966003, "grad_norm": 0.4032532494463937, "learning_rate": 2.5344e-05, "loss": 0.308, "mean_copy_accuracy": 0.9945745915174484, "mean_gen_accuracy": 0.8643609881401062, "mean_token_accuracy": 0.8942393958568573, "num_tokens": 114242842.0, "sample_num_tokens": 7764.0, "step": 4422, "total_num_tokens": 114273898.0, "z_loss": 0.0006714060436934233 }, { "copy_logits_max": -4.505347728729248, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.5, "epoch": 0.9033443962215981, "gen_logits_max": 4.659904479980469, "gen_logits_mean": -15.935654640197754, "gen_logits_min": -27.24354362487793, "gen_logits_std": 2.7572555541992188, "gen_loss": 0.3154391050338745, "grad_norm": 0.43845282539613845, "learning_rate": 2.5342736842105264e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9954289346933365, "mean_gen_accuracy": 0.8719458281993866, "mean_token_accuracy": 0.8979990482330322, "num_tokens": 114503542.0, "sample_num_tokens": 8085.5, "step": 4423, "total_num_tokens": 114535884.0, "z_loss": 0.0007427571690641344 }, { "copy_logits_max": -3.165048599243164, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.375, "epoch": 0.903548634158795, "gen_logits_max": 5.764771938323975, "gen_logits_mean": -13.384060859680176, "gen_logits_min": -25.29312515258789, "gen_logits_std": 2.8208415508270264, "gen_loss": 0.2805377244949341, "grad_norm": 0.37606761752660123, "learning_rate": 2.5341473684210525e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9955443888902664, "mean_gen_accuracy": 0.8671789914369583, "mean_token_accuracy": 0.9014254361391068, "num_tokens": 114784624.0, "sample_num_tokens": 8977.5, "step": 4424, "total_num_tokens": 114820534.0, "z_loss": 0.0007026178063824773 }, { "copy_logits_max": -2.72172212600708, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.6875, "epoch": 0.9037528720959919, "gen_logits_max": 4.625278472900391, "gen_logits_mean": -14.831089973449707, "gen_logits_min": -26.86028289794922, "gen_logits_std": 2.8153772354125977, "gen_loss": 0.2802512049674988, "grad_norm": 0.4220925346784988, "learning_rate": 2.5340210526315793e-05, "loss": 0.2787, "mean_copy_accuracy": 0.995423823595047, "mean_gen_accuracy": 0.8755607157945633, "mean_token_accuracy": 0.906414657831192, "num_tokens": 115071015.0, "sample_num_tokens": 8749.75, "step": 4425, "total_num_tokens": 115106014.0, "z_loss": 0.000683037331327796 }, { "copy_logits_max": -0.2832612991333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.9375, "epoch": 0.9039571100331887, "gen_logits_max": 4.467480659484863, "gen_logits_mean": -14.776468276977539, "gen_logits_min": -26.584583282470703, "gen_logits_std": 2.799468994140625, "gen_loss": 0.3166603446006775, "grad_norm": 0.4442167251446438, "learning_rate": 2.5338947368421054e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9951752126216888, "mean_gen_accuracy": 0.8608190566301346, "mean_token_accuracy": 0.8978369981050491, "num_tokens": 115336559.0, "sample_num_tokens": 8181.25, "step": 4426, "total_num_tokens": 115369284.0, "z_loss": 0.0006855280953459442 }, { "copy_logits_max": -2.097377300262451, "copy_logits_min": -750000000.0, "copy_num_tokens": 282.25, "epoch": 0.9041613479703855, "gen_logits_max": 6.1489105224609375, "gen_logits_mean": -12.6669340133667, "gen_logits_min": -24.236530303955078, "gen_logits_std": 2.712700128555298, "gen_loss": 0.3916940689086914, "grad_norm": 0.4538985809425317, "learning_rate": 2.5337684210526318e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9929279088973999, "mean_gen_accuracy": 0.864404559135437, "mean_token_accuracy": 0.8919178694486618, "num_tokens": 115582567.0, "sample_num_tokens": 7146.75, "step": 4427, "total_num_tokens": 115611154.0, "z_loss": 0.0007674562511965632 }, { "copy_logits_max": -4.878758430480957, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.25, "epoch": 0.9043655859075823, "gen_logits_max": 3.957807779312134, "gen_logits_mean": -14.832265853881836, "gen_logits_min": -26.605690002441406, "gen_logits_std": 2.8152174949645996, "gen_loss": 0.2775876820087433, "grad_norm": 0.41727427824625524, "learning_rate": 2.533642105263158e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9949311316013336, "mean_gen_accuracy": 0.8773612231016159, "mean_token_accuracy": 0.9059794694185257, "num_tokens": 115856149.0, "sample_num_tokens": 7785.75, "step": 4428, "total_num_tokens": 115887292.0, "z_loss": 0.0005909019964747131 }, { "copy_logits_max": -4.28359317779541, "copy_logits_min": -687500032.0, "copy_num_tokens": 566.375, "epoch": 0.9045698238447791, "gen_logits_max": 4.373089790344238, "gen_logits_mean": -14.27406120300293, "gen_logits_min": -25.927993774414062, "gen_logits_std": 2.73714280128479, "gen_loss": 0.2694247364997864, "grad_norm": 0.3973654916100988, "learning_rate": 2.5335157894736843e-05, "loss": 0.29, "mean_copy_accuracy": 0.9933787137269974, "mean_gen_accuracy": 0.8749414086341858, "mean_token_accuracy": 0.9037361294031143, "num_tokens": 116124353.0, "sample_num_tokens": 9315.75, "step": 4429, "total_num_tokens": 116161616.0, "z_loss": 0.0006338784005492926 }, { "copy_logits_max": -3.7519631385803223, "copy_logits_min": -562500032.0, "copy_num_tokens": 447.0, "epoch": 0.904774061781976, "gen_logits_max": 4.845072269439697, "gen_logits_mean": -13.96865463256836, "gen_logits_min": -25.26353645324707, "gen_logits_std": 2.684654951095581, "gen_loss": 0.31039348244667053, "grad_norm": 0.4268669817399599, "learning_rate": 2.5333894736842104e-05, "loss": 0.3119, "mean_copy_accuracy": 0.992548480629921, "mean_gen_accuracy": 0.868821382522583, "mean_token_accuracy": 0.8965606838464737, "num_tokens": 116387153.0, "sample_num_tokens": 8802.25, "step": 4430, "total_num_tokens": 116422362.0, "z_loss": 0.0006548091769218445 }, { "copy_logits_max": -5.0440287590026855, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.8125, "epoch": 0.9049782997191729, "gen_logits_max": 5.324538230895996, "gen_logits_mean": -14.099472045898438, "gen_logits_min": -25.481761932373047, "gen_logits_std": 2.7261409759521484, "gen_loss": 0.3602301776409149, "grad_norm": 0.4065576953122041, "learning_rate": 2.533263157894737e-05, "loss": 0.3257, "mean_copy_accuracy": 0.9932131767272949, "mean_gen_accuracy": 0.8622331917285919, "mean_token_accuracy": 0.8930387645959854, "num_tokens": 116660289.0, "sample_num_tokens": 8045.25, "step": 4431, "total_num_tokens": 116692470.0, "z_loss": 0.0007301404257304966 }, { "copy_logits_max": -3.1028220653533936, "copy_logits_min": -750000000.0, "copy_num_tokens": 711.875, "epoch": 0.9051825376563697, "gen_logits_max": 5.074010848999023, "gen_logits_mean": -13.65837287902832, "gen_logits_min": -25.86663246154785, "gen_logits_std": 2.8107357025146484, "gen_loss": 0.2763884663581848, "grad_norm": 0.3743922508964838, "learning_rate": 2.533136842105263e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9948498904705048, "mean_gen_accuracy": 0.8729081749916077, "mean_token_accuracy": 0.9036300033330917, "num_tokens": 116947562.0, "sample_num_tokens": 9969.5, "step": 4432, "total_num_tokens": 116987440.0, "z_loss": 0.0007286594482138753 }, { "copy_logits_max": -1.2670097351074219, "copy_logits_min": -750000064.0, "copy_num_tokens": 694.1875, "epoch": 0.9053867755935665, "gen_logits_max": 4.275104522705078, "gen_logits_mean": -14.384169578552246, "gen_logits_min": -26.21607208251953, "gen_logits_std": 2.832416296005249, "gen_loss": 0.29935723543167114, "grad_norm": 0.3641237276876882, "learning_rate": 2.5330105263157897e-05, "loss": 0.3014, "mean_copy_accuracy": 0.9950767159461975, "mean_gen_accuracy": 0.8590990751981735, "mean_token_accuracy": 0.8989588618278503, "num_tokens": 117254186.0, "sample_num_tokens": 10776.5, "step": 4433, "total_num_tokens": 117297292.0, "z_loss": 0.0007724800962023437 }, { "copy_logits_max": -3.9947338104248047, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.0625, "epoch": 0.9055910135307633, "gen_logits_max": 4.715087890625, "gen_logits_mean": -15.017311096191406, "gen_logits_min": -26.89337921142578, "gen_logits_std": 2.8336586952209473, "gen_loss": 0.25406432151794434, "grad_norm": 0.4005580459626321, "learning_rate": 2.5328842105263158e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9938827902078629, "mean_gen_accuracy": 0.8750024139881134, "mean_token_accuracy": 0.9020165205001831, "num_tokens": 117522452.0, "sample_num_tokens": 7460.0, "step": 4434, "total_num_tokens": 117552292.0, "z_loss": 0.0006438397103920579 }, { "copy_logits_max": -2.9185142517089844, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.5, "epoch": 0.9057952514679601, "gen_logits_max": 4.2156982421875, "gen_logits_mean": -15.681259155273438, "gen_logits_min": -27.916296005249023, "gen_logits_std": 2.856416702270508, "gen_loss": 0.3036261796951294, "grad_norm": 0.4203453797265047, "learning_rate": 2.5327578947368422e-05, "loss": 0.3241, "mean_copy_accuracy": 0.9949685335159302, "mean_gen_accuracy": 0.8636902570724487, "mean_token_accuracy": 0.8922470808029175, "num_tokens": 117766575.0, "sample_num_tokens": 9138.25, "step": 4435, "total_num_tokens": 117803128.0, "z_loss": 0.0007802761974744499 }, { "copy_logits_max": -2.6152894496917725, "copy_logits_min": -687500032.0, "copy_num_tokens": 737.875, "epoch": 0.905999489405157, "gen_logits_max": 4.5958404541015625, "gen_logits_mean": -13.892879486083984, "gen_logits_min": -26.068084716796875, "gen_logits_std": 2.854717969894409, "gen_loss": 0.30162668228149414, "grad_norm": 0.3691631868135869, "learning_rate": 2.5326315789473687e-05, "loss": 0.2986, "mean_copy_accuracy": 0.99504254758358, "mean_gen_accuracy": 0.8680361956357956, "mean_token_accuracy": 0.9018319398164749, "num_tokens": 118063408.0, "sample_num_tokens": 10833.0, "step": 4436, "total_num_tokens": 118106740.0, "z_loss": 0.0007122191018424928 }, { "copy_logits_max": -3.8769922256469727, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.8125, "epoch": 0.9062037273423539, "gen_logits_max": 4.0251383781433105, "gen_logits_mean": -15.681467056274414, "gen_logits_min": -27.60524559020996, "gen_logits_std": 2.8630599975585938, "gen_loss": 0.29192185401916504, "grad_norm": 0.40319728836342633, "learning_rate": 2.5325052631578948e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9949881285429001, "mean_gen_accuracy": 0.8657284677028656, "mean_token_accuracy": 0.8997901678085327, "num_tokens": 118343992.0, "sample_num_tokens": 8890.0, "step": 4437, "total_num_tokens": 118379552.0, "z_loss": 0.0005913989152759314 }, { "copy_logits_max": -3.3042478561401367, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.1875, "epoch": 0.9064079652795507, "gen_logits_max": 3.8478689193725586, "gen_logits_mean": -15.813566207885742, "gen_logits_min": -27.663864135742188, "gen_logits_std": 2.866842269897461, "gen_loss": 0.30665719509124756, "grad_norm": 0.45239462007744696, "learning_rate": 2.5323789473684212e-05, "loss": 0.3278, "mean_copy_accuracy": 0.9943860173225403, "mean_gen_accuracy": 0.8575360924005508, "mean_token_accuracy": 0.8923400342464447, "num_tokens": 118608893.0, "sample_num_tokens": 8186.75, "step": 4438, "total_num_tokens": 118641640.0, "z_loss": 0.0005882643163204193 }, { "copy_logits_max": -4.836061477661133, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.75, "epoch": 0.9066122032167475, "gen_logits_max": 4.282684326171875, "gen_logits_mean": -15.413740158081055, "gen_logits_min": -27.449140548706055, "gen_logits_std": 2.8520514965057373, "gen_loss": 0.2488732933998108, "grad_norm": 0.37645581920304394, "learning_rate": 2.5322526315789473e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9958657622337341, "mean_gen_accuracy": 0.8611982017755508, "mean_token_accuracy": 0.8963734805583954, "num_tokens": 118892279.0, "sample_num_tokens": 10271.75, "step": 4439, "total_num_tokens": 118933366.0, "z_loss": 0.0005683471681550145 }, { "copy_logits_max": -3.4284210205078125, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.0, "epoch": 0.9068164411539443, "gen_logits_max": 6.080887794494629, "gen_logits_mean": -13.82317066192627, "gen_logits_min": -25.703617095947266, "gen_logits_std": 2.8471879959106445, "gen_loss": 0.3236541152000427, "grad_norm": 0.40069782979575674, "learning_rate": 2.5321263157894737e-05, "loss": 0.321, "mean_copy_accuracy": 0.9947305172681808, "mean_gen_accuracy": 0.8635968416929245, "mean_token_accuracy": 0.8937573432922363, "num_tokens": 119162575.0, "sample_num_tokens": 7578.25, "step": 4440, "total_num_tokens": 119192888.0, "z_loss": 0.0006248267600312829 }, { "copy_logits_max": -4.796019554138184, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.25, "epoch": 0.9070206790911411, "gen_logits_max": 5.8588056564331055, "gen_logits_mean": -14.264547348022461, "gen_logits_min": -25.772680282592773, "gen_logits_std": 2.8264260292053223, "gen_loss": 0.3039114773273468, "grad_norm": 0.3768728297235007, "learning_rate": 2.5319999999999998e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9954839199781418, "mean_gen_accuracy": 0.8683409094810486, "mean_token_accuracy": 0.8998796343803406, "num_tokens": 119429625.0, "sample_num_tokens": 7722.25, "step": 4441, "total_num_tokens": 119460514.0, "z_loss": 0.0006004982860758901 }, { "copy_logits_max": -3.489631175994873, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.125, "epoch": 0.907224917028338, "gen_logits_max": 4.893558502197266, "gen_logits_mean": -14.086329460144043, "gen_logits_min": -25.98055076599121, "gen_logits_std": 2.8473238945007324, "gen_loss": 0.30474281311035156, "grad_norm": 0.39958017328060447, "learning_rate": 2.5318736842105266e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9943291991949081, "mean_gen_accuracy": 0.8641723841428757, "mean_token_accuracy": 0.8975319564342499, "num_tokens": 119710092.0, "sample_num_tokens": 8775.0, "step": 4442, "total_num_tokens": 119745192.0, "z_loss": 0.0006117974407970905 }, { "copy_logits_max": -3.745725154876709, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.0625, "epoch": 0.9074291549655349, "gen_logits_max": 5.30202579498291, "gen_logits_mean": -14.059606552124023, "gen_logits_min": -26.547412872314453, "gen_logits_std": 2.8692634105682373, "gen_loss": 0.3145313262939453, "grad_norm": 0.39007438408744993, "learning_rate": 2.5317473684210527e-05, "loss": 0.3195, "mean_copy_accuracy": 0.994415670633316, "mean_gen_accuracy": 0.8623168170452118, "mean_token_accuracy": 0.8928459733724594, "num_tokens": 119979746.0, "sample_num_tokens": 8993.5, "step": 4443, "total_num_tokens": 120015720.0, "z_loss": 0.0006386418826878071 }, { "copy_logits_max": -4.022397041320801, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.0625, "epoch": 0.9076333929027317, "gen_logits_max": 5.007014751434326, "gen_logits_mean": -14.796262741088867, "gen_logits_min": -26.966455459594727, "gen_logits_std": 2.863117218017578, "gen_loss": 0.322189062833786, "grad_norm": 0.41180114650310545, "learning_rate": 2.531621052631579e-05, "loss": 0.3071, "mean_copy_accuracy": 0.995441660284996, "mean_gen_accuracy": 0.8663589507341385, "mean_token_accuracy": 0.8969499617815018, "num_tokens": 120256639.0, "sample_num_tokens": 7965.75, "step": 4444, "total_num_tokens": 120288502.0, "z_loss": 0.0005980750429444015 }, { "copy_logits_max": -5.705524921417236, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.0625, "epoch": 0.9078376308399285, "gen_logits_max": 5.6913065910339355, "gen_logits_mean": -15.250325202941895, "gen_logits_min": -27.111513137817383, "gen_logits_std": 2.839596748352051, "gen_loss": 0.37954527139663696, "grad_norm": 0.4001398765770573, "learning_rate": 2.5314947368421052e-05, "loss": 0.3264, "mean_copy_accuracy": 0.9949544817209244, "mean_gen_accuracy": 0.8599793612957001, "mean_token_accuracy": 0.8897691667079926, "num_tokens": 120541011.0, "sample_num_tokens": 7507.25, "step": 4445, "total_num_tokens": 120571040.0, "z_loss": 0.0006970706745050848 }, { "copy_logits_max": -4.757052421569824, "copy_logits_min": -687500032.0, "copy_num_tokens": 565.625, "epoch": 0.9080418687771253, "gen_logits_max": 5.58793306350708, "gen_logits_mean": -13.31729793548584, "gen_logits_min": -25.25438690185547, "gen_logits_std": 2.865354537963867, "gen_loss": 0.2563299536705017, "grad_norm": 0.38346656296599585, "learning_rate": 2.5313684210526316e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9950847625732422, "mean_gen_accuracy": 0.8722306340932846, "mean_token_accuracy": 0.9025143980979919, "num_tokens": 120815964.0, "sample_num_tokens": 9740.0, "step": 4446, "total_num_tokens": 120854924.0, "z_loss": 0.0005190197261981666 }, { "copy_logits_max": -2.4877285957336426, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.5625, "epoch": 0.9082461067143222, "gen_logits_max": 4.039312362670898, "gen_logits_mean": -15.259308815002441, "gen_logits_min": -27.397624969482422, "gen_logits_std": 2.8646395206451416, "gen_loss": 0.2901522219181061, "grad_norm": 0.3705247271730236, "learning_rate": 2.5312421052631577e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9954091459512711, "mean_gen_accuracy": 0.8675309866666794, "mean_token_accuracy": 0.9000188112258911, "num_tokens": 121094311.0, "sample_num_tokens": 8677.75, "step": 4447, "total_num_tokens": 121129022.0, "z_loss": 0.000565032591111958 }, { "copy_logits_max": -2.8740367889404297, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 0.908450344651519, "gen_logits_max": 5.772492408752441, "gen_logits_mean": -14.082828521728516, "gen_logits_min": -26.411914825439453, "gen_logits_std": 2.829249858856201, "gen_loss": 0.30637744069099426, "grad_norm": 0.3863419112909358, "learning_rate": 2.531115789473684e-05, "loss": 0.3215, "mean_copy_accuracy": 0.9953608065843582, "mean_gen_accuracy": 0.8605101704597473, "mean_token_accuracy": 0.8932656943798065, "num_tokens": 121371284.0, "sample_num_tokens": 9049.5, "step": 4448, "total_num_tokens": 121407482.0, "z_loss": 0.000537377200089395 }, { "copy_logits_max": -2.76796817779541, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.6875, "epoch": 0.9086545825887159, "gen_logits_max": 4.643998622894287, "gen_logits_mean": -14.850168228149414, "gen_logits_min": -27.04784393310547, "gen_logits_std": 2.873159646987915, "gen_loss": 0.3055749535560608, "grad_norm": 0.3634244219573694, "learning_rate": 2.5309894736842106e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9962118119001389, "mean_gen_accuracy": 0.8656542897224426, "mean_token_accuracy": 0.8983135670423508, "num_tokens": 121668402.0, "sample_num_tokens": 8814.5, "step": 4449, "total_num_tokens": 121703660.0, "z_loss": 0.0006765304133296013 }, { "copy_logits_max": -3.865866184234619, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.0, "epoch": 0.9088588205259127, "gen_logits_max": 4.624074935913086, "gen_logits_mean": -14.914189338684082, "gen_logits_min": -26.869321823120117, "gen_logits_std": 2.8385586738586426, "gen_loss": 0.28931403160095215, "grad_norm": 0.39382429944816044, "learning_rate": 2.530863157894737e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9947983473539352, "mean_gen_accuracy": 0.8676305413246155, "mean_token_accuracy": 0.8972502052783966, "num_tokens": 121931441.0, "sample_num_tokens": 8957.25, "step": 4450, "total_num_tokens": 121967270.0, "z_loss": 0.0006087538786232471 }, { "copy_logits_max": -1.242837905883789, "copy_logits_min": -687500032.0, "copy_num_tokens": 470.0625, "epoch": 0.9090630584631095, "gen_logits_max": 4.635552406311035, "gen_logits_mean": -13.725407600402832, "gen_logits_min": -26.110713958740234, "gen_logits_std": 2.828369617462158, "gen_loss": 0.3338613212108612, "grad_norm": 0.40287248402612025, "learning_rate": 2.5307368421052634e-05, "loss": 0.3075, "mean_copy_accuracy": 0.9953677654266357, "mean_gen_accuracy": 0.8641056716442108, "mean_token_accuracy": 0.896159827709198, "num_tokens": 122189907.0, "sample_num_tokens": 7928.25, "step": 4451, "total_num_tokens": 122221620.0, "z_loss": 0.0007782734464854002 }, { "copy_logits_max": -4.13608455657959, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.25, "epoch": 0.9092672964003063, "gen_logits_max": 5.7368574142456055, "gen_logits_mean": -13.549272537231445, "gen_logits_min": -25.367738723754883, "gen_logits_std": 2.8311944007873535, "gen_loss": 0.3088742792606354, "grad_norm": 0.35613368553674296, "learning_rate": 2.5306105263157895e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9950930774211884, "mean_gen_accuracy": 0.8647504448890686, "mean_token_accuracy": 0.8934351652860641, "num_tokens": 122468603.0, "sample_num_tokens": 10166.75, "step": 4452, "total_num_tokens": 122509270.0, "z_loss": 0.0006702785613015294 }, { "copy_logits_max": -3.2272591590881348, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.875, "epoch": 0.9094715343375032, "gen_logits_max": 6.554806232452393, "gen_logits_mean": -12.191885948181152, "gen_logits_min": -24.117671966552734, "gen_logits_std": 2.7853593826293945, "gen_loss": 0.3326214551925659, "grad_norm": 0.40949572630344444, "learning_rate": 2.530484210526316e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9950273483991623, "mean_gen_accuracy": 0.8607177436351776, "mean_token_accuracy": 0.8946605622768402, "num_tokens": 122745074.0, "sample_num_tokens": 8774.0, "step": 4453, "total_num_tokens": 122780170.0, "z_loss": 0.0007013590075075626 }, { "copy_logits_max": -5.2634406089782715, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.0625, "epoch": 0.9096757722747, "gen_logits_max": 4.838979721069336, "gen_logits_mean": -15.5164794921875, "gen_logits_min": -26.81163787841797, "gen_logits_std": 2.821157932281494, "gen_loss": 0.3009510934352875, "grad_norm": 0.38259665645902907, "learning_rate": 2.530357894736842e-05, "loss": 0.2868, "mean_copy_accuracy": 0.995407447218895, "mean_gen_accuracy": 0.8719797432422638, "mean_token_accuracy": 0.9024865180253983, "num_tokens": 123014200.0, "sample_num_tokens": 8273.5, "step": 4454, "total_num_tokens": 123047294.0, "z_loss": 0.0005988434422761202 }, { "copy_logits_max": -4.9355363845825195, "copy_logits_min": -750000064.0, "copy_num_tokens": 418.4375, "epoch": 0.9098800102118969, "gen_logits_max": 5.049654483795166, "gen_logits_mean": -14.158729553222656, "gen_logits_min": -25.795265197753906, "gen_logits_std": 2.8032541275024414, "gen_loss": 0.32526057958602905, "grad_norm": 0.38154252529597993, "learning_rate": 2.5302315789473685e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9960676580667496, "mean_gen_accuracy": 0.8693451136350632, "mean_token_accuracy": 0.9009983092546463, "num_tokens": 123277038.0, "sample_num_tokens": 8239.0, "step": 4455, "total_num_tokens": 123309994.0, "z_loss": 0.0006344677531160414 }, { "copy_logits_max": -3.732496500015259, "copy_logits_min": -750000000.0, "copy_num_tokens": 808.125, "epoch": 0.9100842481490937, "gen_logits_max": 3.9087424278259277, "gen_logits_mean": -13.16600227355957, "gen_logits_min": -24.91973114013672, "gen_logits_std": 2.813572645187378, "gen_loss": 0.218997523188591, "grad_norm": 0.33711756481323885, "learning_rate": 2.5301052631578946e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9962276518344879, "mean_gen_accuracy": 0.8762779831886292, "mean_token_accuracy": 0.907546654343605, "num_tokens": 123566282.0, "sample_num_tokens": 9593.5, "step": 4456, "total_num_tokens": 123604656.0, "z_loss": 0.00046625512186437845 }, { "copy_logits_max": -5.891000747680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.375, "epoch": 0.9102884860862905, "gen_logits_max": 5.192354202270508, "gen_logits_mean": -15.042763710021973, "gen_logits_min": -26.398603439331055, "gen_logits_std": 2.801043748855591, "gen_loss": 0.2896798253059387, "grad_norm": 0.3567551271787936, "learning_rate": 2.529978947368421e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9949025958776474, "mean_gen_accuracy": 0.8730424791574478, "mean_token_accuracy": 0.9001717567443848, "num_tokens": 123856692.0, "sample_num_tokens": 8633.0, "step": 4457, "total_num_tokens": 123891224.0, "z_loss": 0.0005726157687604427 }, { "copy_logits_max": -2.8985373973846436, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.625, "epoch": 0.9104927240234874, "gen_logits_max": 5.269733428955078, "gen_logits_mean": -14.837751388549805, "gen_logits_min": -26.177074432373047, "gen_logits_std": 2.7888004779815674, "gen_loss": 0.3349316120147705, "grad_norm": 0.39924761046246127, "learning_rate": 2.5298526315789474e-05, "loss": 0.3314, "mean_copy_accuracy": 0.9956699460744858, "mean_gen_accuracy": 0.8644976913928986, "mean_token_accuracy": 0.8897193521261215, "num_tokens": 124099586.0, "sample_num_tokens": 8442.0, "step": 4458, "total_num_tokens": 124133354.0, "z_loss": 0.0006675944314338267 }, { "copy_logits_max": -1.6986172199249268, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.875, "epoch": 0.9106969619606842, "gen_logits_max": 5.056342601776123, "gen_logits_mean": -13.1937255859375, "gen_logits_min": -24.87908363342285, "gen_logits_std": 2.804572105407715, "gen_loss": 0.27157822251319885, "grad_norm": 0.3669270305241706, "learning_rate": 2.529726315789474e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9958666563034058, "mean_gen_accuracy": 0.8693298697471619, "mean_token_accuracy": 0.8980174660682678, "num_tokens": 124359318.0, "sample_num_tokens": 8630.5, "step": 4459, "total_num_tokens": 124393840.0, "z_loss": 0.000578019767999649 }, { "copy_logits_max": -1.9213649034500122, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.25, "epoch": 0.910901199897881, "gen_logits_max": 5.387730121612549, "gen_logits_mean": -13.4515380859375, "gen_logits_min": -25.24826431274414, "gen_logits_std": 2.788741111755371, "gen_loss": 0.31784379482269287, "grad_norm": 0.3861893002026473, "learning_rate": 2.5296e-05, "loss": 0.3128, "mean_copy_accuracy": 0.9957257509231567, "mean_gen_accuracy": 0.8631013333797455, "mean_token_accuracy": 0.8949303925037384, "num_tokens": 124612290.0, "sample_num_tokens": 9635.5, "step": 4460, "total_num_tokens": 124650832.0, "z_loss": 0.0006410295027308166 }, { "copy_logits_max": -4.5210161209106445, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.625, "epoch": 0.9111054378350779, "gen_logits_max": 5.298101425170898, "gen_logits_mean": -14.279356002807617, "gen_logits_min": -26.25337791442871, "gen_logits_std": 2.816110134124756, "gen_loss": 0.2812137007713318, "grad_norm": 0.36322174387582173, "learning_rate": 2.5294736842105264e-05, "loss": 0.3063, "mean_copy_accuracy": 0.995439738035202, "mean_gen_accuracy": 0.8657924085855484, "mean_token_accuracy": 0.8992322087287903, "num_tokens": 124909931.0, "sample_num_tokens": 8488.75, "step": 4461, "total_num_tokens": 124943886.0, "z_loss": 0.0005379565991461277 }, { "copy_logits_max": -1.73692786693573, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.0, "epoch": 0.9113096757722747, "gen_logits_max": 4.983216762542725, "gen_logits_mean": -13.386516571044922, "gen_logits_min": -24.93482208251953, "gen_logits_std": 2.749523162841797, "gen_loss": 0.3130947947502136, "grad_norm": 0.3749663818930869, "learning_rate": 2.529347368421053e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9963717460632324, "mean_gen_accuracy": 0.8621632307767868, "mean_token_accuracy": 0.9029106497764587, "num_tokens": 125192118.0, "sample_num_tokens": 8540.5, "step": 4462, "total_num_tokens": 125226280.0, "z_loss": 0.0006144462386146188 }, { "copy_logits_max": -2.777670383453369, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.8125, "epoch": 0.9115139137094715, "gen_logits_max": 4.132543563842773, "gen_logits_mean": -14.88422966003418, "gen_logits_min": -27.216285705566406, "gen_logits_std": 2.8273770809173584, "gen_loss": 0.2996150851249695, "grad_norm": 0.40002802341930216, "learning_rate": 2.529221052631579e-05, "loss": 0.3137, "mean_copy_accuracy": 0.9951879233121872, "mean_gen_accuracy": 0.8665373176336288, "mean_token_accuracy": 0.8967745304107666, "num_tokens": 125447990.0, "sample_num_tokens": 8592.0, "step": 4463, "total_num_tokens": 125482358.0, "z_loss": 0.000571644282899797 }, { "copy_logits_max": -2.227280855178833, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.5, "epoch": 0.9117181516466684, "gen_logits_max": 4.346536636352539, "gen_logits_mean": -14.286552429199219, "gen_logits_min": -25.755435943603516, "gen_logits_std": 2.7890686988830566, "gen_loss": 0.3088732361793518, "grad_norm": 0.3702494326614734, "learning_rate": 2.5290947368421054e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9958535134792328, "mean_gen_accuracy": 0.8661084771156311, "mean_token_accuracy": 0.8975480049848557, "num_tokens": 125708393.0, "sample_num_tokens": 9009.75, "step": 4464, "total_num_tokens": 125744432.0, "z_loss": 0.0005361852236092091 }, { "copy_logits_max": -2.891265392303467, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.3125, "epoch": 0.9119223895838652, "gen_logits_max": 4.623842239379883, "gen_logits_mean": -14.891237258911133, "gen_logits_min": -27.047332763671875, "gen_logits_std": 2.8398256301879883, "gen_loss": 0.3121876120567322, "grad_norm": 0.3932096481775099, "learning_rate": 2.5289684210526314e-05, "loss": 0.3065, "mean_copy_accuracy": 0.9958145469427109, "mean_gen_accuracy": 0.8669203072786331, "mean_token_accuracy": 0.8980259448289871, "num_tokens": 125987624.0, "sample_num_tokens": 7309.5, "step": 4465, "total_num_tokens": 126016862.0, "z_loss": 0.0005925564328208566 }, { "copy_logits_max": -1.7385756969451904, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.3125, "epoch": 0.912126627521062, "gen_logits_max": 5.2454352378845215, "gen_logits_mean": -13.846303939819336, "gen_logits_min": -26.54401397705078, "gen_logits_std": 2.8054633140563965, "gen_loss": 0.28224653005599976, "grad_norm": 0.4214570166520802, "learning_rate": 2.5288421052631582e-05, "loss": 0.3015, "mean_copy_accuracy": 0.9951479732990265, "mean_gen_accuracy": 0.8701268434524536, "mean_token_accuracy": 0.8990006297826767, "num_tokens": 126233402.0, "sample_num_tokens": 9241.0, "step": 4466, "total_num_tokens": 126270366.0, "z_loss": 0.000545489601790905 }, { "copy_logits_max": -5.0652756690979, "copy_logits_min": -687500032.0, "copy_num_tokens": 320.0625, "epoch": 0.9123308654582589, "gen_logits_max": 5.25709867477417, "gen_logits_mean": -14.562657356262207, "gen_logits_min": -26.064329147338867, "gen_logits_std": 2.7728090286254883, "gen_loss": 0.3557090759277344, "grad_norm": 0.3682298031201161, "learning_rate": 2.5287157894736843e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9958646148443222, "mean_gen_accuracy": 0.869578018784523, "mean_token_accuracy": 0.9022179841995239, "num_tokens": 126525575.0, "sample_num_tokens": 7837.75, "step": 4467, "total_num_tokens": 126556926.0, "z_loss": 0.0006020126165822148 }, { "copy_logits_max": -2.126239776611328, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.9375, "epoch": 0.9125351033954557, "gen_logits_max": 4.843039512634277, "gen_logits_mean": -14.34371566772461, "gen_logits_min": -25.984779357910156, "gen_logits_std": 2.831153392791748, "gen_loss": 0.34957414865493774, "grad_norm": 0.3962624674783485, "learning_rate": 2.5285894736842107e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9937542229890823, "mean_gen_accuracy": 0.8675085604190826, "mean_token_accuracy": 0.8978139460086823, "num_tokens": 126803876.0, "sample_num_tokens": 9231.0, "step": 4468, "total_num_tokens": 126840800.0, "z_loss": 0.0006801409181207418 }, { "copy_logits_max": -3.909646987915039, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.6875, "epoch": 0.9127393413326526, "gen_logits_max": 4.930824279785156, "gen_logits_mean": -15.121566772460938, "gen_logits_min": -26.717342376708984, "gen_logits_std": 2.827751636505127, "gen_loss": 0.31298118829727173, "grad_norm": 0.37853479748778035, "learning_rate": 2.528463157894737e-05, "loss": 0.3267, "mean_copy_accuracy": 0.996345266699791, "mean_gen_accuracy": 0.8619842529296875, "mean_token_accuracy": 0.8924630135297775, "num_tokens": 127074599.0, "sample_num_tokens": 9285.75, "step": 4469, "total_num_tokens": 127111742.0, "z_loss": 0.000611937721259892 }, { "copy_logits_max": -4.014162063598633, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.3125, "epoch": 0.9129435792698494, "gen_logits_max": 4.309179306030273, "gen_logits_mean": -15.546138763427734, "gen_logits_min": -27.229602813720703, "gen_logits_std": 2.8607940673828125, "gen_loss": 0.27379530668258667, "grad_norm": 0.4729376928739333, "learning_rate": 2.5283368421052633e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9955754578113556, "mean_gen_accuracy": 0.8716379702091217, "mean_token_accuracy": 0.9014341980218887, "num_tokens": 127347780.0, "sample_num_tokens": 7952.0, "step": 4470, "total_num_tokens": 127379588.0, "z_loss": 0.0006196500035002828 }, { "copy_logits_max": -1.5219991207122803, "copy_logits_min": -687500032.0, "copy_num_tokens": 528.0, "epoch": 0.9131478172070462, "gen_logits_max": 4.532922267913818, "gen_logits_mean": -15.185731887817383, "gen_logits_min": -27.142908096313477, "gen_logits_std": 2.867469310760498, "gen_loss": 0.3185421824455261, "grad_norm": 0.3830274192519597, "learning_rate": 2.5282105263157894e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9967318028211594, "mean_gen_accuracy": 0.8603181391954422, "mean_token_accuracy": 0.9023981988430023, "num_tokens": 127637403.0, "sample_num_tokens": 8281.75, "step": 4471, "total_num_tokens": 127670530.0, "z_loss": 0.0006953349220566452 }, { "copy_logits_max": -2.60266375541687, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.875, "epoch": 0.913352055144243, "gen_logits_max": 4.509513854980469, "gen_logits_mean": -14.761075019836426, "gen_logits_min": -26.65115737915039, "gen_logits_std": 2.8176519870758057, "gen_loss": 0.2890411913394928, "grad_norm": 0.3955701586139857, "learning_rate": 2.5280842105263158e-05, "loss": 0.2921, "mean_copy_accuracy": 0.994989201426506, "mean_gen_accuracy": 0.8717553615570068, "mean_token_accuracy": 0.9026801437139511, "num_tokens": 127895595.0, "sample_num_tokens": 8347.75, "step": 4472, "total_num_tokens": 127928986.0, "z_loss": 0.0006021201843395829 }, { "copy_logits_max": -5.499551296234131, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.6875, "epoch": 0.9135562930814399, "gen_logits_max": 4.331555366516113, "gen_logits_mean": -15.018583297729492, "gen_logits_min": -26.683034896850586, "gen_logits_std": 2.824453830718994, "gen_loss": 0.3108194172382355, "grad_norm": 0.41438347008489296, "learning_rate": 2.527957894736842e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9947194308042526, "mean_gen_accuracy": 0.8654645085334778, "mean_token_accuracy": 0.8987214118242264, "num_tokens": 128176115.0, "sample_num_tokens": 9322.75, "step": 4473, "total_num_tokens": 128213406.0, "z_loss": 0.0006072515388950706 }, { "copy_logits_max": -4.037683963775635, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.1875, "epoch": 0.9137605310186367, "gen_logits_max": 4.440550327301025, "gen_logits_mean": -14.661238670349121, "gen_logits_min": -26.19330596923828, "gen_logits_std": 2.8110666275024414, "gen_loss": 0.313309907913208, "grad_norm": 0.4054229389505607, "learning_rate": 2.5278315789473687e-05, "loss": 0.295, "mean_copy_accuracy": 0.995609849691391, "mean_gen_accuracy": 0.8664165735244751, "mean_token_accuracy": 0.9013002216815948, "num_tokens": 128467663.0, "sample_num_tokens": 8146.75, "step": 4474, "total_num_tokens": 128500250.0, "z_loss": 0.0006417928962036967 }, { "copy_logits_max": -2.846407413482666, "copy_logits_min": -687500096.0, "copy_num_tokens": 380.3125, "epoch": 0.9139647689558336, "gen_logits_max": 5.329807281494141, "gen_logits_mean": -14.75566577911377, "gen_logits_min": -26.614261627197266, "gen_logits_std": 2.8256425857543945, "gen_loss": 0.33525824546813965, "grad_norm": 0.37843410978181774, "learning_rate": 2.527705263157895e-05, "loss": 0.3179, "mean_copy_accuracy": 0.9967076629400253, "mean_gen_accuracy": 0.8648853898048401, "mean_token_accuracy": 0.8972231447696686, "num_tokens": 128772737.0, "sample_num_tokens": 7453.75, "step": 4475, "total_num_tokens": 128802552.0, "z_loss": 0.0006621829816140234 }, { "copy_logits_max": -3.5899996757507324, "copy_logits_min": -750000128.0, "copy_num_tokens": 485.625, "epoch": 0.9141690068930304, "gen_logits_max": 3.8095834255218506, "gen_logits_mean": -15.815176010131836, "gen_logits_min": -27.82529640197754, "gen_logits_std": 2.8599846363067627, "gen_loss": 0.27701008319854736, "grad_norm": 0.38293878477807214, "learning_rate": 2.5275789473684212e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9969350844621658, "mean_gen_accuracy": 0.8721872270107269, "mean_token_accuracy": 0.907408595085144, "num_tokens": 129051458.0, "sample_num_tokens": 7929.0, "step": 4476, "total_num_tokens": 129083174.0, "z_loss": 0.0005433066980913281 }, { "copy_logits_max": -4.621549129486084, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.1875, "epoch": 0.9143732448302272, "gen_logits_max": 5.802934646606445, "gen_logits_mean": -12.114801406860352, "gen_logits_min": -24.092693328857422, "gen_logits_std": 2.8308639526367188, "gen_loss": 0.3111960291862488, "grad_norm": 0.3886273351554728, "learning_rate": 2.5274526315789476e-05, "loss": 0.3083, "mean_copy_accuracy": 0.9957270473241806, "mean_gen_accuracy": 0.8693028390407562, "mean_token_accuracy": 0.8980046063661575, "num_tokens": 129329407.0, "sample_num_tokens": 8159.25, "step": 4477, "total_num_tokens": 129362044.0, "z_loss": 0.0005889484891667962 }, { "copy_logits_max": -4.605936050415039, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.375, "epoch": 0.914577482767424, "gen_logits_max": 4.153802871704102, "gen_logits_mean": -15.09501838684082, "gen_logits_min": -27.05210304260254, "gen_logits_std": 2.8236913681030273, "gen_loss": 0.325980007648468, "grad_norm": 0.4762389583542729, "learning_rate": 2.5273263157894737e-05, "loss": 0.3049, "mean_copy_accuracy": 0.9951538294553757, "mean_gen_accuracy": 0.8628426641225815, "mean_token_accuracy": 0.899937629699707, "num_tokens": 129606179.0, "sample_num_tokens": 8066.25, "step": 4478, "total_num_tokens": 129638444.0, "z_loss": 0.0006273940089158714 }, { "copy_logits_max": -2.328692674636841, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0, "epoch": 0.9147817207046209, "gen_logits_max": 6.195350646972656, "gen_logits_mean": -12.81981086730957, "gen_logits_min": -24.522232055664062, "gen_logits_std": 2.8168632984161377, "gen_loss": 0.2904788553714752, "grad_norm": 0.3957899662743913, "learning_rate": 2.5272e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9956643432378769, "mean_gen_accuracy": 0.8619141280651093, "mean_token_accuracy": 0.8948595821857452, "num_tokens": 129885451.0, "sample_num_tokens": 8667.75, "step": 4479, "total_num_tokens": 129920122.0, "z_loss": 0.0006234547472558916 }, { "copy_logits_max": -2.1485400199890137, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.25, "epoch": 0.9149859586418178, "gen_logits_max": 4.307558536529541, "gen_logits_mean": -15.23169231414795, "gen_logits_min": -27.212141036987305, "gen_logits_std": 2.8309097290039062, "gen_loss": 0.29745256900787354, "grad_norm": 0.4276343833123587, "learning_rate": 2.5270736842105262e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9959230422973633, "mean_gen_accuracy": 0.8685201406478882, "mean_token_accuracy": 0.8994831740856171, "num_tokens": 130152520.0, "sample_num_tokens": 8447.5, "step": 4480, "total_num_tokens": 130186310.0, "z_loss": 0.0007851726841181517 }, { "copy_logits_max": -3.7055740356445312, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.8125, "epoch": 0.9151901965790146, "gen_logits_max": 3.8030710220336914, "gen_logits_mean": -15.929080963134766, "gen_logits_min": -27.699140548706055, "gen_logits_std": 2.8373541831970215, "gen_loss": 0.28031766414642334, "grad_norm": 0.35925412628105047, "learning_rate": 2.5269473684210527e-05, "loss": 0.2897, "mean_copy_accuracy": 0.99668750166893, "mean_gen_accuracy": 0.8703446984291077, "mean_token_accuracy": 0.9024884700775146, "num_tokens": 130425911.0, "sample_num_tokens": 8454.75, "step": 4481, "total_num_tokens": 130459730.0, "z_loss": 0.0006831556675024331 }, { "copy_logits_max": -3.661663293838501, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.9375, "epoch": 0.9153944345162114, "gen_logits_max": 4.199298858642578, "gen_logits_mean": -15.452759742736816, "gen_logits_min": -27.037900924682617, "gen_logits_std": 2.8166513442993164, "gen_loss": 0.2547041177749634, "grad_norm": 0.3903079998345857, "learning_rate": 2.526821052631579e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9945388436317444, "mean_gen_accuracy": 0.8735856413841248, "mean_token_accuracy": 0.9033655971288681, "num_tokens": 130708745.0, "sample_num_tokens": 8327.75, "step": 4482, "total_num_tokens": 130742056.0, "z_loss": 0.0007103158859536052 }, { "copy_logits_max": -3.7850141525268555, "copy_logits_min": -750000064.0, "copy_num_tokens": 632.3125, "epoch": 0.9155986724534082, "gen_logits_max": 3.5278449058532715, "gen_logits_mean": -16.9601993560791, "gen_logits_min": -28.482810974121094, "gen_logits_std": 2.868469715118408, "gen_loss": 0.313626229763031, "grad_norm": 0.3932256107091207, "learning_rate": 2.5266947368421055e-05, "loss": 0.3001, "mean_copy_accuracy": 0.995690181851387, "mean_gen_accuracy": 0.8714398741722107, "mean_token_accuracy": 0.9012233763933182, "num_tokens": 130994806.0, "sample_num_tokens": 10146.0, "step": 4483, "total_num_tokens": 131035390.0, "z_loss": 0.0006833573570474982 }, { "copy_logits_max": -5.468379020690918, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.0625, "epoch": 0.915802910390605, "gen_logits_max": 5.380362510681152, "gen_logits_mean": -14.834728240966797, "gen_logits_min": -26.248661041259766, "gen_logits_std": 2.825636386871338, "gen_loss": 0.28095322847366333, "grad_norm": 0.4019563141717585, "learning_rate": 2.5265684210526316e-05, "loss": 0.3135, "mean_copy_accuracy": 0.9950264245271683, "mean_gen_accuracy": 0.8674268871545792, "mean_token_accuracy": 0.8968845307826996, "num_tokens": 131271928.0, "sample_num_tokens": 6775.0, "step": 4484, "total_num_tokens": 131299028.0, "z_loss": 0.0005790995201095939 }, { "copy_logits_max": -4.30789041519165, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.0, "epoch": 0.9160071483278018, "gen_logits_max": 5.2804856300354, "gen_logits_mean": -15.37556266784668, "gen_logits_min": -26.524093627929688, "gen_logits_std": 2.8389852046966553, "gen_loss": 0.32358384132385254, "grad_norm": 0.40935988404324075, "learning_rate": 2.526442105263158e-05, "loss": 0.3221, "mean_copy_accuracy": 0.9954899698495865, "mean_gen_accuracy": 0.8629116415977478, "mean_token_accuracy": 0.894109457731247, "num_tokens": 131544244.0, "sample_num_tokens": 9286.5, "step": 4485, "total_num_tokens": 131581390.0, "z_loss": 0.0006334183271974325 }, { "copy_logits_max": -4.825528144836426, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.0, "epoch": 0.9162113862649988, "gen_logits_max": 4.307665824890137, "gen_logits_mean": -14.719595909118652, "gen_logits_min": -26.46849822998047, "gen_logits_std": 2.8452303409576416, "gen_loss": 0.2638121247291565, "grad_norm": 0.36887151122303025, "learning_rate": 2.526315789473684e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9951227009296417, "mean_gen_accuracy": 0.8716583251953125, "mean_token_accuracy": 0.9035550355911255, "num_tokens": 131819515.0, "sample_num_tokens": 8564.25, "step": 4486, "total_num_tokens": 131853772.0, "z_loss": 0.0005245180800557137 }, { "copy_logits_max": -4.454463481903076, "copy_logits_min": -687500032.0, "copy_num_tokens": 532.125, "epoch": 0.9164156242021956, "gen_logits_max": 4.5195441246032715, "gen_logits_mean": -14.657137870788574, "gen_logits_min": -26.392452239990234, "gen_logits_std": 2.810349702835083, "gen_loss": 0.2959146797657013, "grad_norm": 0.3889069040210131, "learning_rate": 2.5261894736842106e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9948640018701553, "mean_gen_accuracy": 0.8655398786067963, "mean_token_accuracy": 0.8951630145311356, "num_tokens": 132073828.0, "sample_num_tokens": 8759.5, "step": 4487, "total_num_tokens": 132108866.0, "z_loss": 0.0005723880603909492 }, { "copy_logits_max": -5.295719146728516, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.75, "epoch": 0.9166198621393924, "gen_logits_max": 5.409980773925781, "gen_logits_mean": -13.878705024719238, "gen_logits_min": -25.6436710357666, "gen_logits_std": 2.82827091217041, "gen_loss": 0.3532339334487915, "grad_norm": 0.3876117778004864, "learning_rate": 2.5260631578947367e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9959055334329605, "mean_gen_accuracy": 0.8621229529380798, "mean_token_accuracy": 0.8959843516349792, "num_tokens": 132363113.0, "sample_num_tokens": 8569.25, "step": 4488, "total_num_tokens": 132397390.0, "z_loss": 0.0006510917446576059 }, { "copy_logits_max": -7.196728706359863, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.5, "epoch": 0.9168241000765892, "gen_logits_max": 4.418956279754639, "gen_logits_mean": -15.02866268157959, "gen_logits_min": -26.54541778564453, "gen_logits_std": 2.804948091506958, "gen_loss": 0.3053489923477173, "grad_norm": 0.4256908235921801, "learning_rate": 2.525936842105263e-05, "loss": 0.3244, "mean_copy_accuracy": 0.9954694360494614, "mean_gen_accuracy": 0.8606604039669037, "mean_token_accuracy": 0.889209657907486, "num_tokens": 132620818.0, "sample_num_tokens": 9241.5, "step": 4489, "total_num_tokens": 132657784.0, "z_loss": 0.000576523132622242 }, { "copy_logits_max": -4.551671028137207, "copy_logits_min": -750000064.0, "copy_num_tokens": 411.0625, "epoch": 0.917028338013786, "gen_logits_max": 5.90787410736084, "gen_logits_mean": -13.13988208770752, "gen_logits_min": -25.44777488708496, "gen_logits_std": 2.8384997844696045, "gen_loss": 0.3020434081554413, "grad_norm": 0.38522687991908827, "learning_rate": 2.5258105263157895e-05, "loss": 0.3368, "mean_copy_accuracy": 0.9944263696670532, "mean_gen_accuracy": 0.8562650978565216, "mean_token_accuracy": 0.8878289759159088, "num_tokens": 132899141.0, "sample_num_tokens": 8173.25, "step": 4490, "total_num_tokens": 132931834.0, "z_loss": 0.0006146945524960756 }, { "copy_logits_max": -4.943910598754883, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.75, "epoch": 0.9172325759509828, "gen_logits_max": 4.746106147766113, "gen_logits_mean": -15.158419609069824, "gen_logits_min": -26.86168098449707, "gen_logits_std": 2.858938455581665, "gen_loss": 0.3003929853439331, "grad_norm": 0.42701678186979264, "learning_rate": 2.525684210526316e-05, "loss": 0.288, "mean_copy_accuracy": 0.9954525232315063, "mean_gen_accuracy": 0.8718432784080505, "mean_token_accuracy": 0.9028838127851486, "num_tokens": 133170937.0, "sample_num_tokens": 6538.25, "step": 4491, "total_num_tokens": 133197090.0, "z_loss": 0.0005659431335516274 }, { "copy_logits_max": -3.5969536304473877, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.5625, "epoch": 0.9174368138881798, "gen_logits_max": 4.676634311676025, "gen_logits_mean": -14.487005233764648, "gen_logits_min": -26.10110092163086, "gen_logits_std": 2.8403377532958984, "gen_loss": 0.28253328800201416, "grad_norm": 0.4199432152396347, "learning_rate": 2.5255578947368424e-05, "loss": 0.3152, "mean_copy_accuracy": 0.9952262789011002, "mean_gen_accuracy": 0.8629585802555084, "mean_token_accuracy": 0.8955184072256088, "num_tokens": 133427052.0, "sample_num_tokens": 7994.5, "step": 4492, "total_num_tokens": 133459030.0, "z_loss": 0.0006018680287525058 }, { "copy_logits_max": -4.017821311950684, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.1875, "epoch": 0.9176410518253766, "gen_logits_max": 4.763267517089844, "gen_logits_mean": -14.940690040588379, "gen_logits_min": -27.298229217529297, "gen_logits_std": 2.8738536834716797, "gen_loss": 0.2639318108558655, "grad_norm": 0.45091937927647485, "learning_rate": 2.5254315789473685e-05, "loss": 0.3019, "mean_copy_accuracy": 0.993930920958519, "mean_gen_accuracy": 0.8702357858419418, "mean_token_accuracy": 0.9001345336437225, "num_tokens": 133689523.0, "sample_num_tokens": 8958.75, "step": 4493, "total_num_tokens": 133725358.0, "z_loss": 0.000546478433534503 }, { "copy_logits_max": -5.83367919921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.5, "epoch": 0.9178452897625734, "gen_logits_max": 5.751703262329102, "gen_logits_mean": -14.44955825805664, "gen_logits_min": -25.822898864746094, "gen_logits_std": 2.8235864639282227, "gen_loss": 0.36654937267303467, "grad_norm": 0.393129224439877, "learning_rate": 2.525305263157895e-05, "loss": 0.3318, "mean_copy_accuracy": 0.9968738108873367, "mean_gen_accuracy": 0.8576139360666275, "mean_token_accuracy": 0.8905455470085144, "num_tokens": 133971387.0, "sample_num_tokens": 7739.75, "step": 4494, "total_num_tokens": 134002346.0, "z_loss": 0.0006915719714015722 }, { "copy_logits_max": -3.0091540813446045, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.3125, "epoch": 0.9180495276997702, "gen_logits_max": 4.911700248718262, "gen_logits_mean": -14.454008102416992, "gen_logits_min": -26.10306167602539, "gen_logits_std": 2.8370659351348877, "gen_loss": 0.309792160987854, "grad_norm": 0.44104336026955515, "learning_rate": 2.525178947368421e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9952680170536041, "mean_gen_accuracy": 0.8714071959257126, "mean_token_accuracy": 0.9030119478702545, "num_tokens": 134229175.0, "sample_num_tokens": 7547.25, "step": 4495, "total_num_tokens": 134259364.0, "z_loss": 0.0006176346796564758 }, { "copy_logits_max": -1.5453171730041504, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.0625, "epoch": 0.918253765636967, "gen_logits_max": 4.83999490737915, "gen_logits_mean": -14.743515014648438, "gen_logits_min": -26.551250457763672, "gen_logits_std": 2.8536555767059326, "gen_loss": 0.29523298144340515, "grad_norm": 0.39998525210704167, "learning_rate": 2.5250526315789474e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9962552785873413, "mean_gen_accuracy": 0.8665775954723358, "mean_token_accuracy": 0.8999188542366028, "num_tokens": 134484066.0, "sample_num_tokens": 8772.5, "step": 4496, "total_num_tokens": 134519156.0, "z_loss": 0.0006387649336829782 }, { "copy_logits_max": -0.6719802021980286, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.0, "epoch": 0.9184580035741639, "gen_logits_max": 6.3101887702941895, "gen_logits_mean": -13.217107772827148, "gen_logits_min": -24.89907455444336, "gen_logits_std": 2.8257181644439697, "gen_loss": 0.3549085259437561, "grad_norm": 0.37962341924784115, "learning_rate": 2.5249263157894735e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9959961473941803, "mean_gen_accuracy": 0.8616283088922501, "mean_token_accuracy": 0.8965732753276825, "num_tokens": 134771809.0, "sample_num_tokens": 8545.25, "step": 4497, "total_num_tokens": 134805990.0, "z_loss": 0.0007743545575067401 }, { "copy_logits_max": -4.589419364929199, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.3125, "epoch": 0.9186622415113608, "gen_logits_max": 4.621725082397461, "gen_logits_mean": -15.769671440124512, "gen_logits_min": -27.20627784729004, "gen_logits_std": 2.821773052215576, "gen_loss": 0.32511839270591736, "grad_norm": 0.7123038009564818, "learning_rate": 2.5248e-05, "loss": 0.2977, "mean_copy_accuracy": 0.9940605461597443, "mean_gen_accuracy": 0.8712923228740692, "mean_token_accuracy": 0.9009789079427719, "num_tokens": 135034163.0, "sample_num_tokens": 7720.25, "step": 4498, "total_num_tokens": 135065044.0, "z_loss": 0.0006611648132093251 }, { "copy_logits_max": -3.656181812286377, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.25, "epoch": 0.9188664794485576, "gen_logits_max": 4.733414173126221, "gen_logits_mean": -14.405738830566406, "gen_logits_min": -26.491025924682617, "gen_logits_std": 2.879641056060791, "gen_loss": 0.2850497364997864, "grad_norm": 0.3942686590334887, "learning_rate": 2.5246736842105264e-05, "loss": 0.3178, "mean_copy_accuracy": 0.9952386617660522, "mean_gen_accuracy": 0.8623060137033463, "mean_token_accuracy": 0.8934758603572845, "num_tokens": 135295683.0, "sample_num_tokens": 7873.75, "step": 4499, "total_num_tokens": 135327178.0, "z_loss": 0.0006351840565912426 }, { "epoch": 0.9190707173857544, "grad_norm": 0.3596254280624981, "learning_rate": 2.5245473684210528e-05, "loss": 0.28, "step": 4500 }, { "epoch": 0.9190707173857544, "eval_copy_logits_max": -7.015992641448975, "eval_copy_logits_min": -78.43595123291016, "eval_gen_logits_max": 4.046878337860107, "eval_gen_logits_mean": -19.27268409729004, "eval_gen_logits_min": -30.041702270507812, "eval_gen_logits_std": 2.822441339492798, "eval_gen_loss": 0.33437812328338623, "eval_loss": 0.32087966799736023, "eval_mean_copy_accuracy": 0.9894402921199799, "eval_mean_gen_accuracy": 0.874811977148056, "eval_mean_token_accuracy": 0.8895826637744904, "eval_num_tokens": 135614586.0, "eval_runtime": 0.7952, "eval_samples_per_second": 10.061, "eval_steps_per_second": 2.515, "eval_total_num_tokens": 135614586.0, "eval_z_loss": 0.000655469368211925, "step": 4500 }, { "copy_logits_max": -5.927072525024414, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.9375, "epoch": 0.9192749553229512, "gen_logits_max": 5.451438903808594, "gen_logits_mean": -14.833008766174316, "gen_logits_min": -26.630966186523438, "gen_logits_std": 2.8215761184692383, "gen_loss": 0.30169379711151123, "grad_norm": 0.46699944444814306, "learning_rate": 2.524421052631579e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9949117824435234, "mean_gen_accuracy": 0.8697308376431465, "mean_token_accuracy": 0.9024884924292564, "num_tokens": 135845569.0, "sample_num_tokens": 8034.75, "step": 4501, "total_num_tokens": 135877708.0, "z_loss": 0.0006338362582027912 }, { "copy_logits_max": -3.801380157470703, "copy_logits_min": -750000000.0, "copy_num_tokens": 642.5, "epoch": 0.919479193260148, "gen_logits_max": 5.246857166290283, "gen_logits_mean": -13.181587219238281, "gen_logits_min": -25.4929141998291, "gen_logits_std": 2.8606157302856445, "gen_loss": 0.27395346760749817, "grad_norm": 0.3887909011404541, "learning_rate": 2.5242947368421053e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9955107122659683, "mean_gen_accuracy": 0.8645848482847214, "mean_token_accuracy": 0.8995370715856552, "num_tokens": 136127697.0, "sample_num_tokens": 9568.75, "step": 4502, "total_num_tokens": 136165972.0, "z_loss": 0.0005571596557274461 }, { "copy_logits_max": -1.6372361183166504, "copy_logits_min": -750000064.0, "copy_num_tokens": 456.5, "epoch": 0.9196834311973449, "gen_logits_max": 4.929730415344238, "gen_logits_mean": -13.73548698425293, "gen_logits_min": -26.043468475341797, "gen_logits_std": 2.8719229698181152, "gen_loss": 0.29183274507522583, "grad_norm": 0.3686682914654496, "learning_rate": 2.5241684210526318e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9964326620101929, "mean_gen_accuracy": 0.8635261803865433, "mean_token_accuracy": 0.8973385244607925, "num_tokens": 136401494.0, "sample_num_tokens": 7083.5, "step": 4503, "total_num_tokens": 136429828.0, "z_loss": 0.0005869250744581223 }, { "copy_logits_max": -0.8401088118553162, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.4375, "epoch": 0.9198876691345418, "gen_logits_max": 6.025119781494141, "gen_logits_mean": -11.460494995117188, "gen_logits_min": -23.736066818237305, "gen_logits_std": 2.842536449432373, "gen_loss": 0.29837289452552795, "grad_norm": 0.38743262152312435, "learning_rate": 2.524042105263158e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9959289878606796, "mean_gen_accuracy": 0.8612529933452606, "mean_token_accuracy": 0.8938775807619095, "num_tokens": 136667443.0, "sample_num_tokens": 8798.25, "step": 4504, "total_num_tokens": 136702636.0, "z_loss": 0.0006539878668263555 }, { "copy_logits_max": -3.55322265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.875, "epoch": 0.9200919070717386, "gen_logits_max": 5.1959686279296875, "gen_logits_mean": -13.992856979370117, "gen_logits_min": -26.060897827148438, "gen_logits_std": 2.8397293090820312, "gen_loss": 0.31809353828430176, "grad_norm": 0.3844106779218706, "learning_rate": 2.5239157894736843e-05, "loss": 0.306, "mean_copy_accuracy": 0.9946451038122177, "mean_gen_accuracy": 0.8711647391319275, "mean_token_accuracy": 0.8971574902534485, "num_tokens": 136932208.0, "sample_num_tokens": 8122.0, "step": 4505, "total_num_tokens": 136964696.0, "z_loss": 0.000668682623654604 }, { "copy_logits_max": -4.113696098327637, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.75, "epoch": 0.9202961450089354, "gen_logits_max": 4.84830379486084, "gen_logits_mean": -14.61815357208252, "gen_logits_min": -26.671384811401367, "gen_logits_std": 2.8671510219573975, "gen_loss": 0.26850801706314087, "grad_norm": 0.39553047907874694, "learning_rate": 2.5237894736842104e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9949522167444229, "mean_gen_accuracy": 0.8744017630815506, "mean_token_accuracy": 0.9031171351671219, "num_tokens": 137193058.0, "sample_num_tokens": 7741.5, "step": 4506, "total_num_tokens": 137224024.0, "z_loss": 0.0006204117671586573 }, { "copy_logits_max": -1.496551752090454, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.0, "epoch": 0.9205003829461322, "gen_logits_max": 3.221079111099243, "gen_logits_mean": -16.00436019897461, "gen_logits_min": -28.162677764892578, "gen_logits_std": 2.909562587738037, "gen_loss": 0.27584969997406006, "grad_norm": 0.43768919578585364, "learning_rate": 2.523663157894737e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9948913156986237, "mean_gen_accuracy": 0.8600258678197861, "mean_token_accuracy": 0.8953760713338852, "num_tokens": 137450118.0, "sample_num_tokens": 9317.5, "step": 4507, "total_num_tokens": 137487388.0, "z_loss": 0.0006039204890839756 }, { "copy_logits_max": -3.9908957481384277, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.5625, "epoch": 0.920704620883329, "gen_logits_max": 5.20863676071167, "gen_logits_mean": -13.524433135986328, "gen_logits_min": -25.896892547607422, "gen_logits_std": 2.79986834526062, "gen_loss": 0.3154100775718689, "grad_norm": 0.38740075645523736, "learning_rate": 2.5235368421052632e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9956246763467789, "mean_gen_accuracy": 0.8705793768167496, "mean_token_accuracy": 0.9020483046770096, "num_tokens": 137738768.0, "sample_num_tokens": 8661.0, "step": 4508, "total_num_tokens": 137773412.0, "z_loss": 0.0006436888943426311 }, { "copy_logits_max": -2.0760715007781982, "copy_logits_min": -687500032.0, "copy_num_tokens": 617.375, "epoch": 0.9209088588205259, "gen_logits_max": 4.273597717285156, "gen_logits_mean": -15.454448699951172, "gen_logits_min": -27.880023956298828, "gen_logits_std": 2.871830940246582, "gen_loss": 0.26310068368911743, "grad_norm": 1.6688674812646052, "learning_rate": 2.5234105263157897e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9944973737001419, "mean_gen_accuracy": 0.8600860983133316, "mean_token_accuracy": 0.8973973095417023, "num_tokens": 138020094.0, "sample_num_tokens": 9705.5, "step": 4509, "total_num_tokens": 138058916.0, "z_loss": 0.0006253707106225193 }, { "copy_logits_max": -5.388906478881836, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.75, "epoch": 0.9211130967577228, "gen_logits_max": 4.49261474609375, "gen_logits_mean": -14.557769775390625, "gen_logits_min": -26.849939346313477, "gen_logits_std": 2.847090244293213, "gen_loss": 0.2972050905227661, "grad_norm": 0.3809037041302, "learning_rate": 2.5232842105263158e-05, "loss": 0.3146, "mean_copy_accuracy": 0.9944670051336288, "mean_gen_accuracy": 0.8661581873893738, "mean_token_accuracy": 0.8950825035572052, "num_tokens": 138279285.0, "sample_num_tokens": 8333.75, "step": 4510, "total_num_tokens": 138312620.0, "z_loss": 0.0006355961668305099 }, { "copy_logits_max": -2.624199867248535, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.625, "epoch": 0.9213173346949196, "gen_logits_max": 5.766770362854004, "gen_logits_mean": -12.402568817138672, "gen_logits_min": -24.640962600708008, "gen_logits_std": 2.698484420776367, "gen_loss": 0.30772650241851807, "grad_norm": 0.3731612293519985, "learning_rate": 2.5231578947368422e-05, "loss": 0.292, "mean_copy_accuracy": 0.9952168464660645, "mean_gen_accuracy": 0.8705294728279114, "mean_token_accuracy": 0.9033151119947433, "num_tokens": 138562297.0, "sample_num_tokens": 7415.25, "step": 4511, "total_num_tokens": 138591958.0, "z_loss": 0.0007482180371880531 }, { "copy_logits_max": -4.130955219268799, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.4375, "epoch": 0.9215215726321164, "gen_logits_max": 4.000328063964844, "gen_logits_mean": -15.284011840820312, "gen_logits_min": -27.559003829956055, "gen_logits_std": 2.825815200805664, "gen_loss": 0.2696988582611084, "grad_norm": 0.37669309771725806, "learning_rate": 2.5230315789473683e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9956885129213333, "mean_gen_accuracy": 0.8721951842308044, "mean_token_accuracy": 0.9035703539848328, "num_tokens": 138833408.0, "sample_num_tokens": 8800.0, "step": 4512, "total_num_tokens": 138868608.0, "z_loss": 0.0005655136774294078 }, { "copy_logits_max": -2.5620508193969727, "copy_logits_min": -687500032.0, "copy_num_tokens": 632.25, "epoch": 0.9217258105693132, "gen_logits_max": 4.868975639343262, "gen_logits_mean": -14.840678215026855, "gen_logits_min": -26.974044799804688, "gen_logits_std": 2.868544578552246, "gen_loss": 0.31234613060951233, "grad_norm": 0.42279267372090573, "learning_rate": 2.5229052631578947e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9951653331518173, "mean_gen_accuracy": 0.8686645179986954, "mean_token_accuracy": 0.9005877673625946, "num_tokens": 139112928.0, "sample_num_tokens": 9709.5, "step": 4513, "total_num_tokens": 139151766.0, "z_loss": 0.0007030009292066097 }, { "copy_logits_max": -3.3998448848724365, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.8125, "epoch": 0.9219300485065101, "gen_logits_max": 4.631839275360107, "gen_logits_mean": -13.818891525268555, "gen_logits_min": -25.46902084350586, "gen_logits_std": 2.8303024768829346, "gen_loss": 0.2862011790275574, "grad_norm": 0.383063682971861, "learning_rate": 2.5227789473684208e-05, "loss": 0.3006, "mean_copy_accuracy": 0.9945070892572403, "mean_gen_accuracy": 0.8689574599266052, "mean_token_accuracy": 0.8972470909357071, "num_tokens": 139371687.0, "sample_num_tokens": 8374.75, "step": 4514, "total_num_tokens": 139405186.0, "z_loss": 0.0006401303689926863 }, { "copy_logits_max": -6.42671012878418, "copy_logits_min": -750000000.0, "copy_num_tokens": 285.6875, "epoch": 0.9221342864437069, "gen_logits_max": 6.104698181152344, "gen_logits_mean": -13.733736991882324, "gen_logits_min": -25.085025787353516, "gen_logits_std": 2.7779040336608887, "gen_loss": 0.3155559301376343, "grad_norm": 0.3826745744612648, "learning_rate": 2.5226526315789476e-05, "loss": 0.3082, "mean_copy_accuracy": 0.9952435195446014, "mean_gen_accuracy": 0.8654782176017761, "mean_token_accuracy": 0.8978607058525085, "num_tokens": 139642207.0, "sample_num_tokens": 7956.75, "step": 4515, "total_num_tokens": 139674034.0, "z_loss": 0.0006673770258203149 }, { "copy_logits_max": -6.703824996948242, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.875, "epoch": 0.9223385243809038, "gen_logits_max": 4.084866523742676, "gen_logits_mean": -17.091548919677734, "gen_logits_min": -28.685195922851562, "gen_logits_std": 2.8453593254089355, "gen_loss": 0.3480260372161865, "grad_norm": 0.3980542760527968, "learning_rate": 2.522526315789474e-05, "loss": 0.3167, "mean_copy_accuracy": 0.9966269284486771, "mean_gen_accuracy": 0.8634002655744553, "mean_token_accuracy": 0.8962033987045288, "num_tokens": 139913300.0, "sample_num_tokens": 9217.0, "step": 4516, "total_num_tokens": 139950168.0, "z_loss": 0.0006763403653167188 }, { "copy_logits_max": -4.247931480407715, "copy_logits_min": -750000128.0, "copy_num_tokens": 514.375, "epoch": 0.9225427623181006, "gen_logits_max": 4.479070663452148, "gen_logits_mean": -13.725321769714355, "gen_logits_min": -25.538013458251953, "gen_logits_std": 2.790414810180664, "gen_loss": 0.3193041682243347, "grad_norm": 0.3935209861306604, "learning_rate": 2.5224e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9952685534954071, "mean_gen_accuracy": 0.8729526400566101, "mean_token_accuracy": 0.9007578492164612, "num_tokens": 140162913.0, "sample_num_tokens": 8588.75, "step": 4517, "total_num_tokens": 140197268.0, "z_loss": 0.0006444542086683214 }, { "copy_logits_max": -5.220425605773926, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.1875, "epoch": 0.9227470002552974, "gen_logits_max": 3.952791213989258, "gen_logits_mean": -15.575658798217773, "gen_logits_min": -27.382362365722656, "gen_logits_std": 2.874567985534668, "gen_loss": 0.2903004586696625, "grad_norm": 0.3994929578692869, "learning_rate": 2.5222736842105265e-05, "loss": 0.292, "mean_copy_accuracy": 0.99550761282444, "mean_gen_accuracy": 0.8710076659917831, "mean_token_accuracy": 0.9018552452325821, "num_tokens": 140430118.0, "sample_num_tokens": 9488.5, "step": 4518, "total_num_tokens": 140468072.0, "z_loss": 0.0006053433753550053 }, { "copy_logits_max": -6.669845104217529, "copy_logits_min": -687500032.0, "copy_num_tokens": 340.3125, "epoch": 0.9229512381924942, "gen_logits_max": 5.465056419372559, "gen_logits_mean": -14.404936790466309, "gen_logits_min": -26.21347427368164, "gen_logits_std": 2.853503704071045, "gen_loss": 0.31188639998435974, "grad_norm": 0.364785043251543, "learning_rate": 2.5221473684210526e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9957881718873978, "mean_gen_accuracy": 0.8716465979814529, "mean_token_accuracy": 0.9031843394041061, "num_tokens": 140715364.0, "sample_num_tokens": 7555.0, "step": 4519, "total_num_tokens": 140745584.0, "z_loss": 0.0006080951425246894 }, { "copy_logits_max": -4.385710716247559, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.6875, "epoch": 0.9231554761296911, "gen_logits_max": 4.2462615966796875, "gen_logits_mean": -14.400388717651367, "gen_logits_min": -26.245956420898438, "gen_logits_std": 2.8331027030944824, "gen_loss": 0.2673357129096985, "grad_norm": 0.3992979254528002, "learning_rate": 2.522021052631579e-05, "loss": 0.2711, "mean_copy_accuracy": 0.996001586318016, "mean_gen_accuracy": 0.8780519962310791, "mean_token_accuracy": 0.9088086038827896, "num_tokens": 140982537.0, "sample_num_tokens": 9420.25, "step": 4520, "total_num_tokens": 141020218.0, "z_loss": 0.000556999584659934 }, { "copy_logits_max": -5.85450553894043, "copy_logits_min": -625000064.0, "copy_num_tokens": 295.9375, "epoch": 0.9233597140668879, "gen_logits_max": 4.500334739685059, "gen_logits_mean": -16.633316040039062, "gen_logits_min": -27.987464904785156, "gen_logits_std": 2.859466791152954, "gen_loss": 0.30329370498657227, "grad_norm": 0.3868698520596571, "learning_rate": 2.521894736842105e-05, "loss": 0.29, "mean_copy_accuracy": 0.9969495087862015, "mean_gen_accuracy": 0.8741720765829086, "mean_token_accuracy": 0.904514953494072, "num_tokens": 141270491.0, "sample_num_tokens": 6718.25, "step": 4521, "total_num_tokens": 141297364.0, "z_loss": 0.0005978366825729609 }, { "copy_logits_max": -4.511395454406738, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.75, "epoch": 0.9235639520040848, "gen_logits_max": 4.763206958770752, "gen_logits_mean": -15.317778587341309, "gen_logits_min": -27.007022857666016, "gen_logits_std": 2.8623435497283936, "gen_loss": 0.2976982891559601, "grad_norm": 0.38043593774070444, "learning_rate": 2.5217684210526316e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9959472417831421, "mean_gen_accuracy": 0.8674535155296326, "mean_token_accuracy": 0.8949812799692154, "num_tokens": 141532951.0, "sample_num_tokens": 8057.25, "step": 4522, "total_num_tokens": 141565180.0, "z_loss": 0.0005729699041694403 }, { "copy_logits_max": -5.532844066619873, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5625, "epoch": 0.9237681899412816, "gen_logits_max": 4.851903438568115, "gen_logits_mean": -14.542190551757812, "gen_logits_min": -26.16607093811035, "gen_logits_std": 2.827728271484375, "gen_loss": 0.32397276163101196, "grad_norm": 0.4051560219143972, "learning_rate": 2.521642105263158e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9949204325675964, "mean_gen_accuracy": 0.8685411214828491, "mean_token_accuracy": 0.8977943509817123, "num_tokens": 141803767.0, "sample_num_tokens": 7154.25, "step": 4523, "total_num_tokens": 141832384.0, "z_loss": 0.0006426138570532203 }, { "copy_logits_max": -4.896514892578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.125, "epoch": 0.9239724278784784, "gen_logits_max": 5.124339580535889, "gen_logits_mean": -13.87667465209961, "gen_logits_min": -25.531993865966797, "gen_logits_std": 2.847196340560913, "gen_loss": 0.3227449953556061, "grad_norm": 0.3682549309567659, "learning_rate": 2.5215157894736844e-05, "loss": 0.3158, "mean_copy_accuracy": 0.9952938109636307, "mean_gen_accuracy": 0.8678648918867111, "mean_token_accuracy": 0.8944610804319382, "num_tokens": 142058794.0, "sample_num_tokens": 9228.0, "step": 4524, "total_num_tokens": 142095706.0, "z_loss": 0.0005894620553590357 }, { "copy_logits_max": -2.514284133911133, "copy_logits_min": -625000064.0, "copy_num_tokens": 713.9375, "epoch": 0.9241766658156753, "gen_logits_max": 3.511622190475464, "gen_logits_mean": -14.498706817626953, "gen_logits_min": -26.76296615600586, "gen_logits_std": 2.8823351860046387, "gen_loss": 0.2839035093784332, "grad_norm": 0.3789949900766517, "learning_rate": 2.5213894736842105e-05, "loss": 0.314, "mean_copy_accuracy": 0.9958731979131699, "mean_gen_accuracy": 0.8599094301462173, "mean_token_accuracy": 0.8954692929983139, "num_tokens": 142333620.0, "sample_num_tokens": 9229.0, "step": 4525, "total_num_tokens": 142370536.0, "z_loss": 0.0005647196667268872 }, { "copy_logits_max": -4.956282615661621, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.875, "epoch": 0.9243809037528721, "gen_logits_max": 5.254518508911133, "gen_logits_mean": -13.42003059387207, "gen_logits_min": -24.940143585205078, "gen_logits_std": 2.8057193756103516, "gen_loss": 0.3563527464866638, "grad_norm": 0.41674585233255435, "learning_rate": 2.521263157894737e-05, "loss": 0.3234, "mean_copy_accuracy": 0.9950670748949051, "mean_gen_accuracy": 0.8608715236186981, "mean_token_accuracy": 0.8939239233732224, "num_tokens": 142599912.0, "sample_num_tokens": 8031.0, "step": 4526, "total_num_tokens": 142632036.0, "z_loss": 0.0006257279310375452 }, { "copy_logits_max": -1.525225043296814, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.375, "epoch": 0.9245851416900689, "gen_logits_max": 4.583653450012207, "gen_logits_mean": -14.31128215789795, "gen_logits_min": -26.148462295532227, "gen_logits_std": 2.871910810470581, "gen_loss": 0.2699410915374756, "grad_norm": 0.4166119244838674, "learning_rate": 2.521136842105263e-05, "loss": 0.3049, "mean_copy_accuracy": 0.99497951567173, "mean_gen_accuracy": 0.8670181781053543, "mean_token_accuracy": 0.8977475017309189, "num_tokens": 142859611.0, "sample_num_tokens": 8604.75, "step": 4527, "total_num_tokens": 142894030.0, "z_loss": 0.0005829628789797425 }, { "copy_logits_max": -4.385446548461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.6875, "epoch": 0.9247893796272658, "gen_logits_max": 4.875083923339844, "gen_logits_mean": -14.43770980834961, "gen_logits_min": -25.913665771484375, "gen_logits_std": 2.81490421295166, "gen_loss": 0.34758076071739197, "grad_norm": 0.43660644252540975, "learning_rate": 2.5210105263157895e-05, "loss": 0.323, "mean_copy_accuracy": 0.994075357913971, "mean_gen_accuracy": 0.8615936189889908, "mean_token_accuracy": 0.8913758993148804, "num_tokens": 143115688.0, "sample_num_tokens": 8091.5, "step": 4528, "total_num_tokens": 143148054.0, "z_loss": 0.0006639798521064222 }, { "copy_logits_max": -2.5084025859832764, "copy_logits_min": -625000064.0, "copy_num_tokens": 705.5625, "epoch": 0.9249936175644626, "gen_logits_max": 3.2518224716186523, "gen_logits_mean": -16.491409301757812, "gen_logits_min": -28.549100875854492, "gen_logits_std": 2.8972878456115723, "gen_loss": 0.28795188665390015, "grad_norm": 0.3994879661053768, "learning_rate": 2.520884210526316e-05, "loss": 0.2965, "mean_copy_accuracy": 0.9962001740932465, "mean_gen_accuracy": 0.8655813485383987, "mean_token_accuracy": 0.9015982002019882, "num_tokens": 143384178.0, "sample_num_tokens": 9965.0, "step": 4529, "total_num_tokens": 143424038.0, "z_loss": 0.0006188293918967247 }, { "copy_logits_max": -2.2660722732543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.0625, "epoch": 0.9251978555016594, "gen_logits_max": 5.2207489013671875, "gen_logits_mean": -14.2731351852417, "gen_logits_min": -26.285709381103516, "gen_logits_std": 2.8243982791900635, "gen_loss": 0.3262040615081787, "grad_norm": 0.3864874072152374, "learning_rate": 2.520757894736842e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9956213980913162, "mean_gen_accuracy": 0.8678691387176514, "mean_token_accuracy": 0.8992574661970139, "num_tokens": 143654952.0, "sample_num_tokens": 9034.5, "step": 4530, "total_num_tokens": 143691090.0, "z_loss": 0.0007220290135592222 }, { "copy_logits_max": -5.095846176147461, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.75, "epoch": 0.9254020934388563, "gen_logits_max": 4.384868621826172, "gen_logits_mean": -15.369382858276367, "gen_logits_min": -27.03167724609375, "gen_logits_std": 2.8334622383117676, "gen_loss": 0.32543283700942993, "grad_norm": 0.46649540270394635, "learning_rate": 2.5206315789473688e-05, "loss": 0.3145, "mean_copy_accuracy": 0.995350643992424, "mean_gen_accuracy": 0.8579302579164505, "mean_token_accuracy": 0.8964058458805084, "num_tokens": 143923630.0, "sample_num_tokens": 7455.0, "step": 4531, "total_num_tokens": 143953450.0, "z_loss": 0.0007003428181633353 }, { "copy_logits_max": -5.3240532875061035, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.4375, "epoch": 0.9256063313760531, "gen_logits_max": 3.2596795558929443, "gen_logits_mean": -16.227720260620117, "gen_logits_min": -27.928621292114258, "gen_logits_std": 2.852823257446289, "gen_loss": 0.28985804319381714, "grad_norm": 0.3991025142565279, "learning_rate": 2.520505263157895e-05, "loss": 0.3096, "mean_copy_accuracy": 0.9957507103681564, "mean_gen_accuracy": 0.8601216226816177, "mean_token_accuracy": 0.8969978094100952, "num_tokens": 144211026.0, "sample_num_tokens": 7892.5, "step": 4532, "total_num_tokens": 144242596.0, "z_loss": 0.0006396081298589706 }, { "copy_logits_max": -6.834707736968994, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.8125, "epoch": 0.9258105693132499, "gen_logits_max": 5.196837902069092, "gen_logits_mean": -14.419294357299805, "gen_logits_min": -26.272611618041992, "gen_logits_std": 2.847018241882324, "gen_loss": 0.32955068349838257, "grad_norm": 0.3938499919100433, "learning_rate": 2.5203789473684213e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9962011575698853, "mean_gen_accuracy": 0.8693142384290695, "mean_token_accuracy": 0.9011758863925934, "num_tokens": 144493770.0, "sample_num_tokens": 8398.5, "step": 4533, "total_num_tokens": 144527364.0, "z_loss": 0.000624783628154546 }, { "copy_logits_max": -6.5422258377075195, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.5625, "epoch": 0.9260148072504468, "gen_logits_max": 4.367293834686279, "gen_logits_mean": -14.3712158203125, "gen_logits_min": -26.402467727661133, "gen_logits_std": 2.8516690731048584, "gen_loss": 0.2945721447467804, "grad_norm": 0.41504935976786334, "learning_rate": 2.5202526315789474e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9960190802812576, "mean_gen_accuracy": 0.8661714345216751, "mean_token_accuracy": 0.901808574795723, "num_tokens": 144767293.0, "sample_num_tokens": 8716.25, "step": 4534, "total_num_tokens": 144802158.0, "z_loss": 0.0005540488054975867 }, { "copy_logits_max": -5.68527889251709, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.0, "epoch": 0.9262190451876436, "gen_logits_max": 4.032451629638672, "gen_logits_mean": -14.555386543273926, "gen_logits_min": -26.789581298828125, "gen_logits_std": 2.8219141960144043, "gen_loss": 0.26736530661582947, "grad_norm": 0.3869689535433848, "learning_rate": 2.520126315789474e-05, "loss": 0.3043, "mean_copy_accuracy": 0.9956468641757965, "mean_gen_accuracy": 0.8608521521091461, "mean_token_accuracy": 0.8970480710268021, "num_tokens": 145031101.0, "sample_num_tokens": 8463.25, "step": 4535, "total_num_tokens": 145064954.0, "z_loss": 0.0005527840694412589 }, { "copy_logits_max": -6.7696213722229, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.125, "epoch": 0.9264232831248405, "gen_logits_max": 4.105663299560547, "gen_logits_mean": -15.257716178894043, "gen_logits_min": -27.175405502319336, "gen_logits_std": 2.849144458770752, "gen_loss": 0.25882500410079956, "grad_norm": 0.41004664154373444, "learning_rate": 2.52e-05, "loss": 0.31, "mean_copy_accuracy": 0.995470717549324, "mean_gen_accuracy": 0.8668274283409119, "mean_token_accuracy": 0.896263837814331, "num_tokens": 145314518.0, "sample_num_tokens": 8541.5, "step": 4536, "total_num_tokens": 145348684.0, "z_loss": 0.000516242696903646 }, { "copy_logits_max": -7.544914245605469, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.1875, "epoch": 0.9266275210620373, "gen_logits_max": 4.361908435821533, "gen_logits_mean": -14.49515151977539, "gen_logits_min": -26.613727569580078, "gen_logits_std": 2.8196640014648438, "gen_loss": 0.28072673082351685, "grad_norm": 0.38433042872090073, "learning_rate": 2.5198736842105264e-05, "loss": 0.3115, "mean_copy_accuracy": 0.9947434216737747, "mean_gen_accuracy": 0.8650078177452087, "mean_token_accuracy": 0.8970695436000824, "num_tokens": 145606857.0, "sample_num_tokens": 9139.25, "step": 4537, "total_num_tokens": 145643414.0, "z_loss": 0.0005572203081101179 }, { "copy_logits_max": -6.311031818389893, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.625, "epoch": 0.9268317589992341, "gen_logits_max": 3.812626361846924, "gen_logits_mean": -14.92446517944336, "gen_logits_min": -26.944244384765625, "gen_logits_std": 2.8764078617095947, "gen_loss": 0.27660036087036133, "grad_norm": 0.36625952277295803, "learning_rate": 2.5197473684210525e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9956101179122925, "mean_gen_accuracy": 0.8728239834308624, "mean_token_accuracy": 0.9062204658985138, "num_tokens": 145902021.0, "sample_num_tokens": 9497.25, "step": 4538, "total_num_tokens": 145940010.0, "z_loss": 0.0005424785194918513 }, { "copy_logits_max": -5.183163642883301, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.375, "epoch": 0.9270359969364309, "gen_logits_max": 4.451249122619629, "gen_logits_mean": -14.904626846313477, "gen_logits_min": -26.608253479003906, "gen_logits_std": 2.7923834323883057, "gen_loss": 0.33909130096435547, "grad_norm": 0.39849962606758565, "learning_rate": 2.519621052631579e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9945425987243652, "mean_gen_accuracy": 0.8581344187259674, "mean_token_accuracy": 0.8883857429027557, "num_tokens": 146164754.0, "sample_num_tokens": 8437.5, "step": 4539, "total_num_tokens": 146198504.0, "z_loss": 0.0006552028353326023 }, { "copy_logits_max": -9.494041442871094, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.0625, "epoch": 0.9272402348736277, "gen_logits_max": 5.281852722167969, "gen_logits_mean": -16.116180419921875, "gen_logits_min": -27.725778579711914, "gen_logits_std": 2.827281951904297, "gen_loss": 0.3340635597705841, "grad_norm": 0.3733797592636809, "learning_rate": 2.5194947368421053e-05, "loss": 0.3111, "mean_copy_accuracy": 0.9954051226377487, "mean_gen_accuracy": 0.8687390983104706, "mean_token_accuracy": 0.8963807225227356, "num_tokens": 146430854.0, "sample_num_tokens": 8373.0, "step": 4540, "total_num_tokens": 146464346.0, "z_loss": 0.0006181685603223741 }, { "copy_logits_max": -5.041177749633789, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.0, "epoch": 0.9274444728108246, "gen_logits_max": 3.274780035018921, "gen_logits_mean": -15.900601387023926, "gen_logits_min": -27.827377319335938, "gen_logits_std": 2.852494239807129, "gen_loss": 0.31017178297042847, "grad_norm": 0.39937726307749327, "learning_rate": 2.5193684210526317e-05, "loss": 0.323, "mean_copy_accuracy": 0.9938531965017319, "mean_gen_accuracy": 0.8609890788793564, "mean_token_accuracy": 0.8929761052131653, "num_tokens": 146691575.0, "sample_num_tokens": 9103.75, "step": 4541, "total_num_tokens": 146727990.0, "z_loss": 0.0006683142273686826 }, { "copy_logits_max": -8.930537223815918, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.875, "epoch": 0.9276487107480215, "gen_logits_max": 5.414507865905762, "gen_logits_mean": -15.517729759216309, "gen_logits_min": -26.882858276367188, "gen_logits_std": 2.7867703437805176, "gen_loss": 0.32976701855659485, "grad_norm": 0.3750416996757747, "learning_rate": 2.5192421052631582e-05, "loss": 0.2897, "mean_copy_accuracy": 0.994618147611618, "mean_gen_accuracy": 0.8720280230045319, "mean_token_accuracy": 0.9018545895814896, "num_tokens": 146978761.0, "sample_num_tokens": 7695.75, "step": 4542, "total_num_tokens": 147009544.0, "z_loss": 0.0006552382837980986 }, { "copy_logits_max": -7.142998218536377, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.75, "epoch": 0.9278529486852183, "gen_logits_max": 6.099270820617676, "gen_logits_mean": -12.899757385253906, "gen_logits_min": -25.355384826660156, "gen_logits_std": 2.868533134460449, "gen_loss": 0.2730972468852997, "grad_norm": 0.42720567480395416, "learning_rate": 2.5191157894736843e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9947447925806046, "mean_gen_accuracy": 0.869685560464859, "mean_token_accuracy": 0.9016321897506714, "num_tokens": 147244004.0, "sample_num_tokens": 8591.5, "step": 4543, "total_num_tokens": 147278370.0, "z_loss": 0.0005840113153681159 }, { "copy_logits_max": -8.031562805175781, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.5625, "epoch": 0.9280571866224151, "gen_logits_max": 5.104518890380859, "gen_logits_mean": -14.624732971191406, "gen_logits_min": -26.296443939208984, "gen_logits_std": 2.811475992202759, "gen_loss": 0.29083138704299927, "grad_norm": 0.5041902939872621, "learning_rate": 2.5189894736842107e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9962774217128754, "mean_gen_accuracy": 0.8766658455133438, "mean_token_accuracy": 0.9025937765836716, "num_tokens": 147525532.0, "sample_num_tokens": 9581.5, "step": 4544, "total_num_tokens": 147563858.0, "z_loss": 0.0005826209089718759 }, { "copy_logits_max": -4.25218391418457, "copy_logits_min": -750000064.0, "copy_num_tokens": 498.875, "epoch": 0.9282614245596119, "gen_logits_max": 5.386837959289551, "gen_logits_mean": -14.438776016235352, "gen_logits_min": -26.768781661987305, "gen_logits_std": 2.8817811012268066, "gen_loss": 0.2690659165382385, "grad_norm": 0.3813395569324185, "learning_rate": 2.5188631578947368e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9948474764823914, "mean_gen_accuracy": 0.8649644553661346, "mean_token_accuracy": 0.8970582634210587, "num_tokens": 147792298.0, "sample_num_tokens": 8130.0, "step": 4545, "total_num_tokens": 147824818.0, "z_loss": 0.0005682947812601924 }, { "copy_logits_max": -6.294219017028809, "copy_logits_min": -687500032.0, "copy_num_tokens": 350.875, "epoch": 0.9284656624968087, "gen_logits_max": 5.250322341918945, "gen_logits_mean": -14.99461555480957, "gen_logits_min": -26.92698860168457, "gen_logits_std": 2.8701798915863037, "gen_loss": 0.2916591167449951, "grad_norm": 0.38910254624805785, "learning_rate": 2.5187368421052632e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9945400804281235, "mean_gen_accuracy": 0.8692003935575485, "mean_token_accuracy": 0.8990896344184875, "num_tokens": 148054196.0, "sample_num_tokens": 6732.5, "step": 4546, "total_num_tokens": 148081126.0, "z_loss": 0.0005675175925716758 }, { "copy_logits_max": -4.907973289489746, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.9375, "epoch": 0.9286699004340057, "gen_logits_max": 5.162777900695801, "gen_logits_mean": -15.245070457458496, "gen_logits_min": -27.70802879333496, "gen_logits_std": 2.8585312366485596, "gen_loss": 0.3414066433906555, "grad_norm": 0.37824244405153085, "learning_rate": 2.5186105263157893e-05, "loss": 0.3182, "mean_copy_accuracy": 0.9959306567907333, "mean_gen_accuracy": 0.8550174981355667, "mean_token_accuracy": 0.8970178067684174, "num_tokens": 148335999.0, "sample_num_tokens": 8137.75, "step": 4547, "total_num_tokens": 148368550.0, "z_loss": 0.0006684407126158476 }, { "copy_logits_max": -4.184340476989746, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.1875, "epoch": 0.9288741383712025, "gen_logits_max": 6.012914657592773, "gen_logits_mean": -12.965336799621582, "gen_logits_min": -24.89737319946289, "gen_logits_std": 2.805130958557129, "gen_loss": 0.321358859539032, "grad_norm": 0.3696415699704359, "learning_rate": 2.518484210526316e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9962641298770905, "mean_gen_accuracy": 0.8695339262485504, "mean_token_accuracy": 0.9013348817825317, "num_tokens": 148620917.0, "sample_num_tokens": 8580.75, "step": 4548, "total_num_tokens": 148655240.0, "z_loss": 0.0006662634550593793 }, { "copy_logits_max": -6.385216236114502, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.6875, "epoch": 0.9290783763083993, "gen_logits_max": 5.335642337799072, "gen_logits_mean": -14.300007820129395, "gen_logits_min": -25.690479278564453, "gen_logits_std": 2.8319358825683594, "gen_loss": 0.30835825204849243, "grad_norm": 0.4006270919845676, "learning_rate": 2.5183578947368422e-05, "loss": 0.3112, "mean_copy_accuracy": 0.9956208318471909, "mean_gen_accuracy": 0.8651983141899109, "mean_token_accuracy": 0.8959522545337677, "num_tokens": 148892348.0, "sample_num_tokens": 7870.0, "step": 4549, "total_num_tokens": 148923828.0, "z_loss": 0.0006186027312651277 }, { "copy_logits_max": -4.362160682678223, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.6875, "epoch": 0.9292826142455961, "gen_logits_max": 6.5892534255981445, "gen_logits_mean": -12.836557388305664, "gen_logits_min": -24.88644027709961, "gen_logits_std": 2.8127875328063965, "gen_loss": 0.359942764043808, "grad_norm": 0.4189727019094306, "learning_rate": 2.5182315789473686e-05, "loss": 0.3438, "mean_copy_accuracy": 0.9945408552885056, "mean_gen_accuracy": 0.8551964461803436, "mean_token_accuracy": 0.8859617710113525, "num_tokens": 149145517.0, "sample_num_tokens": 7532.75, "step": 4550, "total_num_tokens": 149175648.0, "z_loss": 0.0006574611179530621 }, { "copy_logits_max": -4.53964900970459, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.9375, "epoch": 0.9294868521827929, "gen_logits_max": 4.407752513885498, "gen_logits_mean": -14.73620319366455, "gen_logits_min": -26.672849655151367, "gen_logits_std": 2.879786968231201, "gen_loss": 0.3106032609939575, "grad_norm": 0.3899038766286113, "learning_rate": 2.5181052631578947e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9949269145727158, "mean_gen_accuracy": 0.8664283603429794, "mean_token_accuracy": 0.900049701333046, "num_tokens": 149425091.0, "sample_num_tokens": 7615.25, "step": 4551, "total_num_tokens": 149455552.0, "z_loss": 0.0005757857579737902 }, { "copy_logits_max": -5.07242488861084, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.625, "epoch": 0.9296910901199897, "gen_logits_max": 4.8044352531433105, "gen_logits_mean": -15.214462280273438, "gen_logits_min": -27.2010440826416, "gen_logits_std": 2.8582968711853027, "gen_loss": 0.3288559019565582, "grad_norm": 0.39993851211761927, "learning_rate": 2.517978947368421e-05, "loss": 0.3141, "mean_copy_accuracy": 0.9953814744949341, "mean_gen_accuracy": 0.8631925135850906, "mean_token_accuracy": 0.8946572095155716, "num_tokens": 149674564.0, "sample_num_tokens": 7960.5, "step": 4552, "total_num_tokens": 149706406.0, "z_loss": 0.0006427859771065414 }, { "copy_logits_max": -4.650276184082031, "copy_logits_min": -687500032.0, "copy_num_tokens": 337.5625, "epoch": 0.9298953280571867, "gen_logits_max": 5.7656450271606445, "gen_logits_mean": -15.317767143249512, "gen_logits_min": -26.826618194580078, "gen_logits_std": 2.857891082763672, "gen_loss": 0.3237019181251526, "grad_norm": 0.3874058862413928, "learning_rate": 2.5178526315789472e-05, "loss": 0.3224, "mean_copy_accuracy": 0.9956262707710266, "mean_gen_accuracy": 0.8613768815994263, "mean_token_accuracy": 0.8934501707553864, "num_tokens": 149960724.0, "sample_num_tokens": 8235.0, "step": 4553, "total_num_tokens": 149993664.0, "z_loss": 0.0005897234077565372 }, { "copy_logits_max": -3.9674124717712402, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.5, "epoch": 0.9300995659943835, "gen_logits_max": 5.814105987548828, "gen_logits_mean": -13.25808048248291, "gen_logits_min": -25.219600677490234, "gen_logits_std": 2.862990379333496, "gen_loss": 0.28912484645843506, "grad_norm": 0.38838104164943466, "learning_rate": 2.5177263157894737e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9956316947937012, "mean_gen_accuracy": 0.8670965731143951, "mean_token_accuracy": 0.9019404053688049, "num_tokens": 150237616.0, "sample_num_tokens": 8921.0, "step": 4554, "total_num_tokens": 150273300.0, "z_loss": 0.0005397238419391215 }, { "copy_logits_max": -5.011351585388184, "copy_logits_min": -750000064.0, "copy_num_tokens": 648.0, "epoch": 0.9303038039315803, "gen_logits_max": 4.9686665534973145, "gen_logits_mean": -13.121583938598633, "gen_logits_min": -25.224384307861328, "gen_logits_std": 2.8487095832824707, "gen_loss": 0.3136558532714844, "grad_norm": 0.43334405607888127, "learning_rate": 2.5175999999999997e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9944563806056976, "mean_gen_accuracy": 0.8643518835306168, "mean_token_accuracy": 0.8982180655002594, "num_tokens": 150513458.0, "sample_num_tokens": 10339.0, "step": 4555, "total_num_tokens": 150554814.0, "z_loss": 0.0006179301999509335 }, { "copy_logits_max": -6.2054972648620605, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.625, "epoch": 0.9305080418687771, "gen_logits_max": 5.170696258544922, "gen_logits_mean": -14.525491714477539, "gen_logits_min": -26.40858268737793, "gen_logits_std": 2.8868980407714844, "gen_loss": 0.24901776015758514, "grad_norm": 0.35841841367069066, "learning_rate": 2.5174736842105265e-05, "loss": 0.3019, "mean_copy_accuracy": 0.9958343952894211, "mean_gen_accuracy": 0.8688611388206482, "mean_token_accuracy": 0.8992835879325867, "num_tokens": 150803614.0, "sample_num_tokens": 8683.5, "step": 4556, "total_num_tokens": 150838348.0, "z_loss": 0.0004477347247302532 }, { "copy_logits_max": -1.9796631336212158, "copy_logits_min": -687500032.0, "copy_num_tokens": 296.125, "epoch": 0.9307122798059739, "gen_logits_max": 6.799234390258789, "gen_logits_mean": -12.128297805786133, "gen_logits_min": -23.9124813079834, "gen_logits_std": 2.8438713550567627, "gen_loss": 0.38507744669914246, "grad_norm": 0.46624285579783675, "learning_rate": 2.517347368421053e-05, "loss": 0.3332, "mean_copy_accuracy": 0.9953636527061462, "mean_gen_accuracy": 0.8581317216157913, "mean_token_accuracy": 0.8875095397233963, "num_tokens": 151053851.0, "sample_num_tokens": 7123.75, "step": 4557, "total_num_tokens": 151082346.0, "z_loss": 0.0006856080144643784 }, { "copy_logits_max": -3.3727221488952637, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.6875, "epoch": 0.9309165177431707, "gen_logits_max": 4.74073600769043, "gen_logits_mean": -15.365599632263184, "gen_logits_min": -27.20083999633789, "gen_logits_std": 2.885772466659546, "gen_loss": 0.30110231041908264, "grad_norm": 0.3874212689822267, "learning_rate": 2.517221052631579e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9956873953342438, "mean_gen_accuracy": 0.866202175617218, "mean_token_accuracy": 0.9027179032564163, "num_tokens": 151339133.0, "sample_num_tokens": 8267.25, "step": 4558, "total_num_tokens": 151372202.0, "z_loss": 0.000582031556405127 }, { "copy_logits_max": -5.153842926025391, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.6875, "epoch": 0.9311207556803677, "gen_logits_max": 6.0531415939331055, "gen_logits_mean": -13.876995086669922, "gen_logits_min": -25.753055572509766, "gen_logits_std": 2.8701698780059814, "gen_loss": 0.2952219843864441, "grad_norm": 0.42596545688957443, "learning_rate": 2.5170947368421055e-05, "loss": 0.2876, "mean_copy_accuracy": 0.995433509349823, "mean_gen_accuracy": 0.8758123368024826, "mean_token_accuracy": 0.9048935025930405, "num_tokens": 151611485.0, "sample_num_tokens": 7916.75, "step": 4559, "total_num_tokens": 151643152.0, "z_loss": 0.0005517835961654782 }, { "copy_logits_max": -3.7101473808288574, "copy_logits_min": -687500032.0, "copy_num_tokens": 522.25, "epoch": 0.9313249936175645, "gen_logits_max": 4.840283393859863, "gen_logits_mean": -15.282241821289062, "gen_logits_min": -27.14548110961914, "gen_logits_std": 2.8955016136169434, "gen_loss": 0.2938384711742401, "grad_norm": 0.4612824501794313, "learning_rate": 2.5169684210526316e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9936261922121048, "mean_gen_accuracy": 0.8666545897722244, "mean_token_accuracy": 0.8940963447093964, "num_tokens": 151879147.0, "sample_num_tokens": 8993.75, "step": 4560, "total_num_tokens": 151915122.0, "z_loss": 0.0005496322410181165 }, { "copy_logits_max": -3.7647573947906494, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.5625, "epoch": 0.9315292315547613, "gen_logits_max": 4.299739837646484, "gen_logits_mean": -15.792160034179688, "gen_logits_min": -27.480321884155273, "gen_logits_std": 2.8916282653808594, "gen_loss": 0.2980819642543793, "grad_norm": 0.4022593702981962, "learning_rate": 2.516842105263158e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9959638118743896, "mean_gen_accuracy": 0.8662939816713333, "mean_token_accuracy": 0.89629065990448, "num_tokens": 152155912.0, "sample_num_tokens": 8816.5, "step": 4561, "total_num_tokens": 152191178.0, "z_loss": 0.0005415462073870003 }, { "copy_logits_max": -0.7763679027557373, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.1875, "epoch": 0.9317334694919581, "gen_logits_max": 4.56345796585083, "gen_logits_mean": -14.153549194335938, "gen_logits_min": -26.38619613647461, "gen_logits_std": 2.8783493041992188, "gen_loss": 0.27052414417266846, "grad_norm": 0.453915632789075, "learning_rate": 2.516715789473684e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9954430162906647, "mean_gen_accuracy": 0.8684957474470139, "mean_token_accuracy": 0.90067058801651, "num_tokens": 152433570.0, "sample_num_tokens": 9119.0, "step": 4562, "total_num_tokens": 152470046.0, "z_loss": 0.00051604158943519 }, { "copy_logits_max": -2.7821743488311768, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.4375, "epoch": 0.9319377074291549, "gen_logits_max": 4.508581161499023, "gen_logits_mean": -15.953617095947266, "gen_logits_min": -27.828590393066406, "gen_logits_std": 2.8981549739837646, "gen_loss": 0.30462172627449036, "grad_norm": 0.3983919239473557, "learning_rate": 2.5165894736842105e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9954367280006409, "mean_gen_accuracy": 0.8595371544361115, "mean_token_accuracy": 0.8942609280347824, "num_tokens": 152714610.0, "sample_num_tokens": 7150.5, "step": 4563, "total_num_tokens": 152743212.0, "z_loss": 0.0005478892708197236 }, { "copy_logits_max": -1.794115424156189, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.75, "epoch": 0.9321419453663518, "gen_logits_max": 5.017754554748535, "gen_logits_mean": -14.605988502502441, "gen_logits_min": -26.54928970336914, "gen_logits_std": 2.8869917392730713, "gen_loss": 0.28849518299102783, "grad_norm": 0.4265593184389429, "learning_rate": 2.516463157894737e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9935281276702881, "mean_gen_accuracy": 0.8738005608320236, "mean_token_accuracy": 0.8995259702205658, "num_tokens": 152983165.0, "sample_num_tokens": 9667.25, "step": 4564, "total_num_tokens": 153021834.0, "z_loss": 0.0005358922644518316 }, { "copy_logits_max": -0.9276010990142822, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.375, "epoch": 0.9323461833035487, "gen_logits_max": 5.039888381958008, "gen_logits_mean": -14.714150428771973, "gen_logits_min": -26.25146484375, "gen_logits_std": 2.874284267425537, "gen_loss": 0.3081430196762085, "grad_norm": 0.4351576197256311, "learning_rate": 2.5163368421052634e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9948202073574066, "mean_gen_accuracy": 0.8663361668586731, "mean_token_accuracy": 0.89553502202034, "num_tokens": 153251166.0, "sample_num_tokens": 8642.0, "step": 4565, "total_num_tokens": 153285734.0, "z_loss": 0.0005726972012780607 }, { "copy_logits_max": -3.7761781215667725, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.375, "epoch": 0.9325504212407455, "gen_logits_max": 5.095053195953369, "gen_logits_mean": -15.692242622375488, "gen_logits_min": -27.477436065673828, "gen_logits_std": 2.889193296432495, "gen_loss": 0.286094069480896, "grad_norm": 0.4452147913578655, "learning_rate": 2.5162105263157895e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9950213730335236, "mean_gen_accuracy": 0.8724336922168732, "mean_token_accuracy": 0.8991711288690567, "num_tokens": 153513052.0, "sample_num_tokens": 8452.5, "step": 4566, "total_num_tokens": 153546862.0, "z_loss": 0.0006147020030766726 }, { "copy_logits_max": 0.3817669749259949, "copy_logits_min": -750000000.0, "copy_num_tokens": 646.1875, "epoch": 0.9327546591779423, "gen_logits_max": 5.619690895080566, "gen_logits_mean": -13.759414672851562, "gen_logits_min": -25.731380462646484, "gen_logits_std": 2.878452777862549, "gen_loss": 0.30595189332962036, "grad_norm": 0.4317489420987751, "learning_rate": 2.516084210526316e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9939674586057663, "mean_gen_accuracy": 0.8718755543231964, "mean_token_accuracy": 0.9001862555742264, "num_tokens": 153768249.0, "sample_num_tokens": 9129.25, "step": 4567, "total_num_tokens": 153804766.0, "z_loss": 0.0006895349361002445 }, { "copy_logits_max": -2.355320692062378, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.375, "epoch": 0.9329588971151391, "gen_logits_max": 4.556946754455566, "gen_logits_mean": -15.934072494506836, "gen_logits_min": -27.721179962158203, "gen_logits_std": 2.89180850982666, "gen_loss": 0.3003915250301361, "grad_norm": 0.40859031043922805, "learning_rate": 2.515957894736842e-05, "loss": 0.315, "mean_copy_accuracy": 0.9942079037427902, "mean_gen_accuracy": 0.862520232796669, "mean_token_accuracy": 0.8953751921653748, "num_tokens": 154042371.0, "sample_num_tokens": 8294.25, "step": 4568, "total_num_tokens": 154075548.0, "z_loss": 0.0006454947870224714 }, { "copy_logits_max": -3.7905359268188477, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5625, "epoch": 0.933163135052336, "gen_logits_max": 6.028414249420166, "gen_logits_mean": -12.93183708190918, "gen_logits_min": -25.262287139892578, "gen_logits_std": 2.8432297706604004, "gen_loss": 0.3030747175216675, "grad_norm": 0.4731877072401478, "learning_rate": 2.5158315789473684e-05, "loss": 0.306, "mean_copy_accuracy": 0.9947804808616638, "mean_gen_accuracy": 0.8660658299922943, "mean_token_accuracy": 0.8969671279191971, "num_tokens": 154309554.0, "sample_num_tokens": 7374.5, "step": 4569, "total_num_tokens": 154339052.0, "z_loss": 0.0006617489270865917 }, { "copy_logits_max": -1.5736373662948608, "copy_logits_min": -750000000.0, "copy_num_tokens": 685.625, "epoch": 0.9333673729895328, "gen_logits_max": 4.679966449737549, "gen_logits_mean": -14.442634582519531, "gen_logits_min": -26.70370101928711, "gen_logits_std": 2.877333641052246, "gen_loss": 0.27705472707748413, "grad_norm": 0.5874821462405909, "learning_rate": 2.515705263157895e-05, "loss": 0.3145, "mean_copy_accuracy": 0.9931293427944183, "mean_gen_accuracy": 0.8660739362239838, "mean_token_accuracy": 0.8958591371774673, "num_tokens": 154572519.0, "sample_num_tokens": 10262.25, "step": 4570, "total_num_tokens": 154613568.0, "z_loss": 0.0005903408164158463 }, { "copy_logits_max": -3.057833194732666, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.4375, "epoch": 0.9335716109267297, "gen_logits_max": 5.725536346435547, "gen_logits_mean": -14.303182601928711, "gen_logits_min": -25.87823486328125, "gen_logits_std": 2.8321282863616943, "gen_loss": 0.3187655210494995, "grad_norm": 0.43162944208848536, "learning_rate": 2.515578947368421e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9950213432312012, "mean_gen_accuracy": 0.8559778481721878, "mean_token_accuracy": 0.8903863579034805, "num_tokens": 154851586.0, "sample_num_tokens": 7898.0, "step": 4571, "total_num_tokens": 154883178.0, "z_loss": 0.0006540430476889014 }, { "copy_logits_max": 1.4246934652328491, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.25, "epoch": 0.9337758488639265, "gen_logits_max": 6.721446990966797, "gen_logits_mean": -12.445377349853516, "gen_logits_min": -24.534170150756836, "gen_logits_std": 2.8995628356933594, "gen_loss": 0.26583436131477356, "grad_norm": 0.3868901615222291, "learning_rate": 2.5154526315789477e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9945985972881317, "mean_gen_accuracy": 0.8773838430643082, "mean_token_accuracy": 0.904850497841835, "num_tokens": 155108274.0, "sample_num_tokens": 7260.0, "step": 4572, "total_num_tokens": 155137314.0, "z_loss": 0.0005534493830054998 }, { "copy_logits_max": -0.9527894258499146, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.375, "epoch": 0.9339800868011233, "gen_logits_max": 5.7488017082214355, "gen_logits_mean": -14.56478500366211, "gen_logits_min": -26.26462745666504, "gen_logits_std": 2.853677272796631, "gen_loss": 0.35171180963516235, "grad_norm": 0.37797281747504563, "learning_rate": 2.5153263157894738e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9950970262289047, "mean_gen_accuracy": 0.8696435391902924, "mean_token_accuracy": 0.9006261378526688, "num_tokens": 155387637.0, "sample_num_tokens": 7898.25, "step": 4573, "total_num_tokens": 155419230.0, "z_loss": 0.0007077407208271325 }, { "copy_logits_max": -4.536270618438721, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.75, "epoch": 0.9341843247383201, "gen_logits_max": 6.26365852355957, "gen_logits_mean": -14.416158676147461, "gen_logits_min": -25.944427490234375, "gen_logits_std": 2.8479256629943848, "gen_loss": 0.3374999165534973, "grad_norm": 0.38933213720720816, "learning_rate": 2.5152000000000002e-05, "loss": 0.3056, "mean_copy_accuracy": 0.994805321097374, "mean_gen_accuracy": 0.8642282634973526, "mean_token_accuracy": 0.8980234861373901, "num_tokens": 155655271.0, "sample_num_tokens": 8771.75, "step": 4574, "total_num_tokens": 155690358.0, "z_loss": 0.0006572100683115423 }, { "copy_logits_max": -3.8790407180786133, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.6875, "epoch": 0.934388562675517, "gen_logits_max": 5.885889053344727, "gen_logits_mean": -13.440271377563477, "gen_logits_min": -24.919612884521484, "gen_logits_std": 2.7962822914123535, "gen_loss": 0.3304843604564667, "grad_norm": 0.3589555101613923, "learning_rate": 2.5150736842105263e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9959555417299271, "mean_gen_accuracy": 0.8734194189310074, "mean_token_accuracy": 0.8990872800350189, "num_tokens": 155934207.0, "sample_num_tokens": 7689.25, "step": 4575, "total_num_tokens": 155964964.0, "z_loss": 0.0006904109031893313 }, { "copy_logits_max": -1.4343243837356567, "copy_logits_min": -687500032.0, "copy_num_tokens": 542.0, "epoch": 0.9345928006127138, "gen_logits_max": 6.1108317375183105, "gen_logits_mean": -12.963979721069336, "gen_logits_min": -25.17523765563965, "gen_logits_std": 2.8468432426452637, "gen_loss": 0.3189518451690674, "grad_norm": 0.4341679164484706, "learning_rate": 2.5149473684210528e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9951926022768021, "mean_gen_accuracy": 0.8644621223211288, "mean_token_accuracy": 0.897798627614975, "num_tokens": 156213116.0, "sample_num_tokens": 8691.0, "step": 4576, "total_num_tokens": 156247880.0, "z_loss": 0.0006754627684131265 }, { "copy_logits_max": -2.06854248046875, "copy_logits_min": -687500032.0, "copy_num_tokens": 720.875, "epoch": 0.9347970385499107, "gen_logits_max": 4.854283809661865, "gen_logits_mean": -13.642611503601074, "gen_logits_min": -25.490314483642578, "gen_logits_std": 2.863591194152832, "gen_loss": 0.27941712737083435, "grad_norm": 0.38437464356421364, "learning_rate": 2.514821052631579e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9960928559303284, "mean_gen_accuracy": 0.8635658174753189, "mean_token_accuracy": 0.9016465246677399, "num_tokens": 156520005.0, "sample_num_tokens": 9924.25, "step": 4577, "total_num_tokens": 156559702.0, "z_loss": 0.0005949632031843066 }, { "copy_logits_max": -4.646237373352051, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.1875, "epoch": 0.9350012764871075, "gen_logits_max": 4.973996162414551, "gen_logits_mean": -14.790772438049316, "gen_logits_min": -26.74860382080078, "gen_logits_std": 2.8598828315734863, "gen_loss": 0.32902294397354126, "grad_norm": 0.40394217153856515, "learning_rate": 2.5146947368421053e-05, "loss": 0.3072, "mean_copy_accuracy": 0.99648617208004, "mean_gen_accuracy": 0.8613665401935577, "mean_token_accuracy": 0.8968120366334915, "num_tokens": 156804507.0, "sample_num_tokens": 8819.25, "step": 4578, "total_num_tokens": 156839784.0, "z_loss": 0.0006956116994842887 }, { "copy_logits_max": -5.055052757263184, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.125, "epoch": 0.9352055144243043, "gen_logits_max": 6.069984436035156, "gen_logits_mean": -13.918427467346191, "gen_logits_min": -25.322189331054688, "gen_logits_std": 2.7994039058685303, "gen_loss": 0.3178553283214569, "grad_norm": 0.3924265659509097, "learning_rate": 2.5145684210526314e-05, "loss": 0.3123, "mean_copy_accuracy": 0.9950372874736786, "mean_gen_accuracy": 0.8685292899608612, "mean_token_accuracy": 0.8975616842508316, "num_tokens": 157068059.0, "sample_num_tokens": 8132.25, "step": 4579, "total_num_tokens": 157100588.0, "z_loss": 0.0006313914200291038 }, { "copy_logits_max": -4.778790473937988, "copy_logits_min": -687500032.0, "copy_num_tokens": 488.5625, "epoch": 0.9354097523615011, "gen_logits_max": 5.804332256317139, "gen_logits_mean": -12.2280855178833, "gen_logits_min": -24.720436096191406, "gen_logits_std": 2.799131393432617, "gen_loss": 0.2927062511444092, "grad_norm": 0.3886390010073425, "learning_rate": 2.514442105263158e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9956265240907669, "mean_gen_accuracy": 0.8664787709712982, "mean_token_accuracy": 0.9012471735477448, "num_tokens": 157344991.0, "sample_num_tokens": 8530.75, "step": 4580, "total_num_tokens": 157379114.0, "z_loss": 0.00057644909247756 }, { "copy_logits_max": -3.9285926818847656, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.875, "epoch": 0.935613990298698, "gen_logits_max": 5.085816383361816, "gen_logits_mean": -12.807005882263184, "gen_logits_min": -24.714954376220703, "gen_logits_std": 2.8599154949188232, "gen_loss": 0.28268465399742126, "grad_norm": 0.36143082941019206, "learning_rate": 2.5143157894736842e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9966476857662201, "mean_gen_accuracy": 0.8703242540359497, "mean_token_accuracy": 0.9088198095560074, "num_tokens": 157666776.0, "sample_num_tokens": 9350.0, "step": 4581, "total_num_tokens": 157704176.0, "z_loss": 0.0005382776726037264 }, { "copy_logits_max": -2.976052761077881, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.375, "epoch": 0.9358182282358948, "gen_logits_max": 6.158114910125732, "gen_logits_mean": -12.310396194458008, "gen_logits_min": -24.19854736328125, "gen_logits_std": 2.830634593963623, "gen_loss": 0.2918280363082886, "grad_norm": 0.38413869408566853, "learning_rate": 2.5141894736842107e-05, "loss": 0.3041, "mean_copy_accuracy": 0.9966632425785065, "mean_gen_accuracy": 0.8660187423229218, "mean_token_accuracy": 0.9000159353017807, "num_tokens": 157942832.0, "sample_num_tokens": 8972.0, "step": 4582, "total_num_tokens": 157978720.0, "z_loss": 0.0005760947242379189 }, { "copy_logits_max": -4.98084020614624, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.5, "epoch": 0.9360224661730917, "gen_logits_max": 5.135917663574219, "gen_logits_mean": -14.205821990966797, "gen_logits_min": -26.069808959960938, "gen_logits_std": 2.8429808616638184, "gen_loss": 0.3167446255683899, "grad_norm": 0.38094303251660055, "learning_rate": 2.514063157894737e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9957713484764099, "mean_gen_accuracy": 0.867792397737503, "mean_token_accuracy": 0.8999390602111816, "num_tokens": 158211441.0, "sample_num_tokens": 8576.25, "step": 4583, "total_num_tokens": 158245746.0, "z_loss": 0.0005999502027407289 }, { "copy_logits_max": -5.9037556648254395, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.25, "epoch": 0.9362267041102885, "gen_logits_max": 6.092785835266113, "gen_logits_mean": -13.782938957214355, "gen_logits_min": -24.965898513793945, "gen_logits_std": 2.787336826324463, "gen_loss": 0.33745062351226807, "grad_norm": 0.3782665724153667, "learning_rate": 2.5139368421052632e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9947943985462189, "mean_gen_accuracy": 0.8717150837182999, "mean_token_accuracy": 0.8992740511894226, "num_tokens": 158465411.0, "sample_num_tokens": 8727.25, "step": 4584, "total_num_tokens": 158500320.0, "z_loss": 0.0006279480876401067 }, { "copy_logits_max": -3.8198020458221436, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.625, "epoch": 0.9364309420474853, "gen_logits_max": 5.05181884765625, "gen_logits_mean": -14.422532081604004, "gen_logits_min": -25.894351959228516, "gen_logits_std": 2.8479666709899902, "gen_loss": 0.3352563679218292, "grad_norm": 0.4079429355687136, "learning_rate": 2.5138105263157896e-05, "loss": 0.3257, "mean_copy_accuracy": 0.9951322078704834, "mean_gen_accuracy": 0.8586911559104919, "mean_token_accuracy": 0.891729861497879, "num_tokens": 158720802.0, "sample_num_tokens": 8190.5, "step": 4585, "total_num_tokens": 158753564.0, "z_loss": 0.000606968707870692 }, { "copy_logits_max": -4.16917610168457, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.125, "epoch": 0.9366351799846822, "gen_logits_max": 6.213285446166992, "gen_logits_mean": -14.379438400268555, "gen_logits_min": -25.82215690612793, "gen_logits_std": 2.81830096244812, "gen_loss": 0.32460126280784607, "grad_norm": 0.4190357798196047, "learning_rate": 2.5136842105263157e-05, "loss": 0.3166, "mean_copy_accuracy": 0.9954470247030258, "mean_gen_accuracy": 0.8648092746734619, "mean_token_accuracy": 0.8933838158845901, "num_tokens": 158979648.0, "sample_num_tokens": 6877.5, "step": 4586, "total_num_tokens": 159007158.0, "z_loss": 0.0006066809874027967 }, { "copy_logits_max": -2.167969226837158, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.3125, "epoch": 0.936839417921879, "gen_logits_max": 5.459742069244385, "gen_logits_mean": -12.943475723266602, "gen_logits_min": -24.969696044921875, "gen_logits_std": 2.8210387229919434, "gen_loss": 0.3008422553539276, "grad_norm": 0.419309542035604, "learning_rate": 2.513557894736842e-05, "loss": 0.318, "mean_copy_accuracy": 0.994674876332283, "mean_gen_accuracy": 0.8623258173465729, "mean_token_accuracy": 0.8951105326414108, "num_tokens": 159248337.0, "sample_num_tokens": 9688.25, "step": 4587, "total_num_tokens": 159287090.0, "z_loss": 0.000601704407017678 }, { "copy_logits_max": -4.4191389083862305, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.5625, "epoch": 0.9370436558590758, "gen_logits_max": 4.049044609069824, "gen_logits_mean": -15.993115425109863, "gen_logits_min": -27.68355941772461, "gen_logits_std": 2.8514938354492188, "gen_loss": 0.31465262174606323, "grad_norm": 0.36885167572514493, "learning_rate": 2.5134315789473682e-05, "loss": 0.3082, "mean_copy_accuracy": 0.9949617832899094, "mean_gen_accuracy": 0.8666125237941742, "mean_token_accuracy": 0.8957242369651794, "num_tokens": 159529932.0, "sample_num_tokens": 7306.0, "step": 4588, "total_num_tokens": 159559156.0, "z_loss": 0.0006217950722202659 }, { "copy_logits_max": -2.3034329414367676, "copy_logits_min": -750000000.0, "copy_num_tokens": 675.0, "epoch": 0.9372478937962727, "gen_logits_max": 4.903115272521973, "gen_logits_mean": -13.726649284362793, "gen_logits_min": -25.508682250976562, "gen_logits_std": 2.8511297702789307, "gen_loss": 0.2821957468986511, "grad_norm": 0.4131257543529161, "learning_rate": 2.513305263157895e-05, "loss": 0.3266, "mean_copy_accuracy": 0.9948803633451462, "mean_gen_accuracy": 0.8632068485021591, "mean_token_accuracy": 0.8960824310779572, "num_tokens": 159797746.0, "sample_num_tokens": 9532.0, "step": 4589, "total_num_tokens": 159835874.0, "z_loss": 0.0005219028098508716 }, { "copy_logits_max": -4.468062877655029, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.8125, "epoch": 0.9374521317334695, "gen_logits_max": 6.114876747131348, "gen_logits_mean": -13.318653106689453, "gen_logits_min": -25.1021671295166, "gen_logits_std": 2.812255859375, "gen_loss": 0.3298606276512146, "grad_norm": 0.4248991453797704, "learning_rate": 2.513178947368421e-05, "loss": 0.33, "mean_copy_accuracy": 0.9962742775678635, "mean_gen_accuracy": 0.8600779324769974, "mean_token_accuracy": 0.8906703740358353, "num_tokens": 160085688.0, "sample_num_tokens": 9032.0, "step": 4590, "total_num_tokens": 160121816.0, "z_loss": 0.000588085560593754 }, { "copy_logits_max": -3.622838020324707, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.3125, "epoch": 0.9376563696706663, "gen_logits_max": 6.052724838256836, "gen_logits_mean": -14.875399589538574, "gen_logits_min": -26.65502166748047, "gen_logits_std": 2.853515863418579, "gen_loss": 0.29690179228782654, "grad_norm": 0.3717994155007233, "learning_rate": 2.5130526315789475e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9947137534618378, "mean_gen_accuracy": 0.8751981854438782, "mean_token_accuracy": 0.9021583646535873, "num_tokens": 160365047.0, "sample_num_tokens": 7916.25, "step": 4591, "total_num_tokens": 160396712.0, "z_loss": 0.0005679787136614323 }, { "copy_logits_max": -2.6314868927001953, "copy_logits_min": -687500032.0, "copy_num_tokens": 394.1875, "epoch": 0.9378606076078632, "gen_logits_max": 5.068910598754883, "gen_logits_mean": -14.655179023742676, "gen_logits_min": -26.766429901123047, "gen_logits_std": 2.8610782623291016, "gen_loss": 0.3227851390838623, "grad_norm": 0.39557060264322386, "learning_rate": 2.5129263157894736e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9950299710035324, "mean_gen_accuracy": 0.8614951521158218, "mean_token_accuracy": 0.8940257430076599, "num_tokens": 160624926.0, "sample_num_tokens": 7615.0, "step": 4592, "total_num_tokens": 160655386.0, "z_loss": 0.0006506014615297318 }, { "copy_logits_max": 0.4848136305809021, "copy_logits_min": -687500032.0, "copy_num_tokens": 394.25, "epoch": 0.93806484554506, "gen_logits_max": 5.9255266189575195, "gen_logits_mean": -13.787530899047852, "gen_logits_min": -25.819442749023438, "gen_logits_std": 2.871429681777954, "gen_loss": 0.311443954706192, "grad_norm": 0.3819169417367997, "learning_rate": 2.5128e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9949561506509781, "mean_gen_accuracy": 0.8627921789884567, "mean_token_accuracy": 0.8975711911916733, "num_tokens": 160900255.0, "sample_num_tokens": 7624.25, "step": 4593, "total_num_tokens": 160930752.0, "z_loss": 0.0006524547934532166 }, { "copy_logits_max": 0.6596148014068604, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.1875, "epoch": 0.9382690834822568, "gen_logits_max": 4.311229228973389, "gen_logits_mean": -15.276366233825684, "gen_logits_min": -27.42384910583496, "gen_logits_std": 2.8900535106658936, "gen_loss": 0.27343568205833435, "grad_norm": 0.3689483579247084, "learning_rate": 2.512673684210526e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9973601251840591, "mean_gen_accuracy": 0.8683527261018753, "mean_token_accuracy": 0.9083141535520554, "num_tokens": 161181129.0, "sample_num_tokens": 9001.75, "step": 4594, "total_num_tokens": 161217136.0, "z_loss": 0.0005863923579454422 }, { "copy_logits_max": -0.3324625492095947, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.375, "epoch": 0.9384733214194536, "gen_logits_max": 6.39024543762207, "gen_logits_mean": -12.218058586120605, "gen_logits_min": -24.281442642211914, "gen_logits_std": 2.8068017959594727, "gen_loss": 0.29294973611831665, "grad_norm": 0.38044752853198993, "learning_rate": 2.5125473684210526e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9963958710432053, "mean_gen_accuracy": 0.8699382841587067, "mean_token_accuracy": 0.9025052040815353, "num_tokens": 161448217.0, "sample_num_tokens": 9113.25, "step": 4595, "total_num_tokens": 161484670.0, "z_loss": 0.000650750589556992 }, { "copy_logits_max": -2.568789482116699, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.0, "epoch": 0.9386775593566505, "gen_logits_max": 5.130090713500977, "gen_logits_mean": -13.849971771240234, "gen_logits_min": -26.044883728027344, "gen_logits_std": 2.869198799133301, "gen_loss": 0.3065410852432251, "grad_norm": 0.3797520852303514, "learning_rate": 2.512421052631579e-05, "loss": 0.3243, "mean_copy_accuracy": 0.996257945895195, "mean_gen_accuracy": 0.8571528941392899, "mean_token_accuracy": 0.8919770866632462, "num_tokens": 161748604.0, "sample_num_tokens": 8173.0, "step": 4596, "total_num_tokens": 161781296.0, "z_loss": 0.0006216830224730074 }, { "copy_logits_max": -2.836331367492676, "copy_logits_min": -687500032.0, "copy_num_tokens": 277.75, "epoch": 0.9388817972938474, "gen_logits_max": 5.441890716552734, "gen_logits_mean": -14.853683471679688, "gen_logits_min": -26.534347534179688, "gen_logits_std": 2.8667049407958984, "gen_loss": 0.3407827913761139, "grad_norm": 0.4239220532067896, "learning_rate": 2.5122947368421055e-05, "loss": 0.3087, "mean_copy_accuracy": 0.9941590577363968, "mean_gen_accuracy": 0.8685303926467896, "mean_token_accuracy": 0.8969570994377136, "num_tokens": 162011443.0, "sample_num_tokens": 6944.75, "step": 4597, "total_num_tokens": 162039222.0, "z_loss": 0.0006109520327299833 }, { "copy_logits_max": -2.2094435691833496, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.375, "epoch": 0.9390860352310442, "gen_logits_max": 4.917898654937744, "gen_logits_mean": -14.389559745788574, "gen_logits_min": -26.817901611328125, "gen_logits_std": 2.8563592433929443, "gen_loss": 0.2596637010574341, "grad_norm": 0.3434292178594903, "learning_rate": 2.512168421052632e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9969761818647385, "mean_gen_accuracy": 0.8765023797750473, "mean_token_accuracy": 0.9107283353805542, "num_tokens": 162316100.0, "sample_num_tokens": 10496.5, "step": 4598, "total_num_tokens": 162358086.0, "z_loss": 0.00047195563092827797 }, { "copy_logits_max": -0.1127430647611618, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.4375, "epoch": 0.939290273168241, "gen_logits_max": 3.824707508087158, "gen_logits_mean": -15.347786903381348, "gen_logits_min": -27.424095153808594, "gen_logits_std": 2.8779568672180176, "gen_loss": 0.2503073215484619, "grad_norm": 0.36068820486667047, "learning_rate": 2.512042105263158e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9960782676935196, "mean_gen_accuracy": 0.8710232526063919, "mean_token_accuracy": 0.9031769931316376, "num_tokens": 162601030.0, "sample_num_tokens": 9354.5, "step": 4599, "total_num_tokens": 162638448.0, "z_loss": 0.0005002575926482677 }, { "copy_logits_max": -0.3006748557090759, "copy_logits_min": -750000128.0, "copy_num_tokens": 542.0625, "epoch": 0.9394945111054378, "gen_logits_max": 4.536306858062744, "gen_logits_mean": -14.285172462463379, "gen_logits_min": -26.708553314208984, "gen_logits_std": 2.912557601928711, "gen_loss": 0.25391197204589844, "grad_norm": 0.3849089701044311, "learning_rate": 2.5119157894736844e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9957238435745239, "mean_gen_accuracy": 0.8748359829187393, "mean_token_accuracy": 0.9068281054496765, "num_tokens": 162885824.0, "sample_num_tokens": 9099.0, "step": 4600, "total_num_tokens": 162922220.0, "z_loss": 0.0005199452862143517 }, { "copy_logits_max": 1.8064087629318237, "copy_logits_min": -687500032.0, "copy_num_tokens": 830.875, "epoch": 0.9396987490426346, "gen_logits_max": 4.529182434082031, "gen_logits_mean": -13.883682250976562, "gen_logits_min": -26.017179489135742, "gen_logits_std": 2.866241455078125, "gen_loss": 0.29774412512779236, "grad_norm": 0.37719211996686114, "learning_rate": 2.5117894736842105e-05, "loss": 0.2983, "mean_copy_accuracy": 0.995638445019722, "mean_gen_accuracy": 0.8682329207658768, "mean_token_accuracy": 0.8998436629772186, "num_tokens": 163162655.0, "sample_num_tokens": 11405.25, "step": 4601, "total_num_tokens": 163208276.0, "z_loss": 0.0005938108661212027 }, { "copy_logits_max": 0.15933005511760712, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.25, "epoch": 0.9399029869798315, "gen_logits_max": 4.430456161499023, "gen_logits_mean": -14.712326049804688, "gen_logits_min": -26.898414611816406, "gen_logits_std": 2.8438076972961426, "gen_loss": 0.26503223180770874, "grad_norm": 0.39090872796479453, "learning_rate": 2.511663157894737e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9962527304887772, "mean_gen_accuracy": 0.8652531504631042, "mean_token_accuracy": 0.8997192978858948, "num_tokens": 163468403.0, "sample_num_tokens": 9096.75, "step": 4602, "total_num_tokens": 163504790.0, "z_loss": 0.0005528968176804483 }, { "copy_logits_max": 1.6586525440216064, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.5, "epoch": 0.9401072249170284, "gen_logits_max": 5.994903564453125, "gen_logits_mean": -13.273365020751953, "gen_logits_min": -25.00667381286621, "gen_logits_std": 2.8647847175598145, "gen_loss": 0.3101169466972351, "grad_norm": 0.37646143433922363, "learning_rate": 2.511536842105263e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9975527971982956, "mean_gen_accuracy": 0.870092898607254, "mean_token_accuracy": 0.9056936055421829, "num_tokens": 163760448.0, "sample_num_tokens": 8122.0, "step": 4603, "total_num_tokens": 163792936.0, "z_loss": 0.0005792375886812806 }, { "copy_logits_max": -0.3187914490699768, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.875, "epoch": 0.9403114628542252, "gen_logits_max": 5.335387229919434, "gen_logits_mean": -14.002022743225098, "gen_logits_min": -26.12930679321289, "gen_logits_std": 2.796457290649414, "gen_loss": 0.3055623471736908, "grad_norm": 0.3790813858493301, "learning_rate": 2.5114105263157895e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9951421916484833, "mean_gen_accuracy": 0.8713771402835846, "mean_token_accuracy": 0.9028719067573547, "num_tokens": 164044300.0, "sample_num_tokens": 6714.5, "step": 4604, "total_num_tokens": 164071158.0, "z_loss": 0.0005771028809249401 }, { "copy_logits_max": -1.7787928581237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.6875, "epoch": 0.940515700791422, "gen_logits_max": 4.4785308837890625, "gen_logits_mean": -15.534061431884766, "gen_logits_min": -27.387142181396484, "gen_logits_std": 2.831676483154297, "gen_loss": 0.3312082290649414, "grad_norm": 0.3851792763398019, "learning_rate": 2.511284210526316e-05, "loss": 0.308, "mean_copy_accuracy": 0.9960688054561615, "mean_gen_accuracy": 0.8654643446207047, "mean_token_accuracy": 0.897991731762886, "num_tokens": 164320429.0, "sample_num_tokens": 9465.25, "step": 4605, "total_num_tokens": 164358290.0, "z_loss": 0.0006473437533713877 }, { "copy_logits_max": 3.22066593170166, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.375, "epoch": 0.9407199387286188, "gen_logits_max": 5.608049392700195, "gen_logits_mean": -12.465545654296875, "gen_logits_min": -24.505748748779297, "gen_logits_std": 2.8240599632263184, "gen_loss": 0.330554723739624, "grad_norm": 0.3827801496349502, "learning_rate": 2.5111578947368423e-05, "loss": 0.3062, "mean_copy_accuracy": 0.995830625295639, "mean_gen_accuracy": 0.8665965646505356, "mean_token_accuracy": 0.8982043266296387, "num_tokens": 164608383.0, "sample_num_tokens": 7982.25, "step": 4606, "total_num_tokens": 164640312.0, "z_loss": 0.0006476336857303977 }, { "copy_logits_max": -1.4107801914215088, "copy_logits_min": -625000064.0, "copy_num_tokens": 577.0625, "epoch": 0.9409241766658156, "gen_logits_max": 4.385744094848633, "gen_logits_mean": -14.644932746887207, "gen_logits_min": -26.823108673095703, "gen_logits_std": 2.8362083435058594, "gen_loss": 0.24265442788600922, "grad_norm": 0.39581872224383985, "learning_rate": 2.5110315789473684e-05, "loss": 0.3184, "mean_copy_accuracy": 0.9960346668958664, "mean_gen_accuracy": 0.8593978434801102, "mean_token_accuracy": 0.8926938027143478, "num_tokens": 164872103.0, "sample_num_tokens": 8517.75, "step": 4607, "total_num_tokens": 164906174.0, "z_loss": 0.0005175912519916892 }, { "copy_logits_max": -0.1819063127040863, "copy_logits_min": -687500032.0, "copy_num_tokens": 378.0625, "epoch": 0.9411284146030126, "gen_logits_max": 5.321658134460449, "gen_logits_mean": -13.641153335571289, "gen_logits_min": -25.81151008605957, "gen_logits_std": 2.8014447689056396, "gen_loss": 0.3389168977737427, "grad_norm": 0.354372669181811, "learning_rate": 2.510905263157895e-05, "loss": 0.2926, "mean_copy_accuracy": 0.9966099560260773, "mean_gen_accuracy": 0.8745176643133163, "mean_token_accuracy": 0.9021451771259308, "num_tokens": 165145238.0, "sample_num_tokens": 8253.5, "step": 4608, "total_num_tokens": 165178252.0, "z_loss": 0.0006325009744614363 }, { "copy_logits_max": -2.573399305343628, "copy_logits_min": -750000064.0, "copy_num_tokens": 333.375, "epoch": 0.9413326525402094, "gen_logits_max": 5.62981653213501, "gen_logits_mean": -14.014571189880371, "gen_logits_min": -25.952133178710938, "gen_logits_std": 2.8130111694335938, "gen_loss": 0.35669177770614624, "grad_norm": 0.38566657005938054, "learning_rate": 2.510778947368421e-05, "loss": 0.3204, "mean_copy_accuracy": 0.9946680217981339, "mean_gen_accuracy": 0.8614581823348999, "mean_token_accuracy": 0.892653226852417, "num_tokens": 165419822.0, "sample_num_tokens": 7590.0, "step": 4609, "total_num_tokens": 165450182.0, "z_loss": 0.0006605752860195935 }, { "copy_logits_max": 0.028774619102478027, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.5, "epoch": 0.9415368904774062, "gen_logits_max": 4.434332847595215, "gen_logits_mean": -14.961475372314453, "gen_logits_min": -26.838342666625977, "gen_logits_std": 2.849529266357422, "gen_loss": 0.34310781955718994, "grad_norm": 0.40204646276590394, "learning_rate": 2.5106526315789474e-05, "loss": 0.3243, "mean_copy_accuracy": 0.9950589239597321, "mean_gen_accuracy": 0.8601190894842148, "mean_token_accuracy": 0.8910552263259888, "num_tokens": 165697834.0, "sample_num_tokens": 8607.5, "step": 4610, "total_num_tokens": 165732264.0, "z_loss": 0.0006723455153405666 }, { "copy_logits_max": -2.558640480041504, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.3125, "epoch": 0.941741128414603, "gen_logits_max": 5.087551593780518, "gen_logits_mean": -15.155508041381836, "gen_logits_min": -26.81167221069336, "gen_logits_std": 2.8521790504455566, "gen_loss": 0.31400346755981445, "grad_norm": 0.3858126271452945, "learning_rate": 2.5105263157894738e-05, "loss": 0.3204, "mean_copy_accuracy": 0.995927706360817, "mean_gen_accuracy": 0.8606602102518082, "mean_token_accuracy": 0.8926047384738922, "num_tokens": 165960426.0, "sample_num_tokens": 8967.5, "step": 4611, "total_num_tokens": 165996296.0, "z_loss": 0.0006701051024720073 }, { "copy_logits_max": 3.734447479248047, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.75, "epoch": 0.9419453663517998, "gen_logits_max": 6.968783378601074, "gen_logits_mean": -12.245508193969727, "gen_logits_min": -24.069599151611328, "gen_logits_std": 2.8294546604156494, "gen_loss": 0.33655720949172974, "grad_norm": 0.41418764214255965, "learning_rate": 2.5104e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9947202950716019, "mean_gen_accuracy": 0.8665625005960464, "mean_token_accuracy": 0.8947138339281082, "num_tokens": 166234259.0, "sample_num_tokens": 8673.75, "step": 4612, "total_num_tokens": 166268954.0, "z_loss": 0.0006850335048511624 }, { "copy_logits_max": -2.1788289546966553, "copy_logits_min": -687500032.0, "copy_num_tokens": 332.875, "epoch": 0.9421496042889966, "gen_logits_max": 5.426692962646484, "gen_logits_mean": -14.379206657409668, "gen_logits_min": -26.044719696044922, "gen_logits_std": 2.7923669815063477, "gen_loss": 0.31697484850883484, "grad_norm": 0.41813817057623015, "learning_rate": 2.5102736842105267e-05, "loss": 0.317, "mean_copy_accuracy": 0.9952205270528793, "mean_gen_accuracy": 0.8636088222265244, "mean_token_accuracy": 0.892561674118042, "num_tokens": 166496501.0, "sample_num_tokens": 7423.75, "step": 4613, "total_num_tokens": 166526196.0, "z_loss": 0.0006792813073843718 }, { "copy_logits_max": -0.28466129302978516, "copy_logits_min": -750000000.0, "copy_num_tokens": 644.5625, "epoch": 0.9423538422261936, "gen_logits_max": 5.53714656829834, "gen_logits_mean": -13.294351577758789, "gen_logits_min": -25.15172004699707, "gen_logits_std": 2.8298633098602295, "gen_loss": 0.27117201685905457, "grad_norm": 0.39568853545184757, "learning_rate": 2.5101473684210527e-05, "loss": 0.3322, "mean_copy_accuracy": 0.9958852678537369, "mean_gen_accuracy": 0.8625205159187317, "mean_token_accuracy": 0.8948435932397842, "num_tokens": 166764161.0, "sample_num_tokens": 10025.25, "step": 4614, "total_num_tokens": 166804262.0, "z_loss": 0.000562670174986124 }, { "copy_logits_max": -4.689762115478516, "copy_logits_min": -750000000.0, "copy_num_tokens": 282.875, "epoch": 0.9425580801633904, "gen_logits_max": 5.560614585876465, "gen_logits_mean": -13.12380599975586, "gen_logits_min": -24.41214370727539, "gen_logits_std": 2.7484021186828613, "gen_loss": 0.3377532362937927, "grad_norm": 0.3835472292426646, "learning_rate": 2.5100210526315792e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9957950860261917, "mean_gen_accuracy": 0.8705692738294601, "mean_token_accuracy": 0.9006022363901138, "num_tokens": 167028285.0, "sample_num_tokens": 6610.75, "step": 4615, "total_num_tokens": 167054728.0, "z_loss": 0.0006133767310529947 }, { "copy_logits_max": -4.1280293464660645, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.0, "epoch": 0.9427623181005872, "gen_logits_max": 4.817105770111084, "gen_logits_mean": -13.895671844482422, "gen_logits_min": -25.432449340820312, "gen_logits_std": 2.7778995037078857, "gen_loss": 0.3102257549762726, "grad_norm": 0.406408014694806, "learning_rate": 2.5098947368421053e-05, "loss": 0.3174, "mean_copy_accuracy": 0.9958154559135437, "mean_gen_accuracy": 0.8636851757764816, "mean_token_accuracy": 0.893942579627037, "num_tokens": 167292319.0, "sample_num_tokens": 9189.75, "step": 4616, "total_num_tokens": 167329078.0, "z_loss": 0.0005762012442573905 }, { "copy_logits_max": -1.7798951864242554, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.0625, "epoch": 0.942966556037784, "gen_logits_max": 3.484701633453369, "gen_logits_mean": -15.529645919799805, "gen_logits_min": -27.26650047302246, "gen_logits_std": 2.8495192527770996, "gen_loss": 0.2833991050720215, "grad_norm": 0.4352217449868064, "learning_rate": 2.5097684210526317e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9963577091693878, "mean_gen_accuracy": 0.8620842695236206, "mean_token_accuracy": 0.8983576595783234, "num_tokens": 167591788.0, "sample_num_tokens": 7575.5, "step": 4617, "total_num_tokens": 167622090.0, "z_loss": 0.0005258836317807436 }, { "copy_logits_max": -2.240741729736328, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.5625, "epoch": 0.9431707939749808, "gen_logits_max": 4.960527420043945, "gen_logits_mean": -14.835294723510742, "gen_logits_min": -26.16928482055664, "gen_logits_std": 2.8197576999664307, "gen_loss": 0.36340370774269104, "grad_norm": 0.40781146827404874, "learning_rate": 2.5096421052631578e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9948104918003082, "mean_gen_accuracy": 0.8615633845329285, "mean_token_accuracy": 0.8957835733890533, "num_tokens": 167859820.0, "sample_num_tokens": 7438.0, "step": 4618, "total_num_tokens": 167889572.0, "z_loss": 0.0006560152978636324 }, { "copy_logits_max": -3.3750901222229004, "copy_logits_min": -750000000.0, "copy_num_tokens": 236.75, "epoch": 0.9433750319121776, "gen_logits_max": 5.694421291351318, "gen_logits_mean": -14.134912490844727, "gen_logits_min": -25.41991424560547, "gen_logits_std": 2.8327698707580566, "gen_loss": 0.3246748149394989, "grad_norm": 0.3832205110657461, "learning_rate": 2.5095157894736842e-05, "loss": 0.311, "mean_copy_accuracy": 0.9950737804174423, "mean_gen_accuracy": 0.866699829697609, "mean_token_accuracy": 0.8954396396875381, "num_tokens": 168123856.0, "sample_num_tokens": 7004.0, "step": 4619, "total_num_tokens": 168151872.0, "z_loss": 0.0005588927306234837 }, { "copy_logits_max": -2.970829725265503, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.125, "epoch": 0.9435792698493746, "gen_logits_max": 5.3938188552856445, "gen_logits_mean": -13.521820068359375, "gen_logits_min": -25.13284683227539, "gen_logits_std": 2.844709873199463, "gen_loss": 0.29072314500808716, "grad_norm": 0.3885057679582521, "learning_rate": 2.5093894736842103e-05, "loss": 0.3205, "mean_copy_accuracy": 0.9965887516736984, "mean_gen_accuracy": 0.8641767054796219, "mean_token_accuracy": 0.8960560411214828, "num_tokens": 168393379.0, "sample_num_tokens": 9403.25, "step": 4620, "total_num_tokens": 168430992.0, "z_loss": 0.000498587847687304 }, { "copy_logits_max": -3.121366500854492, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.375, "epoch": 0.9437835077865714, "gen_logits_max": 4.690855026245117, "gen_logits_mean": -14.6146879196167, "gen_logits_min": -25.74259376525879, "gen_logits_std": 2.8059539794921875, "gen_loss": 0.31717008352279663, "grad_norm": 0.46971117527150563, "learning_rate": 2.509263157894737e-05, "loss": 0.3152, "mean_copy_accuracy": 0.994817465543747, "mean_gen_accuracy": 0.8619229644536972, "mean_token_accuracy": 0.8969776481389999, "num_tokens": 168655269.0, "sample_num_tokens": 6944.25, "step": 4621, "total_num_tokens": 168683046.0, "z_loss": 0.0005477872909978032 }, { "copy_logits_max": -4.156181335449219, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.75, "epoch": 0.9439877457237682, "gen_logits_max": 4.244793891906738, "gen_logits_mean": -15.483186721801758, "gen_logits_min": -27.10641098022461, "gen_logits_std": 2.8695833683013916, "gen_loss": 0.3112296462059021, "grad_norm": 0.35648498953422003, "learning_rate": 2.5091368421052632e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9966843128204346, "mean_gen_accuracy": 0.8729163259267807, "mean_token_accuracy": 0.9008872807025909, "num_tokens": 168925709.0, "sample_num_tokens": 7485.25, "step": 4622, "total_num_tokens": 168955650.0, "z_loss": 0.0005875692586414516 }, { "copy_logits_max": -2.3338706493377686, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.375, "epoch": 0.944191983660965, "gen_logits_max": 4.376799583435059, "gen_logits_mean": -14.911848068237305, "gen_logits_min": -26.76264762878418, "gen_logits_std": 2.8545713424682617, "gen_loss": 0.29837241768836975, "grad_norm": 0.4328211272219281, "learning_rate": 2.5090105263157896e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9962126463651657, "mean_gen_accuracy": 0.8611403852701187, "mean_token_accuracy": 0.8986199498176575, "num_tokens": 169178339.0, "sample_num_tokens": 8181.25, "step": 4623, "total_num_tokens": 169211064.0, "z_loss": 0.0006041190936230123 }, { "copy_logits_max": -0.7057893872261047, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.375, "epoch": 0.9443962215981618, "gen_logits_max": 4.433279037475586, "gen_logits_mean": -15.121294021606445, "gen_logits_min": -26.83364486694336, "gen_logits_std": 2.857396125793457, "gen_loss": 0.30446648597717285, "grad_norm": 0.415176589262165, "learning_rate": 2.508884210526316e-05, "loss": 0.3101, "mean_copy_accuracy": 0.9951023906469345, "mean_gen_accuracy": 0.8614678978919983, "mean_token_accuracy": 0.8970865905284882, "num_tokens": 169433831.0, "sample_num_tokens": 7236.25, "step": 4624, "total_num_tokens": 169462776.0, "z_loss": 0.0005758075858466327 }, { "copy_logits_max": -0.4584061801433563, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.8125, "epoch": 0.9446004595353586, "gen_logits_max": 4.475213050842285, "gen_logits_mean": -14.068580627441406, "gen_logits_min": -25.743240356445312, "gen_logits_std": 2.813795566558838, "gen_loss": 0.2620220482349396, "grad_norm": 0.4221272651610682, "learning_rate": 2.508757894736842e-05, "loss": 0.3109, "mean_copy_accuracy": 0.9942051023244858, "mean_gen_accuracy": 0.8650938719511032, "mean_token_accuracy": 0.8950304985046387, "num_tokens": 169692576.0, "sample_num_tokens": 8986.0, "step": 4625, "total_num_tokens": 169728520.0, "z_loss": 0.0004676161624956876 }, { "copy_logits_max": -2.3990976810455322, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.6875, "epoch": 0.9448046974725556, "gen_logits_max": 5.085546493530273, "gen_logits_mean": -13.929258346557617, "gen_logits_min": -25.45110321044922, "gen_logits_std": 2.8340466022491455, "gen_loss": 0.3053191900253296, "grad_norm": 0.3993448237058619, "learning_rate": 2.5086315789473686e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9948409050703049, "mean_gen_accuracy": 0.8668759167194366, "mean_token_accuracy": 0.8991059958934784, "num_tokens": 169964422.0, "sample_num_tokens": 9323.0, "step": 4626, "total_num_tokens": 170001714.0, "z_loss": 0.0005892497720196843 }, { "copy_logits_max": -1.9722726345062256, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.0625, "epoch": 0.9450089354097524, "gen_logits_max": 5.850712299346924, "gen_logits_mean": -13.600412368774414, "gen_logits_min": -25.22372055053711, "gen_logits_std": 2.8697919845581055, "gen_loss": 0.32556718587875366, "grad_norm": 0.4529580493037452, "learning_rate": 2.5085052631578947e-05, "loss": 0.3143, "mean_copy_accuracy": 0.994576945900917, "mean_gen_accuracy": 0.8672597259283066, "mean_token_accuracy": 0.8953362554311752, "num_tokens": 170217649.0, "sample_num_tokens": 6688.75, "step": 4627, "total_num_tokens": 170244404.0, "z_loss": 0.0006251013837754726 }, { "copy_logits_max": -4.374683380126953, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.5, "epoch": 0.9452131733469492, "gen_logits_max": 2.95457124710083, "gen_logits_mean": -16.896217346191406, "gen_logits_min": -28.892684936523438, "gen_logits_std": 2.918214797973633, "gen_loss": 0.2546903192996979, "grad_norm": 0.35213354848874806, "learning_rate": 2.508378947368421e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9954881221055984, "mean_gen_accuracy": 0.8704062551259995, "mean_token_accuracy": 0.9022780805826187, "num_tokens": 170482847.0, "sample_num_tokens": 8308.25, "step": 4628, "total_num_tokens": 170516080.0, "z_loss": 0.0005219090380705893 }, { "copy_logits_max": -4.488614082336426, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.9375, "epoch": 0.945417411284146, "gen_logits_max": 4.5998735427856445, "gen_logits_mean": -15.799614906311035, "gen_logits_min": -27.662761688232422, "gen_logits_std": 2.890645980834961, "gen_loss": 0.28974902629852295, "grad_norm": 0.3743840170518196, "learning_rate": 2.5082526315789475e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9954342842102051, "mean_gen_accuracy": 0.8733270764350891, "mean_token_accuracy": 0.9036800265312195, "num_tokens": 170764365.0, "sample_num_tokens": 8419.25, "step": 4629, "total_num_tokens": 170798042.0, "z_loss": 0.0005782660446129739 }, { "copy_logits_max": -2.24291729927063, "copy_logits_min": -625000064.0, "copy_num_tokens": 691.5625, "epoch": 0.9456216492213428, "gen_logits_max": 4.731440544128418, "gen_logits_mean": -13.85351848602295, "gen_logits_min": -26.099361419677734, "gen_logits_std": 2.8938589096069336, "gen_loss": 0.28898173570632935, "grad_norm": 0.3966980041005789, "learning_rate": 2.508126315789474e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9951976388692856, "mean_gen_accuracy": 0.8653773069381714, "mean_token_accuracy": 0.9026061296463013, "num_tokens": 171054621.0, "sample_num_tokens": 9048.25, "step": 4630, "total_num_tokens": 171090814.0, "z_loss": 0.000607885594945401 }, { "copy_logits_max": -1.6253249645233154, "copy_logits_min": -687500032.0, "copy_num_tokens": 678.8125, "epoch": 0.9458258871585397, "gen_logits_max": 5.222893714904785, "gen_logits_mean": -13.742738723754883, "gen_logits_min": -25.721744537353516, "gen_logits_std": 2.8822550773620605, "gen_loss": 0.2872544825077057, "grad_norm": 0.35401332196229834, "learning_rate": 2.508e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9955686330795288, "mean_gen_accuracy": 0.873151883482933, "mean_token_accuracy": 0.9054149240255356, "num_tokens": 171345089.0, "sample_num_tokens": 10559.25, "step": 4631, "total_num_tokens": 171387326.0, "z_loss": 0.0005943140131421387 }, { "copy_logits_max": -2.01265549659729, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.5625, "epoch": 0.9460301250957366, "gen_logits_max": 5.36085844039917, "gen_logits_mean": -14.290983200073242, "gen_logits_min": -26.44719886779785, "gen_logits_std": 2.8710405826568604, "gen_loss": 0.3229014277458191, "grad_norm": 0.41219781877372935, "learning_rate": 2.5078736842105265e-05, "loss": 0.3049, "mean_copy_accuracy": 0.994217038154602, "mean_gen_accuracy": 0.86997489631176, "mean_token_accuracy": 0.8990190327167511, "num_tokens": 171615934.0, "sample_num_tokens": 8773.0, "step": 4632, "total_num_tokens": 171651026.0, "z_loss": 0.0007169035961851478 }, { "copy_logits_max": -0.7949502468109131, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.3125, "epoch": 0.9462343630329334, "gen_logits_max": 4.72788667678833, "gen_logits_mean": -14.220791816711426, "gen_logits_min": -26.048946380615234, "gen_logits_std": 2.842611074447632, "gen_loss": 0.2716881334781647, "grad_norm": 0.3999012007570688, "learning_rate": 2.5077473684210526e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9947453737258911, "mean_gen_accuracy": 0.869753435254097, "mean_token_accuracy": 0.8968741446733475, "num_tokens": 171870901.0, "sample_num_tokens": 9253.75, "step": 4633, "total_num_tokens": 171907916.0, "z_loss": 0.0005824335385113955 }, { "copy_logits_max": -3.504983425140381, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.75, "epoch": 0.9464386009701302, "gen_logits_max": 5.399136543273926, "gen_logits_mean": -15.017221450805664, "gen_logits_min": -27.072813034057617, "gen_logits_std": 2.857412099838257, "gen_loss": 0.30393439531326294, "grad_norm": 0.41337323558927774, "learning_rate": 2.507621052631579e-05, "loss": 0.31, "mean_copy_accuracy": 0.995192676782608, "mean_gen_accuracy": 0.8612810373306274, "mean_token_accuracy": 0.8975035548210144, "num_tokens": 172139646.0, "sample_num_tokens": 7736.0, "step": 4634, "total_num_tokens": 172170590.0, "z_loss": 0.0006662119412794709 }, { "copy_logits_max": -5.760004997253418, "copy_logits_min": -687500032.0, "copy_num_tokens": 187.125, "epoch": 0.946642838907327, "gen_logits_max": 5.5127668380737305, "gen_logits_mean": -15.270659446716309, "gen_logits_min": -26.586153030395508, "gen_logits_std": 2.814072608947754, "gen_loss": 0.3421359956264496, "grad_norm": 0.39787142872673564, "learning_rate": 2.507494736842105e-05, "loss": 0.3175, "mean_copy_accuracy": 0.9943967908620834, "mean_gen_accuracy": 0.8671849817037582, "mean_token_accuracy": 0.8948951214551926, "num_tokens": 172392035.0, "sample_num_tokens": 6189.75, "step": 4635, "total_num_tokens": 172416794.0, "z_loss": 0.0006742857513017952 }, { "copy_logits_max": -0.7366053462028503, "copy_logits_min": -625000064.0, "copy_num_tokens": 599.125, "epoch": 0.9468470768445238, "gen_logits_max": 5.429277420043945, "gen_logits_mean": -13.036311149597168, "gen_logits_min": -24.98731803894043, "gen_logits_std": 2.8365511894226074, "gen_loss": 0.3246071934700012, "grad_norm": 0.36952505692307447, "learning_rate": 2.5073684210526315e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9948495328426361, "mean_gen_accuracy": 0.8720078617334366, "mean_token_accuracy": 0.9045035541057587, "num_tokens": 172651620.0, "sample_num_tokens": 8791.0, "step": 4636, "total_num_tokens": 172686784.0, "z_loss": 0.0007166160503402352 }, { "copy_logits_max": -3.393073797225952, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.4375, "epoch": 0.9470513147817207, "gen_logits_max": 4.055999755859375, "gen_logits_mean": -16.260623931884766, "gen_logits_min": -28.145200729370117, "gen_logits_std": 2.905625820159912, "gen_loss": 0.2795359194278717, "grad_norm": 0.35119203793500436, "learning_rate": 2.507242105263158e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9962462186813354, "mean_gen_accuracy": 0.8774109333753586, "mean_token_accuracy": 0.9091317355632782, "num_tokens": 172928316.0, "sample_num_tokens": 9078.0, "step": 4637, "total_num_tokens": 172964628.0, "z_loss": 0.0005663746269419789 }, { "copy_logits_max": -2.4079394340515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.1875, "epoch": 0.9472555527189176, "gen_logits_max": 5.141711235046387, "gen_logits_mean": -14.229547500610352, "gen_logits_min": -26.000598907470703, "gen_logits_std": 2.8338115215301514, "gen_loss": 0.32386165857315063, "grad_norm": 0.3916387881150346, "learning_rate": 2.5071157894736844e-05, "loss": 0.2988, "mean_copy_accuracy": 0.9960889220237732, "mean_gen_accuracy": 0.8717695325613022, "mean_token_accuracy": 0.9009836763143539, "num_tokens": 173207495.0, "sample_num_tokens": 8095.25, "step": 4638, "total_num_tokens": 173239876.0, "z_loss": 0.0006245817057788372 }, { "copy_logits_max": -3.073026657104492, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.125, "epoch": 0.9474597906561144, "gen_logits_max": 6.013237476348877, "gen_logits_mean": -14.093247413635254, "gen_logits_min": -25.68745231628418, "gen_logits_std": 2.8311753273010254, "gen_loss": 0.34486937522888184, "grad_norm": 0.3900917463712889, "learning_rate": 2.5069894736842108e-05, "loss": 0.3392, "mean_copy_accuracy": 0.995375782251358, "mean_gen_accuracy": 0.8611356317996979, "mean_token_accuracy": 0.8874451071023941, "num_tokens": 173465107.0, "sample_num_tokens": 8149.75, "step": 4639, "total_num_tokens": 173497706.0, "z_loss": 0.0006434716051444411 }, { "copy_logits_max": -1.1940827369689941, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.75, "epoch": 0.9476640285933112, "gen_logits_max": 5.565850257873535, "gen_logits_mean": -13.82646656036377, "gen_logits_min": -25.779481887817383, "gen_logits_std": 2.8447182178497314, "gen_loss": 0.2918921709060669, "grad_norm": 0.4327209271399697, "learning_rate": 2.506863157894737e-05, "loss": 0.3352, "mean_copy_accuracy": 0.9945491999387741, "mean_gen_accuracy": 0.8582299798727036, "mean_token_accuracy": 0.886315330862999, "num_tokens": 173702547.0, "sample_num_tokens": 8137.25, "step": 4640, "total_num_tokens": 173735096.0, "z_loss": 0.0006255528423935175 }, { "copy_logits_max": -2.411353826522827, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.625, "epoch": 0.947868266530508, "gen_logits_max": 5.617356300354004, "gen_logits_mean": -14.424833297729492, "gen_logits_min": -25.91398811340332, "gen_logits_std": 2.8251941204071045, "gen_loss": 0.31372371315956116, "grad_norm": 0.3953448488469012, "learning_rate": 2.5067368421052633e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9954947829246521, "mean_gen_accuracy": 0.8728417009115219, "mean_token_accuracy": 0.9013935923576355, "num_tokens": 173972080.0, "sample_num_tokens": 9155.0, "step": 4641, "total_num_tokens": 174008700.0, "z_loss": 0.0006311364704743028 }, { "copy_logits_max": -2.3603904247283936, "copy_logits_min": -687500032.0, "copy_num_tokens": 324.125, "epoch": 0.9480725044677049, "gen_logits_max": 5.704127788543701, "gen_logits_mean": -14.276472091674805, "gen_logits_min": -26.186540603637695, "gen_logits_std": 2.852175235748291, "gen_loss": 0.3311160206794739, "grad_norm": 0.40983280711498987, "learning_rate": 2.5066105263157894e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9935370236635208, "mean_gen_accuracy": 0.8736950904130936, "mean_token_accuracy": 0.8975007832050323, "num_tokens": 174218079.0, "sample_num_tokens": 7830.75, "step": 4642, "total_num_tokens": 174249402.0, "z_loss": 0.0006647096015512943 }, { "copy_logits_max": -1.046777367591858, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.3125, "epoch": 0.9482767424049017, "gen_logits_max": 6.166934490203857, "gen_logits_mean": -12.716779708862305, "gen_logits_min": -24.339683532714844, "gen_logits_std": 2.830399751663208, "gen_loss": 0.35553133487701416, "grad_norm": 0.4195940167845275, "learning_rate": 2.506484210526316e-05, "loss": 0.3323, "mean_copy_accuracy": 0.9940008074045181, "mean_gen_accuracy": 0.861062690615654, "mean_token_accuracy": 0.8900022208690643, "num_tokens": 174500120.0, "sample_num_tokens": 7132.0, "step": 4643, "total_num_tokens": 174528648.0, "z_loss": 0.0006730963941663504 }, { "copy_logits_max": -2.4722535610198975, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.5, "epoch": 0.9484809803420985, "gen_logits_max": 6.2324724197387695, "gen_logits_mean": -12.838667869567871, "gen_logits_min": -24.491703033447266, "gen_logits_std": 2.8869071006774902, "gen_loss": 0.2955860495567322, "grad_norm": 0.3574519311196454, "learning_rate": 2.506357894736842e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9965413361787796, "mean_gen_accuracy": 0.8729158639907837, "mean_token_accuracy": 0.906666949391365, "num_tokens": 174783123.0, "sample_num_tokens": 8050.25, "step": 4644, "total_num_tokens": 174815324.0, "z_loss": 0.0005337439943104982 }, { "copy_logits_max": -3.6254982948303223, "copy_logits_min": -687500032.0, "copy_num_tokens": 726.5625, "epoch": 0.9486852182792954, "gen_logits_max": 3.571366310119629, "gen_logits_mean": -15.480158805847168, "gen_logits_min": -27.16732406616211, "gen_logits_std": 2.8584346771240234, "gen_loss": 0.27997922897338867, "grad_norm": 0.39226288223904654, "learning_rate": 2.5062315789473684e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9959373325109482, "mean_gen_accuracy": 0.8711338937282562, "mean_token_accuracy": 0.9076181352138519, "num_tokens": 175068845.0, "sample_num_tokens": 9427.25, "step": 4645, "total_num_tokens": 175106554.0, "z_loss": 0.0005499987164512277 }, { "copy_logits_max": -3.8406150341033936, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.3125, "epoch": 0.9488894562164922, "gen_logits_max": 5.273423194885254, "gen_logits_mean": -13.97597885131836, "gen_logits_min": -24.988935470581055, "gen_logits_std": 2.7730329036712646, "gen_loss": 0.3559253215789795, "grad_norm": 0.4107745124622291, "learning_rate": 2.5061052631578948e-05, "loss": 0.3269, "mean_copy_accuracy": 0.9961278438568115, "mean_gen_accuracy": 0.8592305779457092, "mean_token_accuracy": 0.8900883197784424, "num_tokens": 175324829.0, "sample_num_tokens": 7852.25, "step": 4646, "total_num_tokens": 175356238.0, "z_loss": 0.000625533633865416 }, { "copy_logits_max": -0.43878650665283203, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.5, "epoch": 0.949093694153689, "gen_logits_max": 4.977396011352539, "gen_logits_mean": -13.289981842041016, "gen_logits_min": -25.625926971435547, "gen_logits_std": 2.8420324325561523, "gen_loss": 0.286639928817749, "grad_norm": 0.4124295810468036, "learning_rate": 2.5059789473684212e-05, "loss": 0.311, "mean_copy_accuracy": 0.9958503246307373, "mean_gen_accuracy": 0.8612070977687836, "mean_token_accuracy": 0.8959370404481888, "num_tokens": 175567755.0, "sample_num_tokens": 9446.25, "step": 4647, "total_num_tokens": 175605540.0, "z_loss": 0.000523414695635438 }, { "copy_logits_max": -0.08654609322547913, "copy_logits_min": -687500032.0, "copy_num_tokens": 564.6875, "epoch": 0.9492979320908859, "gen_logits_max": 5.344115257263184, "gen_logits_mean": -12.651724815368652, "gen_logits_min": -24.411392211914062, "gen_logits_std": 2.833859920501709, "gen_loss": 0.2728419005870819, "grad_norm": 0.38551078249951204, "learning_rate": 2.5058526315789473e-05, "loss": 0.2897, "mean_copy_accuracy": 0.9959379583597183, "mean_gen_accuracy": 0.8706590384244919, "mean_token_accuracy": 0.9031905382871628, "num_tokens": 175848923.0, "sample_num_tokens": 9020.25, "step": 4648, "total_num_tokens": 175885004.0, "z_loss": 0.00046060694148764014 }, { "copy_logits_max": -1.1755059957504272, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.125, "epoch": 0.9495021700280827, "gen_logits_max": 4.639558792114258, "gen_logits_mean": -14.239311218261719, "gen_logits_min": -26.48175621032715, "gen_logits_std": 2.8426129817962646, "gen_loss": 0.2848397493362427, "grad_norm": 0.3814395022575889, "learning_rate": 2.5057263157894738e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9949052929878235, "mean_gen_accuracy": 0.875485360622406, "mean_token_accuracy": 0.9043117612600327, "num_tokens": 176127020.0, "sample_num_tokens": 8219.0, "step": 4649, "total_num_tokens": 176159896.0, "z_loss": 0.00050621188711375 }, { "copy_logits_max": -2.2200887203216553, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.0, "epoch": 0.9497064079652795, "gen_logits_max": 4.144126892089844, "gen_logits_mean": -15.709883689880371, "gen_logits_min": -27.331310272216797, "gen_logits_std": 2.8612217903137207, "gen_loss": 0.3280833959579468, "grad_norm": 0.3693122822114604, "learning_rate": 2.5056000000000002e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9970110654830933, "mean_gen_accuracy": 0.8697943538427353, "mean_token_accuracy": 0.9009475857019424, "num_tokens": 176402071.0, "sample_num_tokens": 8389.75, "step": 4650, "total_num_tokens": 176435630.0, "z_loss": 0.0005678466404788196 }, { "copy_logits_max": -1.2853641510009766, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.5625, "epoch": 0.9499106459024764, "gen_logits_max": 4.953850746154785, "gen_logits_mean": -14.589019775390625, "gen_logits_min": -26.17667007446289, "gen_logits_std": 2.8235483169555664, "gen_loss": 0.27935469150543213, "grad_norm": 0.424052416154867, "learning_rate": 2.5054736842105263e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9965045601129532, "mean_gen_accuracy": 0.8789805322885513, "mean_token_accuracy": 0.9078156352043152, "num_tokens": 176677345.0, "sample_num_tokens": 7926.75, "step": 4651, "total_num_tokens": 176709052.0, "z_loss": 0.00049755530199036 }, { "copy_logits_max": 2.4239563941955566, "copy_logits_min": -750000000.0, "copy_num_tokens": 760.0, "epoch": 0.9501148838396732, "gen_logits_max": 4.742738723754883, "gen_logits_mean": -13.54699993133545, "gen_logits_min": -26.196252822875977, "gen_logits_std": 2.882359504699707, "gen_loss": 0.274207204580307, "grad_norm": 0.369671642636471, "learning_rate": 2.5053473684210527e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9963468909263611, "mean_gen_accuracy": 0.8658241778612137, "mean_token_accuracy": 0.9008518159389496, "num_tokens": 176958012.0, "sample_num_tokens": 10287.0, "step": 4652, "total_num_tokens": 176999160.0, "z_loss": 0.0005165271577425301 }, { "copy_logits_max": -0.4701230823993683, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.9375, "epoch": 0.9503191217768701, "gen_logits_max": 4.405195236206055, "gen_logits_mean": -15.29361629486084, "gen_logits_min": -27.647014617919922, "gen_logits_std": 2.8775599002838135, "gen_loss": 0.321410596370697, "grad_norm": 0.4165197133419478, "learning_rate": 2.5052210526315788e-05, "loss": 0.3227, "mean_copy_accuracy": 0.9956502467393875, "mean_gen_accuracy": 0.8594001531600952, "mean_token_accuracy": 0.8917046189308167, "num_tokens": 177228925.0, "sample_num_tokens": 7900.25, "step": 4653, "total_num_tokens": 177260526.0, "z_loss": 0.0006448164349421859 }, { "copy_logits_max": -0.759647011756897, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.375, "epoch": 0.9505233597140669, "gen_logits_max": 5.218064308166504, "gen_logits_mean": -14.052783012390137, "gen_logits_min": -26.081947326660156, "gen_logits_std": 2.7934303283691406, "gen_loss": 0.30487608909606934, "grad_norm": 0.39545273827398714, "learning_rate": 2.5050947368421056e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9960839599370956, "mean_gen_accuracy": 0.8700601607561111, "mean_token_accuracy": 0.9051203578710556, "num_tokens": 177503881.0, "sample_num_tokens": 9261.25, "step": 4654, "total_num_tokens": 177540926.0, "z_loss": 0.0006205156678333879 }, { "copy_logits_max": -0.6059632301330566, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.5625, "epoch": 0.9507275976512637, "gen_logits_max": 4.6368184089660645, "gen_logits_mean": -15.127029418945312, "gen_logits_min": -26.750452041625977, "gen_logits_std": 2.808722972869873, "gen_loss": 0.3288773000240326, "grad_norm": 0.37991919302758903, "learning_rate": 2.5049684210526317e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9954010397195816, "mean_gen_accuracy": 0.8730164915323257, "mean_token_accuracy": 0.8994072526693344, "num_tokens": 177761295.0, "sample_num_tokens": 8411.25, "step": 4655, "total_num_tokens": 177794940.0, "z_loss": 0.000670921232085675 }, { "copy_logits_max": 0.9016084671020508, "copy_logits_min": -750000064.0, "copy_num_tokens": 410.75, "epoch": 0.9509318355884605, "gen_logits_max": 5.279063701629639, "gen_logits_mean": -13.439461708068848, "gen_logits_min": -25.776363372802734, "gen_logits_std": 2.814889907836914, "gen_loss": 0.2501600384712219, "grad_norm": 0.40758959366815284, "learning_rate": 2.504842105263158e-05, "loss": 0.2826, "mean_copy_accuracy": 0.995262861251831, "mean_gen_accuracy": 0.8786442130804062, "mean_token_accuracy": 0.9068422317504883, "num_tokens": 178018158.0, "sample_num_tokens": 7673.0, "step": 4656, "total_num_tokens": 178048850.0, "z_loss": 0.0005698903696611524 }, { "copy_logits_max": 0.07964158058166504, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.4375, "epoch": 0.9511360735256574, "gen_logits_max": 5.975612640380859, "gen_logits_mean": -13.800107955932617, "gen_logits_min": -25.626388549804688, "gen_logits_std": 2.8217530250549316, "gen_loss": 0.32326170802116394, "grad_norm": 0.375400920638909, "learning_rate": 2.5047157894736842e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9963034689426422, "mean_gen_accuracy": 0.8709381520748138, "mean_token_accuracy": 0.9005321115255356, "num_tokens": 178302903.0, "sample_num_tokens": 8660.75, "step": 4657, "total_num_tokens": 178337546.0, "z_loss": 0.0006625625537708402 }, { "copy_logits_max": -0.4361220598220825, "copy_logits_min": -750000064.0, "copy_num_tokens": 540.125, "epoch": 0.9513403114628542, "gen_logits_max": 5.864489555358887, "gen_logits_mean": -13.276727676391602, "gen_logits_min": -24.845495223999023, "gen_logits_std": 2.788252353668213, "gen_loss": 0.3124167025089264, "grad_norm": 0.41364905776211053, "learning_rate": 2.5045894736842106e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9951854944229126, "mean_gen_accuracy": 0.8648021668195724, "mean_token_accuracy": 0.8955443054437637, "num_tokens": 178571799.0, "sample_num_tokens": 9383.75, "step": 4658, "total_num_tokens": 178609334.0, "z_loss": 0.000667208805680275 }, { "copy_logits_max": 0.08704040199518204, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.125, "epoch": 0.9515445494000511, "gen_logits_max": 4.987534523010254, "gen_logits_mean": -14.342228889465332, "gen_logits_min": -25.992504119873047, "gen_logits_std": 2.812191963195801, "gen_loss": 0.3209865391254425, "grad_norm": 0.4347018665456361, "learning_rate": 2.5044631578947367e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9955315589904785, "mean_gen_accuracy": 0.8680783063173294, "mean_token_accuracy": 0.8955103605985641, "num_tokens": 178825767.0, "sample_num_tokens": 7487.25, "step": 4659, "total_num_tokens": 178855716.0, "z_loss": 0.0006583426729775965 }, { "copy_logits_max": -2.0488195419311523, "copy_logits_min": -750000128.0, "copy_num_tokens": 410.625, "epoch": 0.9517487873372479, "gen_logits_max": 5.634120464324951, "gen_logits_mean": -13.751150131225586, "gen_logits_min": -25.42208480834961, "gen_logits_std": 2.834336757659912, "gen_loss": 0.3099345564842224, "grad_norm": 0.4327629220805829, "learning_rate": 2.504336842105263e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9947746247053146, "mean_gen_accuracy": 0.8735338598489761, "mean_token_accuracy": 0.9069496542215347, "num_tokens": 179100884.0, "sample_num_tokens": 7881.0, "step": 4660, "total_num_tokens": 179132408.0, "z_loss": 0.0006605290691368282 }, { "copy_logits_max": 0.5845835208892822, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 0.9519530252744447, "gen_logits_max": 5.390941143035889, "gen_logits_mean": -13.301445960998535, "gen_logits_min": -24.689149856567383, "gen_logits_std": 2.754521369934082, "gen_loss": 0.3178289532661438, "grad_norm": 0.38053127210980486, "learning_rate": 2.5042105263157893e-05, "loss": 0.3049, "mean_copy_accuracy": 0.9955979883670807, "mean_gen_accuracy": 0.8710785806179047, "mean_token_accuracy": 0.8987690657377243, "num_tokens": 179351690.0, "sample_num_tokens": 8565.5, "step": 4661, "total_num_tokens": 179385952.0, "z_loss": 0.0006374499062076211 }, { "copy_logits_max": -2.4722964763641357, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.75, "epoch": 0.9521572632116415, "gen_logits_max": 5.5639967918396, "gen_logits_mean": -13.669462203979492, "gen_logits_min": -24.92644691467285, "gen_logits_std": 2.6751160621643066, "gen_loss": 0.3041851222515106, "grad_norm": 0.44391994485845354, "learning_rate": 2.504084210526316e-05, "loss": 0.3231, "mean_copy_accuracy": 0.9952898472547531, "mean_gen_accuracy": 0.8650183826684952, "mean_token_accuracy": 0.8950608670711517, "num_tokens": 179636615.0, "sample_num_tokens": 8129.75, "step": 4662, "total_num_tokens": 179669134.0, "z_loss": 0.0005992279620841146 }, { "copy_logits_max": -2.967601776123047, "copy_logits_min": -687500032.0, "copy_num_tokens": 303.5625, "epoch": 0.9523615011488384, "gen_logits_max": 5.493488311767578, "gen_logits_mean": -15.45355224609375, "gen_logits_min": -26.899974822998047, "gen_logits_std": 2.7626404762268066, "gen_loss": 0.3013484477996826, "grad_norm": 0.39722281002452114, "learning_rate": 2.5039578947368425e-05, "loss": 0.3197, "mean_copy_accuracy": 0.9946925342082977, "mean_gen_accuracy": 0.8653252571821213, "mean_token_accuracy": 0.8930160403251648, "num_tokens": 179882227.0, "sample_num_tokens": 7178.25, "step": 4663, "total_num_tokens": 179910940.0, "z_loss": 0.0006586892995983362 }, { "copy_logits_max": -1.8293976783752441, "copy_logits_min": -750000064.0, "copy_num_tokens": 421.9375, "epoch": 0.9525657390860353, "gen_logits_max": 5.613065719604492, "gen_logits_mean": -14.384817123413086, "gen_logits_min": -25.68120574951172, "gen_logits_std": 2.7591333389282227, "gen_loss": 0.36524754762649536, "grad_norm": 0.43547392167603144, "learning_rate": 2.5038315789473685e-05, "loss": 0.3037, "mean_copy_accuracy": 0.994488924741745, "mean_gen_accuracy": 0.8682321012020111, "mean_token_accuracy": 0.8987766206264496, "num_tokens": 180156769.0, "sample_num_tokens": 9057.75, "step": 4664, "total_num_tokens": 180193000.0, "z_loss": 0.0007814749842509627 }, { "copy_logits_max": -2.154128313064575, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.1875, "epoch": 0.9527699770232321, "gen_logits_max": 4.811030387878418, "gen_logits_mean": -12.898021697998047, "gen_logits_min": -24.143413543701172, "gen_logits_std": 2.690335273742676, "gen_loss": 0.27376145124435425, "grad_norm": 0.3685065361497621, "learning_rate": 2.503705263157895e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9956609308719635, "mean_gen_accuracy": 0.8825706988573074, "mean_token_accuracy": 0.9134562760591507, "num_tokens": 180436606.0, "sample_num_tokens": 8410.5, "step": 4665, "total_num_tokens": 180470248.0, "z_loss": 0.0006073185941204429 }, { "copy_logits_max": -3.0868146419525146, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.8125, "epoch": 0.9529742149604289, "gen_logits_max": 4.090686798095703, "gen_logits_mean": -15.710034370422363, "gen_logits_min": -26.844608306884766, "gen_logits_std": 2.76509952545166, "gen_loss": 0.3056309223175049, "grad_norm": 0.4272930961483232, "learning_rate": 2.503578947368421e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9960378408432007, "mean_gen_accuracy": 0.8658810555934906, "mean_token_accuracy": 0.8975623995065689, "num_tokens": 180681991.0, "sample_num_tokens": 7055.25, "step": 4666, "total_num_tokens": 180710212.0, "z_loss": 0.0006614433950744569 }, { "copy_logits_max": -1.9851683378219604, "copy_logits_min": -687500032.0, "copy_num_tokens": 649.5, "epoch": 0.9531784528976257, "gen_logits_max": 3.7540411949157715, "gen_logits_mean": -15.743121147155762, "gen_logits_min": -27.4118595123291, "gen_logits_std": 2.774441957473755, "gen_loss": 0.23952311277389526, "grad_norm": 0.4045323134466426, "learning_rate": 2.5034526315789475e-05, "loss": 0.2734, "mean_copy_accuracy": 0.996669352054596, "mean_gen_accuracy": 0.8759221732616425, "mean_token_accuracy": 0.9090785980224609, "num_tokens": 180945472.0, "sample_num_tokens": 9559.0, "step": 4667, "total_num_tokens": 180983708.0, "z_loss": 0.000544710666872561 }, { "copy_logits_max": -2.414912462234497, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.9375, "epoch": 0.9533826908348225, "gen_logits_max": 4.688270568847656, "gen_logits_mean": -15.731050491333008, "gen_logits_min": -27.24553871154785, "gen_logits_std": 2.80250883102417, "gen_loss": 0.28778862953186035, "grad_norm": 0.48358593867456895, "learning_rate": 2.5033263157894736e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9934923201799393, "mean_gen_accuracy": 0.8725784122943878, "mean_token_accuracy": 0.8966901004314423, "num_tokens": 181204618.0, "sample_num_tokens": 8271.0, "step": 4668, "total_num_tokens": 181237702.0, "z_loss": 0.0005477347876876593 }, { "copy_logits_max": 1.2401044368743896, "copy_logits_min": -750000128.0, "copy_num_tokens": 716.375, "epoch": 0.9535869287720194, "gen_logits_max": 4.091641426086426, "gen_logits_mean": -14.271108627319336, "gen_logits_min": -26.511402130126953, "gen_logits_std": 2.862920045852661, "gen_loss": 0.30980589985847473, "grad_norm": 0.4056453111760708, "learning_rate": 2.5032e-05, "loss": 0.317, "mean_copy_accuracy": 0.9951147139072418, "mean_gen_accuracy": 0.8589763939380646, "mean_token_accuracy": 0.8955823332071304, "num_tokens": 181482465.0, "sample_num_tokens": 10165.75, "step": 4669, "total_num_tokens": 181523128.0, "z_loss": 0.0005496591329574585 }, { "copy_logits_max": -2.912548303604126, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.875, "epoch": 0.9537911667092163, "gen_logits_max": 6.192840099334717, "gen_logits_mean": -14.039114952087402, "gen_logits_min": -25.866268157958984, "gen_logits_std": 2.7830028533935547, "gen_loss": 0.299280047416687, "grad_norm": 0.40348945680023396, "learning_rate": 2.5030736842105265e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9960847496986389, "mean_gen_accuracy": 0.8726540952920914, "mean_token_accuracy": 0.9023803919553757, "num_tokens": 181749761.0, "sample_num_tokens": 8569.25, "step": 4670, "total_num_tokens": 181784038.0, "z_loss": 0.0005119270645081997 }, { "copy_logits_max": -3.6619811058044434, "copy_logits_min": -625000064.0, "copy_num_tokens": 362.4375, "epoch": 0.9539954046464131, "gen_logits_max": 4.6674113273620605, "gen_logits_mean": -15.879447937011719, "gen_logits_min": -27.3485050201416, "gen_logits_std": 2.848446846008301, "gen_loss": 0.31689518690109253, "grad_norm": 0.40033046909136427, "learning_rate": 2.502947368421053e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9957103133201599, "mean_gen_accuracy": 0.8692436963319778, "mean_token_accuracy": 0.8982859551906586, "num_tokens": 182000665.0, "sample_num_tokens": 8517.25, "step": 4671, "total_num_tokens": 182034734.0, "z_loss": 0.0005798715865239501 }, { "copy_logits_max": -0.773585319519043, "copy_logits_min": -750000000.0, "copy_num_tokens": 271.5625, "epoch": 0.9541996425836099, "gen_logits_max": 6.465417861938477, "gen_logits_mean": -12.016969680786133, "gen_logits_min": -23.837465286254883, "gen_logits_std": 2.734778881072998, "gen_loss": 0.33647629618644714, "grad_norm": 0.41823541451531876, "learning_rate": 2.502821052631579e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9952827841043472, "mean_gen_accuracy": 0.8577706813812256, "mean_token_accuracy": 0.890487790107727, "num_tokens": 182270744.0, "sample_num_tokens": 6366.5, "step": 4672, "total_num_tokens": 182296210.0, "z_loss": 0.0006064252811484039 }, { "copy_logits_max": -0.4286450445652008, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.625, "epoch": 0.9544038805208067, "gen_logits_max": 4.173907279968262, "gen_logits_mean": -14.692699432373047, "gen_logits_min": -26.71099090576172, "gen_logits_std": 2.8505852222442627, "gen_loss": 0.3565317392349243, "grad_norm": 0.41876650941909976, "learning_rate": 2.5026947368421054e-05, "loss": 0.3242, "mean_copy_accuracy": 0.9956795275211334, "mean_gen_accuracy": 0.8651021420955658, "mean_token_accuracy": 0.8967314958572388, "num_tokens": 182529446.0, "sample_num_tokens": 8461.0, "step": 4673, "total_num_tokens": 182563290.0, "z_loss": 0.0006536372238770127 }, { "copy_logits_max": -0.45005494356155396, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.375, "epoch": 0.9546081184580035, "gen_logits_max": 6.298957824707031, "gen_logits_mean": -13.577775001525879, "gen_logits_min": -25.600149154663086, "gen_logits_std": 2.8591229915618896, "gen_loss": 0.2843976318836212, "grad_norm": 0.3834674231599924, "learning_rate": 2.5025684210526315e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9965221285820007, "mean_gen_accuracy": 0.8741553723812103, "mean_token_accuracy": 0.9040001183748245, "num_tokens": 182827469.0, "sample_num_tokens": 9802.25, "step": 4674, "total_num_tokens": 182866678.0, "z_loss": 0.0005951644852757454 }, { "copy_logits_max": -1.9570155143737793, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.875, "epoch": 0.9548123563952005, "gen_logits_max": 4.572310447692871, "gen_logits_mean": -15.250412940979004, "gen_logits_min": -27.180028915405273, "gen_logits_std": 2.8786354064941406, "gen_loss": 0.28630441427230835, "grad_norm": 0.3902232922925029, "learning_rate": 2.502442105263158e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9955040961503983, "mean_gen_accuracy": 0.8663551658391953, "mean_token_accuracy": 0.8991393893957138, "num_tokens": 183099639.0, "sample_num_tokens": 8369.25, "step": 4675, "total_num_tokens": 183133116.0, "z_loss": 0.0005855396739207208 }, { "copy_logits_max": -3.0702242851257324, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.125, "epoch": 0.9550165943323973, "gen_logits_max": 4.883983612060547, "gen_logits_mean": -14.872735977172852, "gen_logits_min": -27.174808502197266, "gen_logits_std": 2.8687028884887695, "gen_loss": 0.3133367896080017, "grad_norm": 0.35987146657795527, "learning_rate": 2.502315789473684e-05, "loss": 0.27, "mean_copy_accuracy": 0.9961592555046082, "mean_gen_accuracy": 0.8780318647623062, "mean_token_accuracy": 0.9102166146039963, "num_tokens": 183397589.0, "sample_num_tokens": 9663.25, "step": 4676, "total_num_tokens": 183436242.0, "z_loss": 0.0006270181620493531 }, { "copy_logits_max": -0.6132091283798218, "copy_logits_min": -750000064.0, "copy_num_tokens": 622.0625, "epoch": 0.9552208322695941, "gen_logits_max": 4.251889228820801, "gen_logits_mean": -14.961060523986816, "gen_logits_min": -26.801279067993164, "gen_logits_std": 2.878098487854004, "gen_loss": 0.2591821551322937, "grad_norm": 0.3692161159089525, "learning_rate": 2.5021894736842105e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9959938824176788, "mean_gen_accuracy": 0.8684588223695755, "mean_token_accuracy": 0.9031390249729156, "num_tokens": 183680255.0, "sample_num_tokens": 8924.25, "step": 4677, "total_num_tokens": 183715952.0, "z_loss": 0.000515329244080931 }, { "copy_logits_max": -2.073695659637451, "copy_logits_min": -687500032.0, "copy_num_tokens": 614.375, "epoch": 0.9554250702067909, "gen_logits_max": 3.8253607749938965, "gen_logits_mean": -14.826425552368164, "gen_logits_min": -27.244054794311523, "gen_logits_std": 2.892429828643799, "gen_loss": 0.26849430799484253, "grad_norm": 0.4255497450114115, "learning_rate": 2.502063157894737e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9962931126356125, "mean_gen_accuracy": 0.865108460187912, "mean_token_accuracy": 0.8998118340969086, "num_tokens": 183949399.0, "sample_num_tokens": 9131.75, "step": 4678, "total_num_tokens": 183985926.0, "z_loss": 0.0005137635162100196 }, { "copy_logits_max": 1.3930513858795166, "copy_logits_min": -687500032.0, "copy_num_tokens": 627.25, "epoch": 0.9556293081439877, "gen_logits_max": 4.262519359588623, "gen_logits_mean": -14.936782836914062, "gen_logits_min": -27.01970863342285, "gen_logits_std": 2.876142978668213, "gen_loss": 0.304573118686676, "grad_norm": 0.3905246053647231, "learning_rate": 2.5019368421052633e-05, "loss": 0.2952, "mean_copy_accuracy": 0.9961010366678238, "mean_gen_accuracy": 0.8604005128145218, "mean_token_accuracy": 0.90109783411026, "num_tokens": 184229920.0, "sample_num_tokens": 9437.5, "step": 4679, "total_num_tokens": 184267670.0, "z_loss": 0.0006010602810420096 }, { "copy_logits_max": 0.4610781967639923, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.0625, "epoch": 0.9558335460811845, "gen_logits_max": 6.037226676940918, "gen_logits_mean": -13.455057144165039, "gen_logits_min": -25.40225601196289, "gen_logits_std": 2.8483123779296875, "gen_loss": 0.3312273621559143, "grad_norm": 0.3905457489087918, "learning_rate": 2.5018105263157897e-05, "loss": 0.3125, "mean_copy_accuracy": 0.9949857890605927, "mean_gen_accuracy": 0.8676765263080597, "mean_token_accuracy": 0.8963996917009354, "num_tokens": 184509575.0, "sample_num_tokens": 7782.75, "step": 4680, "total_num_tokens": 184540706.0, "z_loss": 0.0006368294125422835 }, { "copy_logits_max": -4.351329803466797, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.3125, "epoch": 0.9560377840183815, "gen_logits_max": 5.2174811363220215, "gen_logits_mean": -15.209159851074219, "gen_logits_min": -26.563465118408203, "gen_logits_std": 2.842729330062866, "gen_loss": 0.30978500843048096, "grad_norm": 0.38416929082733137, "learning_rate": 2.501684210526316e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9962349832057953, "mean_gen_accuracy": 0.8683218508958817, "mean_token_accuracy": 0.8984700739383698, "num_tokens": 184777247.0, "sample_num_tokens": 7099.25, "step": 4681, "total_num_tokens": 184805644.0, "z_loss": 0.0005934538785368204 }, { "copy_logits_max": -1.6560970544815063, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.375, "epoch": 0.9562420219555783, "gen_logits_max": 6.433259010314941, "gen_logits_mean": -13.193085670471191, "gen_logits_min": -24.627246856689453, "gen_logits_std": 2.8438045978546143, "gen_loss": 0.30875176191329956, "grad_norm": 0.40012495897472555, "learning_rate": 2.5015578947368423e-05, "loss": 0.3163, "mean_copy_accuracy": 0.9939897209405899, "mean_gen_accuracy": 0.8652346432209015, "mean_token_accuracy": 0.894409254193306, "num_tokens": 185030885.0, "sample_num_tokens": 7530.25, "step": 4682, "total_num_tokens": 185061006.0, "z_loss": 0.0005988439079374075 }, { "copy_logits_max": -0.5515520572662354, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.5, "epoch": 0.9564462598927751, "gen_logits_max": 5.037908554077148, "gen_logits_mean": -14.654802322387695, "gen_logits_min": -26.875953674316406, "gen_logits_std": 2.8729214668273926, "gen_loss": 0.26993483304977417, "grad_norm": 0.38478792783537724, "learning_rate": 2.5014315789473684e-05, "loss": 0.296, "mean_copy_accuracy": 0.9948943853378296, "mean_gen_accuracy": 0.8722406625747681, "mean_token_accuracy": 0.9001953601837158, "num_tokens": 185284962.0, "sample_num_tokens": 8088.5, "step": 4683, "total_num_tokens": 185317316.0, "z_loss": 0.0005362965166568756 }, { "copy_logits_max": 1.6473891735076904, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.5625, "epoch": 0.9566504978299719, "gen_logits_max": 4.9034271240234375, "gen_logits_mean": -14.151731491088867, "gen_logits_min": -26.203615188598633, "gen_logits_std": 2.8983616828918457, "gen_loss": 0.26210659742355347, "grad_norm": 0.39306895539016445, "learning_rate": 2.5013052631578948e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9965431690216064, "mean_gen_accuracy": 0.8688492476940155, "mean_token_accuracy": 0.9008964598178864, "num_tokens": 185563580.0, "sample_num_tokens": 7766.5, "step": 4684, "total_num_tokens": 185594646.0, "z_loss": 0.0005079591646790504 }, { "copy_logits_max": -2.0109994411468506, "copy_logits_min": -750000000.0, "copy_num_tokens": 237.0, "epoch": 0.9568547357671687, "gen_logits_max": 5.28318977355957, "gen_logits_mean": -15.10373306274414, "gen_logits_min": -26.523433685302734, "gen_logits_std": 2.839688777923584, "gen_loss": 0.30942630767822266, "grad_norm": 0.4096161685731262, "learning_rate": 2.501178947368421e-05, "loss": 0.3206, "mean_copy_accuracy": 0.9947681725025177, "mean_gen_accuracy": 0.8666832596063614, "mean_token_accuracy": 0.8929407596588135, "num_tokens": 185817593.0, "sample_num_tokens": 6760.25, "step": 4685, "total_num_tokens": 185844634.0, "z_loss": 0.0005723175127059221 }, { "copy_logits_max": -1.0984820127487183, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.1875, "epoch": 0.9570589737043655, "gen_logits_max": 5.552463054656982, "gen_logits_mean": -13.874068260192871, "gen_logits_min": -25.679838180541992, "gen_logits_std": 2.8761165142059326, "gen_loss": 0.2979747951030731, "grad_norm": 0.35150621873308335, "learning_rate": 2.5010526315789473e-05, "loss": 0.3161, "mean_copy_accuracy": 0.9952916353940964, "mean_gen_accuracy": 0.8633005768060684, "mean_token_accuracy": 0.8944727331399918, "num_tokens": 186099614.0, "sample_num_tokens": 8696.5, "step": 4686, "total_num_tokens": 186134400.0, "z_loss": 0.0005546495667658746 }, { "copy_logits_max": -2.0637717247009277, "copy_logits_min": -750000000.0, "copy_num_tokens": 257.125, "epoch": 0.9572632116415625, "gen_logits_max": 6.084857940673828, "gen_logits_mean": -13.891630172729492, "gen_logits_min": -25.8720645904541, "gen_logits_std": 2.8619070053100586, "gen_loss": 0.36355459690093994, "grad_norm": 0.3757331605857804, "learning_rate": 2.5009263157894737e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9949886053800583, "mean_gen_accuracy": 0.8755072355270386, "mean_token_accuracy": 0.9022154808044434, "num_tokens": 186364302.0, "sample_num_tokens": 7440.5, "step": 4687, "total_num_tokens": 186394064.0, "z_loss": 0.0006203419179655612 }, { "copy_logits_max": -0.8383666276931763, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.25, "epoch": 0.9574674495787593, "gen_logits_max": 4.768686771392822, "gen_logits_mean": -15.827396392822266, "gen_logits_min": -27.2910099029541, "gen_logits_std": 2.875359535217285, "gen_loss": 0.31091636419296265, "grad_norm": 0.7917855879699677, "learning_rate": 2.5008000000000002e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9949999004602432, "mean_gen_accuracy": 0.8648502379655838, "mean_token_accuracy": 0.8947400748729706, "num_tokens": 186621557.0, "sample_num_tokens": 7344.25, "step": 4688, "total_num_tokens": 186650934.0, "z_loss": 0.000582541455514729 }, { "copy_logits_max": 2.0373730659484863, "copy_logits_min": -625000064.0, "copy_num_tokens": 331.875, "epoch": 0.9576716875159561, "gen_logits_max": 5.617610454559326, "gen_logits_mean": -14.029354095458984, "gen_logits_min": -26.06390380859375, "gen_logits_std": 2.88346004486084, "gen_loss": 0.34106937050819397, "grad_norm": 0.41876436638913883, "learning_rate": 2.5006736842105263e-05, "loss": 0.3137, "mean_copy_accuracy": 0.9941495358943939, "mean_gen_accuracy": 0.8669008910655975, "mean_token_accuracy": 0.8967911601066589, "num_tokens": 186879039.0, "sample_num_tokens": 7063.25, "step": 4689, "total_num_tokens": 186907292.0, "z_loss": 0.0006530359387397766 }, { "copy_logits_max": -1.6323161125183105, "copy_logits_min": -750000064.0, "copy_num_tokens": 306.9375, "epoch": 0.9578759254531529, "gen_logits_max": 5.820743560791016, "gen_logits_mean": -14.979722023010254, "gen_logits_min": -26.646968841552734, "gen_logits_std": 2.8666648864746094, "gen_loss": 0.3327157497406006, "grad_norm": 0.3779355212314218, "learning_rate": 2.5005473684210527e-05, "loss": 0.3092, "mean_copy_accuracy": 0.9950181245803833, "mean_gen_accuracy": 0.8676291853189468, "mean_token_accuracy": 0.8991474360227585, "num_tokens": 187140958.0, "sample_num_tokens": 7231.0, "step": 4690, "total_num_tokens": 187169882.0, "z_loss": 0.0006007981719449162 }, { "copy_logits_max": 1.387326955795288, "copy_logits_min": -750000064.0, "copy_num_tokens": 607.9375, "epoch": 0.9580801633903497, "gen_logits_max": 5.984526634216309, "gen_logits_mean": -12.9260892868042, "gen_logits_min": -25.80437469482422, "gen_logits_std": 2.8960533142089844, "gen_loss": 0.28938576579093933, "grad_norm": 0.38113646902451165, "learning_rate": 2.500421052631579e-05, "loss": 0.3053, "mean_copy_accuracy": 0.996440514922142, "mean_gen_accuracy": 0.8620319962501526, "mean_token_accuracy": 0.8988390266895294, "num_tokens": 187424555.0, "sample_num_tokens": 9241.25, "step": 4691, "total_num_tokens": 187461520.0, "z_loss": 0.0005790300201624632 }, { "copy_logits_max": 2.1082539558410645, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.0625, "epoch": 0.9582844013275466, "gen_logits_max": 5.958122253417969, "gen_logits_mean": -13.101444244384766, "gen_logits_min": -25.01165199279785, "gen_logits_std": 2.8859455585479736, "gen_loss": 0.3313864469528198, "grad_norm": 0.38634374864844434, "learning_rate": 2.5002947368421052e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9958580732345581, "mean_gen_accuracy": 0.8670037686824799, "mean_token_accuracy": 0.8980227708816528, "num_tokens": 187694325.0, "sample_num_tokens": 9170.75, "step": 4692, "total_num_tokens": 187731008.0, "z_loss": 0.0006774269277229905 }, { "copy_logits_max": -1.0666007995605469, "copy_logits_min": -687500032.0, "copy_num_tokens": 555.3125, "epoch": 0.9584886392647435, "gen_logits_max": 5.9393463134765625, "gen_logits_mean": -13.133298873901367, "gen_logits_min": -24.941381454467773, "gen_logits_std": 2.8829355239868164, "gen_loss": 0.3287338316440582, "grad_norm": 0.4008565800455702, "learning_rate": 2.5001684210526317e-05, "loss": 0.3242, "mean_copy_accuracy": 0.995535671710968, "mean_gen_accuracy": 0.8624944090843201, "mean_token_accuracy": 0.8951883465051651, "num_tokens": 187958570.0, "sample_num_tokens": 9417.5, "step": 4693, "total_num_tokens": 187996240.0, "z_loss": 0.0006979882018640637 }, { "copy_logits_max": -3.24202299118042, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.5, "epoch": 0.9586928772019403, "gen_logits_max": 6.51550817489624, "gen_logits_mean": -13.334554672241211, "gen_logits_min": -25.188976287841797, "gen_logits_std": 2.892671585083008, "gen_loss": 0.32665130496025085, "grad_norm": 0.40231042184327326, "learning_rate": 2.5000421052631578e-05, "loss": 0.3251, "mean_copy_accuracy": 0.9951312392950058, "mean_gen_accuracy": 0.8647114485502243, "mean_token_accuracy": 0.8939960449934006, "num_tokens": 188206308.0, "sample_num_tokens": 8465.5, "step": 4694, "total_num_tokens": 188240170.0, "z_loss": 0.0006306918803602457 }, { "copy_logits_max": -1.1925575733184814, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.5, "epoch": 0.9588971151391371, "gen_logits_max": 5.134145736694336, "gen_logits_mean": -14.149286270141602, "gen_logits_min": -26.55807876586914, "gen_logits_std": 2.897806167602539, "gen_loss": 0.27159225940704346, "grad_norm": 0.38814880422251347, "learning_rate": 2.4999157894736845e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9949570000171661, "mean_gen_accuracy": 0.8705054968595505, "mean_token_accuracy": 0.9011573642492294, "num_tokens": 188465158.0, "sample_num_tokens": 7941.0, "step": 4695, "total_num_tokens": 188496922.0, "z_loss": 0.0005199642619118094 }, { "copy_logits_max": -1.9671448469161987, "copy_logits_min": -750000000.0, "copy_num_tokens": 638.5, "epoch": 0.9591013530763339, "gen_logits_max": 6.008542060852051, "gen_logits_mean": -12.524713516235352, "gen_logits_min": -24.886056900024414, "gen_logits_std": 2.8982861042022705, "gen_loss": 0.3152962625026703, "grad_norm": 0.3821021259533448, "learning_rate": 2.4997894736842106e-05, "loss": 0.3099, "mean_copy_accuracy": 0.9963110536336899, "mean_gen_accuracy": 0.8628890216350555, "mean_token_accuracy": 0.8969735950231552, "num_tokens": 188733518.0, "sample_num_tokens": 9938.5, "step": 4696, "total_num_tokens": 188773272.0, "z_loss": 0.0006130652036517859 }, { "copy_logits_max": -6.922517776489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.375, "epoch": 0.9593055910135307, "gen_logits_max": 6.094533443450928, "gen_logits_mean": -14.493402481079102, "gen_logits_min": -26.399818420410156, "gen_logits_std": 2.8926138877868652, "gen_loss": 0.282448410987854, "grad_norm": 0.3956529915955014, "learning_rate": 2.499663157894737e-05, "loss": 0.3117, "mean_copy_accuracy": 0.9957282990217209, "mean_gen_accuracy": 0.8638955354690552, "mean_token_accuracy": 0.8966081738471985, "num_tokens": 189005616.0, "sample_num_tokens": 7699.0, "step": 4697, "total_num_tokens": 189036412.0, "z_loss": 0.0005425645504146814 }, { "copy_logits_max": -3.223785638809204, "copy_logits_min": -750000128.0, "copy_num_tokens": 672.0625, "epoch": 0.9595098289507276, "gen_logits_max": 4.203218460083008, "gen_logits_mean": -13.645453453063965, "gen_logits_min": -25.566661834716797, "gen_logits_std": 2.8958864212036133, "gen_loss": 0.2579891085624695, "grad_norm": 0.3656869521113962, "learning_rate": 2.499536842105263e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9970236569643021, "mean_gen_accuracy": 0.8650811910629272, "mean_token_accuracy": 0.9035720229148865, "num_tokens": 189294881.0, "sample_num_tokens": 9035.75, "step": 4698, "total_num_tokens": 189331024.0, "z_loss": 0.0005531255155801773 }, { "copy_logits_max": -5.153047561645508, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.0625, "epoch": 0.9597140668879244, "gen_logits_max": 4.878047466278076, "gen_logits_mean": -14.921595573425293, "gen_logits_min": -27.29994773864746, "gen_logits_std": 2.919426679611206, "gen_loss": 0.307363361120224, "grad_norm": 0.37324544062660314, "learning_rate": 2.4994105263157896e-05, "loss": 0.322, "mean_copy_accuracy": 0.9956113249063492, "mean_gen_accuracy": 0.8613872528076172, "mean_token_accuracy": 0.8924385905265808, "num_tokens": 189568351.0, "sample_num_tokens": 8340.75, "step": 4699, "total_num_tokens": 189601714.0, "z_loss": 0.0005490758921951056 }, { "copy_logits_max": -5.2390594482421875, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.8125, "epoch": 0.9599183048251213, "gen_logits_max": 6.012455940246582, "gen_logits_mean": -12.548653602600098, "gen_logits_min": -24.709444046020508, "gen_logits_std": 2.8742613792419434, "gen_loss": 0.3041464686393738, "grad_norm": 0.4226307845855479, "learning_rate": 2.4992842105263157e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9953686445951462, "mean_gen_accuracy": 0.8677988946437836, "mean_token_accuracy": 0.89805106818676, "num_tokens": 189824121.0, "sample_num_tokens": 8952.75, "step": 4700, "total_num_tokens": 189859932.0, "z_loss": 0.0005793196614831686 }, { "copy_logits_max": -4.950613975524902, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.375, "epoch": 0.9601225427623181, "gen_logits_max": 4.7600908279418945, "gen_logits_mean": -14.303522109985352, "gen_logits_min": -26.540157318115234, "gen_logits_std": 2.927091121673584, "gen_loss": 0.24577292799949646, "grad_norm": 0.5739810475282814, "learning_rate": 2.499157894736842e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9961428046226501, "mean_gen_accuracy": 0.8792077898979187, "mean_token_accuracy": 0.9125809073448181, "num_tokens": 190102795.0, "sample_num_tokens": 8753.25, "step": 4701, "total_num_tokens": 190137808.0, "z_loss": 0.0004575166676659137 }, { "copy_logits_max": -4.015092849731445, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.375, "epoch": 0.9603267806995149, "gen_logits_max": 5.579164028167725, "gen_logits_mean": -13.025408744812012, "gen_logits_min": -25.541248321533203, "gen_logits_std": 2.8698205947875977, "gen_loss": 0.30636823177337646, "grad_norm": 0.4012158775693926, "learning_rate": 2.4990315789473682e-05, "loss": 0.3147, "mean_copy_accuracy": 0.9952265322208405, "mean_gen_accuracy": 0.8642773777246475, "mean_token_accuracy": 0.8959096968173981, "num_tokens": 190374937.0, "sample_num_tokens": 9793.75, "step": 4702, "total_num_tokens": 190414112.0, "z_loss": 0.0006185135571286082 }, { "copy_logits_max": -5.931742191314697, "copy_logits_min": -750000128.0, "copy_num_tokens": 360.0, "epoch": 0.9605310186367118, "gen_logits_max": 5.542006015777588, "gen_logits_mean": -14.173995971679688, "gen_logits_min": -25.850093841552734, "gen_logits_std": 2.7906107902526855, "gen_loss": 0.34641122817993164, "grad_norm": 0.3911374757500561, "learning_rate": 2.498905263157895e-05, "loss": 0.3159, "mean_copy_accuracy": 0.9954322427511215, "mean_gen_accuracy": 0.8640368580818176, "mean_token_accuracy": 0.8941151648759842, "num_tokens": 190648229.0, "sample_num_tokens": 8369.75, "step": 4703, "total_num_tokens": 190681708.0, "z_loss": 0.0005791600560769439 }, { "copy_logits_max": -4.132954120635986, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.5, "epoch": 0.9607352565739086, "gen_logits_max": 4.513846397399902, "gen_logits_mean": -13.976512908935547, "gen_logits_min": -26.31703758239746, "gen_logits_std": 2.907989501953125, "gen_loss": 0.29706674814224243, "grad_norm": 0.3825944683164926, "learning_rate": 2.4987789473684214e-05, "loss": 0.2977, "mean_copy_accuracy": 0.9956530034542084, "mean_gen_accuracy": 0.8645315915346146, "mean_token_accuracy": 0.9016334116458893, "num_tokens": 190924494.0, "sample_num_tokens": 8138.5, "step": 4704, "total_num_tokens": 190957048.0, "z_loss": 0.0005429306766018271 }, { "copy_logits_max": -3.201056718826294, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.1875, "epoch": 0.9609394945111054, "gen_logits_max": 4.625143051147461, "gen_logits_mean": -14.450483322143555, "gen_logits_min": -26.237760543823242, "gen_logits_std": 2.884188652038574, "gen_loss": 0.30254095792770386, "grad_norm": 0.3613497171419137, "learning_rate": 2.4986526315789475e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9951105862855911, "mean_gen_accuracy": 0.8704672008752823, "mean_token_accuracy": 0.9028629660606384, "num_tokens": 191199684.0, "sample_num_tokens": 8489.5, "step": 4705, "total_num_tokens": 191233642.0, "z_loss": 0.0005945002194494009 }, { "copy_logits_max": -4.9423675537109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.4375, "epoch": 0.9611437324483023, "gen_logits_max": 5.363034725189209, "gen_logits_mean": -14.546135902404785, "gen_logits_min": -26.15193748474121, "gen_logits_std": 2.8778679370880127, "gen_loss": 0.3129368722438812, "grad_norm": 0.36819446879054424, "learning_rate": 2.498526315789474e-05, "loss": 0.297, "mean_copy_accuracy": 0.9952747970819473, "mean_gen_accuracy": 0.8687730431556702, "mean_token_accuracy": 0.9014738649129868, "num_tokens": 191484076.0, "sample_num_tokens": 8319.0, "step": 4706, "total_num_tokens": 191517352.0, "z_loss": 0.0005784818204119802 }, { "copy_logits_max": -6.881219863891602, "copy_logits_min": -750000000.0, "copy_num_tokens": 284.5625, "epoch": 0.9613479703854991, "gen_logits_max": 6.097803115844727, "gen_logits_mean": -13.554383277893066, "gen_logits_min": -25.208209991455078, "gen_logits_std": 2.825561046600342, "gen_loss": 0.31780222058296204, "grad_norm": 0.39904293463967316, "learning_rate": 2.4984e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9950874149799347, "mean_gen_accuracy": 0.8653392493724823, "mean_token_accuracy": 0.8988249450922012, "num_tokens": 191749316.0, "sample_num_tokens": 7573.5, "step": 4707, "total_num_tokens": 191779610.0, "z_loss": 0.0006186994723975658 }, { "copy_logits_max": -3.76806640625, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.9375, "epoch": 0.961552208322696, "gen_logits_max": 4.793203353881836, "gen_logits_mean": -14.306598663330078, "gen_logits_min": -26.246715545654297, "gen_logits_std": 2.8697805404663086, "gen_loss": 0.2541765570640564, "grad_norm": 0.3513393265120681, "learning_rate": 2.4982736842105264e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9959670454263687, "mean_gen_accuracy": 0.8769159466028214, "mean_token_accuracy": 0.9065050184726715, "num_tokens": 192022785.0, "sample_num_tokens": 7282.25, "step": 4708, "total_num_tokens": 192051914.0, "z_loss": 0.0005170884542167187 }, { "copy_logits_max": -5.4253387451171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.0625, "epoch": 0.9617564462598928, "gen_logits_max": 5.210543632507324, "gen_logits_mean": -14.76371955871582, "gen_logits_min": -26.536479949951172, "gen_logits_std": 2.8735997676849365, "gen_loss": 0.3541746735572815, "grad_norm": 0.40326691956114985, "learning_rate": 2.4981473684210525e-05, "loss": 0.3313, "mean_copy_accuracy": 0.9952874630689621, "mean_gen_accuracy": 0.8620509952306747, "mean_token_accuracy": 0.8906926959753036, "num_tokens": 192274608.0, "sample_num_tokens": 7729.5, "step": 4709, "total_num_tokens": 192305526.0, "z_loss": 0.000643388251774013 }, { "copy_logits_max": -5.83104133605957, "copy_logits_min": -687500032.0, "copy_num_tokens": 529.0, "epoch": 0.9619606841970896, "gen_logits_max": 5.855177879333496, "gen_logits_mean": -12.883360862731934, "gen_logits_min": -24.506057739257812, "gen_logits_std": 2.8325729370117188, "gen_loss": 0.30711129307746887, "grad_norm": 0.37375060384336645, "learning_rate": 2.498021052631579e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9954959005117416, "mean_gen_accuracy": 0.8651927560567856, "mean_token_accuracy": 0.8989753276109695, "num_tokens": 192570784.0, "sample_num_tokens": 9749.0, "step": 4710, "total_num_tokens": 192609780.0, "z_loss": 0.000634526542853564 }, { "copy_logits_max": -2.948788642883301, "copy_logits_min": -562500032.0, "copy_num_tokens": 848.8125, "epoch": 0.9621649221342864, "gen_logits_max": 6.04541015625, "gen_logits_mean": -11.428352355957031, "gen_logits_min": -24.20343780517578, "gen_logits_std": 2.886831760406494, "gen_loss": 0.2355499565601349, "grad_norm": 0.35745933688938103, "learning_rate": 2.4978947368421054e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9967766553163528, "mean_gen_accuracy": 0.871135875582695, "mean_token_accuracy": 0.9095117747783661, "num_tokens": 192862317.0, "sample_num_tokens": 10064.25, "step": 4711, "total_num_tokens": 192902574.0, "z_loss": 0.0006041980814188719 }, { "copy_logits_max": -4.091969013214111, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5, "epoch": 0.9623691600714833, "gen_logits_max": 5.814512252807617, "gen_logits_mean": -13.229615211486816, "gen_logits_min": -24.993980407714844, "gen_logits_std": 2.874197244644165, "gen_loss": 0.3335126042366028, "grad_norm": 0.4146415576660699, "learning_rate": 2.4977684210526318e-05, "loss": 0.3411, "mean_copy_accuracy": 0.9960187673568726, "mean_gen_accuracy": 0.8571313321590424, "mean_token_accuracy": 0.8890501856803894, "num_tokens": 193133233.0, "sample_num_tokens": 8474.25, "step": 4712, "total_num_tokens": 193167130.0, "z_loss": 0.0007229965412989259 }, { "copy_logits_max": -3.5638530254364014, "copy_logits_min": -750000064.0, "copy_num_tokens": 411.625, "epoch": 0.9625733980086801, "gen_logits_max": 6.1762566566467285, "gen_logits_mean": -12.783945083618164, "gen_logits_min": -24.508790969848633, "gen_logits_std": 2.8321585655212402, "gen_loss": 0.3740220069885254, "grad_norm": 0.41388920928565637, "learning_rate": 2.497642105263158e-05, "loss": 0.3478, "mean_copy_accuracy": 0.9941582530736923, "mean_gen_accuracy": 0.8528374880552292, "mean_token_accuracy": 0.8830752968788147, "num_tokens": 193400170.0, "sample_num_tokens": 7846.0, "step": 4713, "total_num_tokens": 193431554.0, "z_loss": 0.0007876484887674451 }, { "copy_logits_max": -2.1836507320404053, "copy_logits_min": -750000128.0, "copy_num_tokens": 521.875, "epoch": 0.962777635945877, "gen_logits_max": 5.186136245727539, "gen_logits_mean": -13.977193832397461, "gen_logits_min": -26.01498794555664, "gen_logits_std": 2.866366386413574, "gen_loss": 0.3093835115432739, "grad_norm": 0.425745540105658, "learning_rate": 2.4975157894736843e-05, "loss": 0.317, "mean_copy_accuracy": 0.9948844164609909, "mean_gen_accuracy": 0.8637480586767197, "mean_token_accuracy": 0.8959321528673172, "num_tokens": 193667697.0, "sample_num_tokens": 8574.75, "step": 4714, "total_num_tokens": 193701996.0, "z_loss": 0.0006004370516166091 }, { "copy_logits_max": -1.6098341941833496, "copy_logits_min": -750000000.0, "copy_num_tokens": 880.25, "epoch": 0.9629818738830738, "gen_logits_max": 5.750178337097168, "gen_logits_mean": -12.448318481445312, "gen_logits_min": -24.39212417602539, "gen_logits_std": 2.8421993255615234, "gen_loss": 0.3024882376194, "grad_norm": 0.34762901835619964, "learning_rate": 2.4973894736842104e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9959475249052048, "mean_gen_accuracy": 0.8724613785743713, "mean_token_accuracy": 0.9043389111757278, "num_tokens": 193961515.0, "sample_num_tokens": 11660.25, "step": 4715, "total_num_tokens": 194008156.0, "z_loss": 0.0006157133029773831 }, { "copy_logits_max": -1.724388599395752, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.375, "epoch": 0.9631861118202706, "gen_logits_max": 5.076329231262207, "gen_logits_mean": -13.14793872833252, "gen_logits_min": -25.76244354248047, "gen_logits_std": 2.874156951904297, "gen_loss": 0.28868356347084045, "grad_norm": 0.3832543722335298, "learning_rate": 2.497263157894737e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9961447417736053, "mean_gen_accuracy": 0.8642038404941559, "mean_token_accuracy": 0.8988151252269745, "num_tokens": 194250084.0, "sample_num_tokens": 8045.5, "step": 4716, "total_num_tokens": 194282266.0, "z_loss": 0.0005936547531746328 }, { "copy_logits_max": -1.6934149265289307, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.625, "epoch": 0.9633903497574674, "gen_logits_max": 4.9257707595825195, "gen_logits_mean": -14.775738716125488, "gen_logits_min": -26.62690544128418, "gen_logits_std": 2.8783953189849854, "gen_loss": 0.3121606111526489, "grad_norm": 0.38437722647621664, "learning_rate": 2.4971368421052633e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9957125931978226, "mean_gen_accuracy": 0.8756690472364426, "mean_token_accuracy": 0.9030663669109344, "num_tokens": 194514545.0, "sample_num_tokens": 8388.75, "step": 4717, "total_num_tokens": 194548100.0, "z_loss": 0.000656510703265667 }, { "copy_logits_max": -0.3825986087322235, "copy_logits_min": -750000064.0, "copy_num_tokens": 546.125, "epoch": 0.9635945876946643, "gen_logits_max": 5.7116498947143555, "gen_logits_mean": -12.718107223510742, "gen_logits_min": -24.87216567993164, "gen_logits_std": 2.8625783920288086, "gen_loss": 0.2777237296104431, "grad_norm": 0.39398938425604696, "learning_rate": 2.4970105263157894e-05, "loss": 0.3189, "mean_copy_accuracy": 0.9942473620176315, "mean_gen_accuracy": 0.8632392436265945, "mean_token_accuracy": 0.8948162347078323, "num_tokens": 194774516.0, "sample_num_tokens": 8222.5, "step": 4718, "total_num_tokens": 194807406.0, "z_loss": 0.0005716843297705054 }, { "copy_logits_max": -4.332520484924316, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.375, "epoch": 0.9637988256318611, "gen_logits_max": 5.422369003295898, "gen_logits_mean": -13.704614639282227, "gen_logits_min": -25.19904899597168, "gen_logits_std": 2.8637685775756836, "gen_loss": 0.2981182932853699, "grad_norm": 0.35076346250348833, "learning_rate": 2.496884210526316e-05, "loss": 0.3048, "mean_copy_accuracy": 0.9961441606283188, "mean_gen_accuracy": 0.8647630363702774, "mean_token_accuracy": 0.8993319422006607, "num_tokens": 195061275.0, "sample_num_tokens": 8536.75, "step": 4719, "total_num_tokens": 195095422.0, "z_loss": 0.0005681473412550986 }, { "copy_logits_max": -3.95790696144104, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.0625, "epoch": 0.964003063569058, "gen_logits_max": 7.031020164489746, "gen_logits_mean": -12.736726760864258, "gen_logits_min": -24.334030151367188, "gen_logits_std": 2.843141555786133, "gen_loss": 0.3064119815826416, "grad_norm": 0.37680107190030443, "learning_rate": 2.4967578947368422e-05, "loss": 0.3066, "mean_copy_accuracy": 0.9943647384643555, "mean_gen_accuracy": 0.8691433668136597, "mean_token_accuracy": 0.8978268057107925, "num_tokens": 195336385.0, "sample_num_tokens": 8875.25, "step": 4720, "total_num_tokens": 195371886.0, "z_loss": 0.0006123486673459411 }, { "copy_logits_max": -2.949728488922119, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.0, "epoch": 0.9642073015062548, "gen_logits_max": 6.508584976196289, "gen_logits_mean": -11.882379531860352, "gen_logits_min": -24.468168258666992, "gen_logits_std": 2.8986592292785645, "gen_loss": 0.23904290795326233, "grad_norm": 0.3917592617042775, "learning_rate": 2.4966315789473687e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9954961687326431, "mean_gen_accuracy": 0.8794332295656204, "mean_token_accuracy": 0.9078971445560455, "num_tokens": 195594854.0, "sample_num_tokens": 8951.5, "step": 4721, "total_num_tokens": 195630660.0, "z_loss": 0.0005400293739512563 }, { "copy_logits_max": -2.7305073738098145, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.375, "epoch": 0.9644115394434516, "gen_logits_max": 5.406763076782227, "gen_logits_mean": -14.283852577209473, "gen_logits_min": -26.145835876464844, "gen_logits_std": 2.8784914016723633, "gen_loss": 0.33450812101364136, "grad_norm": 0.4072281247699524, "learning_rate": 2.4965052631578948e-05, "loss": 0.3216, "mean_copy_accuracy": 0.9956767708063126, "mean_gen_accuracy": 0.8634208142757416, "mean_token_accuracy": 0.892093151807785, "num_tokens": 195847311.0, "sample_num_tokens": 8456.75, "step": 4722, "total_num_tokens": 195881138.0, "z_loss": 0.0006744408747181296 }, { "copy_logits_max": -5.097092628479004, "copy_logits_min": -750000064.0, "copy_num_tokens": 408.875, "epoch": 0.9646157773806484, "gen_logits_max": 5.336879730224609, "gen_logits_mean": -14.66187572479248, "gen_logits_min": -26.44579315185547, "gen_logits_std": 2.884733200073242, "gen_loss": 0.3023582696914673, "grad_norm": 0.39194861868504305, "learning_rate": 2.4963789473684212e-05, "loss": 0.3015, "mean_copy_accuracy": 0.995763972401619, "mean_gen_accuracy": 0.869954913854599, "mean_token_accuracy": 0.8983433693647385, "num_tokens": 196099365.0, "sample_num_tokens": 7478.75, "step": 4723, "total_num_tokens": 196129280.0, "z_loss": 0.0006052579265087843 }, { "copy_logits_max": -3.8680624961853027, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.5625, "epoch": 0.9648200153178453, "gen_logits_max": 5.7093634605407715, "gen_logits_mean": -13.317398071289062, "gen_logits_min": -24.94862937927246, "gen_logits_std": 2.83615779876709, "gen_loss": 0.28727006912231445, "grad_norm": 0.388687739558939, "learning_rate": 2.4962526315789473e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9955996721982956, "mean_gen_accuracy": 0.8745290338993073, "mean_token_accuracy": 0.9026189893484116, "num_tokens": 196367246.0, "sample_num_tokens": 8208.5, "step": 4724, "total_num_tokens": 196400080.0, "z_loss": 0.0006031148368492723 }, { "copy_logits_max": -4.009543418884277, "copy_logits_min": -750000000.0, "copy_num_tokens": 691.875, "epoch": 0.9650242532550422, "gen_logits_max": 3.783134937286377, "gen_logits_mean": -15.583640098571777, "gen_logits_min": -27.46373748779297, "gen_logits_std": 2.873650550842285, "gen_loss": 0.2786180377006531, "grad_norm": 0.4008687683738394, "learning_rate": 2.4961263157894737e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9961947947740555, "mean_gen_accuracy": 0.8688481003046036, "mean_token_accuracy": 0.9039051681756973, "num_tokens": 196654030.0, "sample_num_tokens": 10376.0, "step": 4725, "total_num_tokens": 196695534.0, "z_loss": 0.0005617913557216525 }, { "copy_logits_max": -2.2622265815734863, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.5, "epoch": 0.965228491192239, "gen_logits_max": 5.620329856872559, "gen_logits_mean": -13.15848159790039, "gen_logits_min": -25.787036895751953, "gen_logits_std": 2.8554506301879883, "gen_loss": 0.27467629313468933, "grad_norm": 0.3465367112423387, "learning_rate": 2.4959999999999998e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9962616115808487, "mean_gen_accuracy": 0.8729363530874252, "mean_token_accuracy": 0.9040955305099487, "num_tokens": 196965283.0, "sample_num_tokens": 10304.25, "step": 4726, "total_num_tokens": 197006500.0, "z_loss": 0.0006012226222082973 }, { "copy_logits_max": -2.3052492141723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.375, "epoch": 0.9654327291294358, "gen_logits_max": 5.799314498901367, "gen_logits_mean": -13.47824478149414, "gen_logits_min": -25.482128143310547, "gen_logits_std": 2.881348133087158, "gen_loss": 0.2689156234264374, "grad_norm": 0.3932399050357983, "learning_rate": 2.4958736842105263e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9947643876075745, "mean_gen_accuracy": 0.865748718380928, "mean_token_accuracy": 0.8953985571861267, "num_tokens": 197236715.0, "sample_num_tokens": 9008.75, "step": 4727, "total_num_tokens": 197272750.0, "z_loss": 0.0005864576669409871 }, { "copy_logits_max": -2.8642210960388184, "copy_logits_min": -687500032.0, "copy_num_tokens": 438.3125, "epoch": 0.9656369670666326, "gen_logits_max": 5.194441795349121, "gen_logits_mean": -14.29808235168457, "gen_logits_min": -26.1275577545166, "gen_logits_std": 2.8853540420532227, "gen_loss": 0.3276481032371521, "grad_norm": 0.40283326194133956, "learning_rate": 2.4957473684210527e-05, "loss": 0.3171, "mean_copy_accuracy": 0.9957662522792816, "mean_gen_accuracy": 0.8632527738809586, "mean_token_accuracy": 0.8933936953544617, "num_tokens": 197493771.0, "sample_num_tokens": 8300.75, "step": 4728, "total_num_tokens": 197526974.0, "z_loss": 0.0006337208906188607 }, { "copy_logits_max": -3.432004928588867, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.9375, "epoch": 0.9658412050038294, "gen_logits_max": 5.767130374908447, "gen_logits_mean": -13.558267593383789, "gen_logits_min": -24.952707290649414, "gen_logits_std": 2.8307058811187744, "gen_loss": 0.28128570318222046, "grad_norm": 0.3777515060811017, "learning_rate": 2.495621052631579e-05, "loss": 0.292, "mean_copy_accuracy": 0.9948607683181763, "mean_gen_accuracy": 0.870508998632431, "mean_token_accuracy": 0.9029674082994461, "num_tokens": 197759497.0, "sample_num_tokens": 9112.25, "step": 4729, "total_num_tokens": 197795946.0, "z_loss": 0.0005286649102345109 }, { "copy_logits_max": -3.3884220123291016, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.0, "epoch": 0.9660454429410263, "gen_logits_max": 4.670409202575684, "gen_logits_mean": -14.900042533874512, "gen_logits_min": -26.594406127929688, "gen_logits_std": 2.87628436088562, "gen_loss": 0.2801583409309387, "grad_norm": 0.37107327862905565, "learning_rate": 2.4954947368421052e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9955156743526459, "mean_gen_accuracy": 0.8756783753633499, "mean_token_accuracy": 0.9064074456691742, "num_tokens": 198038526.0, "sample_num_tokens": 7736.5, "step": 4730, "total_num_tokens": 198069472.0, "z_loss": 0.0005765848327428102 }, { "copy_logits_max": -2.457188129425049, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.0625, "epoch": 0.9662496808782232, "gen_logits_max": 4.744019508361816, "gen_logits_mean": -14.406131744384766, "gen_logits_min": -26.446800231933594, "gen_logits_std": 2.8871076107025146, "gen_loss": 0.32571643590927124, "grad_norm": 0.41469025959819966, "learning_rate": 2.4953684210526316e-05, "loss": 0.3097, "mean_copy_accuracy": 0.9961954355239868, "mean_gen_accuracy": 0.8644727170467377, "mean_token_accuracy": 0.8993786573410034, "num_tokens": 198313931.0, "sample_num_tokens": 9428.25, "step": 4731, "total_num_tokens": 198351644.0, "z_loss": 0.0006501756142824888 }, { "copy_logits_max": -3.793548345565796, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.875, "epoch": 0.96645391881542, "gen_logits_max": 4.207618713378906, "gen_logits_mean": -15.113821029663086, "gen_logits_min": -27.283660888671875, "gen_logits_std": 2.938084602355957, "gen_loss": 0.24910905957221985, "grad_norm": 0.400440336473343, "learning_rate": 2.495242105263158e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9965916723012924, "mean_gen_accuracy": 0.8660836815834045, "mean_token_accuracy": 0.8994155079126358, "num_tokens": 198581053.0, "sample_num_tokens": 8871.25, "step": 4732, "total_num_tokens": 198616538.0, "z_loss": 0.0005008485168218613 }, { "copy_logits_max": -4.5715532302856445, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.25, "epoch": 0.9666581567526168, "gen_logits_max": 4.867063045501709, "gen_logits_mean": -13.634974479675293, "gen_logits_min": -25.538597106933594, "gen_logits_std": 2.8605082035064697, "gen_loss": 0.34359532594680786, "grad_norm": 0.39342712690041637, "learning_rate": 2.495115789473684e-05, "loss": 0.3225, "mean_copy_accuracy": 0.9961493760347366, "mean_gen_accuracy": 0.8602652102708817, "mean_token_accuracy": 0.8928709775209427, "num_tokens": 198843140.0, "sample_num_tokens": 7084.5, "step": 4733, "total_num_tokens": 198871478.0, "z_loss": 0.0005886038998141885 }, { "copy_logits_max": -5.579100608825684, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0625, "epoch": 0.9668623946898136, "gen_logits_max": 4.98710823059082, "gen_logits_mean": -15.134528160095215, "gen_logits_min": -26.98798370361328, "gen_logits_std": 2.923508405685425, "gen_loss": 0.25273075699806213, "grad_norm": 0.48645158675426275, "learning_rate": 2.4949894736842106e-05, "loss": 0.294, "mean_copy_accuracy": 0.9932472407817841, "mean_gen_accuracy": 0.8696160912513733, "mean_token_accuracy": 0.9021938741207123, "num_tokens": 199108236.0, "sample_num_tokens": 8462.5, "step": 4734, "total_num_tokens": 199142086.0, "z_loss": 0.00044149791938252747 }, { "copy_logits_max": -4.139573097229004, "copy_logits_min": -687500032.0, "copy_num_tokens": 624.4375, "epoch": 0.9670666326270104, "gen_logits_max": 4.082679748535156, "gen_logits_mean": -15.805397033691406, "gen_logits_min": -27.641700744628906, "gen_logits_std": 2.9275946617126465, "gen_loss": 0.26132529973983765, "grad_norm": 0.402278400241359, "learning_rate": 2.4948631578947367e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9962673485279083, "mean_gen_accuracy": 0.8680362552404404, "mean_token_accuracy": 0.9049761444330215, "num_tokens": 199376004.0, "sample_num_tokens": 9111.5, "step": 4735, "total_num_tokens": 199412450.0, "z_loss": 0.0005203793989494443 }, { "copy_logits_max": -3.976252794265747, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.25, "epoch": 0.9672708705642074, "gen_logits_max": 5.414097785949707, "gen_logits_mean": -14.638772964477539, "gen_logits_min": -26.47995376586914, "gen_logits_std": 2.89607310295105, "gen_loss": 0.3236663341522217, "grad_norm": 0.41959055851717647, "learning_rate": 2.4947368421052635e-05, "loss": 0.3002, "mean_copy_accuracy": 0.9964578002691269, "mean_gen_accuracy": 0.8661917299032211, "mean_token_accuracy": 0.8990893065929413, "num_tokens": 199633098.0, "sample_num_tokens": 7607.0, "step": 4736, "total_num_tokens": 199663526.0, "z_loss": 0.0006573376595042646 }, { "copy_logits_max": -5.506704330444336, "copy_logits_min": -687500032.0, "copy_num_tokens": 316.8125, "epoch": 0.9674751085014042, "gen_logits_max": 5.453297138214111, "gen_logits_mean": -14.780172348022461, "gen_logits_min": -26.224031448364258, "gen_logits_std": 2.8477845191955566, "gen_loss": 0.34211286902427673, "grad_norm": 0.41080863038395454, "learning_rate": 2.4946105263157895e-05, "loss": 0.316, "mean_copy_accuracy": 0.9956478625535965, "mean_gen_accuracy": 0.8625200986862183, "mean_token_accuracy": 0.895322173833847, "num_tokens": 199921513.0, "sample_num_tokens": 8257.25, "step": 4737, "total_num_tokens": 199954542.0, "z_loss": 0.0006149436230771244 }, { "copy_logits_max": -5.0237932205200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.3125, "epoch": 0.967679346438601, "gen_logits_max": 5.343728065490723, "gen_logits_mean": -14.158639907836914, "gen_logits_min": -25.870086669921875, "gen_logits_std": 2.900026559829712, "gen_loss": 0.2734454870223999, "grad_norm": 0.39711991390919865, "learning_rate": 2.494484210526316e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9947264343500137, "mean_gen_accuracy": 0.8704724758863449, "mean_token_accuracy": 0.8981473296880722, "num_tokens": 200180863.0, "sample_num_tokens": 8265.75, "step": 4738, "total_num_tokens": 200213926.0, "z_loss": 0.000537589774467051 }, { "copy_logits_max": -3.803835868835449, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.5, "epoch": 0.9678835843757978, "gen_logits_max": 5.109099388122559, "gen_logits_mean": -13.267690658569336, "gen_logits_min": -25.40606689453125, "gen_logits_std": 2.889312267303467, "gen_loss": 0.27152717113494873, "grad_norm": 0.37079997070550214, "learning_rate": 2.494357894736842e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9954176247119904, "mean_gen_accuracy": 0.8719872236251831, "mean_token_accuracy": 0.9024854004383087, "num_tokens": 200459923.0, "sample_num_tokens": 9563.25, "step": 4739, "total_num_tokens": 200498176.0, "z_loss": 0.0005545829772017896 }, { "copy_logits_max": -2.8733301162719727, "copy_logits_min": -750000128.0, "copy_num_tokens": 623.75, "epoch": 0.9680878223129946, "gen_logits_max": 4.3957085609436035, "gen_logits_mean": -14.95469856262207, "gen_logits_min": -26.836078643798828, "gen_logits_std": 2.89323091506958, "gen_loss": 0.30470743775367737, "grad_norm": 0.38063733733903926, "learning_rate": 2.4942315789473685e-05, "loss": 0.3148, "mean_copy_accuracy": 0.9959207326173782, "mean_gen_accuracy": 0.859071895480156, "mean_token_accuracy": 0.8948574960231781, "num_tokens": 200733262.0, "sample_num_tokens": 10044.0, "step": 4740, "total_num_tokens": 200773438.0, "z_loss": 0.000674711016472429 }, { "copy_logits_max": -4.6536664962768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.375, "epoch": 0.9682920602501914, "gen_logits_max": 6.241450309753418, "gen_logits_mean": -13.569575309753418, "gen_logits_min": -25.823646545410156, "gen_logits_std": 2.8826353549957275, "gen_loss": 0.3353249132633209, "grad_norm": 0.42871141524285133, "learning_rate": 2.4941052631578946e-05, "loss": 0.306, "mean_copy_accuracy": 0.9950002431869507, "mean_gen_accuracy": 0.8685777187347412, "mean_token_accuracy": 0.8989487439393997, "num_tokens": 201035329.0, "sample_num_tokens": 8438.75, "step": 4741, "total_num_tokens": 201069084.0, "z_loss": 0.0006426239851862192 }, { "copy_logits_max": -2.579629898071289, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.1875, "epoch": 0.9684962981873884, "gen_logits_max": 5.144526481628418, "gen_logits_mean": -13.998193740844727, "gen_logits_min": -26.395401000976562, "gen_logits_std": 2.921861171722412, "gen_loss": 0.249965637922287, "grad_norm": 0.4250337436662142, "learning_rate": 2.493978947368421e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9950333833694458, "mean_gen_accuracy": 0.863232895731926, "mean_token_accuracy": 0.9006993472576141, "num_tokens": 201294803.0, "sample_num_tokens": 8113.25, "step": 4742, "total_num_tokens": 201327256.0, "z_loss": 0.0005942803691141307 }, { "copy_logits_max": -1.676356315612793, "copy_logits_min": -750000064.0, "copy_num_tokens": 603.875, "epoch": 0.9687005361245852, "gen_logits_max": 6.088540077209473, "gen_logits_mean": -12.352374076843262, "gen_logits_min": -24.365150451660156, "gen_logits_std": 2.8736929893493652, "gen_loss": 0.2790036201477051, "grad_norm": 0.3940654582929992, "learning_rate": 2.493852631578947e-05, "loss": 0.3127, "mean_copy_accuracy": 0.9954167157411575, "mean_gen_accuracy": 0.8657157272100449, "mean_token_accuracy": 0.8969894051551819, "num_tokens": 201561756.0, "sample_num_tokens": 8669.0, "step": 4743, "total_num_tokens": 201596432.0, "z_loss": 0.0006052792305126786 }, { "copy_logits_max": -1.8126397132873535, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.9375, "epoch": 0.968904774061782, "gen_logits_max": 4.871115207672119, "gen_logits_mean": -14.119802474975586, "gen_logits_min": -26.102380752563477, "gen_logits_std": 2.895921230316162, "gen_loss": 0.2985324263572693, "grad_norm": 0.3761520330308067, "learning_rate": 2.493726315789474e-05, "loss": 0.3194, "mean_copy_accuracy": 0.9945568889379501, "mean_gen_accuracy": 0.8644304722547531, "mean_token_accuracy": 0.8935293108224869, "num_tokens": 201822932.0, "sample_num_tokens": 9167.5, "step": 4744, "total_num_tokens": 201859602.0, "z_loss": 0.0006235460168682039 }, { "copy_logits_max": -3.283514976501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.8125, "epoch": 0.9691090119989788, "gen_logits_max": 5.397904396057129, "gen_logits_mean": -14.896242141723633, "gen_logits_min": -26.774616241455078, "gen_logits_std": 2.913513660430908, "gen_loss": 0.24856114387512207, "grad_norm": 0.39874986958407094, "learning_rate": 2.4936000000000003e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9930418133735657, "mean_gen_accuracy": 0.8765790611505508, "mean_token_accuracy": 0.9028601944446564, "num_tokens": 202096147.0, "sample_num_tokens": 8637.25, "step": 4745, "total_num_tokens": 202130696.0, "z_loss": 0.0005177991697564721 }, { "copy_logits_max": -2.7883214950561523, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.25, "epoch": 0.9693132499361756, "gen_logits_max": 5.992514610290527, "gen_logits_mean": -13.186162948608398, "gen_logits_min": -26.014305114746094, "gen_logits_std": 2.8988614082336426, "gen_loss": 0.2983464002609253, "grad_norm": 0.3790094383803086, "learning_rate": 2.4934736842105264e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9949297606945038, "mean_gen_accuracy": 0.8721706867218018, "mean_token_accuracy": 0.9024266600608826, "num_tokens": 202357899.0, "sample_num_tokens": 8442.75, "step": 4746, "total_num_tokens": 202391670.0, "z_loss": 0.0005831402377225459 }, { "copy_logits_max": -2.044956684112549, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.8125, "epoch": 0.9695174878733724, "gen_logits_max": 5.177300453186035, "gen_logits_mean": -14.240631103515625, "gen_logits_min": -26.968158721923828, "gen_logits_std": 2.9028749465942383, "gen_loss": 0.29584139585494995, "grad_norm": 0.3488910518291748, "learning_rate": 2.493347368421053e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9954957067966461, "mean_gen_accuracy": 0.8768844306468964, "mean_token_accuracy": 0.9034593552350998, "num_tokens": 202631896.0, "sample_num_tokens": 8155.5, "step": 4747, "total_num_tokens": 202664518.0, "z_loss": 0.0005951393395662308 }, { "copy_logits_max": -1.0981783866882324, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.75, "epoch": 0.9697217258105694, "gen_logits_max": 4.8452839851379395, "gen_logits_mean": -13.468962669372559, "gen_logits_min": -25.769227981567383, "gen_logits_std": 2.923081159591675, "gen_loss": 0.2634890079498291, "grad_norm": 0.4492029694461162, "learning_rate": 2.493221052631579e-05, "loss": 0.2992, "mean_copy_accuracy": 0.993217721581459, "mean_gen_accuracy": 0.8668634593486786, "mean_token_accuracy": 0.9012844562530518, "num_tokens": 202905585.0, "sample_num_tokens": 8504.75, "step": 4748, "total_num_tokens": 202939604.0, "z_loss": 0.0006429217173717916 }, { "copy_logits_max": -2.752328395843506, "copy_logits_min": -750000064.0, "copy_num_tokens": 810.375, "epoch": 0.9699259637477662, "gen_logits_max": 4.481295585632324, "gen_logits_mean": -14.700268745422363, "gen_logits_min": -26.59713363647461, "gen_logits_std": 2.9040565490722656, "gen_loss": 0.27982670068740845, "grad_norm": 0.3930974075405362, "learning_rate": 2.4930947368421054e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9959175139665604, "mean_gen_accuracy": 0.8662122786045074, "mean_token_accuracy": 0.9006776064634323, "num_tokens": 203189894.0, "sample_num_tokens": 10702.5, "step": 4749, "total_num_tokens": 203232704.0, "z_loss": 0.0006723447004333138 }, { "copy_logits_max": -3.5111145973205566, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.625, "epoch": 0.970130201684963, "gen_logits_max": 3.6725587844848633, "gen_logits_mean": -16.924514770507812, "gen_logits_min": -28.52046775817871, "gen_logits_std": 2.9019646644592285, "gen_loss": 0.32542237639427185, "grad_norm": 0.420986987253209, "learning_rate": 2.4929684210526315e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9952178448438644, "mean_gen_accuracy": 0.8681436777114868, "mean_token_accuracy": 0.9013696908950806, "num_tokens": 203474232.0, "sample_num_tokens": 10667.5, "step": 4750, "total_num_tokens": 203516902.0, "z_loss": 0.0006356019875966012 }, { "copy_logits_max": -2.5905048847198486, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.25, "epoch": 0.9703344396221598, "gen_logits_max": 6.075333595275879, "gen_logits_mean": -12.825725555419922, "gen_logits_min": -26.666973114013672, "gen_logits_std": 2.8976681232452393, "gen_loss": 0.2807546854019165, "grad_norm": 0.41978376516621235, "learning_rate": 2.492842105263158e-05, "loss": 0.3095, "mean_copy_accuracy": 0.9936586171388626, "mean_gen_accuracy": 0.8689745664596558, "mean_token_accuracy": 0.8966036885976791, "num_tokens": 203758055.0, "sample_num_tokens": 8632.75, "step": 4751, "total_num_tokens": 203792586.0, "z_loss": 0.0005710238474421203 }, { "copy_logits_max": -3.3925108909606934, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.625, "epoch": 0.9705386775593566, "gen_logits_max": 5.234106063842773, "gen_logits_mean": -14.555464744567871, "gen_logits_min": -26.210613250732422, "gen_logits_std": 2.8801870346069336, "gen_loss": 0.2760549485683441, "grad_norm": 0.4135059604940087, "learning_rate": 2.4927157894736843e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9940007925033569, "mean_gen_accuracy": 0.8691001087427139, "mean_token_accuracy": 0.899836540222168, "num_tokens": 204014560.0, "sample_num_tokens": 7325.0, "step": 4752, "total_num_tokens": 204043860.0, "z_loss": 0.0005551446229219437 }, { "copy_logits_max": -2.271181583404541, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.9375, "epoch": 0.9707429154965534, "gen_logits_max": 4.731093883514404, "gen_logits_mean": -13.955582618713379, "gen_logits_min": -26.366634368896484, "gen_logits_std": 2.90126895904541, "gen_loss": 0.3324706554412842, "grad_norm": 0.3806164737390729, "learning_rate": 2.4925894736842107e-05, "loss": 0.305, "mean_copy_accuracy": 0.9967063218355179, "mean_gen_accuracy": 0.8600923269987106, "mean_token_accuracy": 0.9005581736564636, "num_tokens": 204295205.0, "sample_num_tokens": 9356.75, "step": 4753, "total_num_tokens": 204332632.0, "z_loss": 0.0006447940832003951 }, { "copy_logits_max": -3.0338046550750732, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.75, "epoch": 0.9709471534337503, "gen_logits_max": 4.062050819396973, "gen_logits_mean": -15.376812934875488, "gen_logits_min": -27.006454467773438, "gen_logits_std": 2.894040584564209, "gen_loss": 0.2979646921157837, "grad_norm": 0.6793379796331479, "learning_rate": 2.492463157894737e-05, "loss": 0.3199, "mean_copy_accuracy": 0.9950425624847412, "mean_gen_accuracy": 0.8654088526964188, "mean_token_accuracy": 0.8973245471715927, "num_tokens": 204569939.0, "sample_num_tokens": 8579.25, "step": 4754, "total_num_tokens": 204604256.0, "z_loss": 0.0005425303243100643 }, { "copy_logits_max": -3.982524871826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.125, "epoch": 0.9711513913709472, "gen_logits_max": 4.749641418457031, "gen_logits_mean": -15.393900871276855, "gen_logits_min": -26.651954650878906, "gen_logits_std": 2.88265061378479, "gen_loss": 0.3244885802268982, "grad_norm": 0.36867679622254346, "learning_rate": 2.4923368421052633e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9962291568517685, "mean_gen_accuracy": 0.868173137307167, "mean_token_accuracy": 0.9008395224809647, "num_tokens": 204850778.0, "sample_num_tokens": 8086.5, "step": 4755, "total_num_tokens": 204883124.0, "z_loss": 0.00057097093667835 }, { "copy_logits_max": -3.002169132232666, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.5625, "epoch": 0.971355629308144, "gen_logits_max": 4.9004998207092285, "gen_logits_mean": -13.910924911499023, "gen_logits_min": -25.835859298706055, "gen_logits_std": 2.880026340484619, "gen_loss": 0.25055450201034546, "grad_norm": 0.36892651440719615, "learning_rate": 2.4922105263157894e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9958511739969254, "mean_gen_accuracy": 0.8673532605171204, "mean_token_accuracy": 0.9007991850376129, "num_tokens": 205143177.0, "sample_num_tokens": 9735.25, "step": 4756, "total_num_tokens": 205182118.0, "z_loss": 0.00044342834735289216 }, { "copy_logits_max": -3.738621711730957, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.4375, "epoch": 0.9715598672453408, "gen_logits_max": 3.140961170196533, "gen_logits_mean": -16.205163955688477, "gen_logits_min": -28.24997901916504, "gen_logits_std": 2.9153270721435547, "gen_loss": 0.28634464740753174, "grad_norm": 0.4031954235517701, "learning_rate": 2.4920842105263158e-05, "loss": 0.3071, "mean_copy_accuracy": 0.9954215288162231, "mean_gen_accuracy": 0.8617185205221176, "mean_token_accuracy": 0.8981139063835144, "num_tokens": 205420498.0, "sample_num_tokens": 8387.0, "step": 4757, "total_num_tokens": 205454046.0, "z_loss": 0.0004846561932936311 }, { "copy_logits_max": -4.208586692810059, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.5, "epoch": 0.9717641051825376, "gen_logits_max": 4.623812198638916, "gen_logits_mean": -15.058393478393555, "gen_logits_min": -26.61404800415039, "gen_logits_std": 2.855581283569336, "gen_loss": 0.34522947669029236, "grad_norm": 0.40307258916226085, "learning_rate": 2.4919578947368422e-05, "loss": 0.3127, "mean_copy_accuracy": 0.9957311898469925, "mean_gen_accuracy": 0.8700524270534515, "mean_token_accuracy": 0.8972734063863754, "num_tokens": 205678098.0, "sample_num_tokens": 7626.0, "step": 4758, "total_num_tokens": 205708602.0, "z_loss": 0.0005679469904862344 }, { "copy_logits_max": -1.7344512939453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.625, "epoch": 0.9719683431197345, "gen_logits_max": 4.696084976196289, "gen_logits_mean": -14.508922576904297, "gen_logits_min": -26.13859748840332, "gen_logits_std": 2.871096611022949, "gen_loss": 0.3377591669559479, "grad_norm": 0.4149482882464084, "learning_rate": 2.4918315789473683e-05, "loss": 0.3206, "mean_copy_accuracy": 0.9950970858335495, "mean_gen_accuracy": 0.8636763542890549, "mean_token_accuracy": 0.8939439505338669, "num_tokens": 205938430.0, "sample_num_tokens": 6455.5, "step": 4759, "total_num_tokens": 205964252.0, "z_loss": 0.0005661557079292834 }, { "copy_logits_max": -3.1162972450256348, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.8125, "epoch": 0.9721725810569313, "gen_logits_max": 4.426496505737305, "gen_logits_mean": -14.462923049926758, "gen_logits_min": -26.368541717529297, "gen_logits_std": 2.861816167831421, "gen_loss": 0.3435770869255066, "grad_norm": 0.5774770739422544, "learning_rate": 2.491705263157895e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9961964190006256, "mean_gen_accuracy": 0.8665323108434677, "mean_token_accuracy": 0.8956591784954071, "num_tokens": 206200147.0, "sample_num_tokens": 8086.25, "step": 4760, "total_num_tokens": 206232492.0, "z_loss": 0.0006051048403605819 }, { "copy_logits_max": -2.6408650875091553, "copy_logits_min": -750000064.0, "copy_num_tokens": 383.5, "epoch": 0.9723768189941282, "gen_logits_max": 5.506673812866211, "gen_logits_mean": -13.613937377929688, "gen_logits_min": -25.222911834716797, "gen_logits_std": 2.8059818744659424, "gen_loss": 0.3427184820175171, "grad_norm": 0.4015098811005294, "learning_rate": 2.4915789473684212e-05, "loss": 0.2945, "mean_copy_accuracy": 0.996136263012886, "mean_gen_accuracy": 0.8624871224164963, "mean_token_accuracy": 0.9033494740724564, "num_tokens": 206496575.0, "sample_num_tokens": 7865.25, "step": 4761, "total_num_tokens": 206528036.0, "z_loss": 0.0006724097765982151 }, { "copy_logits_max": -6.104423522949219, "copy_logits_min": -750000000.0, "copy_num_tokens": 285.0625, "epoch": 0.972581056931325, "gen_logits_max": 5.329281806945801, "gen_logits_mean": -15.009260177612305, "gen_logits_min": -26.482147216796875, "gen_logits_std": 2.821800947189331, "gen_loss": 0.3335813283920288, "grad_norm": 0.43862140105532693, "learning_rate": 2.4914526315789476e-05, "loss": 0.3289, "mean_copy_accuracy": 0.9946857988834381, "mean_gen_accuracy": 0.8604210466146469, "mean_token_accuracy": 0.8913749903440475, "num_tokens": 206756863.0, "sample_num_tokens": 6970.25, "step": 4762, "total_num_tokens": 206784744.0, "z_loss": 0.0006000786670483649 }, { "copy_logits_max": -5.309279918670654, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.5, "epoch": 0.9727852948685218, "gen_logits_max": 5.943466663360596, "gen_logits_mean": -12.531198501586914, "gen_logits_min": -23.77920150756836, "gen_logits_std": 2.762280225753784, "gen_loss": 0.31315529346466064, "grad_norm": 0.3874832443236491, "learning_rate": 2.4913263157894737e-05, "loss": 0.3131, "mean_copy_accuracy": 0.9947762936353683, "mean_gen_accuracy": 0.8675539195537567, "mean_token_accuracy": 0.8954430520534515, "num_tokens": 207012021.0, "sample_num_tokens": 8045.25, "step": 4763, "total_num_tokens": 207044202.0, "z_loss": 0.0006036816630512476 }, { "copy_logits_max": -3.4187750816345215, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.625, "epoch": 0.9729895328057186, "gen_logits_max": 5.440782070159912, "gen_logits_mean": -13.996821403503418, "gen_logits_min": -25.0550479888916, "gen_logits_std": 2.7617006301879883, "gen_loss": 0.3086954951286316, "grad_norm": 0.3770845243893872, "learning_rate": 2.4912e-05, "loss": 0.291, "mean_copy_accuracy": 0.9961960464715958, "mean_gen_accuracy": 0.8709908425807953, "mean_token_accuracy": 0.9050884693861008, "num_tokens": 207295384.0, "sample_num_tokens": 8213.5, "step": 4764, "total_num_tokens": 207328238.0, "z_loss": 0.0006205445388332009 }, { "copy_logits_max": -4.035585403442383, "copy_logits_min": -750000064.0, "copy_num_tokens": 383.5625, "epoch": 0.9731937707429155, "gen_logits_max": 4.780144691467285, "gen_logits_mean": -14.209651947021484, "gen_logits_min": -25.46923065185547, "gen_logits_std": 2.780592679977417, "gen_loss": 0.29700344800949097, "grad_norm": 0.44545227774325064, "learning_rate": 2.4910736842105262e-05, "loss": 0.3094, "mean_copy_accuracy": 0.9957553148269653, "mean_gen_accuracy": 0.8683987706899643, "mean_token_accuracy": 0.895796537399292, "num_tokens": 207544357.0, "sample_num_tokens": 7021.25, "step": 4765, "total_num_tokens": 207572442.0, "z_loss": 0.000612266652751714 }, { "copy_logits_max": -4.614262104034424, "copy_logits_min": -750000000.0, "copy_num_tokens": 255.5625, "epoch": 0.9733980086801123, "gen_logits_max": 4.791641712188721, "gen_logits_mean": -14.54969596862793, "gen_logits_min": -25.809574127197266, "gen_logits_std": 2.8149778842926025, "gen_loss": 0.33743762969970703, "grad_norm": 0.41640410299498387, "learning_rate": 2.4909473684210527e-05, "loss": 0.3041, "mean_copy_accuracy": 0.996446967124939, "mean_gen_accuracy": 0.869426816701889, "mean_token_accuracy": 0.8978349268436432, "num_tokens": 207784068.0, "sample_num_tokens": 6156.5, "step": 4766, "total_num_tokens": 207808694.0, "z_loss": 0.0006830656202509999 }, { "copy_logits_max": -4.917819976806641, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.875, "epoch": 0.9736022466173092, "gen_logits_max": 4.081526756286621, "gen_logits_mean": -15.320053100585938, "gen_logits_min": -26.879669189453125, "gen_logits_std": 2.8438382148742676, "gen_loss": 0.3075755536556244, "grad_norm": 0.3635586463306238, "learning_rate": 2.4908210526315788e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9952807873487473, "mean_gen_accuracy": 0.8738954812288284, "mean_token_accuracy": 0.9060226380825043, "num_tokens": 208040565.0, "sample_num_tokens": 8063.25, "step": 4767, "total_num_tokens": 208072818.0, "z_loss": 0.0005613028188236058 }, { "copy_logits_max": -2.772099018096924, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.9375, "epoch": 0.973806484554506, "gen_logits_max": 3.9456584453582764, "gen_logits_mean": -15.008334159851074, "gen_logits_min": -26.957494735717773, "gen_logits_std": 2.8463687896728516, "gen_loss": 0.31323426961898804, "grad_norm": 0.3873895166378778, "learning_rate": 2.4906947368421055e-05, "loss": 0.3155, "mean_copy_accuracy": 0.9953885525465012, "mean_gen_accuracy": 0.862387016415596, "mean_token_accuracy": 0.8950807303190231, "num_tokens": 208331926.0, "sample_num_tokens": 9228.0, "step": 4768, "total_num_tokens": 208368838.0, "z_loss": 0.0006479557487182319 }, { "copy_logits_max": -4.963552474975586, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.0625, "epoch": 0.9740107224917028, "gen_logits_max": 5.0423688888549805, "gen_logits_mean": -14.536918640136719, "gen_logits_min": -26.25874900817871, "gen_logits_std": 2.8380534648895264, "gen_loss": 0.33264651894569397, "grad_norm": 0.3978210416104316, "learning_rate": 2.4905684210526316e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9955168962478638, "mean_gen_accuracy": 0.8655928075313568, "mean_token_accuracy": 0.9001763761043549, "num_tokens": 208612378.0, "sample_num_tokens": 9748.0, "step": 4769, "total_num_tokens": 208651370.0, "z_loss": 0.0007684940937906504 }, { "copy_logits_max": -1.1367944478988647, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.6875, "epoch": 0.9742149604288997, "gen_logits_max": 5.43581485748291, "gen_logits_mean": -13.412580490112305, "gen_logits_min": -25.197303771972656, "gen_logits_std": 2.819225549697876, "gen_loss": 0.3366602659225464, "grad_norm": 0.39195371222263775, "learning_rate": 2.490442105263158e-05, "loss": 0.3028, "mean_copy_accuracy": 0.9960625916719437, "mean_gen_accuracy": 0.8637569844722748, "mean_token_accuracy": 0.898331493139267, "num_tokens": 208900673.0, "sample_num_tokens": 7892.25, "step": 4770, "total_num_tokens": 208932242.0, "z_loss": 0.0007008148822933435 }, { "copy_logits_max": -6.325478553771973, "copy_logits_min": -750000000.0, "copy_num_tokens": 709.5625, "epoch": 0.9744191983660965, "gen_logits_max": 4.97602653503418, "gen_logits_mean": -13.195601463317871, "gen_logits_min": -25.172916412353516, "gen_logits_std": 2.889922618865967, "gen_loss": 0.2563849985599518, "grad_norm": 0.4112786959865162, "learning_rate": 2.4903157894736845e-05, "loss": 0.3157, "mean_copy_accuracy": 0.9957216084003448, "mean_gen_accuracy": 0.8626156449317932, "mean_token_accuracy": 0.8951989859342575, "num_tokens": 209150481.0, "sample_num_tokens": 11087.25, "step": 4771, "total_num_tokens": 209194830.0, "z_loss": 0.0005613038083538413 }, { "copy_logits_max": -3.9076175689697266, "copy_logits_min": -625000064.0, "copy_num_tokens": 660.875, "epoch": 0.9746234363032933, "gen_logits_max": 3.573153257369995, "gen_logits_mean": -16.164234161376953, "gen_logits_min": -27.665634155273438, "gen_logits_std": 2.8737130165100098, "gen_loss": 0.3087688088417053, "grad_norm": 0.37631144140390627, "learning_rate": 2.4901894736842106e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9953953176736832, "mean_gen_accuracy": 0.864421084523201, "mean_token_accuracy": 0.9001905769109726, "num_tokens": 209468753.0, "sample_num_tokens": 10440.75, "step": 4772, "total_num_tokens": 209510516.0, "z_loss": 0.000568134244531393 }, { "copy_logits_max": -2.495266914367676, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.375, "epoch": 0.9748276742404902, "gen_logits_max": 5.450305938720703, "gen_logits_mean": -13.418966293334961, "gen_logits_min": -25.55916404724121, "gen_logits_std": 2.8789713382720947, "gen_loss": 0.3512420654296875, "grad_norm": 0.39959414394436055, "learning_rate": 2.490063157894737e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9957003742456436, "mean_gen_accuracy": 0.8714179247617722, "mean_token_accuracy": 0.9035245776176453, "num_tokens": 209767876.0, "sample_num_tokens": 7619.5, "step": 4773, "total_num_tokens": 209798354.0, "z_loss": 0.0006656600162386894 }, { "copy_logits_max": -4.220546722412109, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.8125, "epoch": 0.975031912177687, "gen_logits_max": 3.5860273838043213, "gen_logits_mean": -16.849349975585938, "gen_logits_min": -28.57059669494629, "gen_logits_std": 2.912519693374634, "gen_loss": 0.31843775510787964, "grad_norm": 0.4493244434441827, "learning_rate": 2.489936842105263e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9956260472536087, "mean_gen_accuracy": 0.8688198924064636, "mean_token_accuracy": 0.8979033976793289, "num_tokens": 210041092.0, "sample_num_tokens": 8099.5, "step": 4774, "total_num_tokens": 210073490.0, "z_loss": 0.0006105900974944234 }, { "copy_logits_max": -1.8979008197784424, "copy_logits_min": -687500032.0, "copy_num_tokens": 814.625, "epoch": 0.9752361501148838, "gen_logits_max": 4.313488006591797, "gen_logits_mean": -14.215794563293457, "gen_logits_min": -27.298738479614258, "gen_logits_std": 2.913560390472412, "gen_loss": 0.3068329691886902, "grad_norm": 0.37903736107551594, "learning_rate": 2.4898105263157895e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9962006658315659, "mean_gen_accuracy": 0.8669039309024811, "mean_token_accuracy": 0.9025028944015503, "num_tokens": 210328862.0, "sample_num_tokens": 10600.5, "step": 4775, "total_num_tokens": 210371264.0, "z_loss": 0.0006037837592884898 }, { "copy_logits_max": -5.710629463195801, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.875, "epoch": 0.9754403880520807, "gen_logits_max": 4.589231491088867, "gen_logits_mean": -14.869319915771484, "gen_logits_min": -26.99354362487793, "gen_logits_std": 2.862027168273926, "gen_loss": 0.3463941216468811, "grad_norm": 0.3934178054766222, "learning_rate": 2.4896842105263156e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9960980415344238, "mean_gen_accuracy": 0.8607782572507858, "mean_token_accuracy": 0.8974036127328873, "num_tokens": 210625178.0, "sample_num_tokens": 10373.0, "step": 4776, "total_num_tokens": 210666670.0, "z_loss": 0.0005747106624767184 }, { "copy_logits_max": -2.090000629425049, "copy_logits_min": -687500032.0, "copy_num_tokens": 630.625, "epoch": 0.9756446259892775, "gen_logits_max": 4.213894367218018, "gen_logits_mean": -14.225790023803711, "gen_logits_min": -26.52642059326172, "gen_logits_std": 2.9465134143829346, "gen_loss": 0.27275022864341736, "grad_norm": 0.38634106187391903, "learning_rate": 2.4895578947368424e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9963797181844711, "mean_gen_accuracy": 0.8639008849859238, "mean_token_accuracy": 0.9021355658769608, "num_tokens": 210892897.0, "sample_num_tokens": 9301.75, "step": 4777, "total_num_tokens": 210930104.0, "z_loss": 0.0005056434310972691 }, { "copy_logits_max": -6.375951290130615, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.625, "epoch": 0.9758488639264743, "gen_logits_max": 4.784502983093262, "gen_logits_mean": -15.48741340637207, "gen_logits_min": -27.53276252746582, "gen_logits_std": 2.8814549446105957, "gen_loss": 0.35266658663749695, "grad_norm": 0.39218499563002307, "learning_rate": 2.4894315789473685e-05, "loss": 0.3066, "mean_copy_accuracy": 0.995416522026062, "mean_gen_accuracy": 0.8677032738924026, "mean_token_accuracy": 0.8976842612028122, "num_tokens": 211160890.0, "sample_num_tokens": 7556.5, "step": 4778, "total_num_tokens": 211191116.0, "z_loss": 0.0006482035387307405 }, { "copy_logits_max": -5.184108734130859, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.25, "epoch": 0.9760531018636712, "gen_logits_max": 6.336815357208252, "gen_logits_mean": -13.539058685302734, "gen_logits_min": -25.08814811706543, "gen_logits_std": 2.821042537689209, "gen_loss": 0.3713699281215668, "grad_norm": 0.4217206186054118, "learning_rate": 2.489305263157895e-05, "loss": 0.3122, "mean_copy_accuracy": 0.9956096261739731, "mean_gen_accuracy": 0.8701083809137344, "mean_token_accuracy": 0.896643877029419, "num_tokens": 211403358.0, "sample_num_tokens": 7311.0, "step": 4779, "total_num_tokens": 211432602.0, "z_loss": 0.0007079357746988535 }, { "copy_logits_max": -5.015050888061523, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.75, "epoch": 0.976257339800868, "gen_logits_max": 5.155057430267334, "gen_logits_mean": -13.606597900390625, "gen_logits_min": -25.3515682220459, "gen_logits_std": 2.89841628074646, "gen_loss": 0.3005872964859009, "grad_norm": 0.3611343474811419, "learning_rate": 2.489178947368421e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9956383109092712, "mean_gen_accuracy": 0.8619868457317352, "mean_token_accuracy": 0.8948856890201569, "num_tokens": 211664303.0, "sample_num_tokens": 8854.75, "step": 4780, "total_num_tokens": 211699722.0, "z_loss": 0.0006187575636431575 }, { "copy_logits_max": -6.562614917755127, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.6875, "epoch": 0.9764615777380649, "gen_logits_max": 5.099632263183594, "gen_logits_mean": -15.402877807617188, "gen_logits_min": -27.117862701416016, "gen_logits_std": 2.8576865196228027, "gen_loss": 0.31256675720214844, "grad_norm": 0.4151370952045534, "learning_rate": 2.4890526315789474e-05, "loss": 0.3234, "mean_copy_accuracy": 0.9938950836658478, "mean_gen_accuracy": 0.8638617247343063, "mean_token_accuracy": 0.892199382185936, "num_tokens": 211934014.0, "sample_num_tokens": 7186.0, "step": 4781, "total_num_tokens": 211962758.0, "z_loss": 0.0006359748658724129 }, { "copy_logits_max": -5.776673316955566, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.125, "epoch": 0.9766658156752617, "gen_logits_max": 6.252817153930664, "gen_logits_mean": -13.566038131713867, "gen_logits_min": -25.215343475341797, "gen_logits_std": 2.8453328609466553, "gen_loss": 0.3165503144264221, "grad_norm": 0.4112933301731803, "learning_rate": 2.4889263157894735e-05, "loss": 0.3, "mean_copy_accuracy": 0.9952547997236252, "mean_gen_accuracy": 0.8715739101171494, "mean_token_accuracy": 0.9010709226131439, "num_tokens": 212185227.0, "sample_num_tokens": 7685.25, "step": 4782, "total_num_tokens": 212215968.0, "z_loss": 0.0006218897178769112 }, { "copy_logits_max": -4.49361515045166, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.125, "epoch": 0.9768700536124585, "gen_logits_max": 5.342164039611816, "gen_logits_mean": -13.760517120361328, "gen_logits_min": -25.16393280029297, "gen_logits_std": 2.8057985305786133, "gen_loss": 0.3292527198791504, "grad_norm": 0.48598444907389543, "learning_rate": 2.4888e-05, "loss": 0.326, "mean_copy_accuracy": 0.9954435676336288, "mean_gen_accuracy": 0.859626516699791, "mean_token_accuracy": 0.8924980908632278, "num_tokens": 212450520.0, "sample_num_tokens": 7720.0, "step": 4783, "total_num_tokens": 212481400.0, "z_loss": 0.0006680542137473822 }, { "copy_logits_max": -4.0138840675354, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.8125, "epoch": 0.9770742915496553, "gen_logits_max": 5.200620651245117, "gen_logits_mean": -14.17507266998291, "gen_logits_min": -25.877883911132812, "gen_logits_std": 2.8511698246002197, "gen_loss": 0.3063129484653473, "grad_norm": 0.4457241789880741, "learning_rate": 2.4886736842105264e-05, "loss": 0.3238, "mean_copy_accuracy": 0.9960951954126358, "mean_gen_accuracy": 0.8584722727537155, "mean_token_accuracy": 0.8922302275896072, "num_tokens": 212696381.0, "sample_num_tokens": 8651.25, "step": 4784, "total_num_tokens": 212730986.0, "z_loss": 0.0006892128731124103 }, { "copy_logits_max": -3.3908727169036865, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.125, "epoch": 0.9772785294868522, "gen_logits_max": 6.36579704284668, "gen_logits_mean": -12.562479019165039, "gen_logits_min": -24.49456787109375, "gen_logits_std": 2.8221538066864014, "gen_loss": 0.3260054588317871, "grad_norm": 0.8399725007429846, "learning_rate": 2.4885473684210528e-05, "loss": 0.296, "mean_copy_accuracy": 0.9966201782226562, "mean_gen_accuracy": 0.8629705905914307, "mean_token_accuracy": 0.9008050560951233, "num_tokens": 212984659.0, "sample_num_tokens": 8573.75, "step": 4785, "total_num_tokens": 213018954.0, "z_loss": 0.0006776095251552761 }, { "copy_logits_max": -3.4750494956970215, "copy_logits_min": -687500032.0, "copy_num_tokens": 484.8125, "epoch": 0.977482767424049, "gen_logits_max": 5.849246025085449, "gen_logits_mean": -12.945232391357422, "gen_logits_min": -24.554141998291016, "gen_logits_std": 2.8447959423065186, "gen_loss": 0.321637898683548, "grad_norm": 0.3925229310196057, "learning_rate": 2.4884210526315792e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9955798983573914, "mean_gen_accuracy": 0.8679975122213364, "mean_token_accuracy": 0.9019724726676941, "num_tokens": 213276994.0, "sample_num_tokens": 8744.0, "step": 4786, "total_num_tokens": 213311970.0, "z_loss": 0.0006134708528406918 }, { "copy_logits_max": -3.2897632122039795, "copy_logits_min": -750000064.0, "copy_num_tokens": 815.125, "epoch": 0.9776870053612459, "gen_logits_max": 3.305868625640869, "gen_logits_mean": -15.508211135864258, "gen_logits_min": -27.397079467773438, "gen_logits_std": 2.9084064960479736, "gen_loss": 0.2668803632259369, "grad_norm": 0.3755429825497654, "learning_rate": 2.4882947368421053e-05, "loss": 0.3038, "mean_copy_accuracy": 0.995758131146431, "mean_gen_accuracy": 0.8609881699085236, "mean_token_accuracy": 0.9008449763059616, "num_tokens": 213564386.0, "sample_num_tokens": 10342.0, "step": 4787, "total_num_tokens": 213605754.0, "z_loss": 0.0005693132407031953 }, { "copy_logits_max": -3.3716797828674316, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.3125, "epoch": 0.9778912432984427, "gen_logits_max": 5.130575180053711, "gen_logits_mean": -14.919675827026367, "gen_logits_min": -26.758243560791016, "gen_logits_std": 2.882873058319092, "gen_loss": 0.3057399392127991, "grad_norm": 0.4156895635673897, "learning_rate": 2.4881684210526318e-05, "loss": 0.308, "mean_copy_accuracy": 0.9964899718761444, "mean_gen_accuracy": 0.8658561110496521, "mean_token_accuracy": 0.8962152749300003, "num_tokens": 213832910.0, "sample_num_tokens": 6792.5, "step": 4788, "total_num_tokens": 213860080.0, "z_loss": 0.0006659561768174171 }, { "copy_logits_max": -4.2771525382995605, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.0, "epoch": 0.9780954812356395, "gen_logits_max": 5.066458702087402, "gen_logits_mean": -13.364718437194824, "gen_logits_min": -24.81513786315918, "gen_logits_std": 2.7634706497192383, "gen_loss": 0.32064494490623474, "grad_norm": 0.36930840275180465, "learning_rate": 2.488042105263158e-05, "loss": 0.3103, "mean_copy_accuracy": 0.9959196597337723, "mean_gen_accuracy": 0.8688445836305618, "mean_token_accuracy": 0.8989369869232178, "num_tokens": 214116284.0, "sample_num_tokens": 9564.5, "step": 4789, "total_num_tokens": 214154542.0, "z_loss": 0.0007114883628673851 }, { "copy_logits_max": -4.769964218139648, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.0625, "epoch": 0.9782997191728363, "gen_logits_max": 5.469901084899902, "gen_logits_mean": -14.120366096496582, "gen_logits_min": -25.76968002319336, "gen_logits_std": 2.8624439239501953, "gen_loss": 0.26662349700927734, "grad_norm": 0.3851470596248878, "learning_rate": 2.4879157894736843e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9963359534740448, "mean_gen_accuracy": 0.8712878525257111, "mean_token_accuracy": 0.9028522968292236, "num_tokens": 214396079.0, "sample_num_tokens": 9642.25, "step": 4790, "total_num_tokens": 214434648.0, "z_loss": 0.0006014845566824079 }, { "copy_logits_max": -2.568483591079712, "copy_logits_min": -750000064.0, "copy_num_tokens": 448.125, "epoch": 0.9785039571100332, "gen_logits_max": 5.713650703430176, "gen_logits_mean": -13.402482986450195, "gen_logits_min": -25.792491912841797, "gen_logits_std": 2.885788679122925, "gen_loss": 0.2833014726638794, "grad_norm": 0.40580496982848213, "learning_rate": 2.4877894736842104e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9952825456857681, "mean_gen_accuracy": 0.8679890483617783, "mean_token_accuracy": 0.8987548053264618, "num_tokens": 214655084.0, "sample_num_tokens": 7824.5, "step": 4791, "total_num_tokens": 214686382.0, "z_loss": 0.000596723286435008 }, { "copy_logits_max": -4.4629106521606445, "copy_logits_min": -750000064.0, "copy_num_tokens": 370.0, "epoch": 0.9787081950472301, "gen_logits_max": 5.702961444854736, "gen_logits_mean": -13.821298599243164, "gen_logits_min": -25.27678680419922, "gen_logits_std": 2.819817543029785, "gen_loss": 0.29168832302093506, "grad_norm": 0.40403776290873666, "learning_rate": 2.4876631578947368e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9951255917549133, "mean_gen_accuracy": 0.8659666180610657, "mean_token_accuracy": 0.8958036154508591, "num_tokens": 214914083.0, "sample_num_tokens": 8662.25, "step": 4792, "total_num_tokens": 214948732.0, "z_loss": 0.0006142113707028329 }, { "copy_logits_max": -3.0829124450683594, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.75, "epoch": 0.9789124329844269, "gen_logits_max": 4.085685729980469, "gen_logits_mean": -15.235527992248535, "gen_logits_min": -27.16004753112793, "gen_logits_std": 2.85606050491333, "gen_loss": 0.2900502383708954, "grad_norm": 0.3980836858926613, "learning_rate": 2.4875368421052633e-05, "loss": 0.3217, "mean_copy_accuracy": 0.9959635883569717, "mean_gen_accuracy": 0.8592585921287537, "mean_token_accuracy": 0.8922634273767471, "num_tokens": 215189381.0, "sample_num_tokens": 8750.75, "step": 4793, "total_num_tokens": 215224384.0, "z_loss": 0.0006330965552479029 }, { "copy_logits_max": -3.144562244415283, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 0.9791166709216237, "gen_logits_max": 4.605005264282227, "gen_logits_mean": -14.70955753326416, "gen_logits_min": -26.73210906982422, "gen_logits_std": 2.897472381591797, "gen_loss": 0.28019750118255615, "grad_norm": 0.39829287658504386, "learning_rate": 2.4874105263157897e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9960353076457977, "mean_gen_accuracy": 0.8656473159790039, "mean_token_accuracy": 0.8976192027330399, "num_tokens": 215461984.0, "sample_num_tokens": 7971.0, "step": 4794, "total_num_tokens": 215493868.0, "z_loss": 0.0005631748354062438 }, { "copy_logits_max": -3.904451370239258, "copy_logits_min": -625000064.0, "copy_num_tokens": 521.4375, "epoch": 0.9793209088588205, "gen_logits_max": 5.262899398803711, "gen_logits_mean": -13.051300048828125, "gen_logits_min": -24.782249450683594, "gen_logits_std": 2.775667667388916, "gen_loss": 0.30853521823883057, "grad_norm": 0.3939541778621239, "learning_rate": 2.4872842105263158e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9952514320611954, "mean_gen_accuracy": 0.8690351694822311, "mean_token_accuracy": 0.900263711810112, "num_tokens": 215735619.0, "sample_num_tokens": 8668.75, "step": 4795, "total_num_tokens": 215770294.0, "z_loss": 0.0006470810621976852 }, { "copy_logits_max": -4.749181747436523, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.375, "epoch": 0.9795251467960173, "gen_logits_max": 4.120232582092285, "gen_logits_mean": -15.918896675109863, "gen_logits_min": -27.50377655029297, "gen_logits_std": 2.838897466659546, "gen_loss": 0.3278713822364807, "grad_norm": 0.553697677813061, "learning_rate": 2.4871578947368422e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9965937733650208, "mean_gen_accuracy": 0.8667791932821274, "mean_token_accuracy": 0.8995134085416794, "num_tokens": 216013163.0, "sample_num_tokens": 8120.25, "step": 4796, "total_num_tokens": 216045644.0, "z_loss": 0.0006073277909308672 }, { "copy_logits_max": -3.9990768432617188, "copy_logits_min": -750000064.0, "copy_num_tokens": 365.25, "epoch": 0.9797293847332142, "gen_logits_max": 4.081367492675781, "gen_logits_mean": -15.753969192504883, "gen_logits_min": -27.146469116210938, "gen_logits_std": 2.805879592895508, "gen_loss": 0.3123161196708679, "grad_norm": 0.4604093136377138, "learning_rate": 2.4870315789473683e-05, "loss": 0.3191, "mean_copy_accuracy": 0.9939948171377182, "mean_gen_accuracy": 0.8616963326931, "mean_token_accuracy": 0.8927333801984787, "num_tokens": 216258285.0, "sample_num_tokens": 7530.25, "step": 4797, "total_num_tokens": 216288406.0, "z_loss": 0.0006426413892768323 }, { "copy_logits_max": -3.698227643966675, "copy_logits_min": -625000064.0, "copy_num_tokens": 449.5, "epoch": 0.9799336226704111, "gen_logits_max": 5.128188610076904, "gen_logits_mean": -13.721513748168945, "gen_logits_min": -25.088279724121094, "gen_logits_std": 2.7628159523010254, "gen_loss": 0.3069455027580261, "grad_norm": 0.3936171579921106, "learning_rate": 2.4869052631578947e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9947158396244049, "mean_gen_accuracy": 0.8722276240587234, "mean_token_accuracy": 0.9031027108430862, "num_tokens": 216544767.0, "sample_num_tokens": 8292.75, "step": 4798, "total_num_tokens": 216577938.0, "z_loss": 0.0005845747655257583 }, { "copy_logits_max": -4.548271656036377, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.3125, "epoch": 0.9801378606076079, "gen_logits_max": 4.648085594177246, "gen_logits_mean": -14.004883766174316, "gen_logits_min": -25.153846740722656, "gen_logits_std": 2.7536659240722656, "gen_loss": 0.2802000343799591, "grad_norm": 0.3759935974721695, "learning_rate": 2.486778947368421e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9959265142679214, "mean_gen_accuracy": 0.8704090267419815, "mean_token_accuracy": 0.9028503894805908, "num_tokens": 216823457.0, "sample_num_tokens": 10142.25, "step": 4799, "total_num_tokens": 216864026.0, "z_loss": 0.0005463500856421888 }, { "copy_logits_max": -5.3159894943237305, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.5625, "epoch": 0.9803420985448047, "gen_logits_max": 4.69788932800293, "gen_logits_mean": -13.889749526977539, "gen_logits_min": -25.148866653442383, "gen_logits_std": 2.7425429821014404, "gen_loss": 0.22933319211006165, "grad_norm": 0.39329030103380774, "learning_rate": 2.4866526315789473e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9945153146982193, "mean_gen_accuracy": 0.8798950463533401, "mean_token_accuracy": 0.9061731398105621, "num_tokens": 217095316.0, "sample_num_tokens": 8820.5, "step": 4800, "total_num_tokens": 217130598.0, "z_loss": 0.0004244356823619455 }, { "copy_logits_max": -4.681108474731445, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.5, "epoch": 0.9805463364820015, "gen_logits_max": 4.075206756591797, "gen_logits_mean": -15.0625581741333, "gen_logits_min": -26.39458656311035, "gen_logits_std": 2.792681932449341, "gen_loss": 0.3285626769065857, "grad_norm": 0.3542338264078523, "learning_rate": 2.486526315789474e-05, "loss": 0.3218, "mean_copy_accuracy": 0.9960757046937943, "mean_gen_accuracy": 0.857761561870575, "mean_token_accuracy": 0.8927018195390701, "num_tokens": 217383728.0, "sample_num_tokens": 10799.5, "step": 4801, "total_num_tokens": 217426926.0, "z_loss": 0.0006224197568371892 }, { "copy_logits_max": -4.644477367401123, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.6875, "epoch": 0.9807505744191983, "gen_logits_max": 4.371659278869629, "gen_logits_mean": -14.553311347961426, "gen_logits_min": -26.08708953857422, "gen_logits_std": 2.7858316898345947, "gen_loss": 0.2938993573188782, "grad_norm": 0.43696451987793733, "learning_rate": 2.4864e-05, "loss": 0.3079, "mean_copy_accuracy": 0.99375881254673, "mean_gen_accuracy": 0.8694770932197571, "mean_token_accuracy": 0.8960070610046387, "num_tokens": 217639453.0, "sample_num_tokens": 7810.25, "step": 4802, "total_num_tokens": 217670694.0, "z_loss": 0.0005117136752232909 }, { "copy_logits_max": -5.6755757331848145, "copy_logits_min": -750000064.0, "copy_num_tokens": 478.5, "epoch": 0.9809548123563951, "gen_logits_max": 5.298504829406738, "gen_logits_mean": -13.362634658813477, "gen_logits_min": -24.86044692993164, "gen_logits_std": 2.7583107948303223, "gen_loss": 0.3350626528263092, "grad_norm": 0.40891122238691036, "learning_rate": 2.4862736842105265e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9938570111989975, "mean_gen_accuracy": 0.8664472848176956, "mean_token_accuracy": 0.8994071781635284, "num_tokens": 217905445.0, "sample_num_tokens": 8219.75, "step": 4803, "total_num_tokens": 217938324.0, "z_loss": 0.0006061358144506812 }, { "copy_logits_max": -4.342313766479492, "copy_logits_min": -687500032.0, "copy_num_tokens": 580.875, "epoch": 0.9811590502935921, "gen_logits_max": 4.254735946655273, "gen_logits_mean": -15.400897979736328, "gen_logits_min": -27.51475715637207, "gen_logits_std": 2.8639752864837646, "gen_loss": 0.2954601049423218, "grad_norm": 0.37682399940707384, "learning_rate": 2.4861473684210526e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9957608431577682, "mean_gen_accuracy": 0.8646963834762573, "mean_token_accuracy": 0.8987370580434799, "num_tokens": 218199120.0, "sample_num_tokens": 9802.5, "step": 4804, "total_num_tokens": 218238330.0, "z_loss": 0.0006119225872680545 }, { "copy_logits_max": -5.083224296569824, "copy_logits_min": -750000064.0, "copy_num_tokens": 575.75, "epoch": 0.9813632882307889, "gen_logits_max": 3.814250946044922, "gen_logits_mean": -14.777246475219727, "gen_logits_min": -26.531408309936523, "gen_logits_std": 2.859579563140869, "gen_loss": 0.28549861907958984, "grad_norm": 0.37569679160316255, "learning_rate": 2.486021052631579e-05, "loss": 0.2855, "mean_copy_accuracy": 0.99647556245327, "mean_gen_accuracy": 0.8709861189126968, "mean_token_accuracy": 0.906404435634613, "num_tokens": 218472706.0, "sample_num_tokens": 8306.0, "step": 4805, "total_num_tokens": 218505930.0, "z_loss": 0.0005546775064431131 }, { "copy_logits_max": -6.606016159057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.1875, "epoch": 0.9815675261679857, "gen_logits_max": 5.008894920349121, "gen_logits_mean": -14.800287246704102, "gen_logits_min": -26.593738555908203, "gen_logits_std": 2.842595100402832, "gen_loss": 0.3090716004371643, "grad_norm": 0.41895694557586804, "learning_rate": 2.485894736842105e-05, "loss": 0.3234, "mean_copy_accuracy": 0.995083674788475, "mean_gen_accuracy": 0.8587569445371628, "mean_token_accuracy": 0.8911316692829132, "num_tokens": 218741420.0, "sample_num_tokens": 7879.0, "step": 4806, "total_num_tokens": 218772936.0, "z_loss": 0.0006168173858895898 }, { "copy_logits_max": -3.360832452774048, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.25, "epoch": 0.9817717641051825, "gen_logits_max": 4.779139518737793, "gen_logits_mean": -14.279853820800781, "gen_logits_min": -26.213459014892578, "gen_logits_std": 2.8789443969726562, "gen_loss": 0.30091631412506104, "grad_norm": 0.39672430748047083, "learning_rate": 2.4857684210526316e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9960614442825317, "mean_gen_accuracy": 0.8637481927871704, "mean_token_accuracy": 0.898361548781395, "num_tokens": 219010106.0, "sample_num_tokens": 7155.0, "step": 4807, "total_num_tokens": 219038726.0, "z_loss": 0.0005908350576646626 }, { "copy_logits_max": -3.802415132522583, "copy_logits_min": -750000000.0, "copy_num_tokens": 851.75, "epoch": 0.9819760020423793, "gen_logits_max": 4.00961971282959, "gen_logits_mean": -13.812253952026367, "gen_logits_min": -25.85598373413086, "gen_logits_std": 2.860065221786499, "gen_loss": 0.3132172226905823, "grad_norm": 0.394096322632413, "learning_rate": 2.4856421052631577e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9966065287590027, "mean_gen_accuracy": 0.8690709620714188, "mean_token_accuracy": 0.9067869037389755, "num_tokens": 219291234.0, "sample_num_tokens": 10471.5, "step": 4808, "total_num_tokens": 219333120.0, "z_loss": 0.0006199915078468621 }, { "copy_logits_max": -5.3757123947143555, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.8125, "epoch": 0.9821802399795762, "gen_logits_max": 4.934731483459473, "gen_logits_mean": -14.85820484161377, "gen_logits_min": -26.678028106689453, "gen_logits_std": 2.822737693786621, "gen_loss": 0.31111109256744385, "grad_norm": 0.38366232729671645, "learning_rate": 2.4855157894736845e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9952467978000641, "mean_gen_accuracy": 0.8698355406522751, "mean_token_accuracy": 0.901405930519104, "num_tokens": 219579289.0, "sample_num_tokens": 8585.75, "step": 4809, "total_num_tokens": 219613632.0, "z_loss": 0.0005736704915761948 }, { "copy_logits_max": -3.1265437602996826, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.6875, "epoch": 0.9823844779167731, "gen_logits_max": 4.489060401916504, "gen_logits_mean": -14.395130157470703, "gen_logits_min": -26.053524017333984, "gen_logits_std": 2.8665382862091064, "gen_loss": 0.29657310247421265, "grad_norm": 0.38723895275340986, "learning_rate": 2.4853894736842105e-05, "loss": 0.3209, "mean_copy_accuracy": 0.9958304911851883, "mean_gen_accuracy": 0.8614068329334259, "mean_token_accuracy": 0.8931072056293488, "num_tokens": 219841017.0, "sample_num_tokens": 8986.75, "step": 4810, "total_num_tokens": 219876964.0, "z_loss": 0.0006129001267254353 }, { "copy_logits_max": -3.1123714447021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.375, "epoch": 0.9825887158539699, "gen_logits_max": 4.9579877853393555, "gen_logits_mean": -15.651033401489258, "gen_logits_min": -27.28452491760254, "gen_logits_std": 2.87432599067688, "gen_loss": 0.3217533230781555, "grad_norm": 0.3715572308780323, "learning_rate": 2.485263157894737e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9959730058908463, "mean_gen_accuracy": 0.8647858500480652, "mean_token_accuracy": 0.8967512995004654, "num_tokens": 220125970.0, "sample_num_tokens": 8822.0, "step": 4811, "total_num_tokens": 220161258.0, "z_loss": 0.0006527419900521636 }, { "copy_logits_max": -1.5481815338134766, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.625, "epoch": 0.9827929537911667, "gen_logits_max": 4.915238380432129, "gen_logits_mean": -12.515628814697266, "gen_logits_min": -24.694204330444336, "gen_logits_std": 2.847651958465576, "gen_loss": 0.31064528226852417, "grad_norm": 0.4317289393347422, "learning_rate": 2.4851368421052634e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9961425215005875, "mean_gen_accuracy": 0.8712587058544159, "mean_token_accuracy": 0.8996142446994781, "num_tokens": 220371528.0, "sample_num_tokens": 8236.0, "step": 4812, "total_num_tokens": 220404472.0, "z_loss": 0.000691670342348516 }, { "copy_logits_max": -4.94103479385376, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.3125, "epoch": 0.9829971917283635, "gen_logits_max": 6.115342140197754, "gen_logits_mean": -12.296977996826172, "gen_logits_min": -23.721616744995117, "gen_logits_std": 2.763915538787842, "gen_loss": 0.32813167572021484, "grad_norm": 0.4106467649676005, "learning_rate": 2.4850105263157895e-05, "loss": 0.3121, "mean_copy_accuracy": 0.9941819906234741, "mean_gen_accuracy": 0.8660954087972641, "mean_token_accuracy": 0.894784152507782, "num_tokens": 220632150.0, "sample_num_tokens": 8704.0, "step": 4813, "total_num_tokens": 220666966.0, "z_loss": 0.0007065475219860673 }, { "copy_logits_max": -3.472383975982666, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.0, "epoch": 0.9832014296655603, "gen_logits_max": 5.1649394035339355, "gen_logits_mean": -14.626374244689941, "gen_logits_min": -26.238868713378906, "gen_logits_std": 2.835737466812134, "gen_loss": 0.3367428183555603, "grad_norm": 0.48211175998070027, "learning_rate": 2.484884210526316e-05, "loss": 0.3042, "mean_copy_accuracy": 0.9939501583576202, "mean_gen_accuracy": 0.8603229373693466, "mean_token_accuracy": 0.8980620801448822, "num_tokens": 220903224.0, "sample_num_tokens": 7647.0, "step": 4814, "total_num_tokens": 220933812.0, "z_loss": 0.0006836996180936694 }, { "copy_logits_max": -5.948942184448242, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.625, "epoch": 0.9834056676027572, "gen_logits_max": 5.2731614112854, "gen_logits_mean": -14.201742172241211, "gen_logits_min": -26.37987518310547, "gen_logits_std": 2.902549982070923, "gen_loss": 0.30250388383865356, "grad_norm": 0.4187905723210495, "learning_rate": 2.484757894736842e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9954100996255875, "mean_gen_accuracy": 0.8642369359731674, "mean_token_accuracy": 0.8936277478933334, "num_tokens": 221161817.0, "sample_num_tokens": 6718.25, "step": 4815, "total_num_tokens": 221188690.0, "z_loss": 0.000585228786803782 }, { "copy_logits_max": -3.7396087646484375, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.9375, "epoch": 0.9836099055399541, "gen_logits_max": 5.827903747558594, "gen_logits_mean": -13.605974197387695, "gen_logits_min": -24.88182830810547, "gen_logits_std": 2.8314948081970215, "gen_loss": 0.28736549615859985, "grad_norm": 0.383940397162559, "learning_rate": 2.4846315789473685e-05, "loss": 0.2974, "mean_copy_accuracy": 0.996776357293129, "mean_gen_accuracy": 0.8666900545358658, "mean_token_accuracy": 0.9017637968063354, "num_tokens": 221445753.0, "sample_num_tokens": 9710.75, "step": 4816, "total_num_tokens": 221484596.0, "z_loss": 0.0007082083029672503 }, { "copy_logits_max": -4.116518974304199, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.625, "epoch": 0.9838141434771509, "gen_logits_max": 3.9504220485687256, "gen_logits_mean": -16.254566192626953, "gen_logits_min": -27.933216094970703, "gen_logits_std": 2.8949079513549805, "gen_loss": 0.2931530475616455, "grad_norm": 0.37026545623993296, "learning_rate": 2.484505263157895e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9953776299953461, "mean_gen_accuracy": 0.8698426485061646, "mean_token_accuracy": 0.9018005579710007, "num_tokens": 221742441.0, "sample_num_tokens": 8217.25, "step": 4817, "total_num_tokens": 221775310.0, "z_loss": 0.0006298506050370634 }, { "copy_logits_max": -2.3628764152526855, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.5625, "epoch": 0.9840183814143477, "gen_logits_max": 4.1447601318359375, "gen_logits_mean": -15.189667701721191, "gen_logits_min": -27.18868637084961, "gen_logits_std": 2.849752902984619, "gen_loss": 0.27833497524261475, "grad_norm": 0.4300926988300002, "learning_rate": 2.4843789473684213e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9954071640968323, "mean_gen_accuracy": 0.869772657752037, "mean_token_accuracy": 0.9058545529842377, "num_tokens": 222035784.0, "sample_num_tokens": 7748.0, "step": 4818, "total_num_tokens": 222066776.0, "z_loss": 0.000561793043743819 }, { "copy_logits_max": -5.729217052459717, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.6875, "epoch": 0.9842226193515445, "gen_logits_max": 4.0563764572143555, "gen_logits_mean": -17.00960922241211, "gen_logits_min": -27.881223678588867, "gen_logits_std": 2.8259267807006836, "gen_loss": 0.3349580764770508, "grad_norm": 0.4032091756951956, "learning_rate": 2.4842526315789474e-05, "loss": 0.3038, "mean_copy_accuracy": 0.994747057557106, "mean_gen_accuracy": 0.8716991245746613, "mean_token_accuracy": 0.8980710059404373, "num_tokens": 222300957.0, "sample_num_tokens": 7702.25, "step": 4819, "total_num_tokens": 222331766.0, "z_loss": 0.0006248371209949255 }, { "copy_logits_max": -2.560999631881714, "copy_logits_min": -687500032.0, "copy_num_tokens": 533.0625, "epoch": 0.9844268572887414, "gen_logits_max": 3.969346046447754, "gen_logits_mean": -14.490388870239258, "gen_logits_min": -26.061504364013672, "gen_logits_std": 2.843742847442627, "gen_loss": 0.3336809575557709, "grad_norm": 0.3821360338649551, "learning_rate": 2.484126315789474e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9960986077785492, "mean_gen_accuracy": 0.8633492439985275, "mean_token_accuracy": 0.8961591273546219, "num_tokens": 222595751.0, "sample_num_tokens": 9134.25, "step": 4820, "total_num_tokens": 222632288.0, "z_loss": 0.0006394408992491663 }, { "copy_logits_max": -1.0561015605926514, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.375, "epoch": 0.9846310952259382, "gen_logits_max": 4.07985782623291, "gen_logits_mean": -14.513447761535645, "gen_logits_min": -25.989299774169922, "gen_logits_std": 2.8320155143737793, "gen_loss": 0.29172462224960327, "grad_norm": 0.3770891622954699, "learning_rate": 2.484e-05, "loss": 0.3003, "mean_copy_accuracy": 0.9957810193300247, "mean_gen_accuracy": 0.865983858704567, "mean_token_accuracy": 0.9019284397363663, "num_tokens": 222876183.0, "sample_num_tokens": 8012.25, "step": 4821, "total_num_tokens": 222908232.0, "z_loss": 0.0006002797745168209 }, { "copy_logits_max": -5.0571417808532715, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.6875, "epoch": 0.9848353331631351, "gen_logits_max": 4.891697883605957, "gen_logits_mean": -14.901208877563477, "gen_logits_min": -25.982831954956055, "gen_logits_std": 2.7871456146240234, "gen_loss": 0.32830411195755005, "grad_norm": 0.393377861756347, "learning_rate": 2.4838736842105264e-05, "loss": 0.3155, "mean_copy_accuracy": 0.9948980063199997, "mean_gen_accuracy": 0.8677153140306473, "mean_token_accuracy": 0.8935476243495941, "num_tokens": 223130518.0, "sample_num_tokens": 7576.0, "step": 4822, "total_num_tokens": 223160822.0, "z_loss": 0.0005861715762875974 }, { "copy_logits_max": -1.3673086166381836, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.1875, "epoch": 0.9850395711003319, "gen_logits_max": 3.6005916595458984, "gen_logits_mean": -14.682628631591797, "gen_logits_min": -26.17144203186035, "gen_logits_std": 2.8055787086486816, "gen_loss": 0.28295812010765076, "grad_norm": 0.38553863785799947, "learning_rate": 2.4837473684210525e-05, "loss": 0.3349, "mean_copy_accuracy": 0.9963365942239761, "mean_gen_accuracy": 0.8555299639701843, "mean_token_accuracy": 0.8883463144302368, "num_tokens": 223409204.0, "sample_num_tokens": 8733.5, "step": 4823, "total_num_tokens": 223444138.0, "z_loss": 0.0006100272876210511 }, { "copy_logits_max": -2.762218952178955, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.5625, "epoch": 0.9852438090375287, "gen_logits_max": 3.9263224601745605, "gen_logits_mean": -15.37691879272461, "gen_logits_min": -26.74271011352539, "gen_logits_std": 2.813237428665161, "gen_loss": 0.3075195848941803, "grad_norm": 0.3884137142817741, "learning_rate": 2.483621052631579e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9964082092046738, "mean_gen_accuracy": 0.8709916323423386, "mean_token_accuracy": 0.9033653885126114, "num_tokens": 223676494.0, "sample_num_tokens": 8226.0, "step": 4824, "total_num_tokens": 223709398.0, "z_loss": 0.0006232695886865258 }, { "copy_logits_max": -3.26189923286438, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.9375, "epoch": 0.9854480469747255, "gen_logits_max": 3.6942625045776367, "gen_logits_mean": -15.199447631835938, "gen_logits_min": -26.3736572265625, "gen_logits_std": 2.8067877292633057, "gen_loss": 0.25964540243148804, "grad_norm": 0.3661551887358585, "learning_rate": 2.4834947368421053e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9952384978532791, "mean_gen_accuracy": 0.8709319680929184, "mean_token_accuracy": 0.903400719165802, "num_tokens": 223961399.0, "sample_num_tokens": 8417.75, "step": 4825, "total_num_tokens": 223995070.0, "z_loss": 0.0005131727084517479 }, { "copy_logits_max": -5.323017120361328, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.8125, "epoch": 0.9856522849119224, "gen_logits_max": 5.135607719421387, "gen_logits_mean": -15.277530670166016, "gen_logits_min": -26.953432083129883, "gen_logits_std": 2.8205270767211914, "gen_loss": 0.34586578607559204, "grad_norm": 0.4635550621396352, "learning_rate": 2.4833684210526318e-05, "loss": 0.3249, "mean_copy_accuracy": 0.9946480393409729, "mean_gen_accuracy": 0.8643293976783752, "mean_token_accuracy": 0.8938162475824356, "num_tokens": 224217568.0, "sample_num_tokens": 7643.5, "step": 4826, "total_num_tokens": 224248142.0, "z_loss": 0.0006325667491182685 }, { "copy_logits_max": -4.25382661819458, "copy_logits_min": -750000000.0, "copy_num_tokens": 222.375, "epoch": 0.9858565228491192, "gen_logits_max": 5.206280708312988, "gen_logits_mean": -14.843548774719238, "gen_logits_min": -26.152477264404297, "gen_logits_std": 2.8183422088623047, "gen_loss": 0.3206828236579895, "grad_norm": 0.4039870954098154, "learning_rate": 2.4832421052631582e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9951785355806351, "mean_gen_accuracy": 0.8686911016702652, "mean_token_accuracy": 0.8994200527667999, "num_tokens": 224483418.0, "sample_num_tokens": 6870.0, "step": 4827, "total_num_tokens": 224510898.0, "z_loss": 0.0005523543222807348 }, { "copy_logits_max": -3.4299635887145996, "copy_logits_min": -687499968.0, "copy_num_tokens": 754.5625, "epoch": 0.9860607607863161, "gen_logits_max": 4.049907684326172, "gen_logits_mean": -14.57503604888916, "gen_logits_min": -26.85301971435547, "gen_logits_std": 2.825725793838501, "gen_loss": 0.25624626874923706, "grad_norm": 0.3856043965851015, "learning_rate": 2.4831157894736843e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9951250851154327, "mean_gen_accuracy": 0.8713373690843582, "mean_token_accuracy": 0.9031771868467331, "num_tokens": 224753533.0, "sample_num_tokens": 10334.25, "step": 4828, "total_num_tokens": 224794870.0, "z_loss": 0.0005384692922234535 }, { "copy_logits_max": -3.2163262367248535, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.75, "epoch": 0.9862649987235129, "gen_logits_max": 4.1034369468688965, "gen_logits_mean": -14.875102996826172, "gen_logits_min": -25.999725341796875, "gen_logits_std": 2.807133197784424, "gen_loss": 0.3147358000278473, "grad_norm": 0.42148010024308186, "learning_rate": 2.4829894736842107e-05, "loss": 0.2979, "mean_copy_accuracy": 0.9951878935098648, "mean_gen_accuracy": 0.8725061863660812, "mean_token_accuracy": 0.9004396796226501, "num_tokens": 225030360.0, "sample_num_tokens": 8770.0, "step": 4829, "total_num_tokens": 225065440.0, "z_loss": 0.0005798704223707318 }, { "copy_logits_max": -3.6284213066101074, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.5, "epoch": 0.9864692366607097, "gen_logits_max": 4.898100852966309, "gen_logits_mean": -15.300811767578125, "gen_logits_min": -26.359073638916016, "gen_logits_std": 2.8359270095825195, "gen_loss": 0.32503965497016907, "grad_norm": 0.3803634188374611, "learning_rate": 2.4828631578947368e-05, "loss": 0.302, "mean_copy_accuracy": 0.9948305785655975, "mean_gen_accuracy": 0.8733962625265121, "mean_token_accuracy": 0.898632675409317, "num_tokens": 225289682.0, "sample_num_tokens": 7630.5, "step": 4830, "total_num_tokens": 225320204.0, "z_loss": 0.000587904651183635 }, { "copy_logits_max": -3.9249649047851562, "copy_logits_min": -687500032.0, "copy_num_tokens": 630.4375, "epoch": 0.9866734745979066, "gen_logits_max": 2.891119956970215, "gen_logits_mean": -16.556713104248047, "gen_logits_min": -28.131580352783203, "gen_logits_std": 2.8916618824005127, "gen_loss": 0.300853967666626, "grad_norm": 0.4584729164272099, "learning_rate": 2.4827368421052632e-05, "loss": 0.3355, "mean_copy_accuracy": 0.9959415793418884, "mean_gen_accuracy": 0.8536184579133987, "mean_token_accuracy": 0.8876871466636658, "num_tokens": 225545405.0, "sample_num_tokens": 8784.25, "step": 4831, "total_num_tokens": 225580542.0, "z_loss": 0.000569574476685375 }, { "copy_logits_max": -3.547639846801758, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.3125, "epoch": 0.9868777125351034, "gen_logits_max": 5.262403964996338, "gen_logits_mean": -14.246705055236816, "gen_logits_min": -26.168228149414062, "gen_logits_std": 2.8351988792419434, "gen_loss": 0.30962324142456055, "grad_norm": 0.4337715316531169, "learning_rate": 2.4826105263157893e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9959972649812698, "mean_gen_accuracy": 0.864417165517807, "mean_token_accuracy": 0.9006366580724716, "num_tokens": 225828518.0, "sample_num_tokens": 8118.0, "step": 4832, "total_num_tokens": 225860990.0, "z_loss": 0.0006296289502643049 }, { "copy_logits_max": -3.5764615535736084, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 0.9870819504723002, "gen_logits_max": 3.755621910095215, "gen_logits_mean": -15.819180488586426, "gen_logits_min": -27.406478881835938, "gen_logits_std": 2.846458911895752, "gen_loss": 0.32629066705703735, "grad_norm": 0.3806829547796141, "learning_rate": 2.4824842105263158e-05, "loss": 0.312, "mean_copy_accuracy": 0.9960848838090897, "mean_gen_accuracy": 0.8641439527273178, "mean_token_accuracy": 0.8959247469902039, "num_tokens": 226094455.0, "sample_num_tokens": 7537.25, "step": 4833, "total_num_tokens": 226124604.0, "z_loss": 0.0006869049975648522 }, { "copy_logits_max": -0.662148118019104, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.6875, "epoch": 0.9872861884094971, "gen_logits_max": 4.384614944458008, "gen_logits_mean": -15.200113296508789, "gen_logits_min": -26.489009857177734, "gen_logits_std": 2.8352019786834717, "gen_loss": 0.3311585783958435, "grad_norm": 0.44152380567048466, "learning_rate": 2.4823578947368422e-05, "loss": 0.3137, "mean_copy_accuracy": 0.99560546875, "mean_gen_accuracy": 0.861370176076889, "mean_token_accuracy": 0.8936779052019119, "num_tokens": 226368339.0, "sample_num_tokens": 8538.75, "step": 4834, "total_num_tokens": 226402494.0, "z_loss": 0.0007379351882264018 }, { "copy_logits_max": -2.8560290336608887, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.3125, "epoch": 0.9874904263466939, "gen_logits_max": 4.113028526306152, "gen_logits_mean": -16.31515884399414, "gen_logits_min": -27.199003219604492, "gen_logits_std": 2.8358843326568604, "gen_loss": 0.28279632329940796, "grad_norm": 0.3935551760087125, "learning_rate": 2.4822315789473686e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9933872073888779, "mean_gen_accuracy": 0.8708267062902451, "mean_token_accuracy": 0.8971831798553467, "num_tokens": 226622484.0, "sample_num_tokens": 8322.5, "step": 4835, "total_num_tokens": 226655774.0, "z_loss": 0.0006085833301767707 }, { "copy_logits_max": -3.9796042442321777, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.75, "epoch": 0.9876946642838907, "gen_logits_max": 4.199333190917969, "gen_logits_mean": -16.031862258911133, "gen_logits_min": -27.377182006835938, "gen_logits_std": 2.8770270347595215, "gen_loss": 0.33242425322532654, "grad_norm": 0.4063025329738676, "learning_rate": 2.4821052631578947e-05, "loss": 0.3162, "mean_copy_accuracy": 0.9954878240823746, "mean_gen_accuracy": 0.864363968372345, "mean_token_accuracy": 0.8965882062911987, "num_tokens": 226905354.0, "sample_num_tokens": 8697.5, "step": 4836, "total_num_tokens": 226940144.0, "z_loss": 0.0006216818583197892 }, { "copy_logits_max": -4.325418472290039, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.9375, "epoch": 0.9878989022210876, "gen_logits_max": 4.0287885665893555, "gen_logits_mean": -14.93478012084961, "gen_logits_min": -25.983240127563477, "gen_logits_std": 2.777834892272949, "gen_loss": 0.27223849296569824, "grad_norm": 0.39352603328802693, "learning_rate": 2.481978947368421e-05, "loss": 0.3106, "mean_copy_accuracy": 0.9942238628864288, "mean_gen_accuracy": 0.8681769222021103, "mean_token_accuracy": 0.895034596323967, "num_tokens": 227177884.0, "sample_num_tokens": 8774.5, "step": 4837, "total_num_tokens": 227212982.0, "z_loss": 0.0005318694747984409 }, { "copy_logits_max": 0.6961377859115601, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.8125, "epoch": 0.9881031401582844, "gen_logits_max": 5.326784133911133, "gen_logits_mean": -13.015514373779297, "gen_logits_min": -24.317256927490234, "gen_logits_std": 2.7729029655456543, "gen_loss": 0.322415828704834, "grad_norm": 0.358996576138959, "learning_rate": 2.4818526315789476e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9955483376979828, "mean_gen_accuracy": 0.8706963062286377, "mean_token_accuracy": 0.9044943749904633, "num_tokens": 227457107.0, "sample_num_tokens": 8594.75, "step": 4838, "total_num_tokens": 227491486.0, "z_loss": 0.0006718350341543555 }, { "copy_logits_max": -1.9990991353988647, "copy_logits_min": -687500096.0, "copy_num_tokens": 441.125, "epoch": 0.9883073780954812, "gen_logits_max": 5.052772045135498, "gen_logits_mean": -13.818746566772461, "gen_logits_min": -24.80925178527832, "gen_logits_std": 2.7801897525787354, "gen_loss": 0.326159805059433, "grad_norm": 0.41656451607091266, "learning_rate": 2.4817263157894737e-05, "loss": 0.3186, "mean_copy_accuracy": 0.9949211329221725, "mean_gen_accuracy": 0.8614051938056946, "mean_token_accuracy": 0.8926992118358612, "num_tokens": 227702176.0, "sample_num_tokens": 8692.5, "step": 4839, "total_num_tokens": 227736946.0, "z_loss": 0.0006531620747409761 }, { "copy_logits_max": -1.6748368740081787, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.0625, "epoch": 0.9885116160326781, "gen_logits_max": 4.221164703369141, "gen_logits_mean": -14.949234962463379, "gen_logits_min": -26.719860076904297, "gen_logits_std": 2.8708271980285645, "gen_loss": 0.2817609906196594, "grad_norm": 0.3646960816545237, "learning_rate": 2.4816e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9959037750959396, "mean_gen_accuracy": 0.8733019977807999, "mean_token_accuracy": 0.8998502343893051, "num_tokens": 227973979.0, "sample_num_tokens": 8387.75, "step": 4840, "total_num_tokens": 228007530.0, "z_loss": 0.0005370959406718612 }, { "copy_logits_max": -3.2905168533325195, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.125, "epoch": 0.9887158539698749, "gen_logits_max": 3.9401443004608154, "gen_logits_mean": -15.592702865600586, "gen_logits_min": -27.175552368164062, "gen_logits_std": 2.9012136459350586, "gen_loss": 0.2947414517402649, "grad_norm": 0.3486242917622835, "learning_rate": 2.4814736842105262e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9966346919536591, "mean_gen_accuracy": 0.8737709820270538, "mean_token_accuracy": 0.9056429415941238, "num_tokens": 228263508.0, "sample_num_tokens": 8204.5, "step": 4841, "total_num_tokens": 228296326.0, "z_loss": 0.0005700257606804371 }, { "copy_logits_max": -0.10450199246406555, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.9375, "epoch": 0.9889200919070718, "gen_logits_max": 4.869546413421631, "gen_logits_mean": -13.044879913330078, "gen_logits_min": -24.289133071899414, "gen_logits_std": 2.798448085784912, "gen_loss": 0.3370959162712097, "grad_norm": 0.39585102642697956, "learning_rate": 2.481347368421053e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9945289939641953, "mean_gen_accuracy": 0.8678068965673447, "mean_token_accuracy": 0.8991446942090988, "num_tokens": 228527463.0, "sample_num_tokens": 8958.25, "step": 4842, "total_num_tokens": 228563296.0, "z_loss": 0.000680526252835989 }, { "copy_logits_max": -2.3114395141601562, "copy_logits_min": -750000064.0, "copy_num_tokens": 532.5625, "epoch": 0.9891243298442686, "gen_logits_max": 4.259578704833984, "gen_logits_mean": -14.873291015625, "gen_logits_min": -26.59246063232422, "gen_logits_std": 2.8412837982177734, "gen_loss": 0.2897005081176758, "grad_norm": 0.3841389498794876, "learning_rate": 2.481221052631579e-05, "loss": 0.2979, "mean_copy_accuracy": 0.9953663796186447, "mean_gen_accuracy": 0.8673156052827835, "mean_token_accuracy": 0.9009235352277756, "num_tokens": 228800154.0, "sample_num_tokens": 8852.5, "step": 4843, "total_num_tokens": 228835564.0, "z_loss": 0.0005906104343011975 }, { "copy_logits_max": -4.0464887619018555, "copy_logits_min": -687500032.0, "copy_num_tokens": 331.625, "epoch": 0.9893285677814654, "gen_logits_max": 6.357946395874023, "gen_logits_mean": -14.262632369995117, "gen_logits_min": -26.003753662109375, "gen_logits_std": 2.8637008666992188, "gen_loss": 0.3634837865829468, "grad_norm": 0.39186420440210223, "learning_rate": 2.4810947368421055e-05, "loss": 0.3338, "mean_copy_accuracy": 0.9948846846818924, "mean_gen_accuracy": 0.8624728471040726, "mean_token_accuracy": 0.8920765072107315, "num_tokens": 229074210.0, "sample_num_tokens": 8601.0, "step": 4844, "total_num_tokens": 229108614.0, "z_loss": 0.0006888146162964404 }, { "copy_logits_max": -1.607059121131897, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.375, "epoch": 0.9895328057186622, "gen_logits_max": 5.628045558929443, "gen_logits_mean": -14.175707817077637, "gen_logits_min": -26.31236457824707, "gen_logits_std": 2.8855104446411133, "gen_loss": 0.31852445006370544, "grad_norm": 0.35566294858258096, "learning_rate": 2.4809684210526316e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9951947182416916, "mean_gen_accuracy": 0.874321386218071, "mean_token_accuracy": 0.9071284234523773, "num_tokens": 229388160.0, "sample_num_tokens": 8733.5, "step": 4845, "total_num_tokens": 229423094.0, "z_loss": 0.0006066887872293591 }, { "copy_logits_max": -4.648476600646973, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.0, "epoch": 0.9897370436558591, "gen_logits_max": 4.2911810874938965, "gen_logits_mean": -16.12168312072754, "gen_logits_min": -28.024911880493164, "gen_logits_std": 2.902822732925415, "gen_loss": 0.31692948937416077, "grad_norm": 0.45622184293230067, "learning_rate": 2.480842105263158e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9928844571113586, "mean_gen_accuracy": 0.8719806969165802, "mean_token_accuracy": 0.8967103660106659, "num_tokens": 229636703.0, "sample_num_tokens": 7801.25, "step": 4846, "total_num_tokens": 229667908.0, "z_loss": 0.0006446636980399489 }, { "copy_logits_max": -2.8866941928863525, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.0, "epoch": 0.9899412815930559, "gen_logits_max": 6.4066643714904785, "gen_logits_mean": -12.574189186096191, "gen_logits_min": -24.347028732299805, "gen_logits_std": 2.857759475708008, "gen_loss": 0.29198163747787476, "grad_norm": 0.7801917158234092, "learning_rate": 2.480715789473684e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9960448145866394, "mean_gen_accuracy": 0.8711435347795486, "mean_token_accuracy": 0.8987952321767807, "num_tokens": 229896330.0, "sample_num_tokens": 8257.5, "step": 4847, "total_num_tokens": 229929360.0, "z_loss": 0.000609966111369431 }, { "copy_logits_max": -5.579543113708496, "copy_logits_min": -750000064.0, "copy_num_tokens": 281.875, "epoch": 0.9901455195302528, "gen_logits_max": 5.457890510559082, "gen_logits_mean": -15.131061553955078, "gen_logits_min": -26.54345703125, "gen_logits_std": 2.865617036819458, "gen_loss": 0.3496641516685486, "grad_norm": 0.6848191054190498, "learning_rate": 2.4805894736842105e-05, "loss": 0.3416, "mean_copy_accuracy": 0.9940722584724426, "mean_gen_accuracy": 0.8566925376653671, "mean_token_accuracy": 0.8862815797328949, "num_tokens": 230167747.0, "sample_num_tokens": 7514.25, "step": 4848, "total_num_tokens": 230197804.0, "z_loss": 0.0006808533216826618 }, { "copy_logits_max": -4.0364227294921875, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.375, "epoch": 0.9903497574674496, "gen_logits_max": 5.300326347351074, "gen_logits_mean": -14.792501449584961, "gen_logits_min": -26.015230178833008, "gen_logits_std": 2.840165615081787, "gen_loss": 0.35839563608169556, "grad_norm": 0.4177612239828842, "learning_rate": 2.4804631578947366e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9951816648244858, "mean_gen_accuracy": 0.8705145567655563, "mean_token_accuracy": 0.897735059261322, "num_tokens": 230449543.0, "sample_num_tokens": 9051.25, "step": 4849, "total_num_tokens": 230485748.0, "z_loss": 0.000694984570145607 }, { "copy_logits_max": -5.657896041870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.4375, "epoch": 0.9905539954046464, "gen_logits_max": 4.06674861907959, "gen_logits_mean": -15.426032066345215, "gen_logits_min": -26.876361846923828, "gen_logits_std": 2.8650565147399902, "gen_loss": 0.28998106718063354, "grad_norm": 0.4107045913359879, "learning_rate": 2.4803368421052634e-05, "loss": 0.3122, "mean_copy_accuracy": 0.995420515537262, "mean_gen_accuracy": 0.8652488887310028, "mean_token_accuracy": 0.8948086798191071, "num_tokens": 230717567.0, "sample_num_tokens": 9086.25, "step": 4850, "total_num_tokens": 230753912.0, "z_loss": 0.0005244605708867311 }, { "copy_logits_max": -4.2247514724731445, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.75, "epoch": 0.9907582333418432, "gen_logits_max": 4.456265449523926, "gen_logits_mean": -14.934508323669434, "gen_logits_min": -26.582618713378906, "gen_logits_std": 2.886880874633789, "gen_loss": 0.3195478022098541, "grad_norm": 0.4527817077589513, "learning_rate": 2.4802105263157895e-05, "loss": 0.3172, "mean_copy_accuracy": 0.9950048923492432, "mean_gen_accuracy": 0.861919105052948, "mean_token_accuracy": 0.8958807289600372, "num_tokens": 230980343.0, "sample_num_tokens": 9279.25, "step": 4851, "total_num_tokens": 231017460.0, "z_loss": 0.0005488898605108261 }, { "copy_logits_max": -3.9425594806671143, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.8125, "epoch": 0.9909624712790401, "gen_logits_max": 3.7367587089538574, "gen_logits_mean": -16.498613357543945, "gen_logits_min": -28.483627319335938, "gen_logits_std": 2.928813934326172, "gen_loss": 0.3033004105091095, "grad_norm": 0.39927731601313093, "learning_rate": 2.480084210526316e-05, "loss": 0.3156, "mean_copy_accuracy": 0.9953077733516693, "mean_gen_accuracy": 0.8622222691774368, "mean_token_accuracy": 0.8958376497030258, "num_tokens": 231246883.0, "sample_num_tokens": 9274.75, "step": 4852, "total_num_tokens": 231283982.0, "z_loss": 0.0005261586047708988 }, { "copy_logits_max": -2.600940227508545, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 0.991166709216237, "gen_logits_max": 4.639348983764648, "gen_logits_mean": -14.321608543395996, "gen_logits_min": -26.298078536987305, "gen_logits_std": 2.907287359237671, "gen_loss": 0.3227604031562805, "grad_norm": 0.412458199239015, "learning_rate": 2.4799578947368423e-05, "loss": 0.3149, "mean_copy_accuracy": 0.996067002415657, "mean_gen_accuracy": 0.8593327850103378, "mean_token_accuracy": 0.8958943486213684, "num_tokens": 231551321.0, "sample_num_tokens": 8263.25, "step": 4853, "total_num_tokens": 231584374.0, "z_loss": 0.0006112823029980063 }, { "copy_logits_max": 2.83039927482605, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.125, "epoch": 0.9913709471534338, "gen_logits_max": 6.2139482498168945, "gen_logits_mean": -12.519600868225098, "gen_logits_min": -24.38202667236328, "gen_logits_std": 2.805044412612915, "gen_loss": 0.30890828371047974, "grad_norm": 0.37491153509197844, "learning_rate": 2.4798315789473684e-05, "loss": 0.308, "mean_copy_accuracy": 0.9965699464082718, "mean_gen_accuracy": 0.8669903725385666, "mean_token_accuracy": 0.8985717296600342, "num_tokens": 231816908.0, "sample_num_tokens": 9184.0, "step": 4854, "total_num_tokens": 231853644.0, "z_loss": 0.0006049433723092079 }, { "copy_logits_max": -3.205019474029541, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.1875, "epoch": 0.9915751850906306, "gen_logits_max": 3.5396523475646973, "gen_logits_mean": -16.959070205688477, "gen_logits_min": -28.74660873413086, "gen_logits_std": 2.916027784347534, "gen_loss": 0.295738160610199, "grad_norm": 0.38509910913835205, "learning_rate": 2.479705263157895e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9958144128322601, "mean_gen_accuracy": 0.8682687431573868, "mean_token_accuracy": 0.9045078456401825, "num_tokens": 232113311.0, "sample_num_tokens": 8863.75, "step": 4855, "total_num_tokens": 232148766.0, "z_loss": 0.0005655143177136779 }, { "copy_logits_max": -2.9965977668762207, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.9375, "epoch": 0.9917794230278274, "gen_logits_max": 4.358266830444336, "gen_logits_mean": -13.314173698425293, "gen_logits_min": -24.697307586669922, "gen_logits_std": 2.7561957836151123, "gen_loss": 0.32407352328300476, "grad_norm": 0.4270621817960651, "learning_rate": 2.479578947368421e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9945957660675049, "mean_gen_accuracy": 0.8626142293214798, "mean_token_accuracy": 0.8963223099708557, "num_tokens": 232375936.0, "sample_num_tokens": 8083.0, "step": 4856, "total_num_tokens": 232408268.0, "z_loss": 0.0006247652927413583 }, { "copy_logits_max": -0.6111003756523132, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.125, "epoch": 0.9919836609650242, "gen_logits_max": 3.9906842708587646, "gen_logits_mean": -14.018112182617188, "gen_logits_min": -25.176891326904297, "gen_logits_std": 2.7592992782592773, "gen_loss": 0.3064122200012207, "grad_norm": 0.40466403290987507, "learning_rate": 2.4794526315789474e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9968786835670471, "mean_gen_accuracy": 0.8720277398824692, "mean_token_accuracy": 0.9036065489053726, "num_tokens": 232652255.0, "sample_num_tokens": 9776.25, "step": 4857, "total_num_tokens": 232691360.0, "z_loss": 0.0006234425818547606 }, { "copy_logits_max": -4.576570510864258, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.6875, "epoch": 0.992187898902221, "gen_logits_max": 4.301937580108643, "gen_logits_mean": -14.324751853942871, "gen_logits_min": -25.285259246826172, "gen_logits_std": 2.7773358821868896, "gen_loss": 0.3059529662132263, "grad_norm": 0.4228499299592226, "learning_rate": 2.4793263157894738e-05, "loss": 0.3285, "mean_copy_accuracy": 0.9962800741195679, "mean_gen_accuracy": 0.8620983809232712, "mean_token_accuracy": 0.8932136297225952, "num_tokens": 232916171.0, "sample_num_tokens": 8030.75, "step": 4858, "total_num_tokens": 232948294.0, "z_loss": 0.0005472716293297708 }, { "copy_logits_max": -1.8805642127990723, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.0, "epoch": 0.992392136839418, "gen_logits_max": 5.6357574462890625, "gen_logits_mean": -12.681455612182617, "gen_logits_min": -24.084413528442383, "gen_logits_std": 2.7643861770629883, "gen_loss": 0.3511781096458435, "grad_norm": 0.3927496077915361, "learning_rate": 2.4792000000000003e-05, "loss": 0.3068, "mean_copy_accuracy": 0.995596393942833, "mean_gen_accuracy": 0.8684967905282974, "mean_token_accuracy": 0.896924301981926, "num_tokens": 233179310.0, "sample_num_tokens": 8661.5, "step": 4859, "total_num_tokens": 233213956.0, "z_loss": 0.0006726110586896539 }, { "copy_logits_max": -3.0108094215393066, "copy_logits_min": -750000064.0, "copy_num_tokens": 644.0, "epoch": 0.9925963747766148, "gen_logits_max": 3.688011646270752, "gen_logits_mean": -15.712367057800293, "gen_logits_min": -27.735122680664062, "gen_logits_std": 2.8958399295806885, "gen_loss": 0.24373897910118103, "grad_norm": 0.41247459436158135, "learning_rate": 2.4790736842105263e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9952356070280075, "mean_gen_accuracy": 0.86766017973423, "mean_token_accuracy": 0.8998752534389496, "num_tokens": 233454617.0, "sample_num_tokens": 9347.75, "step": 4860, "total_num_tokens": 233492008.0, "z_loss": 0.00044437876204028726 }, { "copy_logits_max": -5.428802967071533, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.375, "epoch": 0.9928006127138116, "gen_logits_max": 5.015448093414307, "gen_logits_mean": -14.519428253173828, "gen_logits_min": -26.222164154052734, "gen_logits_std": 2.82558012008667, "gen_loss": 0.33169108629226685, "grad_norm": 0.3710220044804755, "learning_rate": 2.4789473684210528e-05, "loss": 0.3002, "mean_copy_accuracy": 0.9955544471740723, "mean_gen_accuracy": 0.8670942485332489, "mean_token_accuracy": 0.9005167633295059, "num_tokens": 233739964.0, "sample_num_tokens": 7205.0, "step": 4861, "total_num_tokens": 233768784.0, "z_loss": 0.0006146173691377044 }, { "copy_logits_max": -4.830870151519775, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.0625, "epoch": 0.9930048506510084, "gen_logits_max": 3.927335500717163, "gen_logits_mean": -14.605371475219727, "gen_logits_min": -26.550674438476562, "gen_logits_std": 2.8506531715393066, "gen_loss": 0.27147120237350464, "grad_norm": 0.37968606885911504, "learning_rate": 2.478821052631579e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9959336519241333, "mean_gen_accuracy": 0.8721529543399811, "mean_token_accuracy": 0.9015238881111145, "num_tokens": 233986499.0, "sample_num_tokens": 7930.25, "step": 4862, "total_num_tokens": 234018220.0, "z_loss": 0.0005487920134328306 }, { "copy_logits_max": -4.263360500335693, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.4375, "epoch": 0.9932090885882052, "gen_logits_max": 3.584108829498291, "gen_logits_mean": -15.805703163146973, "gen_logits_min": -27.13996124267578, "gen_logits_std": 2.8464221954345703, "gen_loss": 0.27470627427101135, "grad_norm": 0.37774022237905547, "learning_rate": 2.4786947368421053e-05, "loss": 0.3012, "mean_copy_accuracy": 0.9965259581804276, "mean_gen_accuracy": 0.8684266358613968, "mean_token_accuracy": 0.9001912474632263, "num_tokens": 234265032.0, "sample_num_tokens": 8811.5, "step": 4863, "total_num_tokens": 234300278.0, "z_loss": 0.0005828350549563766 }, { "copy_logits_max": -6.409641742706299, "copy_logits_min": -750000000.0, "copy_num_tokens": 687.125, "epoch": 0.993413326525402, "gen_logits_max": 4.290688991546631, "gen_logits_mean": -13.872523307800293, "gen_logits_min": -25.31844139099121, "gen_logits_std": 2.799130916595459, "gen_loss": 0.26324933767318726, "grad_norm": 0.3833638373186835, "learning_rate": 2.4785684210526314e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9942118674516678, "mean_gen_accuracy": 0.8769509196281433, "mean_token_accuracy": 0.9080891758203506, "num_tokens": 234527317.0, "sample_num_tokens": 9734.75, "step": 4864, "total_num_tokens": 234566256.0, "z_loss": 0.0005875731585547328 }, { "copy_logits_max": -6.851463317871094, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.5625, "epoch": 0.993617564462599, "gen_logits_max": 4.623145580291748, "gen_logits_mean": -15.100881576538086, "gen_logits_min": -26.322452545166016, "gen_logits_std": 2.8257312774658203, "gen_loss": 0.3307155668735504, "grad_norm": 0.38385035288438707, "learning_rate": 2.4784421052631578e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9968239963054657, "mean_gen_accuracy": 0.8670284301042557, "mean_token_accuracy": 0.8963735997676849, "num_tokens": 234790670.0, "sample_num_tokens": 7898.5, "step": 4865, "total_num_tokens": 234822264.0, "z_loss": 0.0006414244999177754 }, { "copy_logits_max": -4.647174835205078, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.625, "epoch": 0.9938218023997958, "gen_logits_max": 5.375452041625977, "gen_logits_mean": -13.8525390625, "gen_logits_min": -25.85552215576172, "gen_logits_std": 2.8212203979492188, "gen_loss": 0.3421269357204437, "grad_norm": 0.39307276931706825, "learning_rate": 2.4783157894736846e-05, "loss": 0.3207, "mean_copy_accuracy": 0.9952537417411804, "mean_gen_accuracy": 0.8643915504217148, "mean_token_accuracy": 0.8917534500360489, "num_tokens": 235044812.0, "sample_num_tokens": 9040.0, "step": 4866, "total_num_tokens": 235080972.0, "z_loss": 0.0006706881104037166 }, { "copy_logits_max": -4.2795329093933105, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.75, "epoch": 0.9940260403369926, "gen_logits_max": 4.767033576965332, "gen_logits_mean": -14.332659721374512, "gen_logits_min": -26.557052612304688, "gen_logits_std": 2.8582537174224854, "gen_loss": 0.2891290783882141, "grad_norm": 0.405903626333297, "learning_rate": 2.4781894736842107e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9959438145160675, "mean_gen_accuracy": 0.8768545091152191, "mean_token_accuracy": 0.908506453037262, "num_tokens": 235319020.0, "sample_num_tokens": 7486.5, "step": 4867, "total_num_tokens": 235348966.0, "z_loss": 0.0005469710449688137 }, { "copy_logits_max": -6.874642848968506, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.75, "epoch": 0.9942302782741894, "gen_logits_max": 4.292835235595703, "gen_logits_mean": -15.914039611816406, "gen_logits_min": -27.56745147705078, "gen_logits_std": 2.838324546813965, "gen_loss": 0.32705092430114746, "grad_norm": 0.37151285387304, "learning_rate": 2.478063157894737e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9963660389184952, "mean_gen_accuracy": 0.8660129606723785, "mean_token_accuracy": 0.8978559821844101, "num_tokens": 235578249.0, "sample_num_tokens": 8583.25, "step": 4868, "total_num_tokens": 235612582.0, "z_loss": 0.0006053877295926213 }, { "copy_logits_max": -6.725122928619385, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.75, "epoch": 0.9944345162113862, "gen_logits_max": 4.054636001586914, "gen_logits_mean": -16.488513946533203, "gen_logits_min": -28.64227867126465, "gen_logits_std": 2.9426207542419434, "gen_loss": 0.31924235820770264, "grad_norm": 0.35423474162452595, "learning_rate": 2.4779368421052632e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9962636381387711, "mean_gen_accuracy": 0.8695906847715378, "mean_token_accuracy": 0.9017584323883057, "num_tokens": 235862553.0, "sample_num_tokens": 8599.75, "step": 4869, "total_num_tokens": 235896952.0, "z_loss": 0.0005569719360210001 }, { "copy_logits_max": -6.371167182922363, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.0625, "epoch": 0.994638754148583, "gen_logits_max": 4.68300199508667, "gen_logits_mean": -14.005037307739258, "gen_logits_min": -25.938751220703125, "gen_logits_std": 2.860635757446289, "gen_loss": 0.3144330382347107, "grad_norm": 0.4363963709475226, "learning_rate": 2.4778105263157896e-05, "loss": 0.3258, "mean_copy_accuracy": 0.9961367845535278, "mean_gen_accuracy": 0.8633779585361481, "mean_token_accuracy": 0.892889678478241, "num_tokens": 236129411.0, "sample_num_tokens": 8612.25, "step": 4870, "total_num_tokens": 236163860.0, "z_loss": 0.0005504550645127892 }, { "copy_logits_max": -5.8890252113342285, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.875, "epoch": 0.99484299208578, "gen_logits_max": 4.525913238525391, "gen_logits_mean": -13.555208206176758, "gen_logits_min": -24.81820297241211, "gen_logits_std": 2.7976293563842773, "gen_loss": 0.2923794090747833, "grad_norm": 0.35294513261615834, "learning_rate": 2.4776842105263157e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9962252676486969, "mean_gen_accuracy": 0.8708497434854507, "mean_token_accuracy": 0.9039183706045151, "num_tokens": 236412322.0, "sample_num_tokens": 9129.0, "step": 4871, "total_num_tokens": 236448838.0, "z_loss": 0.000509455450810492 }, { "copy_logits_max": -5.510519981384277, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.6875, "epoch": 0.9950472300229768, "gen_logits_max": 4.153231620788574, "gen_logits_mean": -15.098532676696777, "gen_logits_min": -26.74066734313965, "gen_logits_std": 2.8919663429260254, "gen_loss": 0.3211311101913452, "grad_norm": 0.37235970327316115, "learning_rate": 2.477557894736842e-05, "loss": 0.3195, "mean_copy_accuracy": 0.9966681599617004, "mean_gen_accuracy": 0.8590260297060013, "mean_token_accuracy": 0.8925559520721436, "num_tokens": 236695695.0, "sample_num_tokens": 8259.75, "step": 4872, "total_num_tokens": 236728734.0, "z_loss": 0.000537347630597651 }, { "copy_logits_max": -6.655021667480469, "copy_logits_min": -687500032.0, "copy_num_tokens": 426.0, "epoch": 0.9952514679601736, "gen_logits_max": 4.319299697875977, "gen_logits_mean": -14.28731918334961, "gen_logits_min": -25.697851181030273, "gen_logits_std": 2.7672109603881836, "gen_loss": 0.3018995523452759, "grad_norm": 0.42255286794914854, "learning_rate": 2.4774315789473683e-05, "loss": 0.3074, "mean_copy_accuracy": 0.9949182569980621, "mean_gen_accuracy": 0.8672958463430405, "mean_token_accuracy": 0.8989561349153519, "num_tokens": 236972084.0, "sample_num_tokens": 9494.5, "step": 4873, "total_num_tokens": 237010062.0, "z_loss": 0.0005231066606938839 }, { "copy_logits_max": -6.130967140197754, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.125, "epoch": 0.9954557058973704, "gen_logits_max": 4.752433776855469, "gen_logits_mean": -15.156698226928711, "gen_logits_min": -26.796598434448242, "gen_logits_std": 2.8449809551239014, "gen_loss": 0.3337089717388153, "grad_norm": 0.37406990769548354, "learning_rate": 2.4773052631578947e-05, "loss": 0.307, "mean_copy_accuracy": 0.995762288570404, "mean_gen_accuracy": 0.870130866765976, "mean_token_accuracy": 0.8971852958202362, "num_tokens": 237244233.0, "sample_num_tokens": 9060.25, "step": 4874, "total_num_tokens": 237280474.0, "z_loss": 0.0005658823647536337 }, { "copy_logits_max": -5.733859062194824, "copy_logits_min": -750000000.0, "copy_num_tokens": 766.8125, "epoch": 0.9956599438345672, "gen_logits_max": 2.4894766807556152, "gen_logits_mean": -15.983916282653809, "gen_logits_min": -27.619325637817383, "gen_logits_std": 2.882150650024414, "gen_loss": 0.25365349650382996, "grad_norm": 0.38028926646028466, "learning_rate": 2.477178947368421e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9968281388282776, "mean_gen_accuracy": 0.8751447349786758, "mean_token_accuracy": 0.9078204184770584, "num_tokens": 237511836.0, "sample_num_tokens": 10253.5, "step": 4875, "total_num_tokens": 237552850.0, "z_loss": 0.0005071406485512853 }, { "copy_logits_max": -5.94015645980835, "copy_logits_min": -750000000.0, "copy_num_tokens": 651.875, "epoch": 0.9958641817717641, "gen_logits_max": 3.665229320526123, "gen_logits_mean": -15.937878608703613, "gen_logits_min": -27.723526000976562, "gen_logits_std": 2.9028589725494385, "gen_loss": 0.2629134953022003, "grad_norm": 0.36266713434373016, "learning_rate": 2.4770526315789475e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9964760988950729, "mean_gen_accuracy": 0.8673823326826096, "mean_token_accuracy": 0.8999241590499878, "num_tokens": 237807973.0, "sample_num_tokens": 9459.75, "step": 4876, "total_num_tokens": 237845812.0, "z_loss": 0.0005084450822323561 }, { "copy_logits_max": -7.305211067199707, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.1875, "epoch": 0.996068419708961, "gen_logits_max": 4.741971492767334, "gen_logits_mean": -14.32843017578125, "gen_logits_min": -25.559329986572266, "gen_logits_std": 2.787991523742676, "gen_loss": 0.34142976999282837, "grad_norm": 0.34909490397550563, "learning_rate": 2.4769263157894736e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9957147240638733, "mean_gen_accuracy": 0.8715419769287109, "mean_token_accuracy": 0.8991241008043289, "num_tokens": 238081475.0, "sample_num_tokens": 7979.75, "step": 4877, "total_num_tokens": 238113394.0, "z_loss": 0.0005442636320367455 }, { "copy_logits_max": -5.146702766418457, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.4375, "epoch": 0.9962726576461578, "gen_logits_max": 4.3411102294921875, "gen_logits_mean": -14.48715591430664, "gen_logits_min": -25.432645797729492, "gen_logits_std": 2.755455493927002, "gen_loss": 0.29892224073410034, "grad_norm": 0.38565437881622155, "learning_rate": 2.4768e-05, "loss": 0.301, "mean_copy_accuracy": 0.9952793121337891, "mean_gen_accuracy": 0.872776210308075, "mean_token_accuracy": 0.9011954516172409, "num_tokens": 238339856.0, "sample_num_tokens": 8368.5, "step": 4878, "total_num_tokens": 238373330.0, "z_loss": 0.0005411250167526305 }, { "copy_logits_max": -5.145991325378418, "copy_logits_min": -750000064.0, "copy_num_tokens": 649.375, "epoch": 0.9964768955833546, "gen_logits_max": 2.882765293121338, "gen_logits_mean": -16.09601593017578, "gen_logits_min": -27.8946533203125, "gen_logits_std": 2.8737640380859375, "gen_loss": 0.26582956314086914, "grad_norm": 0.3690700514554155, "learning_rate": 2.4766736842105265e-05, "loss": 0.2718, "mean_copy_accuracy": 0.995360717177391, "mean_gen_accuracy": 0.8740000575780869, "mean_token_accuracy": 0.9086074084043503, "num_tokens": 238618013.0, "sample_num_tokens": 9342.25, "step": 4879, "total_num_tokens": 238655382.0, "z_loss": 0.0005000942037440836 }, { "copy_logits_max": -5.444913864135742, "copy_logits_min": -750000128.0, "copy_num_tokens": 587.1875, "epoch": 0.9966811335205514, "gen_logits_max": 4.234027862548828, "gen_logits_mean": -14.100208282470703, "gen_logits_min": -25.844688415527344, "gen_logits_std": 2.800485610961914, "gen_loss": 0.26030755043029785, "grad_norm": 0.3498201643211083, "learning_rate": 2.4765473684210526e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9959492683410645, "mean_gen_accuracy": 0.8697560578584671, "mean_token_accuracy": 0.9013269394636154, "num_tokens": 238903417.0, "sample_num_tokens": 8825.25, "step": 4880, "total_num_tokens": 238938718.0, "z_loss": 0.0004938660422340035 }, { "copy_logits_max": -5.008137226104736, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.6875, "epoch": 0.9968853714577482, "gen_logits_max": 3.3470396995544434, "gen_logits_mean": -15.96551513671875, "gen_logits_min": -27.4735107421875, "gen_logits_std": 2.874878168106079, "gen_loss": 0.25181499123573303, "grad_norm": 0.3572281019513409, "learning_rate": 2.476421052631579e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9955524355173111, "mean_gen_accuracy": 0.874719887971878, "mean_token_accuracy": 0.9058806449174881, "num_tokens": 239186758.0, "sample_num_tokens": 9214.5, "step": 4881, "total_num_tokens": 239223616.0, "z_loss": 0.0005162168527022004 }, { "copy_logits_max": -5.6755475997924805, "copy_logits_min": -625000064.0, "copy_num_tokens": 481.3125, "epoch": 0.9970896093949451, "gen_logits_max": 3.540898084640503, "gen_logits_mean": -16.94304847717285, "gen_logits_min": -28.451488494873047, "gen_logits_std": 2.898068904876709, "gen_loss": 0.2835291028022766, "grad_norm": 0.3577863910390753, "learning_rate": 2.476294736842105e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9963428974151611, "mean_gen_accuracy": 0.8724751472473145, "mean_token_accuracy": 0.9100478738546371, "num_tokens": 239485742.0, "sample_num_tokens": 9145.0, "step": 4882, "total_num_tokens": 239522322.0, "z_loss": 0.0005260681500658393 }, { "copy_logits_max": -6.731752872467041, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.1875, "epoch": 0.997293847332142, "gen_logits_max": 4.824179649353027, "gen_logits_mean": -15.434316635131836, "gen_logits_min": -27.148029327392578, "gen_logits_std": 2.8474996089935303, "gen_loss": 0.2955087423324585, "grad_norm": 0.421983086307333, "learning_rate": 2.476168421052632e-05, "loss": 0.3181, "mean_copy_accuracy": 0.9952994585037231, "mean_gen_accuracy": 0.8653595298528671, "mean_token_accuracy": 0.8956504762172699, "num_tokens": 239744708.0, "sample_num_tokens": 7932.0, "step": 4883, "total_num_tokens": 239776436.0, "z_loss": 0.0005741844652220607 }, { "copy_logits_max": -3.3671751022338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.0, "epoch": 0.9974980852693388, "gen_logits_max": 4.236581325531006, "gen_logits_mean": -15.205819129943848, "gen_logits_min": -26.69261932373047, "gen_logits_std": 2.8519318103790283, "gen_loss": 0.2800498604774475, "grad_norm": 0.38668618263834303, "learning_rate": 2.476042105263158e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9947700053453445, "mean_gen_accuracy": 0.866650253534317, "mean_token_accuracy": 0.8973598927259445, "num_tokens": 240025292.0, "sample_num_tokens": 8143.0, "step": 4884, "total_num_tokens": 240057864.0, "z_loss": 0.0005631211679428816 }, { "copy_logits_max": -5.728213787078857, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.5, "epoch": 0.9977023232065356, "gen_logits_max": 3.5230209827423096, "gen_logits_mean": -16.19247817993164, "gen_logits_min": -27.339033126831055, "gen_logits_std": 2.8219757080078125, "gen_loss": 0.29135602712631226, "grad_norm": 0.38188575700917843, "learning_rate": 2.4759157894736844e-05, "loss": 0.3029, "mean_copy_accuracy": 0.995677649974823, "mean_gen_accuracy": 0.8728067278862, "mean_token_accuracy": 0.9014775604009628, "num_tokens": 240286881.0, "sample_num_tokens": 9480.25, "step": 4885, "total_num_tokens": 240324802.0, "z_loss": 0.0005554055678658187 }, { "copy_logits_max": -3.3459434509277344, "copy_logits_min": -750000000.0, "copy_num_tokens": 690.875, "epoch": 0.9979065611437324, "gen_logits_max": 5.714958190917969, "gen_logits_mean": -12.997749328613281, "gen_logits_min": -25.152488708496094, "gen_logits_std": 2.9029979705810547, "gen_loss": 0.3148497939109802, "grad_norm": 0.37996429541127497, "learning_rate": 2.4757894736842105e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9965844452381134, "mean_gen_accuracy": 0.8612067848443985, "mean_token_accuracy": 0.8981422036886215, "num_tokens": 240576443.0, "sample_num_tokens": 9913.75, "step": 4886, "total_num_tokens": 240616098.0, "z_loss": 0.000608294561970979 }, { "copy_logits_max": -5.461761474609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.9375, "epoch": 0.9981107990809293, "gen_logits_max": 4.388645648956299, "gen_logits_mean": -15.38739013671875, "gen_logits_min": -27.235118865966797, "gen_logits_std": 2.8892931938171387, "gen_loss": 0.3239448070526123, "grad_norm": 0.4017457973028055, "learning_rate": 2.475663157894737e-05, "loss": 0.2997, "mean_copy_accuracy": 0.9960768073797226, "mean_gen_accuracy": 0.8684951514005661, "mean_token_accuracy": 0.9014296978712082, "num_tokens": 240852707.0, "sample_num_tokens": 8741.75, "step": 4887, "total_num_tokens": 240887674.0, "z_loss": 0.0006560626206919551 }, { "copy_logits_max": -6.074991226196289, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.875, "epoch": 0.9983150370181261, "gen_logits_max": 3.8226301670074463, "gen_logits_mean": -16.554847717285156, "gen_logits_min": -28.52271270751953, "gen_logits_std": 2.9070262908935547, "gen_loss": 0.3322579860687256, "grad_norm": 0.38311104539164487, "learning_rate": 2.475536842105263e-05, "loss": 0.3136, "mean_copy_accuracy": 0.9955602586269379, "mean_gen_accuracy": 0.8631196469068527, "mean_token_accuracy": 0.8953786343336105, "num_tokens": 241121377.0, "sample_num_tokens": 8155.75, "step": 4888, "total_num_tokens": 241154000.0, "z_loss": 0.0006200082134455442 }, { "copy_logits_max": -4.0799126625061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.6875, "epoch": 0.998519274955323, "gen_logits_max": 4.27544641494751, "gen_logits_mean": -14.144655227661133, "gen_logits_min": -26.12652587890625, "gen_logits_std": 2.859114646911621, "gen_loss": 0.26032912731170654, "grad_norm": 0.3452867570119729, "learning_rate": 2.4754105263157895e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9965944290161133, "mean_gen_accuracy": 0.8696221262216568, "mean_token_accuracy": 0.9035124480724335, "num_tokens": 241401371.0, "sample_num_tokens": 9446.75, "step": 4889, "total_num_tokens": 241439158.0, "z_loss": 0.0004967080894857645 }, { "copy_logits_max": -6.242562294006348, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.875, "epoch": 0.9987235128925198, "gen_logits_max": 5.203279495239258, "gen_logits_mean": -14.172747611999512, "gen_logits_min": -25.696430206298828, "gen_logits_std": 2.8276443481445312, "gen_loss": 0.32683807611465454, "grad_norm": 0.3937580958569444, "learning_rate": 2.4752842105263156e-05, "loss": 0.2992, "mean_copy_accuracy": 0.995687335729599, "mean_gen_accuracy": 0.8674831092357635, "mean_token_accuracy": 0.9010405540466309, "num_tokens": 241686338.0, "sample_num_tokens": 7276.5, "step": 4890, "total_num_tokens": 241715444.0, "z_loss": 0.0006201200885698199 }, { "copy_logits_max": -3.5554394721984863, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.375, "epoch": 0.9989277508297166, "gen_logits_max": 3.72594952583313, "gen_logits_mean": -16.015043258666992, "gen_logits_min": -27.96591567993164, "gen_logits_std": 2.8828535079956055, "gen_loss": 0.32045048475265503, "grad_norm": 0.3919047795646528, "learning_rate": 2.4751578947368423e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9962910860776901, "mean_gen_accuracy": 0.8630835860967636, "mean_token_accuracy": 0.8992674052715302, "num_tokens": 241959254.0, "sample_num_tokens": 7514.5, "step": 4891, "total_num_tokens": 241989312.0, "z_loss": 0.0006110032554715872 }, { "copy_logits_max": -5.020089149475098, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.5, "epoch": 0.9991319887669134, "gen_logits_max": 5.6709818840026855, "gen_logits_mean": -12.92371940612793, "gen_logits_min": -25.4757080078125, "gen_logits_std": 2.8955554962158203, "gen_loss": 0.25932347774505615, "grad_norm": 0.3602835348829755, "learning_rate": 2.4750315789473688e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9965085387229919, "mean_gen_accuracy": 0.8665505647659302, "mean_token_accuracy": 0.9003559648990631, "num_tokens": 242231357.0, "sample_num_tokens": 10651.75, "step": 4892, "total_num_tokens": 242273964.0, "z_loss": 0.0004787980578839779 }, { "copy_logits_max": -4.530109405517578, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.75, "epoch": 0.9993362267041103, "gen_logits_max": 3.6187119483947754, "gen_logits_mean": -14.815569877624512, "gen_logits_min": -26.916885375976562, "gen_logits_std": 2.8970553874969482, "gen_loss": 0.2593575716018677, "grad_norm": 0.39285218336248307, "learning_rate": 2.474905263157895e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9945700019598007, "mean_gen_accuracy": 0.8699501007795334, "mean_token_accuracy": 0.9012023955583572, "num_tokens": 242501469.0, "sample_num_tokens": 8864.75, "step": 4893, "total_num_tokens": 242536928.0, "z_loss": 0.0004713878151960671 }, { "copy_logits_max": -2.8976526260375977, "copy_logits_min": -750000000.0, "copy_num_tokens": 692.875, "epoch": 0.9995404646413071, "gen_logits_max": 4.447910308837891, "gen_logits_mean": -14.991487503051758, "gen_logits_min": -27.365524291992188, "gen_logits_std": 2.9473366737365723, "gen_loss": 0.2948116064071655, "grad_norm": 0.4115546465097221, "learning_rate": 2.4747789473684213e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9958881586790085, "mean_gen_accuracy": 0.8730270117521286, "mean_token_accuracy": 0.9037316292524338, "num_tokens": 242768213.0, "sample_num_tokens": 10009.25, "step": 4894, "total_num_tokens": 242808250.0, "z_loss": 0.0005446815630421042 }, { "copy_logits_max": -2.410236120223999, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.375, "epoch": 0.999744702578504, "gen_logits_max": 6.266624450683594, "gen_logits_mean": -12.56602668762207, "gen_logits_min": -24.914852142333984, "gen_logits_std": 2.9507973194122314, "gen_loss": 0.2853817343711853, "grad_norm": 0.371146857487824, "learning_rate": 2.4746526315789474e-05, "loss": 0.2829, "mean_copy_accuracy": 0.9947812855243683, "mean_gen_accuracy": 0.8775345981121063, "mean_token_accuracy": 0.9061933904886246, "num_tokens": 243036971.0, "sample_num_tokens": 8711.75, "step": 4895, "total_num_tokens": 243071818.0, "z_loss": 0.0005069314502179623 }, { "copy_logits_max": -2.2055134773254395, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.0625, "epoch": 0.9999489405157008, "gen_logits_max": 4.5184125900268555, "gen_logits_mean": -13.928424835205078, "gen_logits_min": -26.25516700744629, "gen_logits_std": 2.885591983795166, "gen_loss": 0.3523862361907959, "grad_norm": 0.3844186541316015, "learning_rate": 2.4745263157894738e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9960859566926956, "mean_gen_accuracy": 0.8612689077854156, "mean_token_accuracy": 0.8998235464096069, "num_tokens": 243313836.0, "sample_num_tokens": 9426.5, "step": 4896, "total_num_tokens": 243351542.0, "z_loss": 0.0006250213482417166 }, { "copy_logits_max": -5.646983623504639, "copy_logits_min": -687500032.0, "copy_num_tokens": 357.5, "epoch": 1.000204237937197, "gen_logits_max": 5.812657356262207, "gen_logits_mean": -14.739558219909668, "gen_logits_min": -26.600006103515625, "gen_logits_std": 2.9550869464874268, "gen_loss": 0.27735841274261475, "grad_norm": 0.4476357260560828, "learning_rate": 2.4744e-05, "loss": 0.3468, "mean_copy_accuracy": 0.9961544632911682, "mean_gen_accuracy": 0.878896963596344, "mean_token_accuracy": 0.9047980189323426, "num_tokens": 243626624.0, "sample_num_tokens": 8510.5, "step": 4897, "total_num_tokens": 243660666.0, "z_loss": 0.0005051075131632388 }, { "copy_logits_max": -5.5611066818237305, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.0625, "epoch": 1.0004084758743936, "gen_logits_max": 7.029055118560791, "gen_logits_mean": -11.30341911315918, "gen_logits_min": -23.546205520629883, "gen_logits_std": 2.9167778491973877, "gen_loss": 0.2612609565258026, "grad_norm": 0.3826156247273826, "learning_rate": 2.4742736842105263e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9961186200380325, "mean_gen_accuracy": 0.8857607692480087, "mean_token_accuracy": 0.9150961190462112, "num_tokens": 243900144.0, "sample_num_tokens": 8972.5, "step": 4898, "total_num_tokens": 243936034.0, "z_loss": 0.0005131110083311796 }, { "copy_logits_max": -4.12585973739624, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.5625, "epoch": 1.0006127138115906, "gen_logits_max": 4.76339054107666, "gen_logits_mean": -15.298657417297363, "gen_logits_min": -27.635723114013672, "gen_logits_std": 2.984614849090576, "gen_loss": 0.2713131308555603, "grad_norm": 0.3820199553072199, "learning_rate": 2.4741473684210528e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9963004738092422, "mean_gen_accuracy": 0.8726308941841125, "mean_token_accuracy": 0.9058620482683182, "num_tokens": 244186299.0, "sample_num_tokens": 7869.75, "step": 4899, "total_num_tokens": 244217778.0, "z_loss": 0.0005705953808501363 }, { "copy_logits_max": -3.340742349624634, "copy_logits_min": -750000064.0, "copy_num_tokens": 261.4375, "epoch": 1.0008169517487873, "gen_logits_max": 6.941215515136719, "gen_logits_mean": -12.934935569763184, "gen_logits_min": -24.94725799560547, "gen_logits_std": 2.8738231658935547, "gen_loss": 0.27771836519241333, "grad_norm": 0.3587534014692028, "learning_rate": 2.4740210526315792e-05, "loss": 0.2672, "mean_copy_accuracy": 0.996324747800827, "mean_gen_accuracy": 0.8825132846832275, "mean_token_accuracy": 0.9074631035327911, "num_tokens": 244458971.0, "sample_num_tokens": 7479.75, "step": 4900, "total_num_tokens": 244488890.0, "z_loss": 0.0005849405424669385 }, { "copy_logits_max": -2.3392486572265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.25, "epoch": 1.0010211896859842, "gen_logits_max": 5.112851142883301, "gen_logits_mean": -14.963536262512207, "gen_logits_min": -26.972412109375, "gen_logits_std": 2.953183174133301, "gen_loss": 0.2867427468299866, "grad_norm": 0.4538990477053308, "learning_rate": 2.4738947368421053e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9940191358327866, "mean_gen_accuracy": 0.8746156394481659, "mean_token_accuracy": 0.8990349322557449, "num_tokens": 244697149.0, "sample_num_tokens": 9024.75, "step": 4901, "total_num_tokens": 244733248.0, "z_loss": 0.0006358051905408502 }, { "copy_logits_max": -4.78588342666626, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.875, "epoch": 1.001225427623181, "gen_logits_max": 7.214719772338867, "gen_logits_mean": -12.793177604675293, "gen_logits_min": -25.108861923217773, "gen_logits_std": 2.9500417709350586, "gen_loss": 0.3216869831085205, "grad_norm": 0.39244543242876534, "learning_rate": 2.4737684210526317e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9962698966264725, "mean_gen_accuracy": 0.8803462982177734, "mean_token_accuracy": 0.9058920294046402, "num_tokens": 244960721.0, "sample_num_tokens": 7709.75, "step": 4902, "total_num_tokens": 244991560.0, "z_loss": 0.0006708499859087169 }, { "copy_logits_max": -4.475668907165527, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.5, "epoch": 1.0014296655603778, "gen_logits_max": 5.854800224304199, "gen_logits_mean": -14.328662872314453, "gen_logits_min": -26.08538246154785, "gen_logits_std": 2.898894786834717, "gen_loss": 0.31555163860321045, "grad_norm": 0.4194795327440917, "learning_rate": 2.4736421052631578e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9961961954832077, "mean_gen_accuracy": 0.8734706044197083, "mean_token_accuracy": 0.9089673459529877, "num_tokens": 245214660.0, "sample_num_tokens": 7856.0, "step": 4903, "total_num_tokens": 245246084.0, "z_loss": 0.0006870602956041694 }, { "copy_logits_max": -5.647637367248535, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.75, "epoch": 1.0016339034975748, "gen_logits_max": 5.45107364654541, "gen_logits_mean": -15.090248107910156, "gen_logits_min": -26.50387191772461, "gen_logits_std": 2.9053378105163574, "gen_loss": 0.28159087896347046, "grad_norm": 0.404783048059362, "learning_rate": 2.4735157894736842e-05, "loss": 0.2729, "mean_copy_accuracy": 0.99598428606987, "mean_gen_accuracy": 0.8861063718795776, "mean_token_accuracy": 0.9098220318555832, "num_tokens": 245493552.0, "sample_num_tokens": 8479.0, "step": 4904, "total_num_tokens": 245527468.0, "z_loss": 0.0006328666349872947 }, { "copy_logits_max": -6.500260353088379, "copy_logits_min": -750000000.0, "copy_num_tokens": 258.25, "epoch": 1.0018381414347715, "gen_logits_max": 4.751996994018555, "gen_logits_mean": -15.725417137145996, "gen_logits_min": -27.281635284423828, "gen_logits_std": 2.8808236122131348, "gen_loss": 0.30041956901550293, "grad_norm": 0.4165562446052379, "learning_rate": 2.4733894736842103e-05, "loss": 0.283, "mean_copy_accuracy": 0.9960993528366089, "mean_gen_accuracy": 0.8771802186965942, "mean_token_accuracy": 0.9039002060890198, "num_tokens": 245730954.0, "sample_num_tokens": 6732.5, "step": 4905, "total_num_tokens": 245757884.0, "z_loss": 0.0006261107046157122 }, { "copy_logits_max": -3.4175727367401123, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.5625, "epoch": 1.0020423793719684, "gen_logits_max": 4.956763744354248, "gen_logits_mean": -14.9276123046875, "gen_logits_min": -27.20359992980957, "gen_logits_std": 2.9380645751953125, "gen_loss": 0.26933783292770386, "grad_norm": 0.42079080454472384, "learning_rate": 2.4732631578947368e-05, "loss": 0.2862, "mean_copy_accuracy": 0.996431365609169, "mean_gen_accuracy": 0.8784991502761841, "mean_token_accuracy": 0.9022973924875259, "num_tokens": 245998576.0, "sample_num_tokens": 8888.0, "step": 4906, "total_num_tokens": 246034128.0, "z_loss": 0.0005831140442751348 }, { "copy_logits_max": -5.422675132751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.4375, "epoch": 1.002246617309165, "gen_logits_max": 3.9012064933776855, "gen_logits_mean": -15.764453887939453, "gen_logits_min": -27.387805938720703, "gen_logits_std": 2.873609781265259, "gen_loss": 0.25274401903152466, "grad_norm": 0.43219554860632825, "learning_rate": 2.4731368421052635e-05, "loss": 0.283, "mean_copy_accuracy": 0.9949347674846649, "mean_gen_accuracy": 0.8810092806816101, "mean_token_accuracy": 0.9042054861783981, "num_tokens": 246252021.0, "sample_num_tokens": 8279.25, "step": 4907, "total_num_tokens": 246285138.0, "z_loss": 0.0005613009561784565 }, { "copy_logits_max": -4.504414081573486, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.625, "epoch": 1.002450855246362, "gen_logits_max": 4.3822479248046875, "gen_logits_mean": -15.278940200805664, "gen_logits_min": -27.095842361450195, "gen_logits_std": 2.8944764137268066, "gen_loss": 0.28034108877182007, "grad_norm": 0.454938625355999, "learning_rate": 2.4730105263157896e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9956557601690292, "mean_gen_accuracy": 0.8740592300891876, "mean_token_accuracy": 0.9046489745378494, "num_tokens": 246543032.0, "sample_num_tokens": 9073.5, "step": 4908, "total_num_tokens": 246579326.0, "z_loss": 0.000592028780374676 }, { "copy_logits_max": -0.7025117874145508, "copy_logits_min": -750000000.0, "copy_num_tokens": 673.5, "epoch": 1.002655093183559, "gen_logits_max": 4.869836807250977, "gen_logits_mean": -14.411025047302246, "gen_logits_min": -25.900381088256836, "gen_logits_std": 2.8514490127563477, "gen_loss": 0.2579733729362488, "grad_norm": 0.407767871100402, "learning_rate": 2.472884210526316e-05, "loss": 0.2564, "mean_copy_accuracy": 0.995002418756485, "mean_gen_accuracy": 0.8880001753568649, "mean_token_accuracy": 0.9149402678012848, "num_tokens": 246823192.0, "sample_num_tokens": 10100.0, "step": 4909, "total_num_tokens": 246863592.0, "z_loss": 0.0006083027692511678 }, { "copy_logits_max": -0.8965376615524292, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.6875, "epoch": 1.0028593311207556, "gen_logits_max": 4.208120346069336, "gen_logits_mean": -15.175092697143555, "gen_logits_min": -27.365333557128906, "gen_logits_std": 2.947526454925537, "gen_loss": 0.262067973613739, "grad_norm": 0.38273671219598643, "learning_rate": 2.472757894736842e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9968099743127823, "mean_gen_accuracy": 0.8796578347682953, "mean_token_accuracy": 0.908722311258316, "num_tokens": 247090848.0, "sample_num_tokens": 9131.5, "step": 4910, "total_num_tokens": 247127374.0, "z_loss": 0.0005803536623716354 }, { "copy_logits_max": -4.665666580200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.0625, "epoch": 1.0030635690579526, "gen_logits_max": 4.914642810821533, "gen_logits_mean": -14.884004592895508, "gen_logits_min": -26.402849197387695, "gen_logits_std": 2.909472703933716, "gen_loss": 0.275199830532074, "grad_norm": 0.45202405349484004, "learning_rate": 2.4726315789473686e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9948166012763977, "mean_gen_accuracy": 0.8710353076457977, "mean_token_accuracy": 0.8972557634115219, "num_tokens": 247349368.0, "sample_num_tokens": 7728.0, "step": 4911, "total_num_tokens": 247380280.0, "z_loss": 0.0005694646388292313 }, { "copy_logits_max": -0.5734021067619324, "copy_logits_min": -687500032.0, "copy_num_tokens": 610.6875, "epoch": 1.0032678069951493, "gen_logits_max": 3.577139139175415, "gen_logits_mean": -15.791400909423828, "gen_logits_min": -28.246261596679688, "gen_logits_std": 2.9940900802612305, "gen_loss": 0.26915502548217773, "grad_norm": 0.4161305131965401, "learning_rate": 2.4725052631578947e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9953114241361618, "mean_gen_accuracy": 0.8737791627645493, "mean_token_accuracy": 0.9071841239929199, "num_tokens": 247626294.0, "sample_num_tokens": 8879.5, "step": 4912, "total_num_tokens": 247661812.0, "z_loss": 0.0005547729087993503 }, { "copy_logits_max": -1.9917938709259033, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.5, "epoch": 1.0034720449323462, "gen_logits_max": 4.428885459899902, "gen_logits_mean": -15.765847206115723, "gen_logits_min": -27.579891204833984, "gen_logits_std": 2.946270704269409, "gen_loss": 0.3022218346595764, "grad_norm": 0.41146808577785715, "learning_rate": 2.472378947368421e-05, "loss": 0.268, "mean_copy_accuracy": 0.9963889122009277, "mean_gen_accuracy": 0.8762992024421692, "mean_token_accuracy": 0.9099974036216736, "num_tokens": 247915308.0, "sample_num_tokens": 9840.0, "step": 4913, "total_num_tokens": 247954668.0, "z_loss": 0.0006465258775278926 }, { "copy_logits_max": -2.0211150646209717, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.625, "epoch": 1.003676282869543, "gen_logits_max": 4.55983304977417, "gen_logits_mean": -14.809728622436523, "gen_logits_min": -27.329082489013672, "gen_logits_std": 2.9292397499084473, "gen_loss": 0.26946091651916504, "grad_norm": 0.4112003605501246, "learning_rate": 2.4722526315789472e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9964178502559662, "mean_gen_accuracy": 0.8661404550075531, "mean_token_accuracy": 0.9008269608020782, "num_tokens": 248201008.0, "sample_num_tokens": 8305.5, "step": 4914, "total_num_tokens": 248234230.0, "z_loss": 0.0007597531657665968 }, { "copy_logits_max": -2.312509775161743, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.8125, "epoch": 1.0038805208067398, "gen_logits_max": 5.893640518188477, "gen_logits_mean": -14.206339836120605, "gen_logits_min": -25.812585830688477, "gen_logits_std": 2.915332078933716, "gen_loss": 0.2987689971923828, "grad_norm": 0.4558444898194919, "learning_rate": 2.472126315789474e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9958845525979996, "mean_gen_accuracy": 0.8728018999099731, "mean_token_accuracy": 0.899992510676384, "num_tokens": 248450549.0, "sample_num_tokens": 8029.75, "step": 4915, "total_num_tokens": 248482668.0, "z_loss": 0.0006187393446452916 }, { "copy_logits_max": -3.871427536010742, "copy_logits_min": -750000000.0, "copy_num_tokens": 276.0, "epoch": 1.0040847587439368, "gen_logits_max": 5.110795021057129, "gen_logits_mean": -15.82657527923584, "gen_logits_min": -27.715885162353516, "gen_logits_std": 2.9267735481262207, "gen_loss": 0.276822954416275, "grad_norm": 0.4358802209458446, "learning_rate": 2.472e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9937666803598404, "mean_gen_accuracy": 0.8789033591747284, "mean_token_accuracy": 0.9010316133499146, "num_tokens": 248706642.0, "sample_num_tokens": 7200.5, "step": 4916, "total_num_tokens": 248735444.0, "z_loss": 0.0006111027905717492 }, { "copy_logits_max": 2.4104056358337402, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5, "epoch": 1.0042889966811335, "gen_logits_max": 5.072023391723633, "gen_logits_mean": -13.968219757080078, "gen_logits_min": -26.03265953063965, "gen_logits_std": 2.9598658084869385, "gen_loss": 0.2500731945037842, "grad_norm": 0.4642568748722133, "learning_rate": 2.4718736842105265e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9948822110891342, "mean_gen_accuracy": 0.8810959905385971, "mean_token_accuracy": 0.9089710712432861, "num_tokens": 248972642.0, "sample_num_tokens": 7681.0, "step": 4917, "total_num_tokens": 249003366.0, "z_loss": 0.0005624917102977633 }, { "copy_logits_max": -3.9207968711853027, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.4375, "epoch": 1.0044932346183304, "gen_logits_max": 3.4531893730163574, "gen_logits_mean": -16.821016311645508, "gen_logits_min": -28.874765396118164, "gen_logits_std": 2.954836845397949, "gen_loss": 0.26549604535102844, "grad_norm": 0.5583832035829681, "learning_rate": 2.4717473684210526e-05, "loss": 0.2472, "mean_copy_accuracy": 0.9957744032144547, "mean_gen_accuracy": 0.8825474828481674, "mean_token_accuracy": 0.9145824909210205, "num_tokens": 249253202.0, "sample_num_tokens": 8932.5, "step": 4918, "total_num_tokens": 249288932.0, "z_loss": 0.0005101702408865094 }, { "copy_logits_max": -0.8653411865234375, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.4375, "epoch": 1.004697472555527, "gen_logits_max": 4.70473051071167, "gen_logits_mean": -14.532896995544434, "gen_logits_min": -26.335731506347656, "gen_logits_std": 2.9090471267700195, "gen_loss": 0.25790125131607056, "grad_norm": 0.40931959958445635, "learning_rate": 2.471621052631579e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9957602918148041, "mean_gen_accuracy": 0.8786812573671341, "mean_token_accuracy": 0.9049822986125946, "num_tokens": 249528671.0, "sample_num_tokens": 9387.75, "step": 4919, "total_num_tokens": 249566222.0, "z_loss": 0.0005733383586630225 }, { "copy_logits_max": -1.6745787858963013, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.8125, "epoch": 1.004901710492724, "gen_logits_max": 4.152261734008789, "gen_logits_mean": -13.851232528686523, "gen_logits_min": -26.231863021850586, "gen_logits_std": 2.8657898902893066, "gen_loss": 0.243190735578537, "grad_norm": 0.44578334865253905, "learning_rate": 2.4714947368421054e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9947434067726135, "mean_gen_accuracy": 0.8723036199808121, "mean_token_accuracy": 0.9011536687612534, "num_tokens": 249793855.0, "sample_num_tokens": 9345.75, "step": 4920, "total_num_tokens": 249831238.0, "z_loss": 0.0005350990686565638 }, { "copy_logits_max": -1.5232765674591064, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.0625, "epoch": 1.005105948429921, "gen_logits_max": 4.391840934753418, "gen_logits_mean": -14.112539291381836, "gen_logits_min": -26.066299438476562, "gen_logits_std": 2.8517673015594482, "gen_loss": 0.23320861160755157, "grad_norm": 0.4303236392243069, "learning_rate": 2.4713684210526315e-05, "loss": 0.2728, "mean_copy_accuracy": 0.9961945861577988, "mean_gen_accuracy": 0.8724494129419327, "mean_token_accuracy": 0.90763920545578, "num_tokens": 250060309.0, "sample_num_tokens": 10358.25, "step": 4921, "total_num_tokens": 250101742.0, "z_loss": 0.00047787456423975527 }, { "copy_logits_max": -0.5173394680023193, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.5625, "epoch": 1.0053101863671177, "gen_logits_max": 4.303032875061035, "gen_logits_mean": -14.187202453613281, "gen_logits_min": -26.37235450744629, "gen_logits_std": 2.841071844100952, "gen_loss": 0.3035212755203247, "grad_norm": 0.43650453013134083, "learning_rate": 2.471242105263158e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9956337660551071, "mean_gen_accuracy": 0.8706539422273636, "mean_token_accuracy": 0.9022674858570099, "num_tokens": 250339601.0, "sample_num_tokens": 9112.75, "step": 4922, "total_num_tokens": 250376052.0, "z_loss": 0.0006552495760843158 }, { "copy_logits_max": -1.9096189737319946, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.3125, "epoch": 1.0055144243043146, "gen_logits_max": 2.509334087371826, "gen_logits_mean": -16.729930877685547, "gen_logits_min": -28.771108627319336, "gen_logits_std": 2.934025526046753, "gen_loss": 0.2705623507499695, "grad_norm": 0.3583896667075756, "learning_rate": 2.471115789473684e-05, "loss": 0.2595, "mean_copy_accuracy": 0.9970217347145081, "mean_gen_accuracy": 0.8812430649995804, "mean_token_accuracy": 0.9118217080831528, "num_tokens": 250633093.0, "sample_num_tokens": 8471.25, "step": 4923, "total_num_tokens": 250666978.0, "z_loss": 0.0005924574215896428 }, { "copy_logits_max": -4.808108329772949, "copy_logits_min": -750000128.0, "copy_num_tokens": 544.6875, "epoch": 1.0057186622415113, "gen_logits_max": 3.0936527252197266, "gen_logits_mean": -17.046852111816406, "gen_logits_min": -28.780981063842773, "gen_logits_std": 2.9184727668762207, "gen_loss": 0.2659725546836853, "grad_norm": 0.4397345489905526, "learning_rate": 2.4709894736842108e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9955082535743713, "mean_gen_accuracy": 0.8747989982366562, "mean_token_accuracy": 0.9068802744150162, "num_tokens": 250905136.0, "sample_num_tokens": 9444.0, "step": 4924, "total_num_tokens": 250942912.0, "z_loss": 0.0005740235792472959 }, { "copy_logits_max": -3.5108845233917236, "copy_logits_min": -687500032.0, "copy_num_tokens": 621.0625, "epoch": 1.0059229001787082, "gen_logits_max": 2.606508731842041, "gen_logits_mean": -17.10698890686035, "gen_logits_min": -28.912839889526367, "gen_logits_std": 2.898308515548706, "gen_loss": 0.25040364265441895, "grad_norm": 0.40390590277045446, "learning_rate": 2.470863157894737e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9957911223173141, "mean_gen_accuracy": 0.8769672363996506, "mean_token_accuracy": 0.9095614850521088, "num_tokens": 251196498.0, "sample_num_tokens": 8689.0, "step": 4925, "total_num_tokens": 251231254.0, "z_loss": 0.0005505400476977229 }, { "copy_logits_max": -3.8558197021484375, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.4375, "epoch": 1.006127138115905, "gen_logits_max": 4.336600303649902, "gen_logits_mean": -14.79751968383789, "gen_logits_min": -26.730209350585938, "gen_logits_std": 2.8126907348632812, "gen_loss": 0.2778160870075226, "grad_norm": 0.38187611858079623, "learning_rate": 2.4707368421052633e-05, "loss": 0.273, "mean_copy_accuracy": 0.9954269528388977, "mean_gen_accuracy": 0.8728713244199753, "mean_token_accuracy": 0.9046504348516464, "num_tokens": 251478068.0, "sample_num_tokens": 9110.5, "step": 4926, "total_num_tokens": 251514510.0, "z_loss": 0.0005738969193771482 }, { "copy_logits_max": -1.342120885848999, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.875, "epoch": 1.0063313760531019, "gen_logits_max": 3.1303579807281494, "gen_logits_mean": -15.28615665435791, "gen_logits_min": -26.917219161987305, "gen_logits_std": 2.8749477863311768, "gen_loss": 0.24601465463638306, "grad_norm": 0.4044622134834, "learning_rate": 2.4706105263157894e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9947731345891953, "mean_gen_accuracy": 0.8816040307283401, "mean_token_accuracy": 0.9097805023193359, "num_tokens": 251750968.0, "sample_num_tokens": 9721.0, "step": 4927, "total_num_tokens": 251789852.0, "z_loss": 0.0006019080756232142 }, { "copy_logits_max": -4.8542280197143555, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.625, "epoch": 1.0065356139902988, "gen_logits_max": 4.320556640625, "gen_logits_mean": -16.595500946044922, "gen_logits_min": -28.434358596801758, "gen_logits_std": 2.918785333633423, "gen_loss": 0.2913866937160492, "grad_norm": 0.35978764359374354, "learning_rate": 2.470484210526316e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9955578446388245, "mean_gen_accuracy": 0.8758516758680344, "mean_token_accuracy": 0.9078760296106339, "num_tokens": 252035291.0, "sample_num_tokens": 7649.75, "step": 4928, "total_num_tokens": 252065890.0, "z_loss": 0.0006060153245925903 }, { "copy_logits_max": -4.8116607666015625, "copy_logits_min": -562500032.0, "copy_num_tokens": 349.5, "epoch": 1.0067398519274955, "gen_logits_max": 4.3581109046936035, "gen_logits_mean": -15.497044563293457, "gen_logits_min": -26.94964599609375, "gen_logits_std": 2.8269236087799072, "gen_loss": 0.27239176630973816, "grad_norm": 0.4072234654585408, "learning_rate": 2.470357894736842e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9963634759187698, "mean_gen_accuracy": 0.8783954679965973, "mean_token_accuracy": 0.9057275354862213, "num_tokens": 252298913.0, "sample_num_tokens": 8074.75, "step": 4929, "total_num_tokens": 252331212.0, "z_loss": 0.0005415928899310529 }, { "copy_logits_max": -4.347951412200928, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.25, "epoch": 1.0069440898646924, "gen_logits_max": 3.8219034671783447, "gen_logits_mean": -15.304525375366211, "gen_logits_min": -27.058555603027344, "gen_logits_std": 2.8756723403930664, "gen_loss": 0.29687654972076416, "grad_norm": 0.41741962041496594, "learning_rate": 2.4702315789473684e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9959336519241333, "mean_gen_accuracy": 0.8703407347202301, "mean_token_accuracy": 0.9031529873609543, "num_tokens": 252568769.0, "sample_num_tokens": 6796.25, "step": 4930, "total_num_tokens": 252595954.0, "z_loss": 0.0005673093255609274 }, { "copy_logits_max": -4.1119256019592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.875, "epoch": 1.0071483278018891, "gen_logits_max": 4.812262535095215, "gen_logits_mean": -14.295565605163574, "gen_logits_min": -25.86810302734375, "gen_logits_std": 2.838263988494873, "gen_loss": 0.2780003249645233, "grad_norm": 0.39365999746589675, "learning_rate": 2.4701052631578945e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9954996556043625, "mean_gen_accuracy": 0.8781766444444656, "mean_token_accuracy": 0.9071227461099625, "num_tokens": 252860602.0, "sample_num_tokens": 8490.5, "step": 4931, "total_num_tokens": 252894564.0, "z_loss": 0.0005389494472183287 }, { "copy_logits_max": -3.6267969608306885, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.375, "epoch": 1.007352565739086, "gen_logits_max": 4.406040191650391, "gen_logits_mean": -14.901248931884766, "gen_logits_min": -26.32646942138672, "gen_logits_std": 2.828183889389038, "gen_loss": 0.3053876459598541, "grad_norm": 0.3935026529140177, "learning_rate": 2.4699789473684213e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9956232160329819, "mean_gen_accuracy": 0.8808533102273941, "mean_token_accuracy": 0.9088913202285767, "num_tokens": 253146982.0, "sample_num_tokens": 10083.5, "step": 4932, "total_num_tokens": 253187316.0, "z_loss": 0.0006059967563487589 }, { "copy_logits_max": -0.21774768829345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.1875, "epoch": 1.007556803676283, "gen_logits_max": 4.491095066070557, "gen_logits_mean": -13.690299034118652, "gen_logits_min": -25.382648468017578, "gen_logits_std": 2.8044965267181396, "gen_loss": 0.2971913516521454, "grad_norm": 0.42477951109093265, "learning_rate": 2.4698526315789477e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9962476938962936, "mean_gen_accuracy": 0.8721985518932343, "mean_token_accuracy": 0.9049001038074493, "num_tokens": 253428154.0, "sample_num_tokens": 8781.0, "step": 4933, "total_num_tokens": 253463278.0, "z_loss": 0.0006570432451553643 }, { "copy_logits_max": -1.8887009620666504, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.0625, "epoch": 1.0077610416134797, "gen_logits_max": 5.117991924285889, "gen_logits_mean": -13.751766204833984, "gen_logits_min": -25.742488861083984, "gen_logits_std": 2.8534598350524902, "gen_loss": 0.2525484561920166, "grad_norm": 0.41851734566632953, "learning_rate": 2.4697263157894738e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9950733780860901, "mean_gen_accuracy": 0.8761827796697617, "mean_token_accuracy": 0.9061264395713806, "num_tokens": 253707419.0, "sample_num_tokens": 8152.75, "step": 4934, "total_num_tokens": 253740030.0, "z_loss": 0.0006175452726893127 }, { "copy_logits_max": -3.261727809906006, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.5625, "epoch": 1.0079652795506766, "gen_logits_max": 3.9259519577026367, "gen_logits_mean": -15.688218116760254, "gen_logits_min": -26.930274963378906, "gen_logits_std": 2.8085227012634277, "gen_loss": 0.2513201832771301, "grad_norm": 0.3785783468268163, "learning_rate": 2.4696000000000002e-05, "loss": 0.2494, "mean_copy_accuracy": 0.9966963827610016, "mean_gen_accuracy": 0.8831753879785538, "mean_token_accuracy": 0.9152446389198303, "num_tokens": 253991984.0, "sample_num_tokens": 9252.5, "step": 4935, "total_num_tokens": 254028994.0, "z_loss": 0.0005142270820215344 }, { "copy_logits_max": -1.9784107208251953, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.375, "epoch": 1.0081695174878733, "gen_logits_max": 6.161281108856201, "gen_logits_mean": -12.93906021118164, "gen_logits_min": -25.030391693115234, "gen_logits_std": 2.779357671737671, "gen_loss": 0.26660680770874023, "grad_norm": 0.369989804271126, "learning_rate": 2.4694736842105263e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9948641508817673, "mean_gen_accuracy": 0.8782195001840591, "mean_token_accuracy": 0.9082660973072052, "num_tokens": 254281916.0, "sample_num_tokens": 9413.0, "step": 4936, "total_num_tokens": 254319568.0, "z_loss": 0.000583764398470521 }, { "copy_logits_max": -1.8038287162780762, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.0, "epoch": 1.0083737554250702, "gen_logits_max": 4.591454982757568, "gen_logits_mean": -14.627448081970215, "gen_logits_min": -26.788042068481445, "gen_logits_std": 2.8722057342529297, "gen_loss": 0.27674704790115356, "grad_norm": 0.3790553375902504, "learning_rate": 2.4693473684210527e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9957925230264664, "mean_gen_accuracy": 0.8814997673034668, "mean_token_accuracy": 0.909296452999115, "num_tokens": 254547119.0, "sample_num_tokens": 8723.25, "step": 4937, "total_num_tokens": 254582012.0, "z_loss": 0.0006061089225113392 }, { "copy_logits_max": -2.4742205142974854, "copy_logits_min": -625000064.0, "copy_num_tokens": 588.8125, "epoch": 1.008577993362267, "gen_logits_max": 4.443787574768066, "gen_logits_mean": -13.926234245300293, "gen_logits_min": -26.038307189941406, "gen_logits_std": 2.829925537109375, "gen_loss": 0.2316194325685501, "grad_norm": 0.4310071912163218, "learning_rate": 2.4692210526315788e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9958004653453827, "mean_gen_accuracy": 0.8735597878694534, "mean_token_accuracy": 0.9058426171541214, "num_tokens": 254810708.0, "sample_num_tokens": 8767.5, "step": 4938, "total_num_tokens": 254845778.0, "z_loss": 0.0005191955715417862 }, { "copy_logits_max": -4.9587178230285645, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.875, "epoch": 1.0087822312994639, "gen_logits_max": 4.44746208190918, "gen_logits_mean": -14.951991081237793, "gen_logits_min": -26.29073143005371, "gen_logits_std": 2.7815771102905273, "gen_loss": 0.2535301446914673, "grad_norm": 0.39978647597461675, "learning_rate": 2.4690947368421053e-05, "loss": 0.2566, "mean_copy_accuracy": 0.9950289279222488, "mean_gen_accuracy": 0.8857276290655136, "mean_token_accuracy": 0.9108322560787201, "num_tokens": 255085501.0, "sample_num_tokens": 9580.75, "step": 4939, "total_num_tokens": 255123824.0, "z_loss": 0.0005055641522631049 }, { "copy_logits_max": -2.721130847930908, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.0625, "epoch": 1.0089864692366608, "gen_logits_max": 4.16986608505249, "gen_logits_mean": -15.640400886535645, "gen_logits_min": -27.280616760253906, "gen_logits_std": 2.829437732696533, "gen_loss": 0.31120243668556213, "grad_norm": 0.3610547765440869, "learning_rate": 2.4689684210526317e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9957020729780197, "mean_gen_accuracy": 0.873032420873642, "mean_token_accuracy": 0.9012145400047302, "num_tokens": 255357852.0, "sample_num_tokens": 8459.5, "step": 4940, "total_num_tokens": 255391690.0, "z_loss": 0.0005919577670283616 }, { "copy_logits_max": -1.4390103816986084, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.5625, "epoch": 1.0091907071738575, "gen_logits_max": 4.842607021331787, "gen_logits_mean": -14.607013702392578, "gen_logits_min": -25.799915313720703, "gen_logits_std": 2.771207809448242, "gen_loss": 0.28587812185287476, "grad_norm": 0.3990233196129415, "learning_rate": 2.468842105263158e-05, "loss": 0.2874, "mean_copy_accuracy": 0.995662733912468, "mean_gen_accuracy": 0.8719893991947174, "mean_token_accuracy": 0.9026855677366257, "num_tokens": 255618817.0, "sample_num_tokens": 8304.25, "step": 4941, "total_num_tokens": 255652034.0, "z_loss": 0.000635340518783778 }, { "copy_logits_max": -4.761855125427246, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.375, "epoch": 1.0093949451110544, "gen_logits_max": 4.836047172546387, "gen_logits_mean": -15.183738708496094, "gen_logits_min": -26.260709762573242, "gen_logits_std": 2.8115248680114746, "gen_loss": 0.3166792392730713, "grad_norm": 0.4151862615895706, "learning_rate": 2.4687157894736842e-05, "loss": 0.297, "mean_copy_accuracy": 0.9967083930969238, "mean_gen_accuracy": 0.8662760257720947, "mean_token_accuracy": 0.8992969542741776, "num_tokens": 255889676.0, "sample_num_tokens": 7305.0, "step": 4942, "total_num_tokens": 255918896.0, "z_loss": 0.0006009607459418476 }, { "copy_logits_max": -3.088857650756836, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.3125, "epoch": 1.0095991830482511, "gen_logits_max": 5.37217378616333, "gen_logits_mean": -11.61711311340332, "gen_logits_min": -22.903934478759766, "gen_logits_std": 2.643758773803711, "gen_loss": 0.2428218126296997, "grad_norm": 0.45742620498745024, "learning_rate": 2.4685894736842106e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9965516030788422, "mean_gen_accuracy": 0.8762853145599365, "mean_token_accuracy": 0.9037216007709503, "num_tokens": 256116235.0, "sample_num_tokens": 7778.25, "step": 4943, "total_num_tokens": 256147348.0, "z_loss": 0.0004907532129436731 }, { "copy_logits_max": -2.8098597526550293, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.1875, "epoch": 1.009803420985448, "gen_logits_max": 4.490231513977051, "gen_logits_mean": -15.313986778259277, "gen_logits_min": -26.755855560302734, "gen_logits_std": 2.8525147438049316, "gen_loss": 0.2408682405948639, "grad_norm": 0.3740983074433709, "learning_rate": 2.4684631578947367e-05, "loss": 0.252, "mean_copy_accuracy": 0.9964584708213806, "mean_gen_accuracy": 0.8773116022348404, "mean_token_accuracy": 0.9143896549940109, "num_tokens": 256417275.0, "sample_num_tokens": 9480.25, "step": 4944, "total_num_tokens": 256455196.0, "z_loss": 0.0005460649263113737 }, { "copy_logits_max": -5.775508880615234, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.625, "epoch": 1.010007658922645, "gen_logits_max": 4.129749298095703, "gen_logits_mean": -16.39604949951172, "gen_logits_min": -28.259037017822266, "gen_logits_std": 2.879594326019287, "gen_loss": 0.268973708152771, "grad_norm": 0.3689454951661132, "learning_rate": 2.468336842105263e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9953242838382721, "mean_gen_accuracy": 0.8830714821815491, "mean_token_accuracy": 0.9106044918298721, "num_tokens": 256694252.0, "sample_num_tokens": 7122.5, "step": 4945, "total_num_tokens": 256722742.0, "z_loss": 0.0005297241150401533 }, { "copy_logits_max": -3.6984214782714844, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.1875, "epoch": 1.0102118968598417, "gen_logits_max": 3.7649178504943848, "gen_logits_mean": -16.888263702392578, "gen_logits_min": -28.368345260620117, "gen_logits_std": 2.9044618606567383, "gen_loss": 0.27700862288475037, "grad_norm": 0.4355589027125504, "learning_rate": 2.4682105263157896e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9947098344564438, "mean_gen_accuracy": 0.8771770745515823, "mean_token_accuracy": 0.9052360653877258, "num_tokens": 256947543.0, "sample_num_tokens": 8038.75, "step": 4946, "total_num_tokens": 256979698.0, "z_loss": 0.0005693755811080337 }, { "copy_logits_max": -4.534472465515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.0625, "epoch": 1.0104161347970386, "gen_logits_max": 4.916508197784424, "gen_logits_mean": -14.216344833374023, "gen_logits_min": -25.600902557373047, "gen_logits_std": 2.7723560333251953, "gen_loss": 0.25435173511505127, "grad_norm": 0.43434011380797816, "learning_rate": 2.4680842105263157e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9955355823040009, "mean_gen_accuracy": 0.8790177553892136, "mean_token_accuracy": 0.9046870321035385, "num_tokens": 257195090.0, "sample_num_tokens": 8259.0, "step": 4947, "total_num_tokens": 257228126.0, "z_loss": 0.0005771309370175004 }, { "copy_logits_max": -4.138312339782715, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.5625, "epoch": 1.0106203727342353, "gen_logits_max": 3.6955301761627197, "gen_logits_mean": -16.12246322631836, "gen_logits_min": -27.63591766357422, "gen_logits_std": 2.839268684387207, "gen_loss": 0.26591798663139343, "grad_norm": 0.3943816233140614, "learning_rate": 2.4679578947368425e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9968968778848648, "mean_gen_accuracy": 0.8764073848724365, "mean_token_accuracy": 0.9113019704818726, "num_tokens": 257498068.0, "sample_num_tokens": 8311.0, "step": 4948, "total_num_tokens": 257531312.0, "z_loss": 0.0005168199422769248 }, { "copy_logits_max": -3.7458839416503906, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.75, "epoch": 1.0108246106714323, "gen_logits_max": 6.517148971557617, "gen_logits_mean": -12.826967239379883, "gen_logits_min": -23.731124877929688, "gen_logits_std": 2.707636833190918, "gen_loss": 0.26688605546951294, "grad_norm": 0.429324505662487, "learning_rate": 2.4678315789473685e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9954828470945358, "mean_gen_accuracy": 0.8730445504188538, "mean_token_accuracy": 0.9053586274385452, "num_tokens": 257790574.0, "sample_num_tokens": 9785.0, "step": 4949, "total_num_tokens": 257829714.0, "z_loss": 0.0005850851302966475 }, { "copy_logits_max": -4.0840606689453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.125, "epoch": 1.011028848608629, "gen_logits_max": 4.977009296417236, "gen_logits_mean": -16.095958709716797, "gen_logits_min": -27.76404571533203, "gen_logits_std": 2.9006524085998535, "gen_loss": 0.3123108446598053, "grad_norm": 0.3832571931803759, "learning_rate": 2.467705263157895e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9941190034151077, "mean_gen_accuracy": 0.8777911812067032, "mean_token_accuracy": 0.9012055993080139, "num_tokens": 258043088.0, "sample_num_tokens": 7669.0, "step": 4950, "total_num_tokens": 258073764.0, "z_loss": 0.0007298081764020026 }, { "copy_logits_max": 1.081520438194275, "copy_logits_min": -750000000.0, "copy_num_tokens": 795.1875, "epoch": 1.0112330865458259, "gen_logits_max": 5.249541282653809, "gen_logits_mean": -13.126842498779297, "gen_logits_min": -25.112850189208984, "gen_logits_std": 2.81192946434021, "gen_loss": 0.24974609911441803, "grad_norm": 0.6352638992813695, "learning_rate": 2.467578947368421e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9960451722145081, "mean_gen_accuracy": 0.8746796548366547, "mean_token_accuracy": 0.9066771864891052, "num_tokens": 258312693.0, "sample_num_tokens": 9817.75, "step": 4951, "total_num_tokens": 258351964.0, "z_loss": 0.0007721188012510538 }, { "copy_logits_max": -3.5902419090270996, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.6875, "epoch": 1.0114373244830228, "gen_logits_max": 6.01983642578125, "gen_logits_mean": -14.767965316772461, "gen_logits_min": -26.35642433166504, "gen_logits_std": 2.819681167602539, "gen_loss": 0.31281453371047974, "grad_norm": 0.38406888296019925, "learning_rate": 2.4674526315789475e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9957475811243057, "mean_gen_accuracy": 0.8746514767408371, "mean_token_accuracy": 0.9034536629915237, "num_tokens": 258573041.0, "sample_num_tokens": 7831.75, "step": 4952, "total_num_tokens": 258604368.0, "z_loss": 0.0007699787383899093 }, { "copy_logits_max": -3.056748867034912, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.1875, "epoch": 1.0116415624202195, "gen_logits_max": 5.687314510345459, "gen_logits_mean": -15.584413528442383, "gen_logits_min": -27.020973205566406, "gen_logits_std": 2.865448474884033, "gen_loss": 0.290280818939209, "grad_norm": 0.3841982133745508, "learning_rate": 2.4673263157894736e-05, "loss": 0.28, "mean_copy_accuracy": 0.9964632391929626, "mean_gen_accuracy": 0.8743127584457397, "mean_token_accuracy": 0.9038466811180115, "num_tokens": 258854808.0, "sample_num_tokens": 7064.5, "step": 4953, "total_num_tokens": 258883066.0, "z_loss": 0.0006663125241175294 }, { "copy_logits_max": -4.249034881591797, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.5625, "epoch": 1.0118458003574164, "gen_logits_max": 4.225751876831055, "gen_logits_mean": -16.943836212158203, "gen_logits_min": -28.394620895385742, "gen_logits_std": 2.9160757064819336, "gen_loss": 0.2831054925918579, "grad_norm": 0.37886444587206053, "learning_rate": 2.4672e-05, "loss": 0.285, "mean_copy_accuracy": 0.9945550411939621, "mean_gen_accuracy": 0.8764082342386246, "mean_token_accuracy": 0.9041315913200378, "num_tokens": 259122525.0, "sample_num_tokens": 8840.25, "step": 4954, "total_num_tokens": 259157886.0, "z_loss": 0.0006612660363316536 }, { "copy_logits_max": -3.193510055541992, "copy_logits_min": -687500032.0, "copy_num_tokens": 423.4375, "epoch": 1.0120500382946132, "gen_logits_max": 5.687142372131348, "gen_logits_mean": -14.264739036560059, "gen_logits_min": -25.4891357421875, "gen_logits_std": 2.791416645050049, "gen_loss": 0.2994033694267273, "grad_norm": 0.38090271148321386, "learning_rate": 2.467073684210526e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9955186396837234, "mean_gen_accuracy": 0.8707468509674072, "mean_token_accuracy": 0.8999348729848862, "num_tokens": 259401583.0, "sample_num_tokens": 8436.75, "step": 4955, "total_num_tokens": 259435330.0, "z_loss": 0.0006236289045773447 }, { "copy_logits_max": -4.700222015380859, "copy_logits_min": -687500032.0, "copy_num_tokens": 299.5, "epoch": 1.01225427623181, "gen_logits_max": 4.153702735900879, "gen_logits_mean": -16.730003356933594, "gen_logits_min": -28.016063690185547, "gen_logits_std": 2.8636202812194824, "gen_loss": 0.28826141357421875, "grad_norm": 0.3860386201604406, "learning_rate": 2.466947368421053e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9964143335819244, "mean_gen_accuracy": 0.8739681094884872, "mean_token_accuracy": 0.9032766669988632, "num_tokens": 259662277.0, "sample_num_tokens": 7056.75, "step": 4956, "total_num_tokens": 259690504.0, "z_loss": 0.0005570576176978648 }, { "copy_logits_max": -4.721312999725342, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.375, "epoch": 1.0124585141690068, "gen_logits_max": 5.591972351074219, "gen_logits_mean": -13.313282012939453, "gen_logits_min": -24.779905319213867, "gen_logits_std": 2.76273512840271, "gen_loss": 0.31705930829048157, "grad_norm": 0.42469533253982933, "learning_rate": 2.466821052631579e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9942911118268967, "mean_gen_accuracy": 0.8715428411960602, "mean_token_accuracy": 0.897641584277153, "num_tokens": 259923215.0, "sample_num_tokens": 7721.25, "step": 4957, "total_num_tokens": 259954100.0, "z_loss": 0.0007045820239000022 }, { "copy_logits_max": -2.9740002155303955, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.8125, "epoch": 1.0126627521062037, "gen_logits_max": 4.584488868713379, "gen_logits_mean": -15.774429321289062, "gen_logits_min": -27.41739845275879, "gen_logits_std": 2.879784107208252, "gen_loss": 0.29556143283843994, "grad_norm": 0.6747719756144344, "learning_rate": 2.4666947368421054e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9953919649124146, "mean_gen_accuracy": 0.8686911463737488, "mean_token_accuracy": 0.9015520513057709, "num_tokens": 260185263.0, "sample_num_tokens": 7282.25, "step": 4958, "total_num_tokens": 260214392.0, "z_loss": 0.0006138796452432871 }, { "copy_logits_max": -2.837411880493164, "copy_logits_min": -687500032.0, "copy_num_tokens": 588.0625, "epoch": 1.0128669900434006, "gen_logits_max": 4.791179180145264, "gen_logits_mean": -14.17950439453125, "gen_logits_min": -25.583126068115234, "gen_logits_std": 2.8167500495910645, "gen_loss": 0.25184834003448486, "grad_norm": 0.3863327860541239, "learning_rate": 2.466568421052632e-05, "loss": 0.2469, "mean_copy_accuracy": 0.9963779151439667, "mean_gen_accuracy": 0.8853831738233566, "mean_token_accuracy": 0.9154781550168991, "num_tokens": 260465082.0, "sample_num_tokens": 9401.0, "step": 4959, "total_num_tokens": 260502686.0, "z_loss": 0.0005266328807920218 }, { "copy_logits_max": -2.28157114982605, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.5625, "epoch": 1.0130712279805973, "gen_logits_max": 4.264972686767578, "gen_logits_mean": -15.165225982666016, "gen_logits_min": -26.865352630615234, "gen_logits_std": 2.8618669509887695, "gen_loss": 0.27409613132476807, "grad_norm": 0.38988547061331663, "learning_rate": 2.466442105263158e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9950306117534637, "mean_gen_accuracy": 0.8787519335746765, "mean_token_accuracy": 0.9082858860492706, "num_tokens": 260745769.0, "sample_num_tokens": 8062.25, "step": 4960, "total_num_tokens": 260778018.0, "z_loss": 0.0005741191562265158 }, { "copy_logits_max": -1.1125390529632568, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.6875, "epoch": 1.0132754659177943, "gen_logits_max": 4.72956657409668, "gen_logits_mean": -15.088016510009766, "gen_logits_min": -26.943851470947266, "gen_logits_std": 2.836534261703491, "gen_loss": 0.2903518080711365, "grad_norm": 0.41782570628877924, "learning_rate": 2.4663157894736844e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9959693998098373, "mean_gen_accuracy": 0.8780448585748672, "mean_token_accuracy": 0.9060705453157425, "num_tokens": 260995460.0, "sample_num_tokens": 7206.0, "step": 4961, "total_num_tokens": 261024284.0, "z_loss": 0.0005841556703671813 }, { "copy_logits_max": -5.289104461669922, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.3125, "epoch": 1.013479703854991, "gen_logits_max": 4.275269508361816, "gen_logits_mean": -15.731193542480469, "gen_logits_min": -27.51254653930664, "gen_logits_std": 2.927342176437378, "gen_loss": 0.24602098762989044, "grad_norm": 0.40735527108481706, "learning_rate": 2.4661894736842105e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9957142919301987, "mean_gen_accuracy": 0.8807584345340729, "mean_token_accuracy": 0.9066748321056366, "num_tokens": 261258676.0, "sample_num_tokens": 6732.5, "step": 4962, "total_num_tokens": 261285606.0, "z_loss": 0.00047725450713187456 }, { "copy_logits_max": -4.31317663192749, "copy_logits_min": -687500032.0, "copy_num_tokens": 351.0, "epoch": 1.013683941792188, "gen_logits_max": 5.057919979095459, "gen_logits_mean": -14.896173477172852, "gen_logits_min": -26.627918243408203, "gen_logits_std": 2.852005958557129, "gen_loss": 0.2952813506126404, "grad_norm": 0.3770790282241105, "learning_rate": 2.466063157894737e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9948843419551849, "mean_gen_accuracy": 0.874695897102356, "mean_token_accuracy": 0.9002427011728287, "num_tokens": 261540292.0, "sample_num_tokens": 7604.0, "step": 4963, "total_num_tokens": 261570708.0, "z_loss": 0.0006189327104948461 }, { "copy_logits_max": -2.5053439140319824, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.5625, "epoch": 1.0138881797293848, "gen_logits_max": 3.994682788848877, "gen_logits_mean": -15.877846717834473, "gen_logits_min": -27.753236770629883, "gen_logits_std": 2.9072012901306152, "gen_loss": 0.2454654574394226, "grad_norm": 0.42409471691576933, "learning_rate": 2.4659368421052633e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9962195605039597, "mean_gen_accuracy": 0.8796479254961014, "mean_token_accuracy": 0.9121549427509308, "num_tokens": 261800250.0, "sample_num_tokens": 9417.0, "step": 4964, "total_num_tokens": 261837918.0, "z_loss": 0.0005337608745321631 }, { "copy_logits_max": -5.918856620788574, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.9375, "epoch": 1.0140924176665815, "gen_logits_max": 5.554413318634033, "gen_logits_mean": -15.131393432617188, "gen_logits_min": -26.643247604370117, "gen_logits_std": 2.848672866821289, "gen_loss": 0.30025291442871094, "grad_norm": 0.3619769168853139, "learning_rate": 2.4658105263157898e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9961532354354858, "mean_gen_accuracy": 0.8802622556686401, "mean_token_accuracy": 0.9051590412855148, "num_tokens": 262066668.0, "sample_num_tokens": 6825.5, "step": 4965, "total_num_tokens": 262093970.0, "z_loss": 0.0006372212083078921 }, { "copy_logits_max": -1.601457953453064, "copy_logits_min": -750000000.0, "copy_num_tokens": 247.1875, "epoch": 1.0142966556037785, "gen_logits_max": 6.343391418457031, "gen_logits_mean": -13.81423568725586, "gen_logits_min": -24.963191986083984, "gen_logits_std": 2.7746992111206055, "gen_loss": 0.3541465699672699, "grad_norm": 0.39869620026911384, "learning_rate": 2.465684210526316e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9950212240219116, "mean_gen_accuracy": 0.8770883679389954, "mean_token_accuracy": 0.9038417339324951, "num_tokens": 262336623.0, "sample_num_tokens": 7650.75, "step": 4966, "total_num_tokens": 262367226.0, "z_loss": 0.0007540907827205956 }, { "copy_logits_max": -1.8214690685272217, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.0625, "epoch": 1.0145008935409752, "gen_logits_max": 5.521151542663574, "gen_logits_mean": -13.403524398803711, "gen_logits_min": -25.10635757446289, "gen_logits_std": 2.8265585899353027, "gen_loss": 0.25851255655288696, "grad_norm": 0.3786181615969986, "learning_rate": 2.4655578947368423e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9960808753967285, "mean_gen_accuracy": 0.8756484538316727, "mean_token_accuracy": 0.9055869281291962, "num_tokens": 262618650.0, "sample_num_tokens": 7690.5, "step": 4967, "total_num_tokens": 262649412.0, "z_loss": 0.0006608638796024024 }, { "copy_logits_max": -3.813154697418213, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.6875, "epoch": 1.014705131478172, "gen_logits_max": 5.224399566650391, "gen_logits_mean": -15.296770095825195, "gen_logits_min": -26.889812469482422, "gen_logits_std": 2.855407238006592, "gen_loss": 0.3080141246318817, "grad_norm": 0.385907359738134, "learning_rate": 2.4654315789473684e-05, "loss": 0.2825, "mean_copy_accuracy": 0.995939165353775, "mean_gen_accuracy": 0.8739590048789978, "mean_token_accuracy": 0.9028197675943375, "num_tokens": 262891600.0, "sample_num_tokens": 7608.0, "step": 4968, "total_num_tokens": 262922032.0, "z_loss": 0.0006850538775324821 }, { "copy_logits_max": -2.545975685119629, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.8125, "epoch": 1.0149093694153688, "gen_logits_max": 6.095566272735596, "gen_logits_mean": -13.080820083618164, "gen_logits_min": -24.66986656188965, "gen_logits_std": 2.7179441452026367, "gen_loss": 0.31244611740112305, "grad_norm": 0.4033136520618487, "learning_rate": 2.4653052631578948e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9957786500453949, "mean_gen_accuracy": 0.8822633624076843, "mean_token_accuracy": 0.9074181616306305, "num_tokens": 263154025.0, "sample_num_tokens": 7893.75, "step": 4969, "total_num_tokens": 263185600.0, "z_loss": 0.0006540543981827796 }, { "copy_logits_max": -1.5726650953292847, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.0625, "epoch": 1.0151136073525657, "gen_logits_max": 4.127598762512207, "gen_logits_mean": -15.832454681396484, "gen_logits_min": -27.87448501586914, "gen_logits_std": 2.9027857780456543, "gen_loss": 0.24212156236171722, "grad_norm": 0.4134668687608929, "learning_rate": 2.465178947368421e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9951251000165939, "mean_gen_accuracy": 0.879518061876297, "mean_token_accuracy": 0.9082056134939194, "num_tokens": 263422730.0, "sample_num_tokens": 8849.0, "step": 4970, "total_num_tokens": 263458126.0, "z_loss": 0.0005604978650808334 }, { "copy_logits_max": -2.0809690952301025, "copy_logits_min": -625000064.0, "copy_num_tokens": 464.75, "epoch": 1.0153178452897627, "gen_logits_max": 3.9780430793762207, "gen_logits_mean": -15.903882026672363, "gen_logits_min": -27.656810760498047, "gen_logits_std": 2.8942036628723145, "gen_loss": 0.2831178605556488, "grad_norm": 0.44079133089570316, "learning_rate": 2.4650526315789473e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9939269125461578, "mean_gen_accuracy": 0.8734850585460663, "mean_token_accuracy": 0.9014732092618942, "num_tokens": 263664939.0, "sample_num_tokens": 7859.25, "step": 4971, "total_num_tokens": 263696376.0, "z_loss": 0.0006448982167057693 }, { "copy_logits_max": -4.400764465332031, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.0625, "epoch": 1.0155220832269594, "gen_logits_max": 4.074653625488281, "gen_logits_mean": -16.523195266723633, "gen_logits_min": -27.702083587646484, "gen_logits_std": 2.843395233154297, "gen_loss": 0.2772112488746643, "grad_norm": 0.4092506860853742, "learning_rate": 2.4649263157894734e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9940057992935181, "mean_gen_accuracy": 0.8738806843757629, "mean_token_accuracy": 0.8990454077720642, "num_tokens": 263926401.0, "sample_num_tokens": 7746.75, "step": 4972, "total_num_tokens": 263957388.0, "z_loss": 0.000569646421354264 }, { "copy_logits_max": -1.1904942989349365, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.1875, "epoch": 1.0157263211641563, "gen_logits_max": 5.851436614990234, "gen_logits_mean": -13.2783842086792, "gen_logits_min": -25.22657585144043, "gen_logits_std": 2.8131401538848877, "gen_loss": 0.3009042739868164, "grad_norm": 0.37273147469063567, "learning_rate": 2.4648000000000002e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9966668337583542, "mean_gen_accuracy": 0.871615007519722, "mean_token_accuracy": 0.9026133716106415, "num_tokens": 264210456.0, "sample_num_tokens": 7291.5, "step": 4973, "total_num_tokens": 264239622.0, "z_loss": 0.0006496558198705316 }, { "copy_logits_max": -4.227352619171143, "copy_logits_min": -687500032.0, "copy_num_tokens": 461.4375, "epoch": 1.015930559101353, "gen_logits_max": 4.471030235290527, "gen_logits_mean": -15.127458572387695, "gen_logits_min": -26.60654640197754, "gen_logits_std": 2.8308606147766113, "gen_loss": 0.2803269922733307, "grad_norm": 0.39647935722371525, "learning_rate": 2.4646736842105266e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9966196566820145, "mean_gen_accuracy": 0.8795702755451202, "mean_token_accuracy": 0.9073097109794617, "num_tokens": 264480104.0, "sample_num_tokens": 8375.0, "step": 4974, "total_num_tokens": 264513604.0, "z_loss": 0.0006469989893957973 }, { "copy_logits_max": 1.7586290836334229, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.1875, "epoch": 1.01613479703855, "gen_logits_max": 5.709743976593018, "gen_logits_mean": -13.619803428649902, "gen_logits_min": -25.83338165283203, "gen_logits_std": 2.8724727630615234, "gen_loss": 0.2868821620941162, "grad_norm": 0.40973120329695795, "learning_rate": 2.4645473684210527e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9966384172439575, "mean_gen_accuracy": 0.8773770779371262, "mean_token_accuracy": 0.9086035490036011, "num_tokens": 264760445.0, "sample_num_tokens": 10136.25, "step": 4975, "total_num_tokens": 264800990.0, "z_loss": 0.000572820077650249 }, { "copy_logits_max": -1.9616835117340088, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.3125, "epoch": 1.0163390349757468, "gen_logits_max": 5.419149875640869, "gen_logits_mean": -14.888486862182617, "gen_logits_min": -27.240814208984375, "gen_logits_std": 2.890486478805542, "gen_loss": 0.26451045274734497, "grad_norm": 1.0405071729173336, "learning_rate": 2.464421052631579e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9964122474193573, "mean_gen_accuracy": 0.8770285546779633, "mean_token_accuracy": 0.9076480716466904, "num_tokens": 265057548.0, "sample_num_tokens": 8497.5, "step": 4976, "total_num_tokens": 265091538.0, "z_loss": 0.0004889227566309273 }, { "copy_logits_max": -1.8881947994232178, "copy_logits_min": -750000000.0, "copy_num_tokens": 633.0, "epoch": 1.0165432729129436, "gen_logits_max": 3.859632730484009, "gen_logits_mean": -16.13035011291504, "gen_logits_min": -28.071138381958008, "gen_logits_std": 2.9517621994018555, "gen_loss": 0.26793718338012695, "grad_norm": 0.5309527590355713, "learning_rate": 2.4642947368421052e-05, "loss": 0.2809, "mean_copy_accuracy": 0.995370626449585, "mean_gen_accuracy": 0.871602326631546, "mean_token_accuracy": 0.9036017656326294, "num_tokens": 265323831.0, "sample_num_tokens": 9624.75, "step": 4977, "total_num_tokens": 265362330.0, "z_loss": 0.0005120758432894945 }, { "copy_logits_max": -3.042060375213623, "copy_logits_min": -687500032.0, "copy_num_tokens": 661.3125, "epoch": 1.0167475108501405, "gen_logits_max": 4.6544084548950195, "gen_logits_mean": -13.88155746459961, "gen_logits_min": -26.006912231445312, "gen_logits_std": 2.840963840484619, "gen_loss": 0.23625944554805756, "grad_norm": 0.4438715450595862, "learning_rate": 2.4641684210526317e-05, "loss": 0.29, "mean_copy_accuracy": 0.9954402595758438, "mean_gen_accuracy": 0.8719187080860138, "mean_token_accuracy": 0.9021187573671341, "num_tokens": 265594084.0, "sample_num_tokens": 9856.0, "step": 4978, "total_num_tokens": 265633508.0, "z_loss": 0.000546327093616128 }, { "copy_logits_max": -2.792738199234009, "copy_logits_min": -687500032.0, "copy_num_tokens": 306.5625, "epoch": 1.0169517487873372, "gen_logits_max": 4.1657209396362305, "gen_logits_mean": -16.61764144897461, "gen_logits_min": -28.07522201538086, "gen_logits_std": 2.898406982421875, "gen_loss": 0.30514824390411377, "grad_norm": 0.41298318567230285, "learning_rate": 2.4640421052631578e-05, "loss": 0.2744, "mean_copy_accuracy": 0.995621994137764, "mean_gen_accuracy": 0.8756979405879974, "mean_token_accuracy": 0.9067862778902054, "num_tokens": 265870344.0, "sample_num_tokens": 6916.0, "step": 4979, "total_num_tokens": 265898008.0, "z_loss": 0.0005872117471881211 }, { "copy_logits_max": -4.169661521911621, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.625, "epoch": 1.0171559867245341, "gen_logits_max": 3.9399845600128174, "gen_logits_mean": -16.62530517578125, "gen_logits_min": -28.202255249023438, "gen_logits_std": 2.9077892303466797, "gen_loss": 0.27491295337677, "grad_norm": 0.42896183590162473, "learning_rate": 2.4639157894736842e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9949332773685455, "mean_gen_accuracy": 0.8809987157583237, "mean_token_accuracy": 0.9077395796775818, "num_tokens": 266151873.0, "sample_num_tokens": 9277.25, "step": 4980, "total_num_tokens": 266188982.0, "z_loss": 0.0005248074303381145 }, { "copy_logits_max": -5.022125244140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.75, "epoch": 1.0173602246617308, "gen_logits_max": 4.300074100494385, "gen_logits_mean": -16.066452026367188, "gen_logits_min": -27.635589599609375, "gen_logits_std": 2.85911226272583, "gen_loss": 0.3167746365070343, "grad_norm": 0.387648633538254, "learning_rate": 2.4637894736842106e-05, "loss": 0.287, "mean_copy_accuracy": 0.9952774792909622, "mean_gen_accuracy": 0.8741372525691986, "mean_token_accuracy": 0.9023217111825943, "num_tokens": 266417269.0, "sample_num_tokens": 7506.75, "step": 4981, "total_num_tokens": 266447296.0, "z_loss": 0.0005589278880506754 }, { "copy_logits_max": -3.5601401329040527, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.0, "epoch": 1.0175644625989277, "gen_logits_max": 4.267934799194336, "gen_logits_mean": -14.285426139831543, "gen_logits_min": -26.12335968017578, "gen_logits_std": 2.815196990966797, "gen_loss": 0.2833828926086426, "grad_norm": 0.39348063075951706, "learning_rate": 2.463663157894737e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9960997104644775, "mean_gen_accuracy": 0.8817230761051178, "mean_token_accuracy": 0.9118687212467194, "num_tokens": 266683087.0, "sample_num_tokens": 9230.25, "step": 4982, "total_num_tokens": 266720008.0, "z_loss": 0.0006405859021469951 }, { "copy_logits_max": -4.195044040679932, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.6875, "epoch": 1.0177687005361247, "gen_logits_max": 4.902403354644775, "gen_logits_mean": -14.755404472351074, "gen_logits_min": -26.701889038085938, "gen_logits_std": 2.8424856662750244, "gen_loss": 0.27498406171798706, "grad_norm": 0.39854232621529584, "learning_rate": 2.463536842105263e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9951090812683105, "mean_gen_accuracy": 0.8748161792755127, "mean_token_accuracy": 0.9065359383821487, "num_tokens": 266969994.0, "sample_num_tokens": 9244.5, "step": 4983, "total_num_tokens": 267006972.0, "z_loss": 0.0005669556558132172 }, { "copy_logits_max": -5.650838851928711, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.125, "epoch": 1.0179729384733214, "gen_logits_max": 4.706068992614746, "gen_logits_mean": -15.054288864135742, "gen_logits_min": -26.79994773864746, "gen_logits_std": 2.8215932846069336, "gen_loss": 0.2565630078315735, "grad_norm": 0.4069221671098297, "learning_rate": 2.4634105263157896e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9961741268634796, "mean_gen_accuracy": 0.8795834928750992, "mean_token_accuracy": 0.9040528982877731, "num_tokens": 267229006.0, "sample_num_tokens": 7847.5, "step": 4984, "total_num_tokens": 267260396.0, "z_loss": 0.00054691091645509 }, { "copy_logits_max": -5.086757659912109, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.0, "epoch": 1.0181771764105183, "gen_logits_max": 4.702801704406738, "gen_logits_mean": -15.22048282623291, "gen_logits_min": -27.203157424926758, "gen_logits_std": 2.869457244873047, "gen_loss": 0.25146859884262085, "grad_norm": 0.8162368719568692, "learning_rate": 2.4632842105263157e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9964068531990051, "mean_gen_accuracy": 0.8804415613412857, "mean_token_accuracy": 0.9118907898664474, "num_tokens": 267498444.0, "sample_num_tokens": 8958.0, "step": 4985, "total_num_tokens": 267534276.0, "z_loss": 0.0005506405723281205 }, { "copy_logits_max": -4.872596263885498, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.75, "epoch": 1.018381414347715, "gen_logits_max": 4.7665557861328125, "gen_logits_mean": -14.53221321105957, "gen_logits_min": -26.913318634033203, "gen_logits_std": 2.8611462116241455, "gen_loss": 0.2660030126571655, "grad_norm": 0.3862892339488651, "learning_rate": 2.463157894736842e-05, "loss": 0.2785, "mean_copy_accuracy": 0.995623305439949, "mean_gen_accuracy": 0.8747243136167526, "mean_token_accuracy": 0.9049683064222336, "num_tokens": 267769280.0, "sample_num_tokens": 7897.0, "step": 4986, "total_num_tokens": 267800868.0, "z_loss": 0.0005518911639228463 }, { "copy_logits_max": -7.056461334228516, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.8125, "epoch": 1.018585652284912, "gen_logits_max": 4.595465183258057, "gen_logits_mean": -14.395007133483887, "gen_logits_min": -25.69879150390625, "gen_logits_std": 2.8124279975891113, "gen_loss": 0.24978797137737274, "grad_norm": 0.4079996424630058, "learning_rate": 2.4630315789473685e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9955016374588013, "mean_gen_accuracy": 0.8792726248502731, "mean_token_accuracy": 0.909279927611351, "num_tokens": 268055674.0, "sample_num_tokens": 8146.0, "step": 4987, "total_num_tokens": 268088258.0, "z_loss": 0.0005102056893520057 }, { "copy_logits_max": -7.142280578613281, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.8125, "epoch": 1.0187898902221089, "gen_logits_max": 5.204616069793701, "gen_logits_mean": -14.961231231689453, "gen_logits_min": -26.463558197021484, "gen_logits_std": 2.8391668796539307, "gen_loss": 0.35514533519744873, "grad_norm": 0.3933918866693142, "learning_rate": 2.4629052631578946e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9962682723999023, "mean_gen_accuracy": 0.8670331239700317, "mean_token_accuracy": 0.8984850347042084, "num_tokens": 268340134.0, "sample_num_tokens": 8137.5, "step": 4988, "total_num_tokens": 268372684.0, "z_loss": 0.0007083818782120943 }, { "copy_logits_max": -7.1648640632629395, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.75, "epoch": 1.0189941281593056, "gen_logits_max": 4.504892826080322, "gen_logits_mean": -14.696521759033203, "gen_logits_min": -26.20006561279297, "gen_logits_std": 2.8362069129943848, "gen_loss": 0.27059030532836914, "grad_norm": 0.40900251777740654, "learning_rate": 2.4627789473684214e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9972752630710602, "mean_gen_accuracy": 0.8790233433246613, "mean_token_accuracy": 0.9089817106723785, "num_tokens": 268622658.0, "sample_num_tokens": 9165.0, "step": 4989, "total_num_tokens": 268659318.0, "z_loss": 0.0005361647345125675 }, { "copy_logits_max": -5.505785942077637, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.125, "epoch": 1.0191983660965025, "gen_logits_max": 4.262315273284912, "gen_logits_mean": -14.486639022827148, "gen_logits_min": -26.308929443359375, "gen_logits_std": 2.870786190032959, "gen_loss": 0.26809197664260864, "grad_norm": 0.39847461377342136, "learning_rate": 2.4626526315789475e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9949726015329361, "mean_gen_accuracy": 0.8799661993980408, "mean_token_accuracy": 0.9079967439174652, "num_tokens": 268896843.0, "sample_num_tokens": 7774.75, "step": 4990, "total_num_tokens": 268927942.0, "z_loss": 0.0005524156731553376 }, { "copy_logits_max": -6.59444522857666, "copy_logits_min": -750000064.0, "copy_num_tokens": 657.8125, "epoch": 1.0194026040336992, "gen_logits_max": 4.272098541259766, "gen_logits_mean": -15.67776107788086, "gen_logits_min": -27.654504776000977, "gen_logits_std": 2.9050393104553223, "gen_loss": 0.23584038019180298, "grad_norm": 0.5284428004873816, "learning_rate": 2.462526315789474e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9938412308692932, "mean_gen_accuracy": 0.879604384303093, "mean_token_accuracy": 0.9065777063369751, "num_tokens": 269156994.0, "sample_num_tokens": 9086.5, "step": 4991, "total_num_tokens": 269193340.0, "z_loss": 0.0005453730700537562 }, { "copy_logits_max": -5.867384910583496, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.5625, "epoch": 1.0196068419708961, "gen_logits_max": 4.860997676849365, "gen_logits_mean": -14.333066940307617, "gen_logits_min": -25.627872467041016, "gen_logits_std": 2.8286046981811523, "gen_loss": 0.3315466046333313, "grad_norm": 0.7533631329680743, "learning_rate": 2.4624e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9959362298250198, "mean_gen_accuracy": 0.8672410398721695, "mean_token_accuracy": 0.8995508253574371, "num_tokens": 269432270.0, "sample_num_tokens": 8499.5, "step": 4992, "total_num_tokens": 269466268.0, "z_loss": 0.0007411888800561428 }, { "copy_logits_max": -3.9995880126953125, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.375, "epoch": 1.0198110799080928, "gen_logits_max": 4.497509002685547, "gen_logits_mean": -15.274261474609375, "gen_logits_min": -26.567678451538086, "gen_logits_std": 2.82963490486145, "gen_loss": 0.28919556736946106, "grad_norm": 0.37659526674692256, "learning_rate": 2.4622736842105264e-05, "loss": 0.2536, "mean_copy_accuracy": 0.9968986511230469, "mean_gen_accuracy": 0.8817368745803833, "mean_token_accuracy": 0.9131265878677368, "num_tokens": 269727778.0, "sample_num_tokens": 7788.5, "step": 4993, "total_num_tokens": 269758932.0, "z_loss": 0.0007146216230466962 }, { "copy_logits_max": -5.941936492919922, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.0, "epoch": 1.0200153178452898, "gen_logits_max": 5.168887138366699, "gen_logits_mean": -14.480953216552734, "gen_logits_min": -25.5665225982666, "gen_logits_std": 2.8146910667419434, "gen_loss": 0.28788524866104126, "grad_norm": 0.4122424274053911, "learning_rate": 2.4621473684210525e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9957971274852753, "mean_gen_accuracy": 0.872060090303421, "mean_token_accuracy": 0.9054158478975296, "num_tokens": 269997606.0, "sample_num_tokens": 7273.0, "step": 4994, "total_num_tokens": 270026698.0, "z_loss": 0.0007008163956925273 }, { "copy_logits_max": -5.208930015563965, "copy_logits_min": -687500032.0, "copy_num_tokens": 678.1875, "epoch": 1.0202195557824867, "gen_logits_max": 4.676669120788574, "gen_logits_mean": -13.661941528320312, "gen_logits_min": -24.848827362060547, "gen_logits_std": 2.726628303527832, "gen_loss": 0.27335384488105774, "grad_norm": 0.4435412140761677, "learning_rate": 2.462021052631579e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9967134147882462, "mean_gen_accuracy": 0.8733198195695877, "mean_token_accuracy": 0.9055413454771042, "num_tokens": 270266819.0, "sample_num_tokens": 10092.75, "step": 4995, "total_num_tokens": 270307190.0, "z_loss": 0.0007197819650173187 }, { "copy_logits_max": -5.7054572105407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.25, "epoch": 1.0204237937196834, "gen_logits_max": 5.082125663757324, "gen_logits_mean": -13.88967227935791, "gen_logits_min": -24.955013275146484, "gen_logits_std": 2.7815542221069336, "gen_loss": 0.3026348352432251, "grad_norm": 0.3703244190833655, "learning_rate": 2.461894736842105e-05, "loss": 0.282, "mean_copy_accuracy": 0.9953241050243378, "mean_gen_accuracy": 0.875806525349617, "mean_token_accuracy": 0.9041286110877991, "num_tokens": 270535050.0, "sample_num_tokens": 7940.5, "step": 4996, "total_num_tokens": 270566812.0, "z_loss": 0.0006803121650591493 }, { "copy_logits_max": -3.0805959701538086, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.8125, "epoch": 1.0206280316568803, "gen_logits_max": 4.657423496246338, "gen_logits_mean": -13.90780258178711, "gen_logits_min": -25.465198516845703, "gen_logits_std": 2.8236355781555176, "gen_loss": 0.32520073652267456, "grad_norm": 0.3824895184322279, "learning_rate": 2.4617684210526318e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9962964504957199, "mean_gen_accuracy": 0.8740150779485703, "mean_token_accuracy": 0.9013053327798843, "num_tokens": 270789980.0, "sample_num_tokens": 8694.0, "step": 4997, "total_num_tokens": 270824756.0, "z_loss": 0.0007074870518408716 }, { "copy_logits_max": -6.54585075378418, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.75, "epoch": 1.020832269594077, "gen_logits_max": 5.155655384063721, "gen_logits_mean": -13.739863395690918, "gen_logits_min": -24.971397399902344, "gen_logits_std": 2.7922468185424805, "gen_loss": 0.28977158665657043, "grad_norm": 0.3902112835537448, "learning_rate": 2.461642105263158e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9960702210664749, "mean_gen_accuracy": 0.8723315596580505, "mean_token_accuracy": 0.9049345254898071, "num_tokens": 271075585.0, "sample_num_tokens": 8473.75, "step": 4998, "total_num_tokens": 271109480.0, "z_loss": 0.0006474857218563557 }, { "copy_logits_max": -6.42746114730835, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.875, "epoch": 1.021036507531274, "gen_logits_max": 4.558202266693115, "gen_logits_mean": -14.919788360595703, "gen_logits_min": -26.1053466796875, "gen_logits_std": 2.827110767364502, "gen_loss": 0.2876740097999573, "grad_norm": 0.3978252283609279, "learning_rate": 2.4615157894736843e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9953717589378357, "mean_gen_accuracy": 0.8803945481777191, "mean_token_accuracy": 0.9067545384168625, "num_tokens": 271357068.0, "sample_num_tokens": 8870.5, "step": 4999, "total_num_tokens": 271392550.0, "z_loss": 0.0005530874477699399 }, { "epoch": 1.0212407454684707, "grad_norm": 0.3876896608260599, "learning_rate": 2.4613894736842108e-05, "loss": 0.2983, "step": 5000 }, { "epoch": 1.0212407454684707, "eval_copy_logits_max": -7.177128314971924, "eval_copy_logits_min": -80.01925659179688, "eval_gen_logits_max": 3.5678622722625732, "eval_gen_logits_mean": -20.01795196533203, "eval_gen_logits_min": -31.043813705444336, "eval_gen_logits_std": 2.951282024383545, "eval_gen_loss": 0.3361864686012268, "eval_loss": 0.3168620765209198, "eval_mean_copy_accuracy": 0.9929614067077637, "eval_mean_gen_accuracy": 0.8746457695960999, "eval_mean_token_accuracy": 0.889828622341156, "eval_num_tokens": 271669096.0, "eval_runtime": 0.682, "eval_samples_per_second": 11.73, "eval_steps_per_second": 2.932, "eval_total_num_tokens": 271669096.0, "eval_z_loss": 0.0006203053053468466, "step": 5000 }, { "copy_logits_max": -6.510801792144775, "copy_logits_min": -750000064.0, "copy_num_tokens": 485.0625, "epoch": 1.0214449834056676, "gen_logits_max": 4.113002777099609, "gen_logits_mean": -16.818363189697266, "gen_logits_min": -28.40007781982422, "gen_logits_std": 2.90626859664917, "gen_loss": 0.2557424306869507, "grad_norm": 0.4197624700891361, "learning_rate": 2.461263157894737e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9950580820441246, "mean_gen_accuracy": 0.8749018609523773, "mean_token_accuracy": 0.9039131179451942, "num_tokens": 271903830.0, "sample_num_tokens": 8738.5, "step": 5001, "total_num_tokens": 271938784.0, "z_loss": 0.0005580889992415905 }, { "copy_logits_max": -6.331523895263672, "copy_logits_min": -687500032.0, "copy_num_tokens": 561.625, "epoch": 1.0216492213428645, "gen_logits_max": 4.66064453125, "gen_logits_mean": -14.934980392456055, "gen_logits_min": -26.578506469726562, "gen_logits_std": 2.8260393142700195, "gen_loss": 0.23591050505638123, "grad_norm": 0.37707996674933025, "learning_rate": 2.4611368421052633e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9948664754629135, "mean_gen_accuracy": 0.8783052712678909, "mean_token_accuracy": 0.9046909511089325, "num_tokens": 272181258.0, "sample_num_tokens": 9145.5, "step": 5002, "total_num_tokens": 272217840.0, "z_loss": 0.0005971912178210914 }, { "copy_logits_max": -4.666727066040039, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.625, "epoch": 1.0218534592800612, "gen_logits_max": 4.746052265167236, "gen_logits_mean": -13.93578815460205, "gen_logits_min": -25.624094009399414, "gen_logits_std": 2.8126721382141113, "gen_loss": 0.28517261147499084, "grad_norm": 0.3696822400981487, "learning_rate": 2.4610105263157894e-05, "loss": 0.278, "mean_copy_accuracy": 0.9965347200632095, "mean_gen_accuracy": 0.8738219141960144, "mean_token_accuracy": 0.9058762788772583, "num_tokens": 272468148.0, "sample_num_tokens": 7965.0, "step": 5003, "total_num_tokens": 272500008.0, "z_loss": 0.0006292739417403936 }, { "copy_logits_max": -8.058012008666992, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.1875, "epoch": 1.0220576972172581, "gen_logits_max": 5.373734951019287, "gen_logits_mean": -12.975724220275879, "gen_logits_min": -24.579853057861328, "gen_logits_std": 2.7159061431884766, "gen_loss": 0.2534099817276001, "grad_norm": 0.3898671376134474, "learning_rate": 2.4608842105263158e-05, "loss": 0.2691, "mean_copy_accuracy": 0.99505415558815, "mean_gen_accuracy": 0.8807952255010605, "mean_token_accuracy": 0.9079109877347946, "num_tokens": 272733631.0, "sample_num_tokens": 8452.75, "step": 5004, "total_num_tokens": 272767442.0, "z_loss": 0.0005397898494265974 }, { "copy_logits_max": -6.759568214416504, "copy_logits_min": -750000064.0, "copy_num_tokens": 450.0625, "epoch": 1.0222619351544548, "gen_logits_max": 4.467141151428223, "gen_logits_mean": -15.405118942260742, "gen_logits_min": -27.18990707397461, "gen_logits_std": 2.852869987487793, "gen_loss": 0.27372583746910095, "grad_norm": 0.4165436339179894, "learning_rate": 2.4607578947368423e-05, "loss": 0.303, "mean_copy_accuracy": 0.9949702024459839, "mean_gen_accuracy": 0.8698176443576813, "mean_token_accuracy": 0.8981924951076508, "num_tokens": 273000470.0, "sample_num_tokens": 8925.0, "step": 5005, "total_num_tokens": 273036170.0, "z_loss": 0.0005903298733755946 }, { "copy_logits_max": -6.2260565757751465, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.875, "epoch": 1.0224661730916518, "gen_logits_max": 5.4342145919799805, "gen_logits_mean": -13.198781967163086, "gen_logits_min": -24.965457916259766, "gen_logits_std": 2.8175387382507324, "gen_loss": 0.2754935324192047, "grad_norm": 0.36450917234409874, "learning_rate": 2.4606315789473687e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9961802363395691, "mean_gen_accuracy": 0.878911942243576, "mean_token_accuracy": 0.910786971449852, "num_tokens": 273277501.0, "sample_num_tokens": 8030.75, "step": 5006, "total_num_tokens": 273309624.0, "z_loss": 0.0006015907856635749 }, { "copy_logits_max": -6.790654182434082, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.9375, "epoch": 1.0226704110288487, "gen_logits_max": 4.7696919441223145, "gen_logits_mean": -15.40781021118164, "gen_logits_min": -27.016496658325195, "gen_logits_std": 2.860619068145752, "gen_loss": 0.30936986207962036, "grad_norm": 0.37802051280059623, "learning_rate": 2.4605052631578948e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9955718517303467, "mean_gen_accuracy": 0.8739819526672363, "mean_token_accuracy": 0.8992959856987, "num_tokens": 273539301.0, "sample_num_tokens": 7688.75, "step": 5007, "total_num_tokens": 273570056.0, "z_loss": 0.0005956476088613272 }, { "copy_logits_max": -5.884746551513672, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.8125, "epoch": 1.0228746489660454, "gen_logits_max": 4.174535274505615, "gen_logits_mean": -16.212947845458984, "gen_logits_min": -27.447053909301758, "gen_logits_std": 2.8366174697875977, "gen_loss": 0.3230173587799072, "grad_norm": 0.36326973611011604, "learning_rate": 2.4603789473684212e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9955889284610748, "mean_gen_accuracy": 0.8671797513961792, "mean_token_accuracy": 0.9012762159109116, "num_tokens": 273815109.0, "sample_num_tokens": 9179.25, "step": 5008, "total_num_tokens": 273851826.0, "z_loss": 0.0007035351009108126 }, { "copy_logits_max": -6.848129749298096, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.4375, "epoch": 1.0230788869032423, "gen_logits_max": 4.420105457305908, "gen_logits_mean": -15.179616928100586, "gen_logits_min": -26.240272521972656, "gen_logits_std": 2.7903523445129395, "gen_loss": 0.2853509783744812, "grad_norm": 0.3906922470926741, "learning_rate": 2.4602526315789473e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9956357777118683, "mean_gen_accuracy": 0.867554098367691, "mean_token_accuracy": 0.8976289927959442, "num_tokens": 274085523.0, "sample_num_tokens": 8823.25, "step": 5009, "total_num_tokens": 274120816.0, "z_loss": 0.000585596077144146 }, { "copy_logits_max": -7.018481731414795, "copy_logits_min": -750000000.0, "copy_num_tokens": 633.0, "epoch": 1.023283124840439, "gen_logits_max": 4.170204162597656, "gen_logits_mean": -14.844245910644531, "gen_logits_min": -26.536283493041992, "gen_logits_std": 2.802062511444092, "gen_loss": 0.25371819734573364, "grad_norm": 0.385664517442989, "learning_rate": 2.4601263157894737e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9962302297353745, "mean_gen_accuracy": 0.8721852600574493, "mean_token_accuracy": 0.9042122513055801, "num_tokens": 274370587.0, "sample_num_tokens": 9804.75, "step": 5010, "total_num_tokens": 274409806.0, "z_loss": 0.0005385871045291424 }, { "copy_logits_max": -8.129878044128418, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.875, "epoch": 1.023487362777636, "gen_logits_max": 4.292070388793945, "gen_logits_mean": -15.33521842956543, "gen_logits_min": -26.502487182617188, "gen_logits_std": 2.786818027496338, "gen_loss": 0.24985317885875702, "grad_norm": 0.4605479822227807, "learning_rate": 2.4599999999999998e-05, "loss": 0.2605, "mean_copy_accuracy": 0.9965757578611374, "mean_gen_accuracy": 0.8832125216722488, "mean_token_accuracy": 0.9115448445081711, "num_tokens": 274655337.0, "sample_num_tokens": 9375.25, "step": 5011, "total_num_tokens": 274692838.0, "z_loss": 0.00048180343583226204 }, { "copy_logits_max": -6.492671012878418, "copy_logits_min": -750000128.0, "copy_num_tokens": 563.3125, "epoch": 1.0236916007148327, "gen_logits_max": 3.8896994590759277, "gen_logits_mean": -15.50847339630127, "gen_logits_min": -27.599288940429688, "gen_logits_std": 2.874720335006714, "gen_loss": 0.28705495595932007, "grad_norm": 0.3712915565714047, "learning_rate": 2.4598736842105263e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9967790991067886, "mean_gen_accuracy": 0.8762392550706863, "mean_token_accuracy": 0.9106222540140152, "num_tokens": 274942251.0, "sample_num_tokens": 8983.25, "step": 5012, "total_num_tokens": 274978184.0, "z_loss": 0.0005261123878881335 }, { "copy_logits_max": -5.492677688598633, "copy_logits_min": -625000064.0, "copy_num_tokens": 660.3125, "epoch": 1.0238958386520296, "gen_logits_max": 4.910604953765869, "gen_logits_mean": -12.9277982711792, "gen_logits_min": -25.334686279296875, "gen_logits_std": 2.789544105529785, "gen_loss": 0.2473295032978058, "grad_norm": 0.43837168733413673, "learning_rate": 2.4597473684210527e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9955042153596878, "mean_gen_accuracy": 0.8778007924556732, "mean_token_accuracy": 0.910341203212738, "num_tokens": 275222469.0, "sample_num_tokens": 9167.75, "step": 5013, "total_num_tokens": 275259140.0, "z_loss": 0.0005703697679564357 }, { "copy_logits_max": -4.205293655395508, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.8125, "epoch": 1.0241000765892265, "gen_logits_max": 3.9878411293029785, "gen_logits_mean": -15.50687313079834, "gen_logits_min": -27.38152503967285, "gen_logits_std": 2.8471059799194336, "gen_loss": 0.2514139413833618, "grad_norm": 0.3778344662751264, "learning_rate": 2.459621052631579e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9949559569358826, "mean_gen_accuracy": 0.8750663846731186, "mean_token_accuracy": 0.905498206615448, "num_tokens": 275500667.0, "sample_num_tokens": 8427.25, "step": 5014, "total_num_tokens": 275534376.0, "z_loss": 0.0006589241675101221 }, { "copy_logits_max": -5.601794242858887, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.625, "epoch": 1.0243043145264232, "gen_logits_max": 4.000378608703613, "gen_logits_mean": -15.634382247924805, "gen_logits_min": -27.07170295715332, "gen_logits_std": 2.7945871353149414, "gen_loss": 0.2758353054523468, "grad_norm": 0.3927955876063701, "learning_rate": 2.4594947368421055e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9957681894302368, "mean_gen_accuracy": 0.874749019742012, "mean_token_accuracy": 0.9024048298597336, "num_tokens": 275761396.0, "sample_num_tokens": 7904.0, "step": 5015, "total_num_tokens": 275793012.0, "z_loss": 0.0006813708459958434 }, { "copy_logits_max": -5.571203708648682, "copy_logits_min": -687500032.0, "copy_num_tokens": 459.0625, "epoch": 1.0245085524636202, "gen_logits_max": 4.317575454711914, "gen_logits_mean": -14.696775436401367, "gen_logits_min": -26.40725326538086, "gen_logits_std": 2.778330087661743, "gen_loss": 0.2298097312450409, "grad_norm": 0.38145680025273593, "learning_rate": 2.4593684210526316e-05, "loss": 0.2819, "mean_copy_accuracy": 0.996101588010788, "mean_gen_accuracy": 0.8745695650577545, "mean_token_accuracy": 0.9042575359344482, "num_tokens": 276018500.0, "sample_num_tokens": 8001.5, "step": 5016, "total_num_tokens": 276050506.0, "z_loss": 0.0005974710220471025 }, { "copy_logits_max": -4.562416076660156, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.5625, "epoch": 1.0247127904008169, "gen_logits_max": 4.146439552307129, "gen_logits_mean": -14.65458869934082, "gen_logits_min": -25.85440444946289, "gen_logits_std": 2.764039993286133, "gen_loss": 0.2522837519645691, "grad_norm": 0.3844316899303356, "learning_rate": 2.459242105263158e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9962157607078552, "mean_gen_accuracy": 0.8809385597705841, "mean_token_accuracy": 0.91079381108284, "num_tokens": 276287712.0, "sample_num_tokens": 9145.5, "step": 5017, "total_num_tokens": 276324294.0, "z_loss": 0.0005958439433015883 }, { "copy_logits_max": -2.477931499481201, "copy_logits_min": -687500032.0, "copy_num_tokens": 728.5, "epoch": 1.0249170283380138, "gen_logits_max": 3.5095930099487305, "gen_logits_mean": -15.058486938476562, "gen_logits_min": -26.437599182128906, "gen_logits_std": 2.736348867416382, "gen_loss": 0.2686573565006256, "grad_norm": 0.41812436641997985, "learning_rate": 2.459115789473684e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9955596029758453, "mean_gen_accuracy": 0.8704781383275986, "mean_token_accuracy": 0.9053214490413666, "num_tokens": 276554614.0, "sample_num_tokens": 10083.5, "step": 5018, "total_num_tokens": 276594948.0, "z_loss": 0.0006888255011290312 }, { "copy_logits_max": -6.178704738616943, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.375, "epoch": 1.0251212662752107, "gen_logits_max": 4.154997825622559, "gen_logits_mean": -15.735725402832031, "gen_logits_min": -27.10692596435547, "gen_logits_std": 2.804426670074463, "gen_loss": 0.2657691240310669, "grad_norm": 0.38664472781538506, "learning_rate": 2.4589894736842106e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9958252757787704, "mean_gen_accuracy": 0.8784540593624115, "mean_token_accuracy": 0.9071099907159805, "num_tokens": 276839365.0, "sample_num_tokens": 8684.75, "step": 5019, "total_num_tokens": 276874104.0, "z_loss": 0.0005848369910381734 }, { "copy_logits_max": -6.37693977355957, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.625, "epoch": 1.0253255042124074, "gen_logits_max": 5.695181846618652, "gen_logits_mean": -11.654083251953125, "gen_logits_min": -23.768386840820312, "gen_logits_std": 2.6210739612579346, "gen_loss": 0.24321842193603516, "grad_norm": 0.3940442323722837, "learning_rate": 2.4588631578947367e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9964360743761063, "mean_gen_accuracy": 0.8736999481916428, "mean_token_accuracy": 0.9015889018774033, "num_tokens": 277124370.0, "sample_num_tokens": 10033.0, "step": 5020, "total_num_tokens": 277164502.0, "z_loss": 0.0004763858742080629 }, { "copy_logits_max": -3.7018094062805176, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.75, "epoch": 1.0255297421496044, "gen_logits_max": 4.242862224578857, "gen_logits_mean": -15.270646095275879, "gen_logits_min": -26.550777435302734, "gen_logits_std": 2.760639190673828, "gen_loss": 0.28652647137641907, "grad_norm": 0.3712976982452232, "learning_rate": 2.458736842105263e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9950657933950424, "mean_gen_accuracy": 0.8741383105516434, "mean_token_accuracy": 0.9027145802974701, "num_tokens": 277383462.0, "sample_num_tokens": 7779.5, "step": 5021, "total_num_tokens": 277414580.0, "z_loss": 0.0006293824408203363 }, { "copy_logits_max": -5.1238203048706055, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.25, "epoch": 1.025733980086801, "gen_logits_max": 5.462405204772949, "gen_logits_mean": -12.717002868652344, "gen_logits_min": -23.809947967529297, "gen_logits_std": 2.6149351596832275, "gen_loss": 0.2959476113319397, "grad_norm": 0.3489384242007155, "learning_rate": 2.4586105263157896e-05, "loss": 0.2546, "mean_copy_accuracy": 0.9959820955991745, "mean_gen_accuracy": 0.8844281584024429, "mean_token_accuracy": 0.9137985855340958, "num_tokens": 277674515.0, "sample_num_tokens": 7841.75, "step": 5022, "total_num_tokens": 277705882.0, "z_loss": 0.000642852159217 }, { "copy_logits_max": -5.604525089263916, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.75, "epoch": 1.025938218023998, "gen_logits_max": 4.914126396179199, "gen_logits_mean": -15.26266098022461, "gen_logits_min": -26.329090118408203, "gen_logits_std": 2.763033390045166, "gen_loss": 0.2932198643684387, "grad_norm": 0.4045144744705841, "learning_rate": 2.458484210526316e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9961509853601456, "mean_gen_accuracy": 0.8710203319787979, "mean_token_accuracy": 0.9004476070404053, "num_tokens": 277929712.0, "sample_num_tokens": 8014.5, "step": 5023, "total_num_tokens": 277961770.0, "z_loss": 0.0005918889073655009 }, { "copy_logits_max": -7.272974014282227, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.6875, "epoch": 1.0261424559611947, "gen_logits_max": 3.8956122398376465, "gen_logits_mean": -15.38126277923584, "gen_logits_min": -26.143463134765625, "gen_logits_std": 2.7235212326049805, "gen_loss": 0.2770857810974121, "grad_norm": 0.39634630436977547, "learning_rate": 2.458357894736842e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9969988763332367, "mean_gen_accuracy": 0.8709638118743896, "mean_token_accuracy": 0.8996506184339523, "num_tokens": 278182255.0, "sample_num_tokens": 10830.25, "step": 5024, "total_num_tokens": 278225576.0, "z_loss": 0.0005408894503489137 }, { "copy_logits_max": -2.7451930046081543, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.3125, "epoch": 1.0263466938983916, "gen_logits_max": 5.135306358337402, "gen_logits_mean": -13.198633193969727, "gen_logits_min": -24.09001922607422, "gen_logits_std": 2.645326614379883, "gen_loss": 0.30184274911880493, "grad_norm": 0.39011102225503336, "learning_rate": 2.4582315789473685e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9952104240655899, "mean_gen_accuracy": 0.8749504536390305, "mean_token_accuracy": 0.9046538323163986, "num_tokens": 278437334.0, "sample_num_tokens": 7427.0, "step": 5025, "total_num_tokens": 278467042.0, "z_loss": 0.0006561148911714554 }, { "copy_logits_max": -5.184500694274902, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.3125, "epoch": 1.0265509318355885, "gen_logits_max": 3.8943562507629395, "gen_logits_mean": -15.97603988647461, "gen_logits_min": -27.00460433959961, "gen_logits_std": 2.7755074501037598, "gen_loss": 0.2591134309768677, "grad_norm": 0.41607358998540905, "learning_rate": 2.4581052631578946e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9957414716482162, "mean_gen_accuracy": 0.8764853626489639, "mean_token_accuracy": 0.907370388507843, "num_tokens": 278707425.0, "sample_num_tokens": 7753.25, "step": 5026, "total_num_tokens": 278738438.0, "z_loss": 0.0005513541400432587 }, { "copy_logits_max": -7.171484470367432, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.375, "epoch": 1.0267551697727852, "gen_logits_max": 4.979370594024658, "gen_logits_mean": -15.916259765625, "gen_logits_min": -26.805654525756836, "gen_logits_std": 2.786345958709717, "gen_loss": 0.29590779542922974, "grad_norm": 0.38573247798955695, "learning_rate": 2.457978947368421e-05, "loss": 0.2681, "mean_copy_accuracy": 0.994818240404129, "mean_gen_accuracy": 0.8857828676700592, "mean_token_accuracy": 0.9085762351751328, "num_tokens": 278963330.0, "sample_num_tokens": 7008.5, "step": 5027, "total_num_tokens": 278991364.0, "z_loss": 0.0006087085930630565 }, { "copy_logits_max": -4.031556129455566, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.375, "epoch": 1.0269594077099822, "gen_logits_max": 3.433485984802246, "gen_logits_mean": -15.818422317504883, "gen_logits_min": -26.730918884277344, "gen_logits_std": 2.750682830810547, "gen_loss": 0.2842978835105896, "grad_norm": 0.4273588974579448, "learning_rate": 2.4578526315789475e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9953806400299072, "mean_gen_accuracy": 0.8725376427173615, "mean_token_accuracy": 0.9004615247249603, "num_tokens": 279207303.0, "sample_num_tokens": 8611.75, "step": 5028, "total_num_tokens": 279241750.0, "z_loss": 0.0005816896446049213 }, { "copy_logits_max": -7.474861145019531, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.625, "epoch": 1.0271636456471789, "gen_logits_max": 5.076208114624023, "gen_logits_mean": -15.197096824645996, "gen_logits_min": -26.09121322631836, "gen_logits_std": 2.7413249015808105, "gen_loss": 0.2650105357170105, "grad_norm": 0.43592882551617834, "learning_rate": 2.4577263157894736e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9955080151557922, "mean_gen_accuracy": 0.8847320526838303, "mean_token_accuracy": 0.911616176366806, "num_tokens": 279481681.0, "sample_num_tokens": 8432.25, "step": 5029, "total_num_tokens": 279515410.0, "z_loss": 0.0005711316480301321 }, { "copy_logits_max": -4.052369117736816, "copy_logits_min": -750000064.0, "copy_num_tokens": 540.125, "epoch": 1.0273678835843758, "gen_logits_max": 4.817586898803711, "gen_logits_mean": -14.303890228271484, "gen_logits_min": -25.694313049316406, "gen_logits_std": 2.7544775009155273, "gen_loss": 0.24979273974895477, "grad_norm": 0.4032266149075124, "learning_rate": 2.4576000000000003e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9935918599367142, "mean_gen_accuracy": 0.8841437548398972, "mean_token_accuracy": 0.906956672668457, "num_tokens": 279727589.0, "sample_num_tokens": 9139.75, "step": 5030, "total_num_tokens": 279764148.0, "z_loss": 0.0005821750382892787 }, { "copy_logits_max": -5.626419544219971, "copy_logits_min": -750000064.0, "copy_num_tokens": 509.1875, "epoch": 1.0275721215215727, "gen_logits_max": 4.7722039222717285, "gen_logits_mean": -13.392127990722656, "gen_logits_min": -23.90515899658203, "gen_logits_std": 2.6374247074127197, "gen_loss": 0.2641010284423828, "grad_norm": 0.4095152732160666, "learning_rate": 2.4574736842105264e-05, "loss": 0.2858, "mean_copy_accuracy": 0.995170995593071, "mean_gen_accuracy": 0.874326691031456, "mean_token_accuracy": 0.9018535912036896, "num_tokens": 279992633.0, "sample_num_tokens": 8707.75, "step": 5031, "total_num_tokens": 280027464.0, "z_loss": 0.0006124904612079263 }, { "copy_logits_max": -3.479680061340332, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.625, "epoch": 1.0277763594587694, "gen_logits_max": 4.818626403808594, "gen_logits_mean": -14.446893692016602, "gen_logits_min": -25.240833282470703, "gen_logits_std": 2.701632499694824, "gen_loss": 0.2844085693359375, "grad_norm": 0.3891398137679573, "learning_rate": 2.457347368421053e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9963858425617218, "mean_gen_accuracy": 0.8790594339370728, "mean_token_accuracy": 0.9083276242017746, "num_tokens": 280253664.0, "sample_num_tokens": 8252.5, "step": 5032, "total_num_tokens": 280286674.0, "z_loss": 0.0006955629214644432 }, { "copy_logits_max": -4.5698089599609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.1875, "epoch": 1.0279805973959664, "gen_logits_max": 4.280786991119385, "gen_logits_mean": -15.110215187072754, "gen_logits_min": -25.970829010009766, "gen_logits_std": 2.7260026931762695, "gen_loss": 0.24359208345413208, "grad_norm": 0.4050165295136474, "learning_rate": 2.457221052631579e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9957754760980606, "mean_gen_accuracy": 0.8804001957178116, "mean_token_accuracy": 0.9080038666725159, "num_tokens": 280524369.0, "sample_num_tokens": 7867.75, "step": 5033, "total_num_tokens": 280555840.0, "z_loss": 0.0006165947415865958 }, { "copy_logits_max": -3.184300422668457, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.125, "epoch": 1.028184835333163, "gen_logits_max": 3.9799749851226807, "gen_logits_mean": -14.073183059692383, "gen_logits_min": -25.216386795043945, "gen_logits_std": 2.7126641273498535, "gen_loss": 0.2902370095252991, "grad_norm": 0.415329066597481, "learning_rate": 2.4570947368421054e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9954413920640945, "mean_gen_accuracy": 0.8813555538654327, "mean_token_accuracy": 0.9076801538467407, "num_tokens": 280788096.0, "sample_num_tokens": 8806.5, "step": 5034, "total_num_tokens": 280823322.0, "z_loss": 0.000686979154124856 }, { "copy_logits_max": -6.046469688415527, "copy_logits_min": -687500032.0, "copy_num_tokens": 484.25, "epoch": 1.02838907327036, "gen_logits_max": 4.142278671264648, "gen_logits_mean": -13.815886497497559, "gen_logits_min": -24.684856414794922, "gen_logits_std": 2.648247718811035, "gen_loss": 0.2596225440502167, "grad_norm": 0.41772713490408847, "learning_rate": 2.4569684210526315e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9958683997392654, "mean_gen_accuracy": 0.8741777539253235, "mean_token_accuracy": 0.9042665511369705, "num_tokens": 281046035.0, "sample_num_tokens": 8754.75, "step": 5035, "total_num_tokens": 281081054.0, "z_loss": 0.0005302302306517959 }, { "copy_logits_max": -5.9887285232543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.125, "epoch": 1.0285933112075567, "gen_logits_max": 3.6873836517333984, "gen_logits_mean": -15.837034225463867, "gen_logits_min": -26.92165756225586, "gen_logits_std": 2.7915215492248535, "gen_loss": 0.2707463502883911, "grad_norm": 0.45719680764573345, "learning_rate": 2.456842105263158e-05, "loss": 0.2977, "mean_copy_accuracy": 0.994812622666359, "mean_gen_accuracy": 0.8689763695001602, "mean_token_accuracy": 0.900464728474617, "num_tokens": 281310275.0, "sample_num_tokens": 7931.25, "step": 5036, "total_num_tokens": 281342000.0, "z_loss": 0.0005819425568915904 }, { "copy_logits_max": -6.5488810539245605, "copy_logits_min": -750000000.0, "copy_num_tokens": 701.9375, "epoch": 1.0287975491447536, "gen_logits_max": 3.486476421356201, "gen_logits_mean": -14.79967975616455, "gen_logits_min": -25.717172622680664, "gen_logits_std": 2.726675033569336, "gen_loss": 0.2598280608654022, "grad_norm": 0.38248895479685135, "learning_rate": 2.456715789473684e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9962031096220016, "mean_gen_accuracy": 0.8778730034828186, "mean_token_accuracy": 0.9096039980649948, "num_tokens": 281602590.0, "sample_num_tokens": 9793.0, "step": 5037, "total_num_tokens": 281641762.0, "z_loss": 0.0005653456901200116 }, { "copy_logits_max": -5.2031121253967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.875, "epoch": 1.0290017870819506, "gen_logits_max": 3.7275195121765137, "gen_logits_mean": -14.51305103302002, "gen_logits_min": -25.572757720947266, "gen_logits_std": 2.705838680267334, "gen_loss": 0.2922757565975189, "grad_norm": 0.3962735920027915, "learning_rate": 2.4565894736842108e-05, "loss": 0.275, "mean_copy_accuracy": 0.9951398223638535, "mean_gen_accuracy": 0.8781405687332153, "mean_token_accuracy": 0.9053212553262711, "num_tokens": 281853618.0, "sample_num_tokens": 8489.0, "step": 5038, "total_num_tokens": 281887574.0, "z_loss": 0.0005774974124506116 }, { "copy_logits_max": -6.241415500640869, "copy_logits_min": -750000000.0, "copy_num_tokens": 725.9375, "epoch": 1.0292060250191473, "gen_logits_max": 2.8782546520233154, "gen_logits_mean": -15.559142112731934, "gen_logits_min": -26.736482620239258, "gen_logits_std": 2.782522201538086, "gen_loss": 0.2327985167503357, "grad_norm": 0.41853380799724266, "learning_rate": 2.456463157894737e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9959084987640381, "mean_gen_accuracy": 0.874941498041153, "mean_token_accuracy": 0.9067503213882446, "num_tokens": 282126170.0, "sample_num_tokens": 10329.0, "step": 5039, "total_num_tokens": 282167486.0, "z_loss": 0.0004935772158205509 }, { "copy_logits_max": -6.126343727111816, "copy_logits_min": -750000064.0, "copy_num_tokens": 642.3125, "epoch": 1.0294102629563442, "gen_logits_max": 4.225702285766602, "gen_logits_mean": -13.791519165039062, "gen_logits_min": -25.051185607910156, "gen_logits_std": 2.633932590484619, "gen_loss": 0.26774853467941284, "grad_norm": 0.3559876753381173, "learning_rate": 2.4563368421052633e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9960682541131973, "mean_gen_accuracy": 0.8833546936511993, "mean_token_accuracy": 0.9132284671068192, "num_tokens": 282416770.0, "sample_num_tokens": 9818.0, "step": 5040, "total_num_tokens": 282456042.0, "z_loss": 0.000616309349425137 }, { "copy_logits_max": -5.764829635620117, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.375, "epoch": 1.029614500893541, "gen_logits_max": 4.389492511749268, "gen_logits_mean": -14.544355392456055, "gen_logits_min": -25.722858428955078, "gen_logits_std": 2.7340149879455566, "gen_loss": 0.34382501244544983, "grad_norm": 0.40449575548909233, "learning_rate": 2.4562105263157897e-05, "loss": 0.3059, "mean_copy_accuracy": 0.995092511177063, "mean_gen_accuracy": 0.8700366020202637, "mean_token_accuracy": 0.8952654302120209, "num_tokens": 282682623.0, "sample_num_tokens": 7062.75, "step": 5041, "total_num_tokens": 282710874.0, "z_loss": 0.0006982877384871244 }, { "copy_logits_max": -8.73725414276123, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.3125, "epoch": 1.0298187388307378, "gen_logits_max": 3.2625231742858887, "gen_logits_mean": -17.315797805786133, "gen_logits_min": -28.349645614624023, "gen_logits_std": 2.839426040649414, "gen_loss": 0.26334866881370544, "grad_norm": 0.4216100774397023, "learning_rate": 2.4560842105263158e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9956820607185364, "mean_gen_accuracy": 0.8811897039413452, "mean_token_accuracy": 0.9081854075193405, "num_tokens": 282945168.0, "sample_num_tokens": 7584.0, "step": 5042, "total_num_tokens": 282975504.0, "z_loss": 0.000494401203468442 }, { "copy_logits_max": -6.8899006843566895, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.3125, "epoch": 1.0300229767679348, "gen_logits_max": 4.249794960021973, "gen_logits_mean": -14.19198989868164, "gen_logits_min": -25.435596466064453, "gen_logits_std": 2.7270092964172363, "gen_loss": 0.2671946585178375, "grad_norm": 0.41285904110777866, "learning_rate": 2.4559578947368422e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9950875341892242, "mean_gen_accuracy": 0.872899129986763, "mean_token_accuracy": 0.9021845012903214, "num_tokens": 283201491.0, "sample_num_tokens": 8299.25, "step": 5043, "total_num_tokens": 283234688.0, "z_loss": 0.0005322502111084759 }, { "copy_logits_max": -7.047264099121094, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.875, "epoch": 1.0302272147051315, "gen_logits_max": 3.756923198699951, "gen_logits_mean": -15.914899826049805, "gen_logits_min": -27.132831573486328, "gen_logits_std": 2.7946901321411133, "gen_loss": 0.24589306116104126, "grad_norm": 0.3588193486881215, "learning_rate": 2.4558315789473683e-05, "loss": 0.2564, "mean_copy_accuracy": 0.9963044375181198, "mean_gen_accuracy": 0.8831792622804642, "mean_token_accuracy": 0.9121691733598709, "num_tokens": 283472202.0, "sample_num_tokens": 8465.0, "step": 5044, "total_num_tokens": 283506062.0, "z_loss": 0.0005117782857269049 }, { "copy_logits_max": -8.945173263549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.8125, "epoch": 1.0304314526423284, "gen_logits_max": 4.884258270263672, "gen_logits_mean": -13.881861686706543, "gen_logits_min": -24.4603328704834, "gen_logits_std": 2.644596576690674, "gen_loss": 0.32273024320602417, "grad_norm": 0.4209241957115228, "learning_rate": 2.4557052631578948e-05, "loss": 0.3134, "mean_copy_accuracy": 0.9958267360925674, "mean_gen_accuracy": 0.8614966422319412, "mean_token_accuracy": 0.8942521214485168, "num_tokens": 283743386.0, "sample_num_tokens": 10236.5, "step": 5045, "total_num_tokens": 283784332.0, "z_loss": 0.0006078286678530276 }, { "copy_logits_max": -4.902764797210693, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.0, "epoch": 1.030635690579525, "gen_logits_max": 3.645254135131836, "gen_logits_mean": -14.414937019348145, "gen_logits_min": -25.530351638793945, "gen_logits_std": 2.737658739089966, "gen_loss": 0.25926053524017334, "grad_norm": 0.3826674255085646, "learning_rate": 2.4555789473684212e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9960981458425522, "mean_gen_accuracy": 0.8702802211046219, "mean_token_accuracy": 0.9015525579452515, "num_tokens": 284026595.0, "sample_num_tokens": 9369.25, "step": 5046, "total_num_tokens": 284064072.0, "z_loss": 0.0005450246389955282 }, { "copy_logits_max": -6.7949538230896, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.8125, "epoch": 1.030839928516722, "gen_logits_max": 4.214114665985107, "gen_logits_mean": -15.849050521850586, "gen_logits_min": -26.84271812438965, "gen_logits_std": 2.78926157951355, "gen_loss": 0.2769564390182495, "grad_norm": 0.3802062066734658, "learning_rate": 2.4554526315789476e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9959670603275299, "mean_gen_accuracy": 0.8741207271814346, "mean_token_accuracy": 0.9031496495008469, "num_tokens": 284298695.0, "sample_num_tokens": 7713.75, "step": 5047, "total_num_tokens": 284329550.0, "z_loss": 0.0005915788351558149 }, { "copy_logits_max": -6.682029724121094, "copy_logits_min": -687500032.0, "copy_num_tokens": 302.125, "epoch": 1.0310441664539187, "gen_logits_max": 5.205192565917969, "gen_logits_mean": -14.42568588256836, "gen_logits_min": -25.42091178894043, "gen_logits_std": 2.7162868976593018, "gen_loss": 0.3182055950164795, "grad_norm": 0.4121258609653239, "learning_rate": 2.4553263157894737e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9962402582168579, "mean_gen_accuracy": 0.8708741664886475, "mean_token_accuracy": 0.8996078222990036, "num_tokens": 284572991.0, "sample_num_tokens": 7463.25, "step": 5048, "total_num_tokens": 284602844.0, "z_loss": 0.0006319079548120499 }, { "copy_logits_max": -8.177193641662598, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.8125, "epoch": 1.0312484043911156, "gen_logits_max": 4.434475421905518, "gen_logits_mean": -14.205048561096191, "gen_logits_min": -25.509429931640625, "gen_logits_std": 2.79396915435791, "gen_loss": 0.27924373745918274, "grad_norm": 0.40684245268009905, "learning_rate": 2.4552e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9950393587350845, "mean_gen_accuracy": 0.8755856454372406, "mean_token_accuracy": 0.9054805487394333, "num_tokens": 284850949.0, "sample_num_tokens": 8777.75, "step": 5049, "total_num_tokens": 284886060.0, "z_loss": 0.0005149122443981469 }, { "copy_logits_max": -6.4147748947143555, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.6875, "epoch": 1.0314526423283126, "gen_logits_max": 3.2099924087524414, "gen_logits_mean": -16.990089416503906, "gen_logits_min": -28.472009658813477, "gen_logits_std": 2.9026973247528076, "gen_loss": 0.2381342053413391, "grad_norm": 0.3933315146977368, "learning_rate": 2.4550736842105262e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9962601065635681, "mean_gen_accuracy": 0.8757808357477188, "mean_token_accuracy": 0.9059579074382782, "num_tokens": 285131270.0, "sample_num_tokens": 8037.0, "step": 5050, "total_num_tokens": 285163418.0, "z_loss": 0.00048504883307032287 }, { "copy_logits_max": -5.779204368591309, "copy_logits_min": -687500032.0, "copy_num_tokens": 705.75, "epoch": 1.0316568802655093, "gen_logits_max": 3.188105821609497, "gen_logits_mean": -15.781702041625977, "gen_logits_min": -27.785247802734375, "gen_logits_std": 2.8866586685180664, "gen_loss": 0.2329440712928772, "grad_norm": 0.38298720650241624, "learning_rate": 2.4549473684210527e-05, "loss": 0.2482, "mean_copy_accuracy": 0.9960924834012985, "mean_gen_accuracy": 0.8864518851041794, "mean_token_accuracy": 0.9149858057498932, "num_tokens": 285398886.0, "sample_num_tokens": 10370.5, "step": 5051, "total_num_tokens": 285440368.0, "z_loss": 0.000520972243975848 }, { "copy_logits_max": -6.275723934173584, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.875, "epoch": 1.0318611182027062, "gen_logits_max": 4.256430149078369, "gen_logits_mean": -14.493526458740234, "gen_logits_min": -25.870819091796875, "gen_logits_std": 2.830970525741577, "gen_loss": 0.3011304438114166, "grad_norm": 0.3965450496951069, "learning_rate": 2.4548210526315788e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9950816184282303, "mean_gen_accuracy": 0.878161683678627, "mean_token_accuracy": 0.9078847318887711, "num_tokens": 285677083.0, "sample_num_tokens": 7943.75, "step": 5052, "total_num_tokens": 285708858.0, "z_loss": 0.000606766261626035 }, { "copy_logits_max": -6.987390041351318, "copy_logits_min": -687500096.0, "copy_num_tokens": 364.625, "epoch": 1.032065356139903, "gen_logits_max": 3.9469757080078125, "gen_logits_mean": -15.644357681274414, "gen_logits_min": -27.54291534423828, "gen_logits_std": 2.8614981174468994, "gen_loss": 0.2740950584411621, "grad_norm": 0.4179210764245915, "learning_rate": 2.4546947368421052e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9957616031169891, "mean_gen_accuracy": 0.8789930790662766, "mean_token_accuracy": 0.9064915478229523, "num_tokens": 285934728.0, "sample_num_tokens": 7197.5, "step": 5053, "total_num_tokens": 285963518.0, "z_loss": 0.0005503739230334759 }, { "copy_logits_max": -6.87051248550415, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.125, "epoch": 1.0322695940770998, "gen_logits_max": 3.854224681854248, "gen_logits_mean": -17.025997161865234, "gen_logits_min": -28.75407600402832, "gen_logits_std": 2.923187732696533, "gen_loss": 0.29676514863967896, "grad_norm": 0.41472601820801064, "learning_rate": 2.454568421052632e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9942902773618698, "mean_gen_accuracy": 0.8760839104652405, "mean_token_accuracy": 0.9013058990240097, "num_tokens": 286196957.0, "sample_num_tokens": 7616.25, "step": 5054, "total_num_tokens": 286227422.0, "z_loss": 0.0005565998726524413 }, { "copy_logits_max": -7.778778553009033, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.375, "epoch": 1.0324738320142965, "gen_logits_max": 5.283987045288086, "gen_logits_mean": -14.678651809692383, "gen_logits_min": -25.93694305419922, "gen_logits_std": 2.752441883087158, "gen_loss": 0.288142591714859, "grad_norm": 0.4338552454960191, "learning_rate": 2.454442105263158e-05, "loss": 0.2995, "mean_copy_accuracy": 0.9959357529878616, "mean_gen_accuracy": 0.8719406127929688, "mean_token_accuracy": 0.9014019072055817, "num_tokens": 286481370.0, "sample_num_tokens": 7855.5, "step": 5055, "total_num_tokens": 286512792.0, "z_loss": 0.000557553255930543 }, { "copy_logits_max": -4.959662914276123, "copy_logits_min": -625000064.0, "copy_num_tokens": 616.5625, "epoch": 1.0326780699514935, "gen_logits_max": 4.34334659576416, "gen_logits_mean": -14.250188827514648, "gen_logits_min": -26.201128005981445, "gen_logits_std": 2.832871198654175, "gen_loss": 0.2772911489009857, "grad_norm": 0.41879523839880706, "learning_rate": 2.4543157894736845e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9947991222143173, "mean_gen_accuracy": 0.8718840181827545, "mean_token_accuracy": 0.900825023651123, "num_tokens": 286738221.0, "sample_num_tokens": 8497.75, "step": 5056, "total_num_tokens": 286772212.0, "z_loss": 0.000615612487308681 }, { "copy_logits_max": -6.139400482177734, "copy_logits_min": -750000000.0, "copy_num_tokens": 627.125, "epoch": 1.0328823078886904, "gen_logits_max": 5.999723434448242, "gen_logits_mean": -13.121097564697266, "gen_logits_min": -25.2467041015625, "gen_logits_std": 2.829519271850586, "gen_loss": 0.2608349323272705, "grad_norm": 0.4856517899532954, "learning_rate": 2.4541894736842106e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9966634660959244, "mean_gen_accuracy": 0.8818160146474838, "mean_token_accuracy": 0.9117761552333832, "num_tokens": 287004762.0, "sample_num_tokens": 9797.0, "step": 5057, "total_num_tokens": 287043950.0, "z_loss": 0.0005483850836753845 }, { "copy_logits_max": -7.907281875610352, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.125, "epoch": 1.033086545825887, "gen_logits_max": 4.363603591918945, "gen_logits_mean": -14.411800384521484, "gen_logits_min": -25.99413299560547, "gen_logits_std": 2.8415093421936035, "gen_loss": 0.2834736108779907, "grad_norm": 0.40693946658032715, "learning_rate": 2.454063157894737e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9962566792964935, "mean_gen_accuracy": 0.8798306435346603, "mean_token_accuracy": 0.9097971767187119, "num_tokens": 287271948.0, "sample_num_tokens": 9772.5, "step": 5058, "total_num_tokens": 287311038.0, "z_loss": 0.0005459100939333439 }, { "copy_logits_max": -7.224702835083008, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.25, "epoch": 1.033290783763084, "gen_logits_max": 5.212224960327148, "gen_logits_mean": -14.207365989685059, "gen_logits_min": -26.079360961914062, "gen_logits_std": 2.8362984657287598, "gen_loss": 0.2789255976676941, "grad_norm": 0.405411619439749, "learning_rate": 2.453936842105263e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9943878650665283, "mean_gen_accuracy": 0.8771711736917496, "mean_token_accuracy": 0.9004999250173569, "num_tokens": 287529347.0, "sample_num_tokens": 7167.25, "step": 5059, "total_num_tokens": 287558016.0, "z_loss": 0.0005169901414774358 }, { "copy_logits_max": -4.90748405456543, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.5625, "epoch": 1.0334950217002807, "gen_logits_max": 4.0252909660339355, "gen_logits_mean": -14.416065216064453, "gen_logits_min": -26.334402084350586, "gen_logits_std": 2.840791940689087, "gen_loss": 0.27012044191360474, "grad_norm": 0.43804381171813667, "learning_rate": 2.4538105263157895e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9944325089454651, "mean_gen_accuracy": 0.8797473907470703, "mean_token_accuracy": 0.9095123112201691, "num_tokens": 287789774.0, "sample_num_tokens": 8322.0, "step": 5060, "total_num_tokens": 287823062.0, "z_loss": 0.0005644811317324638 }, { "copy_logits_max": -5.105855941772461, "copy_logits_min": -750000064.0, "copy_num_tokens": 598.125, "epoch": 1.0336992596374777, "gen_logits_max": 4.157602787017822, "gen_logits_mean": -14.787751197814941, "gen_logits_min": -26.832292556762695, "gen_logits_std": 2.9028987884521484, "gen_loss": 0.2729986310005188, "grad_norm": 0.3816567823489523, "learning_rate": 2.4536842105263156e-05, "loss": 0.278, "mean_copy_accuracy": 0.9957340061664581, "mean_gen_accuracy": 0.8751284629106522, "mean_token_accuracy": 0.9057176113128662, "num_tokens": 288055910.0, "sample_num_tokens": 8439.0, "step": 5061, "total_num_tokens": 288089666.0, "z_loss": 0.0006302464753389359 }, { "copy_logits_max": -5.054512977600098, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.375, "epoch": 1.0339034975746746, "gen_logits_max": 4.343131065368652, "gen_logits_mean": -14.930998802185059, "gen_logits_min": -27.04743194580078, "gen_logits_std": 2.93284273147583, "gen_loss": 0.28855419158935547, "grad_norm": 0.38150229303921906, "learning_rate": 2.453557894736842e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9960636645555496, "mean_gen_accuracy": 0.8763256072998047, "mean_token_accuracy": 0.9070975482463837, "num_tokens": 288314642.0, "sample_num_tokens": 8176.0, "step": 5062, "total_num_tokens": 288347346.0, "z_loss": 0.0006575825973413885 }, { "copy_logits_max": -4.820074081420898, "copy_logits_min": -687499968.0, "copy_num_tokens": 546.125, "epoch": 1.0341077355118713, "gen_logits_max": 4.27040433883667, "gen_logits_mean": -15.442695617675781, "gen_logits_min": -27.463045120239258, "gen_logits_std": 2.923203945159912, "gen_loss": 0.25194111466407776, "grad_norm": 0.4050251437459555, "learning_rate": 2.4534315789473685e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9955703020095825, "mean_gen_accuracy": 0.876267209649086, "mean_token_accuracy": 0.9069005101919174, "num_tokens": 288578666.0, "sample_num_tokens": 9465.0, "step": 5063, "total_num_tokens": 288616526.0, "z_loss": 0.0004968889988958836 }, { "copy_logits_max": -7.180140495300293, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.5625, "epoch": 1.0343119734490682, "gen_logits_max": 4.7350664138793945, "gen_logits_mean": -15.477526664733887, "gen_logits_min": -27.55759048461914, "gen_logits_std": 2.8998029232025146, "gen_loss": 0.254830002784729, "grad_norm": 0.4212010027714787, "learning_rate": 2.453305263157895e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9940877109766006, "mean_gen_accuracy": 0.876678854227066, "mean_token_accuracy": 0.9027842432260513, "num_tokens": 288851509.0, "sample_num_tokens": 8395.75, "step": 5064, "total_num_tokens": 288885092.0, "z_loss": 0.0005106830503791571 }, { "copy_logits_max": -5.941390037536621, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.0625, "epoch": 1.034516211386265, "gen_logits_max": 5.273708343505859, "gen_logits_mean": -14.801595687866211, "gen_logits_min": -26.481212615966797, "gen_logits_std": 2.917382001876831, "gen_loss": 0.28327786922454834, "grad_norm": 0.40449081194199804, "learning_rate": 2.453178947368421e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9946137368679047, "mean_gen_accuracy": 0.8784107714891434, "mean_token_accuracy": 0.9048849791288376, "num_tokens": 289128035.0, "sample_num_tokens": 8952.25, "step": 5065, "total_num_tokens": 289163844.0, "z_loss": 0.0005461107357405126 }, { "copy_logits_max": -6.311614990234375, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.25, "epoch": 1.0347204493234619, "gen_logits_max": 4.961672782897949, "gen_logits_mean": -15.112979888916016, "gen_logits_min": -27.129243850708008, "gen_logits_std": 2.937016487121582, "gen_loss": 0.29349231719970703, "grad_norm": 0.37592516208352256, "learning_rate": 2.4530526315789474e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9975940883159637, "mean_gen_accuracy": 0.8705944567918777, "mean_token_accuracy": 0.9023420214653015, "num_tokens": 289400760.0, "sample_num_tokens": 8155.0, "step": 5066, "total_num_tokens": 289433380.0, "z_loss": 0.0006507826037704945 }, { "copy_logits_max": -6.4080939292907715, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5625, "epoch": 1.0349246872606586, "gen_logits_max": 4.468270301818848, "gen_logits_mean": -14.87681770324707, "gen_logits_min": -26.798080444335938, "gen_logits_std": 2.9079818725585938, "gen_loss": 0.28984758257865906, "grad_norm": 0.3906183533125046, "learning_rate": 2.452926315789474e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9968539178371429, "mean_gen_accuracy": 0.876715213060379, "mean_token_accuracy": 0.9063343852758408, "num_tokens": 289681743.0, "sample_num_tokens": 8428.75, "step": 5067, "total_num_tokens": 289715458.0, "z_loss": 0.0006193745066411793 }, { "copy_logits_max": -7.189919948577881, "copy_logits_min": -750000064.0, "copy_num_tokens": 619.0625, "epoch": 1.0351289251978555, "gen_logits_max": 4.61769437789917, "gen_logits_mean": -14.454078674316406, "gen_logits_min": -26.674060821533203, "gen_logits_std": 2.9621329307556152, "gen_loss": 0.26307904720306396, "grad_norm": 0.3887074499413078, "learning_rate": 2.4528e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9958399087190628, "mean_gen_accuracy": 0.8739742189645767, "mean_token_accuracy": 0.9053011536598206, "num_tokens": 289970590.0, "sample_num_tokens": 9687.5, "step": 5068, "total_num_tokens": 290009340.0, "z_loss": 0.0005219012382440269 }, { "copy_logits_max": -6.531876087188721, "copy_logits_min": -750000064.0, "copy_num_tokens": 427.5625, "epoch": 1.0353331631350524, "gen_logits_max": 5.848913192749023, "gen_logits_mean": -13.044595718383789, "gen_logits_min": -25.32514762878418, "gen_logits_std": 2.897062301635742, "gen_loss": 0.29824963212013245, "grad_norm": 0.38802955728399136, "learning_rate": 2.4526736842105264e-05, "loss": 0.2962, "mean_copy_accuracy": 0.996048629283905, "mean_gen_accuracy": 0.8701604604721069, "mean_token_accuracy": 0.8989980071783066, "num_tokens": 290244711.0, "sample_num_tokens": 8558.25, "step": 5069, "total_num_tokens": 290278944.0, "z_loss": 0.000533487182110548 }, { "copy_logits_max": -7.309467315673828, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.9375, "epoch": 1.0355374010722491, "gen_logits_max": 4.898630619049072, "gen_logits_mean": -14.481550216674805, "gen_logits_min": -26.86882209777832, "gen_logits_std": 2.9384405612945557, "gen_loss": 0.2556591033935547, "grad_norm": 2.9846028962612423, "learning_rate": 2.4525473684210525e-05, "loss": 0.2538, "mean_copy_accuracy": 0.9966946989297867, "mean_gen_accuracy": 0.8825558125972748, "mean_token_accuracy": 0.9155565202236176, "num_tokens": 290533465.0, "sample_num_tokens": 8468.25, "step": 5070, "total_num_tokens": 290567338.0, "z_loss": 0.0005266115185804665 }, { "copy_logits_max": -8.222350120544434, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.4375, "epoch": 1.035741639009446, "gen_logits_max": 4.439519882202148, "gen_logits_mean": -15.178224563598633, "gen_logits_min": -26.980985641479492, "gen_logits_std": 2.902803659439087, "gen_loss": 0.2520197033882141, "grad_norm": 0.3991511698842507, "learning_rate": 2.4524210526315793e-05, "loss": 0.2651, "mean_copy_accuracy": 0.9963403642177582, "mean_gen_accuracy": 0.8822214901447296, "mean_token_accuracy": 0.9106016308069229, "num_tokens": 290800427.0, "sample_num_tokens": 7924.75, "step": 5071, "total_num_tokens": 290832126.0, "z_loss": 0.0005050653708167374 }, { "copy_logits_max": -9.381169319152832, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.375, "epoch": 1.0359458769466428, "gen_logits_max": 4.90858268737793, "gen_logits_mean": -14.929300308227539, "gen_logits_min": -26.672239303588867, "gen_logits_std": 2.8613030910491943, "gen_loss": 0.29878848791122437, "grad_norm": 0.37913759271332054, "learning_rate": 2.4522947368421053e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9951481074094772, "mean_gen_accuracy": 0.8767051100730896, "mean_token_accuracy": 0.9029947221279144, "num_tokens": 291070118.0, "sample_num_tokens": 8382.0, "step": 5072, "total_num_tokens": 291103646.0, "z_loss": 0.0005675056600011885 }, { "copy_logits_max": -8.21339225769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.375, "epoch": 1.0361501148838397, "gen_logits_max": 5.15714693069458, "gen_logits_mean": -14.6920804977417, "gen_logits_min": -26.321590423583984, "gen_logits_std": 2.8975257873535156, "gen_loss": 0.2880997657775879, "grad_norm": 0.39538974302539426, "learning_rate": 2.4521684210526318e-05, "loss": 0.266, "mean_copy_accuracy": 0.9940015822649002, "mean_gen_accuracy": 0.890429675579071, "mean_token_accuracy": 0.9081399887800217, "num_tokens": 291321004.0, "sample_num_tokens": 7071.5, "step": 5073, "total_num_tokens": 291349290.0, "z_loss": 0.0005931458435952663 }, { "copy_logits_max": -7.572539329528809, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.4375, "epoch": 1.0363543528210366, "gen_logits_max": 4.205881118774414, "gen_logits_mean": -16.138172149658203, "gen_logits_min": -27.647796630859375, "gen_logits_std": 2.9193596839904785, "gen_loss": 0.27675771713256836, "grad_norm": 0.4144021438298247, "learning_rate": 2.452042105263158e-05, "loss": 0.2719, "mean_copy_accuracy": 0.995874285697937, "mean_gen_accuracy": 0.8780465424060822, "mean_token_accuracy": 0.9070820063352585, "num_tokens": 291573324.0, "sample_num_tokens": 8633.5, "step": 5074, "total_num_tokens": 291607858.0, "z_loss": 0.0006123829516582191 }, { "copy_logits_max": -6.041395664215088, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.6875, "epoch": 1.0365585907582333, "gen_logits_max": 4.895251750946045, "gen_logits_mean": -14.738041877746582, "gen_logits_min": -26.734649658203125, "gen_logits_std": 2.895247459411621, "gen_loss": 0.2811058759689331, "grad_norm": 0.3676093892096638, "learning_rate": 2.4519157894736843e-05, "loss": 0.2547, "mean_copy_accuracy": 0.9956431239843369, "mean_gen_accuracy": 0.8885139524936676, "mean_token_accuracy": 0.9130486100912094, "num_tokens": 291854447.0, "sample_num_tokens": 7350.75, "step": 5075, "total_num_tokens": 291883850.0, "z_loss": 0.0006746692233718932 }, { "copy_logits_max": -4.314522743225098, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.25, "epoch": 1.0367628286954302, "gen_logits_max": 4.866179466247559, "gen_logits_mean": -14.470391273498535, "gen_logits_min": -26.44794273376465, "gen_logits_std": 2.8942480087280273, "gen_loss": 0.2539598345756531, "grad_norm": 0.40998407675282683, "learning_rate": 2.4517894736842104e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9955752640962601, "mean_gen_accuracy": 0.8789070248603821, "mean_token_accuracy": 0.9060746729373932, "num_tokens": 292110933.0, "sample_num_tokens": 10765.75, "step": 5076, "total_num_tokens": 292153996.0, "z_loss": 0.0006553877610713243 }, { "copy_logits_max": -4.905263900756836, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.25, "epoch": 1.036967066632627, "gen_logits_max": 4.3529887199401855, "gen_logits_mean": -15.560677528381348, "gen_logits_min": -27.51378631591797, "gen_logits_std": 2.916152000427246, "gen_loss": 0.26848533749580383, "grad_norm": 0.3616557841141509, "learning_rate": 2.4516631578947368e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9954543262720108, "mean_gen_accuracy": 0.8796134740114212, "mean_token_accuracy": 0.9050865918397903, "num_tokens": 292394759.0, "sample_num_tokens": 9075.75, "step": 5077, "total_num_tokens": 292431062.0, "z_loss": 0.0006042362656444311 }, { "copy_logits_max": -6.1892499923706055, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.6875, "epoch": 1.0371713045698239, "gen_logits_max": 3.7495527267456055, "gen_logits_mean": -15.835505485534668, "gen_logits_min": -27.977638244628906, "gen_logits_std": 2.904820203781128, "gen_loss": 0.2274007350206375, "grad_norm": 0.4031987245542803, "learning_rate": 2.451536842105263e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9961922317743301, "mean_gen_accuracy": 0.8796066492795944, "mean_token_accuracy": 0.9068789631128311, "num_tokens": 292676848.0, "sample_num_tokens": 8861.5, "step": 5078, "total_num_tokens": 292712294.0, "z_loss": 0.0005508034373633564 }, { "copy_logits_max": -5.417972087860107, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.625, "epoch": 1.0373755425070206, "gen_logits_max": 4.102396011352539, "gen_logits_mean": -14.29587173461914, "gen_logits_min": -26.495811462402344, "gen_logits_std": 2.8666913509368896, "gen_loss": 0.2588401734828949, "grad_norm": 0.3649281039716428, "learning_rate": 2.4514105263157897e-05, "loss": 0.267, "mean_copy_accuracy": 0.996650755405426, "mean_gen_accuracy": 0.8772847652435303, "mean_token_accuracy": 0.9109031856060028, "num_tokens": 292967545.0, "sample_num_tokens": 9573.75, "step": 5079, "total_num_tokens": 293005840.0, "z_loss": 0.000589585630223155 }, { "copy_logits_max": -5.355695724487305, "copy_logits_min": -750000064.0, "copy_num_tokens": 565.75, "epoch": 1.0375797804442175, "gen_logits_max": 3.9504122734069824, "gen_logits_mean": -14.708670616149902, "gen_logits_min": -26.60698699951172, "gen_logits_std": 2.813570737838745, "gen_loss": 0.25680431723594666, "grad_norm": 0.38439202275730916, "learning_rate": 2.451284210526316e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9961064159870148, "mean_gen_accuracy": 0.8758295923471451, "mean_token_accuracy": 0.906436875462532, "num_tokens": 293244028.0, "sample_num_tokens": 9189.0, "step": 5080, "total_num_tokens": 293280784.0, "z_loss": 0.0005595178226940334 }, { "copy_logits_max": -5.351554870605469, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.625, "epoch": 1.0377840183814144, "gen_logits_max": 3.9025325775146484, "gen_logits_mean": -15.760764122009277, "gen_logits_min": -27.200637817382812, "gen_logits_std": 2.8418703079223633, "gen_loss": 0.2752950191497803, "grad_norm": 0.3741448130946619, "learning_rate": 2.4511578947368422e-05, "loss": 0.2742, "mean_copy_accuracy": 0.996622622013092, "mean_gen_accuracy": 0.8819343149662018, "mean_token_accuracy": 0.9087011069059372, "num_tokens": 293519158.0, "sample_num_tokens": 7402.0, "step": 5081, "total_num_tokens": 293548766.0, "z_loss": 0.0005780222127214074 }, { "copy_logits_max": -3.8878445625305176, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.75, "epoch": 1.0379882563186111, "gen_logits_max": 3.861722469329834, "gen_logits_mean": -15.89897346496582, "gen_logits_min": -28.053268432617188, "gen_logits_std": 2.939274787902832, "gen_loss": 0.2574976682662964, "grad_norm": 0.3960225559654314, "learning_rate": 2.4510315789473686e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9952130764722824, "mean_gen_accuracy": 0.8713498711585999, "mean_token_accuracy": 0.9023626744747162, "num_tokens": 293783475.0, "sample_num_tokens": 7800.75, "step": 5082, "total_num_tokens": 293814678.0, "z_loss": 0.0005698442691937089 }, { "copy_logits_max": -5.797228813171387, "copy_logits_min": -687500032.0, "copy_num_tokens": 695.5, "epoch": 1.038192494255808, "gen_logits_max": 3.766627550125122, "gen_logits_mean": -14.277032852172852, "gen_logits_min": -25.443077087402344, "gen_logits_std": 2.7629752159118652, "gen_loss": 0.2465352565050125, "grad_norm": 0.3723519189058896, "learning_rate": 2.4509052631578947e-05, "loss": 0.2651, "mean_copy_accuracy": 0.9968047738075256, "mean_gen_accuracy": 0.8786432892084122, "mean_token_accuracy": 0.9105085581541061, "num_tokens": 294079546.0, "sample_num_tokens": 11341.0, "step": 5083, "total_num_tokens": 294124910.0, "z_loss": 0.0005052760243415833 }, { "copy_logits_max": -5.338083267211914, "copy_logits_min": -687500032.0, "copy_num_tokens": 685.9375, "epoch": 1.0383967321930048, "gen_logits_max": 3.9532620906829834, "gen_logits_mean": -14.481420516967773, "gen_logits_min": -26.17562484741211, "gen_logits_std": 2.887169361114502, "gen_loss": 0.2701743543148041, "grad_norm": 0.3625862672521819, "learning_rate": 2.450778947368421e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9963131695985794, "mean_gen_accuracy": 0.8797707706689835, "mean_token_accuracy": 0.9086550772190094, "num_tokens": 294364256.0, "sample_num_tokens": 10429.0, "step": 5084, "total_num_tokens": 294405972.0, "z_loss": 0.0005683288909494877 }, { "copy_logits_max": -4.938760757446289, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.875, "epoch": 1.0386009701302017, "gen_logits_max": 4.087157249450684, "gen_logits_mean": -14.794300079345703, "gen_logits_min": -26.440040588378906, "gen_logits_std": 2.8360023498535156, "gen_loss": 0.2839471995830536, "grad_norm": 0.37755701869600283, "learning_rate": 2.4506526315789473e-05, "loss": 0.281, "mean_copy_accuracy": 0.9964907616376877, "mean_gen_accuracy": 0.8712075799703598, "mean_token_accuracy": 0.904167115688324, "num_tokens": 294653578.0, "sample_num_tokens": 8094.5, "step": 5085, "total_num_tokens": 294685956.0, "z_loss": 0.0005928803002461791 }, { "copy_logits_max": -5.266836166381836, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.9375, "epoch": 1.0388052080673986, "gen_logits_max": 3.936570882797241, "gen_logits_mean": -14.412420272827148, "gen_logits_min": -26.22368621826172, "gen_logits_std": 2.908437728881836, "gen_loss": 0.25925493240356445, "grad_norm": 0.376158660592845, "learning_rate": 2.4505263157894737e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9961313903331757, "mean_gen_accuracy": 0.8780229538679123, "mean_token_accuracy": 0.9077974557876587, "num_tokens": 294927740.0, "sample_num_tokens": 9332.5, "step": 5086, "total_num_tokens": 294965070.0, "z_loss": 0.000489933299832046 }, { "copy_logits_max": -6.23672342300415, "copy_logits_min": -687500032.0, "copy_num_tokens": 559.125, "epoch": 1.0390094460045953, "gen_logits_max": 3.7130494117736816, "gen_logits_mean": -14.329916000366211, "gen_logits_min": -25.90350341796875, "gen_logits_std": 2.7963132858276367, "gen_loss": 0.2958325743675232, "grad_norm": 0.3969027937500024, "learning_rate": 2.4504e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9954325556755066, "mean_gen_accuracy": 0.8714553713798523, "mean_token_accuracy": 0.901703491806984, "num_tokens": 295185440.0, "sample_num_tokens": 8927.0, "step": 5087, "total_num_tokens": 295221148.0, "z_loss": 0.0005403338582254946 }, { "copy_logits_max": -5.862151622772217, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.75, "epoch": 1.0392136839417923, "gen_logits_max": 4.2981276512146, "gen_logits_mean": -14.438138961791992, "gen_logits_min": -26.203779220581055, "gen_logits_std": 2.903782606124878, "gen_loss": 0.27373242378234863, "grad_norm": 0.4108311259711424, "learning_rate": 2.4502736842105266e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9951473772525787, "mean_gen_accuracy": 0.8750400245189667, "mean_token_accuracy": 0.9034470766782761, "num_tokens": 295473328.0, "sample_num_tokens": 7558.0, "step": 5088, "total_num_tokens": 295503560.0, "z_loss": 0.0005393065512180328 }, { "copy_logits_max": -6.123793601989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.1875, "epoch": 1.039417921878989, "gen_logits_max": 4.259611129760742, "gen_logits_mean": -15.273307800292969, "gen_logits_min": -27.001209259033203, "gen_logits_std": 2.9456584453582764, "gen_loss": 0.23999357223510742, "grad_norm": 0.37883715919121436, "learning_rate": 2.4501473684210526e-05, "loss": 0.2551, "mean_copy_accuracy": 0.9973513036966324, "mean_gen_accuracy": 0.8832881301641464, "mean_token_accuracy": 0.9147548973560333, "num_tokens": 295738984.0, "sample_num_tokens": 8794.5, "step": 5089, "total_num_tokens": 295774162.0, "z_loss": 0.0005039672832936049 }, { "copy_logits_max": -3.209444046020508, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.1875, "epoch": 1.0396221598161859, "gen_logits_max": 4.7841572761535645, "gen_logits_mean": -14.590288162231445, "gen_logits_min": -26.532625198364258, "gen_logits_std": 2.949948787689209, "gen_loss": 0.30250853300094604, "grad_norm": 0.39354996238779005, "learning_rate": 2.450021052631579e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9958425462245941, "mean_gen_accuracy": 0.8752859383821487, "mean_token_accuracy": 0.9030693769454956, "num_tokens": 296012906.0, "sample_num_tokens": 8621.0, "step": 5090, "total_num_tokens": 296047390.0, "z_loss": 0.0006317216320894659 }, { "copy_logits_max": -5.8611602783203125, "copy_logits_min": -625000064.0, "copy_num_tokens": 416.0, "epoch": 1.0398263977533826, "gen_logits_max": 3.227102279663086, "gen_logits_mean": -16.43256378173828, "gen_logits_min": -27.96969985961914, "gen_logits_std": 2.924253225326538, "gen_loss": 0.284409761428833, "grad_norm": 0.38504265099760676, "learning_rate": 2.449894736842105e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9967807531356812, "mean_gen_accuracy": 0.8764791786670685, "mean_token_accuracy": 0.9046667516231537, "num_tokens": 296289006.0, "sample_num_tokens": 7504.0, "step": 5091, "total_num_tokens": 296319022.0, "z_loss": 0.0005456049111671746 }, { "copy_logits_max": -5.062163352966309, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.0, "epoch": 1.0400306356905795, "gen_logits_max": 4.898883819580078, "gen_logits_mean": -14.652626991271973, "gen_logits_min": -26.654491424560547, "gen_logits_std": 2.9357166290283203, "gen_loss": 0.2952401638031006, "grad_norm": 0.42648146130814574, "learning_rate": 2.4497684210526316e-05, "loss": 0.2861, "mean_copy_accuracy": 0.995454341173172, "mean_gen_accuracy": 0.870411828160286, "mean_token_accuracy": 0.9025060087442398, "num_tokens": 296555141.0, "sample_num_tokens": 7962.25, "step": 5092, "total_num_tokens": 296586990.0, "z_loss": 0.0005568356136791408 }, { "copy_logits_max": -3.1575629711151123, "copy_logits_min": -750000064.0, "copy_num_tokens": 578.0, "epoch": 1.0402348736277764, "gen_logits_max": 4.4247236251831055, "gen_logits_mean": -13.983085632324219, "gen_logits_min": -26.413860321044922, "gen_logits_std": 2.9414167404174805, "gen_loss": 0.29054713249206543, "grad_norm": 0.3915034563276372, "learning_rate": 2.4496421052631577e-05, "loss": 0.2744, "mean_copy_accuracy": 0.995821088552475, "mean_gen_accuracy": 0.872699961066246, "mean_token_accuracy": 0.9070675820112228, "num_tokens": 296834194.0, "sample_num_tokens": 9171.5, "step": 5093, "total_num_tokens": 296870880.0, "z_loss": 0.0006236331537365913 }, { "copy_logits_max": -4.415550231933594, "copy_logits_min": -750000064.0, "copy_num_tokens": 422.375, "epoch": 1.0404391115649732, "gen_logits_max": 4.725953578948975, "gen_logits_mean": -14.06938362121582, "gen_logits_min": -25.931779861450195, "gen_logits_std": 2.8766207695007324, "gen_loss": 0.3021913468837738, "grad_norm": 0.42801443558368024, "learning_rate": 2.449515789473684e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9948435723781586, "mean_gen_accuracy": 0.8731847554445267, "mean_token_accuracy": 0.9037655740976334, "num_tokens": 297104621.0, "sample_num_tokens": 7813.75, "step": 5094, "total_num_tokens": 297135876.0, "z_loss": 0.0007022914942353964 }, { "copy_logits_max": -5.890833854675293, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.4375, "epoch": 1.04064334950217, "gen_logits_max": 6.0684895515441895, "gen_logits_mean": -13.75778579711914, "gen_logits_min": -25.298587799072266, "gen_logits_std": 2.9263298511505127, "gen_loss": 0.272305428981781, "grad_norm": 0.4375250461078435, "learning_rate": 2.449389473684211e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9957260489463806, "mean_gen_accuracy": 0.8760769069194794, "mean_token_accuracy": 0.9039495885372162, "num_tokens": 297345146.0, "sample_num_tokens": 7435.0, "step": 5095, "total_num_tokens": 297374886.0, "z_loss": 0.0005570200155489147 }, { "copy_logits_max": -5.302760124206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.8125, "epoch": 1.0408475874393668, "gen_logits_max": 4.973639965057373, "gen_logits_mean": -12.768327713012695, "gen_logits_min": -24.950397491455078, "gen_logits_std": 2.868605613708496, "gen_loss": 0.2363295704126358, "grad_norm": 0.3890279408234846, "learning_rate": 2.449263157894737e-05, "loss": 0.2467, "mean_copy_accuracy": 0.9963343888521194, "mean_gen_accuracy": 0.882327064871788, "mean_token_accuracy": 0.9160274118185043, "num_tokens": 297632216.0, "sample_num_tokens": 8652.5, "step": 5096, "total_num_tokens": 297666826.0, "z_loss": 0.0005484191933646798 }, { "copy_logits_max": -3.5759737491607666, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.9375, "epoch": 1.0410518253765637, "gen_logits_max": 5.477535247802734, "gen_logits_mean": -14.59609603881836, "gen_logits_min": -26.712413787841797, "gen_logits_std": 2.9355173110961914, "gen_loss": 0.3065609633922577, "grad_norm": 0.38179965954360945, "learning_rate": 2.4491368421052634e-05, "loss": 0.295, "mean_copy_accuracy": 0.9949488788843155, "mean_gen_accuracy": 0.873568519949913, "mean_token_accuracy": 0.9003806561231613, "num_tokens": 297905505.0, "sample_num_tokens": 8644.25, "step": 5097, "total_num_tokens": 297940082.0, "z_loss": 0.0006347134476527572 }, { "copy_logits_max": -4.017983436584473, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.5, "epoch": 1.0412560633137606, "gen_logits_max": 3.703087091445923, "gen_logits_mean": -16.14227294921875, "gen_logits_min": -27.93415069580078, "gen_logits_std": 2.957120656967163, "gen_loss": 0.26073962450027466, "grad_norm": 0.3976349625838369, "learning_rate": 2.4490105263157895e-05, "loss": 0.269, "mean_copy_accuracy": 0.9951041340827942, "mean_gen_accuracy": 0.879566103219986, "mean_token_accuracy": 0.9081501364707947, "num_tokens": 298170923.0, "sample_num_tokens": 8918.75, "step": 5098, "total_num_tokens": 298206598.0, "z_loss": 0.0005780712235718966 }, { "copy_logits_max": -6.403936386108398, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.0625, "epoch": 1.0414603012509573, "gen_logits_max": 3.871978282928467, "gen_logits_mean": -16.654829025268555, "gen_logits_min": -28.848323822021484, "gen_logits_std": 2.9794974327087402, "gen_loss": 0.2765446901321411, "grad_norm": 0.37491286572402804, "learning_rate": 2.448884210526316e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9948842078447342, "mean_gen_accuracy": 0.8784276843070984, "mean_token_accuracy": 0.904866099357605, "num_tokens": 298447330.0, "sample_num_tokens": 7599.5, "step": 5099, "total_num_tokens": 298477728.0, "z_loss": 0.0005215781857259572 }, { "copy_logits_max": -4.277756690979004, "copy_logits_min": -625000000.0, "copy_num_tokens": 873.25, "epoch": 1.0416645391881543, "gen_logits_max": 4.2486419677734375, "gen_logits_mean": -13.597634315490723, "gen_logits_min": -26.02583885192871, "gen_logits_std": 2.976768970489502, "gen_loss": 0.23242245614528656, "grad_norm": 0.387685295592305, "learning_rate": 2.448757894736842e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9961918592453003, "mean_gen_accuracy": 0.8713487088680267, "mean_token_accuracy": 0.907062754034996, "num_tokens": 298733313.0, "sample_num_tokens": 11281.75, "step": 5100, "total_num_tokens": 298778440.0, "z_loss": 0.000474313972517848 }, { "copy_logits_max": -4.395961761474609, "copy_logits_min": -750000064.0, "copy_num_tokens": 486.5625, "epoch": 1.041868777125351, "gen_logits_max": 4.8179497718811035, "gen_logits_mean": -14.204886436462402, "gen_logits_min": -26.249134063720703, "gen_logits_std": 2.9408743381500244, "gen_loss": 0.2897875905036926, "grad_norm": 0.3471859234868128, "learning_rate": 2.4486315789473685e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9971369206905365, "mean_gen_accuracy": 0.874168261885643, "mean_token_accuracy": 0.9060602337121964, "num_tokens": 299019761.0, "sample_num_tokens": 8486.25, "step": 5101, "total_num_tokens": 299053706.0, "z_loss": 0.0005142090376466513 }, { "copy_logits_max": -4.060250759124756, "copy_logits_min": -750000064.0, "copy_num_tokens": 463.6875, "epoch": 1.042073015062548, "gen_logits_max": 3.916386365890503, "gen_logits_mean": -16.060319900512695, "gen_logits_min": -28.315345764160156, "gen_logits_std": 2.9964656829833984, "gen_loss": 0.2704389989376068, "grad_norm": 0.38234510397481436, "learning_rate": 2.4485052631578946e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9958108812570572, "mean_gen_accuracy": 0.8742105215787888, "mean_token_accuracy": 0.9056956768035889, "num_tokens": 299289072.0, "sample_num_tokens": 7963.5, "step": 5102, "total_num_tokens": 299320926.0, "z_loss": 0.0005597951239906251 }, { "copy_logits_max": -6.349648475646973, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.25, "epoch": 1.0422772529997446, "gen_logits_max": 5.07269287109375, "gen_logits_mean": -14.925801277160645, "gen_logits_min": -26.341577529907227, "gen_logits_std": 2.9142065048217773, "gen_loss": 0.29351603984832764, "grad_norm": 0.3598809175885728, "learning_rate": 2.4483789473684213e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9964596480131149, "mean_gen_accuracy": 0.8765016198158264, "mean_token_accuracy": 0.9070861339569092, "num_tokens": 299568045.0, "sample_num_tokens": 8060.25, "step": 5103, "total_num_tokens": 299600286.0, "z_loss": 0.0005925118457525969 }, { "copy_logits_max": -5.973520278930664, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.8125, "epoch": 1.0424814909369415, "gen_logits_max": 3.860912799835205, "gen_logits_mean": -16.32577133178711, "gen_logits_min": -27.904632568359375, "gen_logits_std": 2.9355921745300293, "gen_loss": 0.2705261707305908, "grad_norm": 0.3546124521981169, "learning_rate": 2.4482526315789474e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9965973049402237, "mean_gen_accuracy": 0.8787278681993484, "mean_token_accuracy": 0.9121439903974533, "num_tokens": 299849006.0, "sample_num_tokens": 8860.5, "step": 5104, "total_num_tokens": 299884448.0, "z_loss": 0.0005968728801235557 }, { "copy_logits_max": -6.379658222198486, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.25, "epoch": 1.0426857288741385, "gen_logits_max": 5.04208517074585, "gen_logits_mean": -14.813807487487793, "gen_logits_min": -26.997066497802734, "gen_logits_std": 2.958596706390381, "gen_loss": 0.3024284243583679, "grad_norm": 0.4014431641051519, "learning_rate": 2.448126315789474e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9952951520681381, "mean_gen_accuracy": 0.8769180327653885, "mean_token_accuracy": 0.9053252041339874, "num_tokens": 300118348.0, "sample_num_tokens": 8588.0, "step": 5105, "total_num_tokens": 300152700.0, "z_loss": 0.0006209207931533456 }, { "copy_logits_max": -3.961768388748169, "copy_logits_min": -687500032.0, "copy_num_tokens": 281.25, "epoch": 1.0428899668113352, "gen_logits_max": 5.941640377044678, "gen_logits_mean": -13.488067626953125, "gen_logits_min": -25.391326904296875, "gen_logits_std": 2.9081335067749023, "gen_loss": 0.2866891026496887, "grad_norm": 0.3839864737411122, "learning_rate": 2.448e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9956180602312088, "mean_gen_accuracy": 0.881111204624176, "mean_token_accuracy": 0.9081342369318008, "num_tokens": 300387443.0, "sample_num_tokens": 7490.75, "step": 5106, "total_num_tokens": 300417406.0, "z_loss": 0.0006361827254295349 }, { "copy_logits_max": -4.767829895019531, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.0, "epoch": 1.043094204748532, "gen_logits_max": 5.761953830718994, "gen_logits_mean": -13.128837585449219, "gen_logits_min": -25.22769546508789, "gen_logits_std": 2.942885398864746, "gen_loss": 0.2601579427719116, "grad_norm": 0.4238940643880173, "learning_rate": 2.4478736842105264e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9957854747772217, "mean_gen_accuracy": 0.8713188171386719, "mean_token_accuracy": 0.901122972369194, "num_tokens": 300648877.0, "sample_num_tokens": 8362.75, "step": 5107, "total_num_tokens": 300682328.0, "z_loss": 0.0006580172339454293 }, { "copy_logits_max": -6.348423004150391, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.4375, "epoch": 1.0432984426857288, "gen_logits_max": 4.177035808563232, "gen_logits_mean": -16.701807022094727, "gen_logits_min": -27.928977966308594, "gen_logits_std": 2.874783992767334, "gen_loss": 0.25341808795928955, "grad_norm": 0.3807713941355715, "learning_rate": 2.4477473684210528e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9970459938049316, "mean_gen_accuracy": 0.8850599378347397, "mean_token_accuracy": 0.912012130022049, "num_tokens": 300922430.0, "sample_num_tokens": 7706.5, "step": 5108, "total_num_tokens": 300953256.0, "z_loss": 0.0005816511111333966 }, { "copy_logits_max": -5.431694030761719, "copy_logits_min": -750000064.0, "copy_num_tokens": 529.5625, "epoch": 1.0435026806229257, "gen_logits_max": 3.95967698097229, "gen_logits_mean": -15.32826042175293, "gen_logits_min": -26.7381534576416, "gen_logits_std": 2.8265647888183594, "gen_loss": 0.2614343762397766, "grad_norm": 0.4165516523946449, "learning_rate": 2.447621052631579e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9947154968976974, "mean_gen_accuracy": 0.8746249824762344, "mean_token_accuracy": 0.9019300639629364, "num_tokens": 301172177.0, "sample_num_tokens": 9496.25, "step": 5109, "total_num_tokens": 301210162.0, "z_loss": 0.0005787777481600642 }, { "copy_logits_max": -6.153713226318359, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.8125, "epoch": 1.0437069185601224, "gen_logits_max": 4.19141960144043, "gen_logits_mean": -15.532571792602539, "gen_logits_min": -26.766284942626953, "gen_logits_std": 2.8370890617370605, "gen_loss": 0.28819042444229126, "grad_norm": 0.3999739363861504, "learning_rate": 2.4474947368421053e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9949052333831787, "mean_gen_accuracy": 0.8723798394203186, "mean_token_accuracy": 0.9031340926885605, "num_tokens": 301438076.0, "sample_num_tokens": 8333.5, "step": 5110, "total_num_tokens": 301471410.0, "z_loss": 0.000548622221685946 }, { "copy_logits_max": -4.313976287841797, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.4375, "epoch": 1.0439111564973194, "gen_logits_max": 4.911036491394043, "gen_logits_mean": -13.905352592468262, "gen_logits_min": -25.482242584228516, "gen_logits_std": 2.8568313121795654, "gen_loss": 0.28394901752471924, "grad_norm": 0.38596377359729017, "learning_rate": 2.4473684210526318e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9960350096225739, "mean_gen_accuracy": 0.8736394941806793, "mean_token_accuracy": 0.9032200425863266, "num_tokens": 301719133.0, "sample_num_tokens": 8168.25, "step": 5111, "total_num_tokens": 301751806.0, "z_loss": 0.0006008620839565992 }, { "copy_logits_max": -2.901390314102173, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.375, "epoch": 1.0441153944345163, "gen_logits_max": 3.9183738231658936, "gen_logits_mean": -14.807258605957031, "gen_logits_min": -27.04951286315918, "gen_logits_std": 2.8734853267669678, "gen_loss": 0.2738857567310333, "grad_norm": 0.38085538935000635, "learning_rate": 2.4472421052631582e-05, "loss": 0.264, "mean_copy_accuracy": 0.9959575980901718, "mean_gen_accuracy": 0.8774979710578918, "mean_token_accuracy": 0.9103801399469376, "num_tokens": 301982879.0, "sample_num_tokens": 9684.25, "step": 5112, "total_num_tokens": 302021616.0, "z_loss": 0.0005737782921642065 }, { "copy_logits_max": -5.39723539352417, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.1875, "epoch": 1.044319632371713, "gen_logits_max": 5.021118640899658, "gen_logits_mean": -15.139979362487793, "gen_logits_min": -26.830034255981445, "gen_logits_std": 2.921649932861328, "gen_loss": 0.3154374659061432, "grad_norm": 0.39260292486122367, "learning_rate": 2.4471157894736843e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9945416301488876, "mean_gen_accuracy": 0.8772597908973694, "mean_token_accuracy": 0.9016805738210678, "num_tokens": 302247566.0, "sample_num_tokens": 7334.0, "step": 5113, "total_num_tokens": 302276902.0, "z_loss": 0.0006513121188618243 }, { "copy_logits_max": -5.565805912017822, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.875, "epoch": 1.04452387030891, "gen_logits_max": 3.4466705322265625, "gen_logits_mean": -16.546916961669922, "gen_logits_min": -27.795024871826172, "gen_logits_std": 2.872044801712036, "gen_loss": 0.28231489658355713, "grad_norm": 0.3896093886548219, "learning_rate": 2.4469894736842107e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9962607473134995, "mean_gen_accuracy": 0.8743881732225418, "mean_token_accuracy": 0.9025623053312302, "num_tokens": 302516449.0, "sample_num_tokens": 8180.75, "step": 5114, "total_num_tokens": 302549172.0, "z_loss": 0.0005150607321411371 }, { "copy_logits_max": -4.621695518493652, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.75, "epoch": 1.0447281082461066, "gen_logits_max": 4.598598480224609, "gen_logits_mean": -13.848800659179688, "gen_logits_min": -25.19586944580078, "gen_logits_std": 2.802527904510498, "gen_loss": 0.25880974531173706, "grad_norm": 0.41929326285203, "learning_rate": 2.4468631578947368e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9950165301561356, "mean_gen_accuracy": 0.8689025044441223, "mean_token_accuracy": 0.8984542787075043, "num_tokens": 302763504.0, "sample_num_tokens": 7831.5, "step": 5115, "total_num_tokens": 302794830.0, "z_loss": 0.000516245374456048 }, { "copy_logits_max": -3.277561664581299, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.1875, "epoch": 1.0449323461833036, "gen_logits_max": 4.139527320861816, "gen_logits_mean": -15.658445358276367, "gen_logits_min": -28.076190948486328, "gen_logits_std": 3.001549005508423, "gen_loss": 0.25206881761550903, "grad_norm": 0.386658431408015, "learning_rate": 2.4467368421052632e-05, "loss": 0.2804, "mean_copy_accuracy": 0.99702188372612, "mean_gen_accuracy": 0.8720286041498184, "mean_token_accuracy": 0.9044306874275208, "num_tokens": 303034339.0, "sample_num_tokens": 9014.25, "step": 5116, "total_num_tokens": 303070396.0, "z_loss": 0.0004739676078315824 }, { "copy_logits_max": -3.6132774353027344, "copy_logits_min": -687500032.0, "copy_num_tokens": 672.375, "epoch": 1.0451365841205005, "gen_logits_max": 3.953641891479492, "gen_logits_mean": -14.646968841552734, "gen_logits_min": -26.075176239013672, "gen_logits_std": 2.885793685913086, "gen_loss": 0.262484073638916, "grad_norm": 0.38718576097731583, "learning_rate": 2.4466105263157893e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9959961920976639, "mean_gen_accuracy": 0.8765256702899933, "mean_token_accuracy": 0.9050951451063156, "num_tokens": 303299819.0, "sample_num_tokens": 10601.25, "step": 5117, "total_num_tokens": 303342224.0, "z_loss": 0.0005598412244580686 }, { "copy_logits_max": -4.292802810668945, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 1.0453408220576972, "gen_logits_max": 4.025206565856934, "gen_logits_mean": -15.393061637878418, "gen_logits_min": -26.781034469604492, "gen_logits_std": 2.8685898780822754, "gen_loss": 0.27042561769485474, "grad_norm": 0.36912244629957897, "learning_rate": 2.4464842105263158e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9969018846750259, "mean_gen_accuracy": 0.8823688626289368, "mean_token_accuracy": 0.9090759605169296, "num_tokens": 303566580.0, "sample_num_tokens": 8592.5, "step": 5118, "total_num_tokens": 303600950.0, "z_loss": 0.0005013458430767059 }, { "copy_logits_max": -4.494258880615234, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.5625, "epoch": 1.045545059994894, "gen_logits_max": 4.237824440002441, "gen_logits_mean": -14.733125686645508, "gen_logits_min": -26.597698211669922, "gen_logits_std": 2.84810209274292, "gen_loss": 0.30388736724853516, "grad_norm": 0.4010156279491353, "learning_rate": 2.446357894736842e-05, "loss": 0.3004, "mean_copy_accuracy": 0.9966420233249664, "mean_gen_accuracy": 0.8646406680345535, "mean_token_accuracy": 0.8986412882804871, "num_tokens": 303856517.0, "sample_num_tokens": 8066.75, "step": 5119, "total_num_tokens": 303888784.0, "z_loss": 0.0006601748755201697 }, { "copy_logits_max": -6.952876567840576, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.1875, "epoch": 1.0457492979320908, "gen_logits_max": 4.331888675689697, "gen_logits_mean": -15.961133003234863, "gen_logits_min": -27.614078521728516, "gen_logits_std": 2.924060821533203, "gen_loss": 0.26217272877693176, "grad_norm": 0.3934280680098602, "learning_rate": 2.4462315789473686e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9958014339208603, "mean_gen_accuracy": 0.8705441504716873, "mean_token_accuracy": 0.9036533832550049, "num_tokens": 304141144.0, "sample_num_tokens": 8515.5, "step": 5120, "total_num_tokens": 304175206.0, "z_loss": 0.0005545708118006587 }, { "copy_logits_max": -6.313070297241211, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.25, "epoch": 1.0459535358692877, "gen_logits_max": 4.886431694030762, "gen_logits_mean": -14.646750450134277, "gen_logits_min": -26.67562484741211, "gen_logits_std": 2.9110894203186035, "gen_loss": 0.252218097448349, "grad_norm": 0.38439024575525993, "learning_rate": 2.446105263157895e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9963533133268356, "mean_gen_accuracy": 0.882278636097908, "mean_token_accuracy": 0.9098026603460312, "num_tokens": 304408382.0, "sample_num_tokens": 8745.5, "step": 5121, "total_num_tokens": 304443364.0, "z_loss": 0.0004769217921420932 }, { "copy_logits_max": -5.527300834655762, "copy_logits_min": -750000064.0, "copy_num_tokens": 398.5, "epoch": 1.0461577738064844, "gen_logits_max": 4.547778129577637, "gen_logits_mean": -15.141425132751465, "gen_logits_min": -27.205223083496094, "gen_logits_std": 2.9917054176330566, "gen_loss": 0.26354366540908813, "grad_norm": 0.41142093342530023, "learning_rate": 2.445978947368421e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9951510578393936, "mean_gen_accuracy": 0.8801787346601486, "mean_token_accuracy": 0.9082715809345245, "num_tokens": 304675372.0, "sample_num_tokens": 7713.5, "step": 5122, "total_num_tokens": 304706226.0, "z_loss": 0.0005889878375455737 }, { "copy_logits_max": -4.1316304206848145, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.6875, "epoch": 1.0463620117436814, "gen_logits_max": 3.79793119430542, "gen_logits_mean": -16.116708755493164, "gen_logits_min": -28.256851196289062, "gen_logits_std": 2.9388365745544434, "gen_loss": 0.30007559061050415, "grad_norm": 0.4366345869270143, "learning_rate": 2.4458526315789476e-05, "loss": 0.2972, "mean_copy_accuracy": 0.9967494010925293, "mean_gen_accuracy": 0.8624465018510818, "mean_token_accuracy": 0.8988785743713379, "num_tokens": 304948294.0, "sample_num_tokens": 9449.0, "step": 5123, "total_num_tokens": 304986090.0, "z_loss": 0.0006358522805385292 }, { "copy_logits_max": -5.66185998916626, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.6875, "epoch": 1.0465662496808783, "gen_logits_max": 4.844458103179932, "gen_logits_mean": -15.29510498046875, "gen_logits_min": -26.970354080200195, "gen_logits_std": 2.954596996307373, "gen_loss": 0.2807086408138275, "grad_norm": 0.3662266869659661, "learning_rate": 2.4457263157894737e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9959750026464462, "mean_gen_accuracy": 0.8733197599649429, "mean_token_accuracy": 0.9070886075496674, "num_tokens": 305233439.0, "sample_num_tokens": 8500.25, "step": 5124, "total_num_tokens": 305267440.0, "z_loss": 0.000669147411826998 }, { "copy_logits_max": -1.7122257947921753, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.6875, "epoch": 1.046770487618075, "gen_logits_max": 4.520931243896484, "gen_logits_mean": -14.374496459960938, "gen_logits_min": -26.41661262512207, "gen_logits_std": 2.9302425384521484, "gen_loss": 0.27816712856292725, "grad_norm": 0.4207168220114791, "learning_rate": 2.4456e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9963578134775162, "mean_gen_accuracy": 0.8700910955667496, "mean_token_accuracy": 0.9042845964431763, "num_tokens": 305494672.0, "sample_num_tokens": 8659.0, "step": 5125, "total_num_tokens": 305529308.0, "z_loss": 0.000608601316343993 }, { "copy_logits_max": -4.374207496643066, "copy_logits_min": -750000064.0, "copy_num_tokens": 521.0, "epoch": 1.046974725555272, "gen_logits_max": 4.782127857208252, "gen_logits_mean": -15.319979667663574, "gen_logits_min": -27.2397403717041, "gen_logits_std": 2.9779343605041504, "gen_loss": 0.2708386778831482, "grad_norm": 0.46653176515525785, "learning_rate": 2.4454736842105262e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9961859881877899, "mean_gen_accuracy": 0.8843212276697159, "mean_token_accuracy": 0.9121483415365219, "num_tokens": 305792252.0, "sample_num_tokens": 8457.5, "step": 5126, "total_num_tokens": 305826082.0, "z_loss": 0.0005740425549447536 }, { "copy_logits_max": -3.935164451599121, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.75, "epoch": 1.0471789634924686, "gen_logits_max": 4.523148536682129, "gen_logits_mean": -14.782560348510742, "gen_logits_min": -26.24558448791504, "gen_logits_std": 2.892864227294922, "gen_loss": 0.30599838495254517, "grad_norm": 0.4098705571356772, "learning_rate": 2.4453473684210526e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9947626292705536, "mean_gen_accuracy": 0.871450737118721, "mean_token_accuracy": 0.8978119790554047, "num_tokens": 306036108.0, "sample_num_tokens": 7564.5, "step": 5127, "total_num_tokens": 306066366.0, "z_loss": 0.0006297478103078902 }, { "copy_logits_max": -3.336482524871826, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.0, "epoch": 1.0473832014296656, "gen_logits_max": 4.954030513763428, "gen_logits_mean": -15.066978454589844, "gen_logits_min": -27.12602996826172, "gen_logits_std": 2.9552221298217773, "gen_loss": 0.27325335144996643, "grad_norm": 0.39795107798085344, "learning_rate": 2.445221052631579e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9968926906585693, "mean_gen_accuracy": 0.8728050142526627, "mean_token_accuracy": 0.9009881466627121, "num_tokens": 306306497.0, "sample_num_tokens": 8268.75, "step": 5128, "total_num_tokens": 306339572.0, "z_loss": 0.0005844773259013891 }, { "copy_logits_max": -4.790619850158691, "copy_logits_min": -750000000.0, "copy_num_tokens": 606.625, "epoch": 1.0475874393668625, "gen_logits_max": 3.588264226913452, "gen_logits_mean": -15.538145065307617, "gen_logits_min": -27.234554290771484, "gen_logits_std": 2.961031436920166, "gen_loss": 0.2523106336593628, "grad_norm": 0.42089067326427976, "learning_rate": 2.4450947368421055e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9962818175554276, "mean_gen_accuracy": 0.8723962008953094, "mean_token_accuracy": 0.9029497057199478, "num_tokens": 306582409.0, "sample_num_tokens": 9317.25, "step": 5129, "total_num_tokens": 306619678.0, "z_loss": 0.0004889196716248989 }, { "copy_logits_max": -6.748393535614014, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.6875, "epoch": 1.0477916773040592, "gen_logits_max": 4.429328441619873, "gen_logits_mean": -15.421656608581543, "gen_logits_min": -26.85628890991211, "gen_logits_std": 2.9222733974456787, "gen_loss": 0.27673035860061646, "grad_norm": 0.3595539381083271, "learning_rate": 2.4449684210526316e-05, "loss": 0.2589, "mean_copy_accuracy": 0.9945424944162369, "mean_gen_accuracy": 0.8846171647310257, "mean_token_accuracy": 0.9106975346803665, "num_tokens": 306854662.0, "sample_num_tokens": 7782.0, "step": 5130, "total_num_tokens": 306885790.0, "z_loss": 0.0005667888326570392 }, { "copy_logits_max": -7.6283721923828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 246.875, "epoch": 1.0479959152412561, "gen_logits_max": 5.5796027183532715, "gen_logits_mean": -15.503275871276855, "gen_logits_min": -26.982595443725586, "gen_logits_std": 2.92586612701416, "gen_loss": 0.32190340757369995, "grad_norm": 0.40674817115207107, "learning_rate": 2.444842105263158e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9957835078239441, "mean_gen_accuracy": 0.8723990619182587, "mean_token_accuracy": 0.9033212512731552, "num_tokens": 307116455.0, "sample_num_tokens": 7181.75, "step": 5131, "total_num_tokens": 307145182.0, "z_loss": 0.0006641709478572011 }, { "copy_logits_max": -5.207890510559082, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.4375, "epoch": 1.0482001531784528, "gen_logits_max": 5.4613752365112305, "gen_logits_mean": -14.511829376220703, "gen_logits_min": -26.599639892578125, "gen_logits_std": 2.9471545219421387, "gen_loss": 0.33292055130004883, "grad_norm": 0.41189288952914904, "learning_rate": 2.444715789473684e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9948568195104599, "mean_gen_accuracy": 0.8760003447532654, "mean_token_accuracy": 0.8984676152467728, "num_tokens": 307360834.0, "sample_num_tokens": 8041.5, "step": 5132, "total_num_tokens": 307393000.0, "z_loss": 0.0007228432805277407 }, { "copy_logits_max": -4.576393127441406, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.5625, "epoch": 1.0484043911156498, "gen_logits_max": 4.988116264343262, "gen_logits_mean": -14.650437355041504, "gen_logits_min": -27.012189865112305, "gen_logits_std": 2.968148708343506, "gen_loss": 0.2804471254348755, "grad_norm": 0.3791490876528799, "learning_rate": 2.4445894736842105e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9961892366409302, "mean_gen_accuracy": 0.874233141541481, "mean_token_accuracy": 0.9047843217849731, "num_tokens": 307642451.0, "sample_num_tokens": 8622.25, "step": 5133, "total_num_tokens": 307676940.0, "z_loss": 0.0006410939968191087 }, { "copy_logits_max": -6.154462814331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.3125, "epoch": 1.0486086290528465, "gen_logits_max": 4.029018878936768, "gen_logits_mean": -15.470924377441406, "gen_logits_min": -27.734840393066406, "gen_logits_std": 2.987096071243286, "gen_loss": 0.2568240165710449, "grad_norm": 0.37429029587047064, "learning_rate": 2.444463157894737e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9967340677976608, "mean_gen_accuracy": 0.8787104785442352, "mean_token_accuracy": 0.9064933657646179, "num_tokens": 307906242.0, "sample_num_tokens": 9614.0, "step": 5134, "total_num_tokens": 307944698.0, "z_loss": 0.0005411526653915644 }, { "copy_logits_max": -4.96066951751709, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.375, "epoch": 1.0488128669900434, "gen_logits_max": 4.397194862365723, "gen_logits_mean": -15.38742446899414, "gen_logits_min": -27.85480499267578, "gen_logits_std": 2.933140754699707, "gen_loss": 0.26797449588775635, "grad_norm": 0.38910690441367946, "learning_rate": 2.444336842105263e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9956748187541962, "mean_gen_accuracy": 0.8799159675836563, "mean_token_accuracy": 0.9085726141929626, "num_tokens": 308190573.0, "sample_num_tokens": 8682.75, "step": 5135, "total_num_tokens": 308225304.0, "z_loss": 0.0005891577457077801 }, { "copy_logits_max": -4.64750862121582, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.25, "epoch": 1.0490171049272403, "gen_logits_max": 4.710196495056152, "gen_logits_mean": -14.895994186401367, "gen_logits_min": -26.433544158935547, "gen_logits_std": 2.927882194519043, "gen_loss": 0.3063414692878723, "grad_norm": 0.380772535615262, "learning_rate": 2.4442105263157898e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9945138096809387, "mean_gen_accuracy": 0.8749255388975143, "mean_token_accuracy": 0.9017543643712997, "num_tokens": 308457152.0, "sample_num_tokens": 8984.5, "step": 5136, "total_num_tokens": 308493090.0, "z_loss": 0.0006598664331249893 }, { "copy_logits_max": -5.7219133377075195, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.1875, "epoch": 1.049221342864437, "gen_logits_max": 4.353615760803223, "gen_logits_mean": -15.342483520507812, "gen_logits_min": -27.149425506591797, "gen_logits_std": 2.932730197906494, "gen_loss": 0.2586357295513153, "grad_norm": 0.4361982124659742, "learning_rate": 2.444084210526316e-05, "loss": 0.2633, "mean_copy_accuracy": 0.997174933552742, "mean_gen_accuracy": 0.8766329884529114, "mean_token_accuracy": 0.9101132750511169, "num_tokens": 308739123.0, "sample_num_tokens": 9127.75, "step": 5137, "total_num_tokens": 308775634.0, "z_loss": 0.0005321870557963848 }, { "copy_logits_max": -4.079435348510742, "copy_logits_min": -687500096.0, "copy_num_tokens": 415.8125, "epoch": 1.049425580801634, "gen_logits_max": 5.468552589416504, "gen_logits_mean": -14.832372665405273, "gen_logits_min": -26.74406623840332, "gen_logits_std": 2.9657442569732666, "gen_loss": 0.28620612621307373, "grad_norm": 0.3813823093422002, "learning_rate": 2.4439578947368423e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9965887665748596, "mean_gen_accuracy": 0.8802118301391602, "mean_token_accuracy": 0.9108375608921051, "num_tokens": 309034180.0, "sample_num_tokens": 7994.0, "step": 5138, "total_num_tokens": 309066156.0, "z_loss": 0.000653247581794858 }, { "copy_logits_max": -4.967071533203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.75, "epoch": 1.0496298187388307, "gen_logits_max": 4.942164897918701, "gen_logits_mean": -13.336277961730957, "gen_logits_min": -25.295143127441406, "gen_logits_std": 2.953214168548584, "gen_loss": 0.28830432891845703, "grad_norm": 0.40107152749902797, "learning_rate": 2.4438315789473684e-05, "loss": 0.2805, "mean_copy_accuracy": 0.995945617556572, "mean_gen_accuracy": 0.8745278716087341, "mean_token_accuracy": 0.9041363596916199, "num_tokens": 309290361.0, "sample_num_tokens": 9418.75, "step": 5139, "total_num_tokens": 309328036.0, "z_loss": 0.0005379045614972711 }, { "copy_logits_max": -2.4370765686035156, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.5625, "epoch": 1.0498340566760276, "gen_logits_max": 4.653234481811523, "gen_logits_mean": -14.554739952087402, "gen_logits_min": -26.639318466186523, "gen_logits_std": 2.92673659324646, "gen_loss": 0.2492493987083435, "grad_norm": 0.41784518833069073, "learning_rate": 2.443705263157895e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9956563115119934, "mean_gen_accuracy": 0.871459499001503, "mean_token_accuracy": 0.9014760553836823, "num_tokens": 309561554.0, "sample_num_tokens": 8857.5, "step": 5140, "total_num_tokens": 309596984.0, "z_loss": 0.00048432074254378676 }, { "copy_logits_max": -7.779613494873047, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.4375, "epoch": 1.0500382946132245, "gen_logits_max": 4.393312931060791, "gen_logits_mean": -15.57356071472168, "gen_logits_min": -26.992721557617188, "gen_logits_std": 2.885728597640991, "gen_loss": 0.2870953679084778, "grad_norm": 0.35609552891897744, "learning_rate": 2.443578947368421e-05, "loss": 0.257, "mean_copy_accuracy": 0.9968536496162415, "mean_gen_accuracy": 0.8789814561605453, "mean_token_accuracy": 0.9132923632860184, "num_tokens": 309849193.0, "sample_num_tokens": 7671.75, "step": 5141, "total_num_tokens": 309879880.0, "z_loss": 0.000533353304490447 }, { "copy_logits_max": -6.260419845581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.25, "epoch": 1.0502425325504212, "gen_logits_max": 5.235091686248779, "gen_logits_mean": -14.582883834838867, "gen_logits_min": -26.134273529052734, "gen_logits_std": 2.9209742546081543, "gen_loss": 0.29992207884788513, "grad_norm": 0.3852900660828794, "learning_rate": 2.4434526315789474e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9967558234930038, "mean_gen_accuracy": 0.8776400238275528, "mean_token_accuracy": 0.9080827981233597, "num_tokens": 310122690.0, "sample_num_tokens": 7577.0, "step": 5142, "total_num_tokens": 310152998.0, "z_loss": 0.0005630579544231296 }, { "copy_logits_max": -2.612661838531494, "copy_logits_min": -750000000.0, "copy_num_tokens": 610.875, "epoch": 1.0504467704876181, "gen_logits_max": 4.308929443359375, "gen_logits_mean": -14.10043716430664, "gen_logits_min": -26.135469436645508, "gen_logits_std": 2.9160358905792236, "gen_loss": 0.26973745226860046, "grad_norm": 0.39949721147000167, "learning_rate": 2.4433263157894735e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9957262873649597, "mean_gen_accuracy": 0.8770129531621933, "mean_token_accuracy": 0.9066102653741837, "num_tokens": 310401930.0, "sample_num_tokens": 9175.5, "step": 5143, "total_num_tokens": 310438632.0, "z_loss": 0.00046524975914508104 }, { "copy_logits_max": -3.112730026245117, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.6875, "epoch": 1.0506510084248148, "gen_logits_max": 3.7704825401306152, "gen_logits_mean": -16.22183609008789, "gen_logits_min": -28.4049072265625, "gen_logits_std": 2.9791502952575684, "gen_loss": 0.26178550720214844, "grad_norm": 0.456207953878524, "learning_rate": 2.4432000000000003e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9940164983272552, "mean_gen_accuracy": 0.8807257860898972, "mean_token_accuracy": 0.9051127284765244, "num_tokens": 310661977.0, "sample_num_tokens": 7931.25, "step": 5144, "total_num_tokens": 310693702.0, "z_loss": 0.0005604028119705617 }, { "copy_logits_max": -4.225582122802734, "copy_logits_min": -687500032.0, "copy_num_tokens": 587.0, "epoch": 1.0508552463620118, "gen_logits_max": 4.071500778198242, "gen_logits_mean": -15.63540267944336, "gen_logits_min": -27.956825256347656, "gen_logits_std": 2.9804227352142334, "gen_loss": 0.2700052261352539, "grad_norm": 0.38501668484095614, "learning_rate": 2.4430736842105263e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9944547265768051, "mean_gen_accuracy": 0.8810847252607346, "mean_token_accuracy": 0.9050427675247192, "num_tokens": 310916619.0, "sample_num_tokens": 9638.25, "step": 5145, "total_num_tokens": 310955172.0, "z_loss": 0.0005722619243897498 }, { "copy_logits_max": -4.924245357513428, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.4375, "epoch": 1.0510594842992085, "gen_logits_max": 4.786427021026611, "gen_logits_mean": -14.351737976074219, "gen_logits_min": -26.052528381347656, "gen_logits_std": 2.8817262649536133, "gen_loss": 0.2722820043563843, "grad_norm": 0.35636007341732384, "learning_rate": 2.4429473684210528e-05, "loss": 0.27, "mean_copy_accuracy": 0.9962069094181061, "mean_gen_accuracy": 0.8823658376932144, "mean_token_accuracy": 0.9070509374141693, "num_tokens": 311177922.0, "sample_num_tokens": 8568.5, "step": 5146, "total_num_tokens": 311212196.0, "z_loss": 0.0006127191591076553 }, { "copy_logits_max": -5.373270034790039, "copy_logits_min": -625000064.0, "copy_num_tokens": 365.125, "epoch": 1.0512637222364054, "gen_logits_max": 5.569787502288818, "gen_logits_mean": -13.620552062988281, "gen_logits_min": -25.64607048034668, "gen_logits_std": 2.912932872772217, "gen_loss": 0.29118818044662476, "grad_norm": 0.4237763475703699, "learning_rate": 2.442821052631579e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9948849529027939, "mean_gen_accuracy": 0.8722886890172958, "mean_token_accuracy": 0.901121199131012, "num_tokens": 311444379.0, "sample_num_tokens": 8016.25, "step": 5147, "total_num_tokens": 311476444.0, "z_loss": 0.0006061353487893939 }, { "copy_logits_max": -3.1447527408599854, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.9375, "epoch": 1.0514679601736023, "gen_logits_max": 4.218325138092041, "gen_logits_mean": -14.7350435256958, "gen_logits_min": -26.796707153320312, "gen_logits_std": 2.9670357704162598, "gen_loss": 0.29025188088417053, "grad_norm": 0.383580232987945, "learning_rate": 2.4426947368421053e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9968633949756622, "mean_gen_accuracy": 0.8707040399312973, "mean_token_accuracy": 0.9045374542474747, "num_tokens": 311709514.0, "sample_num_tokens": 8662.0, "step": 5148, "total_num_tokens": 311744162.0, "z_loss": 0.0006318285595625639 }, { "copy_logits_max": -6.390213966369629, "copy_logits_min": -750000064.0, "copy_num_tokens": 511.3125, "epoch": 1.051672198110799, "gen_logits_max": 4.114136695861816, "gen_logits_mean": -16.060958862304688, "gen_logits_min": -27.744739532470703, "gen_logits_std": 2.933342695236206, "gen_loss": 0.273564875125885, "grad_norm": 0.3708028335675249, "learning_rate": 2.4425684210526317e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9953030049800873, "mean_gen_accuracy": 0.8812257796525955, "mean_token_accuracy": 0.9052116125822067, "num_tokens": 311990842.0, "sample_num_tokens": 9133.0, "step": 5149, "total_num_tokens": 312027374.0, "z_loss": 0.0005571846850216389 }, { "copy_logits_max": -5.595473766326904, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.9375, "epoch": 1.051876436047996, "gen_logits_max": 4.573188304901123, "gen_logits_mean": -15.309192657470703, "gen_logits_min": -27.31424331665039, "gen_logits_std": 2.9665098190307617, "gen_loss": 0.2886883318424225, "grad_norm": 0.40368622316088315, "learning_rate": 2.4424421052631578e-05, "loss": 0.283, "mean_copy_accuracy": 0.9950480759143829, "mean_gen_accuracy": 0.8730349391698837, "mean_token_accuracy": 0.9027533680200577, "num_tokens": 312261233.0, "sample_num_tokens": 7498.75, "step": 5150, "total_num_tokens": 312291228.0, "z_loss": 0.0005895608337596059 }, { "copy_logits_max": -6.580375671386719, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.1875, "epoch": 1.0520806739851927, "gen_logits_max": 5.227869987487793, "gen_logits_mean": -14.24765396118164, "gen_logits_min": -26.1453857421875, "gen_logits_std": 2.885178565979004, "gen_loss": 0.26930516958236694, "grad_norm": 0.42763413230607034, "learning_rate": 2.4423157894736843e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9954230189323425, "mean_gen_accuracy": 0.8831272274255753, "mean_token_accuracy": 0.9076777249574661, "num_tokens": 312540880.0, "sample_num_tokens": 7934.0, "step": 5151, "total_num_tokens": 312572616.0, "z_loss": 0.0005427446449175477 }, { "copy_logits_max": -4.766568183898926, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.0, "epoch": 1.0522849119223896, "gen_logits_max": 6.002275466918945, "gen_logits_mean": -12.509552001953125, "gen_logits_min": -24.42825698852539, "gen_logits_std": 2.91029691696167, "gen_loss": 0.3087505102157593, "grad_norm": 0.48629466170394625, "learning_rate": 2.4421894736842107e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9965025782585144, "mean_gen_accuracy": 0.8720415681600571, "mean_token_accuracy": 0.9078290164470673, "num_tokens": 312821905.0, "sample_num_tokens": 9059.75, "step": 5152, "total_num_tokens": 312858144.0, "z_loss": 0.000593076809309423 }, { "copy_logits_max": -5.919126510620117, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.0625, "epoch": 1.0524891498595865, "gen_logits_max": 5.653581142425537, "gen_logits_mean": -14.447771072387695, "gen_logits_min": -26.212263107299805, "gen_logits_std": 2.8900704383850098, "gen_loss": 0.3128330707550049, "grad_norm": 0.4292389625828592, "learning_rate": 2.442063157894737e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9958192706108093, "mean_gen_accuracy": 0.872326210141182, "mean_token_accuracy": 0.9037792086601257, "num_tokens": 313092136.0, "sample_num_tokens": 8416.5, "step": 5153, "total_num_tokens": 313125802.0, "z_loss": 0.000647154520265758 }, { "copy_logits_max": -8.647881507873535, "copy_logits_min": -687500032.0, "copy_num_tokens": 179.125, "epoch": 1.0526933877967832, "gen_logits_max": 6.359164714813232, "gen_logits_mean": -15.409046173095703, "gen_logits_min": -26.694175720214844, "gen_logits_std": 2.906050205230713, "gen_loss": 0.28496262431144714, "grad_norm": 0.437375109709643, "learning_rate": 2.4419368421052632e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9937741905450821, "mean_gen_accuracy": 0.8794702142477036, "mean_token_accuracy": 0.9030930995941162, "num_tokens": 313356917.0, "sample_num_tokens": 6914.75, "step": 5154, "total_num_tokens": 313384576.0, "z_loss": 0.0006001442088745534 }, { "copy_logits_max": -6.049924850463867, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.0625, "epoch": 1.0528976257339802, "gen_logits_max": 5.051836967468262, "gen_logits_mean": -14.520954132080078, "gen_logits_min": -26.2216796875, "gen_logits_std": 2.918384075164795, "gen_loss": 0.3044523000717163, "grad_norm": 0.41399935054765313, "learning_rate": 2.4418105263157896e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9949219077825546, "mean_gen_accuracy": 0.8738346695899963, "mean_token_accuracy": 0.901859924197197, "num_tokens": 313609767.0, "sample_num_tokens": 8318.75, "step": 5155, "total_num_tokens": 313643042.0, "z_loss": 0.0006580348126590252 }, { "copy_logits_max": -3.362278938293457, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.125, "epoch": 1.0531018636711769, "gen_logits_max": 5.316508769989014, "gen_logits_mean": -13.915975570678711, "gen_logits_min": -25.638687133789062, "gen_logits_std": 2.895125389099121, "gen_loss": 0.3119460940361023, "grad_norm": 0.33073458144673923, "learning_rate": 2.4416842105263157e-05, "loss": 0.2526, "mean_copy_accuracy": 0.9965885281562805, "mean_gen_accuracy": 0.8809789270162582, "mean_token_accuracy": 0.9148602485656738, "num_tokens": 313908292.0, "sample_num_tokens": 8309.0, "step": 5156, "total_num_tokens": 313941528.0, "z_loss": 0.0006945976056158543 }, { "copy_logits_max": -3.8996589183807373, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.875, "epoch": 1.0533061016083738, "gen_logits_max": 5.13070011138916, "gen_logits_mean": -14.572135925292969, "gen_logits_min": -26.764713287353516, "gen_logits_std": 2.946755886077881, "gen_loss": 0.3089737892150879, "grad_norm": 0.3710191536636839, "learning_rate": 2.441557894736842e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9959924072027206, "mean_gen_accuracy": 0.875840812921524, "mean_token_accuracy": 0.9027990102767944, "num_tokens": 314178228.0, "sample_num_tokens": 7821.0, "step": 5157, "total_num_tokens": 314209512.0, "z_loss": 0.0006540589965879917 }, { "copy_logits_max": -4.913527965545654, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.5, "epoch": 1.0535103395455705, "gen_logits_max": 4.757184028625488, "gen_logits_mean": -15.270840644836426, "gen_logits_min": -27.336414337158203, "gen_logits_std": 2.964949131011963, "gen_loss": 0.3048187494277954, "grad_norm": 0.4220635703818831, "learning_rate": 2.4414315789473683e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9965018481016159, "mean_gen_accuracy": 0.8802276104688644, "mean_token_accuracy": 0.9140331298112869, "num_tokens": 314485977.0, "sample_num_tokens": 7897.75, "step": 5158, "total_num_tokens": 314517568.0, "z_loss": 0.0005954935331828892 }, { "copy_logits_max": -7.107889652252197, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.3125, "epoch": 1.0537145774827674, "gen_logits_max": 5.381984233856201, "gen_logits_mean": -15.580856323242188, "gen_logits_min": -27.341827392578125, "gen_logits_std": 2.9791641235351562, "gen_loss": 0.24444550275802612, "grad_norm": 0.38997579005553984, "learning_rate": 2.4413052631578947e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9965891987085342, "mean_gen_accuracy": 0.8748794496059418, "mean_token_accuracy": 0.9098363518714905, "num_tokens": 314773979.0, "sample_num_tokens": 8440.75, "step": 5159, "total_num_tokens": 314807742.0, "z_loss": 0.0005024287384003401 }, { "copy_logits_max": -6.646490097045898, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.25, "epoch": 1.0539188154199643, "gen_logits_max": 5.203892707824707, "gen_logits_mean": -14.393094062805176, "gen_logits_min": -25.87800407409668, "gen_logits_std": 2.9246115684509277, "gen_loss": 0.31336671113967896, "grad_norm": 0.37570317661279273, "learning_rate": 2.441178947368421e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9968662708997726, "mean_gen_accuracy": 0.8752555847167969, "mean_token_accuracy": 0.9001199752092361, "num_tokens": 315054507.0, "sample_num_tokens": 7998.75, "step": 5160, "total_num_tokens": 315086502.0, "z_loss": 0.0006194042507559061 }, { "copy_logits_max": -6.156925201416016, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5625, "epoch": 1.054123053357161, "gen_logits_max": 4.200809478759766, "gen_logits_mean": -16.126754760742188, "gen_logits_min": -27.988311767578125, "gen_logits_std": 2.966299057006836, "gen_loss": 0.26812630891799927, "grad_norm": 0.4124788357684902, "learning_rate": 2.4410526315789476e-05, "loss": 0.3067, "mean_copy_accuracy": 0.9950177818536758, "mean_gen_accuracy": 0.8689973205327988, "mean_token_accuracy": 0.8960185199975967, "num_tokens": 315287726.0, "sample_num_tokens": 7593.5, "step": 5161, "total_num_tokens": 315318100.0, "z_loss": 0.0005487184971570969 }, { "copy_logits_max": -6.5563812255859375, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.125, "epoch": 1.054327291294358, "gen_logits_max": 4.570897102355957, "gen_logits_mean": -14.915794372558594, "gen_logits_min": -26.33969497680664, "gen_logits_std": 2.8930540084838867, "gen_loss": 0.2533777356147766, "grad_norm": 0.38407358850195034, "learning_rate": 2.440926315789474e-05, "loss": 0.2542, "mean_copy_accuracy": 0.9948256611824036, "mean_gen_accuracy": 0.8856339752674103, "mean_token_accuracy": 0.9125235825777054, "num_tokens": 315547265.0, "sample_num_tokens": 7682.75, "step": 5162, "total_num_tokens": 315577996.0, "z_loss": 0.0004348818911239505 }, { "copy_logits_max": -5.961151123046875, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.6875, "epoch": 1.0545315292315547, "gen_logits_max": 3.927353620529175, "gen_logits_mean": -15.876978874206543, "gen_logits_min": -27.787338256835938, "gen_logits_std": 2.9870765209198, "gen_loss": 0.28421750664711, "grad_norm": 0.3444785741865402, "learning_rate": 2.4408e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9963745325803757, "mean_gen_accuracy": 0.879796102643013, "mean_token_accuracy": 0.9111742079257965, "num_tokens": 315834893.0, "sample_num_tokens": 8846.25, "step": 5163, "total_num_tokens": 315870278.0, "z_loss": 0.0005390611477196217 }, { "copy_logits_max": -9.834281921386719, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.75, "epoch": 1.0547357671687516, "gen_logits_max": 4.416961193084717, "gen_logits_mean": -16.285213470458984, "gen_logits_min": -27.66712188720703, "gen_logits_std": 2.9264144897460938, "gen_loss": 0.2602401375770569, "grad_norm": 0.38138874621312246, "learning_rate": 2.4406736842105265e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9964934885501862, "mean_gen_accuracy": 0.8806821554899216, "mean_token_accuracy": 0.9074308127164841, "num_tokens": 316104545.0, "sample_num_tokens": 8768.75, "step": 5164, "total_num_tokens": 316139620.0, "z_loss": 0.0004797307192347944 }, { "copy_logits_max": -7.479488849639893, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.4375, "epoch": 1.0549400051059483, "gen_logits_max": 4.906956672668457, "gen_logits_mean": -15.976222038269043, "gen_logits_min": -27.695068359375, "gen_logits_std": 2.958223342895508, "gen_loss": 0.29055649042129517, "grad_norm": 0.40465158350809527, "learning_rate": 2.4405473684210526e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9959579259157181, "mean_gen_accuracy": 0.8782330751419067, "mean_token_accuracy": 0.9081425964832306, "num_tokens": 316387704.0, "sample_num_tokens": 8002.0, "step": 5165, "total_num_tokens": 316419712.0, "z_loss": 0.0005099255358800292 }, { "copy_logits_max": -6.295577049255371, "copy_logits_min": -687500032.0, "copy_num_tokens": 748.0625, "epoch": 1.0551442430431452, "gen_logits_max": 3.2156801223754883, "gen_logits_mean": -15.663362503051758, "gen_logits_min": -27.734275817871094, "gen_logits_std": 2.9578661918640137, "gen_loss": 0.2416483461856842, "grad_norm": 0.3723771591643547, "learning_rate": 2.440421052631579e-05, "loss": 0.267, "mean_copy_accuracy": 0.995522677898407, "mean_gen_accuracy": 0.8787826001644135, "mean_token_accuracy": 0.9107121825218201, "num_tokens": 316663764.0, "sample_num_tokens": 10113.5, "step": 5166, "total_num_tokens": 316704218.0, "z_loss": 0.0005369535647332668 }, { "copy_logits_max": -6.614463806152344, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.0, "epoch": 1.0553484809803422, "gen_logits_max": 4.217583656311035, "gen_logits_mean": -14.708961486816406, "gen_logits_min": -26.47164535522461, "gen_logits_std": 2.8663980960845947, "gen_loss": 0.28134483098983765, "grad_norm": 0.40658089890889726, "learning_rate": 2.440294736842105e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9969069957733154, "mean_gen_accuracy": 0.8732930123806, "mean_token_accuracy": 0.9062830209732056, "num_tokens": 316930289.0, "sample_num_tokens": 8285.75, "step": 5167, "total_num_tokens": 316963432.0, "z_loss": 0.00055097317090258 }, { "copy_logits_max": -4.735511779785156, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.375, "epoch": 1.0555527189175389, "gen_logits_max": 3.486330032348633, "gen_logits_mean": -14.532089233398438, "gen_logits_min": -26.714149475097656, "gen_logits_std": 2.9138612747192383, "gen_loss": 0.26155510544776917, "grad_norm": 0.39822405088713997, "learning_rate": 2.4401684210526316e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9972509741783142, "mean_gen_accuracy": 0.8800641447305679, "mean_token_accuracy": 0.9106031060218811, "num_tokens": 317200546.0, "sample_num_tokens": 9244.5, "step": 5168, "total_num_tokens": 317237524.0, "z_loss": 0.0005719388136640191 }, { "copy_logits_max": -6.614217281341553, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.75, "epoch": 1.0557569568547358, "gen_logits_max": 4.0675435066223145, "gen_logits_mean": -15.547306060791016, "gen_logits_min": -27.229846954345703, "gen_logits_std": 2.913877010345459, "gen_loss": 0.29907649755477905, "grad_norm": 0.3731082772468799, "learning_rate": 2.440042105263158e-05, "loss": 0.2734, "mean_copy_accuracy": 0.995833158493042, "mean_gen_accuracy": 0.8747938871383667, "mean_token_accuracy": 0.9083354473114014, "num_tokens": 317488439.0, "sample_num_tokens": 8035.25, "step": 5169, "total_num_tokens": 317520580.0, "z_loss": 0.0005365915130823851 }, { "copy_logits_max": -7.892485618591309, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.4375, "epoch": 1.0559611947919325, "gen_logits_max": 4.173854351043701, "gen_logits_mean": -15.62950611114502, "gen_logits_min": -27.635936737060547, "gen_logits_std": 2.9777629375457764, "gen_loss": 0.27961280941963196, "grad_norm": 0.3738173246857688, "learning_rate": 2.4399157894736844e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9962585866451263, "mean_gen_accuracy": 0.8750843852758408, "mean_token_accuracy": 0.9073722958564758, "num_tokens": 317766839.0, "sample_num_tokens": 7424.75, "step": 5170, "total_num_tokens": 317796538.0, "z_loss": 0.0005246312357485294 }, { "copy_logits_max": -8.12575912475586, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.8125, "epoch": 1.0561654327291294, "gen_logits_max": 4.446794033050537, "gen_logits_mean": -15.185891151428223, "gen_logits_min": -26.94795036315918, "gen_logits_std": 2.9183716773986816, "gen_loss": 0.29671692848205566, "grad_norm": 0.4777181729923731, "learning_rate": 2.4397894736842105e-05, "loss": 0.3116, "mean_copy_accuracy": 0.995262399315834, "mean_gen_accuracy": 0.8683591037988663, "mean_token_accuracy": 0.8939779251813889, "num_tokens": 317999725.0, "sample_num_tokens": 8221.75, "step": 5171, "total_num_tokens": 318032612.0, "z_loss": 0.0005685610231012106 }, { "copy_logits_max": -7.491711616516113, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.5, "epoch": 1.0563696706663264, "gen_logits_max": 5.956407070159912, "gen_logits_mean": -13.26371955871582, "gen_logits_min": -24.994983673095703, "gen_logits_std": 2.8951077461242676, "gen_loss": 0.2649504542350769, "grad_norm": 0.3804796642094753, "learning_rate": 2.439663157894737e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9962609708309174, "mean_gen_accuracy": 0.8789908438920975, "mean_token_accuracy": 0.9069938361644745, "num_tokens": 318285799.0, "sample_num_tokens": 8501.25, "step": 5172, "total_num_tokens": 318319804.0, "z_loss": 0.0004680345591623336 }, { "copy_logits_max": -7.455148220062256, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.1875, "epoch": 1.056573908603523, "gen_logits_max": 4.7900309562683105, "gen_logits_mean": -15.830862045288086, "gen_logits_min": -27.505571365356445, "gen_logits_std": 2.9547462463378906, "gen_loss": 0.3109877109527588, "grad_norm": 0.38320407945285245, "learning_rate": 2.439536842105263e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9957137107849121, "mean_gen_accuracy": 0.876070037484169, "mean_token_accuracy": 0.9064570814371109, "num_tokens": 318566754.0, "sample_num_tokens": 7133.5, "step": 5173, "total_num_tokens": 318595288.0, "z_loss": 0.0005510968621820211 }, { "copy_logits_max": -6.953568935394287, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.875, "epoch": 1.05677814654072, "gen_logits_max": 5.2787981033325195, "gen_logits_mean": -15.168664932250977, "gen_logits_min": -26.934711456298828, "gen_logits_std": 2.9244003295898438, "gen_loss": 0.30893775820732117, "grad_norm": 0.4097825163290002, "learning_rate": 2.4394105263157895e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9957732409238815, "mean_gen_accuracy": 0.8717080950737, "mean_token_accuracy": 0.9009335041046143, "num_tokens": 318839122.0, "sample_num_tokens": 7731.5, "step": 5174, "total_num_tokens": 318870048.0, "z_loss": 0.0005867157015018165 }, { "copy_logits_max": -6.980965614318848, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.125, "epoch": 1.0569823844779167, "gen_logits_max": 4.298575401306152, "gen_logits_mean": -15.144372940063477, "gen_logits_min": -26.52069854736328, "gen_logits_std": 2.9054481983184814, "gen_loss": 0.2879024147987366, "grad_norm": 0.4260643472161257, "learning_rate": 2.439284210526316e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9961536675691605, "mean_gen_accuracy": 0.870949000120163, "mean_token_accuracy": 0.9003555029630661, "num_tokens": 319105299.0, "sample_num_tokens": 8322.25, "step": 5175, "total_num_tokens": 319138588.0, "z_loss": 0.0006197119946591556 }, { "copy_logits_max": -6.089602470397949, "copy_logits_min": -750000000.0, "copy_num_tokens": 669.6875, "epoch": 1.0571866224151136, "gen_logits_max": 4.810517311096191, "gen_logits_mean": -14.208003997802734, "gen_logits_min": -26.58749008178711, "gen_logits_std": 2.9353346824645996, "gen_loss": 0.26154085993766785, "grad_norm": 0.37286573061319606, "learning_rate": 2.439157894736842e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9962661564350128, "mean_gen_accuracy": 0.8777247369289398, "mean_token_accuracy": 0.9097221940755844, "num_tokens": 319395943.0, "sample_num_tokens": 10378.25, "step": 5176, "total_num_tokens": 319437456.0, "z_loss": 0.0006351646734401584 }, { "copy_logits_max": -5.476628303527832, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.75, "epoch": 1.0573908603523103, "gen_logits_max": 5.345690727233887, "gen_logits_mean": -13.33552360534668, "gen_logits_min": -25.186065673828125, "gen_logits_std": 2.912740707397461, "gen_loss": 0.2803049385547638, "grad_norm": 0.35582176263123694, "learning_rate": 2.4390315789473688e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9964240193367004, "mean_gen_accuracy": 0.8759626746177673, "mean_token_accuracy": 0.9077537506818771, "num_tokens": 319681770.0, "sample_num_tokens": 8132.0, "step": 5177, "total_num_tokens": 319714298.0, "z_loss": 0.0006907135248184204 }, { "copy_logits_max": -5.009256839752197, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.0625, "epoch": 1.0575950982895073, "gen_logits_max": 5.125700950622559, "gen_logits_mean": -14.32775592803955, "gen_logits_min": -26.552196502685547, "gen_logits_std": 2.9390673637390137, "gen_loss": 0.30035316944122314, "grad_norm": 0.37875906101349177, "learning_rate": 2.438905263157895e-05, "loss": 0.2869, "mean_copy_accuracy": 0.995596244931221, "mean_gen_accuracy": 0.8738063424825668, "mean_token_accuracy": 0.9042672663927078, "num_tokens": 319957477.0, "sample_num_tokens": 7613.25, "step": 5178, "total_num_tokens": 319987930.0, "z_loss": 0.000672394409775734 }, { "copy_logits_max": -5.928620338439941, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.8125, "epoch": 1.0577993362267042, "gen_logits_max": 4.37343168258667, "gen_logits_mean": -14.671487808227539, "gen_logits_min": -27.35580825805664, "gen_logits_std": 2.95634388923645, "gen_loss": 0.2807106375694275, "grad_norm": 0.39793437769412515, "learning_rate": 2.4387789473684213e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9967927038669586, "mean_gen_accuracy": 0.8745201528072357, "mean_token_accuracy": 0.9076901227235794, "num_tokens": 320220503.0, "sample_num_tokens": 9332.25, "step": 5179, "total_num_tokens": 320257832.0, "z_loss": 0.0006268630968406796 }, { "copy_logits_max": -5.465932846069336, "copy_logits_min": -687500032.0, "copy_num_tokens": 713.125, "epoch": 1.058003574163901, "gen_logits_max": 4.974442481994629, "gen_logits_mean": -13.625114440917969, "gen_logits_min": -25.616777420043945, "gen_logits_std": 2.9416136741638184, "gen_loss": 0.23855358362197876, "grad_norm": 0.3893577192927719, "learning_rate": 2.4386526315789474e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9958627074956894, "mean_gen_accuracy": 0.8711641132831573, "mean_token_accuracy": 0.9010378122329712, "num_tokens": 320484468.0, "sample_num_tokens": 9902.0, "step": 5180, "total_num_tokens": 320524076.0, "z_loss": 0.0005728599498979747 }, { "copy_logits_max": -5.81324577331543, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.125, "epoch": 1.0582078121010978, "gen_logits_max": 5.330266952514648, "gen_logits_mean": -13.447443008422852, "gen_logits_min": -24.902265548706055, "gen_logits_std": 2.821536064147949, "gen_loss": 0.25672221183776855, "grad_norm": 0.35131749430209414, "learning_rate": 2.4385263157894738e-05, "loss": 0.2532, "mean_copy_accuracy": 0.9958446621894836, "mean_gen_accuracy": 0.8811061382293701, "mean_token_accuracy": 0.9114058613777161, "num_tokens": 320774201.0, "sample_num_tokens": 7869.25, "step": 5181, "total_num_tokens": 320805678.0, "z_loss": 0.0005524165462702513 }, { "copy_logits_max": -3.8100154399871826, "copy_logits_min": -625000064.0, "copy_num_tokens": 665.875, "epoch": 1.0584120500382945, "gen_logits_max": 4.852526664733887, "gen_logits_mean": -15.10165023803711, "gen_logits_min": -27.147218704223633, "gen_logits_std": 2.9578208923339844, "gen_loss": 0.28291258215904236, "grad_norm": 0.37140092800064606, "learning_rate": 2.4384e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9957925975322723, "mean_gen_accuracy": 0.8737356960773468, "mean_token_accuracy": 0.9075046479701996, "num_tokens": 321044248.0, "sample_num_tokens": 9469.0, "step": 5182, "total_num_tokens": 321082124.0, "z_loss": 0.0006458557909354568 }, { "copy_logits_max": -5.6970906257629395, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.0625, "epoch": 1.0586162879754915, "gen_logits_max": 4.197360038757324, "gen_logits_mean": -15.001078605651855, "gen_logits_min": -27.04999542236328, "gen_logits_std": 2.9455766677856445, "gen_loss": 0.2618904113769531, "grad_norm": 0.4375591990559045, "learning_rate": 2.4382736842105263e-05, "loss": 0.2897, "mean_copy_accuracy": 0.9943180829286575, "mean_gen_accuracy": 0.8799510449171066, "mean_token_accuracy": 0.9047478139400482, "num_tokens": 321314115.0, "sample_num_tokens": 9638.75, "step": 5183, "total_num_tokens": 321352670.0, "z_loss": 0.0005560662830248475 }, { "copy_logits_max": -6.016233444213867, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.9375, "epoch": 1.0588205259126884, "gen_logits_max": 4.6243462562561035, "gen_logits_mean": -15.423370361328125, "gen_logits_min": -27.07895278930664, "gen_logits_std": 2.911334991455078, "gen_loss": 0.2650684714317322, "grad_norm": 0.3853741570470771, "learning_rate": 2.4381473684210524e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9961618632078171, "mean_gen_accuracy": 0.8742765039205551, "mean_token_accuracy": 0.905340850353241, "num_tokens": 321574508.0, "sample_num_tokens": 8041.0, "step": 5184, "total_num_tokens": 321606672.0, "z_loss": 0.0005586088518612087 }, { "copy_logits_max": -5.146524429321289, "copy_logits_min": -625000000.0, "copy_num_tokens": 461.5625, "epoch": 1.059024763849885, "gen_logits_max": 5.474290370941162, "gen_logits_mean": -14.676328659057617, "gen_logits_min": -26.674985885620117, "gen_logits_std": 2.964743137359619, "gen_loss": 0.30127841234207153, "grad_norm": 0.345614588854221, "learning_rate": 2.4380210526315792e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9967236518859863, "mean_gen_accuracy": 0.87237548828125, "mean_token_accuracy": 0.9074520915746689, "num_tokens": 321864051.0, "sample_num_tokens": 8159.25, "step": 5185, "total_num_tokens": 321896688.0, "z_loss": 0.0007028242689557374 }, { "copy_logits_max": -5.177188396453857, "copy_logits_min": -750000064.0, "copy_num_tokens": 539.0, "epoch": 1.059229001787082, "gen_logits_max": 3.7140071392059326, "gen_logits_mean": -15.691043853759766, "gen_logits_min": -27.51644515991211, "gen_logits_std": 2.9366908073425293, "gen_loss": 0.30378228425979614, "grad_norm": 0.386123285197874, "learning_rate": 2.4378947368421053e-05, "loss": 0.2705, "mean_copy_accuracy": 0.99461530148983, "mean_gen_accuracy": 0.8808072656393051, "mean_token_accuracy": 0.9078643769025803, "num_tokens": 322124248.0, "sample_num_tokens": 8744.0, "step": 5186, "total_num_tokens": 322159224.0, "z_loss": 0.0005852269241586328 }, { "copy_logits_max": -5.853992938995361, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.8125, "epoch": 1.0594332397242787, "gen_logits_max": 5.438898086547852, "gen_logits_mean": -14.839738845825195, "gen_logits_min": -27.19497299194336, "gen_logits_std": 2.976233959197998, "gen_loss": 0.2684262692928314, "grad_norm": 0.39320956010878444, "learning_rate": 2.4377684210526317e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9958216547966003, "mean_gen_accuracy": 0.8776223808526993, "mean_token_accuracy": 0.9069447070360184, "num_tokens": 322407641.0, "sample_num_tokens": 8944.25, "step": 5187, "total_num_tokens": 322443418.0, "z_loss": 0.0005684240022674203 }, { "copy_logits_max": -4.347182273864746, "copy_logits_min": -750000064.0, "copy_num_tokens": 665.5, "epoch": 1.0596374776614756, "gen_logits_max": 3.862002372741699, "gen_logits_mean": -14.553912162780762, "gen_logits_min": -26.907146453857422, "gen_logits_std": 2.946894884109497, "gen_loss": 0.2694382667541504, "grad_norm": 0.3850904190662925, "learning_rate": 2.437642105263158e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9957081228494644, "mean_gen_accuracy": 0.876193031668663, "mean_token_accuracy": 0.9053389728069305, "num_tokens": 322684440.0, "sample_num_tokens": 9597.0, "step": 5188, "total_num_tokens": 322722828.0, "z_loss": 0.0006065457710064948 }, { "copy_logits_max": -5.7295403480529785, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.4375, "epoch": 1.0598417155986724, "gen_logits_max": 5.120344161987305, "gen_logits_mean": -14.672250747680664, "gen_logits_min": -26.364654541015625, "gen_logits_std": 2.9202005863189697, "gen_loss": 0.2751578092575073, "grad_norm": 0.3924784210656093, "learning_rate": 2.4375157894736842e-05, "loss": 0.294, "mean_copy_accuracy": 0.9956203103065491, "mean_gen_accuracy": 0.8731619566679001, "mean_token_accuracy": 0.8992839753627777, "num_tokens": 322955583.0, "sample_num_tokens": 8954.25, "step": 5189, "total_num_tokens": 322991400.0, "z_loss": 0.000568521034438163 }, { "copy_logits_max": -5.881561279296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.8125, "epoch": 1.0600459535358693, "gen_logits_max": 4.009488105773926, "gen_logits_mean": -16.112831115722656, "gen_logits_min": -27.95513153076172, "gen_logits_std": 2.968142032623291, "gen_loss": 0.26006579399108887, "grad_norm": 0.4862689761508228, "learning_rate": 2.4373894736842107e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9966769367456436, "mean_gen_accuracy": 0.8856563717126846, "mean_token_accuracy": 0.9129663556814194, "num_tokens": 323222107.0, "sample_num_tokens": 9595.75, "step": 5190, "total_num_tokens": 323260490.0, "z_loss": 0.0005260067991912365 }, { "copy_logits_max": -3.339539051055908, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.5625, "epoch": 1.0602501914730662, "gen_logits_max": 4.51141357421875, "gen_logits_mean": -15.018631935119629, "gen_logits_min": -26.52259063720703, "gen_logits_std": 2.948554039001465, "gen_loss": 0.2653196454048157, "grad_norm": 0.38719091747841694, "learning_rate": 2.4372631578947368e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9960109889507294, "mean_gen_accuracy": 0.8834480196237564, "mean_token_accuracy": 0.9077074378728867, "num_tokens": 323477468.0, "sample_num_tokens": 7262.5, "step": 5191, "total_num_tokens": 323506518.0, "z_loss": 0.0004798301961272955 }, { "copy_logits_max": -5.754273891448975, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.4375, "epoch": 1.060454429410263, "gen_logits_max": 5.50654935836792, "gen_logits_mean": -13.746626853942871, "gen_logits_min": -25.697093963623047, "gen_logits_std": 2.9172635078430176, "gen_loss": 0.2693988084793091, "grad_norm": 0.40832606948919714, "learning_rate": 2.4371368421052632e-05, "loss": 0.2861, "mean_copy_accuracy": 0.995528444647789, "mean_gen_accuracy": 0.8761864900588989, "mean_token_accuracy": 0.9028414934873581, "num_tokens": 323736471.0, "sample_num_tokens": 8376.25, "step": 5192, "total_num_tokens": 323769976.0, "z_loss": 0.0005522534484043717 }, { "copy_logits_max": -3.33143949508667, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.0, "epoch": 1.0606586673474598, "gen_logits_max": 5.053298473358154, "gen_logits_mean": -14.112911224365234, "gen_logits_min": -26.197675704956055, "gen_logits_std": 2.9417078495025635, "gen_loss": 0.30611228942871094, "grad_norm": 0.411578101172766, "learning_rate": 2.4370105263157896e-05, "loss": 0.2973, "mean_copy_accuracy": 0.996359646320343, "mean_gen_accuracy": 0.868390217423439, "mean_token_accuracy": 0.8994099199771881, "num_tokens": 323995252.0, "sample_num_tokens": 8172.5, "step": 5193, "total_num_tokens": 324027942.0, "z_loss": 0.0005876127979718149 }, { "copy_logits_max": -7.012578010559082, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.5, "epoch": 1.0608629052846565, "gen_logits_max": 5.03543758392334, "gen_logits_mean": -13.970086097717285, "gen_logits_min": -25.67078971862793, "gen_logits_std": 2.938791275024414, "gen_loss": 0.2810664176940918, "grad_norm": 0.43110988893892793, "learning_rate": 2.436884210526316e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9935737252235413, "mean_gen_accuracy": 0.8777185380458832, "mean_token_accuracy": 0.9006527960300446, "num_tokens": 324251761.0, "sample_num_tokens": 9135.25, "step": 5194, "total_num_tokens": 324288302.0, "z_loss": 0.0005127495969645679 }, { "copy_logits_max": -5.415915489196777, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.875, "epoch": 1.0610671432218535, "gen_logits_max": 4.826876163482666, "gen_logits_mean": -14.225629806518555, "gen_logits_min": -26.89208221435547, "gen_logits_std": 2.976564407348633, "gen_loss": 0.26569420099258423, "grad_norm": 0.41253232768151415, "learning_rate": 2.436757894736842e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9956175088882446, "mean_gen_accuracy": 0.8746206760406494, "mean_token_accuracy": 0.906499370932579, "num_tokens": 324520347.0, "sample_num_tokens": 9332.25, "step": 5195, "total_num_tokens": 324557676.0, "z_loss": 0.0005224470514804125 }, { "copy_logits_max": -3.7211053371429443, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.5625, "epoch": 1.0612713811590504, "gen_logits_max": 3.0313477516174316, "gen_logits_mean": -16.90668296813965, "gen_logits_min": -29.073284149169922, "gen_logits_std": 3.0161800384521484, "gen_loss": 0.29004859924316406, "grad_norm": 0.39583546257421504, "learning_rate": 2.4366315789473686e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9965606927871704, "mean_gen_accuracy": 0.8664730936288834, "mean_token_accuracy": 0.9029863178730011, "num_tokens": 324807521.0, "sample_num_tokens": 8742.25, "step": 5196, "total_num_tokens": 324842490.0, "z_loss": 0.0005490161129273474 }, { "copy_logits_max": -5.464836120605469, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.9375, "epoch": 1.061475619096247, "gen_logits_max": 3.905099630355835, "gen_logits_mean": -16.279428482055664, "gen_logits_min": -28.530153274536133, "gen_logits_std": 3.008967399597168, "gen_loss": 0.27343058586120605, "grad_norm": 0.37970702234623266, "learning_rate": 2.4365052631578947e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9954850375652313, "mean_gen_accuracy": 0.8814763277769089, "mean_token_accuracy": 0.9057826101779938, "num_tokens": 325069974.0, "sample_num_tokens": 9321.5, "step": 5197, "total_num_tokens": 325107260.0, "z_loss": 0.0005002800608053803 }, { "copy_logits_max": -5.076630592346191, "copy_logits_min": -625000064.0, "copy_num_tokens": 193.9375, "epoch": 1.061679857033444, "gen_logits_max": 5.51255989074707, "gen_logits_mean": -15.05935287475586, "gen_logits_min": -26.89049530029297, "gen_logits_std": 2.9334161281585693, "gen_loss": 0.3246504068374634, "grad_norm": 0.40524121018397236, "learning_rate": 2.436378947368421e-05, "loss": 0.3032, "mean_copy_accuracy": 0.9950492680072784, "mean_gen_accuracy": 0.8729467391967773, "mean_token_accuracy": 0.8977375328540802, "num_tokens": 325329084.0, "sample_num_tokens": 6574.0, "step": 5198, "total_num_tokens": 325355380.0, "z_loss": 0.0006197112961672246 }, { "copy_logits_max": -3.2023515701293945, "copy_logits_min": -687500032.0, "copy_num_tokens": 570.1875, "epoch": 1.0618840949706407, "gen_logits_max": 4.151872634887695, "gen_logits_mean": -14.050603866577148, "gen_logits_min": -26.349964141845703, "gen_logits_std": 2.909480571746826, "gen_loss": 0.2599777579307556, "grad_norm": 0.40919197294081233, "learning_rate": 2.4362526315789472e-05, "loss": 0.268, "mean_copy_accuracy": 0.9964514374732971, "mean_gen_accuracy": 0.8736365139484406, "mean_token_accuracy": 0.9109249711036682, "num_tokens": 325594569.0, "sample_num_tokens": 8310.25, "step": 5199, "total_num_tokens": 325627810.0, "z_loss": 0.0004990388406440616 }, { "copy_logits_max": -3.8250973224639893, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.75, "epoch": 1.0620883329078377, "gen_logits_max": 4.633375644683838, "gen_logits_mean": -14.856696128845215, "gen_logits_min": -26.833404541015625, "gen_logits_std": 2.9027421474456787, "gen_loss": 0.283058226108551, "grad_norm": 0.3805030281407265, "learning_rate": 2.4361263157894736e-05, "loss": 0.2902, "mean_copy_accuracy": 0.996263861656189, "mean_gen_accuracy": 0.8696338087320328, "mean_token_accuracy": 0.9009730964899063, "num_tokens": 325869897.0, "sample_num_tokens": 9709.75, "step": 5200, "total_num_tokens": 325908736.0, "z_loss": 0.0005745190428569913 }, { "copy_logits_max": -5.7573418617248535, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.25, "epoch": 1.0622925708450344, "gen_logits_max": 4.361123085021973, "gen_logits_mean": -16.003252029418945, "gen_logits_min": -28.106050491333008, "gen_logits_std": 2.9637885093688965, "gen_loss": 0.2617538571357727, "grad_norm": 0.3856036961937725, "learning_rate": 2.4360000000000004e-05, "loss": 0.2542, "mean_copy_accuracy": 0.9969596713781357, "mean_gen_accuracy": 0.8831426799297333, "mean_token_accuracy": 0.9119476228952408, "num_tokens": 326151114.0, "sample_num_tokens": 8202.0, "step": 5201, "total_num_tokens": 326183922.0, "z_loss": 0.0005798619240522385 }, { "copy_logits_max": -5.801690101623535, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.75, "epoch": 1.0624968087822313, "gen_logits_max": 5.164948463439941, "gen_logits_mean": -14.190523147583008, "gen_logits_min": -26.31121063232422, "gen_logits_std": 2.872821807861328, "gen_loss": 0.302793025970459, "grad_norm": 0.4369129260730268, "learning_rate": 2.4358736842105265e-05, "loss": 0.3061, "mean_copy_accuracy": 0.9964006245136261, "mean_gen_accuracy": 0.8639955967664719, "mean_token_accuracy": 0.8973197042942047, "num_tokens": 326413724.0, "sample_num_tokens": 8746.0, "step": 5202, "total_num_tokens": 326448708.0, "z_loss": 0.0006443171296268702 }, { "copy_logits_max": -5.848214149475098, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.3125, "epoch": 1.0627010467194282, "gen_logits_max": 5.327213287353516, "gen_logits_mean": -13.860727310180664, "gen_logits_min": -25.299633026123047, "gen_logits_std": 2.8179609775543213, "gen_loss": 0.32371199131011963, "grad_norm": 0.38937613777476765, "learning_rate": 2.435747368421053e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9966543316841125, "mean_gen_accuracy": 0.8711990863084793, "mean_token_accuracy": 0.9032440483570099, "num_tokens": 326698145.0, "sample_num_tokens": 8305.25, "step": 5203, "total_num_tokens": 326731366.0, "z_loss": 0.0006660100189037621 }, { "copy_logits_max": -5.919713497161865, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.75, "epoch": 1.062905284656625, "gen_logits_max": 4.783770561218262, "gen_logits_mean": -15.678169250488281, "gen_logits_min": -27.732681274414062, "gen_logits_std": 2.958202600479126, "gen_loss": 0.29532188177108765, "grad_norm": 0.397428818879814, "learning_rate": 2.435621052631579e-05, "loss": 0.2804, "mean_copy_accuracy": 0.994151696562767, "mean_gen_accuracy": 0.882828950881958, "mean_token_accuracy": 0.905633807182312, "num_tokens": 326968189.0, "sample_num_tokens": 8523.25, "step": 5204, "total_num_tokens": 327002282.0, "z_loss": 0.0006283017573878169 }, { "copy_logits_max": -5.226272106170654, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.5625, "epoch": 1.0631095225938219, "gen_logits_max": 3.7021682262420654, "gen_logits_mean": -16.082298278808594, "gen_logits_min": -28.311458587646484, "gen_logits_std": 2.961510419845581, "gen_loss": 0.27629536390304565, "grad_norm": 0.3880151632930044, "learning_rate": 2.4354947368421054e-05, "loss": 0.277, "mean_copy_accuracy": 0.9955317080020905, "mean_gen_accuracy": 0.8765500336885452, "mean_token_accuracy": 0.9065647423267365, "num_tokens": 327245154.0, "sample_num_tokens": 8495.0, "step": 5205, "total_num_tokens": 327279134.0, "z_loss": 0.000603213906288147 }, { "copy_logits_max": -6.708609580993652, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.125, "epoch": 1.0633137605310186, "gen_logits_max": 4.466169357299805, "gen_logits_mean": -15.967167854309082, "gen_logits_min": -27.560455322265625, "gen_logits_std": 2.912755012512207, "gen_loss": 0.29770028591156006, "grad_norm": 0.38074501872153965, "learning_rate": 2.4353684210526315e-05, "loss": 0.286, "mean_copy_accuracy": 0.9947108030319214, "mean_gen_accuracy": 0.8784389495849609, "mean_token_accuracy": 0.902445837855339, "num_tokens": 327522005.0, "sample_num_tokens": 8014.25, "step": 5206, "total_num_tokens": 327554062.0, "z_loss": 0.0005965358577668667 }, { "copy_logits_max": -6.369589328765869, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.5625, "epoch": 1.0635179984682155, "gen_logits_max": 4.319956302642822, "gen_logits_mean": -15.098652839660645, "gen_logits_min": -26.66350555419922, "gen_logits_std": 2.890629768371582, "gen_loss": 0.30256322026252747, "grad_norm": 0.3990333480175116, "learning_rate": 2.435242105263158e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9967110902070999, "mean_gen_accuracy": 0.87144935131073, "mean_token_accuracy": 0.904611811041832, "num_tokens": 327799472.0, "sample_num_tokens": 8248.0, "step": 5207, "total_num_tokens": 327832464.0, "z_loss": 0.0005707764066755772 }, { "copy_logits_max": -6.047292709350586, "copy_logits_min": -750000128.0, "copy_num_tokens": 507.5625, "epoch": 1.0637222364054124, "gen_logits_max": 4.572035789489746, "gen_logits_mean": -14.361616134643555, "gen_logits_min": -26.726112365722656, "gen_logits_std": 2.887272834777832, "gen_loss": 0.2786408066749573, "grad_norm": 0.36799044938607633, "learning_rate": 2.435115789473684e-05, "loss": 0.2558, "mean_copy_accuracy": 0.9964892566204071, "mean_gen_accuracy": 0.8855333775281906, "mean_token_accuracy": 0.9151743948459625, "num_tokens": 328087391.0, "sample_num_tokens": 8496.75, "step": 5208, "total_num_tokens": 328121378.0, "z_loss": 0.000579682644456625 }, { "copy_logits_max": -6.982671737670898, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.125, "epoch": 1.0639264743426091, "gen_logits_max": 4.323812007904053, "gen_logits_mean": -16.778465270996094, "gen_logits_min": -28.700214385986328, "gen_logits_std": 2.9798550605773926, "gen_loss": 0.2849782407283783, "grad_norm": 0.3965453687095857, "learning_rate": 2.4349894736842105e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9947803318500519, "mean_gen_accuracy": 0.8775003552436829, "mean_token_accuracy": 0.9040454477071762, "num_tokens": 328354236.0, "sample_num_tokens": 8063.5, "step": 5209, "total_num_tokens": 328386490.0, "z_loss": 0.0005290750996209681 }, { "copy_logits_max": -6.338573455810547, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.5, "epoch": 1.064130712279806, "gen_logits_max": 5.642637729644775, "gen_logits_mean": -13.569642066955566, "gen_logits_min": -26.03866958618164, "gen_logits_std": 2.9401025772094727, "gen_loss": 0.3507327437400818, "grad_norm": 0.3943776925975014, "learning_rate": 2.434863157894737e-05, "loss": 0.2957, "mean_copy_accuracy": 0.995878279209137, "mean_gen_accuracy": 0.8722730278968811, "mean_token_accuracy": 0.8994211405515671, "num_tokens": 328615445.0, "sample_num_tokens": 8462.25, "step": 5210, "total_num_tokens": 328649294.0, "z_loss": 0.000619921600446105 }, { "copy_logits_max": -5.253033638000488, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.0625, "epoch": 1.0643349502170028, "gen_logits_max": 4.051865577697754, "gen_logits_mean": -16.029693603515625, "gen_logits_min": -27.91466522216797, "gen_logits_std": 2.9417552947998047, "gen_loss": 0.2998603284358978, "grad_norm": 0.47613017814275155, "learning_rate": 2.4347368421052633e-05, "loss": 0.2952, "mean_copy_accuracy": 0.9947739541530609, "mean_gen_accuracy": 0.8717174530029297, "mean_token_accuracy": 0.9007669389247894, "num_tokens": 328896267.0, "sample_num_tokens": 8872.25, "step": 5211, "total_num_tokens": 328931756.0, "z_loss": 0.0005809846334159374 }, { "copy_logits_max": -5.398066520690918, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.4375, "epoch": 1.0645391881541997, "gen_logits_max": 4.788498878479004, "gen_logits_mean": -15.717981338500977, "gen_logits_min": -27.24315643310547, "gen_logits_std": 2.9227521419525146, "gen_loss": 0.29858553409576416, "grad_norm": 0.4161305964161627, "learning_rate": 2.4346105263157894e-05, "loss": 0.3011, "mean_copy_accuracy": 0.9961473345756531, "mean_gen_accuracy": 0.867171049118042, "mean_token_accuracy": 0.8980851620435715, "num_tokens": 329162542.0, "sample_num_tokens": 9089.0, "step": 5212, "total_num_tokens": 329198898.0, "z_loss": 0.0006529349484480917 }, { "copy_logits_max": -3.8418169021606445, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.625, "epoch": 1.0647434260913964, "gen_logits_max": 4.086669445037842, "gen_logits_mean": -15.901187896728516, "gen_logits_min": -27.62939453125, "gen_logits_std": 2.9579195976257324, "gen_loss": 0.2447204887866974, "grad_norm": 0.4779196398162681, "learning_rate": 2.434484210526316e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9929533153772354, "mean_gen_accuracy": 0.8753038644790649, "mean_token_accuracy": 0.8983767628669739, "num_tokens": 329404044.0, "sample_num_tokens": 9556.0, "step": 5213, "total_num_tokens": 329442268.0, "z_loss": 0.000611813273280859 }, { "copy_logits_max": -3.7330005168914795, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.5625, "epoch": 1.0649476640285933, "gen_logits_max": 4.2652058601379395, "gen_logits_mean": -16.06005859375, "gen_logits_min": -28.00592041015625, "gen_logits_std": 2.955594301223755, "gen_loss": 0.29205620288848877, "grad_norm": 0.4194010423884676, "learning_rate": 2.434357894736842e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9960302412509918, "mean_gen_accuracy": 0.8750783503055573, "mean_token_accuracy": 0.9047854989767075, "num_tokens": 329675497.0, "sample_num_tokens": 8312.25, "step": 5214, "total_num_tokens": 329708746.0, "z_loss": 0.0006552882841788232 }, { "copy_logits_max": -6.108001708984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.0625, "epoch": 1.0651519019657902, "gen_logits_max": 5.141457557678223, "gen_logits_mean": -13.638104438781738, "gen_logits_min": -25.508773803710938, "gen_logits_std": 2.919173240661621, "gen_loss": 0.2592089772224426, "grad_norm": 0.40558440670701235, "learning_rate": 2.4342315789473684e-05, "loss": 0.2538, "mean_copy_accuracy": 0.9974746257066727, "mean_gen_accuracy": 0.885253444314003, "mean_token_accuracy": 0.9161941707134247, "num_tokens": 329978590.0, "sample_num_tokens": 9540.5, "step": 5215, "total_num_tokens": 330016752.0, "z_loss": 0.0005597652634605765 }, { "copy_logits_max": -2.8364508152008057, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.3125, "epoch": 1.065356139902987, "gen_logits_max": 5.481440544128418, "gen_logits_mean": -14.03762435913086, "gen_logits_min": -26.093412399291992, "gen_logits_std": 2.931324005126953, "gen_loss": 0.3212718367576599, "grad_norm": 0.38939084523291156, "learning_rate": 2.4341052631578948e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9960125684738159, "mean_gen_accuracy": 0.8713102787733078, "mean_token_accuracy": 0.902418464422226, "num_tokens": 330272619.0, "sample_num_tokens": 8265.25, "step": 5216, "total_num_tokens": 330305680.0, "z_loss": 0.0006547466618940234 }, { "copy_logits_max": -3.402528762817383, "copy_logits_min": -687500032.0, "copy_num_tokens": 720.3125, "epoch": 1.0655603778401839, "gen_logits_max": 4.342217445373535, "gen_logits_mean": -14.299446105957031, "gen_logits_min": -26.322193145751953, "gen_logits_std": 2.925103187561035, "gen_loss": 0.2697789669036865, "grad_norm": 0.3934660896581318, "learning_rate": 2.433978947368421e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9962353110313416, "mean_gen_accuracy": 0.8779850602149963, "mean_token_accuracy": 0.9074187129735947, "num_tokens": 330535984.0, "sample_num_tokens": 10128.0, "step": 5217, "total_num_tokens": 330576496.0, "z_loss": 0.0005390244186855853 }, { "copy_logits_max": -5.591872215270996, "copy_logits_min": -625000064.0, "copy_num_tokens": 589.0, "epoch": 1.0657646157773806, "gen_logits_max": 2.344954252243042, "gen_logits_mean": -17.718955993652344, "gen_logits_min": -29.889774322509766, "gen_logits_std": 3.0296177864074707, "gen_loss": 0.25257009267807007, "grad_norm": 0.38153880105634636, "learning_rate": 2.4338526315789477e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9948668330907822, "mean_gen_accuracy": 0.8786292225122452, "mean_token_accuracy": 0.9061408489942551, "num_tokens": 330822977.0, "sample_num_tokens": 9250.25, "step": 5218, "total_num_tokens": 330859978.0, "z_loss": 0.0004877443134319037 }, { "copy_logits_max": -5.222960472106934, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.5, "epoch": 1.0659688537145775, "gen_logits_max": 4.322169780731201, "gen_logits_mean": -15.410566329956055, "gen_logits_min": -26.976093292236328, "gen_logits_std": 2.929218292236328, "gen_loss": 0.2720905840396881, "grad_norm": 0.4108533191197489, "learning_rate": 2.4337263157894738e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9953239262104034, "mean_gen_accuracy": 0.8757162243127823, "mean_token_accuracy": 0.8994169682264328, "num_tokens": 331088244.0, "sample_num_tokens": 8466.0, "step": 5219, "total_num_tokens": 331122108.0, "z_loss": 0.00048701695050112903 }, { "copy_logits_max": -6.123688220977783, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.8125, "epoch": 1.0661730916517742, "gen_logits_max": 4.53275203704834, "gen_logits_mean": -14.888483047485352, "gen_logits_min": -26.70799446105957, "gen_logits_std": 2.916475534439087, "gen_loss": 0.25332388281822205, "grad_norm": 0.41710625643200977, "learning_rate": 2.4336000000000002e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9960093796253204, "mean_gen_accuracy": 0.8814158290624619, "mean_token_accuracy": 0.9104006737470627, "num_tokens": 331358586.0, "sample_num_tokens": 8667.5, "step": 5220, "total_num_tokens": 331393256.0, "z_loss": 0.0004599041712936014 }, { "copy_logits_max": -5.681717872619629, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.125, "epoch": 1.0663773295889711, "gen_logits_max": 3.3739521503448486, "gen_logits_mean": -16.83271026611328, "gen_logits_min": -28.45014190673828, "gen_logits_std": 2.9415788650512695, "gen_loss": 0.2695949077606201, "grad_norm": 0.4042879855431184, "learning_rate": 2.4334736842105263e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9952059388160706, "mean_gen_accuracy": 0.8793997168540955, "mean_token_accuracy": 0.9060769826173782, "num_tokens": 331603336.0, "sample_num_tokens": 7307.0, "step": 5221, "total_num_tokens": 331632564.0, "z_loss": 0.0004549315490294248 }, { "copy_logits_max": -5.856785774230957, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.5625, "epoch": 1.066581567526168, "gen_logits_max": 4.575977802276611, "gen_logits_mean": -15.434687614440918, "gen_logits_min": -26.885746002197266, "gen_logits_std": 2.9477598667144775, "gen_loss": 0.2858726382255554, "grad_norm": 0.36307816601917914, "learning_rate": 2.4333473684210527e-05, "loss": 0.288, "mean_copy_accuracy": 0.9961225986480713, "mean_gen_accuracy": 0.871839314699173, "mean_token_accuracy": 0.9005853235721588, "num_tokens": 331869841.0, "sample_num_tokens": 7700.75, "step": 5222, "total_num_tokens": 331900644.0, "z_loss": 0.0004944056272506714 }, { "copy_logits_max": -6.670898914337158, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.125, "epoch": 1.0667858054633648, "gen_logits_max": 4.42128849029541, "gen_logits_mean": -15.945483207702637, "gen_logits_min": -27.575069427490234, "gen_logits_std": 2.9443893432617188, "gen_loss": 0.2943286895751953, "grad_norm": 0.4133285173139635, "learning_rate": 2.4332210526315788e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9961688071489334, "mean_gen_accuracy": 0.8736554980278015, "mean_token_accuracy": 0.899716317653656, "num_tokens": 332132841.0, "sample_num_tokens": 8689.25, "step": 5223, "total_num_tokens": 332167598.0, "z_loss": 0.0005552682559937239 }, { "copy_logits_max": -6.795625686645508, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.375, "epoch": 1.0669900434005617, "gen_logits_max": 4.759762763977051, "gen_logits_mean": -15.769510269165039, "gen_logits_min": -27.27667999267578, "gen_logits_std": 2.951275587081909, "gen_loss": 0.24967078864574432, "grad_norm": 0.38732358413957546, "learning_rate": 2.4330947368421053e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9958131462335587, "mean_gen_accuracy": 0.8834190666675568, "mean_token_accuracy": 0.9080304056406021, "num_tokens": 332400860.0, "sample_num_tokens": 7653.0, "step": 5224, "total_num_tokens": 332431472.0, "z_loss": 0.00046619880595244467 }, { "copy_logits_max": -4.5567708015441895, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.4375, "epoch": 1.0671942813377584, "gen_logits_max": 3.2579078674316406, "gen_logits_mean": -16.270370483398438, "gen_logits_min": -28.506732940673828, "gen_logits_std": 2.953430652618408, "gen_loss": 0.25841936469078064, "grad_norm": 0.4236596606860518, "learning_rate": 2.4329684210526314e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9956687837839127, "mean_gen_accuracy": 0.8676978796720505, "mean_token_accuracy": 0.8982948958873749, "num_tokens": 332656621.0, "sample_num_tokens": 8155.75, "step": 5225, "total_num_tokens": 332689244.0, "z_loss": 0.0005211549578234553 }, { "copy_logits_max": -6.1165242195129395, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.3125, "epoch": 1.0673985192749553, "gen_logits_max": 2.9347269535064697, "gen_logits_mean": -18.331951141357422, "gen_logits_min": -30.074514389038086, "gen_logits_std": 3.00746488571167, "gen_loss": 0.2707759439945221, "grad_norm": 0.38130129350146064, "learning_rate": 2.432842105263158e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9954104423522949, "mean_gen_accuracy": 0.8816466331481934, "mean_token_accuracy": 0.9079377204179764, "num_tokens": 332903929.0, "sample_num_tokens": 8623.25, "step": 5226, "total_num_tokens": 332938422.0, "z_loss": 0.000537401472683996 }, { "copy_logits_max": -5.365141868591309, "copy_logits_min": -750000064.0, "copy_num_tokens": 347.0625, "epoch": 1.0676027572121523, "gen_logits_max": 4.064319610595703, "gen_logits_mean": -15.759199142456055, "gen_logits_min": -27.445125579833984, "gen_logits_std": 2.922729969024658, "gen_loss": 0.26321423053741455, "grad_norm": 0.41980492904823175, "learning_rate": 2.4327157894736842e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9958656430244446, "mean_gen_accuracy": 0.8711391538381577, "mean_token_accuracy": 0.8997467011213303, "num_tokens": 333167463.0, "sample_num_tokens": 7113.25, "step": 5227, "total_num_tokens": 333195916.0, "z_loss": 0.0005376398330554366 }, { "copy_logits_max": -4.60424280166626, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.25, "epoch": 1.067806995149349, "gen_logits_max": 4.798976898193359, "gen_logits_mean": -13.688592910766602, "gen_logits_min": -24.86022186279297, "gen_logits_std": 2.7951061725616455, "gen_loss": 0.27665767073631287, "grad_norm": 0.39709505581934096, "learning_rate": 2.4325894736842106e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9957110285758972, "mean_gen_accuracy": 0.8736797869205475, "mean_token_accuracy": 0.9031490683555603, "num_tokens": 333439206.0, "sample_num_tokens": 7855.0, "step": 5228, "total_num_tokens": 333470626.0, "z_loss": 0.0005739611224271357 }, { "copy_logits_max": -5.402073860168457, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.25, "epoch": 1.0680112330865459, "gen_logits_max": 4.749842643737793, "gen_logits_mean": -15.18576431274414, "gen_logits_min": -26.957706451416016, "gen_logits_std": 2.9165170192718506, "gen_loss": 0.25682371854782104, "grad_norm": 0.3774215870055486, "learning_rate": 2.432463157894737e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9964451640844345, "mean_gen_accuracy": 0.8792692422866821, "mean_token_accuracy": 0.9101110994815826, "num_tokens": 333711479.0, "sample_num_tokens": 8335.25, "step": 5229, "total_num_tokens": 333744820.0, "z_loss": 0.0004976377822458744 }, { "copy_logits_max": -4.090814590454102, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.5, "epoch": 1.0682154710237426, "gen_logits_max": 3.416609048843384, "gen_logits_mean": -15.07858657836914, "gen_logits_min": -26.63849639892578, "gen_logits_std": 2.812748432159424, "gen_loss": 0.2777724862098694, "grad_norm": 0.42799465820172455, "learning_rate": 2.4323368421052632e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9961613267660141, "mean_gen_accuracy": 0.8756733238697052, "mean_token_accuracy": 0.9061004966497421, "num_tokens": 333986845.0, "sample_num_tokens": 8342.25, "step": 5230, "total_num_tokens": 334020214.0, "z_loss": 0.0005267239175736904 }, { "copy_logits_max": -4.620936393737793, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.75, "epoch": 1.0684197089609395, "gen_logits_max": 5.3650007247924805, "gen_logits_mean": -12.456865310668945, "gen_logits_min": -24.024818420410156, "gen_logits_std": 2.717867374420166, "gen_loss": 0.30688512325286865, "grad_norm": 0.4186567695280155, "learning_rate": 2.4322105263157896e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9960808306932449, "mean_gen_accuracy": 0.8726156949996948, "mean_token_accuracy": 0.9041370898485184, "num_tokens": 334280477.0, "sample_num_tokens": 8764.25, "step": 5231, "total_num_tokens": 334315534.0, "z_loss": 0.0005751418066211045 }, { "copy_logits_max": -5.794005393981934, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.5, "epoch": 1.0686239468981364, "gen_logits_max": 5.37310791015625, "gen_logits_mean": -15.151874542236328, "gen_logits_min": -26.86304473876953, "gen_logits_std": 2.858083724975586, "gen_loss": 0.2865758538246155, "grad_norm": 0.3896065882730803, "learning_rate": 2.4320842105263157e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9961546957492828, "mean_gen_accuracy": 0.8812648355960846, "mean_token_accuracy": 0.9072205871343613, "num_tokens": 334534648.0, "sample_num_tokens": 6688.5, "step": 5232, "total_num_tokens": 334561402.0, "z_loss": 0.0005619862931780517 }, { "copy_logits_max": -5.222018241882324, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.8125, "epoch": 1.0688281848353332, "gen_logits_max": 3.3908753395080566, "gen_logits_mean": -17.06385040283203, "gen_logits_min": -29.12411117553711, "gen_logits_std": 2.989708423614502, "gen_loss": 0.26322871446609497, "grad_norm": 0.39101411731009145, "learning_rate": 2.431957894736842e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9967004507780075, "mean_gen_accuracy": 0.8724375665187836, "mean_token_accuracy": 0.9046339243650436, "num_tokens": 334803977.0, "sample_num_tokens": 8920.25, "step": 5233, "total_num_tokens": 334839658.0, "z_loss": 0.0005452765617519617 }, { "copy_logits_max": -6.446488380432129, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.75, "epoch": 1.06903242277253, "gen_logits_max": 5.887528419494629, "gen_logits_mean": -15.28945255279541, "gen_logits_min": -26.491682052612305, "gen_logits_std": 2.87265682220459, "gen_loss": 0.3256852626800537, "grad_norm": 0.39008509343116515, "learning_rate": 2.4318315789473686e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9953306019306183, "mean_gen_accuracy": 0.8753149658441544, "mean_token_accuracy": 0.90244260430336, "num_tokens": 335081000.0, "sample_num_tokens": 7632.0, "step": 5234, "total_num_tokens": 335111528.0, "z_loss": 0.000629864982329309 }, { "copy_logits_max": -4.890012741088867, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.0, "epoch": 1.0692366607097268, "gen_logits_max": 3.961300849914551, "gen_logits_mean": -15.577945709228516, "gen_logits_min": -27.356632232666016, "gen_logits_std": 2.9380428791046143, "gen_loss": 0.24651485681533813, "grad_norm": 0.3926636317664703, "learning_rate": 2.431705263157895e-05, "loss": 0.2694, "mean_copy_accuracy": 0.997296079993248, "mean_gen_accuracy": 0.8809588104486465, "mean_token_accuracy": 0.910548210144043, "num_tokens": 335329871.0, "sample_num_tokens": 9522.75, "step": 5235, "total_num_tokens": 335367962.0, "z_loss": 0.0005866400897502899 }, { "copy_logits_max": -6.227899074554443, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.25, "epoch": 1.0694408986469237, "gen_logits_max": 4.742865085601807, "gen_logits_mean": -15.91987419128418, "gen_logits_min": -27.124021530151367, "gen_logits_std": 2.870304584503174, "gen_loss": 0.3088425099849701, "grad_norm": 0.4007960722486441, "learning_rate": 2.431578947368421e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9958389401435852, "mean_gen_accuracy": 0.8728714883327484, "mean_token_accuracy": 0.9019513428211212, "num_tokens": 335610466.0, "sample_num_tokens": 8481.5, "step": 5236, "total_num_tokens": 335644392.0, "z_loss": 0.0006189162959344685 }, { "copy_logits_max": -4.662876605987549, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.875, "epoch": 1.0696451365841204, "gen_logits_max": 4.687995433807373, "gen_logits_mean": -14.616828918457031, "gen_logits_min": -26.09239959716797, "gen_logits_std": 2.8540539741516113, "gen_loss": 0.2586525082588196, "grad_norm": 0.3722574652441008, "learning_rate": 2.4314526315789475e-05, "loss": 0.2484, "mean_copy_accuracy": 0.9963255822658539, "mean_gen_accuracy": 0.8892345577478409, "mean_token_accuracy": 0.9149443954229355, "num_tokens": 335902518.0, "sample_num_tokens": 8712.0, "step": 5237, "total_num_tokens": 335937366.0, "z_loss": 0.0005435820203274488 }, { "copy_logits_max": -5.031093597412109, "copy_logits_min": -687500032.0, "copy_num_tokens": 483.6875, "epoch": 1.0698493745213173, "gen_logits_max": 5.117514133453369, "gen_logits_mean": -13.193756103515625, "gen_logits_min": -24.66160011291504, "gen_logits_std": 2.811309337615967, "gen_loss": 0.27283716201782227, "grad_norm": 0.36586524970741374, "learning_rate": 2.4313263157894736e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9966047704219818, "mean_gen_accuracy": 0.8803782612085342, "mean_token_accuracy": 0.9141144156455994, "num_tokens": 336200654.0, "sample_num_tokens": 8055.0, "step": 5238, "total_num_tokens": 336232874.0, "z_loss": 0.0006031908560544252 }, { "copy_logits_max": -6.0453691482543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.8125, "epoch": 1.0700536124585143, "gen_logits_max": 4.443725109100342, "gen_logits_mean": -14.991476058959961, "gen_logits_min": -26.234657287597656, "gen_logits_std": 2.8381505012512207, "gen_loss": 0.29693615436553955, "grad_norm": 0.40756897225081323, "learning_rate": 2.4312e-05, "loss": 0.2994, "mean_copy_accuracy": 0.995973989367485, "mean_gen_accuracy": 0.8695346862077713, "mean_token_accuracy": 0.8999833911657333, "num_tokens": 336459030.0, "sample_num_tokens": 9241.0, "step": 5239, "total_num_tokens": 336495994.0, "z_loss": 0.0006511822575703263 }, { "copy_logits_max": -5.128918647766113, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.25, "epoch": 1.070257850395711, "gen_logits_max": 4.004942417144775, "gen_logits_mean": -15.855158805847168, "gen_logits_min": -27.762367248535156, "gen_logits_std": 2.901484489440918, "gen_loss": 0.2772386372089386, "grad_norm": 0.3740803512099634, "learning_rate": 2.431073684210526e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9960089921951294, "mean_gen_accuracy": 0.8726442605257034, "mean_token_accuracy": 0.9023052752017975, "num_tokens": 336741647.0, "sample_num_tokens": 7057.25, "step": 5240, "total_num_tokens": 336769876.0, "z_loss": 0.0006410910282284021 }, { "copy_logits_max": -5.121222019195557, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.25, "epoch": 1.070462088332908, "gen_logits_max": 3.9466359615325928, "gen_logits_mean": -14.937408447265625, "gen_logits_min": -26.684715270996094, "gen_logits_std": 2.890592098236084, "gen_loss": 0.28510916233062744, "grad_norm": 0.37937136982280467, "learning_rate": 2.4309473684210526e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9964024424552917, "mean_gen_accuracy": 0.8764725029468536, "mean_token_accuracy": 0.9078533351421356, "num_tokens": 337001724.0, "sample_num_tokens": 9230.0, "step": 5241, "total_num_tokens": 337038644.0, "z_loss": 0.0006332604098133743 }, { "copy_logits_max": -5.490039348602295, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.25, "epoch": 1.0706663262701046, "gen_logits_max": 3.655888319015503, "gen_logits_mean": -15.788143157958984, "gen_logits_min": -27.915790557861328, "gen_logits_std": 2.9347357749938965, "gen_loss": 0.25321489572525024, "grad_norm": 0.37621600084871626, "learning_rate": 2.4308210526315793e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9968776702880859, "mean_gen_accuracy": 0.8816960453987122, "mean_token_accuracy": 0.9122519493103027, "num_tokens": 337279662.0, "sample_num_tokens": 7814.0, "step": 5242, "total_num_tokens": 337310918.0, "z_loss": 0.0005950417835265398 }, { "copy_logits_max": -5.118983268737793, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.9375, "epoch": 1.0708705642073015, "gen_logits_max": 4.973883628845215, "gen_logits_mean": -13.401693344116211, "gen_logits_min": -25.097688674926758, "gen_logits_std": 2.827362537384033, "gen_loss": 0.2993818521499634, "grad_norm": 0.4154000595255241, "learning_rate": 2.4306947368421054e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9958698153495789, "mean_gen_accuracy": 0.8711113333702087, "mean_token_accuracy": 0.9029008448123932, "num_tokens": 337536158.0, "sample_num_tokens": 8949.5, "step": 5243, "total_num_tokens": 337571956.0, "z_loss": 0.000597998034209013 }, { "copy_logits_max": -4.9790544509887695, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.1875, "epoch": 1.0710748021444982, "gen_logits_max": 4.631744861602783, "gen_logits_mean": -14.876827239990234, "gen_logits_min": -27.045734405517578, "gen_logits_std": 2.9232747554779053, "gen_loss": 0.28670597076416016, "grad_norm": 0.3857025720043064, "learning_rate": 2.430568421052632e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9967397749423981, "mean_gen_accuracy": 0.872541680932045, "mean_token_accuracy": 0.9007308781147003, "num_tokens": 337785961.0, "sample_num_tokens": 7627.75, "step": 5244, "total_num_tokens": 337816472.0, "z_loss": 0.0005813881289213896 }, { "copy_logits_max": -5.258135795593262, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.3125, "epoch": 1.0712790400816952, "gen_logits_max": 3.614802837371826, "gen_logits_mean": -15.339519500732422, "gen_logits_min": -27.372081756591797, "gen_logits_std": 2.9321205615997314, "gen_loss": 0.2522900104522705, "grad_norm": 0.39881703895201637, "learning_rate": 2.430442105263158e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9961778670549393, "mean_gen_accuracy": 0.8806086033582687, "mean_token_accuracy": 0.9068974256515503, "num_tokens": 338055283.0, "sample_num_tokens": 9256.25, "step": 5245, "total_num_tokens": 338092308.0, "z_loss": 0.000519190973136574 }, { "copy_logits_max": -6.921820640563965, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.0, "epoch": 1.071483278018892, "gen_logits_max": 3.987980604171753, "gen_logits_mean": -15.552177429199219, "gen_logits_min": -27.387203216552734, "gen_logits_std": 2.946237087249756, "gen_loss": 0.287436842918396, "grad_norm": 0.39415577226307624, "learning_rate": 2.4303157894736844e-05, "loss": 0.314, "mean_copy_accuracy": 0.9963921904563904, "mean_gen_accuracy": 0.8665242791175842, "mean_token_accuracy": 0.8955126255750656, "num_tokens": 338316135.0, "sample_num_tokens": 8481.25, "step": 5246, "total_num_tokens": 338350060.0, "z_loss": 0.0005288219545036554 }, { "copy_logits_max": -6.2351837158203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5625, "epoch": 1.0716875159560888, "gen_logits_max": 3.822838068008423, "gen_logits_mean": -15.426689147949219, "gen_logits_min": -27.473007202148438, "gen_logits_std": 2.9301347732543945, "gen_loss": 0.25576770305633545, "grad_norm": 0.3895832192703322, "learning_rate": 2.4301894736842105e-05, "loss": 0.2727, "mean_copy_accuracy": 0.995842456817627, "mean_gen_accuracy": 0.8751182705163956, "mean_token_accuracy": 0.9070545583963394, "num_tokens": 338615034.0, "sample_num_tokens": 8705.0, "step": 5247, "total_num_tokens": 338649854.0, "z_loss": 0.0005512339412234724 }, { "copy_logits_max": -3.837859869003296, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.3125, "epoch": 1.0718917538932857, "gen_logits_max": 3.6705169677734375, "gen_logits_mean": -16.06103515625, "gen_logits_min": -28.214515686035156, "gen_logits_std": 2.963304042816162, "gen_loss": 0.28338468074798584, "grad_norm": 0.41409696585463995, "learning_rate": 2.430063157894737e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9964995384216309, "mean_gen_accuracy": 0.8661579638719559, "mean_token_accuracy": 0.8991221785545349, "num_tokens": 338866009.0, "sample_num_tokens": 7475.25, "step": 5248, "total_num_tokens": 338895910.0, "z_loss": 0.0005292583373375237 }, { "copy_logits_max": -6.509403228759766, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.3125, "epoch": 1.0720959918304824, "gen_logits_max": 4.486305236816406, "gen_logits_mean": -15.016239166259766, "gen_logits_min": -27.387771606445312, "gen_logits_std": 2.946507453918457, "gen_loss": 0.27591925859451294, "grad_norm": 0.4076090817610853, "learning_rate": 2.429936842105263e-05, "loss": 0.275, "mean_copy_accuracy": 0.9960336238145828, "mean_gen_accuracy": 0.8772594779729843, "mean_token_accuracy": 0.9067879021167755, "num_tokens": 339141682.0, "sample_num_tokens": 9506.0, "step": 5249, "total_num_tokens": 339179706.0, "z_loss": 0.0005251268157735467 }, { "copy_logits_max": -8.918746948242188, "copy_logits_min": -750000128.0, "copy_num_tokens": 478.625, "epoch": 1.0723002297676794, "gen_logits_max": 3.9785995483398438, "gen_logits_mean": -16.37979507446289, "gen_logits_min": -28.278345108032227, "gen_logits_std": 2.995652675628662, "gen_loss": 0.23893772065639496, "grad_norm": 0.36847514134688336, "learning_rate": 2.4298105263157898e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9960555285215378, "mean_gen_accuracy": 0.8862897008657455, "mean_token_accuracy": 0.9128865152597427, "num_tokens": 339412816.0, "sample_num_tokens": 8704.5, "step": 5250, "total_num_tokens": 339447634.0, "z_loss": 0.00042799540096893907 }, { "copy_logits_max": -5.2881760597229, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.875, "epoch": 1.072504467704876, "gen_logits_max": 4.143683910369873, "gen_logits_mean": -14.391901016235352, "gen_logits_min": -26.537675857543945, "gen_logits_std": 2.945955276489258, "gen_loss": 0.29127824306488037, "grad_norm": 0.38105990092906616, "learning_rate": 2.429684210526316e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9958530813455582, "mean_gen_accuracy": 0.8759952932596207, "mean_token_accuracy": 0.9064938426017761, "num_tokens": 339699824.0, "sample_num_tokens": 7519.5, "step": 5251, "total_num_tokens": 339729902.0, "z_loss": 0.0005019911332055926 }, { "copy_logits_max": -5.238376617431641, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.8125, "epoch": 1.072708705642073, "gen_logits_max": 3.46108341217041, "gen_logits_mean": -16.570552825927734, "gen_logits_min": -28.57099151611328, "gen_logits_std": 2.9822349548339844, "gen_loss": 0.29173821210861206, "grad_norm": 0.38866777850809986, "learning_rate": 2.4295578947368423e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9959388226270676, "mean_gen_accuracy": 0.8790661692619324, "mean_token_accuracy": 0.9111351072788239, "num_tokens": 339972896.0, "sample_num_tokens": 7917.5, "step": 5252, "total_num_tokens": 340004566.0, "z_loss": 0.0005872999900020659 }, { "copy_logits_max": -3.697732925415039, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.9375, "epoch": 1.07291294357927, "gen_logits_max": 3.333373546600342, "gen_logits_mean": -16.380313873291016, "gen_logits_min": -28.846302032470703, "gen_logits_std": 2.987523078918457, "gen_loss": 0.284884512424469, "grad_norm": 0.4084580402413164, "learning_rate": 2.4294315789473684e-05, "loss": 0.3, "mean_copy_accuracy": 0.9967551380395889, "mean_gen_accuracy": 0.8675132095813751, "mean_token_accuracy": 0.8989124000072479, "num_tokens": 340233864.0, "sample_num_tokens": 9504.0, "step": 5253, "total_num_tokens": 340271880.0, "z_loss": 0.0005699835019186139 }, { "copy_logits_max": -6.0425920486450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.375, "epoch": 1.0731171815164666, "gen_logits_max": 4.214895725250244, "gen_logits_mean": -15.407848358154297, "gen_logits_min": -27.372819900512695, "gen_logits_std": 2.94647479057312, "gen_loss": 0.26729458570480347, "grad_norm": 0.39095851688905287, "learning_rate": 2.4293052631578948e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9952930063009262, "mean_gen_accuracy": 0.8793972730636597, "mean_token_accuracy": 0.906354084610939, "num_tokens": 340506127.0, "sample_num_tokens": 8323.75, "step": 5254, "total_num_tokens": 340539422.0, "z_loss": 0.0005686914664693177 }, { "copy_logits_max": -5.897200584411621, "copy_logits_min": -750000064.0, "copy_num_tokens": 407.0625, "epoch": 1.0733214194536635, "gen_logits_max": 3.347747325897217, "gen_logits_mean": -17.07884407043457, "gen_logits_min": -29.271522521972656, "gen_logits_std": 3.0052332878112793, "gen_loss": 0.2937648594379425, "grad_norm": 0.40150616870329797, "learning_rate": 2.4291789473684212e-05, "loss": 0.2849, "mean_copy_accuracy": 0.996257409453392, "mean_gen_accuracy": 0.8739767670631409, "mean_token_accuracy": 0.9038595706224442, "num_tokens": 340772796.0, "sample_num_tokens": 8767.5, "step": 5255, "total_num_tokens": 340807866.0, "z_loss": 0.0005317121976986527 }, { "copy_logits_max": -5.823118686676025, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.6875, "epoch": 1.0735256573908603, "gen_logits_max": 4.608244895935059, "gen_logits_mean": -15.959793090820312, "gen_logits_min": -28.16156768798828, "gen_logits_std": 2.9693551063537598, "gen_loss": 0.29157721996307373, "grad_norm": 0.4200309047987694, "learning_rate": 2.4290526315789473e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9960188865661621, "mean_gen_accuracy": 0.8750021308660507, "mean_token_accuracy": 0.9027880132198334, "num_tokens": 341041977.0, "sample_num_tokens": 9000.25, "step": 5256, "total_num_tokens": 341077978.0, "z_loss": 0.0005809883587062359 }, { "copy_logits_max": -6.100985527038574, "copy_logits_min": -750000000.0, "copy_num_tokens": 918.0625, "epoch": 1.0737298953280572, "gen_logits_max": 2.62455415725708, "gen_logits_mean": -15.645898818969727, "gen_logits_min": -27.818254470825195, "gen_logits_std": 2.984870433807373, "gen_loss": 0.21646833419799805, "grad_norm": 0.3653066013861179, "learning_rate": 2.4289263157894738e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9966282844543457, "mean_gen_accuracy": 0.8766390830278397, "mean_token_accuracy": 0.909340113401413, "num_tokens": 341337704.0, "sample_num_tokens": 10746.0, "step": 5257, "total_num_tokens": 341380688.0, "z_loss": 0.00048146251356229186 }, { "copy_logits_max": -6.867636203765869, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.125, "epoch": 1.073934133265254, "gen_logits_max": 4.105579376220703, "gen_logits_mean": -16.760360717773438, "gen_logits_min": -28.93975257873535, "gen_logits_std": 2.996399402618408, "gen_loss": 0.25199222564697266, "grad_norm": 0.3955835433957728, "learning_rate": 2.4288e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9954904913902283, "mean_gen_accuracy": 0.8789899349212646, "mean_token_accuracy": 0.9090128690004349, "num_tokens": 341638441.0, "sample_num_tokens": 7932.25, "step": 5258, "total_num_tokens": 341670170.0, "z_loss": 0.0005343332886695862 }, { "copy_logits_max": -5.862798690795898, "copy_logits_min": -750000128.0, "copy_num_tokens": 363.75, "epoch": 1.0741383712024508, "gen_logits_max": 4.665032386779785, "gen_logits_mean": -15.394213676452637, "gen_logits_min": -27.184837341308594, "gen_logits_std": 2.935664415359497, "gen_loss": 0.2972201108932495, "grad_norm": 0.41933657752003795, "learning_rate": 2.4286736842105266e-05, "loss": 0.285, "mean_copy_accuracy": 0.995171457529068, "mean_gen_accuracy": 0.8769862949848175, "mean_token_accuracy": 0.9033806174993515, "num_tokens": 341900367.0, "sample_num_tokens": 7724.25, "step": 5259, "total_num_tokens": 341931264.0, "z_loss": 0.000690534885507077 }, { "copy_logits_max": -5.983870983123779, "copy_logits_min": -750000064.0, "copy_num_tokens": 345.9375, "epoch": 1.0743426091396477, "gen_logits_max": 5.22879695892334, "gen_logits_mean": -14.202138900756836, "gen_logits_min": -26.71151351928711, "gen_logits_std": 2.9695515632629395, "gen_loss": 0.25788718461990356, "grad_norm": 0.5377150148482647, "learning_rate": 2.4285473684210527e-05, "loss": 0.256, "mean_copy_accuracy": 0.9966646879911423, "mean_gen_accuracy": 0.8833676427602768, "mean_token_accuracy": 0.911865308880806, "num_tokens": 342158890.0, "sample_num_tokens": 7524.5, "step": 5260, "total_num_tokens": 342188988.0, "z_loss": 0.0005964951706118882 }, { "copy_logits_max": -5.43222713470459, "copy_logits_min": -750000064.0, "copy_num_tokens": 521.4375, "epoch": 1.0745468470768444, "gen_logits_max": 3.2848141193389893, "gen_logits_mean": -16.364543914794922, "gen_logits_min": -28.733966827392578, "gen_logits_std": 2.9980969429016113, "gen_loss": 0.2618379592895508, "grad_norm": 0.3915397820330913, "learning_rate": 2.428421052631579e-05, "loss": 0.2965, "mean_copy_accuracy": 0.9961396157741547, "mean_gen_accuracy": 0.8675616979598999, "mean_token_accuracy": 0.9029157310724258, "num_tokens": 342428096.0, "sample_num_tokens": 8413.0, "step": 5261, "total_num_tokens": 342461748.0, "z_loss": 0.000549590215086937 }, { "copy_logits_max": -6.940174102783203, "copy_logits_min": -687500096.0, "copy_num_tokens": 469.3125, "epoch": 1.0747510850140414, "gen_logits_max": 4.179047107696533, "gen_logits_mean": -16.052888870239258, "gen_logits_min": -28.57217025756836, "gen_logits_std": 3.0190062522888184, "gen_loss": 0.2421068251132965, "grad_norm": 0.7544637052232018, "learning_rate": 2.4282947368421052e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9956648349761963, "mean_gen_accuracy": 0.8825090080499649, "mean_token_accuracy": 0.9108375012874603, "num_tokens": 342702258.0, "sample_num_tokens": 7862.0, "step": 5262, "total_num_tokens": 342733706.0, "z_loss": 0.000523116672411561 }, { "copy_logits_max": -4.615531921386719, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.25, "epoch": 1.0749553229512383, "gen_logits_max": 3.786599636077881, "gen_logits_mean": -16.60971450805664, "gen_logits_min": -28.406726837158203, "gen_logits_std": 3.0032007694244385, "gen_loss": 0.2989978790283203, "grad_norm": 0.44914990944333016, "learning_rate": 2.4281684210526317e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9956951290369034, "mean_gen_accuracy": 0.8758385926485062, "mean_token_accuracy": 0.9047861695289612, "num_tokens": 342968964.0, "sample_num_tokens": 8334.0, "step": 5263, "total_num_tokens": 343002300.0, "z_loss": 0.0006220927461981773 }, { "copy_logits_max": -3.0894575119018555, "copy_logits_min": -687500032.0, "copy_num_tokens": 711.9375, "epoch": 1.075159560888435, "gen_logits_max": 3.1397666931152344, "gen_logits_mean": -16.594566345214844, "gen_logits_min": -28.728065490722656, "gen_logits_std": 3.017113208770752, "gen_loss": 0.2647920846939087, "grad_norm": 0.40901407270078394, "learning_rate": 2.4280421052631578e-05, "loss": 0.293, "mean_copy_accuracy": 0.9966340512037277, "mean_gen_accuracy": 0.8692620396614075, "mean_token_accuracy": 0.9000399261713028, "num_tokens": 343228830.0, "sample_num_tokens": 10072.5, "step": 5264, "total_num_tokens": 343269120.0, "z_loss": 0.000530351884663105 }, { "copy_logits_max": -6.109139442443848, "copy_logits_min": -687500032.0, "copy_num_tokens": 345.4375, "epoch": 1.075363798825632, "gen_logits_max": 3.6841988563537598, "gen_logits_mean": -16.913433074951172, "gen_logits_min": -28.602184295654297, "gen_logits_std": 2.9587888717651367, "gen_loss": 0.251335084438324, "grad_norm": 0.3822449591782653, "learning_rate": 2.4279157894736842e-05, "loss": 0.2806, "mean_copy_accuracy": 0.994889959692955, "mean_gen_accuracy": 0.8755364716053009, "mean_token_accuracy": 0.9047958850860596, "num_tokens": 343491653.0, "sample_num_tokens": 7954.25, "step": 5265, "total_num_tokens": 343523470.0, "z_loss": 0.0004867220122832805 }, { "copy_logits_max": -5.980093479156494, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.9375, "epoch": 1.0755680367628286, "gen_logits_max": 4.009209632873535, "gen_logits_mean": -16.579914093017578, "gen_logits_min": -28.374706268310547, "gen_logits_std": 2.9540982246398926, "gen_loss": 0.30784615874290466, "grad_norm": 0.38154078138666225, "learning_rate": 2.4277894736842103e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9956689029932022, "mean_gen_accuracy": 0.8746854960918427, "mean_token_accuracy": 0.9033626914024353, "num_tokens": 343769968.0, "sample_num_tokens": 8301.5, "step": 5266, "total_num_tokens": 343803174.0, "z_loss": 0.000598591344896704 }, { "copy_logits_max": -2.5012941360473633, "copy_logits_min": -687500032.0, "copy_num_tokens": 810.5625, "epoch": 1.0757722747000256, "gen_logits_max": 3.239917516708374, "gen_logits_mean": -15.246654510498047, "gen_logits_min": -27.9349365234375, "gen_logits_std": 2.9897403717041016, "gen_loss": 0.25517743825912476, "grad_norm": 0.4138002163118705, "learning_rate": 2.427663157894737e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9959163218736649, "mean_gen_accuracy": 0.8716814368963242, "mean_token_accuracy": 0.9027326852083206, "num_tokens": 344036804.0, "sample_num_tokens": 10093.0, "step": 5267, "total_num_tokens": 344077176.0, "z_loss": 0.000589987204875797 }, { "copy_logits_max": -3.661801338195801, "copy_logits_min": -625000064.0, "copy_num_tokens": 517.875, "epoch": 1.0759765126372223, "gen_logits_max": 3.459498882293701, "gen_logits_mean": -16.400623321533203, "gen_logits_min": -28.305355072021484, "gen_logits_std": 2.965205669403076, "gen_loss": 0.2664860785007477, "grad_norm": 0.38977925864567764, "learning_rate": 2.427536842105263e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9949191212654114, "mean_gen_accuracy": 0.8845418095588684, "mean_token_accuracy": 0.9075356125831604, "num_tokens": 344296158.0, "sample_num_tokens": 9034.0, "step": 5268, "total_num_tokens": 344332294.0, "z_loss": 0.0005664730560965836 }, { "copy_logits_max": -3.448324203491211, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.3125, "epoch": 1.0761807505744192, "gen_logits_max": 3.1569299697875977, "gen_logits_mean": -16.20693016052246, "gen_logits_min": -27.9714298248291, "gen_logits_std": 2.9614617824554443, "gen_loss": 0.2652066648006439, "grad_norm": 0.4122711139306491, "learning_rate": 2.4274105263157896e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9944848716259003, "mean_gen_accuracy": 0.8724038153886795, "mean_token_accuracy": 0.9011787325143814, "num_tokens": 344551586.0, "sample_num_tokens": 9018.5, "step": 5269, "total_num_tokens": 344587660.0, "z_loss": 0.00054657191503793 }, { "copy_logits_max": -4.914768218994141, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.125, "epoch": 1.0763849885116161, "gen_logits_max": 4.433872222900391, "gen_logits_mean": -16.049022674560547, "gen_logits_min": -27.894161224365234, "gen_logits_std": 2.945920944213867, "gen_loss": 0.310449481010437, "grad_norm": 0.37687882875718093, "learning_rate": 2.427284210526316e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9951010793447495, "mean_gen_accuracy": 0.87336465716362, "mean_token_accuracy": 0.9023628979921341, "num_tokens": 344819563.0, "sample_num_tokens": 7787.25, "step": 5270, "total_num_tokens": 344850712.0, "z_loss": 0.0006102290935814381 }, { "copy_logits_max": -3.0700325965881348, "copy_logits_min": -687500032.0, "copy_num_tokens": 558.75, "epoch": 1.0765892264488128, "gen_logits_max": 4.132741928100586, "gen_logits_mean": -13.928983688354492, "gen_logits_min": -26.409000396728516, "gen_logits_std": 2.925912380218506, "gen_loss": 0.24828410148620605, "grad_norm": 0.4009488617176757, "learning_rate": 2.427157894736842e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9951243996620178, "mean_gen_accuracy": 0.8798361420631409, "mean_token_accuracy": 0.9072173237800598, "num_tokens": 345082378.0, "sample_num_tokens": 9279.0, "step": 5271, "total_num_tokens": 345119494.0, "z_loss": 0.0005105943419039249 }, { "copy_logits_max": -4.724431991577148, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.25, "epoch": 1.0767934643860098, "gen_logits_max": 3.094404458999634, "gen_logits_mean": -16.39427375793457, "gen_logits_min": -28.492176055908203, "gen_logits_std": 2.960714340209961, "gen_loss": 0.25946736335754395, "grad_norm": 0.35766412409773907, "learning_rate": 2.4270315789473685e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9959112405776978, "mean_gen_accuracy": 0.8817813098430634, "mean_token_accuracy": 0.9072405695915222, "num_tokens": 345352911.0, "sample_num_tokens": 8764.25, "step": 5272, "total_num_tokens": 345387968.0, "z_loss": 0.0005779589409939945 }, { "copy_logits_max": -4.477468490600586, "copy_logits_min": -687500032.0, "copy_num_tokens": 462.9375, "epoch": 1.0769977023232065, "gen_logits_max": 2.9222679138183594, "gen_logits_mean": -17.280536651611328, "gen_logits_min": -29.27962303161621, "gen_logits_std": 3.006483793258667, "gen_loss": 0.27644920349121094, "grad_norm": 0.37343025847739497, "learning_rate": 2.4269052631578946e-05, "loss": 0.268, "mean_copy_accuracy": 0.9966908395290375, "mean_gen_accuracy": 0.876719281077385, "mean_token_accuracy": 0.9089185446500778, "num_tokens": 345644270.0, "sample_num_tokens": 8317.5, "step": 5273, "total_num_tokens": 345677540.0, "z_loss": 0.000535406288690865 }, { "copy_logits_max": -5.928736686706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.9375, "epoch": 1.0772019402604034, "gen_logits_max": 4.506425380706787, "gen_logits_mean": -14.082071304321289, "gen_logits_min": -25.834388732910156, "gen_logits_std": 2.9154701232910156, "gen_loss": 0.2712317705154419, "grad_norm": 0.5264627595594852, "learning_rate": 2.426778947368421e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9960846155881882, "mean_gen_accuracy": 0.8757618069648743, "mean_token_accuracy": 0.9079300314188004, "num_tokens": 345941765.0, "sample_num_tokens": 10585.25, "step": 5274, "total_num_tokens": 345984106.0, "z_loss": 0.0005121729918755591 }, { "copy_logits_max": -6.6933135986328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.9375, "epoch": 1.0774061781976, "gen_logits_max": 3.105478286743164, "gen_logits_mean": -16.73687171936035, "gen_logits_min": -28.448959350585938, "gen_logits_std": 2.975407600402832, "gen_loss": 0.24451866745948792, "grad_norm": 0.3821005277055708, "learning_rate": 2.4266526315789475e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9963313341140747, "mean_gen_accuracy": 0.8799502104520798, "mean_token_accuracy": 0.9089521020650864, "num_tokens": 346226990.0, "sample_num_tokens": 9034.0, "step": 5275, "total_num_tokens": 346263126.0, "z_loss": 0.0004122629470657557 }, { "copy_logits_max": -4.30627965927124, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.875, "epoch": 1.077610416134797, "gen_logits_max": 4.475908279418945, "gen_logits_mean": -13.847968101501465, "gen_logits_min": -25.72256851196289, "gen_logits_std": 2.907402753829956, "gen_loss": 0.28857678174972534, "grad_norm": 0.40712632919132913, "learning_rate": 2.426526315789474e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9962422400712967, "mean_gen_accuracy": 0.8692453801631927, "mean_token_accuracy": 0.9008621424436569, "num_tokens": 346480927.0, "sample_num_tokens": 8625.75, "step": 5276, "total_num_tokens": 346515430.0, "z_loss": 0.00048660291940905154 }, { "copy_logits_max": -6.656105041503906, "copy_logits_min": -750000000.0, "copy_num_tokens": 241.375, "epoch": 1.077814654071994, "gen_logits_max": 3.847813129425049, "gen_logits_mean": -16.19927978515625, "gen_logits_min": -27.483901977539062, "gen_logits_std": 2.9131951332092285, "gen_loss": 0.2845865488052368, "grad_norm": 0.38828736536594316, "learning_rate": 2.4264e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9953386634588242, "mean_gen_accuracy": 0.8735774457454681, "mean_token_accuracy": 0.9027897268533707, "num_tokens": 346756830.0, "sample_num_tokens": 6371.5, "step": 5277, "total_num_tokens": 346782316.0, "z_loss": 0.000524888513609767 }, { "copy_logits_max": -5.3125691413879395, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.5, "epoch": 1.0780188920091907, "gen_logits_max": 3.614743947982788, "gen_logits_mean": -15.425298690795898, "gen_logits_min": -27.19186019897461, "gen_logits_std": 2.9425101280212402, "gen_loss": 0.29968100786209106, "grad_norm": 0.42472934933210926, "learning_rate": 2.4262736842105264e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9951676577329636, "mean_gen_accuracy": 0.87443707883358, "mean_token_accuracy": 0.8992489278316498, "num_tokens": 347001830.0, "sample_num_tokens": 7341.5, "step": 5278, "total_num_tokens": 347031196.0, "z_loss": 0.0005550045752897859 }, { "copy_logits_max": -6.15199089050293, "copy_logits_min": -750000064.0, "copy_num_tokens": 492.0, "epoch": 1.0782231299463876, "gen_logits_max": 3.337296485900879, "gen_logits_mean": -15.967979431152344, "gen_logits_min": -28.04906463623047, "gen_logits_std": 2.9354865550994873, "gen_loss": 0.2968637943267822, "grad_norm": 0.3962174308441519, "learning_rate": 2.4261473684210525e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9963259547948837, "mean_gen_accuracy": 0.8714729249477386, "mean_token_accuracy": 0.9055372029542923, "num_tokens": 347284307.0, "sample_num_tokens": 8252.75, "step": 5279, "total_num_tokens": 347317318.0, "z_loss": 0.0005478953826241195 }, { "copy_logits_max": -6.403229236602783, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.125, "epoch": 1.0784273678835843, "gen_logits_max": 3.7522740364074707, "gen_logits_mean": -15.516202926635742, "gen_logits_min": -27.413028717041016, "gen_logits_std": 2.9599404335021973, "gen_loss": 0.26434382796287537, "grad_norm": 0.40024457705714606, "learning_rate": 2.426021052631579e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9941138029098511, "mean_gen_accuracy": 0.8754598647356033, "mean_token_accuracy": 0.9026171267032623, "num_tokens": 347522766.0, "sample_num_tokens": 7199.5, "step": 5280, "total_num_tokens": 347551564.0, "z_loss": 0.0005042307893745601 }, { "copy_logits_max": -7.285623550415039, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.5, "epoch": 1.0786316058207812, "gen_logits_max": 3.1171984672546387, "gen_logits_mean": -16.897422790527344, "gen_logits_min": -28.8692626953125, "gen_logits_std": 2.9583756923675537, "gen_loss": 0.2769315242767334, "grad_norm": 0.39871072064687013, "learning_rate": 2.425894736842105e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9954236596822739, "mean_gen_accuracy": 0.8722184002399445, "mean_token_accuracy": 0.902878537774086, "num_tokens": 347796181.0, "sample_num_tokens": 8432.75, "step": 5281, "total_num_tokens": 347829912.0, "z_loss": 0.0005154469981789589 }, { "copy_logits_max": -6.663782596588135, "copy_logits_min": -750000064.0, "copy_num_tokens": 235.875, "epoch": 1.0788358437579781, "gen_logits_max": 4.48846960067749, "gen_logits_mean": -16.58168601989746, "gen_logits_min": -28.26618194580078, "gen_logits_std": 2.954864501953125, "gen_loss": 0.308299720287323, "grad_norm": 0.4469233538156211, "learning_rate": 2.4257684210526315e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9952343106269836, "mean_gen_accuracy": 0.8712629079818726, "mean_token_accuracy": 0.8990506529808044, "num_tokens": 348044791.0, "sample_num_tokens": 5782.75, "step": 5282, "total_num_tokens": 348067922.0, "z_loss": 0.0006115284049883485 }, { "copy_logits_max": -6.075748920440674, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.75, "epoch": 1.0790400816951748, "gen_logits_max": 4.950425624847412, "gen_logits_mean": -14.918707847595215, "gen_logits_min": -26.85866928100586, "gen_logits_std": 2.9182190895080566, "gen_loss": 0.2810444235801697, "grad_norm": 0.3976694086184143, "learning_rate": 2.4256421052631583e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9959402233362198, "mean_gen_accuracy": 0.8800580650568008, "mean_token_accuracy": 0.9041474312543869, "num_tokens": 348310991.0, "sample_num_tokens": 8320.25, "step": 5283, "total_num_tokens": 348344272.0, "z_loss": 0.0005936553934589028 }, { "copy_logits_max": -3.0005478858947754, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0, "epoch": 1.0792443196323718, "gen_logits_max": 4.803901672363281, "gen_logits_mean": -13.407735824584961, "gen_logits_min": -25.331703186035156, "gen_logits_std": 2.8971991539001465, "gen_loss": 0.26875728368759155, "grad_norm": 0.4007560082733553, "learning_rate": 2.4255157894736844e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9955734759569168, "mean_gen_accuracy": 0.8677285313606262, "mean_token_accuracy": 0.8998730629682541, "num_tokens": 348576294.0, "sample_num_tokens": 8120.0, "step": 5284, "total_num_tokens": 348608774.0, "z_loss": 0.0005914565408602357 }, { "copy_logits_max": -6.785973072052002, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.0, "epoch": 1.0794485575695685, "gen_logits_max": 3.2517991065979004, "gen_logits_mean": -17.527053833007812, "gen_logits_min": -28.808074951171875, "gen_logits_std": 2.979259729385376, "gen_loss": 0.26597094535827637, "grad_norm": 0.45161656168305664, "learning_rate": 2.4253894736842108e-05, "loss": 0.267, "mean_copy_accuracy": 0.994572639465332, "mean_gen_accuracy": 0.881232738494873, "mean_token_accuracy": 0.9083746820688248, "num_tokens": 348837301.0, "sample_num_tokens": 9487.25, "step": 5285, "total_num_tokens": 348875250.0, "z_loss": 0.0004945078981108963 }, { "copy_logits_max": -5.038721084594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.125, "epoch": 1.0796527955067654, "gen_logits_max": 4.4008612632751465, "gen_logits_mean": -15.569241523742676, "gen_logits_min": -27.593000411987305, "gen_logits_std": 2.970245838165283, "gen_loss": 0.293191134929657, "grad_norm": 0.4287691400215239, "learning_rate": 2.425263157894737e-05, "loss": 0.2893, "mean_copy_accuracy": 0.995856374502182, "mean_gen_accuracy": 0.8747921288013458, "mean_token_accuracy": 0.9041862785816193, "num_tokens": 349118119.0, "sample_num_tokens": 8280.25, "step": 5286, "total_num_tokens": 349151240.0, "z_loss": 0.0005849433364346623 }, { "copy_logits_max": -5.9089860916137695, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.5, "epoch": 1.0798570334439623, "gen_logits_max": 4.994774341583252, "gen_logits_mean": -13.76704216003418, "gen_logits_min": -25.428924560546875, "gen_logits_std": 2.857982873916626, "gen_loss": 0.26141923666000366, "grad_norm": 0.4351176283072257, "learning_rate": 2.4251368421052633e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9937472194433212, "mean_gen_accuracy": 0.8754342496395111, "mean_token_accuracy": 0.9002903252840042, "num_tokens": 349382416.0, "sample_num_tokens": 7851.0, "step": 5287, "total_num_tokens": 349413820.0, "z_loss": 0.0005568215856328607 }, { "copy_logits_max": -3.4298605918884277, "copy_logits_min": -625000064.0, "copy_num_tokens": 603.25, "epoch": 1.080061271381159, "gen_logits_max": 4.083586692810059, "gen_logits_mean": -14.619181632995605, "gen_logits_min": -26.578548431396484, "gen_logits_std": 2.9272360801696777, "gen_loss": 0.261688232421875, "grad_norm": 0.3815598504892533, "learning_rate": 2.4250105263157894e-05, "loss": 0.257, "mean_copy_accuracy": 0.9956971108913422, "mean_gen_accuracy": 0.8803104013204575, "mean_token_accuracy": 0.913198858499527, "num_tokens": 349649475.0, "sample_num_tokens": 8464.25, "step": 5288, "total_num_tokens": 349683332.0, "z_loss": 0.0006628108094446361 }, { "copy_logits_max": -5.459561824798584, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.0, "epoch": 1.080265509318356, "gen_logits_max": 3.2079477310180664, "gen_logits_mean": -17.295040130615234, "gen_logits_min": -28.55771255493164, "gen_logits_std": 2.954723834991455, "gen_loss": 0.2524556517601013, "grad_norm": 0.39395211208925746, "learning_rate": 2.4248842105263158e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9957192838191986, "mean_gen_accuracy": 0.8774052858352661, "mean_token_accuracy": 0.906924232840538, "num_tokens": 349906180.0, "sample_num_tokens": 8170.5, "step": 5289, "total_num_tokens": 349938862.0, "z_loss": 0.0005008596344850957 }, { "copy_logits_max": -4.814190864562988, "copy_logits_min": -750000128.0, "copy_num_tokens": 499.0625, "epoch": 1.0804697472555527, "gen_logits_max": 4.144750595092773, "gen_logits_mean": -14.905057907104492, "gen_logits_min": -26.445716857910156, "gen_logits_std": 2.8972067832946777, "gen_loss": 0.30000412464141846, "grad_norm": 0.4172857244843837, "learning_rate": 2.424757894736842e-05, "loss": 0.2959, "mean_copy_accuracy": 0.994897797703743, "mean_gen_accuracy": 0.8742815256118774, "mean_token_accuracy": 0.901155099272728, "num_tokens": 350157471.0, "sample_num_tokens": 8463.25, "step": 5290, "total_num_tokens": 350191324.0, "z_loss": 0.0006381290149874985 }, { "copy_logits_max": -5.754620552062988, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.125, "epoch": 1.0806739851927496, "gen_logits_max": 4.96320915222168, "gen_logits_mean": -13.916496276855469, "gen_logits_min": -25.444292068481445, "gen_logits_std": 2.897822856903076, "gen_loss": 0.2722758948802948, "grad_norm": 0.40587526401436697, "learning_rate": 2.4246315789473687e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9959009736776352, "mean_gen_accuracy": 0.8729756027460098, "mean_token_accuracy": 0.9045535326004028, "num_tokens": 350440873.0, "sample_num_tokens": 8945.75, "step": 5291, "total_num_tokens": 350476656.0, "z_loss": 0.0005192076787352562 }, { "copy_logits_max": -7.004294395446777, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.875, "epoch": 1.0808782231299463, "gen_logits_max": 4.5200958251953125, "gen_logits_mean": -14.913069725036621, "gen_logits_min": -26.56505012512207, "gen_logits_std": 2.8942203521728516, "gen_loss": 0.3087455928325653, "grad_norm": 0.40225797836714733, "learning_rate": 2.4245052631578948e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9952520728111267, "mean_gen_accuracy": 0.8788741081953049, "mean_token_accuracy": 0.9068480730056763, "num_tokens": 350719752.0, "sample_num_tokens": 7938.5, "step": 5292, "total_num_tokens": 350751506.0, "z_loss": 0.0006469020736403763 }, { "copy_logits_max": -4.0830278396606445, "copy_logits_min": -750000064.0, "copy_num_tokens": 434.5625, "epoch": 1.0810824610671432, "gen_logits_max": 5.10893440246582, "gen_logits_mean": -13.943971633911133, "gen_logits_min": -26.51072883605957, "gen_logits_std": 2.895960807800293, "gen_loss": 0.2872063219547272, "grad_norm": 0.4188914392154072, "learning_rate": 2.4243789473684212e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9961449652910233, "mean_gen_accuracy": 0.8727374821901321, "mean_token_accuracy": 0.9037711173295975, "num_tokens": 350984952.0, "sample_num_tokens": 8629.5, "step": 5293, "total_num_tokens": 351019470.0, "z_loss": 0.0005772579461336136 }, { "copy_logits_max": -4.614845275878906, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.8125, "epoch": 1.0812866990043402, "gen_logits_max": 4.526406288146973, "gen_logits_mean": -14.482721328735352, "gen_logits_min": -26.628252029418945, "gen_logits_std": 2.927311897277832, "gen_loss": 0.2731627821922302, "grad_norm": 0.39009553818496423, "learning_rate": 2.4242526315789473e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9951998740434647, "mean_gen_accuracy": 0.8727859556674957, "mean_token_accuracy": 0.9001723229885101, "num_tokens": 351246574.0, "sample_num_tokens": 7931.0, "step": 5294, "total_num_tokens": 351278298.0, "z_loss": 0.0005296302260830998 }, { "copy_logits_max": -5.503237724304199, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.4375, "epoch": 1.0814909369415369, "gen_logits_max": 3.998377799987793, "gen_logits_mean": -15.818445205688477, "gen_logits_min": -27.844341278076172, "gen_logits_std": 2.960275411605835, "gen_loss": 0.29644960165023804, "grad_norm": 0.3810706003467512, "learning_rate": 2.4241263157894737e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9957704246044159, "mean_gen_accuracy": 0.8715019077062607, "mean_token_accuracy": 0.9068000465631485, "num_tokens": 351523654.0, "sample_num_tokens": 8209.0, "step": 5295, "total_num_tokens": 351556490.0, "z_loss": 0.0005654726992361248 }, { "copy_logits_max": -6.755209445953369, "copy_logits_min": -687500032.0, "copy_num_tokens": 329.9375, "epoch": 1.0816951748787338, "gen_logits_max": 4.974913120269775, "gen_logits_mean": -14.933515548706055, "gen_logits_min": -26.744129180908203, "gen_logits_std": 2.924804210662842, "gen_loss": 0.2959570288658142, "grad_norm": 0.4237800079026619, "learning_rate": 2.4240000000000002e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9953333884477615, "mean_gen_accuracy": 0.8728652745485306, "mean_token_accuracy": 0.9023830443620682, "num_tokens": 351799354.0, "sample_num_tokens": 7595.0, "step": 5296, "total_num_tokens": 351829734.0, "z_loss": 0.0005498966202139854 }, { "copy_logits_max": -5.656117916107178, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.3125, "epoch": 1.0818994128159305, "gen_logits_max": 3.9032397270202637, "gen_logits_mean": -15.487712860107422, "gen_logits_min": -27.893550872802734, "gen_logits_std": 2.9686806201934814, "gen_loss": 0.2596682608127594, "grad_norm": 0.40018383065739194, "learning_rate": 2.4238736842105263e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9964717328548431, "mean_gen_accuracy": 0.8714521825313568, "mean_token_accuracy": 0.9064513593912125, "num_tokens": 352086903.0, "sample_num_tokens": 8847.75, "step": 5297, "total_num_tokens": 352122294.0, "z_loss": 0.00048637413419783115 }, { "copy_logits_max": -6.216030597686768, "copy_logits_min": -687500032.0, "copy_num_tokens": 366.875, "epoch": 1.0821036507531274, "gen_logits_max": 4.26873254776001, "gen_logits_mean": -15.808258056640625, "gen_logits_min": -27.46924591064453, "gen_logits_std": 2.959977149963379, "gen_loss": 0.3091184198856354, "grad_norm": 0.41105909379252303, "learning_rate": 2.4237473684210527e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9963224083185196, "mean_gen_accuracy": 0.872894823551178, "mean_token_accuracy": 0.9020563513040543, "num_tokens": 352363746.0, "sample_num_tokens": 7764.0, "step": 5298, "total_num_tokens": 352394802.0, "z_loss": 0.0005691936821676791 }, { "copy_logits_max": -4.873546600341797, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.75, "epoch": 1.0823078886903241, "gen_logits_max": 5.053273677825928, "gen_logits_mean": -14.085107803344727, "gen_logits_min": -26.159961700439453, "gen_logits_std": 2.886340618133545, "gen_loss": 0.30337420105934143, "grad_norm": 0.3810371847644665, "learning_rate": 2.423621052631579e-05, "loss": 0.279, "mean_copy_accuracy": 0.9956967532634735, "mean_gen_accuracy": 0.8807975947856903, "mean_token_accuracy": 0.9086180776357651, "num_tokens": 352635660.0, "sample_num_tokens": 7405.0, "step": 5299, "total_num_tokens": 352665280.0, "z_loss": 0.000627052562776953 }, { "copy_logits_max": -5.507930755615234, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.375, "epoch": 1.082512126627521, "gen_logits_max": 4.647870063781738, "gen_logits_mean": -14.028998374938965, "gen_logits_min": -26.11927032470703, "gen_logits_std": 2.9265987873077393, "gen_loss": 0.26144808530807495, "grad_norm": 0.3593485832608707, "learning_rate": 2.4234947368421056e-05, "loss": 0.2728, "mean_copy_accuracy": 0.996408611536026, "mean_gen_accuracy": 0.8758515566587448, "mean_token_accuracy": 0.9049344658851624, "num_tokens": 352925825.0, "sample_num_tokens": 8507.75, "step": 5300, "total_num_tokens": 352959856.0, "z_loss": 0.0005127517506480217 }, { "copy_logits_max": -4.177050590515137, "copy_logits_min": -750000000.0, "copy_num_tokens": 776.25, "epoch": 1.082716364564718, "gen_logits_max": 3.536675214767456, "gen_logits_mean": -15.430196762084961, "gen_logits_min": -27.696590423583984, "gen_logits_std": 2.9839415550231934, "gen_loss": 0.24474093317985535, "grad_norm": 0.38933081884426, "learning_rate": 2.4233684210526316e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9963888227939606, "mean_gen_accuracy": 0.8758786916732788, "mean_token_accuracy": 0.9041013568639755, "num_tokens": 353187376.0, "sample_num_tokens": 11131.0, "step": 5301, "total_num_tokens": 353231900.0, "z_loss": 0.00045259363832883537 }, { "copy_logits_max": -5.542928695678711, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.375, "epoch": 1.0829206025019147, "gen_logits_max": 4.78801155090332, "gen_logits_mean": -14.094539642333984, "gen_logits_min": -26.032901763916016, "gen_logits_std": 2.9373598098754883, "gen_loss": 0.25902003049850464, "grad_norm": 0.3746281982487516, "learning_rate": 2.423242105263158e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9950360953807831, "mean_gen_accuracy": 0.8831832259893417, "mean_token_accuracy": 0.9083158373832703, "num_tokens": 353466528.0, "sample_num_tokens": 7664.0, "step": 5302, "total_num_tokens": 353497184.0, "z_loss": 0.0004886115784756839 }, { "copy_logits_max": -5.484068870544434, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.875, "epoch": 1.0831248404391116, "gen_logits_max": 4.073739528656006, "gen_logits_mean": -16.26806640625, "gen_logits_min": -27.937030792236328, "gen_logits_std": 2.9740796089172363, "gen_loss": 0.29337188601493835, "grad_norm": 0.38408818489873053, "learning_rate": 2.4231157894736842e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9963064044713974, "mean_gen_accuracy": 0.8708907514810562, "mean_token_accuracy": 0.9036029726266861, "num_tokens": 353749201.0, "sample_num_tokens": 8471.75, "step": 5303, "total_num_tokens": 353783088.0, "z_loss": 0.0005324656958691776 }, { "copy_logits_max": -4.8944196701049805, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.3125, "epoch": 1.0833290783763083, "gen_logits_max": 4.985772132873535, "gen_logits_mean": -15.12808609008789, "gen_logits_min": -27.041818618774414, "gen_logits_std": 2.9940168857574463, "gen_loss": 0.2932269871234894, "grad_norm": 0.3917357525155199, "learning_rate": 2.4229894736842106e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9959277957677841, "mean_gen_accuracy": 0.8715783953666687, "mean_token_accuracy": 0.8997319340705872, "num_tokens": 354008732.0, "sample_num_tokens": 8526.0, "step": 5304, "total_num_tokens": 354042836.0, "z_loss": 0.0006172737339511514 }, { "copy_logits_max": -5.918192386627197, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.0, "epoch": 1.0835333163135052, "gen_logits_max": 5.611724853515625, "gen_logits_mean": -14.551146507263184, "gen_logits_min": -26.18004035949707, "gen_logits_std": 2.9208872318267822, "gen_loss": 0.24364149570465088, "grad_norm": 0.38240733198901405, "learning_rate": 2.4228631578947367e-05, "loss": 0.287, "mean_copy_accuracy": 0.9963921755552292, "mean_gen_accuracy": 0.8745249360799789, "mean_token_accuracy": 0.9019272327423096, "num_tokens": 354287784.0, "sample_num_tokens": 8717.5, "step": 5305, "total_num_tokens": 354322654.0, "z_loss": 0.000482970557641238 }, { "copy_logits_max": -3.0407705307006836, "copy_logits_min": -687500032.0, "copy_num_tokens": 715.1875, "epoch": 1.083737554250702, "gen_logits_max": 3.462559223175049, "gen_logits_mean": -14.912473678588867, "gen_logits_min": -27.112022399902344, "gen_logits_std": 3.01204776763916, "gen_loss": 0.2275613248348236, "grad_norm": 0.3865863897534902, "learning_rate": 2.422736842105263e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9964137971401215, "mean_gen_accuracy": 0.8749908953905106, "mean_token_accuracy": 0.9075154066085815, "num_tokens": 354562221.0, "sample_num_tokens": 9534.75, "step": 5306, "total_num_tokens": 354600360.0, "z_loss": 0.00044487143168225884 }, { "copy_logits_max": -3.0223793983459473, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.375, "epoch": 1.0839417921878989, "gen_logits_max": 3.855870246887207, "gen_logits_mean": -15.912775039672852, "gen_logits_min": -27.922367095947266, "gen_logits_std": 2.985614776611328, "gen_loss": 0.2747436761856079, "grad_norm": 0.3506091496244878, "learning_rate": 2.4226105263157892e-05, "loss": 0.266, "mean_copy_accuracy": 0.9969558268785477, "mean_gen_accuracy": 0.8772683292627335, "mean_token_accuracy": 0.908321276307106, "num_tokens": 354843959.0, "sample_num_tokens": 7597.75, "step": 5307, "total_num_tokens": 354874350.0, "z_loss": 0.00060558773111552 }, { "copy_logits_max": -3.2225327491760254, "copy_logits_min": -750000128.0, "copy_num_tokens": 549.3125, "epoch": 1.0841460301250958, "gen_logits_max": 4.249634742736816, "gen_logits_mean": -14.823720932006836, "gen_logits_min": -27.44974708557129, "gen_logits_std": 3.0005788803100586, "gen_loss": 0.28632304072380066, "grad_norm": 0.40446404200449465, "learning_rate": 2.422484210526316e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9965521842241287, "mean_gen_accuracy": 0.877472847700119, "mean_token_accuracy": 0.9065091013908386, "num_tokens": 355107098.0, "sample_num_tokens": 8777.0, "step": 5308, "total_num_tokens": 355142206.0, "z_loss": 0.0005868431180715561 }, { "copy_logits_max": -1.7766129970550537, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.5, "epoch": 1.0843502680622925, "gen_logits_max": 4.720950126647949, "gen_logits_mean": -13.789122581481934, "gen_logits_min": -26.271320343017578, "gen_logits_std": 2.9611198902130127, "gen_loss": 0.29768115282058716, "grad_norm": 0.43235945731344233, "learning_rate": 2.4223578947368424e-05, "loss": 0.3124, "mean_copy_accuracy": 0.9952571541070938, "mean_gen_accuracy": 0.8693170249462128, "mean_token_accuracy": 0.8971691727638245, "num_tokens": 355362443.0, "sample_num_tokens": 9024.25, "step": 5309, "total_num_tokens": 355398540.0, "z_loss": 0.000631809642072767 }, { "copy_logits_max": -4.141942501068115, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.75, "epoch": 1.0845545059994894, "gen_logits_max": 3.843057632446289, "gen_logits_mean": -15.73845386505127, "gen_logits_min": -27.884868621826172, "gen_logits_std": 3.000795364379883, "gen_loss": 0.2470434159040451, "grad_norm": 0.39826911425541445, "learning_rate": 2.4222315789473685e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9965078979730606, "mean_gen_accuracy": 0.8786623179912567, "mean_token_accuracy": 0.9046895056962967, "num_tokens": 355632888.0, "sample_num_tokens": 8747.0, "step": 5310, "total_num_tokens": 355667876.0, "z_loss": 0.0005299529293552041 }, { "copy_logits_max": -2.3925108909606934, "copy_logits_min": -750000000.0, "copy_num_tokens": 744.3125, "epoch": 1.0847587439366861, "gen_logits_max": 4.267860412597656, "gen_logits_mean": -14.470096588134766, "gen_logits_min": -27.833145141601562, "gen_logits_std": 3.00978422164917, "gen_loss": 0.2466236799955368, "grad_norm": 0.380097573866976, "learning_rate": 2.422105263157895e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9958338737487793, "mean_gen_accuracy": 0.8723513334989548, "mean_token_accuracy": 0.9081760048866272, "num_tokens": 355916332.0, "sample_num_tokens": 10108.0, "step": 5311, "total_num_tokens": 355956764.0, "z_loss": 0.0005145570612512529 }, { "copy_logits_max": -4.43989372253418, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.8125, "epoch": 1.084962981873883, "gen_logits_max": 3.1630072593688965, "gen_logits_mean": -16.870094299316406, "gen_logits_min": -29.40550994873047, "gen_logits_std": 3.0469322204589844, "gen_loss": 0.2292957603931427, "grad_norm": 0.3729835232793805, "learning_rate": 2.421978947368421e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9971769452095032, "mean_gen_accuracy": 0.8804945945739746, "mean_token_accuracy": 0.9112924784421921, "num_tokens": 356213187.0, "sample_num_tokens": 9453.75, "step": 5312, "total_num_tokens": 356251002.0, "z_loss": 0.0004633645585272461 }, { "copy_logits_max": -2.9459619522094727, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.8125, "epoch": 1.08516721981108, "gen_logits_max": 4.871508598327637, "gen_logits_mean": -15.63711929321289, "gen_logits_min": -27.97052764892578, "gen_logits_std": 2.983755588531494, "gen_loss": 0.32133224606513977, "grad_norm": 0.39449984550707184, "learning_rate": 2.4218526315789475e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9969302862882614, "mean_gen_accuracy": 0.8675058335065842, "mean_token_accuracy": 0.8998861014842987, "num_tokens": 356490202.0, "sample_num_tokens": 8748.5, "step": 5313, "total_num_tokens": 356525196.0, "z_loss": 0.0005825480911880732 }, { "copy_logits_max": -5.056875228881836, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.25, "epoch": 1.0853714577482767, "gen_logits_max": 4.455244064331055, "gen_logits_mean": -15.237628936767578, "gen_logits_min": -27.229206085205078, "gen_logits_std": 2.9689383506774902, "gen_loss": 0.2869374454021454, "grad_norm": 0.3742278794903416, "learning_rate": 2.4217263157894736e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9964168816804886, "mean_gen_accuracy": 0.8827614933252335, "mean_token_accuracy": 0.9115208238363266, "num_tokens": 356775998.0, "sample_num_tokens": 8984.0, "step": 5314, "total_num_tokens": 356811934.0, "z_loss": 0.0005524504231289029 }, { "copy_logits_max": -3.4442920684814453, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.3125, "epoch": 1.0855756956854736, "gen_logits_max": 4.824154853820801, "gen_logits_mean": -13.974117279052734, "gen_logits_min": -25.88798713684082, "gen_logits_std": 2.9495697021484375, "gen_loss": 0.2636309862136841, "grad_norm": 0.3751486253991219, "learning_rate": 2.4216e-05, "loss": 0.268, "mean_copy_accuracy": 0.9964365363121033, "mean_gen_accuracy": 0.88099205493927, "mean_token_accuracy": 0.9083849191665649, "num_tokens": 357042666.0, "sample_num_tokens": 7848.0, "step": 5315, "total_num_tokens": 357074058.0, "z_loss": 0.0004887906834483147 }, { "copy_logits_max": -3.9784631729125977, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.0625, "epoch": 1.0857799336226703, "gen_logits_max": 4.2876787185668945, "gen_logits_mean": -15.692264556884766, "gen_logits_min": -27.888385772705078, "gen_logits_std": 3.0004959106445312, "gen_loss": 0.28988897800445557, "grad_norm": 0.4032302160359564, "learning_rate": 2.4214736842105264e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9959303736686707, "mean_gen_accuracy": 0.8648578375577927, "mean_token_accuracy": 0.8955765068531036, "num_tokens": 357323886.0, "sample_num_tokens": 9281.5, "step": 5316, "total_num_tokens": 357361012.0, "z_loss": 0.0005981768481433392 }, { "copy_logits_max": -4.701547622680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.25, "epoch": 1.0859841715598673, "gen_logits_max": 4.947939395904541, "gen_logits_mean": -15.044999122619629, "gen_logits_min": -26.982242584228516, "gen_logits_std": 2.9737637042999268, "gen_loss": 0.2606573700904846, "grad_norm": 0.40980161493862044, "learning_rate": 2.421347368421053e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9964561462402344, "mean_gen_accuracy": 0.8783500641584396, "mean_token_accuracy": 0.9037530720233917, "num_tokens": 357588221.0, "sample_num_tokens": 8074.75, "step": 5317, "total_num_tokens": 357620520.0, "z_loss": 0.0005278434837237 }, { "copy_logits_max": -3.601921319961548, "copy_logits_min": -687500032.0, "copy_num_tokens": 658.6875, "epoch": 1.0861884094970642, "gen_logits_max": 3.697559356689453, "gen_logits_mean": -16.10993194580078, "gen_logits_min": -28.138011932373047, "gen_logits_std": 3.0063490867614746, "gen_loss": 0.29579758644104004, "grad_norm": 0.3792960225185316, "learning_rate": 2.421221052631579e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9965220540761948, "mean_gen_accuracy": 0.8717009723186493, "mean_token_accuracy": 0.906461775302887, "num_tokens": 357856402.0, "sample_num_tokens": 9628.5, "step": 5318, "total_num_tokens": 357894916.0, "z_loss": 0.000641466467641294 }, { "copy_logits_max": -4.840175628662109, "copy_logits_min": -750000000.0, "copy_num_tokens": 199.1875, "epoch": 1.086392647434261, "gen_logits_max": 5.410891056060791, "gen_logits_mean": -15.42344856262207, "gen_logits_min": -27.194889068603516, "gen_logits_std": 2.9803640842437744, "gen_loss": 0.29877549409866333, "grad_norm": 0.4191061816546362, "learning_rate": 2.4210947368421054e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9952195733785629, "mean_gen_accuracy": 0.8720749616622925, "mean_token_accuracy": 0.8985497057437897, "num_tokens": 358116039.0, "sample_num_tokens": 5672.75, "step": 5319, "total_num_tokens": 358138730.0, "z_loss": 0.0005496181547641754 }, { "copy_logits_max": -3.7876718044281006, "copy_logits_min": -750000064.0, "copy_num_tokens": 416.125, "epoch": 1.0865968853714578, "gen_logits_max": 4.4176740646362305, "gen_logits_mean": -15.366479873657227, "gen_logits_min": -27.44208335876465, "gen_logits_std": 2.9690194129943848, "gen_loss": 0.28992533683776855, "grad_norm": 0.4578399018613977, "learning_rate": 2.4209684210526315e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9954775273799896, "mean_gen_accuracy": 0.8710200637578964, "mean_token_accuracy": 0.8999735414981842, "num_tokens": 358355531.0, "sample_num_tokens": 7737.75, "step": 5320, "total_num_tokens": 358386482.0, "z_loss": 0.0005505455774255097 }, { "copy_logits_max": -2.7802319526672363, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.625, "epoch": 1.0868011233086545, "gen_logits_max": 5.255296230316162, "gen_logits_mean": -15.11740779876709, "gen_logits_min": -27.31472396850586, "gen_logits_std": 2.9797749519348145, "gen_loss": 0.28041940927505493, "grad_norm": 0.3951581188757922, "learning_rate": 2.420842105263158e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9950794279575348, "mean_gen_accuracy": 0.8804595023393631, "mean_token_accuracy": 0.9096461534500122, "num_tokens": 358624027.0, "sample_num_tokens": 9157.25, "step": 5321, "total_num_tokens": 358660656.0, "z_loss": 0.0005781835643574595 }, { "copy_logits_max": -5.539520263671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.6875, "epoch": 1.0870053612458515, "gen_logits_max": 4.228143692016602, "gen_logits_mean": -16.352415084838867, "gen_logits_min": -28.03058624267578, "gen_logits_std": 3.0053391456604004, "gen_loss": 0.2912063002586365, "grad_norm": 0.44690735968893547, "learning_rate": 2.4207157894736843e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9941838830709457, "mean_gen_accuracy": 0.8759868890047073, "mean_token_accuracy": 0.9019362032413483, "num_tokens": 358907736.0, "sample_num_tokens": 8615.5, "step": 5322, "total_num_tokens": 358942198.0, "z_loss": 0.0006081019528210163 }, { "copy_logits_max": -5.068164348602295, "copy_logits_min": -750000064.0, "copy_num_tokens": 409.0625, "epoch": 1.0872095991830482, "gen_logits_max": 3.1236422061920166, "gen_logits_mean": -17.639278411865234, "gen_logits_min": -29.274394989013672, "gen_logits_std": 3.00886869430542, "gen_loss": 0.22693157196044922, "grad_norm": 0.37771497606500276, "learning_rate": 2.4205894736842104e-05, "loss": 0.2504, "mean_copy_accuracy": 0.9969248920679092, "mean_gen_accuracy": 0.8877336531877518, "mean_token_accuracy": 0.9156273752450943, "num_tokens": 359170679.0, "sample_num_tokens": 7812.25, "step": 5323, "total_num_tokens": 359201928.0, "z_loss": 0.0004707801854237914 }, { "copy_logits_max": -0.3532804548740387, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.75, "epoch": 1.087413837120245, "gen_logits_max": 4.874079704284668, "gen_logits_mean": -13.823619842529297, "gen_logits_min": -25.891250610351562, "gen_logits_std": 2.9780068397521973, "gen_loss": 0.25607094168663025, "grad_norm": 0.4007163853471792, "learning_rate": 2.4204631578947372e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9970944225788116, "mean_gen_accuracy": 0.8787945657968521, "mean_token_accuracy": 0.9090655297040939, "num_tokens": 359454434.0, "sample_num_tokens": 8581.0, "step": 5324, "total_num_tokens": 359488758.0, "z_loss": 0.00047754080151207745 }, { "copy_logits_max": -3.1339378356933594, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.75, "epoch": 1.087618075057442, "gen_logits_max": 4.709386825561523, "gen_logits_mean": -14.813215255737305, "gen_logits_min": -27.00899887084961, "gen_logits_std": 2.9930055141448975, "gen_loss": 0.2937595844268799, "grad_norm": 0.3951166200699795, "learning_rate": 2.4203368421052633e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9964386075735092, "mean_gen_accuracy": 0.8728399872779846, "mean_token_accuracy": 0.9071657657623291, "num_tokens": 359732697.0, "sample_num_tokens": 7211.75, "step": 5325, "total_num_tokens": 359761544.0, "z_loss": 0.000554540369194001 }, { "copy_logits_max": -3.719048500061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.375, "epoch": 1.0878223129946387, "gen_logits_max": 3.8466789722442627, "gen_logits_mean": -15.704206466674805, "gen_logits_min": -27.64634132385254, "gen_logits_std": 2.9816527366638184, "gen_loss": 0.28809136152267456, "grad_norm": 0.4478523046370424, "learning_rate": 2.4202105263157897e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9959481060504913, "mean_gen_accuracy": 0.878862202167511, "mean_token_accuracy": 0.9077956527471542, "num_tokens": 359991013.0, "sample_num_tokens": 8060.25, "step": 5326, "total_num_tokens": 360023254.0, "z_loss": 0.0005593341775238514 }, { "copy_logits_max": -2.195204496383667, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.375, "epoch": 1.0880265509318356, "gen_logits_max": 3.015160322189331, "gen_logits_mean": -17.327791213989258, "gen_logits_min": -29.425559997558594, "gen_logits_std": 3.028958320617676, "gen_loss": 0.24173930287361145, "grad_norm": 0.3884318132154659, "learning_rate": 2.4200842105263158e-05, "loss": 0.274, "mean_copy_accuracy": 0.994293749332428, "mean_gen_accuracy": 0.8811464458703995, "mean_token_accuracy": 0.9068146795034409, "num_tokens": 360252954.0, "sample_num_tokens": 9127.0, "step": 5327, "total_num_tokens": 360289462.0, "z_loss": 0.0005709553952328861 }, { "copy_logits_max": -3.108985662460327, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.5, "epoch": 1.0882307888690324, "gen_logits_max": 2.668623685836792, "gen_logits_mean": -17.98659896850586, "gen_logits_min": -30.026729583740234, "gen_logits_std": 3.0349342823028564, "gen_loss": 0.2597644627094269, "grad_norm": 0.39012124976368107, "learning_rate": 2.4199578947368422e-05, "loss": 0.2778, "mean_copy_accuracy": 0.99563068151474, "mean_gen_accuracy": 0.8747844845056534, "mean_token_accuracy": 0.9065330177545547, "num_tokens": 360518110.0, "sample_num_tokens": 8830.5, "step": 5328, "total_num_tokens": 360553432.0, "z_loss": 0.0006258632056415081 }, { "copy_logits_max": -4.314951419830322, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.0625, "epoch": 1.0884350268062293, "gen_logits_max": 4.531170845031738, "gen_logits_mean": -15.781060218811035, "gen_logits_min": -27.796371459960938, "gen_logits_std": 2.983607769012451, "gen_loss": 0.2823193669319153, "grad_norm": 0.3926515276659597, "learning_rate": 2.4198315789473683e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9951921552419662, "mean_gen_accuracy": 0.876159742474556, "mean_token_accuracy": 0.9033587127923965, "num_tokens": 360763758.0, "sample_num_tokens": 6747.5, "step": 5329, "total_num_tokens": 360790748.0, "z_loss": 0.000690881279297173 }, { "copy_logits_max": -5.47784423828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5625, "epoch": 1.088639264743426, "gen_logits_max": 3.9074795246124268, "gen_logits_mean": -15.780646324157715, "gen_logits_min": -27.644214630126953, "gen_logits_std": 2.957756519317627, "gen_loss": 0.3057214021682739, "grad_norm": 0.41264538827310937, "learning_rate": 2.4197052631578948e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9960573464632034, "mean_gen_accuracy": 0.8707157075405121, "mean_token_accuracy": 0.9035647809505463, "num_tokens": 361041411.0, "sample_num_tokens": 7630.75, "step": 5330, "total_num_tokens": 361071934.0, "z_loss": 0.0006014234386384487 }, { "copy_logits_max": -4.497981071472168, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.625, "epoch": 1.088843502680623, "gen_logits_max": 2.708007335662842, "gen_logits_mean": -17.162445068359375, "gen_logits_min": -29.252822875976562, "gen_logits_std": 3.000056266784668, "gen_loss": 0.2607337236404419, "grad_norm": 0.43634154637466, "learning_rate": 2.419578947368421e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9961535334587097, "mean_gen_accuracy": 0.877777099609375, "mean_token_accuracy": 0.9073207378387451, "num_tokens": 361306071.0, "sample_num_tokens": 9127.75, "step": 5331, "total_num_tokens": 361342582.0, "z_loss": 0.000602876883931458 }, { "copy_logits_max": -6.996176719665527, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.0625, "epoch": 1.0890477406178198, "gen_logits_max": 4.391464710235596, "gen_logits_mean": -16.789838790893555, "gen_logits_min": -28.419010162353516, "gen_logits_std": 2.986863136291504, "gen_loss": 0.29740285873413086, "grad_norm": 0.4123116232814332, "learning_rate": 2.4194526315789476e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9963998049497604, "mean_gen_accuracy": 0.8705978244543076, "mean_token_accuracy": 0.9018810540437698, "num_tokens": 361577298.0, "sample_num_tokens": 7408.0, "step": 5332, "total_num_tokens": 361606930.0, "z_loss": 0.0005944079603068531 }, { "copy_logits_max": -5.639108657836914, "copy_logits_min": -687500032.0, "copy_num_tokens": 618.0625, "epoch": 1.0892519785550165, "gen_logits_max": 2.347205400466919, "gen_logits_mean": -17.925251007080078, "gen_logits_min": -29.866994857788086, "gen_logits_std": 3.0267276763916016, "gen_loss": 0.25947436690330505, "grad_norm": 0.39558884239705466, "learning_rate": 2.4193263157894737e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9964883327484131, "mean_gen_accuracy": 0.8746021687984467, "mean_token_accuracy": 0.9066805690526962, "num_tokens": 361861726.0, "sample_num_tokens": 8793.0, "step": 5333, "total_num_tokens": 361896898.0, "z_loss": 0.0005288006504997611 }, { "copy_logits_max": -4.233373641967773, "copy_logits_min": -687500032.0, "copy_num_tokens": 565.75, "epoch": 1.0894562164922135, "gen_logits_max": 3.4107460975646973, "gen_logits_mean": -15.039992332458496, "gen_logits_min": -27.230051040649414, "gen_logits_std": 2.968308210372925, "gen_loss": 0.2566267251968384, "grad_norm": 0.3934524061659338, "learning_rate": 2.4192e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9957027435302734, "mean_gen_accuracy": 0.8740426301956177, "mean_token_accuracy": 0.9028382450342178, "num_tokens": 362121771.0, "sample_num_tokens": 8224.25, "step": 5334, "total_num_tokens": 362154668.0, "z_loss": 0.0005980342393741012 }, { "copy_logits_max": -5.726772308349609, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.5625, "epoch": 1.0896604544294102, "gen_logits_max": 4.8278656005859375, "gen_logits_mean": -14.787874221801758, "gen_logits_min": -27.124887466430664, "gen_logits_std": 2.9457883834838867, "gen_loss": 0.3106904923915863, "grad_norm": 0.42016946919851994, "learning_rate": 2.4190736842105262e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9952725917100906, "mean_gen_accuracy": 0.8649091571569443, "mean_token_accuracy": 0.8987332433462143, "num_tokens": 362409915.0, "sample_num_tokens": 8589.25, "step": 5335, "total_num_tokens": 362444272.0, "z_loss": 0.0006670580478385091 }, { "copy_logits_max": -5.151106834411621, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.125, "epoch": 1.089864692366607, "gen_logits_max": 3.7405846118927, "gen_logits_mean": -15.769533157348633, "gen_logits_min": -28.139480590820312, "gen_logits_std": 3.002933979034424, "gen_loss": 0.2913796603679657, "grad_norm": 0.3940936176465335, "learning_rate": 2.4189473684210527e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9964612722396851, "mean_gen_accuracy": 0.8649911284446716, "mean_token_accuracy": 0.9002037942409515, "num_tokens": 362688159.0, "sample_num_tokens": 9089.25, "step": 5336, "total_num_tokens": 362724516.0, "z_loss": 0.0006191973807290196 }, { "copy_logits_max": -5.550040245056152, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.875, "epoch": 1.090068930303804, "gen_logits_max": 3.68375563621521, "gen_logits_mean": -16.92659568786621, "gen_logits_min": -29.130313873291016, "gen_logits_std": 3.012052059173584, "gen_loss": 0.25638774037361145, "grad_norm": 0.37820259176042553, "learning_rate": 2.418821052631579e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9960350543260574, "mean_gen_accuracy": 0.8714286834001541, "mean_token_accuracy": 0.9059035629034042, "num_tokens": 362956467.0, "sample_num_tokens": 9190.25, "step": 5337, "total_num_tokens": 362993228.0, "z_loss": 0.0006443981546908617 }, { "copy_logits_max": -3.9102988243103027, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.3125, "epoch": 1.0902731682410007, "gen_logits_max": 3.9624929428100586, "gen_logits_mean": -15.069009780883789, "gen_logits_min": -27.25096893310547, "gen_logits_std": 2.9816410541534424, "gen_loss": 0.28096890449523926, "grad_norm": 0.38963896590101776, "learning_rate": 2.4186947368421052e-05, "loss": 0.2894, "mean_copy_accuracy": 0.995776042342186, "mean_gen_accuracy": 0.8699375092983246, "mean_token_accuracy": 0.9033697992563248, "num_tokens": 363254487.0, "sample_num_tokens": 9359.25, "step": 5338, "total_num_tokens": 363291924.0, "z_loss": 0.0006932604592293501 }, { "copy_logits_max": -4.9832305908203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.125, "epoch": 1.0904774061781977, "gen_logits_max": 4.876969337463379, "gen_logits_mean": -15.718729019165039, "gen_logits_min": -27.77056884765625, "gen_logits_std": 2.9903857707977295, "gen_loss": 0.30726492404937744, "grad_norm": 0.449944996571846, "learning_rate": 2.4185684210526316e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9954223334789276, "mean_gen_accuracy": 0.8775526434183121, "mean_token_accuracy": 0.9033659845590591, "num_tokens": 363519172.0, "sample_num_tokens": 7621.0, "step": 5339, "total_num_tokens": 363549656.0, "z_loss": 0.0006957664154469967 }, { "copy_logits_max": -4.575284004211426, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.75, "epoch": 1.0906816441153944, "gen_logits_max": 5.976778507232666, "gen_logits_mean": -13.85661506652832, "gen_logits_min": -26.48136329650879, "gen_logits_std": 2.9855122566223145, "gen_loss": 0.3088158369064331, "grad_norm": 0.4317533930469984, "learning_rate": 2.418442105263158e-05, "loss": 0.279, "mean_copy_accuracy": 0.9952414333820343, "mean_gen_accuracy": 0.873897060751915, "mean_token_accuracy": 0.907004103064537, "num_tokens": 363797018.0, "sample_num_tokens": 8009.0, "step": 5340, "total_num_tokens": 363829054.0, "z_loss": 0.0006681721424683928 }, { "copy_logits_max": -7.190277099609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.875, "epoch": 1.0908858820525913, "gen_logits_max": 4.131775856018066, "gen_logits_mean": -15.894588470458984, "gen_logits_min": -28.131793975830078, "gen_logits_std": 3.0245413780212402, "gen_loss": 0.2334507703781128, "grad_norm": 0.381562093760864, "learning_rate": 2.4183157894736845e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9949282705783844, "mean_gen_accuracy": 0.8837149739265442, "mean_token_accuracy": 0.9103540778160095, "num_tokens": 364054124.0, "sample_num_tokens": 8578.0, "step": 5341, "total_num_tokens": 364088436.0, "z_loss": 0.0005116382380947471 }, { "copy_logits_max": -6.249718189239502, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.375, "epoch": 1.0910901199897882, "gen_logits_max": 4.290591239929199, "gen_logits_mean": -16.208995819091797, "gen_logits_min": -28.09174919128418, "gen_logits_std": 2.9914188385009766, "gen_loss": 0.3305814862251282, "grad_norm": 0.35576481587921505, "learning_rate": 2.4181894736842106e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9961136430501938, "mean_gen_accuracy": 0.871087372303009, "mean_token_accuracy": 0.8999281674623489, "num_tokens": 364336608.0, "sample_num_tokens": 9532.0, "step": 5342, "total_num_tokens": 364374736.0, "z_loss": 0.000613293843343854 }, { "copy_logits_max": -4.691194534301758, "copy_logits_min": -750000128.0, "copy_num_tokens": 676.0, "epoch": 1.091294357926985, "gen_logits_max": 4.043663024902344, "gen_logits_mean": -15.164644241333008, "gen_logits_min": -27.803918838500977, "gen_logits_std": 2.977987766265869, "gen_loss": 0.2689337730407715, "grad_norm": 0.3616430488429696, "learning_rate": 2.418063157894737e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9970718622207642, "mean_gen_accuracy": 0.8756188601255417, "mean_token_accuracy": 0.9117521792650223, "num_tokens": 364616613.0, "sample_num_tokens": 9531.25, "step": 5343, "total_num_tokens": 364654738.0, "z_loss": 0.0006821099668741226 }, { "copy_logits_max": -5.162837982177734, "copy_logits_min": -687500032.0, "copy_num_tokens": 384.875, "epoch": 1.0914985958641819, "gen_logits_max": 4.916839599609375, "gen_logits_mean": -14.867802619934082, "gen_logits_min": -26.957216262817383, "gen_logits_std": 2.9564783573150635, "gen_loss": 0.3430538773536682, "grad_norm": 0.3850891953091762, "learning_rate": 2.417936842105263e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9959065616130829, "mean_gen_accuracy": 0.8665305823087692, "mean_token_accuracy": 0.8961580544710159, "num_tokens": 364890001.0, "sample_num_tokens": 8173.25, "step": 5344, "total_num_tokens": 364922694.0, "z_loss": 0.0006901360466144979 }, { "copy_logits_max": -7.198813438415527, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.3125, "epoch": 1.0917028338013786, "gen_logits_max": 4.2038702964782715, "gen_logits_mean": -16.14170265197754, "gen_logits_min": -28.25836753845215, "gen_logits_std": 3.0095129013061523, "gen_loss": 0.26498904824256897, "grad_norm": 0.4678202861301226, "learning_rate": 2.4178105263157895e-05, "loss": 0.283, "mean_copy_accuracy": 0.994735911488533, "mean_gen_accuracy": 0.8787694424390793, "mean_token_accuracy": 0.9050324410200119, "num_tokens": 365148439.0, "sample_num_tokens": 8501.25, "step": 5345, "total_num_tokens": 365182444.0, "z_loss": 0.0005008015432395041 }, { "copy_logits_max": -5.31386661529541, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.375, "epoch": 1.0919070717385755, "gen_logits_max": 3.7101120948791504, "gen_logits_mean": -16.883708953857422, "gen_logits_min": -28.926380157470703, "gen_logits_std": 3.039370536804199, "gen_loss": 0.2801035940647125, "grad_norm": 0.41296854723870896, "learning_rate": 2.4176842105263156e-05, "loss": 0.287, "mean_copy_accuracy": 0.994917243719101, "mean_gen_accuracy": 0.8721075803041458, "mean_token_accuracy": 0.9026034325361252, "num_tokens": 365409818.0, "sample_num_tokens": 8651.5, "step": 5346, "total_num_tokens": 365444424.0, "z_loss": 0.0005976320244371891 }, { "copy_logits_max": -4.779416561126709, "copy_logits_min": -687500032.0, "copy_num_tokens": 661.5, "epoch": 1.0921113096757722, "gen_logits_max": 2.428964138031006, "gen_logits_mean": -17.380081176757812, "gen_logits_min": -29.40812110900879, "gen_logits_std": 3.040067672729492, "gen_loss": 0.2447388768196106, "grad_norm": 0.4330722245851284, "learning_rate": 2.417557894736842e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9967118054628372, "mean_gen_accuracy": 0.8769846707582474, "mean_token_accuracy": 0.9081283807754517, "num_tokens": 365680146.0, "sample_num_tokens": 9908.5, "step": 5347, "total_num_tokens": 365719780.0, "z_loss": 0.0005334694869816303 }, { "copy_logits_max": -4.482180595397949, "copy_logits_min": -750000000.0, "copy_num_tokens": 644.8125, "epoch": 1.0923155476129691, "gen_logits_max": 3.5968284606933594, "gen_logits_mean": -16.44457244873047, "gen_logits_min": -28.80018424987793, "gen_logits_std": 3.0654571056365967, "gen_loss": 0.2562181353569031, "grad_norm": 0.362377811410382, "learning_rate": 2.4174315789473685e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9966401159763336, "mean_gen_accuracy": 0.8797130584716797, "mean_token_accuracy": 0.9102464020252228, "num_tokens": 365968720.0, "sample_num_tokens": 9535.5, "step": 5348, "total_num_tokens": 366006862.0, "z_loss": 0.0005436408100649714 }, { "copy_logits_max": -5.634369373321533, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.625, "epoch": 1.092519785550166, "gen_logits_max": 3.7847094535827637, "gen_logits_mean": -16.31029510498047, "gen_logits_min": -28.597885131835938, "gen_logits_std": 3.038776159286499, "gen_loss": 0.28515928983688354, "grad_norm": 0.355214509206069, "learning_rate": 2.417305263157895e-05, "loss": 0.2635, "mean_copy_accuracy": 0.995977133512497, "mean_gen_accuracy": 0.8772099316120148, "mean_token_accuracy": 0.9090511351823807, "num_tokens": 366251190.0, "sample_num_tokens": 8916.5, "step": 5349, "total_num_tokens": 366286856.0, "z_loss": 0.0005120586138218641 }, { "copy_logits_max": -5.718571662902832, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.9375, "epoch": 1.0927240234873628, "gen_logits_max": 4.5877604484558105, "gen_logits_mean": -15.329924583435059, "gen_logits_min": -27.608352661132812, "gen_logits_std": 3.0152478218078613, "gen_loss": 0.3037291169166565, "grad_norm": 0.4695079048636407, "learning_rate": 2.4171789473684214e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9938672184944153, "mean_gen_accuracy": 0.8764803260564804, "mean_token_accuracy": 0.905989021062851, "num_tokens": 366508115.0, "sample_num_tokens": 8890.25, "step": 5350, "total_num_tokens": 366543676.0, "z_loss": 0.0005755949532613158 }, { "copy_logits_max": -5.835452556610107, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.1875, "epoch": 1.0929282614245597, "gen_logits_max": 4.1184186935424805, "gen_logits_mean": -15.422894477844238, "gen_logits_min": -27.213794708251953, "gen_logits_std": 2.9998180866241455, "gen_loss": 0.2546747922897339, "grad_norm": 0.41360714811026816, "learning_rate": 2.4170526315789474e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9955562651157379, "mean_gen_accuracy": 0.881714329123497, "mean_token_accuracy": 0.908777043223381, "num_tokens": 366803578.0, "sample_num_tokens": 8799.5, "step": 5351, "total_num_tokens": 366838776.0, "z_loss": 0.00046528410166502 }, { "copy_logits_max": -6.237222671508789, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.25, "epoch": 1.0931324993617564, "gen_logits_max": 4.103466033935547, "gen_logits_mean": -15.094205856323242, "gen_logits_min": -26.99277114868164, "gen_logits_std": 2.978579521179199, "gen_loss": 0.2867983877658844, "grad_norm": 0.41246490174576483, "learning_rate": 2.416926315789474e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9955942183732986, "mean_gen_accuracy": 0.869892343878746, "mean_token_accuracy": 0.9011338353157043, "num_tokens": 367073055.0, "sample_num_tokens": 8802.25, "step": 5352, "total_num_tokens": 367108264.0, "z_loss": 0.0005320805357769132 }, { "copy_logits_max": -7.247822284698486, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.1875, "epoch": 1.0933367372989533, "gen_logits_max": 4.227631568908691, "gen_logits_mean": -15.947566032409668, "gen_logits_min": -27.766313552856445, "gen_logits_std": 2.9743781089782715, "gen_loss": 0.25842732191085815, "grad_norm": 0.4122044903205512, "learning_rate": 2.4168e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9951339066028595, "mean_gen_accuracy": 0.8835508674383163, "mean_token_accuracy": 0.9060881733894348, "num_tokens": 367334705.0, "sample_num_tokens": 8075.25, "step": 5353, "total_num_tokens": 367367006.0, "z_loss": 0.000500817724969238 }, { "copy_logits_max": -6.417174339294434, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.9375, "epoch": 1.09354097523615, "gen_logits_max": 3.9738807678222656, "gen_logits_mean": -15.115684509277344, "gen_logits_min": -27.27606964111328, "gen_logits_std": 2.930271863937378, "gen_loss": 0.2728601098060608, "grad_norm": 0.4375451895725934, "learning_rate": 2.4166736842105264e-05, "loss": 0.2859, "mean_copy_accuracy": 0.995453804731369, "mean_gen_accuracy": 0.875194638967514, "mean_token_accuracy": 0.9034214168787003, "num_tokens": 367587578.0, "sample_num_tokens": 7994.0, "step": 5354, "total_num_tokens": 367619554.0, "z_loss": 0.0005645630299113691 }, { "copy_logits_max": -8.475793838500977, "copy_logits_min": -750000000.0, "copy_num_tokens": 273.9375, "epoch": 1.093745213173347, "gen_logits_max": 5.0709638595581055, "gen_logits_mean": -15.350859642028809, "gen_logits_min": -26.998332977294922, "gen_logits_std": 2.9374992847442627, "gen_loss": 0.3046038746833801, "grad_norm": 0.37024376826520133, "learning_rate": 2.4165473684210525e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9961380660533905, "mean_gen_accuracy": 0.8748301416635513, "mean_token_accuracy": 0.9061772525310516, "num_tokens": 367859494.0, "sample_num_tokens": 7452.5, "step": 5355, "total_num_tokens": 367889304.0, "z_loss": 0.0005953783402219415 }, { "copy_logits_max": -7.475057601928711, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.25, "epoch": 1.0939494511105439, "gen_logits_max": 2.913573980331421, "gen_logits_mean": -17.424880981445312, "gen_logits_min": -29.53976058959961, "gen_logits_std": 3.0083322525024414, "gen_loss": 0.252834677696228, "grad_norm": 0.43610822256070425, "learning_rate": 2.416421052631579e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9948816895484924, "mean_gen_accuracy": 0.8845521062612534, "mean_token_accuracy": 0.909739151597023, "num_tokens": 368130686.0, "sample_num_tokens": 10119.0, "step": 5356, "total_num_tokens": 368171162.0, "z_loss": 0.0005063577555119991 }, { "copy_logits_max": -8.655991554260254, "copy_logits_min": -687500032.0, "copy_num_tokens": 690.5625, "epoch": 1.0941536890477406, "gen_logits_max": 3.713625907897949, "gen_logits_mean": -16.08528709411621, "gen_logits_min": -27.681442260742188, "gen_logits_std": 2.9494500160217285, "gen_loss": 0.24709637463092804, "grad_norm": 0.4021389884339499, "learning_rate": 2.4162947368421054e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9967006891965866, "mean_gen_accuracy": 0.8749129623174667, "mean_token_accuracy": 0.9054045528173447, "num_tokens": 368423322.0, "sample_num_tokens": 9910.5, "step": 5357, "total_num_tokens": 368462964.0, "z_loss": 0.00048751733265817165 }, { "copy_logits_max": -6.147195816040039, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.0625, "epoch": 1.0943579269849375, "gen_logits_max": 3.8126022815704346, "gen_logits_mean": -14.532506942749023, "gen_logits_min": -26.408544540405273, "gen_logits_std": 2.906764030456543, "gen_loss": 0.24716661870479584, "grad_norm": 0.37604719968012573, "learning_rate": 2.4161684210526318e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9971864968538284, "mean_gen_accuracy": 0.8712016493082047, "mean_token_accuracy": 0.9079858064651489, "num_tokens": 368739861.0, "sample_num_tokens": 9810.25, "step": 5358, "total_num_tokens": 368779102.0, "z_loss": 0.0005471209296956658 }, { "copy_logits_max": -6.9820451736450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.375, "epoch": 1.0945621649221342, "gen_logits_max": 5.052361488342285, "gen_logits_mean": -15.088373184204102, "gen_logits_min": -26.577150344848633, "gen_logits_std": 2.8907999992370605, "gen_loss": 0.3037377595901489, "grad_norm": 0.36759528926678514, "learning_rate": 2.416042105263158e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9953557848930359, "mean_gen_accuracy": 0.8776559084653854, "mean_token_accuracy": 0.9017762988805771, "num_tokens": 369007702.0, "sample_num_tokens": 7013.0, "step": 5359, "total_num_tokens": 369035754.0, "z_loss": 0.0006146724335849285 }, { "copy_logits_max": -8.880531311035156, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.125, "epoch": 1.0947664028593311, "gen_logits_max": 4.054238319396973, "gen_logits_mean": -16.902446746826172, "gen_logits_min": -28.465694427490234, "gen_logits_std": 2.956843376159668, "gen_loss": 0.250180721282959, "grad_norm": 0.3895897419248987, "learning_rate": 2.4159157894736843e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9955065250396729, "mean_gen_accuracy": 0.8819316327571869, "mean_token_accuracy": 0.9065987765789032, "num_tokens": 369245332.0, "sample_num_tokens": 6713.0, "step": 5360, "total_num_tokens": 369272184.0, "z_loss": 0.0005222902982495725 }, { "copy_logits_max": -7.372314453125, "copy_logits_min": -750000064.0, "copy_num_tokens": 459.5625, "epoch": 1.0949706407965278, "gen_logits_max": 4.1335368156433105, "gen_logits_mean": -15.863859176635742, "gen_logits_min": -27.64150619506836, "gen_logits_std": 2.927485227584839, "gen_loss": 0.3015335202217102, "grad_norm": 0.4422183452025431, "learning_rate": 2.4157894736842104e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9963473826646805, "mean_gen_accuracy": 0.8695962727069855, "mean_token_accuracy": 0.9004461020231247, "num_tokens": 369505683.0, "sample_num_tokens": 8487.75, "step": 5361, "total_num_tokens": 369539634.0, "z_loss": 0.0006016880506649613 }, { "copy_logits_max": -9.306114196777344, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.9375, "epoch": 1.0951748787337248, "gen_logits_max": 4.6200666427612305, "gen_logits_mean": -15.484139442443848, "gen_logits_min": -26.626346588134766, "gen_logits_std": 2.869657039642334, "gen_loss": 0.27951937913894653, "grad_norm": 0.39756064213947895, "learning_rate": 2.415663157894737e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9954654574394226, "mean_gen_accuracy": 0.8820609003305435, "mean_token_accuracy": 0.9047891199588776, "num_tokens": 369751450.0, "sample_num_tokens": 7493.0, "step": 5362, "total_num_tokens": 369781422.0, "z_loss": 0.0006068680086173117 }, { "copy_logits_max": -5.658594131469727, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.0625, "epoch": 1.0953791166709217, "gen_logits_max": 3.888120651245117, "gen_logits_mean": -16.275388717651367, "gen_logits_min": -27.77212905883789, "gen_logits_std": 2.9208478927612305, "gen_loss": 0.25600144267082214, "grad_norm": 0.38289668709448293, "learning_rate": 2.4155368421052633e-05, "loss": 0.2949, "mean_copy_accuracy": 0.995903417468071, "mean_gen_accuracy": 0.8764137923717499, "mean_token_accuracy": 0.9031053781509399, "num_tokens": 370018850.0, "sample_num_tokens": 8264.5, "step": 5363, "total_num_tokens": 370051908.0, "z_loss": 0.0006667438428848982 }, { "copy_logits_max": -6.055006980895996, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.875, "epoch": 1.0955833546081184, "gen_logits_max": 3.6794087886810303, "gen_logits_mean": -17.088510513305664, "gen_logits_min": -28.89517593383789, "gen_logits_std": 2.982652187347412, "gen_loss": 0.2733094096183777, "grad_norm": 0.4393872136595132, "learning_rate": 2.4154105263157894e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9966824352741241, "mean_gen_accuracy": 0.8729207217693329, "mean_token_accuracy": 0.904596671462059, "num_tokens": 370294473.0, "sample_num_tokens": 8628.75, "step": 5364, "total_num_tokens": 370328988.0, "z_loss": 0.0006422800943255424 }, { "copy_logits_max": -8.458745956420898, "copy_logits_min": -750000000.0, "copy_num_tokens": 307.125, "epoch": 1.0957875925453153, "gen_logits_max": 5.216407775878906, "gen_logits_mean": -14.381009101867676, "gen_logits_min": -26.53769302368164, "gen_logits_std": 2.9029788970947266, "gen_loss": 0.3123578429222107, "grad_norm": 0.3914844757152304, "learning_rate": 2.415284210526316e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9956334382295609, "mean_gen_accuracy": 0.8731761872768402, "mean_token_accuracy": 0.9020074605941772, "num_tokens": 370565116.0, "sample_num_tokens": 7110.0, "step": 5365, "total_num_tokens": 370593556.0, "z_loss": 0.000676266907248646 }, { "copy_logits_max": -8.970291137695312, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.6875, "epoch": 1.095991830482512, "gen_logits_max": 4.997981071472168, "gen_logits_mean": -13.901351928710938, "gen_logits_min": -25.213245391845703, "gen_logits_std": 2.8423357009887695, "gen_loss": 0.2590099275112152, "grad_norm": 0.3883567503919804, "learning_rate": 2.4151578947368422e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9947267323732376, "mean_gen_accuracy": 0.883753314614296, "mean_token_accuracy": 0.9059715867042542, "num_tokens": 370819713.0, "sample_num_tokens": 8175.75, "step": 5366, "total_num_tokens": 370852416.0, "z_loss": 0.0005575338145717978 }, { "copy_logits_max": -7.824493408203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.25, "epoch": 1.096196068419709, "gen_logits_max": 5.020101547241211, "gen_logits_mean": -14.35289192199707, "gen_logits_min": -25.527416229248047, "gen_logits_std": 2.850719928741455, "gen_loss": 0.3314315676689148, "grad_norm": 0.34459494530869383, "learning_rate": 2.4150315789473686e-05, "loss": 0.3006, "mean_copy_accuracy": 0.9963274449110031, "mean_gen_accuracy": 0.8664007931947708, "mean_token_accuracy": 0.8992048799991608, "num_tokens": 371106065.0, "sample_num_tokens": 8285.75, "step": 5367, "total_num_tokens": 371139208.0, "z_loss": 0.0006408318877220154 }, { "copy_logits_max": -7.376196384429932, "copy_logits_min": -687500032.0, "copy_num_tokens": 690.0, "epoch": 1.0964003063569059, "gen_logits_max": 3.471320390701294, "gen_logits_mean": -15.250068664550781, "gen_logits_min": -27.155242919921875, "gen_logits_std": 2.9397993087768555, "gen_loss": 0.261130690574646, "grad_norm": 0.38428001876784557, "learning_rate": 2.4149052631578947e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9964949637651443, "mean_gen_accuracy": 0.8803346306085587, "mean_token_accuracy": 0.9116523712873459, "num_tokens": 371382215.0, "sample_num_tokens": 9787.75, "step": 5368, "total_num_tokens": 371421366.0, "z_loss": 0.0005399159854277968 }, { "copy_logits_max": -8.306589126586914, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.8125, "epoch": 1.0966045442941026, "gen_logits_max": 3.475165843963623, "gen_logits_mean": -16.362873077392578, "gen_logits_min": -27.92891502380371, "gen_logits_std": 2.949793815612793, "gen_loss": 0.2655755281448364, "grad_norm": 0.41649604377382443, "learning_rate": 2.4147789473684212e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9962345510721207, "mean_gen_accuracy": 0.8763899058103561, "mean_token_accuracy": 0.9060395509004593, "num_tokens": 371669380.0, "sample_num_tokens": 9219.0, "step": 5369, "total_num_tokens": 371706256.0, "z_loss": 0.0005198415601626039 }, { "copy_logits_max": -7.536770343780518, "copy_logits_min": -750000000.0, "copy_num_tokens": 836.0, "epoch": 1.0968087822312995, "gen_logits_max": 3.403995990753174, "gen_logits_mean": -15.12336254119873, "gen_logits_min": -27.630599975585938, "gen_logits_std": 2.938999652862549, "gen_loss": 0.20022693276405334, "grad_norm": 0.37005441359920344, "learning_rate": 2.4146526315789473e-05, "loss": 0.2632, "mean_copy_accuracy": 0.996926486492157, "mean_gen_accuracy": 0.8784646838903427, "mean_token_accuracy": 0.910498782992363, "num_tokens": 371951401.0, "sample_num_tokens": 10724.25, "step": 5370, "total_num_tokens": 371994298.0, "z_loss": 0.00041901282384060323 }, { "copy_logits_max": -8.222373962402344, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.8125, "epoch": 1.0970130201684962, "gen_logits_max": 4.546877861022949, "gen_logits_mean": -15.729181289672852, "gen_logits_min": -27.256946563720703, "gen_logits_std": 2.9146125316619873, "gen_loss": 0.2965713143348694, "grad_norm": 0.36773087690637535, "learning_rate": 2.4145263157894737e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9962358176708221, "mean_gen_accuracy": 0.8820540904998779, "mean_token_accuracy": 0.9077546894550323, "num_tokens": 372231333.0, "sample_num_tokens": 7806.75, "step": 5371, "total_num_tokens": 372262560.0, "z_loss": 0.0006040059961378574 }, { "copy_logits_max": -7.13246488571167, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.3125, "epoch": 1.0972172581056931, "gen_logits_max": 4.373376369476318, "gen_logits_mean": -17.13948631286621, "gen_logits_min": -28.520051956176758, "gen_logits_std": 2.9222381114959717, "gen_loss": 0.32402729988098145, "grad_norm": 0.3884184442628968, "learning_rate": 2.4143999999999998e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9969374090433121, "mean_gen_accuracy": 0.8726934492588043, "mean_token_accuracy": 0.9058702439069748, "num_tokens": 372515976.0, "sample_num_tokens": 7115.5, "step": 5372, "total_num_tokens": 372544438.0, "z_loss": 0.0006911669042892754 }, { "copy_logits_max": -8.48674201965332, "copy_logits_min": -687500032.0, "copy_num_tokens": 271.8125, "epoch": 1.09742149604289, "gen_logits_max": 4.948639869689941, "gen_logits_mean": -15.673316955566406, "gen_logits_min": -27.05839729309082, "gen_logits_std": 2.8997678756713867, "gen_loss": 0.2766951322555542, "grad_norm": 0.4108602836253615, "learning_rate": 2.4142736842105266e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9945192635059357, "mean_gen_accuracy": 0.8841557502746582, "mean_token_accuracy": 0.906064584851265, "num_tokens": 372769313.0, "sample_num_tokens": 6665.25, "step": 5373, "total_num_tokens": 372795974.0, "z_loss": 0.0006396555108949542 }, { "copy_logits_max": -6.699029922485352, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.1875, "epoch": 1.0976257339800868, "gen_logits_max": 4.046206474304199, "gen_logits_mean": -14.513561248779297, "gen_logits_min": -26.698097229003906, "gen_logits_std": 2.8899192810058594, "gen_loss": 0.24128073453903198, "grad_norm": 0.4093796600730806, "learning_rate": 2.4141473684210526e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9946276098489761, "mean_gen_accuracy": 0.8840424120426178, "mean_token_accuracy": 0.9070234298706055, "num_tokens": 373027498.0, "sample_num_tokens": 8626.5, "step": 5374, "total_num_tokens": 373062004.0, "z_loss": 0.0005790782161056995 }, { "copy_logits_max": -5.938930511474609, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.4375, "epoch": 1.0978299719172837, "gen_logits_max": 3.924769878387451, "gen_logits_mean": -14.995481491088867, "gen_logits_min": -27.241954803466797, "gen_logits_std": 2.905748128890991, "gen_loss": 0.2749701738357544, "grad_norm": 0.3551018452438143, "learning_rate": 2.414021052631579e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9964376240968704, "mean_gen_accuracy": 0.8686622530221939, "mean_token_accuracy": 0.904052346944809, "num_tokens": 373320040.0, "sample_num_tokens": 8734.0, "step": 5375, "total_num_tokens": 373354976.0, "z_loss": 0.0006356650264933705 }, { "copy_logits_max": -7.088958740234375, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.4375, "epoch": 1.0980342098544804, "gen_logits_max": 3.942046642303467, "gen_logits_mean": -16.243976593017578, "gen_logits_min": -27.913944244384766, "gen_logits_std": 2.9216816425323486, "gen_loss": 0.2895456552505493, "grad_norm": 0.4194218586775535, "learning_rate": 2.4138947368421055e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9954892545938492, "mean_gen_accuracy": 0.8681624680757523, "mean_token_accuracy": 0.8981353789567947, "num_tokens": 373587408.0, "sample_num_tokens": 9486.0, "step": 5376, "total_num_tokens": 373625352.0, "z_loss": 0.000614753516856581 }, { "copy_logits_max": -8.763233184814453, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.125, "epoch": 1.0982384477916773, "gen_logits_max": 5.112130165100098, "gen_logits_mean": -15.129749298095703, "gen_logits_min": -27.057037353515625, "gen_logits_std": 2.9521098136901855, "gen_loss": 0.2401195466518402, "grad_norm": 0.406895233446963, "learning_rate": 2.4137684210526316e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9957172274589539, "mean_gen_accuracy": 0.8747829794883728, "mean_token_accuracy": 0.905051976442337, "num_tokens": 373857647.0, "sample_num_tokens": 8536.25, "step": 5377, "total_num_tokens": 373891792.0, "z_loss": 0.0004906467511318624 }, { "copy_logits_max": -8.812443733215332, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.6875, "epoch": 1.098442685728874, "gen_logits_max": 4.947447776794434, "gen_logits_mean": -13.498638153076172, "gen_logits_min": -25.450706481933594, "gen_logits_std": 2.9071109294891357, "gen_loss": 0.2478361427783966, "grad_norm": 0.39478616382659404, "learning_rate": 2.413642105263158e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9955816268920898, "mean_gen_accuracy": 0.8696934431791306, "mean_token_accuracy": 0.9003562927246094, "num_tokens": 374119022.0, "sample_num_tokens": 8748.0, "step": 5378, "total_num_tokens": 374154014.0, "z_loss": 0.00044917414197698236 }, { "copy_logits_max": -7.998293876647949, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.0625, "epoch": 1.098646923666071, "gen_logits_max": 4.392823696136475, "gen_logits_mean": -14.97848892211914, "gen_logits_min": -26.83395004272461, "gen_logits_std": 2.893639087677002, "gen_loss": 0.2990366518497467, "grad_norm": 0.3372108039016244, "learning_rate": 2.413515789473684e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9964001774787903, "mean_gen_accuracy": 0.8806106001138687, "mean_token_accuracy": 0.9105305671691895, "num_tokens": 374413813.0, "sample_num_tokens": 8577.75, "step": 5379, "total_num_tokens": 374448124.0, "z_loss": 0.0005467536393553019 }, { "copy_logits_max": -5.653228282928467, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.9375, "epoch": 1.098851161603268, "gen_logits_max": 3.8264060020446777, "gen_logits_mean": -15.133209228515625, "gen_logits_min": -26.967687606811523, "gen_logits_std": 2.9342429637908936, "gen_loss": 0.28469932079315186, "grad_norm": 0.3869482208884405, "learning_rate": 2.4133894736842106e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9961602091789246, "mean_gen_accuracy": 0.8768520653247833, "mean_token_accuracy": 0.9069322943687439, "num_tokens": 374667325.0, "sample_num_tokens": 8903.25, "step": 5380, "total_num_tokens": 374702938.0, "z_loss": 0.0005895133363083005 }, { "copy_logits_max": -8.005979537963867, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.6875, "epoch": 1.0990553995404646, "gen_logits_max": 5.2941508293151855, "gen_logits_mean": -15.289331436157227, "gen_logits_min": -27.05817413330078, "gen_logits_std": 2.9136526584625244, "gen_loss": 0.2780425548553467, "grad_norm": 0.3953144117888851, "learning_rate": 2.413263157894737e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9951336234807968, "mean_gen_accuracy": 0.8704134225845337, "mean_token_accuracy": 0.8991432189941406, "num_tokens": 374943631.0, "sample_num_tokens": 7813.75, "step": 5381, "total_num_tokens": 374974886.0, "z_loss": 0.0005838765646331012 }, { "copy_logits_max": -7.334099292755127, "copy_logits_min": -687500032.0, "copy_num_tokens": 345.125, "epoch": 1.0992596374776615, "gen_logits_max": 5.030529022216797, "gen_logits_mean": -15.051198959350586, "gen_logits_min": -26.71247100830078, "gen_logits_std": 2.970592737197876, "gen_loss": 0.32338017225265503, "grad_norm": 0.36335252671858004, "learning_rate": 2.4131368421052634e-05, "loss": 0.2571, "mean_copy_accuracy": 0.996417373418808, "mean_gen_accuracy": 0.8867823332548141, "mean_token_accuracy": 0.9158349931240082, "num_tokens": 375226418.0, "sample_num_tokens": 7925.0, "step": 5382, "total_num_tokens": 375258118.0, "z_loss": 0.0005793036543764174 }, { "copy_logits_max": -6.9232177734375, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.875, "epoch": 1.0994638754148582, "gen_logits_max": 4.541165828704834, "gen_logits_mean": -15.61543083190918, "gen_logits_min": -27.52090072631836, "gen_logits_std": 2.9751510620117188, "gen_loss": 0.29840683937072754, "grad_norm": 0.41272836155664266, "learning_rate": 2.4130105263157895e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9955070167779922, "mean_gen_accuracy": 0.8752146363258362, "mean_token_accuracy": 0.9027790129184723, "num_tokens": 375488663.0, "sample_num_tokens": 9213.25, "step": 5383, "total_num_tokens": 375525516.0, "z_loss": 0.0005706457886844873 }, { "copy_logits_max": -7.277605056762695, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.375, "epoch": 1.0996681133520552, "gen_logits_max": 4.729488372802734, "gen_logits_mean": -14.845914840698242, "gen_logits_min": -26.752023696899414, "gen_logits_std": 2.9414539337158203, "gen_loss": 0.2828798294067383, "grad_norm": 0.4201262597737564, "learning_rate": 2.412884210526316e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9948611110448837, "mean_gen_accuracy": 0.8712044656276703, "mean_token_accuracy": 0.8972630202770233, "num_tokens": 375743380.0, "sample_num_tokens": 8568.0, "step": 5384, "total_num_tokens": 375777652.0, "z_loss": 0.0004999961238354445 }, { "copy_logits_max": -6.579753875732422, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.25, "epoch": 1.0998723512892519, "gen_logits_max": 4.403009414672852, "gen_logits_mean": -15.310481071472168, "gen_logits_min": -26.683948516845703, "gen_logits_std": 2.902010202407837, "gen_loss": 0.29285454750061035, "grad_norm": 0.3761360171669434, "learning_rate": 2.412757894736842e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9957356154918671, "mean_gen_accuracy": 0.8735253661870956, "mean_token_accuracy": 0.9078378677368164, "num_tokens": 376044850.0, "sample_num_tokens": 10647.5, "step": 5385, "total_num_tokens": 376087440.0, "z_loss": 0.0005214432021602988 }, { "copy_logits_max": -4.237709045410156, "copy_logits_min": -750000064.0, "copy_num_tokens": 749.375, "epoch": 1.1000765892264488, "gen_logits_max": 4.365505218505859, "gen_logits_mean": -14.149322509765625, "gen_logits_min": -26.36590576171875, "gen_logits_std": 2.9453327655792236, "gen_loss": 0.2510222792625427, "grad_norm": 0.37714510769696047, "learning_rate": 2.4126315789473685e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9959448277950287, "mean_gen_accuracy": 0.8721372187137604, "mean_token_accuracy": 0.9046179056167603, "num_tokens": 376331633.0, "sample_num_tokens": 9798.75, "step": 5386, "total_num_tokens": 376370828.0, "z_loss": 0.0005681098555214703 }, { "copy_logits_max": -6.433525085449219, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.5, "epoch": 1.1002808271636457, "gen_logits_max": 3.594791889190674, "gen_logits_mean": -16.551311492919922, "gen_logits_min": -28.262035369873047, "gen_logits_std": 2.947988986968994, "gen_loss": 0.2605046033859253, "grad_norm": 0.42116493433422186, "learning_rate": 2.4125052631578946e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9974043816328049, "mean_gen_accuracy": 0.8768219500780106, "mean_token_accuracy": 0.9075079411268234, "num_tokens": 376605578.0, "sample_num_tokens": 9667.0, "step": 5387, "total_num_tokens": 376644246.0, "z_loss": 0.0004994487389922142 }, { "copy_logits_max": -7.659199237823486, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.5625, "epoch": 1.1004850651008424, "gen_logits_max": 5.386899471282959, "gen_logits_mean": -14.322259902954102, "gen_logits_min": -25.76202392578125, "gen_logits_std": 2.887411594390869, "gen_loss": 0.2638193368911743, "grad_norm": 0.3738980015205827, "learning_rate": 2.412378947368421e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9967959076166153, "mean_gen_accuracy": 0.8807244747877121, "mean_token_accuracy": 0.9104628711938858, "num_tokens": 376885670.0, "sample_num_tokens": 8021.5, "step": 5388, "total_num_tokens": 376917756.0, "z_loss": 0.0005098446854390204 }, { "copy_logits_max": -6.664769649505615, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.5, "epoch": 1.1006893030380394, "gen_logits_max": 5.419026851654053, "gen_logits_mean": -13.873542785644531, "gen_logits_min": -25.71375846862793, "gen_logits_std": 2.875117540359497, "gen_loss": 0.2901912331581116, "grad_norm": 0.469684680062081, "learning_rate": 2.4122526315789474e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9950534701347351, "mean_gen_accuracy": 0.8708653599023819, "mean_token_accuracy": 0.9016187489032745, "num_tokens": 377140563.0, "sample_num_tokens": 8003.75, "step": 5389, "total_num_tokens": 377172578.0, "z_loss": 0.0005831895978190005 }, { "copy_logits_max": -7.4510602951049805, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.6875, "epoch": 1.100893540975236, "gen_logits_max": 4.702836990356445, "gen_logits_mean": -15.197654724121094, "gen_logits_min": -26.902915954589844, "gen_logits_std": 2.908859968185425, "gen_loss": 0.27409306168556213, "grad_norm": 0.4110493793069047, "learning_rate": 2.412126315789474e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9958623349666595, "mean_gen_accuracy": 0.8714250475168228, "mean_token_accuracy": 0.8994067162275314, "num_tokens": 377399176.0, "sample_num_tokens": 9054.5, "step": 5390, "total_num_tokens": 377435394.0, "z_loss": 0.0005502753192558885 }, { "copy_logits_max": -6.919709205627441, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.375, "epoch": 1.101097778912433, "gen_logits_max": 5.101451873779297, "gen_logits_mean": -15.200496673583984, "gen_logits_min": -26.510421752929688, "gen_logits_std": 2.917959213256836, "gen_loss": 0.3190852105617523, "grad_norm": 0.39683209081063636, "learning_rate": 2.4120000000000003e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9957642555236816, "mean_gen_accuracy": 0.8806484937667847, "mean_token_accuracy": 0.9067550301551819, "num_tokens": 377666207.0, "sample_num_tokens": 7759.75, "step": 5391, "total_num_tokens": 377697246.0, "z_loss": 0.0005943225114606321 }, { "copy_logits_max": -6.103577613830566, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.1875, "epoch": 1.10130201684963, "gen_logits_max": 4.277382850646973, "gen_logits_mean": -15.818135261535645, "gen_logits_min": -27.568946838378906, "gen_logits_std": 2.9228477478027344, "gen_loss": 0.3103046119213104, "grad_norm": 0.4633316884130052, "learning_rate": 2.4118736842105264e-05, "loss": 0.3149, "mean_copy_accuracy": 0.9943728148937225, "mean_gen_accuracy": 0.8681005388498306, "mean_token_accuracy": 0.8942814320325851, "num_tokens": 377913506.0, "sample_num_tokens": 7479.5, "step": 5392, "total_num_tokens": 377943424.0, "z_loss": 0.0006093829870223999 }, { "copy_logits_max": -8.295546531677246, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.625, "epoch": 1.1015062547868266, "gen_logits_max": 3.825732469558716, "gen_logits_mean": -16.94451332092285, "gen_logits_min": -27.990440368652344, "gen_logits_std": 2.9167070388793945, "gen_loss": 0.24833393096923828, "grad_norm": 0.38963491483245083, "learning_rate": 2.4117473684210528e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9968888163566589, "mean_gen_accuracy": 0.8799676299095154, "mean_token_accuracy": 0.9097873568534851, "num_tokens": 378185376.0, "sample_num_tokens": 7978.5, "step": 5393, "total_num_tokens": 378217290.0, "z_loss": 0.00047203354188241065 }, { "copy_logits_max": -5.382668495178223, "copy_logits_min": -625000064.0, "copy_num_tokens": 677.0625, "epoch": 1.1017104927240235, "gen_logits_max": 3.4120473861694336, "gen_logits_mean": -16.05990982055664, "gen_logits_min": -27.932781219482422, "gen_logits_std": 2.9569411277770996, "gen_loss": 0.25235041975975037, "grad_norm": 0.41128661148880286, "learning_rate": 2.411621052631579e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9954522103071213, "mean_gen_accuracy": 0.8730003982782364, "mean_token_accuracy": 0.9045466631650925, "num_tokens": 378443803.0, "sample_num_tokens": 9876.25, "step": 5394, "total_num_tokens": 378483308.0, "z_loss": 0.0005266726948320866 }, { "copy_logits_max": -6.070775032043457, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.4375, "epoch": 1.1019147306612203, "gen_logits_max": 3.381057024002075, "gen_logits_mean": -16.572851181030273, "gen_logits_min": -28.195520401000977, "gen_logits_std": 2.9539084434509277, "gen_loss": 0.2792973816394806, "grad_norm": 0.3589125212373744, "learning_rate": 2.4114947368421053e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9964134842157364, "mean_gen_accuracy": 0.8711753785610199, "mean_token_accuracy": 0.9038909822702408, "num_tokens": 378723248.0, "sample_num_tokens": 8535.0, "step": 5395, "total_num_tokens": 378757388.0, "z_loss": 0.0004837091837543994 }, { "copy_logits_max": -9.330663681030273, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.875, "epoch": 1.1021189685984172, "gen_logits_max": 4.512544631958008, "gen_logits_mean": -14.44375228881836, "gen_logits_min": -25.75395965576172, "gen_logits_std": 2.8376455307006836, "gen_loss": 0.2683384120464325, "grad_norm": 0.41889130238555977, "learning_rate": 2.4113684210526314e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9960424900054932, "mean_gen_accuracy": 0.8725584596395493, "mean_token_accuracy": 0.9013992547988892, "num_tokens": 378989652.0, "sample_num_tokens": 8048.5, "step": 5396, "total_num_tokens": 379021846.0, "z_loss": 0.00043927639489993453 }, { "copy_logits_max": -7.946674823760986, "copy_logits_min": -625000064.0, "copy_num_tokens": 602.1875, "epoch": 1.1023232065356139, "gen_logits_max": 4.360957622528076, "gen_logits_mean": -14.80263900756836, "gen_logits_min": -26.229259490966797, "gen_logits_std": 2.871380090713501, "gen_loss": 0.2929045855998993, "grad_norm": 0.3893039567835573, "learning_rate": 2.4112421052631582e-05, "loss": 0.2984, "mean_copy_accuracy": 0.994553342461586, "mean_gen_accuracy": 0.8687284886837006, "mean_token_accuracy": 0.8996845781803131, "num_tokens": 379260335.0, "sample_num_tokens": 9222.25, "step": 5397, "total_num_tokens": 379297224.0, "z_loss": 0.0005112950457260013 }, { "copy_logits_max": -5.985957145690918, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.0625, "epoch": 1.1025274444728108, "gen_logits_max": 3.8464200496673584, "gen_logits_mean": -15.473270416259766, "gen_logits_min": -27.406436920166016, "gen_logits_std": 2.91945743560791, "gen_loss": 0.28546142578125, "grad_norm": 0.4053449496560321, "learning_rate": 2.4111157894736843e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9959808439016342, "mean_gen_accuracy": 0.8742291480302811, "mean_token_accuracy": 0.9077799171209335, "num_tokens": 379510840.0, "sample_num_tokens": 8315.0, "step": 5398, "total_num_tokens": 379544100.0, "z_loss": 0.0005279765464365482 }, { "copy_logits_max": -7.895381450653076, "copy_logits_min": -687500032.0, "copy_num_tokens": 643.3125, "epoch": 1.1027316824100077, "gen_logits_max": 4.0046844482421875, "gen_logits_mean": -16.002988815307617, "gen_logits_min": -28.089929580688477, "gen_logits_std": 3.0145480632781982, "gen_loss": 0.28721845149993896, "grad_norm": 0.41644652637386287, "learning_rate": 2.4109894736842107e-05, "loss": 0.2969, "mean_copy_accuracy": 0.9957002252340317, "mean_gen_accuracy": 0.8706828951835632, "mean_token_accuracy": 0.8998425751924515, "num_tokens": 379785277.0, "sample_num_tokens": 9601.25, "step": 5399, "total_num_tokens": 379823682.0, "z_loss": 0.0005060655530542135 }, { "copy_logits_max": -8.413002967834473, "copy_logits_min": -687500032.0, "copy_num_tokens": 748.5625, "epoch": 1.1029359203472044, "gen_logits_max": 3.739345073699951, "gen_logits_mean": -15.289787292480469, "gen_logits_min": -26.704437255859375, "gen_logits_std": 2.8937487602233887, "gen_loss": 0.24458849430084229, "grad_norm": 0.352556471341027, "learning_rate": 2.4108631578947368e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9966395497322083, "mean_gen_accuracy": 0.8830868750810623, "mean_token_accuracy": 0.9117550104856491, "num_tokens": 380068442.0, "sample_num_tokens": 10680.0, "step": 5400, "total_num_tokens": 380111162.0, "z_loss": 0.0004395873984321952 }, { "copy_logits_max": -5.303681373596191, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.75, "epoch": 1.1031401582844014, "gen_logits_max": 3.8825297355651855, "gen_logits_mean": -14.353904724121094, "gen_logits_min": -26.32679557800293, "gen_logits_std": 2.9030916690826416, "gen_loss": 0.2739748954772949, "grad_norm": 0.35638391582323625, "learning_rate": 2.4107368421052632e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9968002140522003, "mean_gen_accuracy": 0.8714186400175095, "mean_token_accuracy": 0.9053565561771393, "num_tokens": 380370474.0, "sample_num_tokens": 9371.5, "step": 5401, "total_num_tokens": 380407960.0, "z_loss": 0.0005026299622841179 }, { "copy_logits_max": -7.253912925720215, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.375, "epoch": 1.103344396221598, "gen_logits_max": 4.314070701599121, "gen_logits_mean": -14.752979278564453, "gen_logits_min": -26.503812789916992, "gen_logits_std": 2.9147140979766846, "gen_loss": 0.27173230051994324, "grad_norm": 0.3599743752804551, "learning_rate": 2.4106105263157893e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9955178052186966, "mean_gen_accuracy": 0.8736512064933777, "mean_token_accuracy": 0.9072851091623306, "num_tokens": 380653912.0, "sample_num_tokens": 8982.0, "step": 5402, "total_num_tokens": 380689840.0, "z_loss": 0.0005289120017550886 }, { "copy_logits_max": -9.080976486206055, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.125, "epoch": 1.103548634158795, "gen_logits_max": 4.320568084716797, "gen_logits_mean": -16.068431854248047, "gen_logits_min": -27.98606300354004, "gen_logits_std": 2.9484786987304688, "gen_loss": 0.2850315570831299, "grad_norm": 0.3974663000237402, "learning_rate": 2.4104842105263158e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9957097470760345, "mean_gen_accuracy": 0.8790333122014999, "mean_token_accuracy": 0.907074823975563, "num_tokens": 380917560.0, "sample_num_tokens": 8230.5, "step": 5403, "total_num_tokens": 380950482.0, "z_loss": 0.0005565754254348576 }, { "copy_logits_max": -8.773967742919922, "copy_logits_min": -625000064.0, "copy_num_tokens": 264.125, "epoch": 1.103752872095992, "gen_logits_max": 5.047712802886963, "gen_logits_mean": -15.462448120117188, "gen_logits_min": -27.43804168701172, "gen_logits_std": 2.912621021270752, "gen_loss": 0.27010828256607056, "grad_norm": 0.3918513288855019, "learning_rate": 2.4103578947368422e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9956487566232681, "mean_gen_accuracy": 0.8851583003997803, "mean_token_accuracy": 0.9083837866783142, "num_tokens": 381180105.0, "sample_num_tokens": 6919.25, "step": 5404, "total_num_tokens": 381207782.0, "z_loss": 0.0005235569551587105 }, { "copy_logits_max": -8.123960494995117, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.8125, "epoch": 1.1039571100331886, "gen_logits_max": 4.335309982299805, "gen_logits_mean": -15.301971435546875, "gen_logits_min": -27.138872146606445, "gen_logits_std": 2.861578941345215, "gen_loss": 0.2728310823440552, "grad_norm": 0.4006247172156206, "learning_rate": 2.4102315789473683e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9960402548313141, "mean_gen_accuracy": 0.8685902953147888, "mean_token_accuracy": 0.8981391489505768, "num_tokens": 381439403.0, "sample_num_tokens": 7684.75, "step": 5405, "total_num_tokens": 381470142.0, "z_loss": 0.0005221006576903164 }, { "copy_logits_max": -7.673043727874756, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.875, "epoch": 1.1041613479703856, "gen_logits_max": 3.3789687156677246, "gen_logits_mean": -15.651700973510742, "gen_logits_min": -27.132755279541016, "gen_logits_std": 2.853065252304077, "gen_loss": 0.26032859086990356, "grad_norm": 0.41069125405883633, "learning_rate": 2.410105263157895e-05, "loss": 0.2882, "mean_copy_accuracy": 0.996391624212265, "mean_gen_accuracy": 0.8753941059112549, "mean_token_accuracy": 0.9025351405143738, "num_tokens": 381678008.0, "sample_num_tokens": 7718.0, "step": 5406, "total_num_tokens": 381708880.0, "z_loss": 0.0005105126765556633 }, { "copy_logits_max": -9.24156379699707, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.0, "epoch": 1.1043655859075823, "gen_logits_max": 4.218578338623047, "gen_logits_mean": -15.7789945602417, "gen_logits_min": -27.652301788330078, "gen_logits_std": 2.896571636199951, "gen_loss": 0.2726060152053833, "grad_norm": 0.3976768014744557, "learning_rate": 2.409978947368421e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9968531131744385, "mean_gen_accuracy": 0.8744662255048752, "mean_token_accuracy": 0.9021790474653244, "num_tokens": 381936675.0, "sample_num_tokens": 9612.75, "step": 5407, "total_num_tokens": 381975126.0, "z_loss": 0.0005843940889462829 }, { "copy_logits_max": -6.802520275115967, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0, "epoch": 1.1045698238447792, "gen_logits_max": 3.610435962677002, "gen_logits_mean": -16.62198257446289, "gen_logits_min": -29.06264877319336, "gen_logits_std": 2.969089984893799, "gen_loss": 0.24618984758853912, "grad_norm": 0.39699286223654484, "learning_rate": 2.4098526315789476e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9959661811590195, "mean_gen_accuracy": 0.8814245313405991, "mean_token_accuracy": 0.9090321809053421, "num_tokens": 382214820.0, "sample_num_tokens": 8672.0, "step": 5408, "total_num_tokens": 382249508.0, "z_loss": 0.0005638280999846756 }, { "copy_logits_max": -7.742129325866699, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.125, "epoch": 1.104774061781976, "gen_logits_max": 4.613208770751953, "gen_logits_mean": -15.91241455078125, "gen_logits_min": -27.43730926513672, "gen_logits_std": 2.884711980819702, "gen_loss": 0.30946484208106995, "grad_norm": 0.3842459028368028, "learning_rate": 2.4097263157894737e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9970160722732544, "mean_gen_accuracy": 0.8748155981302261, "mean_token_accuracy": 0.904453694820404, "num_tokens": 382488038.0, "sample_num_tokens": 7605.5, "step": 5409, "total_num_tokens": 382518460.0, "z_loss": 0.0006051139207556844 }, { "copy_logits_max": -7.807348251342773, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.6875, "epoch": 1.1049782997191728, "gen_logits_max": 3.4320688247680664, "gen_logits_mean": -17.192386627197266, "gen_logits_min": -29.06417465209961, "gen_logits_std": 2.939173698425293, "gen_loss": 0.2607973515987396, "grad_norm": 0.39546831140391697, "learning_rate": 2.4096e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9964345842599869, "mean_gen_accuracy": 0.8831136971712112, "mean_token_accuracy": 0.9082051366567612, "num_tokens": 382762888.0, "sample_num_tokens": 7861.5, "step": 5410, "total_num_tokens": 382794334.0, "z_loss": 0.0005416216445155442 }, { "copy_logits_max": -8.67138671875, "copy_logits_min": -750000064.0, "copy_num_tokens": 579.5, "epoch": 1.1051825376563698, "gen_logits_max": 4.6742095947265625, "gen_logits_mean": -14.904518127441406, "gen_logits_min": -26.90598487854004, "gen_logits_std": 2.898038864135742, "gen_loss": 0.24899382889270782, "grad_norm": 0.390318609962422, "learning_rate": 2.4094736842105262e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9960346519947052, "mean_gen_accuracy": 0.8728462010622025, "mean_token_accuracy": 0.9050780385732651, "num_tokens": 383022501.0, "sample_num_tokens": 8546.75, "step": 5411, "total_num_tokens": 383056688.0, "z_loss": 0.0005078722024336457 }, { "copy_logits_max": -8.280952453613281, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.5, "epoch": 1.1053867755935665, "gen_logits_max": 4.860537052154541, "gen_logits_mean": -13.813339233398438, "gen_logits_min": -26.006711959838867, "gen_logits_std": 2.832976818084717, "gen_loss": 0.27289557456970215, "grad_norm": 0.3974224135509943, "learning_rate": 2.4093473684210526e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9973517805337906, "mean_gen_accuracy": 0.8682630658149719, "mean_token_accuracy": 0.9044679552316666, "num_tokens": 383300769.0, "sample_num_tokens": 8622.25, "step": 5412, "total_num_tokens": 383335258.0, "z_loss": 0.0005237857112661004 }, { "copy_logits_max": -7.660324573516846, "copy_logits_min": -750000064.0, "copy_num_tokens": 308.6875, "epoch": 1.1055910135307634, "gen_logits_max": 4.613270282745361, "gen_logits_mean": -16.023651123046875, "gen_logits_min": -28.047061920166016, "gen_logits_std": 2.9564638137817383, "gen_loss": 0.30796873569488525, "grad_norm": 0.38503420338079536, "learning_rate": 2.4092210526315787e-05, "loss": 0.272, "mean_copy_accuracy": 0.9962310642004013, "mean_gen_accuracy": 0.8759226351976395, "mean_token_accuracy": 0.9079644978046417, "num_tokens": 383580057.0, "sample_num_tokens": 7229.75, "step": 5413, "total_num_tokens": 383608976.0, "z_loss": 0.0005598465213552117 }, { "copy_logits_max": -5.113465309143066, "copy_logits_min": -750000064.0, "copy_num_tokens": 466.75, "epoch": 1.10579525146796, "gen_logits_max": 5.252628803253174, "gen_logits_mean": -14.249387741088867, "gen_logits_min": -25.899383544921875, "gen_logits_std": 2.8347179889678955, "gen_loss": 0.2984226942062378, "grad_norm": 0.38830516969981277, "learning_rate": 2.4090947368421055e-05, "loss": 0.279, "mean_copy_accuracy": 0.9973533898591995, "mean_gen_accuracy": 0.8740052133798599, "mean_token_accuracy": 0.9060280621051788, "num_tokens": 383864932.0, "sample_num_tokens": 8664.0, "step": 5414, "total_num_tokens": 383899588.0, "z_loss": 0.0005177254788577557 }, { "copy_logits_max": -7.188350677490234, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.8125, "epoch": 1.105999489405157, "gen_logits_max": 4.08360481262207, "gen_logits_mean": -14.540225982666016, "gen_logits_min": -26.004236221313477, "gen_logits_std": 2.856423854827881, "gen_loss": 0.25486165285110474, "grad_norm": 0.39617035404541934, "learning_rate": 2.4089684210526316e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9966680705547333, "mean_gen_accuracy": 0.8744907528162003, "mean_token_accuracy": 0.9022941738367081, "num_tokens": 384136789.0, "sample_num_tokens": 8323.25, "step": 5415, "total_num_tokens": 384170082.0, "z_loss": 0.0004423916107043624 }, { "copy_logits_max": -7.826775550842285, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.8125, "epoch": 1.1062037273423537, "gen_logits_max": 4.296863555908203, "gen_logits_mean": -14.527366638183594, "gen_logits_min": -26.171768188476562, "gen_logits_std": 2.802499771118164, "gen_loss": 0.23970730602741241, "grad_norm": 0.3788396333586218, "learning_rate": 2.408842105263158e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9964341819286346, "mean_gen_accuracy": 0.8831155896186829, "mean_token_accuracy": 0.9086245745420456, "num_tokens": 384408759.0, "sample_num_tokens": 8968.25, "step": 5416, "total_num_tokens": 384444632.0, "z_loss": 0.00044566133874468505 }, { "copy_logits_max": -7.479399681091309, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.375, "epoch": 1.1064079652795507, "gen_logits_max": 4.90074348449707, "gen_logits_mean": -14.695747375488281, "gen_logits_min": -26.647361755371094, "gen_logits_std": 2.8660354614257812, "gen_loss": 0.29780760407447815, "grad_norm": 0.4408525104040859, "learning_rate": 2.4087157894736844e-05, "loss": 0.3153, "mean_copy_accuracy": 0.9946943372488022, "mean_gen_accuracy": 0.8667188882827759, "mean_token_accuracy": 0.8944158107042313, "num_tokens": 384669364.0, "sample_num_tokens": 8180.0, "step": 5417, "total_num_tokens": 384702084.0, "z_loss": 0.0005468814633786678 }, { "copy_logits_max": -6.6395673751831055, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.0, "epoch": 1.1066122032167476, "gen_logits_max": 3.5484652519226074, "gen_logits_mean": -16.31870460510254, "gen_logits_min": -28.094131469726562, "gen_logits_std": 2.931250810623169, "gen_loss": 0.24813982844352722, "grad_norm": 0.37568591058055284, "learning_rate": 2.4085894736842105e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9963832646608353, "mean_gen_accuracy": 0.8840612173080444, "mean_token_accuracy": 0.9117635786533356, "num_tokens": 384939063.0, "sample_num_tokens": 8669.75, "step": 5418, "total_num_tokens": 384973742.0, "z_loss": 0.0004623658605851233 }, { "copy_logits_max": -6.061339378356934, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.8125, "epoch": 1.1068164411539443, "gen_logits_max": 4.8456292152404785, "gen_logits_mean": -14.7332763671875, "gen_logits_min": -25.932628631591797, "gen_logits_std": 2.8091750144958496, "gen_loss": 0.263647198677063, "grad_norm": 0.3496744992622414, "learning_rate": 2.408463157894737e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9958437532186508, "mean_gen_accuracy": 0.883782148361206, "mean_token_accuracy": 0.9100208431482315, "num_tokens": 385216055.0, "sample_num_tokens": 8135.25, "step": 5419, "total_num_tokens": 385248596.0, "z_loss": 0.0005640742601826787 }, { "copy_logits_max": -3.4408316612243652, "copy_logits_min": -625000000.0, "copy_num_tokens": 864.5625, "epoch": 1.1070206790911412, "gen_logits_max": 3.596344470977783, "gen_logits_mean": -15.89954948425293, "gen_logits_min": -28.004188537597656, "gen_logits_std": 2.9691720008850098, "gen_loss": 0.2722238302230835, "grad_norm": 0.4043943806017326, "learning_rate": 2.408336842105263e-05, "loss": 0.2794, "mean_copy_accuracy": 0.996225118637085, "mean_gen_accuracy": 0.873055562376976, "mean_token_accuracy": 0.903860554099083, "num_tokens": 385488379.0, "sample_num_tokens": 11990.25, "step": 5420, "total_num_tokens": 385536340.0, "z_loss": 0.0006203685188665986 }, { "copy_logits_max": -5.847123622894287, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.25, "epoch": 1.107224917028338, "gen_logits_max": 4.949560165405273, "gen_logits_mean": -14.264299392700195, "gen_logits_min": -26.168241500854492, "gen_logits_std": 2.8466734886169434, "gen_loss": 0.3012835383415222, "grad_norm": 0.40335034932938046, "learning_rate": 2.4082105263157895e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9950112849473953, "mean_gen_accuracy": 0.8653849214315414, "mean_token_accuracy": 0.8952057808637619, "num_tokens": 385758327.0, "sample_num_tokens": 8131.75, "step": 5421, "total_num_tokens": 385790854.0, "z_loss": 0.0006711776950396597 }, { "copy_logits_max": -5.914934158325195, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.1875, "epoch": 1.1074291549655348, "gen_logits_max": 3.767568588256836, "gen_logits_mean": -15.68393611907959, "gen_logits_min": -27.6160888671875, "gen_logits_std": 2.945425033569336, "gen_loss": 0.24673274159431458, "grad_norm": 0.3872561437592402, "learning_rate": 2.408084210526316e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9960913956165314, "mean_gen_accuracy": 0.8791546672582626, "mean_token_accuracy": 0.9079108983278275, "num_tokens": 386007912.0, "sample_num_tokens": 9675.0, "step": 5422, "total_num_tokens": 386046612.0, "z_loss": 0.0005181802553124726 }, { "copy_logits_max": -6.180178642272949, "copy_logits_min": -750000064.0, "copy_num_tokens": 524.8125, "epoch": 1.1076333929027318, "gen_logits_max": 5.29265022277832, "gen_logits_mean": -13.969172477722168, "gen_logits_min": -25.9770565032959, "gen_logits_std": 2.8952622413635254, "gen_loss": 0.3056046962738037, "grad_norm": 0.3493669192655397, "learning_rate": 2.4079578947368424e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9963118135929108, "mean_gen_accuracy": 0.87802754342556, "mean_token_accuracy": 0.9106560498476028, "num_tokens": 386307751.0, "sample_num_tokens": 8729.75, "step": 5423, "total_num_tokens": 386342670.0, "z_loss": 0.0006353367352858186 }, { "copy_logits_max": -5.378867149353027, "copy_logits_min": -750000064.0, "copy_num_tokens": 580.125, "epoch": 1.1078376308399285, "gen_logits_max": 4.253303527832031, "gen_logits_mean": -15.408651351928711, "gen_logits_min": -26.90103530883789, "gen_logits_std": 2.9263525009155273, "gen_loss": 0.31088167428970337, "grad_norm": 0.392604269227208, "learning_rate": 2.4078315789473684e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9967577308416367, "mean_gen_accuracy": 0.8681159019470215, "mean_token_accuracy": 0.9025235921144485, "num_tokens": 386614537.0, "sample_num_tokens": 10623.25, "step": 5424, "total_num_tokens": 386657030.0, "z_loss": 0.0006408733315765858 }, { "copy_logits_max": -5.571628570556641, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.6875, "epoch": 1.1080418687771254, "gen_logits_max": 3.0226073265075684, "gen_logits_mean": -17.667510986328125, "gen_logits_min": -29.610885620117188, "gen_logits_std": 3.0000720024108887, "gen_loss": 0.2838837504386902, "grad_norm": 0.375455664707069, "learning_rate": 2.407705263157895e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9951673001050949, "mean_gen_accuracy": 0.8724749833345413, "mean_token_accuracy": 0.9050161242485046, "num_tokens": 386889495.0, "sample_num_tokens": 8098.75, "step": 5425, "total_num_tokens": 386921890.0, "z_loss": 0.0005503050051629543 }, { "copy_logits_max": -5.790043830871582, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.0625, "epoch": 1.108246106714322, "gen_logits_max": 3.6044178009033203, "gen_logits_mean": -15.974649429321289, "gen_logits_min": -28.185348510742188, "gen_logits_std": 3.003544807434082, "gen_loss": 0.2679312527179718, "grad_norm": 0.38156325630764165, "learning_rate": 2.407578947368421e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9969347566366196, "mean_gen_accuracy": 0.8696780949831009, "mean_token_accuracy": 0.9053052067756653, "num_tokens": 387164295.0, "sample_num_tokens": 8666.25, "step": 5426, "total_num_tokens": 387198960.0, "z_loss": 0.0005435844650492072 }, { "copy_logits_max": -3.4711735248565674, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.625, "epoch": 1.108450344651519, "gen_logits_max": 4.201418876647949, "gen_logits_mean": -15.050274848937988, "gen_logits_min": -26.68518829345703, "gen_logits_std": 2.9184489250183105, "gen_loss": 0.2887803912162781, "grad_norm": 0.42118472596046264, "learning_rate": 2.4074526315789474e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9952732622623444, "mean_gen_accuracy": 0.8741369247436523, "mean_token_accuracy": 0.900408998131752, "num_tokens": 387434103.0, "sample_num_tokens": 8221.75, "step": 5427, "total_num_tokens": 387466990.0, "z_loss": 0.0005824555992148817 }, { "copy_logits_max": -6.666284561157227, "copy_logits_min": -687500032.0, "copy_num_tokens": 340.0, "epoch": 1.108654582588716, "gen_logits_max": 5.141350746154785, "gen_logits_mean": -15.482637405395508, "gen_logits_min": -26.945737838745117, "gen_logits_std": 2.9028470516204834, "gen_loss": 0.28974205255508423, "grad_norm": 0.36778288207267074, "learning_rate": 2.4073263157894735e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9974770843982697, "mean_gen_accuracy": 0.8744248747825623, "mean_token_accuracy": 0.9105697870254517, "num_tokens": 387704956.0, "sample_num_tokens": 8027.0, "step": 5428, "total_num_tokens": 387737064.0, "z_loss": 0.0005747461691498756 }, { "copy_logits_max": -4.634112358093262, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.5, "epoch": 1.1088588205259127, "gen_logits_max": 3.734020709991455, "gen_logits_mean": -16.695106506347656, "gen_logits_min": -28.364669799804688, "gen_logits_std": 2.972597122192383, "gen_loss": 0.2758890986442566, "grad_norm": 0.39434468995938776, "learning_rate": 2.4072e-05, "loss": 0.2868, "mean_copy_accuracy": 0.996428444981575, "mean_gen_accuracy": 0.8727984130382538, "mean_token_accuracy": 0.9013912379741669, "num_tokens": 387964528.0, "sample_num_tokens": 7728.0, "step": 5429, "total_num_tokens": 387995440.0, "z_loss": 0.0005436903447844088 }, { "copy_logits_max": -6.166681289672852, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.9375, "epoch": 1.1090630584631096, "gen_logits_max": 3.6756675243377686, "gen_logits_mean": -16.667583465576172, "gen_logits_min": -28.460132598876953, "gen_logits_std": 2.954202175140381, "gen_loss": 0.28263619542121887, "grad_norm": 0.42976186747500683, "learning_rate": 2.4070736842105267e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9966220110654831, "mean_gen_accuracy": 0.8732426464557648, "mean_token_accuracy": 0.903152734041214, "num_tokens": 388222574.0, "sample_num_tokens": 8944.0, "step": 5430, "total_num_tokens": 388258350.0, "z_loss": 0.0005850862362422049 }, { "copy_logits_max": -7.8715667724609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.0625, "epoch": 1.1092672964003063, "gen_logits_max": 4.85247278213501, "gen_logits_mean": -16.111478805541992, "gen_logits_min": -27.282642364501953, "gen_logits_std": 2.9343056678771973, "gen_loss": 0.2770485281944275, "grad_norm": 0.38117498826980956, "learning_rate": 2.4069473684210528e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9962364137172699, "mean_gen_accuracy": 0.8772093206644058, "mean_token_accuracy": 0.902410477399826, "num_tokens": 388474846.0, "sample_num_tokens": 7665.5, "step": 5431, "total_num_tokens": 388505508.0, "z_loss": 0.0005596020491793752 }, { "copy_logits_max": -5.035156726837158, "copy_logits_min": -687500096.0, "copy_num_tokens": 565.75, "epoch": 1.1094715343375032, "gen_logits_max": 4.567747116088867, "gen_logits_mean": -15.24477767944336, "gen_logits_min": -27.0960693359375, "gen_logits_std": 2.961700439453125, "gen_loss": 0.27426284551620483, "grad_norm": 0.38573333051752673, "learning_rate": 2.4068210526315792e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9970108866691589, "mean_gen_accuracy": 0.8738117665052414, "mean_token_accuracy": 0.9095135629177094, "num_tokens": 388769389.0, "sample_num_tokens": 9175.25, "step": 5432, "total_num_tokens": 388806090.0, "z_loss": 0.0005933876382187009 }, { "copy_logits_max": -2.8667702674865723, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.125, "epoch": 1.1096757722747, "gen_logits_max": 3.6918601989746094, "gen_logits_mean": -15.686335563659668, "gen_logits_min": -27.59436798095703, "gen_logits_std": 2.966958999633789, "gen_loss": 0.22736087441444397, "grad_norm": 0.39281939735492666, "learning_rate": 2.4066947368421053e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9973480254411697, "mean_gen_accuracy": 0.8713180124759674, "mean_token_accuracy": 0.9066279381513596, "num_tokens": 389045183.0, "sample_num_tokens": 7839.75, "step": 5433, "total_num_tokens": 389076542.0, "z_loss": 0.0005176053382456303 }, { "copy_logits_max": -5.105609893798828, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.625, "epoch": 1.1098800102118969, "gen_logits_max": 4.553707122802734, "gen_logits_mean": -15.665963172912598, "gen_logits_min": -27.401424407958984, "gen_logits_std": 2.9425530433654785, "gen_loss": 0.2817731499671936, "grad_norm": 0.39794500377128567, "learning_rate": 2.4065684210526317e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9959556013345718, "mean_gen_accuracy": 0.8767174780368805, "mean_token_accuracy": 0.9041861444711685, "num_tokens": 389312233.0, "sample_num_tokens": 8881.75, "step": 5434, "total_num_tokens": 389347760.0, "z_loss": 0.000520620436873287 }, { "copy_logits_max": -3.785429000854492, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.3125, "epoch": 1.1100842481490938, "gen_logits_max": 4.640192031860352, "gen_logits_mean": -14.322734832763672, "gen_logits_min": -26.86345863342285, "gen_logits_std": 2.925414800643921, "gen_loss": 0.2765142619609833, "grad_norm": 0.3596241916268284, "learning_rate": 2.406442105263158e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9968304485082626, "mean_gen_accuracy": 0.8755190670490265, "mean_token_accuracy": 0.9078113436698914, "num_tokens": 389581697.0, "sample_num_tokens": 7828.75, "step": 5435, "total_num_tokens": 389613012.0, "z_loss": 0.0006256639026105404 }, { "copy_logits_max": -3.813952684402466, "copy_logits_min": -750000064.0, "copy_num_tokens": 470.0625, "epoch": 1.1102884860862905, "gen_logits_max": 3.647650718688965, "gen_logits_mean": -15.980278015136719, "gen_logits_min": -27.720237731933594, "gen_logits_std": 2.958848476409912, "gen_loss": 0.2669132649898529, "grad_norm": 0.38581577778478454, "learning_rate": 2.4063157894736843e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9956411421298981, "mean_gen_accuracy": 0.8718523532152176, "mean_token_accuracy": 0.9035423099994659, "num_tokens": 389860404.0, "sample_num_tokens": 8396.0, "step": 5436, "total_num_tokens": 389893988.0, "z_loss": 0.0005308474646881223 }, { "copy_logits_max": -4.585323333740234, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.9375, "epoch": 1.1104927240234874, "gen_logits_max": 4.526651382446289, "gen_logits_mean": -15.578577041625977, "gen_logits_min": -27.3029842376709, "gen_logits_std": 2.944026470184326, "gen_loss": 0.25903433561325073, "grad_norm": 0.3691827079037276, "learning_rate": 2.4061894736842104e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9957299381494522, "mean_gen_accuracy": 0.8782090246677399, "mean_token_accuracy": 0.9072471410036087, "num_tokens": 390128431.0, "sample_num_tokens": 6821.75, "step": 5437, "total_num_tokens": 390155718.0, "z_loss": 0.0005114835221320391 }, { "copy_logits_max": -4.68098258972168, "copy_logits_min": -750000128.0, "copy_num_tokens": 575.875, "epoch": 1.1106969619606841, "gen_logits_max": 3.596328020095825, "gen_logits_mean": -15.760537147521973, "gen_logits_min": -27.92745590209961, "gen_logits_std": 2.9771769046783447, "gen_loss": 0.25262075662612915, "grad_norm": 0.39432989491738335, "learning_rate": 2.406063157894737e-05, "loss": 0.263, "mean_copy_accuracy": 0.9967402219772339, "mean_gen_accuracy": 0.8763928562402725, "mean_token_accuracy": 0.9102029651403427, "num_tokens": 390419181.0, "sample_num_tokens": 9403.25, "step": 5438, "total_num_tokens": 390456794.0, "z_loss": 0.0005162269808351994 }, { "copy_logits_max": -4.978343963623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.625, "epoch": 1.110901199897881, "gen_logits_max": 4.514428615570068, "gen_logits_mean": -15.290294647216797, "gen_logits_min": -27.02724838256836, "gen_logits_std": 2.9258668422698975, "gen_loss": 0.330783486366272, "grad_norm": 0.38499890704974665, "learning_rate": 2.4059368421052632e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9957660287618637, "mean_gen_accuracy": 0.8737086802721024, "mean_token_accuracy": 0.9022262692451477, "num_tokens": 390676997.0, "sample_num_tokens": 7245.75, "step": 5439, "total_num_tokens": 390705980.0, "z_loss": 0.0006275579216890037 }, { "copy_logits_max": -7.54041862487793, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.4375, "epoch": 1.1111054378350778, "gen_logits_max": 4.283717632293701, "gen_logits_mean": -16.652162551879883, "gen_logits_min": -28.378257751464844, "gen_logits_std": 2.9487733840942383, "gen_loss": 0.3183520436286926, "grad_norm": 0.4027685593181496, "learning_rate": 2.4058105263157897e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9945390969514847, "mean_gen_accuracy": 0.8726658076047897, "mean_token_accuracy": 0.8980506360530853, "num_tokens": 390936587.0, "sample_num_tokens": 7812.25, "step": 5440, "total_num_tokens": 390967836.0, "z_loss": 0.0005476124351844192 }, { "copy_logits_max": -4.803739547729492, "copy_logits_min": -687500096.0, "copy_num_tokens": 427.0, "epoch": 1.1113096757722747, "gen_logits_max": 5.77297306060791, "gen_logits_mean": -12.843999862670898, "gen_logits_min": -25.109180450439453, "gen_logits_std": 2.8843350410461426, "gen_loss": 0.3048795461654663, "grad_norm": 0.44052339038666927, "learning_rate": 2.4056842105263157e-05, "loss": 0.2966, "mean_copy_accuracy": 0.995842695236206, "mean_gen_accuracy": 0.8659233003854752, "mean_token_accuracy": 0.8990335166454315, "num_tokens": 391239559.0, "sample_num_tokens": 8509.25, "step": 5441, "total_num_tokens": 391273596.0, "z_loss": 0.0005471161566674709 }, { "copy_logits_max": -5.43541145324707, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.3125, "epoch": 1.1115139137094716, "gen_logits_max": 5.1439104080200195, "gen_logits_mean": -14.566245079040527, "gen_logits_min": -26.595115661621094, "gen_logits_std": 2.9093801975250244, "gen_loss": 0.337229460477829, "grad_norm": 0.40928883891599854, "learning_rate": 2.4055578947368422e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9966142028570175, "mean_gen_accuracy": 0.8747555315494537, "mean_token_accuracy": 0.9018600136041641, "num_tokens": 391499868.0, "sample_num_tokens": 9445.0, "step": 5442, "total_num_tokens": 391537648.0, "z_loss": 0.0006520646857097745 }, { "copy_logits_max": -3.5474677085876465, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.3125, "epoch": 1.1117181516466683, "gen_logits_max": 4.105998992919922, "gen_logits_mean": -15.368691444396973, "gen_logits_min": -27.188743591308594, "gen_logits_std": 2.9209775924682617, "gen_loss": 0.29495322704315186, "grad_norm": 0.38725673438123115, "learning_rate": 2.4054315789473686e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9963543117046356, "mean_gen_accuracy": 0.8792617321014404, "mean_token_accuracy": 0.9116992205381393, "num_tokens": 391788599.0, "sample_num_tokens": 7682.25, "step": 5443, "total_num_tokens": 391819328.0, "z_loss": 0.0006479531875811517 }, { "copy_logits_max": -4.00303840637207, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.4375, "epoch": 1.1119223895838652, "gen_logits_max": 4.85786247253418, "gen_logits_mean": -14.504789352416992, "gen_logits_min": -26.31468391418457, "gen_logits_std": 2.8808634281158447, "gen_loss": 0.2804614305496216, "grad_norm": 0.39239574982460723, "learning_rate": 2.4053052631578947e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9968316555023193, "mean_gen_accuracy": 0.8791750818490982, "mean_token_accuracy": 0.9099760502576828, "num_tokens": 392055543.0, "sample_num_tokens": 7976.75, "step": 5444, "total_num_tokens": 392087450.0, "z_loss": 0.0006228877464309335 }, { "copy_logits_max": -1.280982255935669, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.0625, "epoch": 1.112126627521062, "gen_logits_max": 3.563076972961426, "gen_logits_mean": -15.827722549438477, "gen_logits_min": -27.89345932006836, "gen_logits_std": 2.9634151458740234, "gen_loss": 0.2975095212459564, "grad_norm": 0.3848136041184337, "learning_rate": 2.405178947368421e-05, "loss": 0.308, "mean_copy_accuracy": 0.9953079372644424, "mean_gen_accuracy": 0.8670278936624527, "mean_token_accuracy": 0.8979136198759079, "num_tokens": 392326352.0, "sample_num_tokens": 8160.5, "step": 5445, "total_num_tokens": 392358994.0, "z_loss": 0.0006284489063546062 }, { "copy_logits_max": -1.276711106300354, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.8125, "epoch": 1.1123308654582589, "gen_logits_max": 4.814607620239258, "gen_logits_mean": -14.344359397888184, "gen_logits_min": -26.332305908203125, "gen_logits_std": 2.918013095855713, "gen_loss": 0.2704482674598694, "grad_norm": 0.4090309274431442, "learning_rate": 2.4050526315789476e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9965174496173859, "mean_gen_accuracy": 0.8728439509868622, "mean_token_accuracy": 0.9054461568593979, "num_tokens": 392591100.0, "sample_num_tokens": 9243.0, "step": 5446, "total_num_tokens": 392628072.0, "z_loss": 0.000674258335493505 }, { "copy_logits_max": -0.7336739301681519, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.6875, "epoch": 1.1125351033954558, "gen_logits_max": 4.302826881408691, "gen_logits_mean": -15.382404327392578, "gen_logits_min": -27.745994567871094, "gen_logits_std": 2.970262050628662, "gen_loss": 0.2346133142709732, "grad_norm": 0.3524247203352083, "learning_rate": 2.404926315789474e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9966343343257904, "mean_gen_accuracy": 0.8794685900211334, "mean_token_accuracy": 0.9090454280376434, "num_tokens": 392863931.0, "sample_num_tokens": 7764.25, "step": 5447, "total_num_tokens": 392894988.0, "z_loss": 0.0005924543947912753 }, { "copy_logits_max": 1.8770493268966675, "copy_logits_min": -750000000.0, "copy_num_tokens": 741.9375, "epoch": 1.1127393413326525, "gen_logits_max": 5.588955402374268, "gen_logits_mean": -12.686805725097656, "gen_logits_min": -25.543689727783203, "gen_logits_std": 2.9581334590911865, "gen_loss": 0.21416939795017242, "grad_norm": 0.38106853675287344, "learning_rate": 2.4048e-05, "loss": 0.2421, "mean_copy_accuracy": 0.9966024309396744, "mean_gen_accuracy": 0.8860716074705124, "mean_token_accuracy": 0.9171701222658157, "num_tokens": 393131321.0, "sample_num_tokens": 9146.25, "step": 5448, "total_num_tokens": 393167906.0, "z_loss": 0.0006167161627672613 }, { "copy_logits_max": -4.110499858856201, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.5625, "epoch": 1.1129435792698494, "gen_logits_max": 4.811473846435547, "gen_logits_mean": -16.373146057128906, "gen_logits_min": -28.332660675048828, "gen_logits_std": 2.970386028289795, "gen_loss": 0.28106001019477844, "grad_norm": 0.3524817340139269, "learning_rate": 2.4046736842105265e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9965835362672806, "mean_gen_accuracy": 0.8752597868442535, "mean_token_accuracy": 0.9040651917457581, "num_tokens": 393421130.0, "sample_num_tokens": 9322.5, "step": 5449, "total_num_tokens": 393458420.0, "z_loss": 0.0006208977429196239 }, { "copy_logits_max": 0.1927739381790161, "copy_logits_min": -625000064.0, "copy_num_tokens": 630.625, "epoch": 1.1131478172070461, "gen_logits_max": 4.395256042480469, "gen_logits_mean": -13.39944839477539, "gen_logits_min": -26.08770179748535, "gen_logits_std": 2.945021867752075, "gen_loss": 0.2580529451370239, "grad_norm": 0.3653166172675275, "learning_rate": 2.4045473684210526e-05, "loss": 0.278, "mean_copy_accuracy": 0.9962651282548904, "mean_gen_accuracy": 0.8738808333873749, "mean_token_accuracy": 0.9059087634086609, "num_tokens": 393682200.0, "sample_num_tokens": 8542.5, "step": 5450, "total_num_tokens": 393716370.0, "z_loss": 0.0005551769863814116 }, { "copy_logits_max": -1.8516558408737183, "copy_logits_min": -687500032.0, "copy_num_tokens": 369.0625, "epoch": 1.113352055144243, "gen_logits_max": 5.894191741943359, "gen_logits_mean": -13.301106452941895, "gen_logits_min": -25.757057189941406, "gen_logits_std": 2.941864013671875, "gen_loss": 0.29225441813468933, "grad_norm": 0.36209758969318084, "learning_rate": 2.404421052631579e-05, "loss": 0.273, "mean_copy_accuracy": 0.9960841983556747, "mean_gen_accuracy": 0.8792034834623337, "mean_token_accuracy": 0.9080260097980499, "num_tokens": 393962766.0, "sample_num_tokens": 8374.0, "step": 5451, "total_num_tokens": 393996262.0, "z_loss": 0.0005538712721318007 }, { "copy_logits_max": -4.15570068359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.1875, "epoch": 1.1135562930814398, "gen_logits_max": 4.639880180358887, "gen_logits_mean": -15.049718856811523, "gen_logits_min": -26.99349594116211, "gen_logits_std": 2.9594385623931885, "gen_loss": 0.26523274183273315, "grad_norm": 0.35848442938595954, "learning_rate": 2.404294736842105e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9964263290166855, "mean_gen_accuracy": 0.8827612847089767, "mean_token_accuracy": 0.9126513451337814, "num_tokens": 394256668.0, "sample_num_tokens": 8852.0, "step": 5452, "total_num_tokens": 394292076.0, "z_loss": 0.00048301537754014134 }, { "copy_logits_max": -2.552227258682251, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.5, "epoch": 1.1137605310186367, "gen_logits_max": 4.278872013092041, "gen_logits_mean": -15.059864044189453, "gen_logits_min": -27.020572662353516, "gen_logits_std": 2.951362371444702, "gen_loss": 0.29344794154167175, "grad_norm": 0.3422036766309895, "learning_rate": 2.4041684210526316e-05, "loss": 0.273, "mean_copy_accuracy": 0.9968781024217606, "mean_gen_accuracy": 0.8741301000118256, "mean_token_accuracy": 0.9060287028551102, "num_tokens": 394540660.0, "sample_num_tokens": 8417.0, "step": 5453, "total_num_tokens": 394574328.0, "z_loss": 0.0004939372302033007 }, { "copy_logits_max": -2.496628522872925, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.9375, "epoch": 1.1139647689558336, "gen_logits_max": 6.658647537231445, "gen_logits_mean": -13.324699401855469, "gen_logits_min": -25.20156478881836, "gen_logits_std": 2.932849407196045, "gen_loss": 0.26687055826187134, "grad_norm": 0.38838434890985263, "learning_rate": 2.4040421052631577e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9958050400018692, "mean_gen_accuracy": 0.8783408403396606, "mean_token_accuracy": 0.9038251787424088, "num_tokens": 394804931.0, "sample_num_tokens": 8622.75, "step": 5454, "total_num_tokens": 394839422.0, "z_loss": 0.0004847896343562752 }, { "copy_logits_max": -4.499480724334717, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.875, "epoch": 1.1141690068930303, "gen_logits_max": 2.9548659324645996, "gen_logits_mean": -17.71588706970215, "gen_logits_min": -29.464855194091797, "gen_logits_std": 3.0237996578216553, "gen_loss": 0.2659350037574768, "grad_norm": 0.4029782956931715, "learning_rate": 2.4039157894736844e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9956045895814896, "mean_gen_accuracy": 0.8731498569250107, "mean_token_accuracy": 0.9028350114822388, "num_tokens": 395077644.0, "sample_num_tokens": 9144.5, "step": 5455, "total_num_tokens": 395114222.0, "z_loss": 0.0004344127082731575 }, { "copy_logits_max": -1.8500807285308838, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.5625, "epoch": 1.1143732448302273, "gen_logits_max": 3.7612907886505127, "gen_logits_mean": -16.916269302368164, "gen_logits_min": -29.007291793823242, "gen_logits_std": 3.00710391998291, "gen_loss": 0.28232964873313904, "grad_norm": 0.3606740674621734, "learning_rate": 2.4037894736842105e-05, "loss": 0.2828, "mean_copy_accuracy": 0.996519610285759, "mean_gen_accuracy": 0.8702414482831955, "mean_token_accuracy": 0.9035301208496094, "num_tokens": 395365771.0, "sample_num_tokens": 8980.25, "step": 5456, "total_num_tokens": 395401692.0, "z_loss": 0.0005404305411502719 }, { "copy_logits_max": -0.3705872893333435, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.8125, "epoch": 1.114577482767424, "gen_logits_max": 4.186149597167969, "gen_logits_mean": -15.525511741638184, "gen_logits_min": -27.620492935180664, "gen_logits_std": 2.9761080741882324, "gen_loss": 0.2520930767059326, "grad_norm": 0.3900692913009955, "learning_rate": 2.403663157894737e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9958554804325104, "mean_gen_accuracy": 0.880158469080925, "mean_token_accuracy": 0.9107280671596527, "num_tokens": 395630256.0, "sample_num_tokens": 8843.0, "step": 5457, "total_num_tokens": 395665628.0, "z_loss": 0.0004584617563523352 }, { "copy_logits_max": -0.6147092580795288, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.4375, "epoch": 1.114781720704621, "gen_logits_max": 4.073359489440918, "gen_logits_mean": -15.593711853027344, "gen_logits_min": -27.746746063232422, "gen_logits_std": 3.0088939666748047, "gen_loss": 0.2637059688568115, "grad_norm": 0.37107082693525045, "learning_rate": 2.4035368421052634e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9966444820165634, "mean_gen_accuracy": 0.8746131062507629, "mean_token_accuracy": 0.9079750180244446, "num_tokens": 395914594.0, "sample_num_tokens": 8843.0, "step": 5458, "total_num_tokens": 395949966.0, "z_loss": 0.0004731951339635998 }, { "copy_logits_max": 0.06861698627471924, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.1875, "epoch": 1.1149859586418178, "gen_logits_max": 5.180539131164551, "gen_logits_mean": -14.896430969238281, "gen_logits_min": -26.8946533203125, "gen_logits_std": 2.975668430328369, "gen_loss": 0.3160620331764221, "grad_norm": 0.38102888350073266, "learning_rate": 2.4034105263157895e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9970869719982147, "mean_gen_accuracy": 0.8697897344827652, "mean_token_accuracy": 0.903946653008461, "num_tokens": 396190327.0, "sample_num_tokens": 8049.25, "step": 5459, "total_num_tokens": 396222524.0, "z_loss": 0.0005998661508783698 }, { "copy_logits_max": -2.564732313156128, "copy_logits_min": -687500032.0, "copy_num_tokens": 357.9375, "epoch": 1.1151901965790145, "gen_logits_max": 4.739027976989746, "gen_logits_mean": -15.745943069458008, "gen_logits_min": -27.69133186340332, "gen_logits_std": 2.9530951976776123, "gen_loss": 0.29309433698654175, "grad_norm": 0.3850357542219036, "learning_rate": 2.403284210526316e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9960777312517166, "mean_gen_accuracy": 0.879578024148941, "mean_token_accuracy": 0.9076029509305954, "num_tokens": 396449835.0, "sample_num_tokens": 8374.25, "step": 5460, "total_num_tokens": 396483332.0, "z_loss": 0.0005136216641403735 }, { "copy_logits_max": -2.0219812393188477, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.125, "epoch": 1.1153944345162115, "gen_logits_max": 4.050455093383789, "gen_logits_mean": -15.35037612915039, "gen_logits_min": -27.463172912597656, "gen_logits_std": 2.9762978553771973, "gen_loss": 0.26089802384376526, "grad_norm": 0.41697136298904214, "learning_rate": 2.403157894736842e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9956212341785431, "mean_gen_accuracy": 0.8744344860315323, "mean_token_accuracy": 0.9053847044706345, "num_tokens": 396723464.0, "sample_num_tokens": 9597.0, "step": 5461, "total_num_tokens": 396761852.0, "z_loss": 0.0005096960230730474 }, { "copy_logits_max": -1.7489101886749268, "copy_logits_min": -687500032.0, "copy_num_tokens": 327.0, "epoch": 1.1155986724534082, "gen_logits_max": 5.279665946960449, "gen_logits_mean": -14.254354476928711, "gen_logits_min": -25.97209358215332, "gen_logits_std": 2.947803020477295, "gen_loss": 0.2923908829689026, "grad_norm": 0.37416962106357526, "learning_rate": 2.4030315789473684e-05, "loss": 0.2823, "mean_copy_accuracy": 0.996903195977211, "mean_gen_accuracy": 0.8760899156332016, "mean_token_accuracy": 0.9048774838447571, "num_tokens": 396993221.0, "sample_num_tokens": 7011.75, "step": 5462, "total_num_tokens": 397021268.0, "z_loss": 0.0005140214343555272 }, { "copy_logits_max": -0.18287813663482666, "copy_logits_min": -687500032.0, "copy_num_tokens": 713.5625, "epoch": 1.115802910390605, "gen_logits_max": 5.416153907775879, "gen_logits_mean": -12.467039108276367, "gen_logits_min": -24.082807540893555, "gen_logits_std": 2.8701720237731934, "gen_loss": 0.2665969729423523, "grad_norm": 0.40588167980626194, "learning_rate": 2.402905263157895e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9963585138320923, "mean_gen_accuracy": 0.8715206533670425, "mean_token_accuracy": 0.9034810662269592, "num_tokens": 397268190.0, "sample_num_tokens": 11197.5, "step": 5463, "total_num_tokens": 397312980.0, "z_loss": 0.00047360869939439 }, { "copy_logits_max": -3.152108669281006, "copy_logits_min": -750000000.0, "copy_num_tokens": 213.5625, "epoch": 1.1160071483278018, "gen_logits_max": 5.359296798706055, "gen_logits_mean": -15.705097198486328, "gen_logits_min": -27.326881408691406, "gen_logits_std": 2.9755802154541016, "gen_loss": 0.26162996888160706, "grad_norm": 0.3748680700827961, "learning_rate": 2.4027789473684213e-05, "loss": 0.2879, "mean_copy_accuracy": 0.994798555970192, "mean_gen_accuracy": 0.8780588805675507, "mean_token_accuracy": 0.9010683298110962, "num_tokens": 397544325.0, "sample_num_tokens": 7271.25, "step": 5464, "total_num_tokens": 397573410.0, "z_loss": 0.00048751942813396454 }, { "copy_logits_max": -3.8915138244628906, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.5625, "epoch": 1.1162113862649987, "gen_logits_max": 5.12326717376709, "gen_logits_mean": -15.041868209838867, "gen_logits_min": -26.52927017211914, "gen_logits_std": 2.9201462268829346, "gen_loss": 0.326002299785614, "grad_norm": 0.3612070425135366, "learning_rate": 2.4026526315789474e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9965153634548187, "mean_gen_accuracy": 0.8698281943798065, "mean_token_accuracy": 0.9034193605184555, "num_tokens": 397826941.0, "sample_num_tokens": 7365.75, "step": 5465, "total_num_tokens": 397856404.0, "z_loss": 0.0005963305593468249 }, { "copy_logits_max": -4.098187446594238, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.0, "epoch": 1.1164156242021956, "gen_logits_max": 5.24594783782959, "gen_logits_mean": -14.438555717468262, "gen_logits_min": -26.132482528686523, "gen_logits_std": 2.912287712097168, "gen_loss": 0.2681470513343811, "grad_norm": 0.3846131653805638, "learning_rate": 2.4025263157894738e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9957783371210098, "mean_gen_accuracy": 0.8814789652824402, "mean_token_accuracy": 0.9077645540237427, "num_tokens": 398095067.0, "sample_num_tokens": 8471.25, "step": 5466, "total_num_tokens": 398128952.0, "z_loss": 0.0005652515101246536 }, { "copy_logits_max": -2.3334081172943115, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 1.1166198621393923, "gen_logits_max": 4.028799057006836, "gen_logits_mean": -16.714839935302734, "gen_logits_min": -28.28690528869629, "gen_logits_std": 2.9923653602600098, "gen_loss": 0.2782450318336487, "grad_norm": 0.39752260173035403, "learning_rate": 2.4024e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9971719533205032, "mean_gen_accuracy": 0.8768653571605682, "mean_token_accuracy": 0.9105502814054489, "num_tokens": 398394184.0, "sample_num_tokens": 7730.5, "step": 5467, "total_num_tokens": 398425106.0, "z_loss": 0.00048592063831165433 }, { "copy_logits_max": -2.0788636207580566, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.375, "epoch": 1.1168241000765893, "gen_logits_max": 4.544761657714844, "gen_logits_mean": -14.73794937133789, "gen_logits_min": -26.439292907714844, "gen_logits_std": 2.914699077606201, "gen_loss": 0.26428401470184326, "grad_norm": 0.379762095727968, "learning_rate": 2.4022736842105263e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9965957850217819, "mean_gen_accuracy": 0.8749937862157822, "mean_token_accuracy": 0.9049888700246811, "num_tokens": 398650354.0, "sample_num_tokens": 7342.5, "step": 5468, "total_num_tokens": 398679724.0, "z_loss": 0.0004932964802719653 }, { "copy_logits_max": -2.3751115798950195, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.625, "epoch": 1.117028338013786, "gen_logits_max": 4.550514221191406, "gen_logits_mean": -14.786086082458496, "gen_logits_min": -26.437284469604492, "gen_logits_std": 2.9287424087524414, "gen_loss": 0.29684314131736755, "grad_norm": 0.3860719247780541, "learning_rate": 2.4021473684210524e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9959840923547745, "mean_gen_accuracy": 0.8796497732400894, "mean_token_accuracy": 0.9081273078918457, "num_tokens": 398932482.0, "sample_num_tokens": 8419.0, "step": 5469, "total_num_tokens": 398966158.0, "z_loss": 0.0005328916013240814 }, { "copy_logits_max": -4.021327495574951, "copy_logits_min": -750000064.0, "copy_num_tokens": 311.4375, "epoch": 1.117232575950983, "gen_logits_max": 4.366802215576172, "gen_logits_mean": -16.82773780822754, "gen_logits_min": -28.333560943603516, "gen_logits_std": 2.9452877044677734, "gen_loss": 0.3148273229598999, "grad_norm": 0.38156294657588696, "learning_rate": 2.402021052631579e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9965222924947739, "mean_gen_accuracy": 0.8770551979541779, "mean_token_accuracy": 0.9059509485960007, "num_tokens": 399210482.0, "sample_num_tokens": 7413.0, "step": 5470, "total_num_tokens": 399240134.0, "z_loss": 0.0006256329361349344 }, { "copy_logits_max": -2.1527929306030273, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.5, "epoch": 1.1174368138881796, "gen_logits_max": 4.460391998291016, "gen_logits_mean": -14.56392765045166, "gen_logits_min": -26.276329040527344, "gen_logits_std": 2.871659278869629, "gen_loss": 0.2722649574279785, "grad_norm": 0.3548195522478213, "learning_rate": 2.4018947368421056e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9973726719617844, "mean_gen_accuracy": 0.8827251493930817, "mean_token_accuracy": 0.9127224385738373, "num_tokens": 399482819.0, "sample_num_tokens": 8831.75, "step": 5471, "total_num_tokens": 399518146.0, "z_loss": 0.000530735298525542 }, { "copy_logits_max": -2.3257575035095215, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.75, "epoch": 1.1176410518253765, "gen_logits_max": 4.68184232711792, "gen_logits_mean": -14.549140930175781, "gen_logits_min": -25.92546844482422, "gen_logits_std": 2.823347568511963, "gen_loss": 0.3220483362674713, "grad_norm": 0.5980566708564159, "learning_rate": 2.4017684210526317e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9969503730535507, "mean_gen_accuracy": 0.8702532202005386, "mean_token_accuracy": 0.9021198749542236, "num_tokens": 399747015.0, "sample_num_tokens": 7159.75, "step": 5472, "total_num_tokens": 399775654.0, "z_loss": 0.0005854369956068695 }, { "copy_logits_max": 2.1680514812469482, "copy_logits_min": -562500096.0, "copy_num_tokens": 487.75, "epoch": 1.1178452897625735, "gen_logits_max": 4.742566108703613, "gen_logits_mean": -13.659063339233398, "gen_logits_min": -25.388427734375, "gen_logits_std": 2.8447346687316895, "gen_loss": 0.26669836044311523, "grad_norm": 0.3894262644549099, "learning_rate": 2.401642105263158e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9962069392204285, "mean_gen_accuracy": 0.875492125749588, "mean_token_accuracy": 0.9028241336345673, "num_tokens": 399994829.0, "sample_num_tokens": 7795.25, "step": 5473, "total_num_tokens": 400026010.0, "z_loss": 0.0005795427714474499 }, { "copy_logits_max": -1.7631609439849854, "copy_logits_min": -750000128.0, "copy_num_tokens": 474.8125, "epoch": 1.1180495276997702, "gen_logits_max": 4.556804180145264, "gen_logits_mean": -15.694096565246582, "gen_logits_min": -27.469093322753906, "gen_logits_std": 2.884784698486328, "gen_loss": 0.2957024574279785, "grad_norm": 0.3749921336014952, "learning_rate": 2.4015157894736842e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9964155703783035, "mean_gen_accuracy": 0.8730182647705078, "mean_token_accuracy": 0.9057688862085342, "num_tokens": 400266896.0, "sample_num_tokens": 8537.0, "step": 5474, "total_num_tokens": 400301044.0, "z_loss": 0.000610632065217942 }, { "copy_logits_max": -0.7907982468605042, "copy_logits_min": -625000064.0, "copy_num_tokens": 666.125, "epoch": 1.118253765636967, "gen_logits_max": 4.762497901916504, "gen_logits_mean": -13.523938179016113, "gen_logits_min": -25.587154388427734, "gen_logits_std": 2.911625623703003, "gen_loss": 0.2986406087875366, "grad_norm": 0.34569362273072085, "learning_rate": 2.4013894736842107e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9979392737150192, "mean_gen_accuracy": 0.878512978553772, "mean_token_accuracy": 0.9143396019935608, "num_tokens": 400556539.0, "sample_num_tokens": 9641.25, "step": 5475, "total_num_tokens": 400595104.0, "z_loss": 0.0006408432964235544 }, { "copy_logits_max": -3.1714377403259277, "copy_logits_min": -687500032.0, "copy_num_tokens": 375.3125, "epoch": 1.1184580035741638, "gen_logits_max": 4.582437038421631, "gen_logits_mean": -14.535797119140625, "gen_logits_min": -25.80148696899414, "gen_logits_std": 2.8285348415374756, "gen_loss": 0.2891330122947693, "grad_norm": 0.3703018834365808, "learning_rate": 2.4012631578947368e-05, "loss": 0.2729, "mean_copy_accuracy": 0.996274471282959, "mean_gen_accuracy": 0.8757904171943665, "mean_token_accuracy": 0.906795933842659, "num_tokens": 400836200.0, "sample_num_tokens": 7570.5, "step": 5476, "total_num_tokens": 400866482.0, "z_loss": 0.0005913392524234951 }, { "copy_logits_max": -0.8587134480476379, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.5, "epoch": 1.1186622415113607, "gen_logits_max": 4.4406352043151855, "gen_logits_mean": -13.894956588745117, "gen_logits_min": -26.31591033935547, "gen_logits_std": 2.8650965690612793, "gen_loss": 0.2774428129196167, "grad_norm": 0.3589933577351405, "learning_rate": 2.4011368421052632e-05, "loss": 0.2479, "mean_copy_accuracy": 0.9975292384624481, "mean_gen_accuracy": 0.8831276595592499, "mean_token_accuracy": 0.9154231697320938, "num_tokens": 401117088.0, "sample_num_tokens": 8304.0, "step": 5477, "total_num_tokens": 401150304.0, "z_loss": 0.0005899143870919943 }, { "copy_logits_max": -3.437741279602051, "copy_logits_min": -750000000.0, "copy_num_tokens": 746.6875, "epoch": 1.1188664794485577, "gen_logits_max": 4.855313301086426, "gen_logits_mean": -12.458642959594727, "gen_logits_min": -24.69344711303711, "gen_logits_std": 2.863316535949707, "gen_loss": 0.22795890271663666, "grad_norm": 0.40091441124705823, "learning_rate": 2.4010105263157893e-05, "loss": 0.2727, "mean_copy_accuracy": 0.997290849685669, "mean_gen_accuracy": 0.8744926750659943, "mean_token_accuracy": 0.9073891937732697, "num_tokens": 401390953.0, "sample_num_tokens": 9611.25, "step": 5478, "total_num_tokens": 401429398.0, "z_loss": 0.0004384040948934853 }, { "copy_logits_max": -4.599653244018555, "copy_logits_min": -750000064.0, "copy_num_tokens": 394.1875, "epoch": 1.1190707173857544, "gen_logits_max": 4.623006820678711, "gen_logits_mean": -15.867114067077637, "gen_logits_min": -27.368234634399414, "gen_logits_std": 2.901376485824585, "gen_loss": 0.28086215257644653, "grad_norm": 0.37859453633604273, "learning_rate": 2.400884210526316e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9955353289842606, "mean_gen_accuracy": 0.8739748746156693, "mean_token_accuracy": 0.9033996313810349, "num_tokens": 401673640.0, "sample_num_tokens": 8161.0, "step": 5479, "total_num_tokens": 401706284.0, "z_loss": 0.0005612347740679979 }, { "copy_logits_max": -3.7210190296173096, "copy_logits_min": -625000064.0, "copy_num_tokens": 742.6875, "epoch": 1.1192749553229513, "gen_logits_max": 3.042853355407715, "gen_logits_mean": -16.337175369262695, "gen_logits_min": -28.427160263061523, "gen_logits_std": 2.9582462310791016, "gen_loss": 0.2605963945388794, "grad_norm": 0.3258434066639346, "learning_rate": 2.400757894736842e-05, "loss": 0.2457, "mean_copy_accuracy": 0.9980853796005249, "mean_gen_accuracy": 0.8842816948890686, "mean_token_accuracy": 0.9171776324510574, "num_tokens": 401972398.0, "sample_num_tokens": 10305.5, "step": 5480, "total_num_tokens": 402013620.0, "z_loss": 0.0005303806392475963 }, { "copy_logits_max": -1.8233256340026855, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.4375, "epoch": 1.119479193260148, "gen_logits_max": 4.5508317947387695, "gen_logits_mean": -13.631208419799805, "gen_logits_min": -25.82672119140625, "gen_logits_std": 2.8660593032836914, "gen_loss": 0.25506722927093506, "grad_norm": 0.36313145545811704, "learning_rate": 2.4006315789473686e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9970024228096008, "mean_gen_accuracy": 0.8806440830230713, "mean_token_accuracy": 0.910706028342247, "num_tokens": 402271420.0, "sample_num_tokens": 9773.0, "step": 5481, "total_num_tokens": 402310512.0, "z_loss": 0.0005606240592896938 }, { "copy_logits_max": -3.135195016860962, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.5625, "epoch": 1.119683431197345, "gen_logits_max": 4.789393424987793, "gen_logits_mean": -13.321273803710938, "gen_logits_min": -24.928855895996094, "gen_logits_std": 2.8234879970550537, "gen_loss": 0.30672866106033325, "grad_norm": 0.3483147974745154, "learning_rate": 2.4005052631578947e-05, "loss": 0.2716, "mean_copy_accuracy": 0.997870996594429, "mean_gen_accuracy": 0.8752162158489227, "mean_token_accuracy": 0.9087505638599396, "num_tokens": 402591947.0, "sample_num_tokens": 9346.75, "step": 5482, "total_num_tokens": 402629334.0, "z_loss": 0.0005985030438750982 }, { "copy_logits_max": -5.692633628845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.125, "epoch": 1.1198876691345419, "gen_logits_max": 3.726778507232666, "gen_logits_mean": -16.389907836914062, "gen_logits_min": -28.28323745727539, "gen_logits_std": 2.9253740310668945, "gen_loss": 0.2676950693130493, "grad_norm": 0.42229368863418987, "learning_rate": 2.400378947368421e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9961740374565125, "mean_gen_accuracy": 0.8767019510269165, "mean_token_accuracy": 0.9095886647701263, "num_tokens": 402875381.0, "sample_num_tokens": 8354.75, "step": 5483, "total_num_tokens": 402908800.0, "z_loss": 0.000554647936951369 }, { "copy_logits_max": -5.542463302612305, "copy_logits_min": -750000064.0, "copy_num_tokens": 416.4375, "epoch": 1.1200919070717386, "gen_logits_max": 3.2822885513305664, "gen_logits_mean": -16.92043685913086, "gen_logits_min": -28.587757110595703, "gen_logits_std": 2.9363226890563965, "gen_loss": 0.2626522183418274, "grad_norm": 0.7429028606495929, "learning_rate": 2.4002526315789475e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9949054270982742, "mean_gen_accuracy": 0.8762198686599731, "mean_token_accuracy": 0.9037979990243912, "num_tokens": 403138910.0, "sample_num_tokens": 8169.5, "step": 5484, "total_num_tokens": 403171588.0, "z_loss": 0.0005305999075062573 }, { "copy_logits_max": -4.862240791320801, "copy_logits_min": -750000064.0, "copy_num_tokens": 371.25, "epoch": 1.1202961450089355, "gen_logits_max": 3.8338093757629395, "gen_logits_mean": -17.134750366210938, "gen_logits_min": -28.974607467651367, "gen_logits_std": 2.993345260620117, "gen_loss": 0.25929710268974304, "grad_norm": 0.41842404974225866, "learning_rate": 2.4001263157894736e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9960451871156693, "mean_gen_accuracy": 0.8790438026189804, "mean_token_accuracy": 0.9048106521368027, "num_tokens": 403392684.0, "sample_num_tokens": 7662.0, "step": 5485, "total_num_tokens": 403423332.0, "z_loss": 0.0004899194464087486 }, { "copy_logits_max": -4.807938575744629, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.5625, "epoch": 1.1205003829461322, "gen_logits_max": 4.245444297790527, "gen_logits_mean": -14.85653305053711, "gen_logits_min": -26.783557891845703, "gen_logits_std": 2.904848098754883, "gen_loss": 0.24323485791683197, "grad_norm": 0.3828733183671438, "learning_rate": 2.4e-05, "loss": 0.2551, "mean_copy_accuracy": 0.9976125359535217, "mean_gen_accuracy": 0.8804714381694794, "mean_token_accuracy": 0.9131193608045578, "num_tokens": 403666588.0, "sample_num_tokens": 8248.0, "step": 5486, "total_num_tokens": 403699580.0, "z_loss": 0.00046527135418727994 }, { "copy_logits_max": -4.069830894470215, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.4375, "epoch": 1.1207046208833291, "gen_logits_max": 4.694099426269531, "gen_logits_mean": -14.798513412475586, "gen_logits_min": -26.914356231689453, "gen_logits_std": 2.9293785095214844, "gen_loss": 0.3118065595626831, "grad_norm": 0.3939072453339398, "learning_rate": 2.3998736842105265e-05, "loss": 0.2911, "mean_copy_accuracy": 0.995568037033081, "mean_gen_accuracy": 0.8707231879234314, "mean_token_accuracy": 0.9010734856128693, "num_tokens": 403930049.0, "sample_num_tokens": 8123.75, "step": 5487, "total_num_tokens": 403962544.0, "z_loss": 0.0005875863134860992 }, { "copy_logits_max": -5.566891193389893, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.125, "epoch": 1.1209088588205258, "gen_logits_max": 3.8412866592407227, "gen_logits_mean": -16.33657455444336, "gen_logits_min": -28.07616424560547, "gen_logits_std": 2.9288017749786377, "gen_loss": 0.3151390552520752, "grad_norm": 0.3915270870924117, "learning_rate": 2.399747368421053e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9961491227149963, "mean_gen_accuracy": 0.8726682811975479, "mean_token_accuracy": 0.9046394377946854, "num_tokens": 404214312.0, "sample_num_tokens": 8413.0, "step": 5488, "total_num_tokens": 404247964.0, "z_loss": 0.0005824900581501424 }, { "copy_logits_max": -4.796079635620117, "copy_logits_min": -625000064.0, "copy_num_tokens": 364.6875, "epoch": 1.1211130967577227, "gen_logits_max": 4.974391937255859, "gen_logits_mean": -14.99007511138916, "gen_logits_min": -27.061193466186523, "gen_logits_std": 2.895115375518799, "gen_loss": 0.3448220193386078, "grad_norm": 0.43061795050083657, "learning_rate": 2.399621052631579e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9966852217912674, "mean_gen_accuracy": 0.8668114989995956, "mean_token_accuracy": 0.8980216085910797, "num_tokens": 404461355.0, "sample_num_tokens": 7729.25, "step": 5489, "total_num_tokens": 404492272.0, "z_loss": 0.0006502014002762735 }, { "copy_logits_max": -2.404700994491577, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.0, "epoch": 1.1213173346949197, "gen_logits_max": 5.388930797576904, "gen_logits_mean": -13.129670143127441, "gen_logits_min": -25.55367088317871, "gen_logits_std": 2.8987226486206055, "gen_loss": 0.26026690006256104, "grad_norm": 0.43743401164015266, "learning_rate": 2.3994947368421054e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9953656643629074, "mean_gen_accuracy": 0.870125025510788, "mean_token_accuracy": 0.8997117877006531, "num_tokens": 404732876.0, "sample_num_tokens": 7936.5, "step": 5490, "total_num_tokens": 404764622.0, "z_loss": 0.0005205882480368018 }, { "copy_logits_max": -5.270432472229004, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.3125, "epoch": 1.1215215726321164, "gen_logits_max": 4.3040266036987305, "gen_logits_mean": -15.606854438781738, "gen_logits_min": -27.58290672302246, "gen_logits_std": 2.9225518703460693, "gen_loss": 0.3073314130306244, "grad_norm": 0.40743597712639085, "learning_rate": 2.3993684210526315e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9960377365350723, "mean_gen_accuracy": 0.8698646277189255, "mean_token_accuracy": 0.9001787453889847, "num_tokens": 405016010.0, "sample_num_tokens": 7394.5, "step": 5491, "total_num_tokens": 405045588.0, "z_loss": 0.0005480835679918528 }, { "copy_logits_max": -5.167113304138184, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.125, "epoch": 1.1217258105693133, "gen_logits_max": 4.26713752746582, "gen_logits_mean": -15.020495414733887, "gen_logits_min": -27.016616821289062, "gen_logits_std": 2.9095373153686523, "gen_loss": 0.285693883895874, "grad_norm": 0.4323959386342529, "learning_rate": 2.399242105263158e-05, "loss": 0.306, "mean_copy_accuracy": 0.9947308301925659, "mean_gen_accuracy": 0.8692678809165955, "mean_token_accuracy": 0.8953878730535507, "num_tokens": 405257135.0, "sample_num_tokens": 9454.75, "step": 5492, "total_num_tokens": 405294954.0, "z_loss": 0.0005421644891612232 }, { "copy_logits_max": -5.495628356933594, "copy_logits_min": -750000064.0, "copy_num_tokens": 348.5625, "epoch": 1.12193004850651, "gen_logits_max": 4.5865092277526855, "gen_logits_mean": -15.46657943725586, "gen_logits_min": -27.00300407409668, "gen_logits_std": 2.91310977935791, "gen_loss": 0.2994931936264038, "grad_norm": 0.43183563261002633, "learning_rate": 2.399115789473684e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9956515282392502, "mean_gen_accuracy": 0.8797358870506287, "mean_token_accuracy": 0.90494404733181, "num_tokens": 405518605.0, "sample_num_tokens": 6844.75, "step": 5493, "total_num_tokens": 405545984.0, "z_loss": 0.0005387873388826847 }, { "copy_logits_max": -3.8728554248809814, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.9375, "epoch": 1.122134286443707, "gen_logits_max": 4.3810272216796875, "gen_logits_mean": -15.744514465332031, "gen_logits_min": -28.24759292602539, "gen_logits_std": 3.013920783996582, "gen_loss": 0.2770414352416992, "grad_norm": 0.8718540092731988, "learning_rate": 2.3989894736842105e-05, "loss": 0.2919, "mean_copy_accuracy": 0.9961254447698593, "mean_gen_accuracy": 0.8709065467119217, "mean_token_accuracy": 0.9003838896751404, "num_tokens": 405768963.0, "sample_num_tokens": 7495.25, "step": 5494, "total_num_tokens": 405798944.0, "z_loss": 0.0005224932101555169 }, { "copy_logits_max": -6.461733818054199, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.9375, "epoch": 1.1223385243809036, "gen_logits_max": 4.518928050994873, "gen_logits_mean": -14.96090030670166, "gen_logits_min": -26.963153839111328, "gen_logits_std": 2.9454751014709473, "gen_loss": 0.29433390498161316, "grad_norm": 0.412625615621473, "learning_rate": 2.398863157894737e-05, "loss": 0.286, "mean_copy_accuracy": 0.9963575005531311, "mean_gen_accuracy": 0.8711576014757156, "mean_token_accuracy": 0.902631402015686, "num_tokens": 406051463.0, "sample_num_tokens": 7922.75, "step": 5495, "total_num_tokens": 406083154.0, "z_loss": 0.0005194173427298665 }, { "copy_logits_max": -6.550634384155273, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.625, "epoch": 1.1225427623181006, "gen_logits_max": 5.079500198364258, "gen_logits_mean": -15.170177459716797, "gen_logits_min": -26.99612808227539, "gen_logits_std": 2.9239120483398438, "gen_loss": 0.2779148817062378, "grad_norm": 0.4072026355818648, "learning_rate": 2.3987368421052634e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9947256445884705, "mean_gen_accuracy": 0.8759109973907471, "mean_token_accuracy": 0.9031932055950165, "num_tokens": 406304152.0, "sample_num_tokens": 8481.5, "step": 5496, "total_num_tokens": 406338078.0, "z_loss": 0.0004723082820419222 }, { "copy_logits_max": -6.366084098815918, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.125, "epoch": 1.1227470002552975, "gen_logits_max": 4.904616355895996, "gen_logits_mean": -14.673253059387207, "gen_logits_min": -26.610733032226562, "gen_logits_std": 2.904090166091919, "gen_loss": 0.30764317512512207, "grad_norm": 0.43292526968804773, "learning_rate": 2.3986105263157898e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9943791329860687, "mean_gen_accuracy": 0.8755825906991959, "mean_token_accuracy": 0.8980595022439957, "num_tokens": 406563245.0, "sample_num_tokens": 9529.25, "step": 5497, "total_num_tokens": 406601362.0, "z_loss": 0.0005705090006813407 }, { "copy_logits_max": -4.290772438049316, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.25, "epoch": 1.1229512381924942, "gen_logits_max": 3.271419048309326, "gen_logits_mean": -16.27564239501953, "gen_logits_min": -28.627483367919922, "gen_logits_std": 2.985745906829834, "gen_loss": 0.28411510586738586, "grad_norm": 0.38405780803225725, "learning_rate": 2.398484210526316e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9963884055614471, "mean_gen_accuracy": 0.8788346499204636, "mean_token_accuracy": 0.9062302112579346, "num_tokens": 406819350.0, "sample_num_tokens": 8804.0, "step": 5498, "total_num_tokens": 406854566.0, "z_loss": 0.0005641165771521628 }, { "copy_logits_max": -5.452792167663574, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.25, "epoch": 1.1231554761296911, "gen_logits_max": 4.201106071472168, "gen_logits_mean": -15.33418083190918, "gen_logits_min": -26.953125, "gen_logits_std": 2.903899669647217, "gen_loss": 0.24337232112884521, "grad_norm": 0.40378936073605015, "learning_rate": 2.3983578947368423e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9962020069360733, "mean_gen_accuracy": 0.8741304725408554, "mean_token_accuracy": 0.906122162938118, "num_tokens": 407101007.0, "sample_num_tokens": 10562.75, "step": 5499, "total_num_tokens": 407143258.0, "z_loss": 0.0005378468194976449 }, { "epoch": 1.1233597140668878, "grad_norm": 0.37397955613331146, "learning_rate": 2.3982315789473684e-05, "loss": 0.2579, "step": 5500 }, { "epoch": 1.1233597140668878, "eval_copy_logits_max": -7.14941930770874, "eval_copy_logits_min": -81.40160369873047, "eval_gen_logits_max": 3.5736775398254395, "eval_gen_logits_mean": -20.09735870361328, "eval_gen_logits_min": -31.123291015625, "eval_gen_logits_std": 2.9949660301208496, "eval_gen_loss": 0.3401893377304077, "eval_loss": 0.32020145654678345, "eval_mean_copy_accuracy": 0.9956976473331451, "eval_mean_gen_accuracy": 0.8755049109458923, "eval_mean_token_accuracy": 0.8908728957176208, "eval_num_tokens": 407408138.0, "eval_runtime": 0.672, "eval_samples_per_second": 11.905, "eval_steps_per_second": 2.976, "eval_total_num_tokens": 407408138.0, "eval_z_loss": 0.0006223173695616424, "step": 5500 }, { "copy_logits_max": -5.343413352966309, "copy_logits_min": -625000000.0, "copy_num_tokens": 369.125, "epoch": 1.1235639520040848, "gen_logits_max": 4.962574005126953, "gen_logits_mean": -14.966832160949707, "gen_logits_min": -26.866413116455078, "gen_logits_std": 2.9287984371185303, "gen_loss": 0.27837544679641724, "grad_norm": 0.4307412801844681, "learning_rate": 2.398105263157895e-05, "loss": 0.273, "mean_copy_accuracy": 0.9954477474093437, "mean_gen_accuracy": 0.8831089586019516, "mean_token_accuracy": 0.9094782620668411, "num_tokens": 407646301.0, "sample_num_tokens": 8712.25, "step": 5501, "total_num_tokens": 407681150.0, "z_loss": 0.0005561006837524474 }, { "copy_logits_max": -4.59721040725708, "copy_logits_min": -750000064.0, "copy_num_tokens": 391.5625, "epoch": 1.1237681899412817, "gen_logits_max": 4.577819347381592, "gen_logits_mean": -14.668950080871582, "gen_logits_min": -26.902950286865234, "gen_logits_std": 2.9013633728027344, "gen_loss": 0.2630791664123535, "grad_norm": 0.3995852701416064, "learning_rate": 2.397978947368421e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9951611012220383, "mean_gen_accuracy": 0.8767737597227097, "mean_token_accuracy": 0.9028243571519852, "num_tokens": 407910298.0, "sample_num_tokens": 7977.5, "step": 5502, "total_num_tokens": 407942208.0, "z_loss": 0.0005131877842359245 }, { "copy_logits_max": -6.230751991271973, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.5625, "epoch": 1.1239724278784784, "gen_logits_max": 4.789109706878662, "gen_logits_mean": -15.232641220092773, "gen_logits_min": -27.016559600830078, "gen_logits_std": 2.964998245239258, "gen_loss": 0.2867688238620758, "grad_norm": 0.40706657808266317, "learning_rate": 2.3978526315789474e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9954740554094315, "mean_gen_accuracy": 0.8781452029943466, "mean_token_accuracy": 0.9056092947721481, "num_tokens": 408164952.0, "sample_num_tokens": 7639.0, "step": 5503, "total_num_tokens": 408195508.0, "z_loss": 0.0005334275774657726 }, { "copy_logits_max": -5.155599594116211, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.6875, "epoch": 1.1241766658156753, "gen_logits_max": 4.436187744140625, "gen_logits_mean": -15.879826545715332, "gen_logits_min": -27.649547576904297, "gen_logits_std": 2.9693174362182617, "gen_loss": 0.2818993926048279, "grad_norm": 0.40258418379539096, "learning_rate": 2.3977263157894738e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9962212890386581, "mean_gen_accuracy": 0.877248540520668, "mean_token_accuracy": 0.9056846648454666, "num_tokens": 408447782.0, "sample_num_tokens": 9164.5, "step": 5504, "total_num_tokens": 408484440.0, "z_loss": 0.0006030523800291121 }, { "copy_logits_max": -4.875871658325195, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.8125, "epoch": 1.124380903752872, "gen_logits_max": 4.435686111450195, "gen_logits_mean": -14.79238510131836, "gen_logits_min": -26.67316246032715, "gen_logits_std": 2.919736623764038, "gen_loss": 0.27024945616722107, "grad_norm": 0.42972119140114656, "learning_rate": 2.3976000000000002e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9955439120531082, "mean_gen_accuracy": 0.8686515390872955, "mean_token_accuracy": 0.9005621522665024, "num_tokens": 408699494.0, "sample_num_tokens": 8749.0, "step": 5505, "total_num_tokens": 408734490.0, "z_loss": 0.0005157195846550167 }, { "copy_logits_max": -4.472126007080078, "copy_logits_min": -687500032.0, "copy_num_tokens": 545.375, "epoch": 1.124585141690069, "gen_logits_max": 5.347497940063477, "gen_logits_mean": -13.29418659210205, "gen_logits_min": -25.959556579589844, "gen_logits_std": 2.891225814819336, "gen_loss": 0.2552635073661804, "grad_norm": 0.38396676646953826, "learning_rate": 2.3974736842105263e-05, "loss": 0.277, "mean_copy_accuracy": 0.9954645335674286, "mean_gen_accuracy": 0.8770711421966553, "mean_token_accuracy": 0.9046706706285477, "num_tokens": 408969773.0, "sample_num_tokens": 9365.25, "step": 5506, "total_num_tokens": 409007234.0, "z_loss": 0.0004735066322609782 }, { "copy_logits_max": -3.3392632007598877, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.4375, "epoch": 1.1247893796272657, "gen_logits_max": 3.991427183151245, "gen_logits_mean": -15.767922401428223, "gen_logits_min": -28.332843780517578, "gen_logits_std": 3.0034470558166504, "gen_loss": 0.276826411485672, "grad_norm": 0.7773248658407136, "learning_rate": 2.3973473684210527e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9960126429796219, "mean_gen_accuracy": 0.8715904802083969, "mean_token_accuracy": 0.9020701497793198, "num_tokens": 409225468.0, "sample_num_tokens": 8505.5, "step": 5507, "total_num_tokens": 409259490.0, "z_loss": 0.0005430489545688033 }, { "copy_logits_max": -3.1985530853271484, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.6875, "epoch": 1.1249936175644626, "gen_logits_max": 4.776111602783203, "gen_logits_mean": -14.741706848144531, "gen_logits_min": -26.429140090942383, "gen_logits_std": 2.8950281143188477, "gen_loss": 0.2970257103443146, "grad_norm": 0.38095220416775455, "learning_rate": 2.397221052631579e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9962752312421799, "mean_gen_accuracy": 0.8699756562709808, "mean_token_accuracy": 0.9022615253925323, "num_tokens": 409514514.0, "sample_num_tokens": 7942.5, "step": 5508, "total_num_tokens": 409546284.0, "z_loss": 0.0006037308485247195 }, { "copy_logits_max": -3.7184371948242188, "copy_logits_min": -687500032.0, "copy_num_tokens": 565.0625, "epoch": 1.1251978555016595, "gen_logits_max": 3.9268527030944824, "gen_logits_mean": -15.96914291381836, "gen_logits_min": -27.702089309692383, "gen_logits_std": 2.9546432495117188, "gen_loss": 0.2761952877044678, "grad_norm": 0.3923845367677959, "learning_rate": 2.3970947368421053e-05, "loss": 0.285, "mean_copy_accuracy": 0.9956672787666321, "mean_gen_accuracy": 0.8689641952514648, "mean_token_accuracy": 0.9036795347929001, "num_tokens": 409792919.0, "sample_num_tokens": 9031.75, "step": 5509, "total_num_tokens": 409829046.0, "z_loss": 0.0005212483229115605 }, { "copy_logits_max": -2.640981674194336, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.0, "epoch": 1.1254020934388562, "gen_logits_max": 3.9099955558776855, "gen_logits_mean": -15.311141967773438, "gen_logits_min": -26.655105590820312, "gen_logits_std": 2.8808932304382324, "gen_loss": 0.3135455846786499, "grad_norm": 0.446522127832333, "learning_rate": 2.3969684210526314e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9945297241210938, "mean_gen_accuracy": 0.8739737868309021, "mean_token_accuracy": 0.9032507538795471, "num_tokens": 410051421.0, "sample_num_tokens": 8980.75, "step": 5510, "total_num_tokens": 410087344.0, "z_loss": 0.0004939117934554815 }, { "copy_logits_max": -2.5069377422332764, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.625, "epoch": 1.1256063313760531, "gen_logits_max": 5.088651180267334, "gen_logits_mean": -13.727633476257324, "gen_logits_min": -25.103086471557617, "gen_logits_std": 2.810235023498535, "gen_loss": 0.23712828755378723, "grad_norm": 0.42550264651912584, "learning_rate": 2.3968421052631578e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9955860674381256, "mean_gen_accuracy": 0.8757770359516144, "mean_token_accuracy": 0.9044819623231888, "num_tokens": 410315361.0, "sample_num_tokens": 8126.25, "step": 5511, "total_num_tokens": 410347866.0, "z_loss": 0.0004234145162627101 }, { "copy_logits_max": -1.0371171236038208, "copy_logits_min": -750000000.0, "copy_num_tokens": 748.5625, "epoch": 1.1258105693132499, "gen_logits_max": 5.407981872558594, "gen_logits_mean": -12.838008880615234, "gen_logits_min": -24.55764389038086, "gen_logits_std": 2.815629005432129, "gen_loss": 0.24489204585552216, "grad_norm": 0.4319454383018859, "learning_rate": 2.3967157894736846e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9941756874322891, "mean_gen_accuracy": 0.8806917667388916, "mean_token_accuracy": 0.9100415408611298, "num_tokens": 410587803.0, "sample_num_tokens": 9712.75, "step": 5512, "total_num_tokens": 410626654.0, "z_loss": 0.00047758762957528234 }, { "copy_logits_max": -1.370382308959961, "copy_logits_min": -750000064.0, "copy_num_tokens": 518.375, "epoch": 1.1260148072504468, "gen_logits_max": 5.489080429077148, "gen_logits_mean": -12.227778434753418, "gen_logits_min": -23.85930633544922, "gen_logits_std": 2.755185127258301, "gen_loss": 0.2456340789794922, "grad_norm": 0.41506336783158704, "learning_rate": 2.3965894736842107e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9961466938257217, "mean_gen_accuracy": 0.8746931701898575, "mean_token_accuracy": 0.9056968539953232, "num_tokens": 410851833.0, "sample_num_tokens": 8788.25, "step": 5513, "total_num_tokens": 410886986.0, "z_loss": 0.0004774336121045053 }, { "copy_logits_max": -3.4574015140533447, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.6875, "epoch": 1.1262190451876437, "gen_logits_max": 4.633937835693359, "gen_logits_mean": -15.032683372497559, "gen_logits_min": -26.32343864440918, "gen_logits_std": 2.8699886798858643, "gen_loss": 0.28563162684440613, "grad_norm": 0.4170971137877487, "learning_rate": 2.396463157894737e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9946407079696655, "mean_gen_accuracy": 0.8703544586896896, "mean_token_accuracy": 0.89692023396492, "num_tokens": 411111453.0, "sample_num_tokens": 8332.25, "step": 5514, "total_num_tokens": 411144782.0, "z_loss": 0.0005555372918024659 }, { "copy_logits_max": -1.885350227355957, "copy_logits_min": -687500032.0, "copy_num_tokens": 568.875, "epoch": 1.1264232831248404, "gen_logits_max": 4.233388900756836, "gen_logits_mean": -15.235214233398438, "gen_logits_min": -27.238231658935547, "gen_logits_std": 2.906339168548584, "gen_loss": 0.25337889790534973, "grad_norm": 0.4164401743626589, "learning_rate": 2.3963368421052632e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9955229014158249, "mean_gen_accuracy": 0.8742198348045349, "mean_token_accuracy": 0.9111012518405914, "num_tokens": 411405668.0, "sample_num_tokens": 8245.5, "step": 5515, "total_num_tokens": 411438650.0, "z_loss": 0.0005553931696340442 }, { "copy_logits_max": -3.0004706382751465, "copy_logits_min": -750000064.0, "copy_num_tokens": 489.3125, "epoch": 1.1266275210620373, "gen_logits_max": 4.816357135772705, "gen_logits_mean": -14.999366760253906, "gen_logits_min": -27.255992889404297, "gen_logits_std": 2.957146644592285, "gen_loss": 0.29032444953918457, "grad_norm": 0.3711445631823188, "learning_rate": 2.3962105263157896e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9958327263593674, "mean_gen_accuracy": 0.8778116106987, "mean_token_accuracy": 0.9079098105430603, "num_tokens": 411682242.0, "sample_num_tokens": 8763.0, "step": 5516, "total_num_tokens": 411717294.0, "z_loss": 0.000626447843387723 }, { "copy_logits_max": -2.1339802742004395, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.375, "epoch": 1.126831758999234, "gen_logits_max": 5.629153251647949, "gen_logits_mean": -13.88040542602539, "gen_logits_min": -25.47428321838379, "gen_logits_std": 2.8403124809265137, "gen_loss": 0.2726615071296692, "grad_norm": 0.38237718458388026, "learning_rate": 2.3960842105263157e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9963714480400085, "mean_gen_accuracy": 0.8831682950258255, "mean_token_accuracy": 0.911096379160881, "num_tokens": 411945962.0, "sample_num_tokens": 8065.5, "step": 5517, "total_num_tokens": 411978224.0, "z_loss": 0.0006210595602169633 }, { "copy_logits_max": -2.186795711517334, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.875, "epoch": 1.127035996936431, "gen_logits_max": 4.394069194793701, "gen_logits_mean": -14.900334358215332, "gen_logits_min": -26.9354190826416, "gen_logits_std": 2.9074606895446777, "gen_loss": 0.2837154269218445, "grad_norm": 0.41266990275538284, "learning_rate": 2.395957894736842e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9954506754875183, "mean_gen_accuracy": 0.8768385350704193, "mean_token_accuracy": 0.9079314023256302, "num_tokens": 412217431.0, "sample_num_tokens": 8702.75, "step": 5518, "total_num_tokens": 412252242.0, "z_loss": 0.0006469901418313384 }, { "copy_logits_max": -3.355034828186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.25, "epoch": 1.1272402348736277, "gen_logits_max": 3.9021430015563965, "gen_logits_mean": -15.056354522705078, "gen_logits_min": -26.8038330078125, "gen_logits_std": 2.8701894283294678, "gen_loss": 0.24238453805446625, "grad_norm": 0.40349308035586356, "learning_rate": 2.3958315789473682e-05, "loss": 0.2741, "mean_copy_accuracy": 0.994808241724968, "mean_gen_accuracy": 0.8765165954828262, "mean_token_accuracy": 0.9072475582361221, "num_tokens": 412489714.0, "sample_num_tokens": 9234.5, "step": 5519, "total_num_tokens": 412526652.0, "z_loss": 0.0005533293588086963 }, { "copy_logits_max": -5.068161487579346, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.3125, "epoch": 1.1274444728108246, "gen_logits_max": 4.2322282791137695, "gen_logits_mean": -16.396305084228516, "gen_logits_min": -27.731765747070312, "gen_logits_std": 2.928689956665039, "gen_loss": 0.25502410531044006, "grad_norm": 0.3790189013492236, "learning_rate": 2.395705263157895e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9964647740125656, "mean_gen_accuracy": 0.8781703859567642, "mean_token_accuracy": 0.9067211449146271, "num_tokens": 412768871.0, "sample_num_tokens": 8621.25, "step": 5520, "total_num_tokens": 412803356.0, "z_loss": 0.0005410996964201331 }, { "copy_logits_max": -3.7037618160247803, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.5, "epoch": 1.1276487107480215, "gen_logits_max": 4.067187786102295, "gen_logits_mean": -15.680201530456543, "gen_logits_min": -27.36245346069336, "gen_logits_std": 2.9064486026763916, "gen_loss": 0.2484380602836609, "grad_norm": 0.40715917608475316, "learning_rate": 2.395578947368421e-05, "loss": 0.2558, "mean_copy_accuracy": 0.996345579624176, "mean_gen_accuracy": 0.878700852394104, "mean_token_accuracy": 0.9141189157962799, "num_tokens": 413043380.0, "sample_num_tokens": 8544.0, "step": 5521, "total_num_tokens": 413077556.0, "z_loss": 0.0005381579394452274 }, { "copy_logits_max": -4.471824645996094, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.9375, "epoch": 1.1278529486852182, "gen_logits_max": 4.182697296142578, "gen_logits_mean": -15.522705078125, "gen_logits_min": -26.68006134033203, "gen_logits_std": 2.848787307739258, "gen_loss": 0.2841062545776367, "grad_norm": 0.433273917881675, "learning_rate": 2.3954526315789475e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9939478784799576, "mean_gen_accuracy": 0.87267005443573, "mean_token_accuracy": 0.9000728279352188, "num_tokens": 413296598.0, "sample_num_tokens": 9252.5, "step": 5522, "total_num_tokens": 413333608.0, "z_loss": 0.0006385630695149302 }, { "copy_logits_max": -3.9090378284454346, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.6875, "epoch": 1.1280571866224152, "gen_logits_max": 5.337975978851318, "gen_logits_mean": -14.230979919433594, "gen_logits_min": -25.37769317626953, "gen_logits_std": 2.812389850616455, "gen_loss": 0.28520673513412476, "grad_norm": 0.37340042859958367, "learning_rate": 2.3953263157894736e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9963923245668411, "mean_gen_accuracy": 0.876127153635025, "mean_token_accuracy": 0.9086930155754089, "num_tokens": 413575207.0, "sample_num_tokens": 8064.75, "step": 5523, "total_num_tokens": 413607466.0, "z_loss": 0.0006460017757490277 }, { "copy_logits_max": -4.526998043060303, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.0, "epoch": 1.1282614245596119, "gen_logits_max": 5.467118263244629, "gen_logits_mean": -14.245485305786133, "gen_logits_min": -25.544658660888672, "gen_logits_std": 2.8313615322113037, "gen_loss": 0.2912132143974304, "grad_norm": 0.41935451052692996, "learning_rate": 2.3952e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9953100085258484, "mean_gen_accuracy": 0.8803815394639969, "mean_token_accuracy": 0.9056994020938873, "num_tokens": 413829622.0, "sample_num_tokens": 7190.0, "step": 5524, "total_num_tokens": 413858382.0, "z_loss": 0.0006060402374714613 }, { "copy_logits_max": -0.7254980206489563, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.8125, "epoch": 1.1284656624968088, "gen_logits_max": 5.623712062835693, "gen_logits_mean": -13.12534236907959, "gen_logits_min": -24.73160171508789, "gen_logits_std": 2.8416521549224854, "gen_loss": 0.2973228096961975, "grad_norm": 0.41855087309373534, "learning_rate": 2.3950736842105265e-05, "loss": 0.28, "mean_copy_accuracy": 0.9959498196840286, "mean_gen_accuracy": 0.874147817492485, "mean_token_accuracy": 0.9070709645748138, "num_tokens": 414123230.0, "sample_num_tokens": 9326.0, "step": 5525, "total_num_tokens": 414160534.0, "z_loss": 0.0006389247137121856 }, { "copy_logits_max": -1.5243560075759888, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.0625, "epoch": 1.1286699004340055, "gen_logits_max": 5.141977310180664, "gen_logits_mean": -14.44532585144043, "gen_logits_min": -26.30475616455078, "gen_logits_std": 2.8761515617370605, "gen_loss": 0.3582283556461334, "grad_norm": 0.3944702052584162, "learning_rate": 2.3949473684210526e-05, "loss": 0.3126, "mean_copy_accuracy": 0.9939644932746887, "mean_gen_accuracy": 0.8696571290493011, "mean_token_accuracy": 0.89329394698143, "num_tokens": 414395210.0, "sample_num_tokens": 7377.0, "step": 5526, "total_num_tokens": 414424718.0, "z_loss": 0.0006932247197255492 }, { "copy_logits_max": -1.4065428972244263, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5625, "epoch": 1.1288741383712024, "gen_logits_max": 4.761359214782715, "gen_logits_mean": -14.326639175415039, "gen_logits_min": -25.915958404541016, "gen_logits_std": 2.8459253311157227, "gen_loss": 0.2993772029876709, "grad_norm": 0.3684748025482603, "learning_rate": 2.394821052631579e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9964954257011414, "mean_gen_accuracy": 0.8794869482517242, "mean_token_accuracy": 0.9091714024543762, "num_tokens": 414684907.0, "sample_num_tokens": 8587.75, "step": 5527, "total_num_tokens": 414719258.0, "z_loss": 0.0006068822112865746 }, { "copy_logits_max": -1.4929436445236206, "copy_logits_min": -625000000.0, "copy_num_tokens": 463.8125, "epoch": 1.1290783763083994, "gen_logits_max": 4.333098411560059, "gen_logits_mean": -14.237346649169922, "gen_logits_min": -26.575288772583008, "gen_logits_std": 2.9244112968444824, "gen_loss": 0.26524415612220764, "grad_norm": 0.4984561329391545, "learning_rate": 2.3946947368421054e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9952674359083176, "mean_gen_accuracy": 0.8763895034790039, "mean_token_accuracy": 0.9054417312145233, "num_tokens": 414951485.0, "sample_num_tokens": 7943.75, "step": 5528, "total_num_tokens": 414983260.0, "z_loss": 0.0005762729560956359 }, { "copy_logits_max": -3.168874979019165, "copy_logits_min": -625000064.0, "copy_num_tokens": 383.75, "epoch": 1.129282614245596, "gen_logits_max": 4.627316951751709, "gen_logits_mean": -15.716730117797852, "gen_logits_min": -27.089540481567383, "gen_logits_std": 2.891864061355591, "gen_loss": 0.33616918325424194, "grad_norm": 0.38812795522643784, "learning_rate": 2.394568421052632e-05, "loss": 0.294, "mean_copy_accuracy": 0.9954857975244522, "mean_gen_accuracy": 0.8698574751615524, "mean_token_accuracy": 0.9011454433202744, "num_tokens": 415221448.0, "sample_num_tokens": 8046.5, "step": 5529, "total_num_tokens": 415253634.0, "z_loss": 0.0005826031556352973 }, { "copy_logits_max": -2.1761770248413086, "copy_logits_min": -687500032.0, "copy_num_tokens": 611.3125, "epoch": 1.129486852182793, "gen_logits_max": 3.8218212127685547, "gen_logits_mean": -14.893613815307617, "gen_logits_min": -26.6328067779541, "gen_logits_std": 2.8975582122802734, "gen_loss": 0.27977287769317627, "grad_norm": 0.45941722087658093, "learning_rate": 2.394442105263158e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9953275620937347, "mean_gen_accuracy": 0.8727588057518005, "mean_token_accuracy": 0.9048901051282883, "num_tokens": 415498091.0, "sample_num_tokens": 8860.75, "step": 5530, "total_num_tokens": 415533534.0, "z_loss": 0.0005180626176297665 }, { "copy_logits_max": -2.1749448776245117, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.625, "epoch": 1.1296910901199897, "gen_logits_max": 4.354040145874023, "gen_logits_mean": -15.364884376525879, "gen_logits_min": -27.26337242126465, "gen_logits_std": 2.95257568359375, "gen_loss": 0.29802340269088745, "grad_norm": 0.3911661642950709, "learning_rate": 2.3943157894736844e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9951287657022476, "mean_gen_accuracy": 0.8745431751012802, "mean_token_accuracy": 0.901584655046463, "num_tokens": 415771258.0, "sample_num_tokens": 7717.0, "step": 5531, "total_num_tokens": 415802126.0, "z_loss": 0.000632861047051847 }, { "copy_logits_max": -1.9730820655822754, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5, "epoch": 1.1298953280571866, "gen_logits_max": 4.618710994720459, "gen_logits_mean": -14.416936874389648, "gen_logits_min": -26.34980010986328, "gen_logits_std": 2.8798704147338867, "gen_loss": 0.284137487411499, "grad_norm": 0.3803478480166147, "learning_rate": 2.3941894736842105e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9955085813999176, "mean_gen_accuracy": 0.8808209896087646, "mean_token_accuracy": 0.9105506837368011, "num_tokens": 416040074.0, "sample_num_tokens": 8146.5, "step": 5532, "total_num_tokens": 416072660.0, "z_loss": 0.0006606338429264724 }, { "copy_logits_max": -2.6574368476867676, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.8125, "epoch": 1.1300995659943835, "gen_logits_max": 4.014226913452148, "gen_logits_mean": -15.622737884521484, "gen_logits_min": -27.229684829711914, "gen_logits_std": 2.915400266647339, "gen_loss": 0.2543416917324066, "grad_norm": 0.40177974889005214, "learning_rate": 2.394063157894737e-05, "loss": 0.2926, "mean_copy_accuracy": 0.9956194311380386, "mean_gen_accuracy": 0.8732738196849823, "mean_token_accuracy": 0.9003803730010986, "num_tokens": 416322183.0, "sample_num_tokens": 8960.25, "step": 5533, "total_num_tokens": 416358024.0, "z_loss": 0.000635293370578438 }, { "copy_logits_max": -2.540471076965332, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.1875, "epoch": 1.1303038039315803, "gen_logits_max": 4.457587718963623, "gen_logits_mean": -14.846769332885742, "gen_logits_min": -26.968854904174805, "gen_logits_std": 2.9077749252319336, "gen_loss": 0.27107664942741394, "grad_norm": 0.37960277656697283, "learning_rate": 2.393936842105263e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9959911406040192, "mean_gen_accuracy": 0.8819965273141861, "mean_token_accuracy": 0.9055539071559906, "num_tokens": 416601599.0, "sample_num_tokens": 9237.75, "step": 5534, "total_num_tokens": 416638550.0, "z_loss": 0.0006247244891710579 }, { "copy_logits_max": -3.6373653411865234, "copy_logits_min": -750000000.0, "copy_num_tokens": 253.75, "epoch": 1.1305080418687772, "gen_logits_max": 5.8982014656066895, "gen_logits_mean": -13.511922836303711, "gen_logits_min": -25.228919982910156, "gen_logits_std": 2.8608438968658447, "gen_loss": 0.3097500801086426, "grad_norm": 0.41229124171224674, "learning_rate": 2.3938105263157894e-05, "loss": 0.295, "mean_copy_accuracy": 0.9940890669822693, "mean_gen_accuracy": 0.8762692809104919, "mean_token_accuracy": 0.8996749818325043, "num_tokens": 416863305.0, "sample_num_tokens": 7255.25, "step": 5535, "total_num_tokens": 416892326.0, "z_loss": 0.0005865833372808993 }, { "copy_logits_max": -4.013214111328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.5625, "epoch": 1.1307122798059739, "gen_logits_max": 3.6688878536224365, "gen_logits_mean": -16.482675552368164, "gen_logits_min": -28.691545486450195, "gen_logits_std": 2.961496114730835, "gen_loss": 0.25711360573768616, "grad_norm": 0.38989536008860964, "learning_rate": 2.393684210526316e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9946189671754837, "mean_gen_accuracy": 0.8780425637960434, "mean_token_accuracy": 0.9037787914276123, "num_tokens": 417122980.0, "sample_num_tokens": 8468.5, "step": 5536, "total_num_tokens": 417156854.0, "z_loss": 0.0005617449060082436 }, { "copy_logits_max": -4.610825538635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.75, "epoch": 1.1309165177431708, "gen_logits_max": 4.37945032119751, "gen_logits_mean": -14.64511489868164, "gen_logits_min": -25.954849243164062, "gen_logits_std": 2.8311588764190674, "gen_loss": 0.25387975573539734, "grad_norm": 0.4118820941708873, "learning_rate": 2.3935578947368423e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9949996769428253, "mean_gen_accuracy": 0.8801400065422058, "mean_token_accuracy": 0.9095977544784546, "num_tokens": 417388888.0, "sample_num_tokens": 9009.0, "step": 5537, "total_num_tokens": 417424924.0, "z_loss": 0.0005120729329064488 }, { "copy_logits_max": -4.904512405395508, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.1875, "epoch": 1.1311207556803677, "gen_logits_max": 4.607447147369385, "gen_logits_mean": -14.466761589050293, "gen_logits_min": -26.045822143554688, "gen_logits_std": 2.8166842460632324, "gen_loss": 0.2867332398891449, "grad_norm": 0.36865639105689685, "learning_rate": 2.3934315789473687e-05, "loss": 0.2561, "mean_copy_accuracy": 0.9962151646614075, "mean_gen_accuracy": 0.883011057972908, "mean_token_accuracy": 0.9128983467817307, "num_tokens": 417668877.0, "sample_num_tokens": 7514.25, "step": 5538, "total_num_tokens": 417698934.0, "z_loss": 0.0005752221331931651 }, { "copy_logits_max": -3.817023277282715, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.875, "epoch": 1.1313249936175644, "gen_logits_max": 3.9076735973358154, "gen_logits_mean": -15.164783477783203, "gen_logits_min": -26.37151336669922, "gen_logits_std": 2.875429153442383, "gen_loss": 0.22947707772254944, "grad_norm": 0.3757859368711817, "learning_rate": 2.3933052631578948e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9965543448925018, "mean_gen_accuracy": 0.880255788564682, "mean_token_accuracy": 0.9108108133077621, "num_tokens": 417950533.0, "sample_num_tokens": 8633.75, "step": 5539, "total_num_tokens": 417985068.0, "z_loss": 0.0005089333862997591 }, { "copy_logits_max": -3.1809048652648926, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.625, "epoch": 1.1315292315547614, "gen_logits_max": 4.0385918617248535, "gen_logits_mean": -15.238855361938477, "gen_logits_min": -26.98497772216797, "gen_logits_std": 2.875858783721924, "gen_loss": 0.2487206608057022, "grad_norm": 0.43436490952750767, "learning_rate": 2.3931789473684212e-05, "loss": 0.2961, "mean_copy_accuracy": 0.9969170391559601, "mean_gen_accuracy": 0.8650905638933182, "mean_token_accuracy": 0.8989544361829758, "num_tokens": 418218716.0, "sample_num_tokens": 8899.5, "step": 5540, "total_num_tokens": 418254314.0, "z_loss": 0.0005368174752220511 }, { "copy_logits_max": -2.925877571105957, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.875, "epoch": 1.131733469491958, "gen_logits_max": 3.169137954711914, "gen_logits_mean": -16.042404174804688, "gen_logits_min": -27.838485717773438, "gen_logits_std": 2.9142661094665527, "gen_loss": 0.23664426803588867, "grad_norm": 0.3509640673273212, "learning_rate": 2.3930526315789473e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9967803210020065, "mean_gen_accuracy": 0.8749982267618179, "mean_token_accuracy": 0.9077933579683304, "num_tokens": 418522538.0, "sample_num_tokens": 10029.5, "step": 5541, "total_num_tokens": 418562656.0, "z_loss": 0.0005197043064981699 }, { "copy_logits_max": -3.7329318523406982, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.6875, "epoch": 1.131937707429155, "gen_logits_max": 3.677495002746582, "gen_logits_mean": -15.272205352783203, "gen_logits_min": -26.868412017822266, "gen_logits_std": 2.8790361881256104, "gen_loss": 0.24940572679042816, "grad_norm": 0.37390409057332785, "learning_rate": 2.3929263157894738e-05, "loss": 0.2822, "mean_copy_accuracy": 0.995565339922905, "mean_gen_accuracy": 0.8779554516077042, "mean_token_accuracy": 0.9030014425516129, "num_tokens": 418795937.0, "sample_num_tokens": 8977.75, "step": 5542, "total_num_tokens": 418831848.0, "z_loss": 0.0005334018496796489 }, { "copy_logits_max": -5.177554607391357, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.6875, "epoch": 1.1321419453663517, "gen_logits_max": 4.945720672607422, "gen_logits_mean": -13.10947036743164, "gen_logits_min": -24.565649032592773, "gen_logits_std": 2.7515792846679688, "gen_loss": 0.24429640173912048, "grad_norm": 0.41016018726949877, "learning_rate": 2.3928e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9962678849697113, "mean_gen_accuracy": 0.880292072892189, "mean_token_accuracy": 0.909578412771225, "num_tokens": 419080510.0, "sample_num_tokens": 9225.5, "step": 5543, "total_num_tokens": 419117412.0, "z_loss": 0.000509368721395731 }, { "copy_logits_max": -5.551541328430176, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.3125, "epoch": 1.1323461833035486, "gen_logits_max": 4.251412868499756, "gen_logits_mean": -15.465364456176758, "gen_logits_min": -26.8973445892334, "gen_logits_std": 2.858220338821411, "gen_loss": 0.2814221978187561, "grad_norm": 0.34863758556339136, "learning_rate": 2.3926736842105263e-05, "loss": 0.2517, "mean_copy_accuracy": 0.9967915415763855, "mean_gen_accuracy": 0.8778244256973267, "mean_token_accuracy": 0.9151255041360855, "num_tokens": 419386781.0, "sample_num_tokens": 7858.75, "step": 5544, "total_num_tokens": 419418216.0, "z_loss": 0.0005862382822670043 }, { "copy_logits_max": -5.8960371017456055, "copy_logits_min": -687500032.0, "copy_num_tokens": 266.625, "epoch": 1.1325504212407456, "gen_logits_max": 4.2837443351745605, "gen_logits_mean": -16.208873748779297, "gen_logits_min": -27.25452423095703, "gen_logits_std": 2.8618063926696777, "gen_loss": 0.28978201746940613, "grad_norm": 0.38952785686210445, "learning_rate": 2.3925473684210527e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9951607882976532, "mean_gen_accuracy": 0.8780132830142975, "mean_token_accuracy": 0.906459629535675, "num_tokens": 419651825.0, "sample_num_tokens": 6980.75, "step": 5545, "total_num_tokens": 419679748.0, "z_loss": 0.0005553754745051265 }, { "copy_logits_max": -3.485379934310913, "copy_logits_min": -750000128.0, "copy_num_tokens": 570.9375, "epoch": 1.1327546591779423, "gen_logits_max": 3.971350908279419, "gen_logits_mean": -14.802735328674316, "gen_logits_min": -26.799999237060547, "gen_logits_std": 2.909271717071533, "gen_loss": 0.3128410577774048, "grad_norm": 0.4136449153064311, "learning_rate": 2.392421052631579e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9963637292385101, "mean_gen_accuracy": 0.8692469000816345, "mean_token_accuracy": 0.8990510255098343, "num_tokens": 419895995.0, "sample_num_tokens": 8695.25, "step": 5546, "total_num_tokens": 419930776.0, "z_loss": 0.000656402378808707 }, { "copy_logits_max": -4.819637298583984, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.125, "epoch": 1.1329588971151392, "gen_logits_max": 4.4402265548706055, "gen_logits_mean": -15.201330184936523, "gen_logits_min": -27.035120010375977, "gen_logits_std": 2.8910272121429443, "gen_loss": 0.30625879764556885, "grad_norm": 0.37879597973121837, "learning_rate": 2.3922947368421052e-05, "loss": 0.2657, "mean_copy_accuracy": 0.995876133441925, "mean_gen_accuracy": 0.8808437883853912, "mean_token_accuracy": 0.9101533740758896, "num_tokens": 420180212.0, "sample_num_tokens": 7444.0, "step": 5547, "total_num_tokens": 420209988.0, "z_loss": 0.0006132909329608083 }, { "copy_logits_max": -5.5315704345703125, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.0625, "epoch": 1.133163135052336, "gen_logits_max": 4.367138862609863, "gen_logits_mean": -15.701858520507812, "gen_logits_min": -26.897979736328125, "gen_logits_std": 2.852566957473755, "gen_loss": 0.3181799054145813, "grad_norm": 0.40467342362477043, "learning_rate": 2.3921684210526317e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9959602206945419, "mean_gen_accuracy": 0.8750217854976654, "mean_token_accuracy": 0.903012216091156, "num_tokens": 420447360.0, "sample_num_tokens": 9658.5, "step": 5548, "total_num_tokens": 420485994.0, "z_loss": 0.0006825049058534205 }, { "copy_logits_max": -3.3827731609344482, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.1875, "epoch": 1.1333673729895328, "gen_logits_max": 4.909516334533691, "gen_logits_mean": -12.536985397338867, "gen_logits_min": -23.797496795654297, "gen_logits_std": 2.7465991973876953, "gen_loss": 0.2894566059112549, "grad_norm": 0.37425090934764854, "learning_rate": 2.3920421052631578e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9960187673568726, "mean_gen_accuracy": 0.8732125908136368, "mean_token_accuracy": 0.9022406786680222, "num_tokens": 420722909.0, "sample_num_tokens": 9428.75, "step": 5549, "total_num_tokens": 420760624.0, "z_loss": 0.000668685301207006 }, { "copy_logits_max": -4.371461868286133, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.6875, "epoch": 1.1335716109267295, "gen_logits_max": 3.3591532707214355, "gen_logits_mean": -16.082927703857422, "gen_logits_min": -27.666690826416016, "gen_logits_std": 2.881463050842285, "gen_loss": 0.2711886763572693, "grad_norm": 0.36467071836574405, "learning_rate": 2.3919157894736842e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9962864071130753, "mean_gen_accuracy": 0.874660462141037, "mean_token_accuracy": 0.9067055433988571, "num_tokens": 420996116.0, "sample_num_tokens": 8049.0, "step": 5550, "total_num_tokens": 421028312.0, "z_loss": 0.0005834919866174459 }, { "copy_logits_max": -5.988302707672119, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.125, "epoch": 1.1337758488639265, "gen_logits_max": 4.407871246337891, "gen_logits_mean": -14.382230758666992, "gen_logits_min": -25.641176223754883, "gen_logits_std": 2.779819965362549, "gen_loss": 0.26593974232673645, "grad_norm": 0.4260151378131564, "learning_rate": 2.3917894736842106e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9957834631204605, "mean_gen_accuracy": 0.8752337098121643, "mean_token_accuracy": 0.9057912081480026, "num_tokens": 421264974.0, "sample_num_tokens": 8461.0, "step": 5551, "total_num_tokens": 421298818.0, "z_loss": 0.0005122835282236338 }, { "copy_logits_max": -4.545872211456299, "copy_logits_min": -687500032.0, "copy_num_tokens": 615.3125, "epoch": 1.1339800868011234, "gen_logits_max": 3.6195244789123535, "gen_logits_mean": -14.002584457397461, "gen_logits_min": -24.66893196105957, "gen_logits_std": 2.709567070007324, "gen_loss": 0.25523608922958374, "grad_norm": 0.3924508357169347, "learning_rate": 2.3916631578947367e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9959569573402405, "mean_gen_accuracy": 0.8710415959358215, "mean_token_accuracy": 0.9062744826078415, "num_tokens": 421543178.0, "sample_num_tokens": 9749.5, "step": 5552, "total_num_tokens": 421582176.0, "z_loss": 0.00043473680852912366 }, { "copy_logits_max": -5.316121578216553, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.1875, "epoch": 1.13418432473832, "gen_logits_max": 4.963473796844482, "gen_logits_mean": -13.755614280700684, "gen_logits_min": -24.773540496826172, "gen_logits_std": 2.7854835987091064, "gen_loss": 0.268594890832901, "grad_norm": 0.34282017544873356, "learning_rate": 2.3915368421052635e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9954385459423065, "mean_gen_accuracy": 0.8769705891609192, "mean_token_accuracy": 0.9077214449644089, "num_tokens": 421832450.0, "sample_num_tokens": 7639.0, "step": 5553, "total_num_tokens": 421863006.0, "z_loss": 0.0005401397356763482 }, { "copy_logits_max": -4.426311492919922, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.25, "epoch": 1.134388562675517, "gen_logits_max": 3.9546709060668945, "gen_logits_mean": -14.775640487670898, "gen_logits_min": -25.653291702270508, "gen_logits_std": 2.749105215072632, "gen_loss": 0.2886202335357666, "grad_norm": 0.3794004119017576, "learning_rate": 2.3914105263157896e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9963019043207169, "mean_gen_accuracy": 0.8732339888811111, "mean_token_accuracy": 0.9031806737184525, "num_tokens": 422105873.0, "sample_num_tokens": 9170.25, "step": 5554, "total_num_tokens": 422142554.0, "z_loss": 0.0005277104792185128 }, { "copy_logits_max": -3.08445405960083, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.4375, "epoch": 1.1345928006127137, "gen_logits_max": 3.641780376434326, "gen_logits_mean": -15.261341094970703, "gen_logits_min": -26.232439041137695, "gen_logits_std": 2.7770943641662598, "gen_loss": 0.304914653301239, "grad_norm": 0.3897256735346261, "learning_rate": 2.391284210526316e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9970552623271942, "mean_gen_accuracy": 0.8718696385622025, "mean_token_accuracy": 0.9013606160879135, "num_tokens": 422369582.0, "sample_num_tokens": 7205.0, "step": 5555, "total_num_tokens": 422398402.0, "z_loss": 0.0005454670172184706 }, { "copy_logits_max": -2.7809362411499023, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.75, "epoch": 1.1347970385499107, "gen_logits_max": 3.0305016040802, "gen_logits_mean": -14.705455780029297, "gen_logits_min": -25.5535888671875, "gen_logits_std": 2.7534584999084473, "gen_loss": 0.21525517106056213, "grad_norm": 0.4057477862644235, "learning_rate": 2.391157894736842e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9968899339437485, "mean_gen_accuracy": 0.8728252947330475, "mean_token_accuracy": 0.9063472300767899, "num_tokens": 422654126.0, "sample_num_tokens": 9184.5, "step": 5556, "total_num_tokens": 422690864.0, "z_loss": 0.00037499945028685033 }, { "copy_logits_max": -5.094859600067139, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.5, "epoch": 1.1350012764871074, "gen_logits_max": 2.4450697898864746, "gen_logits_mean": -17.645118713378906, "gen_logits_min": -28.91788101196289, "gen_logits_std": 2.91268253326416, "gen_loss": 0.282008558511734, "grad_norm": 0.3350526370612076, "learning_rate": 2.3910315789473685e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9962736517190933, "mean_gen_accuracy": 0.8731286972761154, "mean_token_accuracy": 0.9050575196743011, "num_tokens": 422943307.0, "sample_num_tokens": 8670.75, "step": 5557, "total_num_tokens": 422977990.0, "z_loss": 0.0004702191217802465 }, { "copy_logits_max": -4.7669758796691895, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.75, "epoch": 1.1352055144243043, "gen_logits_max": 4.3634843826293945, "gen_logits_mean": -14.29194450378418, "gen_logits_min": -25.091833114624023, "gen_logits_std": 2.7291433811187744, "gen_loss": 0.2566484808921814, "grad_norm": 0.417453031261614, "learning_rate": 2.3909052631578946e-05, "loss": 0.2608, "mean_copy_accuracy": 0.9962026625871658, "mean_gen_accuracy": 0.879893109202385, "mean_token_accuracy": 0.9128208607435226, "num_tokens": 423215644.0, "sample_num_tokens": 8685.5, "step": 5558, "total_num_tokens": 423250386.0, "z_loss": 0.0004253237275406718 }, { "copy_logits_max": -5.434142589569092, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.5625, "epoch": 1.1354097523615012, "gen_logits_max": 2.8909754753112793, "gen_logits_mean": -16.815942764282227, "gen_logits_min": -28.09426498413086, "gen_logits_std": 2.8869571685791016, "gen_loss": 0.24586895108222961, "grad_norm": 0.3915942370328071, "learning_rate": 2.390778947368421e-05, "loss": 0.261, "mean_copy_accuracy": 0.9967057555913925, "mean_gen_accuracy": 0.8835137039422989, "mean_token_accuracy": 0.9108688831329346, "num_tokens": 423480670.0, "sample_num_tokens": 8349.5, "step": 5559, "total_num_tokens": 423514068.0, "z_loss": 0.0004258360422682017 }, { "copy_logits_max": -5.232870578765869, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.25, "epoch": 1.135613990298698, "gen_logits_max": 3.7233645915985107, "gen_logits_mean": -14.68712043762207, "gen_logits_min": -25.448097229003906, "gen_logits_std": 2.7735958099365234, "gen_loss": 0.26509565114974976, "grad_norm": 0.386605327272506, "learning_rate": 2.390652631578947e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9967725872993469, "mean_gen_accuracy": 0.8846317827701569, "mean_token_accuracy": 0.9096737653017044, "num_tokens": 423750379.0, "sample_num_tokens": 9009.75, "step": 5560, "total_num_tokens": 423786418.0, "z_loss": 0.00047776722931303084 }, { "copy_logits_max": -6.150875568389893, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.125, "epoch": 1.1358182282358948, "gen_logits_max": 3.5517446994781494, "gen_logits_mean": -16.637935638427734, "gen_logits_min": -27.68912124633789, "gen_logits_std": 2.8549394607543945, "gen_loss": 0.296744167804718, "grad_norm": 0.3742902491972613, "learning_rate": 2.390526315789474e-05, "loss": 0.293, "mean_copy_accuracy": 0.996063306927681, "mean_gen_accuracy": 0.8741486519575119, "mean_token_accuracy": 0.9019406139850616, "num_tokens": 424041854.0, "sample_num_tokens": 9622.5, "step": 5561, "total_num_tokens": 424080344.0, "z_loss": 0.00054847018327564 }, { "copy_logits_max": -3.5598697662353516, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.625, "epoch": 1.1360224661730918, "gen_logits_max": 3.5985352993011475, "gen_logits_mean": -15.024160385131836, "gen_logits_min": -25.7152042388916, "gen_logits_std": 2.8034565448760986, "gen_loss": 0.2582939863204956, "grad_norm": 0.39582375509583645, "learning_rate": 2.3904e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9963445216417313, "mean_gen_accuracy": 0.8781729340553284, "mean_token_accuracy": 0.9069804847240448, "num_tokens": 424324933.0, "sample_num_tokens": 9209.25, "step": 5562, "total_num_tokens": 424361770.0, "z_loss": 0.0004556483472697437 }, { "copy_logits_max": -5.135609149932861, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.5, "epoch": 1.1362267041102885, "gen_logits_max": 4.014243125915527, "gen_logits_mean": -15.035202980041504, "gen_logits_min": -26.390214920043945, "gen_logits_std": 2.856628894805908, "gen_loss": 0.26054733991622925, "grad_norm": 0.38904499101367856, "learning_rate": 2.3902736842105264e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9960718005895615, "mean_gen_accuracy": 0.8786035925149918, "mean_token_accuracy": 0.9073466062545776, "num_tokens": 424576428.0, "sample_num_tokens": 7605.5, "step": 5563, "total_num_tokens": 424606850.0, "z_loss": 0.00058228854322806 }, { "copy_logits_max": 0.1300162374973297, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.9375, "epoch": 1.1364309420474854, "gen_logits_max": 3.9909262657165527, "gen_logits_mean": -14.024229049682617, "gen_logits_min": -25.37844467163086, "gen_logits_std": 2.8225953578948975, "gen_loss": 0.26045775413513184, "grad_norm": 0.4008048258979439, "learning_rate": 2.3901473684210525e-05, "loss": 0.2832, "mean_copy_accuracy": 0.995929405093193, "mean_gen_accuracy": 0.8747693449258804, "mean_token_accuracy": 0.9045207351446152, "num_tokens": 424830444.0, "sample_num_tokens": 7419.0, "step": 5564, "total_num_tokens": 424860120.0, "z_loss": 0.0005435208440758288 }, { "copy_logits_max": -1.7168737649917603, "copy_logits_min": -750000128.0, "copy_num_tokens": 536.0625, "epoch": 1.136635179984682, "gen_logits_max": 3.4650704860687256, "gen_logits_mean": -15.417073249816895, "gen_logits_min": -27.518356323242188, "gen_logits_std": 2.9257287979125977, "gen_loss": 0.2731022834777832, "grad_norm": 0.368691541634719, "learning_rate": 2.390021052631579e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9969934970140457, "mean_gen_accuracy": 0.8761675506830215, "mean_token_accuracy": 0.9097846150398254, "num_tokens": 425112621.0, "sample_num_tokens": 8749.25, "step": 5565, "total_num_tokens": 425147618.0, "z_loss": 0.0005239835008978844 }, { "copy_logits_max": -2.4341955184936523, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.4375, "epoch": 1.136839417921879, "gen_logits_max": 4.571323394775391, "gen_logits_mean": -14.769736289978027, "gen_logits_min": -26.589111328125, "gen_logits_std": 2.955582857131958, "gen_loss": 0.2554088234901428, "grad_norm": 0.36626241018524813, "learning_rate": 2.3898947368421054e-05, "loss": 0.261, "mean_copy_accuracy": 0.996755912899971, "mean_gen_accuracy": 0.8814618438482285, "mean_token_accuracy": 0.9117449671030045, "num_tokens": 425388143.0, "sample_num_tokens": 8606.75, "step": 5566, "total_num_tokens": 425422570.0, "z_loss": 0.0004951114533469081 }, { "copy_logits_max": -4.350772857666016, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.1875, "epoch": 1.1370436558590757, "gen_logits_max": 3.9250776767730713, "gen_logits_mean": -16.365009307861328, "gen_logits_min": -28.144580841064453, "gen_logits_std": 2.975339889526367, "gen_loss": 0.28848493099212646, "grad_norm": 0.3677727199008568, "learning_rate": 2.3897684210526315e-05, "loss": 0.2726, "mean_copy_accuracy": 0.995201900601387, "mean_gen_accuracy": 0.8759967535734177, "mean_token_accuracy": 0.9079515635967255, "num_tokens": 425685288.0, "sample_num_tokens": 7657.0, "step": 5567, "total_num_tokens": 425715916.0, "z_loss": 0.0005323410150595009 }, { "copy_logits_max": -3.647136926651001, "copy_logits_min": -750000000.0, "copy_num_tokens": 663.125, "epoch": 1.1372478937962727, "gen_logits_max": 4.464642524719238, "gen_logits_mean": -14.366003036499023, "gen_logits_min": -26.53732681274414, "gen_logits_std": 2.9384765625, "gen_loss": 0.2968931794166565, "grad_norm": 0.4240006446863757, "learning_rate": 2.389642105263158e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9954187124967575, "mean_gen_accuracy": 0.867760956287384, "mean_token_accuracy": 0.8987551033496857, "num_tokens": 425962127.0, "sample_num_tokens": 9824.75, "step": 5568, "total_num_tokens": 426001426.0, "z_loss": 0.0005829380243085325 }, { "copy_logits_max": -2.0040266513824463, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.9375, "epoch": 1.1374521317334696, "gen_logits_max": 4.236108779907227, "gen_logits_mean": -14.774169921875, "gen_logits_min": -26.707414627075195, "gen_logits_std": 2.929152488708496, "gen_loss": 0.28957152366638184, "grad_norm": 0.40069530967648015, "learning_rate": 2.3895157894736844e-05, "loss": 0.3059, "mean_copy_accuracy": 0.994434729218483, "mean_gen_accuracy": 0.8683590590953827, "mean_token_accuracy": 0.8962926268577576, "num_tokens": 426225848.0, "sample_num_tokens": 8596.0, "step": 5569, "total_num_tokens": 426260232.0, "z_loss": 0.0005722015048377216 }, { "copy_logits_max": -3.5612404346466064, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.8125, "epoch": 1.1376563696706663, "gen_logits_max": 5.279234409332275, "gen_logits_mean": -13.596514701843262, "gen_logits_min": -25.159648895263672, "gen_logits_std": 2.8624584674835205, "gen_loss": 0.27486610412597656, "grad_norm": 0.40473078865389345, "learning_rate": 2.3893894736842108e-05, "loss": 0.284, "mean_copy_accuracy": 0.996019184589386, "mean_gen_accuracy": 0.8724746704101562, "mean_token_accuracy": 0.9033172130584717, "num_tokens": 426485834.0, "sample_num_tokens": 6938.5, "step": 5570, "total_num_tokens": 426513588.0, "z_loss": 0.000549274031072855 }, { "copy_logits_max": -1.313670039176941, "copy_logits_min": -687500032.0, "copy_num_tokens": 639.9375, "epoch": 1.1378606076078632, "gen_logits_max": 4.18569278717041, "gen_logits_mean": -14.531280517578125, "gen_logits_min": -27.057435989379883, "gen_logits_std": 2.938298225402832, "gen_loss": 0.2637275457382202, "grad_norm": 0.40100803085443915, "learning_rate": 2.389263157894737e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9970001429319382, "mean_gen_accuracy": 0.8778017610311508, "mean_token_accuracy": 0.9113422632217407, "num_tokens": 426754371.0, "sample_num_tokens": 8987.75, "step": 5571, "total_num_tokens": 426790322.0, "z_loss": 0.0005157201085239649 }, { "copy_logits_max": -2.9520936012268066, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.125, "epoch": 1.13806484554506, "gen_logits_max": 3.5181751251220703, "gen_logits_mean": -16.204402923583984, "gen_logits_min": -28.623016357421875, "gen_logits_std": 3.0003957748413086, "gen_loss": 0.2750556468963623, "grad_norm": 0.4010955906118479, "learning_rate": 2.3891368421052633e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9961305111646652, "mean_gen_accuracy": 0.8714366555213928, "mean_token_accuracy": 0.9006496071815491, "num_tokens": 427023277.0, "sample_num_tokens": 8582.75, "step": 5572, "total_num_tokens": 427057608.0, "z_loss": 0.0005251654074527323 }, { "copy_logits_max": -3.5673584938049316, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.125, "epoch": 1.1382690834822569, "gen_logits_max": 3.5880534648895264, "gen_logits_mean": -16.15053939819336, "gen_logits_min": -28.206327438354492, "gen_logits_std": 3.011835813522339, "gen_loss": 0.24894331395626068, "grad_norm": 0.40375317490589224, "learning_rate": 2.3890105263157894e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9955523908138275, "mean_gen_accuracy": 0.8763379752635956, "mean_token_accuracy": 0.9103465229272842, "num_tokens": 427291638.0, "sample_num_tokens": 9030.5, "step": 5573, "total_num_tokens": 427327760.0, "z_loss": 0.0004812374827452004 }, { "copy_logits_max": -2.2769532203674316, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.75, "epoch": 1.1384733214194536, "gen_logits_max": 3.5508463382720947, "gen_logits_mean": -15.431449890136719, "gen_logits_min": -27.248836517333984, "gen_logits_std": 2.960153102874756, "gen_loss": 0.25964123010635376, "grad_norm": 0.36803557488234495, "learning_rate": 2.388884210526316e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9949965327978134, "mean_gen_accuracy": 0.8803318440914154, "mean_token_accuracy": 0.9103342443704605, "num_tokens": 427561319.0, "sample_num_tokens": 7359.25, "step": 5574, "total_num_tokens": 427590756.0, "z_loss": 0.0005442939000204206 }, { "copy_logits_max": -2.1729681491851807, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.3125, "epoch": 1.1386775593566505, "gen_logits_max": 4.535190582275391, "gen_logits_mean": -15.000823974609375, "gen_logits_min": -27.002708435058594, "gen_logits_std": 2.9330339431762695, "gen_loss": 0.3066336512565613, "grad_norm": 0.3898755260849942, "learning_rate": 2.388757894736842e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9962680637836456, "mean_gen_accuracy": 0.8693745881319046, "mean_token_accuracy": 0.9032358080148697, "num_tokens": 427832421.0, "sample_num_tokens": 8110.25, "step": 5575, "total_num_tokens": 427864862.0, "z_loss": 0.0005963140865787864 }, { "copy_logits_max": -1.0748341083526611, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.875, "epoch": 1.1388817972938474, "gen_logits_max": 4.438749313354492, "gen_logits_mean": -14.326958656311035, "gen_logits_min": -26.472164154052734, "gen_logits_std": 2.9327754974365234, "gen_loss": 0.28556329011917114, "grad_norm": 0.41333738537664505, "learning_rate": 2.3886315789473684e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9971231669187546, "mean_gen_accuracy": 0.8766660839319229, "mean_token_accuracy": 0.9064957797527313, "num_tokens": 428106742.0, "sample_num_tokens": 8753.5, "step": 5576, "total_num_tokens": 428141756.0, "z_loss": 0.0005520760314539075 }, { "copy_logits_max": -3.733901023864746, "copy_logits_min": -562500032.0, "copy_num_tokens": 673.5625, "epoch": 1.1390860352310441, "gen_logits_max": 4.415882587432861, "gen_logits_mean": -13.324210166931152, "gen_logits_min": -25.64929962158203, "gen_logits_std": 2.877446413040161, "gen_loss": 0.27496322989463806, "grad_norm": 0.41612027052170986, "learning_rate": 2.3885052631578948e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9959108829498291, "mean_gen_accuracy": 0.8787797689437866, "mean_token_accuracy": 0.9081276655197144, "num_tokens": 428357989.0, "sample_num_tokens": 10080.25, "step": 5577, "total_num_tokens": 428398310.0, "z_loss": 0.0006047995411790907 }, { "copy_logits_max": -3.9333364963531494, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.25, "epoch": 1.139290273168241, "gen_logits_max": 5.155252456665039, "gen_logits_mean": -13.773712158203125, "gen_logits_min": -25.405738830566406, "gen_logits_std": 2.847024917602539, "gen_loss": 0.29854536056518555, "grad_norm": 0.36652175439927565, "learning_rate": 2.3883789473684212e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9958472698926926, "mean_gen_accuracy": 0.8771462887525558, "mean_token_accuracy": 0.9071818441152573, "num_tokens": 428657474.0, "sample_num_tokens": 9498.0, "step": 5578, "total_num_tokens": 428695466.0, "z_loss": 0.0005972989602014422 }, { "copy_logits_max": -7.263068675994873, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.0625, "epoch": 1.1394945111054378, "gen_logits_max": 5.047996997833252, "gen_logits_mean": -13.896146774291992, "gen_logits_min": -25.652042388916016, "gen_logits_std": 2.8684983253479004, "gen_loss": 0.2519912123680115, "grad_norm": 0.40112763863491097, "learning_rate": 2.3882526315789477e-05, "loss": 0.2561, "mean_copy_accuracy": 0.9968221485614777, "mean_gen_accuracy": 0.8866908848285675, "mean_token_accuracy": 0.9131563007831573, "num_tokens": 428928238.0, "sample_num_tokens": 7478.5, "step": 5579, "total_num_tokens": 428958152.0, "z_loss": 0.000514848215971142 }, { "copy_logits_max": -5.727221488952637, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.3125, "epoch": 1.1396987490426347, "gen_logits_max": 4.639763355255127, "gen_logits_mean": -14.42106819152832, "gen_logits_min": -25.97247886657715, "gen_logits_std": 2.9074268341064453, "gen_loss": 0.24710695445537567, "grad_norm": 0.3895689454869006, "learning_rate": 2.3881263157894737e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9966219067573547, "mean_gen_accuracy": 0.8773259222507477, "mean_token_accuracy": 0.9044476449489594, "num_tokens": 429197774.0, "sample_num_tokens": 7432.0, "step": 5580, "total_num_tokens": 429227502.0, "z_loss": 0.0004679191915784031 }, { "copy_logits_max": -5.139167308807373, "copy_logits_min": -687500032.0, "copy_num_tokens": 372.8125, "epoch": 1.1399029869798314, "gen_logits_max": 4.9908037185668945, "gen_logits_mean": -14.80133056640625, "gen_logits_min": -26.91408920288086, "gen_logits_std": 2.903614044189453, "gen_loss": 0.3101302981376648, "grad_norm": 0.3788085066682634, "learning_rate": 2.3880000000000002e-05, "loss": 0.2823, "mean_copy_accuracy": 0.996227815747261, "mean_gen_accuracy": 0.8761871308088303, "mean_token_accuracy": 0.9031463712453842, "num_tokens": 429460732.0, "sample_num_tokens": 8538.0, "step": 5581, "total_num_tokens": 429494884.0, "z_loss": 0.0006081403698772192 }, { "copy_logits_max": -2.8717989921569824, "copy_logits_min": -750000000.0, "copy_num_tokens": 669.375, "epoch": 1.1401072249170283, "gen_logits_max": 4.108848571777344, "gen_logits_mean": -14.90706729888916, "gen_logits_min": -26.984432220458984, "gen_logits_std": 2.954993724822998, "gen_loss": 0.2572594881057739, "grad_norm": 0.37646250325265657, "learning_rate": 2.3878736842105263e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9954644739627838, "mean_gen_accuracy": 0.8748302161693573, "mean_token_accuracy": 0.9053317606449127, "num_tokens": 429750268.0, "sample_num_tokens": 10272.5, "step": 5582, "total_num_tokens": 429791358.0, "z_loss": 0.0004994040937162936 }, { "copy_logits_max": -6.635986328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.875, "epoch": 1.1403114628542252, "gen_logits_max": 4.150376796722412, "gen_logits_mean": -15.522445678710938, "gen_logits_min": -26.881237030029297, "gen_logits_std": 2.8559117317199707, "gen_loss": 0.30211713910102844, "grad_norm": 0.35220891598093473, "learning_rate": 2.3877473684210527e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9954358786344528, "mean_gen_accuracy": 0.8783149719238281, "mean_token_accuracy": 0.9059916585683823, "num_tokens": 430031090.0, "sample_num_tokens": 9818.5, "step": 5583, "total_num_tokens": 430070364.0, "z_loss": 0.0005752833094447851 }, { "copy_logits_max": -2.5700411796569824, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.25, "epoch": 1.140515700791422, "gen_logits_max": 4.667112827301025, "gen_logits_mean": -13.964889526367188, "gen_logits_min": -25.55436134338379, "gen_logits_std": 2.906209945678711, "gen_loss": 0.290738046169281, "grad_norm": 0.3848039741250783, "learning_rate": 2.3876210526315788e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9958954602479935, "mean_gen_accuracy": 0.8754754662513733, "mean_token_accuracy": 0.9072587341070175, "num_tokens": 430305373.0, "sample_num_tokens": 8303.25, "step": 5584, "total_num_tokens": 430338586.0, "z_loss": 0.0005786096444353461 }, { "copy_logits_max": -3.8468713760375977, "copy_logits_min": -687500032.0, "copy_num_tokens": 610.0, "epoch": 1.1407199387286189, "gen_logits_max": 4.46949577331543, "gen_logits_mean": -13.198251724243164, "gen_logits_min": -24.682353973388672, "gen_logits_std": 2.8136415481567383, "gen_loss": 0.27243244647979736, "grad_norm": 0.40052346985100296, "learning_rate": 2.3874947368421056e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9962902516126633, "mean_gen_accuracy": 0.8664172440767288, "mean_token_accuracy": 0.900234043598175, "num_tokens": 430576754.0, "sample_num_tokens": 9435.5, "step": 5585, "total_num_tokens": 430614496.0, "z_loss": 0.0006385314627550542 }, { "copy_logits_max": -3.6795456409454346, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.0, "epoch": 1.1409241766658156, "gen_logits_max": 4.047097206115723, "gen_logits_mean": -16.089195251464844, "gen_logits_min": -27.673330307006836, "gen_logits_std": 2.979224681854248, "gen_loss": 0.2797168493270874, "grad_norm": 0.3986667944166595, "learning_rate": 2.3873684210526317e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9953903704881668, "mean_gen_accuracy": 0.8741028159856796, "mean_token_accuracy": 0.9003956317901611, "num_tokens": 430840537.0, "sample_num_tokens": 8019.75, "step": 5586, "total_num_tokens": 430872616.0, "z_loss": 0.0005470102187246084 }, { "copy_logits_max": -2.7723770141601562, "copy_logits_min": -687500032.0, "copy_num_tokens": 757.25, "epoch": 1.1411284146030125, "gen_logits_max": 3.9571924209594727, "gen_logits_mean": -14.414600372314453, "gen_logits_min": -26.451374053955078, "gen_logits_std": 2.941009283065796, "gen_loss": 0.24898505210876465, "grad_norm": 0.40981799853738643, "learning_rate": 2.387242105263158e-05, "loss": 0.289, "mean_copy_accuracy": 0.9969027191400528, "mean_gen_accuracy": 0.8715482503175735, "mean_token_accuracy": 0.9021328091621399, "num_tokens": 431112269.0, "sample_num_tokens": 10484.25, "step": 5587, "total_num_tokens": 431154206.0, "z_loss": 0.000540752662345767 }, { "copy_logits_max": -4.174459457397461, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.75, "epoch": 1.1413326525402094, "gen_logits_max": 4.438365459442139, "gen_logits_mean": -15.423648834228516, "gen_logits_min": -27.172149658203125, "gen_logits_std": 2.94115948677063, "gen_loss": 0.26806217432022095, "grad_norm": 0.37819412590827034, "learning_rate": 2.3871157894736842e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9948548525571823, "mean_gen_accuracy": 0.8764840215444565, "mean_token_accuracy": 0.9013352543115616, "num_tokens": 431376443.0, "sample_num_tokens": 7346.75, "step": 5588, "total_num_tokens": 431405830.0, "z_loss": 0.0005032034823670983 }, { "copy_logits_max": -5.984588146209717, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.25, "epoch": 1.1415368904774061, "gen_logits_max": 3.9979286193847656, "gen_logits_mean": -16.253625869750977, "gen_logits_min": -27.571672439575195, "gen_logits_std": 2.9287567138671875, "gen_loss": 0.2862856388092041, "grad_norm": 0.4086379837883803, "learning_rate": 2.3869894736842106e-05, "loss": 0.2851, "mean_copy_accuracy": 0.996276468038559, "mean_gen_accuracy": 0.8754323869943619, "mean_token_accuracy": 0.902527779340744, "num_tokens": 431639013.0, "sample_num_tokens": 8044.75, "step": 5589, "total_num_tokens": 431671192.0, "z_loss": 0.0004801478935405612 }, { "copy_logits_max": -5.787014961242676, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5, "epoch": 1.141741128414603, "gen_logits_max": 4.3625640869140625, "gen_logits_mean": -15.40121841430664, "gen_logits_min": -27.07575225830078, "gen_logits_std": 2.9494404792785645, "gen_loss": 0.27025461196899414, "grad_norm": 0.3988019403840983, "learning_rate": 2.3868631578947367e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9952231645584106, "mean_gen_accuracy": 0.8775406777858734, "mean_token_accuracy": 0.9051103442907333, "num_tokens": 431887157.0, "sample_num_tokens": 8049.25, "step": 5590, "total_num_tokens": 431919354.0, "z_loss": 0.0005042980774305761 }, { "copy_logits_max": -5.589992523193359, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.8125, "epoch": 1.1419453663517998, "gen_logits_max": 4.263158798217773, "gen_logits_mean": -14.311300277709961, "gen_logits_min": -25.893550872802734, "gen_logits_std": 2.8164963722229004, "gen_loss": 0.2544400095939636, "grad_norm": 0.366377457298187, "learning_rate": 2.386736842105263e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9963107854127884, "mean_gen_accuracy": 0.8834778368473053, "mean_token_accuracy": 0.9104428142309189, "num_tokens": 432140060.0, "sample_num_tokens": 7387.0, "step": 5591, "total_num_tokens": 432169608.0, "z_loss": 0.0005266335792839527 }, { "copy_logits_max": -5.693448066711426, "copy_logits_min": -687500032.0, "copy_num_tokens": 431.8125, "epoch": 1.1421496042889967, "gen_logits_max": 4.732664108276367, "gen_logits_mean": -14.57421588897705, "gen_logits_min": -26.031906127929688, "gen_logits_std": 2.873599052429199, "gen_loss": 0.2944629192352295, "grad_norm": 0.37822849263688474, "learning_rate": 2.3866105263157896e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9957656860351562, "mean_gen_accuracy": 0.8757658749818802, "mean_token_accuracy": 0.9051344692707062, "num_tokens": 432406207.0, "sample_num_tokens": 8701.25, "step": 5592, "total_num_tokens": 432441012.0, "z_loss": 0.0005655569257214665 }, { "copy_logits_max": -4.435312747955322, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.4375, "epoch": 1.1423538422261936, "gen_logits_max": 4.524343490600586, "gen_logits_mean": -14.638673782348633, "gen_logits_min": -26.36865997314453, "gen_logits_std": 2.909839153289795, "gen_loss": 0.27062004804611206, "grad_norm": 0.3915427612993499, "learning_rate": 2.3864842105263157e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9963885992765427, "mean_gen_accuracy": 0.8749574273824692, "mean_token_accuracy": 0.90513114631176, "num_tokens": 432669350.0, "sample_num_tokens": 8556.5, "step": 5593, "total_num_tokens": 432703576.0, "z_loss": 0.0004976177588105202 }, { "copy_logits_max": -6.593204498291016, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.125, "epoch": 1.1425580801633903, "gen_logits_max": 4.850017070770264, "gen_logits_mean": -13.679943084716797, "gen_logits_min": -25.030925750732422, "gen_logits_std": 2.8457655906677246, "gen_loss": 0.27775928378105164, "grad_norm": 0.39475652294801433, "learning_rate": 2.3863578947368424e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9953910559415817, "mean_gen_accuracy": 0.8769989758729935, "mean_token_accuracy": 0.9046828746795654, "num_tokens": 432944698.0, "sample_num_tokens": 8290.5, "step": 5594, "total_num_tokens": 432977860.0, "z_loss": 0.0005304964142851532 }, { "copy_logits_max": -3.677861213684082, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.375, "epoch": 1.1427623181005873, "gen_logits_max": 4.490983963012695, "gen_logits_mean": -14.410579681396484, "gen_logits_min": -25.928255081176758, "gen_logits_std": 2.890803575515747, "gen_loss": 0.2571812868118286, "grad_norm": 0.3744089110043351, "learning_rate": 2.3862315789473685e-05, "loss": 0.2564, "mean_copy_accuracy": 0.9959248453378677, "mean_gen_accuracy": 0.8845164179801941, "mean_token_accuracy": 0.9127017706632614, "num_tokens": 433226691.0, "sample_num_tokens": 8106.25, "step": 5595, "total_num_tokens": 433259116.0, "z_loss": 0.0005033880588598549 }, { "copy_logits_max": -7.591792106628418, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.5625, "epoch": 1.142966556037784, "gen_logits_max": 5.214351654052734, "gen_logits_mean": -13.720970153808594, "gen_logits_min": -25.211563110351562, "gen_logits_std": 2.8376142978668213, "gen_loss": 0.2888546884059906, "grad_norm": 0.40067363001224954, "learning_rate": 2.386105263157895e-05, "loss": 0.2526, "mean_copy_accuracy": 0.9947754740715027, "mean_gen_accuracy": 0.8864259719848633, "mean_token_accuracy": 0.9144367277622223, "num_tokens": 433503221.0, "sample_num_tokens": 7502.25, "step": 5596, "total_num_tokens": 433533230.0, "z_loss": 0.0005723382346332073 }, { "copy_logits_max": -3.44950008392334, "copy_logits_min": -750000000.0, "copy_num_tokens": 977.8125, "epoch": 1.143170793974981, "gen_logits_max": 2.7884984016418457, "gen_logits_mean": -15.37131118774414, "gen_logits_min": -27.69200897216797, "gen_logits_std": 2.9404969215393066, "gen_loss": 0.2567601799964905, "grad_norm": 0.40451827232093296, "learning_rate": 2.385978947368421e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9965735375881195, "mean_gen_accuracy": 0.8693152368068695, "mean_token_accuracy": 0.9044385999441147, "num_tokens": 433784597.0, "sample_num_tokens": 11095.25, "step": 5597, "total_num_tokens": 433828978.0, "z_loss": 0.0005280994228087366 }, { "copy_logits_max": -3.8685500621795654, "copy_logits_min": -750000000.0, "copy_num_tokens": 784.0, "epoch": 1.1433750319121776, "gen_logits_max": 4.329891681671143, "gen_logits_mean": -13.73918342590332, "gen_logits_min": -25.413009643554688, "gen_logits_std": 2.8710386753082275, "gen_loss": 0.2447066307067871, "grad_norm": 0.41909860863609844, "learning_rate": 2.3858526315789475e-05, "loss": 0.2691, "mean_copy_accuracy": 0.995246410369873, "mean_gen_accuracy": 0.8770255297422409, "mean_token_accuracy": 0.9080595821142197, "num_tokens": 434069112.0, "sample_num_tokens": 10107.5, "step": 5598, "total_num_tokens": 434109542.0, "z_loss": 0.0005602778401225805 }, { "copy_logits_max": -4.488642692565918, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.8125, "epoch": 1.1435792698493745, "gen_logits_max": 4.741756916046143, "gen_logits_mean": -14.289813995361328, "gen_logits_min": -25.34002113342285, "gen_logits_std": 2.7984800338745117, "gen_loss": 0.2869111895561218, "grad_norm": 0.3983496957849272, "learning_rate": 2.3857263157894736e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9959845542907715, "mean_gen_accuracy": 0.8739587366580963, "mean_token_accuracy": 0.9004258513450623, "num_tokens": 434342872.0, "sample_num_tokens": 8074.0, "step": 5599, "total_num_tokens": 434375168.0, "z_loss": 0.000632833456620574 }, { "copy_logits_max": -2.8018128871917725, "copy_logits_min": -750000000.0, "copy_num_tokens": 659.0, "epoch": 1.1437835077865715, "gen_logits_max": 3.7215487957000732, "gen_logits_mean": -15.225473403930664, "gen_logits_min": -26.849803924560547, "gen_logits_std": 2.918541193008423, "gen_loss": 0.25339531898498535, "grad_norm": 0.40266365541018456, "learning_rate": 2.3856e-05, "loss": 0.2559, "mean_copy_accuracy": 0.9952598363161087, "mean_gen_accuracy": 0.8830906897783279, "mean_token_accuracy": 0.9120759516954422, "num_tokens": 434620123.0, "sample_num_tokens": 9675.75, "step": 5600, "total_num_tokens": 434658826.0, "z_loss": 0.0006029425421729684 }, { "copy_logits_max": -6.152015686035156, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.0625, "epoch": 1.1439877457237682, "gen_logits_max": 4.503148078918457, "gen_logits_mean": -14.497278213500977, "gen_logits_min": -25.839012145996094, "gen_logits_std": 2.8373570442199707, "gen_loss": 0.3201339542865753, "grad_norm": 0.38353456823090765, "learning_rate": 2.385473684210526e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9965328425168991, "mean_gen_accuracy": 0.8645614385604858, "mean_token_accuracy": 0.8968654274940491, "num_tokens": 434909037.0, "sample_num_tokens": 9168.75, "step": 5601, "total_num_tokens": 434945712.0, "z_loss": 0.0005936469533480704 }, { "copy_logits_max": -5.07692813873291, "copy_logits_min": -687500032.0, "copy_num_tokens": 426.375, "epoch": 1.144191983660965, "gen_logits_max": 3.994344472885132, "gen_logits_mean": -15.639595985412598, "gen_logits_min": -26.96814727783203, "gen_logits_std": 2.9080379009246826, "gen_loss": 0.2670031487941742, "grad_norm": 0.4041960241438806, "learning_rate": 2.385347368421053e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9952978044748306, "mean_gen_accuracy": 0.8780754506587982, "mean_token_accuracy": 0.9045949280261993, "num_tokens": 435170112.0, "sample_num_tokens": 8502.5, "step": 5602, "total_num_tokens": 435204122.0, "z_loss": 0.00047103798715397716 }, { "copy_logits_max": -5.581949234008789, "copy_logits_min": -750000064.0, "copy_num_tokens": 516.0625, "epoch": 1.1443962215981618, "gen_logits_max": 3.241753339767456, "gen_logits_mean": -16.898178100585938, "gen_logits_min": -28.431699752807617, "gen_logits_std": 2.9490346908569336, "gen_loss": 0.2532805800437927, "grad_norm": 0.4106221373675474, "learning_rate": 2.385221052631579e-05, "loss": 0.284, "mean_copy_accuracy": 0.9953272640705109, "mean_gen_accuracy": 0.8766265958547592, "mean_token_accuracy": 0.9032120704650879, "num_tokens": 435438599.0, "sample_num_tokens": 8614.75, "step": 5603, "total_num_tokens": 435473058.0, "z_loss": 0.00047356908908113837 }, { "copy_logits_max": -3.040327787399292, "copy_logits_min": -750000000.0, "copy_num_tokens": 831.375, "epoch": 1.1446004595353587, "gen_logits_max": 3.860466957092285, "gen_logits_mean": -14.144136428833008, "gen_logits_min": -26.294662475585938, "gen_logits_std": 2.9074859619140625, "gen_loss": 0.24026347696781158, "grad_norm": 0.3951857416411961, "learning_rate": 2.3850947368421054e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9966817498207092, "mean_gen_accuracy": 0.8760466575622559, "mean_token_accuracy": 0.9067714363336563, "num_tokens": 435719654.0, "sample_num_tokens": 11560.5, "step": 5604, "total_num_tokens": 435765896.0, "z_loss": 0.0005433335318230093 }, { "copy_logits_max": -4.864356517791748, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.9375, "epoch": 1.1448046974725554, "gen_logits_max": 2.9794626235961914, "gen_logits_mean": -16.680091857910156, "gen_logits_min": -28.39982032775879, "gen_logits_std": 2.951061248779297, "gen_loss": 0.26851677894592285, "grad_norm": 0.3813172479307663, "learning_rate": 2.3849684210526318e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9965239614248276, "mean_gen_accuracy": 0.8723004162311554, "mean_token_accuracy": 0.9055373519659042, "num_tokens": 435998571.0, "sample_num_tokens": 8578.25, "step": 5605, "total_num_tokens": 436032884.0, "z_loss": 0.0005996980471536517 }, { "copy_logits_max": -2.1928186416625977, "copy_logits_min": -750000064.0, "copy_num_tokens": 723.375, "epoch": 1.1450089354097523, "gen_logits_max": 4.155742168426514, "gen_logits_mean": -15.042211532592773, "gen_logits_min": -26.78130340576172, "gen_logits_std": 2.9228978157043457, "gen_loss": 0.2550837993621826, "grad_norm": 0.3958940294743242, "learning_rate": 2.384842105263158e-05, "loss": 0.275, "mean_copy_accuracy": 0.9971282035112381, "mean_gen_accuracy": 0.8748446404933929, "mean_token_accuracy": 0.9057664275169373, "num_tokens": 436290014.0, "sample_num_tokens": 11501.5, "step": 5606, "total_num_tokens": 436336020.0, "z_loss": 0.0005668774829246104 }, { "copy_logits_max": -3.830023765563965, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.0625, "epoch": 1.1452131733469493, "gen_logits_max": 3.5856118202209473, "gen_logits_mean": -14.531362533569336, "gen_logits_min": -25.993755340576172, "gen_logits_std": 2.8233642578125, "gen_loss": 0.2772248089313507, "grad_norm": 0.3879977796730344, "learning_rate": 2.3847157894736843e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9959626197814941, "mean_gen_accuracy": 0.8782941401004791, "mean_token_accuracy": 0.9114514142274857, "num_tokens": 436597114.0, "sample_num_tokens": 7509.5, "step": 5607, "total_num_tokens": 436627152.0, "z_loss": 0.0004986244020983577 }, { "copy_logits_max": -4.720318794250488, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.125, "epoch": 1.145417411284146, "gen_logits_max": 3.983865737915039, "gen_logits_mean": -16.046218872070312, "gen_logits_min": -28.226856231689453, "gen_logits_std": 2.9707255363464355, "gen_loss": 0.26059114933013916, "grad_norm": 0.4137384088188145, "learning_rate": 2.3845894736842104e-05, "loss": 0.278, "mean_copy_accuracy": 0.9956795573234558, "mean_gen_accuracy": 0.8729723989963531, "mean_token_accuracy": 0.9043635129928589, "num_tokens": 436848785.0, "sample_num_tokens": 7066.25, "step": 5608, "total_num_tokens": 436877050.0, "z_loss": 0.0005188976647332311 }, { "copy_logits_max": -6.974956512451172, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.25, "epoch": 1.145621649221343, "gen_logits_max": 3.619492530822754, "gen_logits_mean": -16.289783477783203, "gen_logits_min": -27.886690139770508, "gen_logits_std": 2.9135279655456543, "gen_loss": 0.2843250036239624, "grad_norm": 0.3870131248399737, "learning_rate": 2.384463157894737e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9959702044725418, "mean_gen_accuracy": 0.8786926567554474, "mean_token_accuracy": 0.9050336480140686, "num_tokens": 437117670.0, "sample_num_tokens": 8203.0, "step": 5609, "total_num_tokens": 437150482.0, "z_loss": 0.0004463453951757401 }, { "copy_logits_max": -2.9520771503448486, "copy_logits_min": -687500032.0, "copy_num_tokens": 463.25, "epoch": 1.1458258871585396, "gen_logits_max": 3.5980329513549805, "gen_logits_mean": -15.793146133422852, "gen_logits_min": -27.597536087036133, "gen_logits_std": 2.9710657596588135, "gen_loss": 0.2671586275100708, "grad_norm": 0.3774590262188973, "learning_rate": 2.3843368421052633e-05, "loss": 0.2714, "mean_copy_accuracy": 0.996765598654747, "mean_gen_accuracy": 0.8765214681625366, "mean_token_accuracy": 0.9085861593484879, "num_tokens": 437396130.0, "sample_num_tokens": 8007.5, "step": 5610, "total_num_tokens": 437428160.0, "z_loss": 0.0004976183408871293 }, { "copy_logits_max": -2.594971179962158, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.8125, "epoch": 1.1460301250957365, "gen_logits_max": 5.062164306640625, "gen_logits_mean": -14.565217971801758, "gen_logits_min": -26.58873176574707, "gen_logits_std": 2.921438694000244, "gen_loss": 0.31415802240371704, "grad_norm": 0.3998006970972706, "learning_rate": 2.3842105263157897e-05, "loss": 0.282, "mean_copy_accuracy": 0.9956461191177368, "mean_gen_accuracy": 0.8714755922555923, "mean_token_accuracy": 0.9038009643554688, "num_tokens": 437674264.0, "sample_num_tokens": 7191.0, "step": 5611, "total_num_tokens": 437703028.0, "z_loss": 0.0005833597970195115 }, { "copy_logits_max": -4.829516410827637, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.375, "epoch": 1.1462343630329332, "gen_logits_max": 5.125025272369385, "gen_logits_mean": -13.492767333984375, "gen_logits_min": -24.885971069335938, "gen_logits_std": 2.84147572517395, "gen_loss": 0.25261160731315613, "grad_norm": 0.3970895173382987, "learning_rate": 2.3840842105263158e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9959146082401276, "mean_gen_accuracy": 0.8738002181053162, "mean_token_accuracy": 0.9015090763568878, "num_tokens": 437936033.0, "sample_num_tokens": 7969.25, "step": 5612, "total_num_tokens": 437967910.0, "z_loss": 0.0005025953287258744 }, { "copy_logits_max": -4.846315383911133, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.875, "epoch": 1.1464386009701302, "gen_logits_max": 4.233017921447754, "gen_logits_mean": -15.464408874511719, "gen_logits_min": -27.645000457763672, "gen_logits_std": 2.8872194290161133, "gen_loss": 0.3152358829975128, "grad_norm": 0.39836308454942504, "learning_rate": 2.3839578947368422e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9951424300670624, "mean_gen_accuracy": 0.8749756813049316, "mean_token_accuracy": 0.9027606546878815, "num_tokens": 438194768.0, "sample_num_tokens": 8366.5, "step": 5613, "total_num_tokens": 438228234.0, "z_loss": 0.0006215518806129694 }, { "copy_logits_max": -1.434969186782837, "copy_logits_min": -750000000.0, "copy_num_tokens": 829.125, "epoch": 1.146642838907327, "gen_logits_max": 3.4002652168273926, "gen_logits_mean": -15.712503433227539, "gen_logits_min": -27.961132049560547, "gen_logits_std": 2.9904561042785645, "gen_loss": 0.2531220614910126, "grad_norm": 0.39624197245811293, "learning_rate": 2.3838315789473683e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9956522434949875, "mean_gen_accuracy": 0.8774363845586777, "mean_token_accuracy": 0.9106786102056503, "num_tokens": 438474552.0, "sample_num_tokens": 9445.5, "step": 5614, "total_num_tokens": 438512334.0, "z_loss": 0.0005480764666572213 }, { "copy_logits_max": -1.6573593616485596, "copy_logits_min": -687500032.0, "copy_num_tokens": 559.625, "epoch": 1.1468470768445238, "gen_logits_max": 3.550165891647339, "gen_logits_mean": -16.026968002319336, "gen_logits_min": -28.11150360107422, "gen_logits_std": 3.0277230739593506, "gen_loss": 0.2437673807144165, "grad_norm": 0.39741672957089724, "learning_rate": 2.3837052631578948e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9978858381509781, "mean_gen_accuracy": 0.8732227385044098, "mean_token_accuracy": 0.9037464112043381, "num_tokens": 438737856.0, "sample_num_tokens": 8235.5, "step": 5615, "total_num_tokens": 438770798.0, "z_loss": 0.0005277638556435704 }, { "copy_logits_max": -1.7763928174972534, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.3125, "epoch": 1.1470513147817207, "gen_logits_max": 4.40885066986084, "gen_logits_mean": -14.45984935760498, "gen_logits_min": -26.444114685058594, "gen_logits_std": 2.9462223052978516, "gen_loss": 0.23377124965190887, "grad_norm": 0.3928784688647629, "learning_rate": 2.383578947368421e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9956467151641846, "mean_gen_accuracy": 0.8780318051576614, "mean_token_accuracy": 0.9091209024190903, "num_tokens": 438998187.0, "sample_num_tokens": 8740.25, "step": 5616, "total_num_tokens": 439033148.0, "z_loss": 0.0005065868026576936 }, { "copy_logits_max": -0.8312084674835205, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.0625, "epoch": 1.1472555527189177, "gen_logits_max": 5.018430709838867, "gen_logits_mean": -12.702299118041992, "gen_logits_min": -24.58888053894043, "gen_logits_std": 2.828148365020752, "gen_loss": 0.23484210669994354, "grad_norm": 0.37380836357783653, "learning_rate": 2.3834526315789473e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9963746964931488, "mean_gen_accuracy": 0.8803912550210953, "mean_token_accuracy": 0.909879058599472, "num_tokens": 439275286.0, "sample_num_tokens": 9402.0, "step": 5617, "total_num_tokens": 439312894.0, "z_loss": 0.00047164643183350563 }, { "copy_logits_max": -3.4516687393188477, "copy_logits_min": -687500032.0, "copy_num_tokens": 667.125, "epoch": 1.1474597906561144, "gen_logits_max": 3.908529758453369, "gen_logits_mean": -14.958256721496582, "gen_logits_min": -27.043724060058594, "gen_logits_std": 2.9333279132843018, "gen_loss": 0.2607261836528778, "grad_norm": 0.37314111703799163, "learning_rate": 2.383326315789474e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9958936870098114, "mean_gen_accuracy": 0.8783949315547943, "mean_token_accuracy": 0.9069986641407013, "num_tokens": 439544923.0, "sample_num_tokens": 9463.25, "step": 5618, "total_num_tokens": 439582776.0, "z_loss": 0.0005476123769767582 }, { "copy_logits_max": -4.2315497398376465, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.0, "epoch": 1.1476640285933113, "gen_logits_max": 4.333934783935547, "gen_logits_mean": -16.33405303955078, "gen_logits_min": -28.075260162353516, "gen_logits_std": 2.9506568908691406, "gen_loss": 0.3027307391166687, "grad_norm": 0.3735938854056048, "learning_rate": 2.3832e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9955998957157135, "mean_gen_accuracy": 0.8799102157354355, "mean_token_accuracy": 0.908101499080658, "num_tokens": 439821805.0, "sample_num_tokens": 8490.25, "step": 5619, "total_num_tokens": 439855766.0, "z_loss": 0.0006059739971533418 }, { "copy_logits_max": -5.574778079986572, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.375, "epoch": 1.147868266530508, "gen_logits_max": 5.2865495681762695, "gen_logits_mean": -15.065003395080566, "gen_logits_min": -26.23682403564453, "gen_logits_std": 2.904025077819824, "gen_loss": 0.2700788378715515, "grad_norm": 0.3953630663578528, "learning_rate": 2.3830736842105266e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9963408261537552, "mean_gen_accuracy": 0.8825212270021439, "mean_token_accuracy": 0.9102249890565872, "num_tokens": 440097414.0, "sample_num_tokens": 8511.0, "step": 5620, "total_num_tokens": 440131458.0, "z_loss": 0.0005392180173657835 }, { "copy_logits_max": -4.606114387512207, "copy_logits_min": -750000064.0, "copy_num_tokens": 319.875, "epoch": 1.148072504467705, "gen_logits_max": 4.535248279571533, "gen_logits_mean": -15.290277481079102, "gen_logits_min": -26.57720375061035, "gen_logits_std": 2.869418144226074, "gen_loss": 0.28967195749282837, "grad_norm": 0.3615905733007085, "learning_rate": 2.3829473684210527e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9963429421186447, "mean_gen_accuracy": 0.8762778490781784, "mean_token_accuracy": 0.9037463068962097, "num_tokens": 440369234.0, "sample_num_tokens": 7900.5, "step": 5621, "total_num_tokens": 440400836.0, "z_loss": 0.0005072536296211183 }, { "copy_logits_max": -2.5336055755615234, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.6875, "epoch": 1.1482767424049016, "gen_logits_max": 4.651810646057129, "gen_logits_mean": -13.708017349243164, "gen_logits_min": -26.034648895263672, "gen_logits_std": 2.9425477981567383, "gen_loss": 0.29381856322288513, "grad_norm": 0.39807170682608595, "learning_rate": 2.382821052631579e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9952724128961563, "mean_gen_accuracy": 0.8685120046138763, "mean_token_accuracy": 0.8970939069986343, "num_tokens": 440609974.0, "sample_num_tokens": 8478.0, "step": 5622, "total_num_tokens": 440643886.0, "z_loss": 0.0005408031865954399 }, { "copy_logits_max": -3.411458969116211, "copy_logits_min": -625000064.0, "copy_num_tokens": 481.25, "epoch": 1.1484809803420986, "gen_logits_max": 3.662522792816162, "gen_logits_mean": -16.223651885986328, "gen_logits_min": -27.83228874206543, "gen_logits_std": 2.967437744140625, "gen_loss": 0.2746378481388092, "grad_norm": 0.36695246043006036, "learning_rate": 2.3826947368421052e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9961006194353104, "mean_gen_accuracy": 0.8795467019081116, "mean_token_accuracy": 0.9082064479589462, "num_tokens": 440883033.0, "sample_num_tokens": 7952.75, "step": 5623, "total_num_tokens": 440914844.0, "z_loss": 0.0004725996986962855 }, { "copy_logits_max": -4.408058166503906, "copy_logits_min": -625000064.0, "copy_num_tokens": 399.1875, "epoch": 1.1486852182792955, "gen_logits_max": 3.431178569793701, "gen_logits_mean": -16.331157684326172, "gen_logits_min": -27.86988639831543, "gen_logits_std": 2.9071784019470215, "gen_loss": 0.2750566005706787, "grad_norm": 0.44476969450638804, "learning_rate": 2.3825684210526316e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9960587024688721, "mean_gen_accuracy": 0.8759607523679733, "mean_token_accuracy": 0.9073717594146729, "num_tokens": 441153228.0, "sample_num_tokens": 7874.5, "step": 5624, "total_num_tokens": 441184726.0, "z_loss": 0.0005934765795245767 }, { "copy_logits_max": -3.3538589477539062, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.125, "epoch": 1.1488894562164922, "gen_logits_max": 2.876725673675537, "gen_logits_mean": -16.486703872680664, "gen_logits_min": -28.012298583984375, "gen_logits_std": 2.9380645751953125, "gen_loss": 0.26222121715545654, "grad_norm": 0.37529589833598725, "learning_rate": 2.3824421052631577e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9963197112083435, "mean_gen_accuracy": 0.8751653283834457, "mean_token_accuracy": 0.9043088108301163, "num_tokens": 441434068.0, "sample_num_tokens": 8506.0, "step": 5625, "total_num_tokens": 441468092.0, "z_loss": 0.0005552585935220122 }, { "copy_logits_max": -2.402674674987793, "copy_logits_min": -687500032.0, "copy_num_tokens": 667.1875, "epoch": 1.1490936941536891, "gen_logits_max": 3.6058149337768555, "gen_logits_mean": -14.647467613220215, "gen_logits_min": -26.000823974609375, "gen_logits_std": 2.841549873352051, "gen_loss": 0.25454574823379517, "grad_norm": 0.3855794210148249, "learning_rate": 2.3823157894736845e-05, "loss": 0.2768, "mean_copy_accuracy": 0.996045857667923, "mean_gen_accuracy": 0.8737206310033798, "mean_token_accuracy": 0.9054331183433533, "num_tokens": 441696921.0, "sample_num_tokens": 9583.75, "step": 5626, "total_num_tokens": 441735256.0, "z_loss": 0.0005811880109831691 }, { "copy_logits_max": -5.626578330993652, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.125, "epoch": 1.1492979320908858, "gen_logits_max": 4.021656036376953, "gen_logits_mean": -15.779407501220703, "gen_logits_min": -26.803691864013672, "gen_logits_std": 2.891711473464966, "gen_loss": 0.2972446084022522, "grad_norm": 0.3925978580170326, "learning_rate": 2.3821894736842106e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9950947016477585, "mean_gen_accuracy": 0.8765103369951248, "mean_token_accuracy": 0.90423783659935, "num_tokens": 441967105.0, "sample_num_tokens": 9992.25, "step": 5627, "total_num_tokens": 442007074.0, "z_loss": 0.0005830804584547877 }, { "copy_logits_max": -5.207446575164795, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.8125, "epoch": 1.1495021700280827, "gen_logits_max": 4.652036190032959, "gen_logits_mean": -14.200672149658203, "gen_logits_min": -24.872644424438477, "gen_logits_std": 2.753903865814209, "gen_loss": 0.2873472571372986, "grad_norm": 0.363961474392223, "learning_rate": 2.382063157894737e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9971952736377716, "mean_gen_accuracy": 0.8821663111448288, "mean_token_accuracy": 0.9068615883588791, "num_tokens": 442249647.0, "sample_num_tokens": 8234.75, "step": 5628, "total_num_tokens": 442282586.0, "z_loss": 0.0005337579059414566 }, { "copy_logits_max": -2.8278794288635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.5, "epoch": 1.1497064079652795, "gen_logits_max": 3.6090750694274902, "gen_logits_mean": -13.940773963928223, "gen_logits_min": -25.41397476196289, "gen_logits_std": 2.7596116065979004, "gen_loss": 0.25625017285346985, "grad_norm": 0.3727957324102277, "learning_rate": 2.381936842105263e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9960599541664124, "mean_gen_accuracy": 0.8818991184234619, "mean_token_accuracy": 0.912622481584549, "num_tokens": 442506305.0, "sample_num_tokens": 8187.25, "step": 5629, "total_num_tokens": 442539054.0, "z_loss": 0.0004934880998916924 }, { "copy_logits_max": -3.8797731399536133, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.75, "epoch": 1.1499106459024764, "gen_logits_max": 3.601489305496216, "gen_logits_mean": -13.761157989501953, "gen_logits_min": -24.65056610107422, "gen_logits_std": 2.7473909854888916, "gen_loss": 0.28142881393432617, "grad_norm": 0.4028352888248706, "learning_rate": 2.3818105263157895e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9965870678424835, "mean_gen_accuracy": 0.8676293194293976, "mean_token_accuracy": 0.9015165716409683, "num_tokens": 442768848.0, "sample_num_tokens": 9213.0, "step": 5630, "total_num_tokens": 442805700.0, "z_loss": 0.0005008295993320644 }, { "copy_logits_max": -5.473355293273926, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.75, "epoch": 1.1501148838396733, "gen_logits_max": 3.8984062671661377, "gen_logits_mean": -15.152332305908203, "gen_logits_min": -25.898143768310547, "gen_logits_std": 2.7711021900177, "gen_loss": 0.3254178464412689, "grad_norm": 0.39391356634285374, "learning_rate": 2.3816842105263156e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9963621348142624, "mean_gen_accuracy": 0.8660005629062653, "mean_token_accuracy": 0.8989290446043015, "num_tokens": 443046067.0, "sample_num_tokens": 7374.25, "step": 5631, "total_num_tokens": 443075564.0, "z_loss": 0.0005775538738816977 }, { "copy_logits_max": -3.181779384613037, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.125, "epoch": 1.15031912177687, "gen_logits_max": 3.118978500366211, "gen_logits_mean": -14.043634414672852, "gen_logits_min": -25.5008544921875, "gen_logits_std": 2.812408924102783, "gen_loss": 0.2412201315164566, "grad_norm": 0.39580304644236275, "learning_rate": 2.381557894736842e-05, "loss": 0.278, "mean_copy_accuracy": 0.9972117990255356, "mean_gen_accuracy": 0.8684164881706238, "mean_token_accuracy": 0.9076653718948364, "num_tokens": 443306618.0, "sample_num_tokens": 8644.0, "step": 5632, "total_num_tokens": 443341194.0, "z_loss": 0.00047163289855234325 }, { "copy_logits_max": -4.661556243896484, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.4375, "epoch": 1.150523359714067, "gen_logits_max": 4.445278167724609, "gen_logits_mean": -13.167335510253906, "gen_logits_min": -23.87478256225586, "gen_logits_std": 2.6860294342041016, "gen_loss": 0.30401158332824707, "grad_norm": 0.3570716087437721, "learning_rate": 2.3814315789473685e-05, "loss": 0.278, "mean_copy_accuracy": 0.9973170459270477, "mean_gen_accuracy": 0.8677665740251541, "mean_token_accuracy": 0.9049591422080994, "num_tokens": 443604449.0, "sample_num_tokens": 8338.25, "step": 5633, "total_num_tokens": 443637802.0, "z_loss": 0.0005746177630499005 }, { "copy_logits_max": -5.879378318786621, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.375, "epoch": 1.1507275976512636, "gen_logits_max": 4.457492828369141, "gen_logits_mean": -15.771663665771484, "gen_logits_min": -27.188739776611328, "gen_logits_std": 2.895610809326172, "gen_loss": 0.3117898106575012, "grad_norm": 0.38241767634473106, "learning_rate": 2.381305263157895e-05, "loss": 0.3039, "mean_copy_accuracy": 0.9960974901914597, "mean_gen_accuracy": 0.8669538795948029, "mean_token_accuracy": 0.8979397863149643, "num_tokens": 443874152.0, "sample_num_tokens": 8058.0, "step": 5634, "total_num_tokens": 443906384.0, "z_loss": 0.0005770187126472592 }, { "copy_logits_max": -4.059165000915527, "copy_logits_min": -750000128.0, "copy_num_tokens": 500.25, "epoch": 1.1509318355884606, "gen_logits_max": 3.6974759101867676, "gen_logits_mean": -14.305831909179688, "gen_logits_min": -25.5533447265625, "gen_logits_std": 2.7985825538635254, "gen_loss": 0.27634018659591675, "grad_norm": 0.34387018306647604, "learning_rate": 2.3811789473684214e-05, "loss": 0.2601, "mean_copy_accuracy": 0.9967090338468552, "mean_gen_accuracy": 0.8809382617473602, "mean_token_accuracy": 0.9111195057630539, "num_tokens": 444139930.0, "sample_num_tokens": 8372.0, "step": 5635, "total_num_tokens": 444173418.0, "z_loss": 0.0005326693644747138 }, { "copy_logits_max": -5.179955005645752, "copy_logits_min": -687500096.0, "copy_num_tokens": 601.625, "epoch": 1.1511360735256573, "gen_logits_max": 2.0087475776672363, "gen_logits_mean": -17.68837547302246, "gen_logits_min": -29.174856185913086, "gen_logits_std": 2.9467787742614746, "gen_loss": 0.25667673349380493, "grad_norm": 0.3820724556730385, "learning_rate": 2.3810526315789475e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9951394349336624, "mean_gen_accuracy": 0.8819028437137604, "mean_token_accuracy": 0.9080481380224228, "num_tokens": 444393567.0, "sample_num_tokens": 9130.25, "step": 5636, "total_num_tokens": 444430088.0, "z_loss": 0.0004968479042872787 }, { "copy_logits_max": -3.2789602279663086, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.5625, "epoch": 1.1513403114628542, "gen_logits_max": 5.205777168273926, "gen_logits_mean": -13.536687850952148, "gen_logits_min": -24.684707641601562, "gen_logits_std": 2.7486252784729004, "gen_loss": 0.3426661789417267, "grad_norm": 0.38919602305583395, "learning_rate": 2.380926315789474e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9960996061563492, "mean_gen_accuracy": 0.8703869134187698, "mean_token_accuracy": 0.8986521810293198, "num_tokens": 444666644.0, "sample_num_tokens": 8085.0, "step": 5637, "total_num_tokens": 444698984.0, "z_loss": 0.0006255535408854485 }, { "copy_logits_max": -3.5013866424560547, "copy_logits_min": -750000000.0, "copy_num_tokens": 814.875, "epoch": 1.1515445494000511, "gen_logits_max": 4.00164794921875, "gen_logits_mean": -13.639244079589844, "gen_logits_min": -25.130342483520508, "gen_logits_std": 2.775650978088379, "gen_loss": 0.2452385574579239, "grad_norm": 0.38568553630156394, "learning_rate": 2.3808e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9966968446969986, "mean_gen_accuracy": 0.8780100047588348, "mean_token_accuracy": 0.9092201292514801, "num_tokens": 444953322.0, "sample_num_tokens": 11490.0, "step": 5638, "total_num_tokens": 444999282.0, "z_loss": 0.0004990407032892108 }, { "copy_logits_max": -3.9678220748901367, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.875, "epoch": 1.1517487873372478, "gen_logits_max": 4.504737377166748, "gen_logits_mean": -15.380019187927246, "gen_logits_min": -26.421138763427734, "gen_logits_std": 2.882993698120117, "gen_loss": 0.30437326431274414, "grad_norm": 0.34305814148397223, "learning_rate": 2.3806736842105264e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9956331849098206, "mean_gen_accuracy": 0.8735121339559555, "mean_token_accuracy": 0.9039478451013565, "num_tokens": 445222124.0, "sample_num_tokens": 8741.0, "step": 5639, "total_num_tokens": 445257088.0, "z_loss": 0.0005805869586765766 }, { "copy_logits_max": -3.1506423950195312, "copy_logits_min": -625000000.0, "copy_num_tokens": 393.5, "epoch": 1.1519530252744448, "gen_logits_max": 4.176932334899902, "gen_logits_mean": -15.209014892578125, "gen_logits_min": -26.75946807861328, "gen_logits_std": 2.892007827758789, "gen_loss": 0.28147822618484497, "grad_norm": 0.3617379092433472, "learning_rate": 2.3805473684210525e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9964679777622223, "mean_gen_accuracy": 0.8822861313819885, "mean_token_accuracy": 0.9102402329444885, "num_tokens": 445483081.0, "sample_num_tokens": 7944.75, "step": 5640, "total_num_tokens": 445514860.0, "z_loss": 0.0005372465820983052 }, { "copy_logits_max": -3.3004589080810547, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.375, "epoch": 1.1521572632116415, "gen_logits_max": 4.302274703979492, "gen_logits_mean": -15.140459060668945, "gen_logits_min": -27.26471710205078, "gen_logits_std": 2.9546821117401123, "gen_loss": 0.27714425325393677, "grad_norm": 0.3566075859798071, "learning_rate": 2.380421052631579e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9957752674818039, "mean_gen_accuracy": 0.8800681531429291, "mean_token_accuracy": 0.9094459116458893, "num_tokens": 445748384.0, "sample_num_tokens": 8387.5, "step": 5641, "total_num_tokens": 445781934.0, "z_loss": 0.0005651265382766724 }, { "copy_logits_max": -3.855813980102539, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.3125, "epoch": 1.1523615011488384, "gen_logits_max": 4.19317102432251, "gen_logits_mean": -14.867002487182617, "gen_logits_min": -26.71728515625, "gen_logits_std": 2.887401580810547, "gen_loss": 0.2904527187347412, "grad_norm": 0.3743145748488114, "learning_rate": 2.3802947368421054e-05, "loss": 0.2729, "mean_copy_accuracy": 0.996357724070549, "mean_gen_accuracy": 0.8753599673509598, "mean_token_accuracy": 0.9064103811979294, "num_tokens": 446018084.0, "sample_num_tokens": 7128.0, "step": 5642, "total_num_tokens": 446046596.0, "z_loss": 0.0005245661595836282 }, { "copy_logits_max": -5.655203819274902, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.5, "epoch": 1.1525657390860353, "gen_logits_max": 4.04133415222168, "gen_logits_mean": -16.575122833251953, "gen_logits_min": -28.29883575439453, "gen_logits_std": 2.986034870147705, "gen_loss": 0.2821859121322632, "grad_norm": 0.3907672784434566, "learning_rate": 2.3801684210526318e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9966519474983215, "mean_gen_accuracy": 0.8764065057039261, "mean_token_accuracy": 0.9086290150880814, "num_tokens": 446282685.0, "sample_num_tokens": 8579.25, "step": 5643, "total_num_tokens": 446317002.0, "z_loss": 0.00047847634414210916 }, { "copy_logits_max": -2.965425491333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5, "epoch": 1.152769977023232, "gen_logits_max": 3.184206008911133, "gen_logits_mean": -16.375415802001953, "gen_logits_min": -28.214065551757812, "gen_logits_std": 2.9507551193237305, "gen_loss": 0.25710582733154297, "grad_norm": 0.350691417280042, "learning_rate": 2.380042105263158e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9962129890918732, "mean_gen_accuracy": 0.8822177350521088, "mean_token_accuracy": 0.9105101525783539, "num_tokens": 446537279.0, "sample_num_tokens": 7887.75, "step": 5644, "total_num_tokens": 446568830.0, "z_loss": 0.0004083235398866236 }, { "copy_logits_max": -4.825046539306641, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.4375, "epoch": 1.152974214960429, "gen_logits_max": 4.856231212615967, "gen_logits_mean": -14.448739051818848, "gen_logits_min": -26.15545654296875, "gen_logits_std": 2.912055015563965, "gen_loss": 0.29676657915115356, "grad_norm": 0.41885478369729173, "learning_rate": 2.3799157894736843e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9952340126037598, "mean_gen_accuracy": 0.8732744604349136, "mean_token_accuracy": 0.8992898315191269, "num_tokens": 446787122.0, "sample_num_tokens": 7857.5, "step": 5645, "total_num_tokens": 446818552.0, "z_loss": 0.0004939877544529736 }, { "copy_logits_max": -3.6821787357330322, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.625, "epoch": 1.1531784528976257, "gen_logits_max": 3.442908525466919, "gen_logits_mean": -17.08192253112793, "gen_logits_min": -28.65416717529297, "gen_logits_std": 3.016490936279297, "gen_loss": 0.2632565498352051, "grad_norm": 0.46301835760379195, "learning_rate": 2.3797894736842107e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9953609853982925, "mean_gen_accuracy": 0.880160853266716, "mean_token_accuracy": 0.9109628647565842, "num_tokens": 447065621.0, "sample_num_tokens": 8402.75, "step": 5646, "total_num_tokens": 447099232.0, "z_loss": 0.0004572831094264984 }, { "copy_logits_max": -1.6330393552780151, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.3125, "epoch": 1.1533826908348226, "gen_logits_max": 3.386260986328125, "gen_logits_mean": -16.197879791259766, "gen_logits_min": -28.197425842285156, "gen_logits_std": 2.9730544090270996, "gen_loss": 0.2694329619407654, "grad_norm": 0.38173136235391775, "learning_rate": 2.379663157894737e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9955168813467026, "mean_gen_accuracy": 0.8794395178556442, "mean_token_accuracy": 0.9081423729658127, "num_tokens": 447344710.0, "sample_num_tokens": 9057.5, "step": 5647, "total_num_tokens": 447380940.0, "z_loss": 0.00045269643305800855 }, { "copy_logits_max": -3.1305601596832275, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.1875, "epoch": 1.1535869287720195, "gen_logits_max": 3.9272994995117188, "gen_logits_mean": -15.260807991027832, "gen_logits_min": -26.755704879760742, "gen_logits_std": 2.9720253944396973, "gen_loss": 0.26719656586647034, "grad_norm": 0.3695871174644442, "learning_rate": 2.3795368421052633e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9972203075885773, "mean_gen_accuracy": 0.874496266245842, "mean_token_accuracy": 0.9051133394241333, "num_tokens": 447618614.0, "sample_num_tokens": 8822.0, "step": 5648, "total_num_tokens": 447653902.0, "z_loss": 0.0004402699996717274 }, { "copy_logits_max": -0.7156049013137817, "copy_logits_min": -625000064.0, "copy_num_tokens": 489.4375, "epoch": 1.1537911667092162, "gen_logits_max": 4.561561584472656, "gen_logits_mean": -14.297917366027832, "gen_logits_min": -26.285003662109375, "gen_logits_std": 2.9720702171325684, "gen_loss": 0.27977609634399414, "grad_norm": 0.36094666989474783, "learning_rate": 2.3794105263157894e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9960723221302032, "mean_gen_accuracy": 0.876008078455925, "mean_token_accuracy": 0.9060791879892349, "num_tokens": 447891171.0, "sample_num_tokens": 9104.75, "step": 5649, "total_num_tokens": 447927590.0, "z_loss": 0.0005108523182570934 }, { "copy_logits_max": -3.059147596359253, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5625, "epoch": 1.1539954046464131, "gen_logits_max": 4.579824447631836, "gen_logits_mean": -15.872186660766602, "gen_logits_min": -27.81147003173828, "gen_logits_std": 3.0260486602783203, "gen_loss": 0.2791016101837158, "grad_norm": 0.4369012429522348, "learning_rate": 2.3792842105263158e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9943490922451019, "mean_gen_accuracy": 0.878434494137764, "mean_token_accuracy": 0.904885321855545, "num_tokens": 448159827.0, "sample_num_tokens": 7578.25, "step": 5650, "total_num_tokens": 448190140.0, "z_loss": 0.000478932837722823 }, { "copy_logits_max": -1.6256613731384277, "copy_logits_min": -687500032.0, "copy_num_tokens": 697.375, "epoch": 1.1541996425836099, "gen_logits_max": 4.648132801055908, "gen_logits_mean": -13.895832061767578, "gen_logits_min": -26.215106964111328, "gen_logits_std": 2.9998817443847656, "gen_loss": 0.2272701859474182, "grad_norm": 0.3594582731663951, "learning_rate": 2.3791578947368422e-05, "loss": 0.249, "mean_copy_accuracy": 0.9968944936990738, "mean_gen_accuracy": 0.8833881914615631, "mean_token_accuracy": 0.9147848337888718, "num_tokens": 448449511.0, "sample_num_tokens": 10492.25, "step": 5651, "total_num_tokens": 448491480.0, "z_loss": 0.00045652780681848526 }, { "copy_logits_max": -1.977779746055603, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.0, "epoch": 1.1544038805208068, "gen_logits_max": 3.9099810123443604, "gen_logits_mean": -16.28668975830078, "gen_logits_min": -28.056968688964844, "gen_logits_std": 2.995359420776367, "gen_loss": 0.25599291920661926, "grad_norm": 0.37120947649313185, "learning_rate": 2.3790315789473687e-05, "loss": 0.2628, "mean_copy_accuracy": 0.996247261762619, "mean_gen_accuracy": 0.8810802847146988, "mean_token_accuracy": 0.9098068475723267, "num_tokens": 448754382.0, "sample_num_tokens": 8436.0, "step": 5652, "total_num_tokens": 448788126.0, "z_loss": 0.0005646766512654722 }, { "copy_logits_max": -1.8703696727752686, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.375, "epoch": 1.1546081184580035, "gen_logits_max": 5.4218645095825195, "gen_logits_mean": -14.496513366699219, "gen_logits_min": -25.933547973632812, "gen_logits_std": 2.9311838150024414, "gen_loss": 0.3490378260612488, "grad_norm": 0.3780078330196406, "learning_rate": 2.3789052631578947e-05, "loss": 0.3086, "mean_copy_accuracy": 0.9961555451154709, "mean_gen_accuracy": 0.8681935369968414, "mean_token_accuracy": 0.8956447243690491, "num_tokens": 449041259.0, "sample_num_tokens": 7763.25, "step": 5653, "total_num_tokens": 449072312.0, "z_loss": 0.0007060111965984106 }, { "copy_logits_max": -1.3365789651870728, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.8125, "epoch": 1.1548123563952004, "gen_logits_max": 4.144079208374023, "gen_logits_mean": -16.21613311767578, "gen_logits_min": -27.833709716796875, "gen_logits_std": 2.9491591453552246, "gen_loss": 0.24506652355194092, "grad_norm": 0.40590260154548546, "learning_rate": 2.3787789473684212e-05, "loss": 0.28, "mean_copy_accuracy": 0.996726006269455, "mean_gen_accuracy": 0.8763728439807892, "mean_token_accuracy": 0.9044625759124756, "num_tokens": 449313483.0, "sample_num_tokens": 7839.75, "step": 5654, "total_num_tokens": 449344842.0, "z_loss": 0.0004941921215504408 }, { "copy_logits_max": -2.096099376678467, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.125, "epoch": 1.1550165943323973, "gen_logits_max": 3.854736328125, "gen_logits_mean": -16.164396286010742, "gen_logits_min": -28.17670249938965, "gen_logits_std": 3.0240416526794434, "gen_loss": 0.3193926513195038, "grad_norm": 0.3618343532659884, "learning_rate": 2.3786526315789473e-05, "loss": 0.29, "mean_copy_accuracy": 0.9950279295444489, "mean_gen_accuracy": 0.8718512803316116, "mean_token_accuracy": 0.9006059914827347, "num_tokens": 449579216.0, "sample_num_tokens": 9060.0, "step": 5655, "total_num_tokens": 449615456.0, "z_loss": 0.0006226848345249891 }, { "copy_logits_max": 1.6662356853485107, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.1875, "epoch": 1.155220832269594, "gen_logits_max": 4.025629997253418, "gen_logits_mean": -14.527202606201172, "gen_logits_min": -26.162181854248047, "gen_logits_std": 2.95529842376709, "gen_loss": 0.2896762490272522, "grad_norm": 0.5000545600728721, "learning_rate": 2.3785263157894737e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9961928874254227, "mean_gen_accuracy": 0.8710990995168686, "mean_token_accuracy": 0.9056142270565033, "num_tokens": 449874024.0, "sample_num_tokens": 9505.0, "step": 5656, "total_num_tokens": 449912044.0, "z_loss": 0.0006484062760137022 }, { "copy_logits_max": 1.1836689710617065, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.9375, "epoch": 1.155425070206791, "gen_logits_max": 4.120388031005859, "gen_logits_mean": -15.769645690917969, "gen_logits_min": -27.45722198486328, "gen_logits_std": 2.9810686111450195, "gen_loss": 0.298846960067749, "grad_norm": 0.39231711167847294, "learning_rate": 2.3783999999999998e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9960090667009354, "mean_gen_accuracy": 0.8783438205718994, "mean_token_accuracy": 0.907106950879097, "num_tokens": 450137848.0, "sample_num_tokens": 8458.0, "step": 5657, "total_num_tokens": 450171680.0, "z_loss": 0.0007690474158152938 }, { "copy_logits_max": 4.3183393478393555, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.0, "epoch": 1.1556293081439877, "gen_logits_max": 3.8840115070343018, "gen_logits_mean": -15.612689018249512, "gen_logits_min": -27.67742919921875, "gen_logits_std": 3.0100817680358887, "gen_loss": 0.2752043604850769, "grad_norm": 0.3870807450012398, "learning_rate": 2.3782736842105262e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9960870444774628, "mean_gen_accuracy": 0.873326301574707, "mean_token_accuracy": 0.9032733738422394, "num_tokens": 450421669.0, "sample_num_tokens": 8454.25, "step": 5658, "total_num_tokens": 450455486.0, "z_loss": 0.0007593822665512562 }, { "copy_logits_max": 1.0280358791351318, "copy_logits_min": -562500032.0, "copy_num_tokens": 537.875, "epoch": 1.1558335460811846, "gen_logits_max": 3.3326382637023926, "gen_logits_mean": -16.868484497070312, "gen_logits_min": -28.607006072998047, "gen_logits_std": 2.9968984127044678, "gen_loss": 0.2900485098361969, "grad_norm": 0.4070300118884139, "learning_rate": 2.378147368421053e-05, "loss": 0.2855, "mean_copy_accuracy": 0.995764821767807, "mean_gen_accuracy": 0.8742179721593857, "mean_token_accuracy": 0.9050818532705307, "num_tokens": 450703107.0, "sample_num_tokens": 9217.25, "step": 5659, "total_num_tokens": 450739976.0, "z_loss": 0.0006956813158467412 }, { "copy_logits_max": 2.3572096824645996, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 1.1560377840183813, "gen_logits_max": 4.609246730804443, "gen_logits_mean": -14.677000045776367, "gen_logits_min": -26.666358947753906, "gen_logits_std": 3.0053651332855225, "gen_loss": 0.28187447786331177, "grad_norm": 0.4106842148185239, "learning_rate": 2.378021052631579e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9950011670589447, "mean_gen_accuracy": 0.8774122595787048, "mean_token_accuracy": 0.9012658298015594, "num_tokens": 450968868.0, "sample_num_tokens": 9010.0, "step": 5660, "total_num_tokens": 451004908.0, "z_loss": 0.0005459762760438025 }, { "copy_logits_max": 3.195127010345459, "copy_logits_min": -687500032.0, "copy_num_tokens": 579.1875, "epoch": 1.1562420219555782, "gen_logits_max": 3.989187717437744, "gen_logits_mean": -14.944496154785156, "gen_logits_min": -27.607919692993164, "gen_logits_std": 3.04813814163208, "gen_loss": 0.28711846470832825, "grad_norm": 0.3971452705491844, "learning_rate": 2.3778947368421055e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9948735237121582, "mean_gen_accuracy": 0.8749050796031952, "mean_token_accuracy": 0.9082263112068176, "num_tokens": 451243008.0, "sample_num_tokens": 8211.0, "step": 5661, "total_num_tokens": 451275852.0, "z_loss": 0.0006824625888839364 }, { "copy_logits_max": -0.7031347751617432, "copy_logits_min": -687500032.0, "copy_num_tokens": 617.5625, "epoch": 1.1564462598927752, "gen_logits_max": 3.4839062690734863, "gen_logits_mean": -16.829519271850586, "gen_logits_min": -29.014501571655273, "gen_logits_std": 3.065770387649536, "gen_loss": 0.24178414046764374, "grad_norm": 0.37527582815375593, "learning_rate": 2.3777684210526316e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9963977634906769, "mean_gen_accuracy": 0.8697827607393265, "mean_token_accuracy": 0.9015950113534927, "num_tokens": 451514595.0, "sample_num_tokens": 10252.25, "step": 5662, "total_num_tokens": 451555604.0, "z_loss": 0.0005076271481812 }, { "copy_logits_max": 0.9509152173995972, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.6875, "epoch": 1.1566504978299719, "gen_logits_max": 3.5563385486602783, "gen_logits_mean": -15.703399658203125, "gen_logits_min": -27.77019691467285, "gen_logits_std": 2.98761248588562, "gen_loss": 0.2109808325767517, "grad_norm": 0.3820756192175476, "learning_rate": 2.377642105263158e-05, "loss": 0.2531, "mean_copy_accuracy": 0.9954230338335037, "mean_gen_accuracy": 0.8863639682531357, "mean_token_accuracy": 0.9129184782505035, "num_tokens": 451787816.0, "sample_num_tokens": 9300.5, "step": 5663, "total_num_tokens": 451825018.0, "z_loss": 0.00045578647404909134 }, { "copy_logits_max": 2.6602301597595215, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.6875, "epoch": 1.1568547357671688, "gen_logits_max": 4.06256628036499, "gen_logits_mean": -14.757796287536621, "gen_logits_min": -27.271900177001953, "gen_logits_std": 3.040149211883545, "gen_loss": 0.24809682369232178, "grad_norm": 0.37369196333459703, "learning_rate": 2.377515789473684e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9966944754123688, "mean_gen_accuracy": 0.8782791793346405, "mean_token_accuracy": 0.9110682904720306, "num_tokens": 452067495.0, "sample_num_tokens": 9871.25, "step": 5664, "total_num_tokens": 452106980.0, "z_loss": 0.0004592029727064073 }, { "copy_logits_max": 0.43102264404296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.625, "epoch": 1.1570589737043655, "gen_logits_max": 4.899609088897705, "gen_logits_mean": -15.004276275634766, "gen_logits_min": -27.066030502319336, "gen_logits_std": 3.015015125274658, "gen_loss": 0.3291930556297302, "grad_norm": 0.38860070596741614, "learning_rate": 2.3773894736842106e-05, "loss": 0.3054, "mean_copy_accuracy": 0.9959072172641754, "mean_gen_accuracy": 0.867495596408844, "mean_token_accuracy": 0.8974633067846298, "num_tokens": 452333072.0, "sample_num_tokens": 8166.0, "step": 5665, "total_num_tokens": 452365736.0, "z_loss": 0.0006116330623626709 }, { "copy_logits_max": 0.6919618844985962, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.375, "epoch": 1.1572632116415624, "gen_logits_max": 4.857790470123291, "gen_logits_mean": -14.795646667480469, "gen_logits_min": -26.647171020507812, "gen_logits_std": 2.967581272125244, "gen_loss": 0.2840353846549988, "grad_norm": 0.42664533997923915, "learning_rate": 2.3772631578947367e-05, "loss": 0.3057, "mean_copy_accuracy": 0.9955549240112305, "mean_gen_accuracy": 0.8687081784009933, "mean_token_accuracy": 0.8971446603536606, "num_tokens": 452576949.0, "sample_num_tokens": 7979.25, "step": 5666, "total_num_tokens": 452608866.0, "z_loss": 0.0005103910225443542 }, { "copy_logits_max": -1.9022526741027832, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.1875, "epoch": 1.1574674495787591, "gen_logits_max": 4.213910102844238, "gen_logits_mean": -17.049663543701172, "gen_logits_min": -29.145265579223633, "gen_logits_std": 3.0460164546966553, "gen_loss": 0.27861523628234863, "grad_norm": 0.4109563686152607, "learning_rate": 2.3771368421052634e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9956038892269135, "mean_gen_accuracy": 0.8795201480388641, "mean_token_accuracy": 0.9078982025384903, "num_tokens": 452835945.0, "sample_num_tokens": 7091.25, "step": 5667, "total_num_tokens": 452864310.0, "z_loss": 0.0005252542323432863 }, { "copy_logits_max": 0.5235254764556885, "copy_logits_min": -687500032.0, "copy_num_tokens": 564.5625, "epoch": 1.157671687515956, "gen_logits_max": 3.483726739883423, "gen_logits_mean": -16.616321563720703, "gen_logits_min": -28.896194458007812, "gen_logits_std": 3.0593645572662354, "gen_loss": 0.26896995306015015, "grad_norm": 0.3789703306840861, "learning_rate": 2.3770105263157895e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9962357729673386, "mean_gen_accuracy": 0.8769027441740036, "mean_token_accuracy": 0.9094601720571518, "num_tokens": 453107323.0, "sample_num_tokens": 8757.25, "step": 5668, "total_num_tokens": 453142352.0, "z_loss": 0.000628716079518199 }, { "copy_logits_max": -0.06630408763885498, "copy_logits_min": -687500032.0, "copy_num_tokens": 493.625, "epoch": 1.157875925453153, "gen_logits_max": 4.378530979156494, "gen_logits_mean": -15.498956680297852, "gen_logits_min": -27.550331115722656, "gen_logits_std": 3.0088651180267334, "gen_loss": 0.2606024146080017, "grad_norm": 0.3896559268540254, "learning_rate": 2.376884210526316e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9958842545747757, "mean_gen_accuracy": 0.8762031197547913, "mean_token_accuracy": 0.9047496914863586, "num_tokens": 453365930.0, "sample_num_tokens": 8238.0, "step": 5669, "total_num_tokens": 453398882.0, "z_loss": 0.0006144000217318535 }, { "copy_logits_max": -0.24807748198509216, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.5625, "epoch": 1.1580801633903497, "gen_logits_max": 4.232890605926514, "gen_logits_mean": -15.934876441955566, "gen_logits_min": -27.866975784301758, "gen_logits_std": 3.036684274673462, "gen_loss": 0.26155707240104675, "grad_norm": 0.3811868932018015, "learning_rate": 2.376757894736842e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9958283007144928, "mean_gen_accuracy": 0.8813377618789673, "mean_token_accuracy": 0.9066715240478516, "num_tokens": 453606961.0, "sample_num_tokens": 7622.75, "step": 5670, "total_num_tokens": 453637452.0, "z_loss": 0.0005652756080962718 }, { "copy_logits_max": -2.152550458908081, "copy_logits_min": -625000064.0, "copy_num_tokens": 664.9375, "epoch": 1.1582844013275466, "gen_logits_max": 5.24278450012207, "gen_logits_mean": -14.022167205810547, "gen_logits_min": -26.559310913085938, "gen_logits_std": 3.004694938659668, "gen_loss": 0.25251680612564087, "grad_norm": 0.4090724096653017, "learning_rate": 2.3766315789473685e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9941723048686981, "mean_gen_accuracy": 0.874292328953743, "mean_token_accuracy": 0.9028337746858597, "num_tokens": 453869755.0, "sample_num_tokens": 10207.25, "step": 5671, "total_num_tokens": 453910584.0, "z_loss": 0.0005916203954257071 }, { "copy_logits_max": -2.692349433898926, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.625, "epoch": 1.1584886392647435, "gen_logits_max": 5.877510070800781, "gen_logits_mean": -13.885892868041992, "gen_logits_min": -25.687877655029297, "gen_logits_std": 2.980156898498535, "gen_loss": 0.30148980021476746, "grad_norm": 0.3931203051449917, "learning_rate": 2.376505263157895e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9961030781269073, "mean_gen_accuracy": 0.8751060962677002, "mean_token_accuracy": 0.9035232961177826, "num_tokens": 454148914.0, "sample_num_tokens": 7898.0, "step": 5672, "total_num_tokens": 454180506.0, "z_loss": 0.0005738973850384355 }, { "copy_logits_max": -0.1001124382019043, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.1875, "epoch": 1.1586928772019403, "gen_logits_max": 5.088942527770996, "gen_logits_mean": -15.448052406311035, "gen_logits_min": -27.89727210998535, "gen_logits_std": 3.0338943004608154, "gen_loss": 0.3029134273529053, "grad_norm": 0.44147536339736526, "learning_rate": 2.376378947368421e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9949764013290405, "mean_gen_accuracy": 0.8718931972980499, "mean_token_accuracy": 0.9007825702428818, "num_tokens": 454411793.0, "sample_num_tokens": 7205.75, "step": 5673, "total_num_tokens": 454440616.0, "z_loss": 0.0005558078992180526 }, { "copy_logits_max": -1.509000301361084, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5, "epoch": 1.1588971151391372, "gen_logits_max": 5.602761745452881, "gen_logits_mean": -14.361772537231445, "gen_logits_min": -26.986129760742188, "gen_logits_std": 3.0086753368377686, "gen_loss": 0.2856130599975586, "grad_norm": 0.4153299533826559, "learning_rate": 2.3762526315789474e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9966202229261398, "mean_gen_accuracy": 0.8667263686656952, "mean_token_accuracy": 0.9025495052337646, "num_tokens": 454687200.0, "sample_num_tokens": 8783.0, "step": 5674, "total_num_tokens": 454722332.0, "z_loss": 0.0005246296059340239 }, { "copy_logits_max": -4.0993123054504395, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.0, "epoch": 1.1591013530763339, "gen_logits_max": 3.2083749771118164, "gen_logits_mean": -17.827749252319336, "gen_logits_min": -29.439430236816406, "gen_logits_std": 3.0518884658813477, "gen_loss": 0.2982428967952728, "grad_norm": 0.38020985429257387, "learning_rate": 2.376126315789474e-05, "loss": 0.2662, "mean_copy_accuracy": 0.996289536356926, "mean_gen_accuracy": 0.8774526566267014, "mean_token_accuracy": 0.910015806555748, "num_tokens": 454972327.0, "sample_num_tokens": 9482.75, "step": 5675, "total_num_tokens": 455010258.0, "z_loss": 0.00048271677223965526 }, { "copy_logits_max": -3.8784103393554688, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.3125, "epoch": 1.1593055910135308, "gen_logits_max": 6.032589435577393, "gen_logits_mean": -13.41152286529541, "gen_logits_min": -25.533987045288086, "gen_logits_std": 2.9723870754241943, "gen_loss": 0.278955340385437, "grad_norm": 0.4906850698709525, "learning_rate": 2.3760000000000003e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9955123215913773, "mean_gen_accuracy": 0.871645525097847, "mean_token_accuracy": 0.9017345607280731, "num_tokens": 455231898.0, "sample_num_tokens": 7034.0, "step": 5676, "total_num_tokens": 455260034.0, "z_loss": 0.00047765832277946174 }, { "copy_logits_max": -3.1165521144866943, "copy_logits_min": -687500032.0, "copy_num_tokens": 627.1875, "epoch": 1.1595098289507275, "gen_logits_max": 4.067799091339111, "gen_logits_mean": -15.371143341064453, "gen_logits_min": -27.613296508789062, "gen_logits_std": 3.0323073863983154, "gen_loss": 0.2724907398223877, "grad_norm": 0.4339532791279722, "learning_rate": 2.3758736842105264e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9967433661222458, "mean_gen_accuracy": 0.8739765286445618, "mean_token_accuracy": 0.9050047844648361, "num_tokens": 455490899.0, "sample_num_tokens": 9226.75, "step": 5677, "total_num_tokens": 455527806.0, "z_loss": 0.0005080344853922725 }, { "copy_logits_max": -6.90048885345459, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.4375, "epoch": 1.1597140668879244, "gen_logits_max": 5.066751480102539, "gen_logits_mean": -15.8029146194458, "gen_logits_min": -27.60162353515625, "gen_logits_std": 3.04103946685791, "gen_loss": 0.2532994747161865, "grad_norm": 0.3848734797796851, "learning_rate": 2.3757473684210528e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9955200999975204, "mean_gen_accuracy": 0.8852950185537338, "mean_token_accuracy": 0.9124200791120529, "num_tokens": 455764951.0, "sample_num_tokens": 8876.25, "step": 5678, "total_num_tokens": 455800456.0, "z_loss": 0.0005150212673470378 }, { "copy_logits_max": -3.9848885536193848, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.0, "epoch": 1.1599183048251214, "gen_logits_max": 4.905284404754639, "gen_logits_mean": -15.061660766601562, "gen_logits_min": -27.086177825927734, "gen_logits_std": 3.0246992111206055, "gen_loss": 0.28166288137435913, "grad_norm": 0.42207137830841895, "learning_rate": 2.375621052631579e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9957420229911804, "mean_gen_accuracy": 0.8772266656160355, "mean_token_accuracy": 0.9058035612106323, "num_tokens": 456020996.0, "sample_num_tokens": 7287.5, "step": 5679, "total_num_tokens": 456050146.0, "z_loss": 0.0005533265648409724 }, { "copy_logits_max": -2.2018251419067383, "copy_logits_min": -750000064.0, "copy_num_tokens": 652.4375, "epoch": 1.160122542762318, "gen_logits_max": 4.100306510925293, "gen_logits_mean": -14.593480110168457, "gen_logits_min": -27.026615142822266, "gen_logits_std": 3.0325565338134766, "gen_loss": 0.23058144748210907, "grad_norm": 0.40893811650603196, "learning_rate": 2.3754947368421053e-05, "loss": 0.2594, "mean_copy_accuracy": 0.996251255273819, "mean_gen_accuracy": 0.8819056004285812, "mean_token_accuracy": 0.9120143204927444, "num_tokens": 456278322.0, "sample_num_tokens": 9794.0, "step": 5680, "total_num_tokens": 456317498.0, "z_loss": 0.00043912557885050774 }, { "copy_logits_max": -3.0427231788635254, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.25, "epoch": 1.160326780699515, "gen_logits_max": 5.037811279296875, "gen_logits_mean": -14.648694038391113, "gen_logits_min": -26.44341468811035, "gen_logits_std": 3.0138378143310547, "gen_loss": 0.27567946910858154, "grad_norm": 0.4388212847715299, "learning_rate": 2.3753684210526314e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9943662732839584, "mean_gen_accuracy": 0.8844853490591049, "mean_token_accuracy": 0.9095043987035751, "num_tokens": 456542915.0, "sample_num_tokens": 8668.25, "step": 5681, "total_num_tokens": 456577588.0, "z_loss": 0.0005700932233594358 }, { "copy_logits_max": -2.557539224624634, "copy_logits_min": -687500032.0, "copy_num_tokens": 223.1875, "epoch": 1.1605310186367117, "gen_logits_max": 5.353499889373779, "gen_logits_mean": -15.626221656799316, "gen_logits_min": -27.130596160888672, "gen_logits_std": 2.996657133102417, "gen_loss": 0.3087957501411438, "grad_norm": 0.4179123451680717, "learning_rate": 2.375242105263158e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9930693358182907, "mean_gen_accuracy": 0.8750410377979279, "mean_token_accuracy": 0.8966524004936218, "num_tokens": 456784061.0, "sample_num_tokens": 6509.75, "step": 5682, "total_num_tokens": 456810100.0, "z_loss": 0.0006244873511604965 }, { "copy_logits_max": -4.257657051086426, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.3125, "epoch": 1.1607352565739086, "gen_logits_max": 4.974641799926758, "gen_logits_mean": -15.814327239990234, "gen_logits_min": -27.38031578063965, "gen_logits_std": 3.005643844604492, "gen_loss": 0.2886814475059509, "grad_norm": 0.49751571299762404, "learning_rate": 2.3751157894736843e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9964728057384491, "mean_gen_accuracy": 0.879403367638588, "mean_token_accuracy": 0.9084453284740448, "num_tokens": 457048686.0, "sample_num_tokens": 7989.5, "step": 5683, "total_num_tokens": 457080644.0, "z_loss": 0.0006410968489944935 }, { "copy_logits_max": -4.330155849456787, "copy_logits_min": -750000064.0, "copy_num_tokens": 387.875, "epoch": 1.1609394945111053, "gen_logits_max": 3.845560073852539, "gen_logits_mean": -16.42554473876953, "gen_logits_min": -28.283763885498047, "gen_logits_std": 3.049347162246704, "gen_loss": 0.2870992422103882, "grad_norm": 0.3884504725677426, "learning_rate": 2.3749894736842107e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9963709712028503, "mean_gen_accuracy": 0.876177117228508, "mean_token_accuracy": 0.9069768339395523, "num_tokens": 457316570.0, "sample_num_tokens": 7392.0, "step": 5684, "total_num_tokens": 457346138.0, "z_loss": 0.0005919547402299941 }, { "copy_logits_max": -1.0266014337539673, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.6875, "epoch": 1.1611437324483023, "gen_logits_max": 6.254453659057617, "gen_logits_mean": -13.86046028137207, "gen_logits_min": -25.794189453125, "gen_logits_std": 2.9799723625183105, "gen_loss": 0.30828601121902466, "grad_norm": 0.3918210140103438, "learning_rate": 2.3748631578947368e-05, "loss": 0.2815, "mean_copy_accuracy": 0.995280921459198, "mean_gen_accuracy": 0.8760875463485718, "mean_token_accuracy": 0.9040293991565704, "num_tokens": 457594817.0, "sample_num_tokens": 9334.75, "step": 5685, "total_num_tokens": 457632156.0, "z_loss": 0.000558067811653018 }, { "copy_logits_max": -4.41029691696167, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.8125, "epoch": 1.1613479703854992, "gen_logits_max": 4.999514579772949, "gen_logits_mean": -13.427366256713867, "gen_logits_min": -24.90509033203125, "gen_logits_std": 2.922208070755005, "gen_loss": 0.27794933319091797, "grad_norm": 0.3917534249834579, "learning_rate": 2.3747368421052632e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9963099509477615, "mean_gen_accuracy": 0.8786432445049286, "mean_token_accuracy": 0.9085393995046616, "num_tokens": 457863378.0, "sample_num_tokens": 8830.0, "step": 5686, "total_num_tokens": 457898698.0, "z_loss": 0.0005602349992841482 }, { "copy_logits_max": -5.185332298278809, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.5, "epoch": 1.161552208322696, "gen_logits_max": 4.761688709259033, "gen_logits_mean": -14.060279846191406, "gen_logits_min": -25.486083984375, "gen_logits_std": 2.9469990730285645, "gen_loss": 0.30274277925491333, "grad_norm": 0.3830699607449461, "learning_rate": 2.3746105263157897e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9951144307851791, "mean_gen_accuracy": 0.8670828491449356, "mean_token_accuracy": 0.8974952399730682, "num_tokens": 458130625.0, "sample_num_tokens": 8530.25, "step": 5687, "total_num_tokens": 458164746.0, "z_loss": 0.0005908723105676472 }, { "copy_logits_max": -4.762042999267578, "copy_logits_min": -750000000.0, "copy_num_tokens": 727.75, "epoch": 1.1617564462598928, "gen_logits_max": 3.0512661933898926, "gen_logits_mean": -15.860074996948242, "gen_logits_min": -27.748291015625, "gen_logits_std": 3.0066518783569336, "gen_loss": 0.25386953353881836, "grad_norm": 0.3808068604337373, "learning_rate": 2.3744842105263158e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9968487173318863, "mean_gen_accuracy": 0.8727199733257294, "mean_token_accuracy": 0.9089741259813309, "num_tokens": 458415201.0, "sample_num_tokens": 9960.75, "step": 5688, "total_num_tokens": 458455044.0, "z_loss": 0.0005354346940293908 }, { "copy_logits_max": -1.0489213466644287, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.5625, "epoch": 1.1619606841970895, "gen_logits_max": 4.348395347595215, "gen_logits_mean": -14.615909576416016, "gen_logits_min": -26.946531295776367, "gen_logits_std": 3.0467920303344727, "gen_loss": 0.25336113572120667, "grad_norm": 0.4881568438610198, "learning_rate": 2.3743578947368422e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9965458959341049, "mean_gen_accuracy": 0.8726063370704651, "mean_token_accuracy": 0.9045667797327042, "num_tokens": 458681898.0, "sample_num_tokens": 9589.0, "step": 5689, "total_num_tokens": 458720254.0, "z_loss": 0.0005053105996921659 }, { "copy_logits_max": -3.4169039726257324, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.375, "epoch": 1.1621649221342865, "gen_logits_max": 4.323805332183838, "gen_logits_mean": -14.473099708557129, "gen_logits_min": -26.51441192626953, "gen_logits_std": 2.9868602752685547, "gen_loss": 0.23484861850738525, "grad_norm": 0.3769262944198479, "learning_rate": 2.3742315789473683e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9959007054567337, "mean_gen_accuracy": 0.885367676615715, "mean_token_accuracy": 0.9112354069948196, "num_tokens": 458946599.0, "sample_num_tokens": 9300.25, "step": 5690, "total_num_tokens": 458983800.0, "z_loss": 0.0004944525426253676 }, { "copy_logits_max": -5.3970112800598145, "copy_logits_min": -687500032.0, "copy_num_tokens": 801.875, "epoch": 1.1623691600714832, "gen_logits_max": 3.293726682662964, "gen_logits_mean": -15.766400337219238, "gen_logits_min": -28.294044494628906, "gen_logits_std": 3.0210413932800293, "gen_loss": 0.2631978690624237, "grad_norm": 0.36874875158660586, "learning_rate": 2.3741052631578947e-05, "loss": 0.2733, "mean_copy_accuracy": 0.996128648519516, "mean_gen_accuracy": 0.8762807250022888, "mean_token_accuracy": 0.9077019840478897, "num_tokens": 459233580.0, "sample_num_tokens": 11213.5, "step": 5691, "total_num_tokens": 459278434.0, "z_loss": 0.0004808054945897311 }, { "copy_logits_max": -4.739041805267334, "copy_logits_min": -687500032.0, "copy_num_tokens": 714.9375, "epoch": 1.16257339800868, "gen_logits_max": 3.1456689834594727, "gen_logits_mean": -15.792999267578125, "gen_logits_min": -28.37203598022461, "gen_logits_std": 3.0512197017669678, "gen_loss": 0.23648308217525482, "grad_norm": 0.37093700566994575, "learning_rate": 2.373978947368421e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9967192262411118, "mean_gen_accuracy": 0.8802385628223419, "mean_token_accuracy": 0.9112277179956436, "num_tokens": 459506681.0, "sample_num_tokens": 9208.75, "step": 5692, "total_num_tokens": 459543516.0, "z_loss": 0.0005424314877018332 }, { "copy_logits_max": -7.386440277099609, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.8125, "epoch": 1.162777635945877, "gen_logits_max": 3.625938892364502, "gen_logits_mean": -16.146484375, "gen_logits_min": -27.803876876831055, "gen_logits_std": 3.0364761352539062, "gen_loss": 0.254838764667511, "grad_norm": 0.44747585719037264, "learning_rate": 2.3738526315789476e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9956225007772446, "mean_gen_accuracy": 0.8747097253799438, "mean_token_accuracy": 0.9043381661176682, "num_tokens": 459787747.0, "sample_num_tokens": 9405.75, "step": 5693, "total_num_tokens": 459825370.0, "z_loss": 0.0004710056527983397 }, { "copy_logits_max": -7.095948219299316, "copy_logits_min": -687500032.0, "copy_num_tokens": 463.625, "epoch": 1.1629818738830737, "gen_logits_max": 4.031092643737793, "gen_logits_mean": -15.0901460647583, "gen_logits_min": -26.518157958984375, "gen_logits_std": 2.9576945304870605, "gen_loss": 0.2424837052822113, "grad_norm": 0.3762339261236482, "learning_rate": 2.3737263157894737e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9961103945970535, "mean_gen_accuracy": 0.873234286904335, "mean_token_accuracy": 0.9058680534362793, "num_tokens": 460078527.0, "sample_num_tokens": 8717.75, "step": 5694, "total_num_tokens": 460113398.0, "z_loss": 0.00046078269951976836 }, { "copy_logits_max": -6.0674147605896, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.125, "epoch": 1.1631861118202707, "gen_logits_max": 4.714344501495361, "gen_logits_mean": -14.500199317932129, "gen_logits_min": -26.399089813232422, "gen_logits_std": 2.9485573768615723, "gen_loss": 0.267078697681427, "grad_norm": 0.40375584496921213, "learning_rate": 2.3736e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9954098910093307, "mean_gen_accuracy": 0.8763302564620972, "mean_token_accuracy": 0.9067762643098831, "num_tokens": 460345079.0, "sample_num_tokens": 7950.75, "step": 5695, "total_num_tokens": 460376882.0, "z_loss": 0.0005376975750550628 }, { "copy_logits_max": -7.014379501342773, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.1875, "epoch": 1.1633903497574674, "gen_logits_max": 4.505459785461426, "gen_logits_mean": -15.536791801452637, "gen_logits_min": -26.989017486572266, "gen_logits_std": 2.9400012493133545, "gen_loss": 0.2809528112411499, "grad_norm": 0.3727402984912972, "learning_rate": 2.3734736842105262e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9974591284990311, "mean_gen_accuracy": 0.879213735461235, "mean_token_accuracy": 0.9093064516782761, "num_tokens": 460616591.0, "sample_num_tokens": 8299.25, "step": 5696, "total_num_tokens": 460649788.0, "z_loss": 0.0004967915010638535 }, { "copy_logits_max": -5.742304801940918, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.5, "epoch": 1.1635945876946643, "gen_logits_max": 4.135315418243408, "gen_logits_mean": -15.599983215332031, "gen_logits_min": -27.416641235351562, "gen_logits_std": 3.0025644302368164, "gen_loss": 0.28839296102523804, "grad_norm": 0.357472484665022, "learning_rate": 2.3733473684210526e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9968873411417007, "mean_gen_accuracy": 0.8722941726446152, "mean_token_accuracy": 0.903283417224884, "num_tokens": 460880513.0, "sample_num_tokens": 8117.25, "step": 5697, "total_num_tokens": 460912982.0, "z_loss": 0.0005121113499626517 }, { "copy_logits_max": -6.647517204284668, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.6875, "epoch": 1.1637988256318612, "gen_logits_max": 4.3541107177734375, "gen_logits_mean": -14.412242889404297, "gen_logits_min": -25.836618423461914, "gen_logits_std": 2.9019274711608887, "gen_loss": 0.27588951587677, "grad_norm": 0.38040899622747193, "learning_rate": 2.3732210526315787e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9957481175661087, "mean_gen_accuracy": 0.870890274643898, "mean_token_accuracy": 0.899789959192276, "num_tokens": 461148351.0, "sample_num_tokens": 9113.25, "step": 5698, "total_num_tokens": 461184804.0, "z_loss": 0.00047103484394028783 }, { "copy_logits_max": -4.347629070281982, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.8125, "epoch": 1.164003063569058, "gen_logits_max": 4.455420970916748, "gen_logits_mean": -15.82429313659668, "gen_logits_min": -27.687196731567383, "gen_logits_std": 3.0048413276672363, "gen_loss": 0.2783668637275696, "grad_norm": 0.3889153543863272, "learning_rate": 2.373094736842105e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9946019500494003, "mean_gen_accuracy": 0.877325177192688, "mean_token_accuracy": 0.9075125902891159, "num_tokens": 461440177.0, "sample_num_tokens": 8645.75, "step": 5699, "total_num_tokens": 461474760.0, "z_loss": 0.0005932471249252558 }, { "copy_logits_max": -5.64120626449585, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.8125, "epoch": 1.1642073015062548, "gen_logits_max": 4.921354293823242, "gen_logits_mean": -16.105518341064453, "gen_logits_min": -27.42203140258789, "gen_logits_std": 2.977238655090332, "gen_loss": 0.3129546344280243, "grad_norm": 0.5016080629169722, "learning_rate": 2.372968421052632e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9955797493457794, "mean_gen_accuracy": 0.8708764165639877, "mean_token_accuracy": 0.9001832157373428, "num_tokens": 461710194.0, "sample_num_tokens": 7760.5, "step": 5700, "total_num_tokens": 461741236.0, "z_loss": 0.0006635406753048301 }, { "copy_logits_max": -4.185027122497559, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0625, "epoch": 1.1644115394434515, "gen_logits_max": 4.037613868713379, "gen_logits_mean": -15.641923904418945, "gen_logits_min": -27.14270782470703, "gen_logits_std": 2.966961622238159, "gen_loss": 0.27341407537460327, "grad_norm": 0.400326134561529, "learning_rate": 2.372842105263158e-05, "loss": 0.279, "mean_copy_accuracy": 0.996577724814415, "mean_gen_accuracy": 0.8715869039297104, "mean_token_accuracy": 0.9054011404514313, "num_tokens": 461982375.0, "sample_num_tokens": 8474.25, "step": 5701, "total_num_tokens": 462016272.0, "z_loss": 0.0006182607030496001 }, { "copy_logits_max": -2.971494674682617, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.5, "epoch": 1.1646157773806485, "gen_logits_max": 4.9548444747924805, "gen_logits_mean": -15.079324722290039, "gen_logits_min": -26.649641036987305, "gen_logits_std": 2.964742660522461, "gen_loss": 0.3143494129180908, "grad_norm": 0.3782023270901051, "learning_rate": 2.3727157894736845e-05, "loss": 0.2919, "mean_copy_accuracy": 0.9956127107143402, "mean_gen_accuracy": 0.8732335120439529, "mean_token_accuracy": 0.9025154858827591, "num_tokens": 462259267.0, "sample_num_tokens": 8713.75, "step": 5702, "total_num_tokens": 462294122.0, "z_loss": 0.0006656002951785922 }, { "copy_logits_max": -2.466472625732422, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.0, "epoch": 1.1648200153178454, "gen_logits_max": 3.9824042320251465, "gen_logits_mean": -15.43160629272461, "gen_logits_min": -27.566932678222656, "gen_logits_std": 2.9777941703796387, "gen_loss": 0.3157649338245392, "grad_norm": 0.3527781821562587, "learning_rate": 2.3725894736842105e-05, "loss": 0.2935, "mean_copy_accuracy": 0.995879128575325, "mean_gen_accuracy": 0.8659291118383408, "mean_token_accuracy": 0.9004186987876892, "num_tokens": 462524773.0, "sample_num_tokens": 7309.75, "step": 5703, "total_num_tokens": 462554012.0, "z_loss": 0.0006588082760572433 }, { "copy_logits_max": -1.6820716857910156, "copy_logits_min": -687499968.0, "copy_num_tokens": 369.5, "epoch": 1.165024253255042, "gen_logits_max": 5.364931106567383, "gen_logits_mean": -13.813542366027832, "gen_logits_min": -25.707611083984375, "gen_logits_std": 2.9576525688171387, "gen_loss": 0.29596978425979614, "grad_norm": 0.382733701634588, "learning_rate": 2.372463157894737e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9945076555013657, "mean_gen_accuracy": 0.8745550960302353, "mean_token_accuracy": 0.9010277539491653, "num_tokens": 462793419.0, "sample_num_tokens": 7740.25, "step": 5704, "total_num_tokens": 462824380.0, "z_loss": 0.0006274536717683077 }, { "copy_logits_max": -0.7132703065872192, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.75, "epoch": 1.165228491192239, "gen_logits_max": 4.366069316864014, "gen_logits_mean": -15.61064338684082, "gen_logits_min": -27.63943862915039, "gen_logits_std": 3.0284430980682373, "gen_loss": 0.3066217005252838, "grad_norm": 0.38510457476378457, "learning_rate": 2.372336842105263e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9956695884466171, "mean_gen_accuracy": 0.8726333677768707, "mean_token_accuracy": 0.9036417007446289, "num_tokens": 463065059.0, "sample_num_tokens": 8801.75, "step": 5705, "total_num_tokens": 463100266.0, "z_loss": 0.0006702913669869304 }, { "copy_logits_max": -3.112816333770752, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.0, "epoch": 1.1654327291294357, "gen_logits_max": 4.578127861022949, "gen_logits_mean": -16.134675979614258, "gen_logits_min": -27.930484771728516, "gen_logits_std": 3.004517078399658, "gen_loss": 0.29140013456344604, "grad_norm": 0.4228123502810314, "learning_rate": 2.3722105263157895e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9963275641202927, "mean_gen_accuracy": 0.874464213848114, "mean_token_accuracy": 0.903628796339035, "num_tokens": 463324550.0, "sample_num_tokens": 7625.5, "step": 5706, "total_num_tokens": 463355052.0, "z_loss": 0.000619543599896133 }, { "copy_logits_max": -1.6204148530960083, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.125, "epoch": 1.1656369670666327, "gen_logits_max": 3.7542612552642822, "gen_logits_mean": -15.423236846923828, "gen_logits_min": -27.785884857177734, "gen_logits_std": 3.0009379386901855, "gen_loss": 0.2901111841201782, "grad_norm": 0.37701951059904687, "learning_rate": 2.3720842105263156e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9958299696445465, "mean_gen_accuracy": 0.8736531734466553, "mean_token_accuracy": 0.9037803262472153, "num_tokens": 463576852.0, "sample_num_tokens": 7776.5, "step": 5707, "total_num_tokens": 463607958.0, "z_loss": 0.0005889511085115373 }, { "copy_logits_max": -3.226905345916748, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.8125, "epoch": 1.1658412050038294, "gen_logits_max": 5.180692195892334, "gen_logits_mean": -12.929460525512695, "gen_logits_min": -24.788219451904297, "gen_logits_std": 2.9378769397735596, "gen_loss": 0.2627347707748413, "grad_norm": 0.38532174565945926, "learning_rate": 2.3719578947368424e-05, "loss": 0.261, "mean_copy_accuracy": 0.9960844814777374, "mean_gen_accuracy": 0.8811668753623962, "mean_token_accuracy": 0.912044957280159, "num_tokens": 463844760.0, "sample_num_tokens": 8376.0, "step": 5708, "total_num_tokens": 463878264.0, "z_loss": 0.0005517863901332021 }, { "copy_logits_max": -4.168511867523193, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.3125, "epoch": 1.1660454429410263, "gen_logits_max": 3.4130773544311523, "gen_logits_mean": -15.124491691589355, "gen_logits_min": -27.326622009277344, "gen_logits_std": 2.9999887943267822, "gen_loss": 0.23303160071372986, "grad_norm": 0.42835544623093147, "learning_rate": 2.3718315789473685e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9956187456846237, "mean_gen_accuracy": 0.8728231489658356, "mean_token_accuracy": 0.9019169956445694, "num_tokens": 464106598.0, "sample_num_tokens": 8837.0, "step": 5709, "total_num_tokens": 464141946.0, "z_loss": 0.0004801241448149085 }, { "copy_logits_max": -4.71681022644043, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.625, "epoch": 1.1662496808782232, "gen_logits_max": 4.248395919799805, "gen_logits_mean": -16.382335662841797, "gen_logits_min": -28.375198364257812, "gen_logits_std": 3.0176727771759033, "gen_loss": 0.28820228576660156, "grad_norm": 0.38073168412416886, "learning_rate": 2.371705263157895e-05, "loss": 0.2781, "mean_copy_accuracy": 0.99636410176754, "mean_gen_accuracy": 0.8770123422145844, "mean_token_accuracy": 0.9059479981660843, "num_tokens": 464389081.0, "sample_num_tokens": 7662.25, "step": 5710, "total_num_tokens": 464419730.0, "z_loss": 0.0005308283725753427 }, { "copy_logits_max": -5.163375377655029, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.8125, "epoch": 1.16645391881542, "gen_logits_max": 3.542952060699463, "gen_logits_mean": -16.073284149169922, "gen_logits_min": -27.929197311401367, "gen_logits_std": 2.9975461959838867, "gen_loss": 0.23349398374557495, "grad_norm": 0.3817698097978286, "learning_rate": 2.371578947368421e-05, "loss": 0.271, "mean_copy_accuracy": 0.9959428608417511, "mean_gen_accuracy": 0.8780162036418915, "mean_token_accuracy": 0.90725477039814, "num_tokens": 464656003.0, "sample_num_tokens": 8679.75, "step": 5711, "total_num_tokens": 464690722.0, "z_loss": 0.0004263615410309285 }, { "copy_logits_max": -3.117196559906006, "copy_logits_min": -750000000.0, "copy_num_tokens": 744.625, "epoch": 1.1666581567526169, "gen_logits_max": 3.0422780513763428, "gen_logits_mean": -16.002424240112305, "gen_logits_min": -27.864795684814453, "gen_logits_std": 3.0106091499328613, "gen_loss": 0.26758456230163574, "grad_norm": 0.38630880451924937, "learning_rate": 2.3714526315789474e-05, "loss": 0.291, "mean_copy_accuracy": 0.995882198214531, "mean_gen_accuracy": 0.870635062456131, "mean_token_accuracy": 0.90089151263237, "num_tokens": 464919558.0, "sample_num_tokens": 10036.5, "step": 5712, "total_num_tokens": 464959704.0, "z_loss": 0.0004947491106577218 }, { "copy_logits_max": -3.207468032836914, "copy_logits_min": -687500032.0, "copy_num_tokens": 859.375, "epoch": 1.1668623946898136, "gen_logits_max": 3.775690793991089, "gen_logits_mean": -14.499368667602539, "gen_logits_min": -26.486391067504883, "gen_logits_std": 2.9406113624572754, "gen_loss": 0.2585938572883606, "grad_norm": 0.3850258179867712, "learning_rate": 2.371326315789474e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9972804933786392, "mean_gen_accuracy": 0.8684479743242264, "mean_token_accuracy": 0.9038301706314087, "num_tokens": 465207855.0, "sample_num_tokens": 11163.75, "step": 5713, "total_num_tokens": 465252510.0, "z_loss": 0.0004987608408555388 }, { "copy_logits_max": -4.764359474182129, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.0, "epoch": 1.1670666326270105, "gen_logits_max": 4.8460259437561035, "gen_logits_mean": -15.309004783630371, "gen_logits_min": -27.296669006347656, "gen_logits_std": 3.0089638233184814, "gen_loss": 0.28840136528015137, "grad_norm": 0.36803831137700516, "learning_rate": 2.3712e-05, "loss": 0.2683, "mean_copy_accuracy": 0.996068611741066, "mean_gen_accuracy": 0.883479043841362, "mean_token_accuracy": 0.9080829173326492, "num_tokens": 465491655.0, "sample_num_tokens": 8619.25, "step": 5714, "total_num_tokens": 465526132.0, "z_loss": 0.0005424307310022414 }, { "copy_logits_max": -4.565535545349121, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.5625, "epoch": 1.1672708705642072, "gen_logits_max": 2.8528976440429688, "gen_logits_mean": -16.65628433227539, "gen_logits_min": -28.816368103027344, "gen_logits_std": 3.0401642322540283, "gen_loss": 0.23678351938724518, "grad_norm": 0.3977918449779813, "learning_rate": 2.3710736842105264e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9963370710611343, "mean_gen_accuracy": 0.8710528761148453, "mean_token_accuracy": 0.9015966355800629, "num_tokens": 465764523.0, "sample_num_tokens": 8477.75, "step": 5715, "total_num_tokens": 465798434.0, "z_loss": 0.00046882714377716184 }, { "copy_logits_max": -4.466663360595703, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.875, "epoch": 1.1674751085014041, "gen_logits_max": 5.164693832397461, "gen_logits_mean": -15.04664421081543, "gen_logits_min": -26.750385284423828, "gen_logits_std": 2.9931485652923584, "gen_loss": 0.28934192657470703, "grad_norm": 0.39634822400706154, "learning_rate": 2.3709473684210528e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9952827244997025, "mean_gen_accuracy": 0.8798703104257584, "mean_token_accuracy": 0.9045539349317551, "num_tokens": 466019042.0, "sample_num_tokens": 7535.5, "step": 5716, "total_num_tokens": 466049184.0, "z_loss": 0.000539610511623323 }, { "copy_logits_max": -1.3788721561431885, "copy_logits_min": -687500032.0, "copy_num_tokens": 701.0, "epoch": 1.167679346438601, "gen_logits_max": 4.568276405334473, "gen_logits_mean": -14.226336479187012, "gen_logits_min": -26.74140739440918, "gen_logits_std": 3.0203187465667725, "gen_loss": 0.22255389392375946, "grad_norm": 0.38786199523255727, "learning_rate": 2.3708210526315792e-05, "loss": 0.2855, "mean_copy_accuracy": 0.997018426656723, "mean_gen_accuracy": 0.8715183138847351, "mean_token_accuracy": 0.903266504406929, "num_tokens": 466281739.0, "sample_num_tokens": 9901.75, "step": 5717, "total_num_tokens": 466321346.0, "z_loss": 0.0004850971745327115 }, { "copy_logits_max": -2.871182441711426, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.5625, "epoch": 1.1678835843757978, "gen_logits_max": 3.5182926654815674, "gen_logits_mean": -16.982942581176758, "gen_logits_min": -29.211069107055664, "gen_logits_std": 3.04622483253479, "gen_loss": 0.2764534056186676, "grad_norm": 0.37259452237279905, "learning_rate": 2.3706947368421053e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9953240007162094, "mean_gen_accuracy": 0.8778630644083023, "mean_token_accuracy": 0.9092540144920349, "num_tokens": 466560482.0, "sample_num_tokens": 8221.5, "step": 5718, "total_num_tokens": 466593368.0, "z_loss": 0.0005810115253552794 }, { "copy_logits_max": -4.545666694641113, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.75, "epoch": 1.1680878223129947, "gen_logits_max": 6.023271560668945, "gen_logits_mean": -13.605783462524414, "gen_logits_min": -25.118377685546875, "gen_logits_std": 2.9413199424743652, "gen_loss": 0.34488821029663086, "grad_norm": 0.3871802717531245, "learning_rate": 2.3705684210526317e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9962528198957443, "mean_gen_accuracy": 0.8712155669927597, "mean_token_accuracy": 0.9002776592969894, "num_tokens": 466837027.0, "sample_num_tokens": 7399.75, "step": 5719, "total_num_tokens": 466866626.0, "z_loss": 0.0006533676059916615 }, { "copy_logits_max": -3.177159309387207, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.25, "epoch": 1.1682920602501914, "gen_logits_max": 4.658706188201904, "gen_logits_mean": -14.79798698425293, "gen_logits_min": -26.60068130493164, "gen_logits_std": 2.9699978828430176, "gen_loss": 0.28628069162368774, "grad_norm": 0.4226143162214526, "learning_rate": 2.370442105263158e-05, "loss": 0.2755, "mean_copy_accuracy": 0.994110032916069, "mean_gen_accuracy": 0.8820084631443024, "mean_token_accuracy": 0.9046195894479752, "num_tokens": 467074653.0, "sample_num_tokens": 9434.25, "step": 5720, "total_num_tokens": 467112390.0, "z_loss": 0.0005693634739145637 }, { "copy_logits_max": -3.5266823768615723, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.875, "epoch": 1.1684962981873883, "gen_logits_max": 4.701398849487305, "gen_logits_mean": -14.789037704467773, "gen_logits_min": -26.522357940673828, "gen_logits_std": 2.9733896255493164, "gen_loss": 0.2810884416103363, "grad_norm": 0.37496427742951, "learning_rate": 2.3703157894736843e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9952411204576492, "mean_gen_accuracy": 0.8703819215297699, "mean_token_accuracy": 0.9017236828804016, "num_tokens": 467357325.0, "sample_num_tokens": 8559.25, "step": 5721, "total_num_tokens": 467391562.0, "z_loss": 0.0005567835760302842 }, { "copy_logits_max": -4.500486373901367, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.6875, "epoch": 1.168700536124585, "gen_logits_max": 4.6930718421936035, "gen_logits_mean": -15.258464813232422, "gen_logits_min": -27.088287353515625, "gen_logits_std": 2.986335277557373, "gen_loss": 0.25361937284469604, "grad_norm": 0.3693486740804499, "learning_rate": 2.3701894736842104e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9967986643314362, "mean_gen_accuracy": 0.8823856264352798, "mean_token_accuracy": 0.9111378043889999, "num_tokens": 467613974.0, "sample_num_tokens": 8244.0, "step": 5722, "total_num_tokens": 467646950.0, "z_loss": 0.000494245789013803 }, { "copy_logits_max": -1.627280354499817, "copy_logits_min": -625000064.0, "copy_num_tokens": 593.875, "epoch": 1.168904774061782, "gen_logits_max": 4.370241165161133, "gen_logits_mean": -13.85881233215332, "gen_logits_min": -26.079370498657227, "gen_logits_std": 2.9777002334594727, "gen_loss": 0.2577459216117859, "grad_norm": 0.3814482715764397, "learning_rate": 2.3700631578947368e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9964010417461395, "mean_gen_accuracy": 0.8725463896989822, "mean_token_accuracy": 0.9072900712490082, "num_tokens": 467886020.0, "sample_num_tokens": 8619.0, "step": 5723, "total_num_tokens": 467920496.0, "z_loss": 0.0005924749420955777 }, { "copy_logits_max": -4.016007900238037, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.4375, "epoch": 1.1691090119989789, "gen_logits_max": 4.882364749908447, "gen_logits_mean": -14.436647415161133, "gen_logits_min": -26.4292049407959, "gen_logits_std": 2.9368343353271484, "gen_loss": 0.27462130784988403, "grad_norm": 0.374947946489832, "learning_rate": 2.3699368421052632e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9952428489923477, "mean_gen_accuracy": 0.870494931936264, "mean_token_accuracy": 0.8988705426454544, "num_tokens": 468158572.0, "sample_num_tokens": 8496.0, "step": 5724, "total_num_tokens": 468192556.0, "z_loss": 0.0006060936721041799 }, { "copy_logits_max": -4.498285293579102, "copy_logits_min": -625000064.0, "copy_num_tokens": 462.875, "epoch": 1.1693132499361756, "gen_logits_max": 4.015985488891602, "gen_logits_mean": -16.27667236328125, "gen_logits_min": -28.324748992919922, "gen_logits_std": 3.019305944442749, "gen_loss": 0.2587409019470215, "grad_norm": 0.3652156610327703, "learning_rate": 2.3698105263157897e-05, "loss": 0.2587, "mean_copy_accuracy": 0.9965409636497498, "mean_gen_accuracy": 0.8747607320547104, "mean_token_accuracy": 0.9117826670408249, "num_tokens": 468432895.0, "sample_num_tokens": 8576.75, "step": 5725, "total_num_tokens": 468467202.0, "z_loss": 0.000493230763822794 }, { "copy_logits_max": -2.5637059211730957, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.25, "epoch": 1.1695174878733725, "gen_logits_max": 4.213942050933838, "gen_logits_mean": -15.056618690490723, "gen_logits_min": -27.67111587524414, "gen_logits_std": 2.997008800506592, "gen_loss": 0.2588961720466614, "grad_norm": 0.3778063700952385, "learning_rate": 2.369684210526316e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9961216002702713, "mean_gen_accuracy": 0.8799083679914474, "mean_token_accuracy": 0.9108484983444214, "num_tokens": 468716393.0, "sample_num_tokens": 8469.25, "step": 5726, "total_num_tokens": 468750270.0, "z_loss": 0.0005204478511586785 }, { "copy_logits_max": -4.436885833740234, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.4375, "epoch": 1.1697217258105694, "gen_logits_max": 4.902417182922363, "gen_logits_mean": -16.731693267822266, "gen_logits_min": -28.317398071289062, "gen_logits_std": 3.0220232009887695, "gen_loss": 0.3051072359085083, "grad_norm": 0.3504983105754945, "learning_rate": 2.3695578947368422e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9966290444135666, "mean_gen_accuracy": 0.8756691366434097, "mean_token_accuracy": 0.9039091020822525, "num_tokens": 468997550.0, "sample_num_tokens": 8340.5, "step": 5727, "total_num_tokens": 469030912.0, "z_loss": 0.0006038179853931069 }, { "copy_logits_max": -2.6535372734069824, "copy_logits_min": -687500032.0, "copy_num_tokens": 646.9375, "epoch": 1.1699259637477661, "gen_logits_max": 4.246926307678223, "gen_logits_mean": -14.225041389465332, "gen_logits_min": -26.842432022094727, "gen_logits_std": 2.977461338043213, "gen_loss": 0.25599056482315063, "grad_norm": 0.41023279320000017, "learning_rate": 2.3694315789473686e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9972921758890152, "mean_gen_accuracy": 0.8748951405286789, "mean_token_accuracy": 0.9075354486703873, "num_tokens": 469274588.0, "sample_num_tokens": 9537.0, "step": 5728, "total_num_tokens": 469312736.0, "z_loss": 0.0005342518561519682 }, { "copy_logits_max": -2.5162193775177, "copy_logits_min": -625000064.0, "copy_num_tokens": 648.6875, "epoch": 1.170130201684963, "gen_logits_max": 4.603384494781494, "gen_logits_mean": -14.930168151855469, "gen_logits_min": -27.292240142822266, "gen_logits_std": 2.9746837615966797, "gen_loss": 0.29870539903640747, "grad_norm": 0.445872064219728, "learning_rate": 2.3693052631578947e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9972956776618958, "mean_gen_accuracy": 0.8753896206617355, "mean_token_accuracy": 0.9083514213562012, "num_tokens": 469550665.0, "sample_num_tokens": 9932.25, "step": 5729, "total_num_tokens": 469590394.0, "z_loss": 0.0005473652854561806 }, { "copy_logits_max": -2.3123738765716553, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.6875, "epoch": 1.1703344396221598, "gen_logits_max": 5.951387405395508, "gen_logits_mean": -13.71318244934082, "gen_logits_min": -26.321857452392578, "gen_logits_std": 2.9704596996307373, "gen_loss": 0.2598133087158203, "grad_norm": 0.4015696168505689, "learning_rate": 2.369178947368421e-05, "loss": 0.2829, "mean_copy_accuracy": 0.9961984306573868, "mean_gen_accuracy": 0.8753970265388489, "mean_token_accuracy": 0.9040070027112961, "num_tokens": 469828320.0, "sample_num_tokens": 9342.0, "step": 5730, "total_num_tokens": 469865688.0, "z_loss": 0.0005383910611271858 }, { "copy_logits_max": -3.366323947906494, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.875, "epoch": 1.1705386775593567, "gen_logits_max": 4.022927284240723, "gen_logits_mean": -15.311508178710938, "gen_logits_min": -27.073179244995117, "gen_logits_std": 2.9302151203155518, "gen_loss": 0.29535654187202454, "grad_norm": 0.3723025085139069, "learning_rate": 2.3690526315789472e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9964043349027634, "mean_gen_accuracy": 0.8721687346696854, "mean_token_accuracy": 0.9041258245706558, "num_tokens": 470104102.0, "sample_num_tokens": 8963.0, "step": 5731, "total_num_tokens": 470139954.0, "z_loss": 0.0005181157030165195 }, { "copy_logits_max": -3.651017427444458, "copy_logits_min": -687500032.0, "copy_num_tokens": 531.9375, "epoch": 1.1707429154965534, "gen_logits_max": 4.610806941986084, "gen_logits_mean": -14.924882888793945, "gen_logits_min": -26.86054801940918, "gen_logits_std": 2.966731071472168, "gen_loss": 0.27296358346939087, "grad_norm": 0.4156876925105991, "learning_rate": 2.368926315789474e-05, "loss": 0.3002, "mean_copy_accuracy": 0.9962916076183319, "mean_gen_accuracy": 0.8697228282690048, "mean_token_accuracy": 0.89989173412323, "num_tokens": 470392847.0, "sample_num_tokens": 8444.75, "step": 5732, "total_num_tokens": 470426626.0, "z_loss": 0.0005112808430567384 }, { "copy_logits_max": -3.5434963703155518, "copy_logits_min": -687500032.0, "copy_num_tokens": 373.8125, "epoch": 1.1709471534337503, "gen_logits_max": 6.0999555587768555, "gen_logits_mean": -13.426658630371094, "gen_logits_min": -26.413360595703125, "gen_logits_std": 2.9128851890563965, "gen_loss": 0.30604374408721924, "grad_norm": 0.3896971107099915, "learning_rate": 2.3688e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9964622110128403, "mean_gen_accuracy": 0.8713469356298447, "mean_token_accuracy": 0.9012329280376434, "num_tokens": 470651521.0, "sample_num_tokens": 7952.75, "step": 5733, "total_num_tokens": 470683332.0, "z_loss": 0.000591023126617074 }, { "copy_logits_max": -3.864278793334961, "copy_logits_min": -687500032.0, "copy_num_tokens": 582.75, "epoch": 1.1711513913709473, "gen_logits_max": 3.662440538406372, "gen_logits_mean": -15.751567840576172, "gen_logits_min": -27.656383514404297, "gen_logits_std": 3.010420799255371, "gen_loss": 0.22832165658473969, "grad_norm": 0.41538607048351, "learning_rate": 2.3686736842105265e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9954909831285477, "mean_gen_accuracy": 0.8852168321609497, "mean_token_accuracy": 0.9078551381826401, "num_tokens": 470933453.0, "sample_num_tokens": 9142.75, "step": 5734, "total_num_tokens": 470970024.0, "z_loss": 0.0004659583210013807 }, { "copy_logits_max": -3.7528228759765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.0625, "epoch": 1.171355629308144, "gen_logits_max": 4.881380081176758, "gen_logits_mean": -14.949881553649902, "gen_logits_min": -27.159542083740234, "gen_logits_std": 2.990708827972412, "gen_loss": 0.28784939646720886, "grad_norm": 0.41516971605281444, "learning_rate": 2.3685473684210526e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9955192804336548, "mean_gen_accuracy": 0.8784195184707642, "mean_token_accuracy": 0.9056587368249893, "num_tokens": 471209573.0, "sample_num_tokens": 8342.75, "step": 5735, "total_num_tokens": 471242944.0, "z_loss": 0.0005425934214144945 }, { "copy_logits_max": -3.753336191177368, "copy_logits_min": -750000064.0, "copy_num_tokens": 422.8125, "epoch": 1.171559867245341, "gen_logits_max": 4.345928192138672, "gen_logits_mean": -15.77943229675293, "gen_logits_min": -27.928373336791992, "gen_logits_std": 2.9679207801818848, "gen_loss": 0.27911457419395447, "grad_norm": 0.42941043783475547, "learning_rate": 2.368421052631579e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9967203885316849, "mean_gen_accuracy": 0.8816400319337845, "mean_token_accuracy": 0.9098171293735504, "num_tokens": 471492340.0, "sample_num_tokens": 7740.5, "step": 5736, "total_num_tokens": 471523302.0, "z_loss": 0.0005538337863981724 }, { "copy_logits_max": -1.4707057476043701, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.25, "epoch": 1.1717641051825376, "gen_logits_max": 4.422980308532715, "gen_logits_mean": -14.939188957214355, "gen_logits_min": -27.869041442871094, "gen_logits_std": 3.012333869934082, "gen_loss": 0.29264408349990845, "grad_norm": 0.3902580656165048, "learning_rate": 2.368294736842105e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9965171068906784, "mean_gen_accuracy": 0.8742891252040863, "mean_token_accuracy": 0.9049272239208221, "num_tokens": 471753431.0, "sample_num_tokens": 9130.25, "step": 5737, "total_num_tokens": 471789952.0, "z_loss": 0.0006160973571240902 }, { "copy_logits_max": -4.503520488739014, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.6875, "epoch": 1.1719683431197345, "gen_logits_max": 3.967604875564575, "gen_logits_mean": -16.129226684570312, "gen_logits_min": -28.152225494384766, "gen_logits_std": 3.0057730674743652, "gen_loss": 0.2536163032054901, "grad_norm": 0.38203392046451007, "learning_rate": 2.3681684210526316e-05, "loss": 0.279, "mean_copy_accuracy": 0.9961275458335876, "mean_gen_accuracy": 0.8801216036081314, "mean_token_accuracy": 0.9053117036819458, "num_tokens": 472033436.0, "sample_num_tokens": 7936.5, "step": 5738, "total_num_tokens": 472065182.0, "z_loss": 0.0005150999641045928 }, { "copy_logits_max": -3.2650487422943115, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.375, "epoch": 1.1721725810569312, "gen_logits_max": 4.191257953643799, "gen_logits_mean": -16.622827529907227, "gen_logits_min": -28.225704193115234, "gen_logits_std": 3.011192798614502, "gen_loss": 0.29281163215637207, "grad_norm": 0.38028308562582536, "learning_rate": 2.368042105263158e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9960350692272186, "mean_gen_accuracy": 0.8721067607402802, "mean_token_accuracy": 0.9018367230892181, "num_tokens": 472291309.0, "sample_num_tokens": 8969.75, "step": 5739, "total_num_tokens": 472327188.0, "z_loss": 0.0005835179472342134 }, { "copy_logits_max": -1.3689295053482056, "copy_logits_min": -750000000.0, "copy_num_tokens": 817.4375, "epoch": 1.1723768189941282, "gen_logits_max": 3.470240592956543, "gen_logits_mean": -16.150188446044922, "gen_logits_min": -28.15789031982422, "gen_logits_std": 3.0194239616394043, "gen_loss": 0.24399012327194214, "grad_norm": 0.4049332479149543, "learning_rate": 2.367915789473684e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9960251450538635, "mean_gen_accuracy": 0.876092791557312, "mean_token_accuracy": 0.909430667757988, "num_tokens": 472571798.0, "sample_num_tokens": 10615.5, "step": 5740, "total_num_tokens": 472614260.0, "z_loss": 0.0005495386430993676 }, { "copy_logits_max": -1.4904770851135254, "copy_logits_min": -750000000.0, "copy_num_tokens": 625.5, "epoch": 1.172581056931325, "gen_logits_max": 4.0272698402404785, "gen_logits_mean": -15.299440383911133, "gen_logits_min": -27.697078704833984, "gen_logits_std": 2.982591152191162, "gen_loss": 0.24967212975025177, "grad_norm": 0.36012107651511305, "learning_rate": 2.367789473684211e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9967122972011566, "mean_gen_accuracy": 0.8766933232545853, "mean_token_accuracy": 0.9100382179021835, "num_tokens": 472871044.0, "sample_num_tokens": 9527.5, "step": 5741, "total_num_tokens": 472909154.0, "z_loss": 0.000551587319932878 }, { "copy_logits_max": -2.489901065826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.1875, "epoch": 1.1727852948685218, "gen_logits_max": 3.8361520767211914, "gen_logits_mean": -16.210533142089844, "gen_logits_min": -28.035324096679688, "gen_logits_std": 2.9912352561950684, "gen_loss": 0.26549452543258667, "grad_norm": 0.3754181865971301, "learning_rate": 2.367663157894737e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9960412830114365, "mean_gen_accuracy": 0.8711861819028854, "mean_token_accuracy": 0.9044306874275208, "num_tokens": 473152813.0, "sample_num_tokens": 7892.75, "step": 5742, "total_num_tokens": 473184384.0, "z_loss": 0.0005325399688445032 }, { "copy_logits_max": -2.5510573387145996, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.625, "epoch": 1.1729895328057187, "gen_logits_max": 3.6523971557617188, "gen_logits_mean": -15.958446502685547, "gen_logits_min": -27.42656135559082, "gen_logits_std": 2.960756540298462, "gen_loss": 0.26107466220855713, "grad_norm": 0.3822532060100546, "learning_rate": 2.3675368421052634e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9957273304462433, "mean_gen_accuracy": 0.8763241171836853, "mean_token_accuracy": 0.9036831259727478, "num_tokens": 473426846.0, "sample_num_tokens": 9459.5, "step": 5743, "total_num_tokens": 473464684.0, "z_loss": 0.0004946797853335738 }, { "copy_logits_max": -2.6424272060394287, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.0625, "epoch": 1.1731937707429154, "gen_logits_max": 5.102835178375244, "gen_logits_mean": -14.883865356445312, "gen_logits_min": -26.6114501953125, "gen_logits_std": 2.955326557159424, "gen_loss": 0.3010480999946594, "grad_norm": 0.3572725887731089, "learning_rate": 2.3674105263157895e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9954332560300827, "mean_gen_accuracy": 0.8846512585878372, "mean_token_accuracy": 0.9096473604440689, "num_tokens": 473707087.0, "sample_num_tokens": 8107.75, "step": 5744, "total_num_tokens": 473739518.0, "z_loss": 0.0005986859323456883 }, { "copy_logits_max": -1.9364681243896484, "copy_logits_min": -687500032.0, "copy_num_tokens": 589.75, "epoch": 1.1733980086801123, "gen_logits_max": 4.038866996765137, "gen_logits_mean": -15.453625679016113, "gen_logits_min": -27.44440269470215, "gen_logits_std": 2.9781415462493896, "gen_loss": 0.270878404378891, "grad_norm": 0.35961196085152564, "learning_rate": 2.367284210526316e-05, "loss": 0.2585, "mean_copy_accuracy": 0.9971911609172821, "mean_gen_accuracy": 0.8757269084453583, "mean_token_accuracy": 0.9121650010347366, "num_tokens": 473976980.0, "sample_num_tokens": 9208.5, "step": 5745, "total_num_tokens": 474013814.0, "z_loss": 0.0005528914043679833 }, { "copy_logits_max": -1.5570440292358398, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.625, "epoch": 1.173602246617309, "gen_logits_max": 5.199742794036865, "gen_logits_mean": -14.405670166015625, "gen_logits_min": -26.470809936523438, "gen_logits_std": 2.9255545139312744, "gen_loss": 0.2910122275352478, "grad_norm": 0.3732879010535379, "learning_rate": 2.367157894736842e-05, "loss": 0.288, "mean_copy_accuracy": 0.9963091909885406, "mean_gen_accuracy": 0.8738467246294022, "mean_token_accuracy": 0.9036141782999039, "num_tokens": 474240214.0, "sample_num_tokens": 8197.5, "step": 5746, "total_num_tokens": 474273004.0, "z_loss": 0.0005851613241247833 }, { "copy_logits_max": -2.2400994300842285, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.1875, "epoch": 1.173806484554506, "gen_logits_max": 4.257624626159668, "gen_logits_mean": -15.329606056213379, "gen_logits_min": -27.14739990234375, "gen_logits_std": 2.960623264312744, "gen_loss": 0.2787228226661682, "grad_norm": 0.37359959656473024, "learning_rate": 2.3670315789473684e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9959895610809326, "mean_gen_accuracy": 0.8726593852043152, "mean_token_accuracy": 0.9015567451715469, "num_tokens": 474511607.0, "sample_num_tokens": 7443.25, "step": 5747, "total_num_tokens": 474541380.0, "z_loss": 0.0005238603334873915 }, { "copy_logits_max": -1.971589207649231, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.75, "epoch": 1.174010722491703, "gen_logits_max": 5.363462448120117, "gen_logits_mean": -14.737499237060547, "gen_logits_min": -26.60064697265625, "gen_logits_std": 2.9507904052734375, "gen_loss": 0.2508653998374939, "grad_norm": 0.3896875896699817, "learning_rate": 2.3669052631578945e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9970820397138596, "mean_gen_accuracy": 0.8749924600124359, "mean_token_accuracy": 0.903734490275383, "num_tokens": 474771556.0, "sample_num_tokens": 8860.5, "step": 5748, "total_num_tokens": 474806998.0, "z_loss": 0.0004967474960722029 }, { "copy_logits_max": -3.9463388919830322, "copy_logits_min": -750000064.0, "copy_num_tokens": 450.625, "epoch": 1.1742149604288996, "gen_logits_max": 4.0086445808410645, "gen_logits_mean": -15.897102355957031, "gen_logits_min": -28.219934463500977, "gen_logits_std": 3.0231518745422363, "gen_loss": 0.26511338353157043, "grad_norm": 0.3734807915647827, "learning_rate": 2.3667789473684213e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9958320707082748, "mean_gen_accuracy": 0.8746104389429092, "mean_token_accuracy": 0.9038151800632477, "num_tokens": 475038855.0, "sample_num_tokens": 7982.25, "step": 5749, "total_num_tokens": 475070784.0, "z_loss": 0.0004633246280718595 }, { "copy_logits_max": -5.754734992980957, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.5625, "epoch": 1.1744191983660965, "gen_logits_max": 4.876141548156738, "gen_logits_mean": -15.215936660766602, "gen_logits_min": -27.08112335205078, "gen_logits_std": 2.9537954330444336, "gen_loss": 0.30424803495407104, "grad_norm": 0.3532251846348554, "learning_rate": 2.3666526315789474e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9965334683656693, "mean_gen_accuracy": 0.8739817291498184, "mean_token_accuracy": 0.9048526734113693, "num_tokens": 475317356.0, "sample_num_tokens": 9174.5, "step": 5750, "total_num_tokens": 475354054.0, "z_loss": 0.0005491678602993488 }, { "copy_logits_max": -6.333714008331299, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.375, "epoch": 1.1746234363032932, "gen_logits_max": 5.230192184448242, "gen_logits_mean": -13.842391014099121, "gen_logits_min": -25.992511749267578, "gen_logits_std": 2.899690628051758, "gen_loss": 0.24986767768859863, "grad_norm": 0.35662052367261066, "learning_rate": 2.3665263157894738e-05, "loss": 0.2553, "mean_copy_accuracy": 0.9965478479862213, "mean_gen_accuracy": 0.883720800280571, "mean_token_accuracy": 0.912981241941452, "num_tokens": 475584420.0, "sample_num_tokens": 7087.5, "step": 5751, "total_num_tokens": 475612770.0, "z_loss": 0.0004827976517844945 }, { "copy_logits_max": -2.75925874710083, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.875, "epoch": 1.1748276742404902, "gen_logits_max": 4.9334516525268555, "gen_logits_mean": -15.190267562866211, "gen_logits_min": -26.797740936279297, "gen_logits_std": 2.9466404914855957, "gen_loss": 0.3224528431892395, "grad_norm": 0.3880094019956389, "learning_rate": 2.3664e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9959236830472946, "mean_gen_accuracy": 0.8763182759284973, "mean_token_accuracy": 0.9009383916854858, "num_tokens": 475839190.0, "sample_num_tokens": 7923.5, "step": 5752, "total_num_tokens": 475870884.0, "z_loss": 0.0005809340509586036 }, { "copy_logits_max": -4.331893444061279, "copy_logits_min": -625000064.0, "copy_num_tokens": 378.625, "epoch": 1.175031912177687, "gen_logits_max": 4.16131591796875, "gen_logits_mean": -15.737486839294434, "gen_logits_min": -27.717561721801758, "gen_logits_std": 2.9870142936706543, "gen_loss": 0.2759445309638977, "grad_norm": 0.40769862422858194, "learning_rate": 2.3662736842105263e-05, "loss": 0.262, "mean_copy_accuracy": 0.9950792044401169, "mean_gen_accuracy": 0.8776207268238068, "mean_token_accuracy": 0.9088533520698547, "num_tokens": 476118174.0, "sample_num_tokens": 7466.0, "step": 5753, "total_num_tokens": 476148038.0, "z_loss": 0.0005696348380297422 }, { "copy_logits_max": -5.698467254638672, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.5, "epoch": 1.1752361501148838, "gen_logits_max": 4.390429496765137, "gen_logits_mean": -16.048484802246094, "gen_logits_min": -27.434083938598633, "gen_logits_std": 2.9507415294647217, "gen_loss": 0.29234927892684937, "grad_norm": 0.38822342557589457, "learning_rate": 2.3661473684210528e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9952501952648163, "mean_gen_accuracy": 0.8804667145013809, "mean_token_accuracy": 0.9051600843667984, "num_tokens": 476380803.0, "sample_num_tokens": 8588.25, "step": 5754, "total_num_tokens": 476415156.0, "z_loss": 0.0005751776043325663 }, { "copy_logits_max": -4.123918533325195, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.5, "epoch": 1.1754403880520807, "gen_logits_max": 3.864135265350342, "gen_logits_mean": -15.32828140258789, "gen_logits_min": -27.277320861816406, "gen_logits_std": 2.972368001937866, "gen_loss": 0.2793886065483093, "grad_norm": 0.37174612930042306, "learning_rate": 2.366021052631579e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9967929422855377, "mean_gen_accuracy": 0.8718455284833908, "mean_token_accuracy": 0.9052560329437256, "num_tokens": 476662387.0, "sample_num_tokens": 8801.75, "step": 5755, "total_num_tokens": 476697594.0, "z_loss": 0.0005627584178000689 }, { "copy_logits_max": -4.401985168457031, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.5625, "epoch": 1.1756446259892774, "gen_logits_max": 5.127285957336426, "gen_logits_mean": -13.654438018798828, "gen_logits_min": -25.10905647277832, "gen_logits_std": 2.82731294631958, "gen_loss": 0.28614309430122375, "grad_norm": 0.3777379289331923, "learning_rate": 2.3658947368421053e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9959452450275421, "mean_gen_accuracy": 0.8792422711849213, "mean_token_accuracy": 0.9079063683748245, "num_tokens": 476936983.0, "sample_num_tokens": 8367.75, "step": 5756, "total_num_tokens": 476970454.0, "z_loss": 0.0005954239168204367 }, { "copy_logits_max": -4.700045108795166, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.8125, "epoch": 1.1758488639264744, "gen_logits_max": 4.033303260803223, "gen_logits_mean": -14.884353637695312, "gen_logits_min": -26.295400619506836, "gen_logits_std": 2.874156951904297, "gen_loss": 0.2926870584487915, "grad_norm": 0.374612900058416, "learning_rate": 2.3657684210526317e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9972722232341766, "mean_gen_accuracy": 0.8698901683092117, "mean_token_accuracy": 0.9053899496793747, "num_tokens": 477221504.0, "sample_num_tokens": 7822.5, "step": 5757, "total_num_tokens": 477252794.0, "z_loss": 0.0006215897738002241 }, { "copy_logits_max": -3.76607084274292, "copy_logits_min": -687500032.0, "copy_num_tokens": 555.375, "epoch": 1.1760531018636713, "gen_logits_max": 4.097539901733398, "gen_logits_mean": -14.400833129882812, "gen_logits_min": -25.91284942626953, "gen_logits_std": 2.881369113922119, "gen_loss": 0.2557802200317383, "grad_norm": 0.40145043175639883, "learning_rate": 2.365642105263158e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9956510365009308, "mean_gen_accuracy": 0.8779002875089645, "mean_token_accuracy": 0.9065933376550674, "num_tokens": 477490457.0, "sample_num_tokens": 9341.75, "step": 5758, "total_num_tokens": 477527824.0, "z_loss": 0.0005054770736023784 }, { "copy_logits_max": -4.87009334564209, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.0625, "epoch": 1.176257339800868, "gen_logits_max": 4.105837821960449, "gen_logits_mean": -16.16421127319336, "gen_logits_min": -27.670806884765625, "gen_logits_std": 2.939866065979004, "gen_loss": 0.26719844341278076, "grad_norm": 0.37803272731141324, "learning_rate": 2.3655157894736842e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9956829249858856, "mean_gen_accuracy": 0.8735774457454681, "mean_token_accuracy": 0.9048769623041153, "num_tokens": 477768992.0, "sample_num_tokens": 9989.5, "step": 5759, "total_num_tokens": 477808950.0, "z_loss": 0.0005090429913252592 }, { "copy_logits_max": -6.268886566162109, "copy_logits_min": -750000000.0, "copy_num_tokens": 243.9375, "epoch": 1.176461577738065, "gen_logits_max": 5.727982521057129, "gen_logits_mean": -14.513344764709473, "gen_logits_min": -25.781444549560547, "gen_logits_std": 2.885517120361328, "gen_loss": 0.30633243918418884, "grad_norm": 0.37839508428285895, "learning_rate": 2.3653894736842107e-05, "loss": 0.2829, "mean_copy_accuracy": 0.995362713932991, "mean_gen_accuracy": 0.8747986555099487, "mean_token_accuracy": 0.9034627228975296, "num_tokens": 478044152.0, "sample_num_tokens": 7500.0, "step": 5760, "total_num_tokens": 478074152.0, "z_loss": 0.0005486682057380676 }, { "copy_logits_max": -4.011776447296143, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.125, "epoch": 1.1766658156752616, "gen_logits_max": 4.117587089538574, "gen_logits_mean": -15.661050796508789, "gen_logits_min": -27.42340850830078, "gen_logits_std": 2.9228641986846924, "gen_loss": 0.32650959491729736, "grad_norm": 0.4155018795786162, "learning_rate": 2.3652631578947368e-05, "loss": 0.2825, "mean_copy_accuracy": 0.995647743344307, "mean_gen_accuracy": 0.87643001973629, "mean_token_accuracy": 0.9042833149433136, "num_tokens": 478280343.0, "sample_num_tokens": 7173.25, "step": 5761, "total_num_tokens": 478309036.0, "z_loss": 0.0006304820999503136 }, { "copy_logits_max": -4.226043701171875, "copy_logits_min": -687500032.0, "copy_num_tokens": 634.5, "epoch": 1.1768700536124586, "gen_logits_max": 4.270565509796143, "gen_logits_mean": -14.81042766571045, "gen_logits_min": -26.344079971313477, "gen_logits_std": 2.9111111164093018, "gen_loss": 0.30149394273757935, "grad_norm": 0.39092001292652934, "learning_rate": 2.3651368421052632e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9964669197797775, "mean_gen_accuracy": 0.8774341493844986, "mean_token_accuracy": 0.9064399152994156, "num_tokens": 478546569.0, "sample_num_tokens": 9989.25, "step": 5762, "total_num_tokens": 478586526.0, "z_loss": 0.0005588515195995569 }, { "copy_logits_max": -4.604618549346924, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.9375, "epoch": 1.1770742915496553, "gen_logits_max": 3.4810667037963867, "gen_logits_mean": -16.21558380126953, "gen_logits_min": -27.668460845947266, "gen_logits_std": 2.926091194152832, "gen_loss": 0.2801797389984131, "grad_norm": 0.4176561398395509, "learning_rate": 2.3650105263157893e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9970458000898361, "mean_gen_accuracy": 0.8821941763162613, "mean_token_accuracy": 0.9064106941223145, "num_tokens": 478815942.0, "sample_num_tokens": 7940.5, "step": 5763, "total_num_tokens": 478847704.0, "z_loss": 0.0005180753651075065 }, { "copy_logits_max": -3.3179564476013184, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0, "epoch": 1.1772785294868522, "gen_logits_max": 5.064602375030518, "gen_logits_mean": -13.395198822021484, "gen_logits_min": -24.72835350036621, "gen_logits_std": 2.8557541370391846, "gen_loss": 0.28429001569747925, "grad_norm": 0.3678734931748143, "learning_rate": 2.3648842105263157e-05, "loss": 0.2749, "mean_copy_accuracy": 0.996993362903595, "mean_gen_accuracy": 0.8721832931041718, "mean_token_accuracy": 0.9061286002397537, "num_tokens": 479101771.0, "sample_num_tokens": 7734.25, "step": 5764, "total_num_tokens": 479132708.0, "z_loss": 0.0005326969549059868 }, { "copy_logits_max": -5.59673547744751, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.6875, "epoch": 1.1774827674240491, "gen_logits_max": 4.558365821838379, "gen_logits_mean": -14.855257987976074, "gen_logits_min": -26.172956466674805, "gen_logits_std": 2.8646726608276367, "gen_loss": 0.3047987222671509, "grad_norm": 0.43351853045321703, "learning_rate": 2.364757894736842e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9950185567140579, "mean_gen_accuracy": 0.8753781765699387, "mean_token_accuracy": 0.9055614173412323, "num_tokens": 479367052.0, "sample_num_tokens": 7046.0, "step": 5765, "total_num_tokens": 479395236.0, "z_loss": 0.0005027618608437479 }, { "copy_logits_max": -6.421442031860352, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.125, "epoch": 1.1776870053612458, "gen_logits_max": 4.118049621582031, "gen_logits_mean": -17.0699462890625, "gen_logits_min": -28.598899841308594, "gen_logits_std": 2.967970609664917, "gen_loss": 0.3270416855812073, "grad_norm": 0.390823880126051, "learning_rate": 2.3646315789473686e-05, "loss": 0.293, "mean_copy_accuracy": 0.9957253187894821, "mean_gen_accuracy": 0.8746354579925537, "mean_token_accuracy": 0.9013805240392685, "num_tokens": 479627026.0, "sample_num_tokens": 7919.0, "step": 5766, "total_num_tokens": 479658702.0, "z_loss": 0.0006003800081089139 }, { "copy_logits_max": -6.098430156707764, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.0, "epoch": 1.1778912432984427, "gen_logits_max": 4.907571315765381, "gen_logits_mean": -15.96146297454834, "gen_logits_min": -27.295551300048828, "gen_logits_std": 2.8934144973754883, "gen_loss": 0.2956327795982361, "grad_norm": 0.4150268065412334, "learning_rate": 2.364505263157895e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9957452565431595, "mean_gen_accuracy": 0.877368837594986, "mean_token_accuracy": 0.9039740413427353, "num_tokens": 479885672.0, "sample_num_tokens": 7525.0, "step": 5767, "total_num_tokens": 479915772.0, "z_loss": 0.0005145344184711576 }, { "copy_logits_max": -5.1581525802612305, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.0625, "epoch": 1.1780954812356395, "gen_logits_max": 4.428108215332031, "gen_logits_mean": -16.254802703857422, "gen_logits_min": -28.009841918945312, "gen_logits_std": 2.924894332885742, "gen_loss": 0.30027344822883606, "grad_norm": 0.3970217566229863, "learning_rate": 2.364378947368421e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9965108782052994, "mean_gen_accuracy": 0.8727664649486542, "mean_token_accuracy": 0.8997247815132141, "num_tokens": 480136778.0, "sample_num_tokens": 7240.0, "step": 5768, "total_num_tokens": 480165738.0, "z_loss": 0.0005646118079312146 }, { "copy_logits_max": -2.736281394958496, "copy_logits_min": -750000000.0, "copy_num_tokens": 691.1875, "epoch": 1.1782997191728364, "gen_logits_max": 3.9250826835632324, "gen_logits_mean": -14.816854476928711, "gen_logits_min": -26.48478889465332, "gen_logits_std": 2.920233964920044, "gen_loss": 0.2639652490615845, "grad_norm": 0.3684680876830596, "learning_rate": 2.3642526315789475e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9971062690019608, "mean_gen_accuracy": 0.878046065568924, "mean_token_accuracy": 0.910544753074646, "num_tokens": 480420479.0, "sample_num_tokens": 9747.25, "step": 5769, "total_num_tokens": 480459468.0, "z_loss": 0.00048499496188014746 }, { "copy_logits_max": -5.480440139770508, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.0, "epoch": 1.178503957110033, "gen_logits_max": 3.856780529022217, "gen_logits_mean": -15.92093276977539, "gen_logits_min": -27.268417358398438, "gen_logits_std": 2.9149346351623535, "gen_loss": 0.2934395670890808, "grad_norm": 0.4094393411785772, "learning_rate": 2.3641263157894736e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9951608330011368, "mean_gen_accuracy": 0.8783862143754959, "mean_token_accuracy": 0.9060362130403519, "num_tokens": 480700829.0, "sample_num_tokens": 9404.75, "step": 5770, "total_num_tokens": 480738448.0, "z_loss": 0.00047186139272525907 }, { "copy_logits_max": -2.6102330684661865, "copy_logits_min": -750000064.0, "copy_num_tokens": 472.4375, "epoch": 1.17870819504723, "gen_logits_max": 3.3943052291870117, "gen_logits_mean": -17.069124221801758, "gen_logits_min": -28.39651107788086, "gen_logits_std": 2.9678335189819336, "gen_loss": 0.2927706241607666, "grad_norm": 0.4113535089110958, "learning_rate": 2.364e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9958728402853012, "mean_gen_accuracy": 0.8661022335290909, "mean_token_accuracy": 0.8964230418205261, "num_tokens": 480962801.0, "sample_num_tokens": 8420.75, "step": 5771, "total_num_tokens": 480996484.0, "z_loss": 0.0005822601960971951 }, { "copy_logits_max": -1.6748124361038208, "copy_logits_min": -687500032.0, "copy_num_tokens": 478.4375, "epoch": 1.178912432984427, "gen_logits_max": 3.65490460395813, "gen_logits_mean": -15.939159393310547, "gen_logits_min": -27.39788818359375, "gen_logits_std": 2.931974172592163, "gen_loss": 0.28000980615615845, "grad_norm": 0.35753931414661544, "learning_rate": 2.363873684210526e-05, "loss": 0.268, "mean_copy_accuracy": 0.9962819814682007, "mean_gen_accuracy": 0.878111332654953, "mean_token_accuracy": 0.9095685631036758, "num_tokens": 481244265.0, "sample_num_tokens": 8266.75, "step": 5772, "total_num_tokens": 481277332.0, "z_loss": 0.0005796593613922596 }, { "copy_logits_max": -1.7234187126159668, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.5625, "epoch": 1.1791166709216236, "gen_logits_max": 4.302027702331543, "gen_logits_mean": -14.927925109863281, "gen_logits_min": -27.090972900390625, "gen_logits_std": 2.9527182579040527, "gen_loss": 0.2806589603424072, "grad_norm": 0.40412128906682493, "learning_rate": 2.363747368421053e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9966260641813278, "mean_gen_accuracy": 0.8729941844940186, "mean_token_accuracy": 0.906286209821701, "num_tokens": 481523161.0, "sample_num_tokens": 9543.25, "step": 5773, "total_num_tokens": 481561334.0, "z_loss": 0.0005827982677146792 }, { "copy_logits_max": -2.2245781421661377, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.9375, "epoch": 1.1793209088588206, "gen_logits_max": 5.262924671173096, "gen_logits_mean": -14.567827224731445, "gen_logits_min": -26.163692474365234, "gen_logits_std": 2.8947746753692627, "gen_loss": 0.284015417098999, "grad_norm": 0.3527211385800945, "learning_rate": 2.363621052631579e-05, "loss": 0.2534, "mean_copy_accuracy": 0.9963241070508957, "mean_gen_accuracy": 0.8853955566883087, "mean_token_accuracy": 0.9123937636613846, "num_tokens": 481805869.0, "sample_num_tokens": 7616.25, "step": 5774, "total_num_tokens": 481836334.0, "z_loss": 0.000568182033021003 }, { "copy_logits_max": -3.6180124282836914, "copy_logits_min": -750000064.0, "copy_num_tokens": 388.5, "epoch": 1.1795251467960173, "gen_logits_max": 4.509416580200195, "gen_logits_mean": -15.269383430480957, "gen_logits_min": -26.451799392700195, "gen_logits_std": 2.8950915336608887, "gen_loss": 0.29114824533462524, "grad_norm": 0.3720321177026109, "learning_rate": 2.3634947368421055e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9957912415266037, "mean_gen_accuracy": 0.8781691491603851, "mean_token_accuracy": 0.9067907482385635, "num_tokens": 482088832.0, "sample_num_tokens": 7568.0, "step": 5775, "total_num_tokens": 482119104.0, "z_loss": 0.0005691916448995471 }, { "copy_logits_max": -0.823004961013794, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.5, "epoch": 1.1797293847332142, "gen_logits_max": 5.064234733581543, "gen_logits_mean": -13.060601234436035, "gen_logits_min": -24.64959716796875, "gen_logits_std": 2.8408193588256836, "gen_loss": 0.26415759325027466, "grad_norm": 0.4021151533240661, "learning_rate": 2.3633684210526315e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9960260391235352, "mean_gen_accuracy": 0.8741108626127243, "mean_token_accuracy": 0.904007613658905, "num_tokens": 482363702.0, "sample_num_tokens": 8445.5, "step": 5776, "total_num_tokens": 482397484.0, "z_loss": 0.0005267475498840213 }, { "copy_logits_max": 0.31023016571998596, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.875, "epoch": 1.179933622670411, "gen_logits_max": 5.968064308166504, "gen_logits_mean": -13.178159713745117, "gen_logits_min": -24.563941955566406, "gen_logits_std": 2.877138614654541, "gen_loss": 0.269397497177124, "grad_norm": 0.3850475925868542, "learning_rate": 2.363242105263158e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9970240741968155, "mean_gen_accuracy": 0.8820114880800247, "mean_token_accuracy": 0.9096136838197708, "num_tokens": 482631301.0, "sample_num_tokens": 8781.75, "step": 5777, "total_num_tokens": 482666428.0, "z_loss": 0.0005683295312337577 }, { "copy_logits_max": 0.694934606552124, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.125, "epoch": 1.1801378606076078, "gen_logits_max": 5.045322418212891, "gen_logits_mean": -13.887284278869629, "gen_logits_min": -25.537704467773438, "gen_logits_std": 2.8717918395996094, "gen_loss": 0.28738635778427124, "grad_norm": 0.34552822899761626, "learning_rate": 2.363115789473684e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9970249831676483, "mean_gen_accuracy": 0.8710148483514786, "mean_token_accuracy": 0.90366430580616, "num_tokens": 482918274.0, "sample_num_tokens": 8978.0, "step": 5778, "total_num_tokens": 482954186.0, "z_loss": 0.0006040963344275951 }, { "copy_logits_max": 0.6402261257171631, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.8125, "epoch": 1.1803420985448048, "gen_logits_max": 5.469721794128418, "gen_logits_mean": -12.908943176269531, "gen_logits_min": -25.172409057617188, "gen_logits_std": 2.81596302986145, "gen_loss": 0.3087449073791504, "grad_norm": 0.3741627148266392, "learning_rate": 2.3629894736842105e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9967767596244812, "mean_gen_accuracy": 0.8737123906612396, "mean_token_accuracy": 0.9045386016368866, "num_tokens": 483180443.0, "sample_num_tokens": 8433.25, "step": 5779, "total_num_tokens": 483214176.0, "z_loss": 0.0006228815182112157 }, { "copy_logits_max": -2.8587706089019775, "copy_logits_min": -687500032.0, "copy_num_tokens": 395.0, "epoch": 1.1805463364820015, "gen_logits_max": 4.441702842712402, "gen_logits_mean": -14.511979103088379, "gen_logits_min": -25.864591598510742, "gen_logits_std": 2.887540578842163, "gen_loss": 0.28698936104774475, "grad_norm": 0.40420609493375, "learning_rate": 2.362863157894737e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9959741085767746, "mean_gen_accuracy": 0.8721596598625183, "mean_token_accuracy": 0.9000243246555328, "num_tokens": 483418297.0, "sample_num_tokens": 7222.25, "step": 5780, "total_num_tokens": 483447186.0, "z_loss": 0.0005525760352611542 }, { "copy_logits_max": -3.5304999351501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.8125, "epoch": 1.1807505744191984, "gen_logits_max": 2.796919822692871, "gen_logits_mean": -17.364248275756836, "gen_logits_min": -29.175785064697266, "gen_logits_std": 3.009164810180664, "gen_loss": 0.24743424355983734, "grad_norm": 0.39854247963038447, "learning_rate": 2.3627368421052634e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9960796386003494, "mean_gen_accuracy": 0.8724219650030136, "mean_token_accuracy": 0.8996918499469757, "num_tokens": 483666464.0, "sample_num_tokens": 8466.5, "step": 5781, "total_num_tokens": 483700330.0, "z_loss": 0.0004657318932004273 }, { "copy_logits_max": -0.3324790298938751, "copy_logits_min": -625000064.0, "copy_num_tokens": 525.625, "epoch": 1.1809548123563953, "gen_logits_max": 5.4465131759643555, "gen_logits_mean": -12.447904586791992, "gen_logits_min": -25.231826782226562, "gen_logits_std": 2.8634698390960693, "gen_loss": 0.2240217626094818, "grad_norm": 0.4235963725468417, "learning_rate": 2.3626105263157898e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9948343336582184, "mean_gen_accuracy": 0.8758293092250824, "mean_token_accuracy": 0.9018208682537079, "num_tokens": 483918868.0, "sample_num_tokens": 8686.5, "step": 5782, "total_num_tokens": 483953614.0, "z_loss": 0.00042817796929739416 }, { "copy_logits_max": -3.6423885822296143, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.4375, "epoch": 1.181159050293592, "gen_logits_max": 5.019433498382568, "gen_logits_mean": -14.259073257446289, "gen_logits_min": -25.861204147338867, "gen_logits_std": 2.9189162254333496, "gen_loss": 0.29096919298171997, "grad_norm": 0.3462392488510981, "learning_rate": 2.362484210526316e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9963839650154114, "mean_gen_accuracy": 0.8702811598777771, "mean_token_accuracy": 0.9041750431060791, "num_tokens": 484199022.0, "sample_num_tokens": 8069.5, "step": 5783, "total_num_tokens": 484231300.0, "z_loss": 0.0005092429928481579 }, { "copy_logits_max": -3.7590129375457764, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.0, "epoch": 1.181363288230789, "gen_logits_max": 3.6434547901153564, "gen_logits_mean": -15.99232006072998, "gen_logits_min": -27.76776123046875, "gen_logits_std": 2.9838075637817383, "gen_loss": 0.2495444118976593, "grad_norm": 0.4125859248478503, "learning_rate": 2.3623578947368423e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9975935518741608, "mean_gen_accuracy": 0.8717479556798935, "mean_token_accuracy": 0.9053429812192917, "num_tokens": 484476114.0, "sample_num_tokens": 8979.0, "step": 5784, "total_num_tokens": 484512030.0, "z_loss": 0.00044952231110073626 }, { "copy_logits_max": -4.168485641479492, "copy_logits_min": -750000064.0, "copy_num_tokens": 732.5625, "epoch": 1.1815675261679857, "gen_logits_max": 3.2108230590820312, "gen_logits_mean": -14.953968048095703, "gen_logits_min": -26.49730682373047, "gen_logits_std": 2.892637014389038, "gen_loss": 0.26972323656082153, "grad_norm": 0.38935438238024705, "learning_rate": 2.3622315789473684e-05, "loss": 0.283, "mean_copy_accuracy": 0.9962505996227264, "mean_gen_accuracy": 0.8719278872013092, "mean_token_accuracy": 0.9043771773576736, "num_tokens": 484758270.0, "sample_num_tokens": 11116.0, "step": 5785, "total_num_tokens": 484802734.0, "z_loss": 0.00044726853957399726 }, { "copy_logits_max": -4.560104846954346, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.0625, "epoch": 1.1817717641051826, "gen_logits_max": 3.125507116317749, "gen_logits_mean": -16.223464965820312, "gen_logits_min": -28.252582550048828, "gen_logits_std": 2.9776999950408936, "gen_loss": 0.2773972749710083, "grad_norm": 0.37074021258570394, "learning_rate": 2.362105263157895e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9956089407205582, "mean_gen_accuracy": 0.8744560927152634, "mean_token_accuracy": 0.9046526253223419, "num_tokens": 485036835.0, "sample_num_tokens": 7864.25, "step": 5786, "total_num_tokens": 485068292.0, "z_loss": 0.0005467443843372166 }, { "copy_logits_max": -3.788682460784912, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.0, "epoch": 1.1819760020423793, "gen_logits_max": 3.387516975402832, "gen_logits_mean": -15.824249267578125, "gen_logits_min": -27.618770599365234, "gen_logits_std": 2.9507999420166016, "gen_loss": 0.2926030158996582, "grad_norm": 0.3787084741649392, "learning_rate": 2.361978947368421e-05, "loss": 0.278, "mean_copy_accuracy": 0.9962538182735443, "mean_gen_accuracy": 0.8704277873039246, "mean_token_accuracy": 0.9043729454278946, "num_tokens": 485299889.0, "sample_num_tokens": 8613.25, "step": 5787, "total_num_tokens": 485334342.0, "z_loss": 0.000526655581779778 }, { "copy_logits_max": -3.8050944805145264, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.8125, "epoch": 1.1821802399795762, "gen_logits_max": 5.726265907287598, "gen_logits_mean": -12.491630554199219, "gen_logits_min": -24.455039978027344, "gen_logits_std": 2.8648972511291504, "gen_loss": 0.3133547306060791, "grad_norm": 0.3851587353439355, "learning_rate": 2.3618526315789474e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9958837181329727, "mean_gen_accuracy": 0.8736273050308228, "mean_token_accuracy": 0.9030030518770218, "num_tokens": 485564313.0, "sample_num_tokens": 8398.75, "step": 5788, "total_num_tokens": 485597908.0, "z_loss": 0.000553738500457257 }, { "copy_logits_max": -5.066255569458008, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.375, "epoch": 1.1823844779167731, "gen_logits_max": 3.8835320472717285, "gen_logits_mean": -15.738359451293945, "gen_logits_min": -27.66773223876953, "gen_logits_std": 2.9578237533569336, "gen_loss": 0.27865222096443176, "grad_norm": 0.39402200618040095, "learning_rate": 2.3617263157894735e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9967335164546967, "mean_gen_accuracy": 0.875784158706665, "mean_token_accuracy": 0.9054122865200043, "num_tokens": 485839281.0, "sample_num_tokens": 9391.75, "step": 5789, "total_num_tokens": 485876848.0, "z_loss": 0.0004794556589331478 }, { "copy_logits_max": -5.925017833709717, "copy_logits_min": -750000064.0, "copy_num_tokens": 513.1875, "epoch": 1.1825887158539699, "gen_logits_max": 3.542708396911621, "gen_logits_mean": -16.475116729736328, "gen_logits_min": -28.300920486450195, "gen_logits_std": 2.9903993606567383, "gen_loss": 0.2566777765750885, "grad_norm": 0.3873507782864091, "learning_rate": 2.3616000000000002e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9963161200284958, "mean_gen_accuracy": 0.8763023614883423, "mean_token_accuracy": 0.9083136916160583, "num_tokens": 486115072.0, "sample_num_tokens": 9221.5, "step": 5790, "total_num_tokens": 486151958.0, "z_loss": 0.00046190066495910287 }, { "copy_logits_max": -3.1065545082092285, "copy_logits_min": -687500032.0, "copy_num_tokens": 542.25, "epoch": 1.1827929537911668, "gen_logits_max": 5.378204822540283, "gen_logits_mean": -13.22932243347168, "gen_logits_min": -25.744300842285156, "gen_logits_std": 2.8896260261535645, "gen_loss": 0.32945767045021057, "grad_norm": 0.39852143174216526, "learning_rate": 2.3614736842105263e-05, "loss": 0.3034, "mean_copy_accuracy": 0.996282160282135, "mean_gen_accuracy": 0.868373304605484, "mean_token_accuracy": 0.8995701223611832, "num_tokens": 486389375.0, "sample_num_tokens": 8875.25, "step": 5791, "total_num_tokens": 486424876.0, "z_loss": 0.0006548212841153145 }, { "copy_logits_max": -5.276183128356934, "copy_logits_min": -750000000.0, "copy_num_tokens": 239.4375, "epoch": 1.1829971917283635, "gen_logits_max": 5.693946838378906, "gen_logits_mean": -14.686756134033203, "gen_logits_min": -26.174400329589844, "gen_logits_std": 2.9187347888946533, "gen_loss": 0.31601718068122864, "grad_norm": 0.36282223417688453, "learning_rate": 2.3613473684210527e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9963722229003906, "mean_gen_accuracy": 0.8773442953824997, "mean_token_accuracy": 0.9062977731227875, "num_tokens": 486673070.0, "sample_num_tokens": 7042.0, "step": 5792, "total_num_tokens": 486701238.0, "z_loss": 0.0005969686317257583 }, { "copy_logits_max": -5.840554237365723, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.75, "epoch": 1.1832014296655604, "gen_logits_max": 3.470921516418457, "gen_logits_mean": -16.713727951049805, "gen_logits_min": -28.28592300415039, "gen_logits_std": 2.9719419479370117, "gen_loss": 0.2774404287338257, "grad_norm": 0.39039797718243174, "learning_rate": 2.3612210526315792e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9959565550088882, "mean_gen_accuracy": 0.8742204010486603, "mean_token_accuracy": 0.9024634808301926, "num_tokens": 486940934.0, "sample_num_tokens": 8639.5, "step": 5793, "total_num_tokens": 486975492.0, "z_loss": 0.000531640718691051 }, { "copy_logits_max": -4.164374828338623, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.0, "epoch": 1.1834056676027571, "gen_logits_max": 4.248935222625732, "gen_logits_mean": -15.423778533935547, "gen_logits_min": -27.591503143310547, "gen_logits_std": 2.9761836528778076, "gen_loss": 0.2647358775138855, "grad_norm": 0.41333367622384626, "learning_rate": 2.3610947368421053e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9957672357559204, "mean_gen_accuracy": 0.8754427582025528, "mean_token_accuracy": 0.9068155586719513, "num_tokens": 487214165.0, "sample_num_tokens": 9487.75, "step": 5794, "total_num_tokens": 487252116.0, "z_loss": 0.0005190412048250437 }, { "copy_logits_max": -5.407632827758789, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.875, "epoch": 1.183609905539954, "gen_logits_max": 4.812708377838135, "gen_logits_mean": -15.621845245361328, "gen_logits_min": -27.536787033081055, "gen_logits_std": 2.9393656253814697, "gen_loss": 0.3250240683555603, "grad_norm": 0.35822347601111343, "learning_rate": 2.3609684210526317e-05, "loss": 0.3077, "mean_copy_accuracy": 0.9966183453798294, "mean_gen_accuracy": 0.8707442134618759, "mean_token_accuracy": 0.896429568529129, "num_tokens": 487482237.0, "sample_num_tokens": 7632.25, "step": 5795, "total_num_tokens": 487512766.0, "z_loss": 0.0005918759270571172 }, { "copy_logits_max": -3.448876142501831, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.5625, "epoch": 1.183814143477151, "gen_logits_max": 4.648569107055664, "gen_logits_mean": -14.13276481628418, "gen_logits_min": -26.32880210876465, "gen_logits_std": 2.90762996673584, "gen_loss": 0.251718133687973, "grad_norm": 0.4229060734852553, "learning_rate": 2.3608421052631578e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9961443990468979, "mean_gen_accuracy": 0.882714718580246, "mean_token_accuracy": 0.9072341620922089, "num_tokens": 487736803.0, "sample_num_tokens": 8749.75, "step": 5796, "total_num_tokens": 487771802.0, "z_loss": 0.0005478954990394413 }, { "copy_logits_max": -2.4001963138580322, "copy_logits_min": -750000000.0, "copy_num_tokens": 773.875, "epoch": 1.1840183814143477, "gen_logits_max": 3.1821811199188232, "gen_logits_mean": -15.942743301391602, "gen_logits_min": -28.58047866821289, "gen_logits_std": 3.0303454399108887, "gen_loss": 0.2641940712928772, "grad_norm": 0.3821307234322218, "learning_rate": 2.3607157894736842e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9965090751647949, "mean_gen_accuracy": 0.8734631985425949, "mean_token_accuracy": 0.9072365909814835, "num_tokens": 488022145.0, "sample_num_tokens": 10438.75, "step": 5797, "total_num_tokens": 488063900.0, "z_loss": 0.0006803626311011612 }, { "copy_logits_max": -4.798489570617676, "copy_logits_min": -687500032.0, "copy_num_tokens": 412.125, "epoch": 1.1842226193515446, "gen_logits_max": 4.899542808532715, "gen_logits_mean": -13.986505508422852, "gen_logits_min": -26.23271942138672, "gen_logits_std": 2.8996710777282715, "gen_loss": 0.3100976049900055, "grad_norm": 0.3647853897718789, "learning_rate": 2.3605894736842107e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9972298443317413, "mean_gen_accuracy": 0.8722285628318787, "mean_token_accuracy": 0.9035586416721344, "num_tokens": 488306821.0, "sample_num_tokens": 7884.75, "step": 5798, "total_num_tokens": 488338360.0, "z_loss": 0.0006541463080793619 }, { "copy_logits_max": -4.875843048095703, "copy_logits_min": -687500032.0, "copy_num_tokens": 390.75, "epoch": 1.1844268572887413, "gen_logits_max": 4.201347351074219, "gen_logits_mean": -16.572551727294922, "gen_logits_min": -29.23847007751465, "gen_logits_std": 3.007115364074707, "gen_loss": 0.28562021255493164, "grad_norm": 0.37448826754380654, "learning_rate": 2.360463157894737e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9970399141311646, "mean_gen_accuracy": 0.8735046684741974, "mean_token_accuracy": 0.9034439772367477, "num_tokens": 488576151.0, "sample_num_tokens": 7791.25, "step": 5799, "total_num_tokens": 488607316.0, "z_loss": 0.0005944902077317238 }, { "copy_logits_max": -1.5398776531219482, "copy_logits_min": -687500032.0, "copy_num_tokens": 769.5625, "epoch": 1.1846310952259382, "gen_logits_max": 3.4498159885406494, "gen_logits_mean": -15.58079719543457, "gen_logits_min": -28.505687713623047, "gen_logits_std": 3.036731719970703, "gen_loss": 0.26497572660446167, "grad_norm": 0.35903682065321396, "learning_rate": 2.3603368421052632e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9961723536252975, "mean_gen_accuracy": 0.8764804154634476, "mean_token_accuracy": 0.9100361615419388, "num_tokens": 488858998.0, "sample_num_tokens": 10087.0, "step": 5800, "total_num_tokens": 488899346.0, "z_loss": 0.0005704649956896901 }, { "copy_logits_max": -4.872743606567383, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.25, "epoch": 1.184835333163135, "gen_logits_max": 3.3416996002197266, "gen_logits_mean": -16.767501831054688, "gen_logits_min": -29.320749282836914, "gen_logits_std": 3.0393338203430176, "gen_loss": 0.2334948629140854, "grad_norm": 0.3575435141913296, "learning_rate": 2.3602105263157896e-05, "loss": 0.2631, "mean_copy_accuracy": 0.9962590485811234, "mean_gen_accuracy": 0.8854624629020691, "mean_token_accuracy": 0.9118987172842026, "num_tokens": 489145335.0, "sample_num_tokens": 9055.75, "step": 5801, "total_num_tokens": 489181558.0, "z_loss": 0.0004364543710835278 }, { "copy_logits_max": -5.258303642272949, "copy_logits_min": -750000000.0, "copy_num_tokens": 663.8125, "epoch": 1.1850395711003319, "gen_logits_max": 3.54427433013916, "gen_logits_mean": -15.953917503356934, "gen_logits_min": -29.101089477539062, "gen_logits_std": 3.030801773071289, "gen_loss": 0.24934138357639313, "grad_norm": 0.37380523265666354, "learning_rate": 2.3600842105263157e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9969091862440109, "mean_gen_accuracy": 0.8755727559328079, "mean_token_accuracy": 0.9098426401615143, "num_tokens": 489446511.0, "sample_num_tokens": 10647.25, "step": 5802, "total_num_tokens": 489489100.0, "z_loss": 0.0004642088315449655 }, { "copy_logits_max": -3.190302610397339, "copy_logits_min": -687500032.0, "copy_num_tokens": 438.8125, "epoch": 1.1852438090375288, "gen_logits_max": 3.674879550933838, "gen_logits_mean": -16.360746383666992, "gen_logits_min": -28.85321044921875, "gen_logits_std": 3.003901243209839, "gen_loss": 0.34118393063545227, "grad_norm": 0.4591767819956501, "learning_rate": 2.359957894736842e-05, "loss": 0.3108, "mean_copy_accuracy": 0.996240884065628, "mean_gen_accuracy": 0.8637544363737106, "mean_token_accuracy": 0.8950227946043015, "num_tokens": 489727543.0, "sample_num_tokens": 8499.75, "step": 5803, "total_num_tokens": 489761542.0, "z_loss": 0.0006297392537817359 }, { "copy_logits_max": -2.4436686038970947, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.8125, "epoch": 1.1854480469747255, "gen_logits_max": 3.7329368591308594, "gen_logits_mean": -15.81515884399414, "gen_logits_min": -28.168684005737305, "gen_logits_std": 3.0356504917144775, "gen_loss": 0.2681030035018921, "grad_norm": 0.38831772993314745, "learning_rate": 2.3598315789473682e-05, "loss": 0.3022, "mean_copy_accuracy": 0.996506080031395, "mean_gen_accuracy": 0.8685648143291473, "mean_token_accuracy": 0.8995444774627686, "num_tokens": 490005388.0, "sample_num_tokens": 9652.0, "step": 5804, "total_num_tokens": 490043996.0, "z_loss": 0.0004657766257878393 }, { "copy_logits_max": -3.758134603500366, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.0, "epoch": 1.1856522849119224, "gen_logits_max": 4.53310489654541, "gen_logits_mean": -14.731969833374023, "gen_logits_min": -27.435916900634766, "gen_logits_std": 2.975029945373535, "gen_loss": 0.2652333378791809, "grad_norm": 0.34664535219252796, "learning_rate": 2.3597052631578947e-05, "loss": 0.2497, "mean_copy_accuracy": 0.9967566281557083, "mean_gen_accuracy": 0.8802521228790283, "mean_token_accuracy": 0.9153820723295212, "num_tokens": 490301596.0, "sample_num_tokens": 8464.5, "step": 5805, "total_num_tokens": 490335454.0, "z_loss": 0.0005076259840279818 }, { "copy_logits_max": -1.2952427864074707, "copy_logits_min": -687500096.0, "copy_num_tokens": 696.25, "epoch": 1.1858565228491191, "gen_logits_max": 3.2372875213623047, "gen_logits_mean": -15.01470947265625, "gen_logits_min": -28.003276824951172, "gen_logits_std": 2.9999606609344482, "gen_loss": 0.2650441527366638, "grad_norm": 0.4011000543541168, "learning_rate": 2.359578947368421e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9955958873033524, "mean_gen_accuracy": 0.8732576221227646, "mean_token_accuracy": 0.905893474817276, "num_tokens": 490573252.0, "sample_num_tokens": 9603.0, "step": 5806, "total_num_tokens": 490611664.0, "z_loss": 0.0004872577846981585 }, { "copy_logits_max": -3.5263772010803223, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.6875, "epoch": 1.186060760786316, "gen_logits_max": 3.8299756050109863, "gen_logits_mean": -15.763065338134766, "gen_logits_min": -27.946287155151367, "gen_logits_std": 2.996551275253296, "gen_loss": 0.3171994686126709, "grad_norm": 0.4188386462872386, "learning_rate": 2.3594526315789475e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9948757588863373, "mean_gen_accuracy": 0.874329000711441, "mean_token_accuracy": 0.902119517326355, "num_tokens": 490837609.0, "sample_num_tokens": 9451.75, "step": 5807, "total_num_tokens": 490875416.0, "z_loss": 0.0006106947548687458 }, { "copy_logits_max": -6.389599323272705, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.0625, "epoch": 1.186264998723513, "gen_logits_max": 4.638888835906982, "gen_logits_mean": -15.756925582885742, "gen_logits_min": -27.574565887451172, "gen_logits_std": 2.987259864807129, "gen_loss": 0.3041093349456787, "grad_norm": 0.40141418571924603, "learning_rate": 2.359326315789474e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9966331273317337, "mean_gen_accuracy": 0.8782065957784653, "mean_token_accuracy": 0.9049785882234573, "num_tokens": 491104699.0, "sample_num_tokens": 8385.25, "step": 5808, "total_num_tokens": 491138240.0, "z_loss": 0.0005644776974804699 }, { "copy_logits_max": -4.269653797149658, "copy_logits_min": -687500032.0, "copy_num_tokens": 556.0, "epoch": 1.1864692366607097, "gen_logits_max": 4.154225826263428, "gen_logits_mean": -15.281335830688477, "gen_logits_min": -27.987098693847656, "gen_logits_std": 3.0145671367645264, "gen_loss": 0.2783403992652893, "grad_norm": 0.380139064537578, "learning_rate": 2.3592e-05, "loss": 0.277, "mean_copy_accuracy": 0.9967732578516006, "mean_gen_accuracy": 0.8744557946920395, "mean_token_accuracy": 0.9073390513658524, "num_tokens": 491382981.0, "sample_num_tokens": 8529.25, "step": 5809, "total_num_tokens": 491417098.0, "z_loss": 0.0005731737473979592 }, { "copy_logits_max": -4.861152648925781, "copy_logits_min": -625000064.0, "copy_num_tokens": 523.3125, "epoch": 1.1866734745979066, "gen_logits_max": 3.5179290771484375, "gen_logits_mean": -16.9846134185791, "gen_logits_min": -29.183246612548828, "gen_logits_std": 3.065544843673706, "gen_loss": 0.28869304060935974, "grad_norm": 0.40587228968554606, "learning_rate": 2.3590736842105265e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9957799017429352, "mean_gen_accuracy": 0.8733624070882797, "mean_token_accuracy": 0.9001506716012955, "num_tokens": 491642995.0, "sample_num_tokens": 9252.25, "step": 5810, "total_num_tokens": 491680004.0, "z_loss": 0.0005488833412528038 }, { "copy_logits_max": -3.6931540966033936, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.375, "epoch": 1.1868777125351033, "gen_logits_max": 3.635162830352783, "gen_logits_mean": -15.899835586547852, "gen_logits_min": -28.0706844329834, "gen_logits_std": 3.0315322875976562, "gen_loss": 0.28162699937820435, "grad_norm": 0.3880612134045641, "learning_rate": 2.3589473684210526e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9964569061994553, "mean_gen_accuracy": 0.8702335953712463, "mean_token_accuracy": 0.9026335924863815, "num_tokens": 491928159.0, "sample_num_tokens": 9027.75, "step": 5811, "total_num_tokens": 491964270.0, "z_loss": 0.0005118082626722753 }, { "copy_logits_max": -3.790541172027588, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5625, "epoch": 1.1870819504723003, "gen_logits_max": 3.7517809867858887, "gen_logits_mean": -15.611151695251465, "gen_logits_min": -27.98138427734375, "gen_logits_std": 3.0563833713531494, "gen_loss": 0.2643722593784332, "grad_norm": 0.4038237338322315, "learning_rate": 2.358821052631579e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9953913241624832, "mean_gen_accuracy": 0.8739043623209, "mean_token_accuracy": 0.9020049124956131, "num_tokens": 492185629.0, "sample_num_tokens": 8659.25, "step": 5812, "total_num_tokens": 492220266.0, "z_loss": 0.0004736953997053206 }, { "copy_logits_max": -4.315594673156738, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.6875, "epoch": 1.1872861884094972, "gen_logits_max": 4.133070945739746, "gen_logits_mean": -15.186336517333984, "gen_logits_min": -27.378299713134766, "gen_logits_std": 3.0192036628723145, "gen_loss": 0.26903942227363586, "grad_norm": 0.3499703147569213, "learning_rate": 2.358694736842105e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9966907501220703, "mean_gen_accuracy": 0.8825145959854126, "mean_token_accuracy": 0.9105685651302338, "num_tokens": 492464812.0, "sample_num_tokens": 10264.0, "step": 5813, "total_num_tokens": 492505868.0, "z_loss": 0.00047986229765228927 }, { "copy_logits_max": -5.103271007537842, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.1875, "epoch": 1.1874904263466939, "gen_logits_max": 5.320810317993164, "gen_logits_mean": -14.670340538024902, "gen_logits_min": -26.732770919799805, "gen_logits_std": 3.0246329307556152, "gen_loss": 0.26166799664497375, "grad_norm": 0.3512560451797328, "learning_rate": 2.358568421052632e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9973461031913757, "mean_gen_accuracy": 0.8762911558151245, "mean_token_accuracy": 0.9073539525270462, "num_tokens": 492764016.0, "sample_num_tokens": 7477.0, "step": 5814, "total_num_tokens": 492793924.0, "z_loss": 0.0005003582336939871 }, { "copy_logits_max": -3.8059589862823486, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.1875, "epoch": 1.1876946642838908, "gen_logits_max": 4.520153522491455, "gen_logits_mean": -14.673315048217773, "gen_logits_min": -26.879661560058594, "gen_logits_std": 3.025099277496338, "gen_loss": 0.30300503969192505, "grad_norm": 0.419471303645128, "learning_rate": 2.358442105263158e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9960352778434753, "mean_gen_accuracy": 0.8739047646522522, "mean_token_accuracy": 0.904399573802948, "num_tokens": 493028762.0, "sample_num_tokens": 10050.5, "step": 5815, "total_num_tokens": 493068964.0, "z_loss": 0.0005848645232617855 }, { "copy_logits_max": -1.7204185724258423, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.6875, "epoch": 1.1878989022210875, "gen_logits_max": 6.369197368621826, "gen_logits_mean": -12.728026390075684, "gen_logits_min": -25.364717483520508, "gen_logits_std": 3.001284599304199, "gen_loss": 0.26124706864356995, "grad_norm": 0.4343726419808258, "learning_rate": 2.3583157894736844e-05, "loss": 0.279, "mean_copy_accuracy": 0.9964008182287216, "mean_gen_accuracy": 0.8753461539745331, "mean_token_accuracy": 0.9067089408636093, "num_tokens": 493305581.0, "sample_num_tokens": 8757.25, "step": 5816, "total_num_tokens": 493340610.0, "z_loss": 0.0004961096565239131 }, { "copy_logits_max": -3.0263679027557373, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.125, "epoch": 1.1881031401582844, "gen_logits_max": 3.4576056003570557, "gen_logits_mean": -16.756351470947266, "gen_logits_min": -28.910133361816406, "gen_logits_std": 3.084397315979004, "gen_loss": 0.28066200017929077, "grad_norm": 0.4013570273662884, "learning_rate": 2.3581894736842105e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9958354383707047, "mean_gen_accuracy": 0.8808012753725052, "mean_token_accuracy": 0.9091897755861282, "num_tokens": 493573442.0, "sample_num_tokens": 9452.0, "step": 5817, "total_num_tokens": 493611250.0, "z_loss": 0.0004997861688025296 }, { "copy_logits_max": -4.825876712799072, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.0, "epoch": 1.1883073780954811, "gen_logits_max": 4.29960298538208, "gen_logits_mean": -16.456823348999023, "gen_logits_min": -28.124610900878906, "gen_logits_std": 3.0373942852020264, "gen_loss": 0.2620190978050232, "grad_norm": 0.3676753312268676, "learning_rate": 2.358063157894737e-05, "loss": 0.2728, "mean_copy_accuracy": 0.9956230074167252, "mean_gen_accuracy": 0.8805637508630753, "mean_token_accuracy": 0.9069993048906326, "num_tokens": 493845699.0, "sample_num_tokens": 8179.75, "step": 5818, "total_num_tokens": 493878418.0, "z_loss": 0.0005060469266027212 }, { "copy_logits_max": -2.8160030841827393, "copy_logits_min": -687500032.0, "copy_num_tokens": 703.0, "epoch": 1.188511616032678, "gen_logits_max": 4.381619453430176, "gen_logits_mean": -15.725069046020508, "gen_logits_min": -27.861722946166992, "gen_logits_std": 3.100315570831299, "gen_loss": 0.2279275357723236, "grad_norm": 0.37949017380399286, "learning_rate": 2.357936842105263e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9963404834270477, "mean_gen_accuracy": 0.8769748210906982, "mean_token_accuracy": 0.9088316857814789, "num_tokens": 494127429.0, "sample_num_tokens": 10141.75, "step": 5819, "total_num_tokens": 494167996.0, "z_loss": 0.0004479695053305477 }, { "copy_logits_max": -3.900385856628418, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.8125, "epoch": 1.188715853969875, "gen_logits_max": 3.775521993637085, "gen_logits_mean": -16.965744018554688, "gen_logits_min": -29.215890884399414, "gen_logits_std": 3.089806079864502, "gen_loss": 0.2652474641799927, "grad_norm": 0.366821217609692, "learning_rate": 2.3578105263157894e-05, "loss": 0.2901, "mean_copy_accuracy": 0.996348038315773, "mean_gen_accuracy": 0.8696694076061249, "mean_token_accuracy": 0.8994590193033218, "num_tokens": 494404635.0, "sample_num_tokens": 7306.75, "step": 5820, "total_num_tokens": 494433862.0, "z_loss": 0.0004963581450283527 }, { "copy_logits_max": -3.671637773513794, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.25, "epoch": 1.1889200919070717, "gen_logits_max": 4.469821929931641, "gen_logits_mean": -16.23955535888672, "gen_logits_min": -28.309036254882812, "gen_logits_std": 3.0543136596679688, "gen_loss": 0.2762441039085388, "grad_norm": 0.37340056778785435, "learning_rate": 2.357684210526316e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9963319599628448, "mean_gen_accuracy": 0.8735177367925644, "mean_token_accuracy": 0.906461089849472, "num_tokens": 494670960.0, "sample_num_tokens": 8971.5, "step": 5821, "total_num_tokens": 494706846.0, "z_loss": 0.0005235199932940304 }, { "copy_logits_max": -1.9293651580810547, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.125, "epoch": 1.1891243298442686, "gen_logits_max": 6.230812072753906, "gen_logits_mean": -13.121233940124512, "gen_logits_min": -25.3531494140625, "gen_logits_std": 3.0063071250915527, "gen_loss": 0.31578341126441956, "grad_norm": 0.40479571419826627, "learning_rate": 2.3575578947368423e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9960103929042816, "mean_gen_accuracy": 0.8720599412918091, "mean_token_accuracy": 0.9001532793045044, "num_tokens": 494923162.0, "sample_num_tokens": 8148.5, "step": 5822, "total_num_tokens": 494955756.0, "z_loss": 0.0006256754859350622 }, { "copy_logits_max": -3.153855085372925, "copy_logits_min": -687500032.0, "copy_num_tokens": 344.1875, "epoch": 1.1893285677814653, "gen_logits_max": 7.144998550415039, "gen_logits_mean": -12.393628120422363, "gen_logits_min": -24.892879486083984, "gen_logits_std": 2.9826226234436035, "gen_loss": 0.28419017791748047, "grad_norm": 0.3661674017023399, "learning_rate": 2.3574315789473687e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9964169710874557, "mean_gen_accuracy": 0.8871357589960098, "mean_token_accuracy": 0.9116645008325577, "num_tokens": 495192678.0, "sample_num_tokens": 7365.0, "step": 5823, "total_num_tokens": 495222138.0, "z_loss": 0.0005794304888695478 }, { "copy_logits_max": -3.3495187759399414, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.4375, "epoch": 1.1895328057186623, "gen_logits_max": 3.789283275604248, "gen_logits_mean": -17.08694076538086, "gen_logits_min": -29.473281860351562, "gen_logits_std": 3.067281723022461, "gen_loss": 0.27845585346221924, "grad_norm": 0.37801738208998753, "learning_rate": 2.3573052631578948e-05, "loss": 0.2864, "mean_copy_accuracy": 0.996340423822403, "mean_gen_accuracy": 0.8744805604219437, "mean_token_accuracy": 0.9029331803321838, "num_tokens": 495459075.0, "sample_num_tokens": 8114.25, "step": 5824, "total_num_tokens": 495491532.0, "z_loss": 0.0005743938381783664 }, { "copy_logits_max": -3.4101881980895996, "copy_logits_min": -750000000.0, "copy_num_tokens": 261.6875, "epoch": 1.189737043655859, "gen_logits_max": 6.332128524780273, "gen_logits_mean": -15.147500991821289, "gen_logits_min": -27.033740997314453, "gen_logits_std": 3.0074994564056396, "gen_loss": 0.3301337659358978, "grad_norm": 0.38605718170704345, "learning_rate": 2.3571789473684212e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9957877099514008, "mean_gen_accuracy": 0.8740130811929703, "mean_token_accuracy": 0.901683434844017, "num_tokens": 495716826.0, "sample_num_tokens": 6526.0, "step": 5825, "total_num_tokens": 495742930.0, "z_loss": 0.0006590784760192037 }, { "copy_logits_max": -4.045111179351807, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.875, "epoch": 1.189941281593056, "gen_logits_max": 4.275254249572754, "gen_logits_mean": -17.213878631591797, "gen_logits_min": -28.712617874145508, "gen_logits_std": 3.044175148010254, "gen_loss": 0.2833666205406189, "grad_norm": 0.4089257575488733, "learning_rate": 2.3570526315789473e-05, "loss": 0.2699, "mean_copy_accuracy": 0.996946170926094, "mean_gen_accuracy": 0.8805514723062515, "mean_token_accuracy": 0.9076274633407593, "num_tokens": 495977163.0, "sample_num_tokens": 8255.75, "step": 5826, "total_num_tokens": 496010186.0, "z_loss": 0.0005156663828529418 }, { "copy_logits_max": -1.0609972476959229, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.625, "epoch": 1.1901455195302528, "gen_logits_max": 5.200882911682129, "gen_logits_mean": -15.597197532653809, "gen_logits_min": -27.694421768188477, "gen_logits_std": 3.0376861095428467, "gen_loss": 0.2793806195259094, "grad_norm": 0.3839144849430217, "learning_rate": 2.3569263157894738e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9967647790908813, "mean_gen_accuracy": 0.8779453635215759, "mean_token_accuracy": 0.9047349095344543, "num_tokens": 496224957.0, "sample_num_tokens": 7797.25, "step": 5827, "total_num_tokens": 496256146.0, "z_loss": 0.000567562528885901 }, { "copy_logits_max": -1.6107761859893799, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.4375, "epoch": 1.1903497574674495, "gen_logits_max": 5.062772750854492, "gen_logits_mean": -15.134610176086426, "gen_logits_min": -27.584665298461914, "gen_logits_std": 3.0278778076171875, "gen_loss": 0.3114003539085388, "grad_norm": 0.3585374693144082, "learning_rate": 2.3568e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9961965382099152, "mean_gen_accuracy": 0.8787747323513031, "mean_token_accuracy": 0.907621368765831, "num_tokens": 496510671.0, "sample_num_tokens": 8564.75, "step": 5828, "total_num_tokens": 496544930.0, "z_loss": 0.0006072426913306117 }, { "copy_logits_max": -1.9866383075714111, "copy_logits_min": -750000064.0, "copy_num_tokens": 670.3125, "epoch": 1.1905539954046465, "gen_logits_max": 4.164024829864502, "gen_logits_mean": -15.14950180053711, "gen_logits_min": -27.772504806518555, "gen_logits_std": 3.058166980743408, "gen_loss": 0.2492586374282837, "grad_norm": 0.3412016771376017, "learning_rate": 2.3566736842105263e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9966443926095963, "mean_gen_accuracy": 0.8805903196334839, "mean_token_accuracy": 0.9105810672044754, "num_tokens": 496791203.0, "sample_num_tokens": 10102.75, "step": 5829, "total_num_tokens": 496831614.0, "z_loss": 0.0004841752233915031 }, { "copy_logits_max": -4.267372131347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.375, "epoch": 1.1907582333418432, "gen_logits_max": 4.624567985534668, "gen_logits_mean": -17.177650451660156, "gen_logits_min": -28.905742645263672, "gen_logits_std": 3.0443167686462402, "gen_loss": 0.251057893037796, "grad_norm": 0.38315593392687075, "learning_rate": 2.3565473684210527e-05, "loss": 0.2924, "mean_copy_accuracy": 0.9967609643936157, "mean_gen_accuracy": 0.8732707053422928, "mean_token_accuracy": 0.9023125767707825, "num_tokens": 497088532.0, "sample_num_tokens": 9983.0, "step": 5830, "total_num_tokens": 497128464.0, "z_loss": 0.0004790331586264074 }, { "copy_logits_max": -3.5364744663238525, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.5, "epoch": 1.19096247127904, "gen_logits_max": 3.9795854091644287, "gen_logits_mean": -16.41272735595703, "gen_logits_min": -28.178417205810547, "gen_logits_std": 3.0390963554382324, "gen_loss": 0.2705238461494446, "grad_norm": 0.37985231986738416, "learning_rate": 2.356421052631579e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9956020265817642, "mean_gen_accuracy": 0.8697398602962494, "mean_token_accuracy": 0.8997957855463028, "num_tokens": 497368643.0, "sample_num_tokens": 8180.25, "step": 5831, "total_num_tokens": 497401364.0, "z_loss": 0.0004934489843435585 }, { "copy_logits_max": -0.8853545188903809, "copy_logits_min": -687500032.0, "copy_num_tokens": 665.6875, "epoch": 1.1911667092162368, "gen_logits_max": 6.238886833190918, "gen_logits_mean": -13.899317741394043, "gen_logits_min": -26.21403694152832, "gen_logits_std": 2.9619321823120117, "gen_loss": 0.2358817458152771, "grad_norm": 0.3566988775768004, "learning_rate": 2.3562947368421053e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9966748356819153, "mean_gen_accuracy": 0.8761326819658279, "mean_token_accuracy": 0.9074678122997284, "num_tokens": 497649208.0, "sample_num_tokens": 8988.0, "step": 5832, "total_num_tokens": 497685160.0, "z_loss": 0.000454734661616385 }, { "copy_logits_max": -3.8485007286071777, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.3125, "epoch": 1.1913709471534337, "gen_logits_max": 3.8011157512664795, "gen_logits_mean": -16.732097625732422, "gen_logits_min": -28.40599822998047, "gen_logits_std": 3.0446910858154297, "gen_loss": 0.25140705704689026, "grad_norm": 0.35846619653415085, "learning_rate": 2.3561684210526317e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9970667809247971, "mean_gen_accuracy": 0.8697824627161026, "mean_token_accuracy": 0.9019497483968735, "num_tokens": 497941510.0, "sample_num_tokens": 9277.5, "step": 5833, "total_num_tokens": 497978620.0, "z_loss": 0.0004480938659980893 }, { "copy_logits_max": -0.5956937074661255, "copy_logits_min": -687500032.0, "copy_num_tokens": 365.0625, "epoch": 1.1915751850906307, "gen_logits_max": 5.539191246032715, "gen_logits_mean": -14.807500839233398, "gen_logits_min": -26.45050811767578, "gen_logits_std": 2.9591548442840576, "gen_loss": 0.3144923150539398, "grad_norm": 0.3513272670664377, "learning_rate": 2.356042105263158e-05, "loss": 0.284, "mean_copy_accuracy": 0.9965137094259262, "mean_gen_accuracy": 0.8761676549911499, "mean_token_accuracy": 0.9049210250377655, "num_tokens": 498243359.0, "sample_num_tokens": 8880.75, "step": 5834, "total_num_tokens": 498278882.0, "z_loss": 0.000635335105471313 }, { "copy_logits_max": -6.225369930267334, "copy_logits_min": -687500032.0, "copy_num_tokens": 255.8125, "epoch": 1.1917794230278274, "gen_logits_max": 4.781777858734131, "gen_logits_mean": -16.554882049560547, "gen_logits_min": -28.186912536621094, "gen_logits_std": 2.991769790649414, "gen_loss": 0.29640278220176697, "grad_norm": 0.3824240225721629, "learning_rate": 2.3559157894736842e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9956022500991821, "mean_gen_accuracy": 0.8803578317165375, "mean_token_accuracy": 0.906064435839653, "num_tokens": 498496605.0, "sample_num_tokens": 6887.25, "step": 5835, "total_num_tokens": 498524154.0, "z_loss": 0.0005928325699642301 }, { "copy_logits_max": -2.7118282318115234, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.4375, "epoch": 1.1919836609650243, "gen_logits_max": 3.6298153400421143, "gen_logits_mean": -16.5345458984375, "gen_logits_min": -28.598054885864258, "gen_logits_std": 3.05743408203125, "gen_loss": 0.2707309424877167, "grad_norm": 0.37415349262881303, "learning_rate": 2.3557894736842106e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9966624081134796, "mean_gen_accuracy": 0.8775436580181122, "mean_token_accuracy": 0.907244548201561, "num_tokens": 498742626.0, "sample_num_tokens": 8279.0, "step": 5836, "total_num_tokens": 498775742.0, "z_loss": 0.0005714399740099907 }, { "copy_logits_max": -4.222315788269043, "copy_logits_min": -750000128.0, "copy_num_tokens": 480.625, "epoch": 1.1921878989022212, "gen_logits_max": 5.0109100341796875, "gen_logits_mean": -15.336156845092773, "gen_logits_min": -27.58658790588379, "gen_logits_std": 3.0573744773864746, "gen_loss": 0.2594160735607147, "grad_norm": 0.35311345112460685, "learning_rate": 2.3556631578947367e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9960356950759888, "mean_gen_accuracy": 0.8791613280773163, "mean_token_accuracy": 0.9091398119926453, "num_tokens": 499021447.0, "sample_num_tokens": 8781.75, "step": 5837, "total_num_tokens": 499056574.0, "z_loss": 0.0005027572042308748 }, { "copy_logits_max": -3.694209575653076, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.875, "epoch": 1.192392136839418, "gen_logits_max": 5.339211463928223, "gen_logits_mean": -15.818045616149902, "gen_logits_min": -27.399940490722656, "gen_logits_std": 3.0141241550445557, "gen_loss": 0.26391464471817017, "grad_norm": 0.3762316720755455, "learning_rate": 2.355536842105263e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9958677440881729, "mean_gen_accuracy": 0.8745713829994202, "mean_token_accuracy": 0.903690978884697, "num_tokens": 499288982.0, "sample_num_tokens": 7664.0, "step": 5838, "total_num_tokens": 499319638.0, "z_loss": 0.0005321651697158813 }, { "copy_logits_max": -2.571282148361206, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.4375, "epoch": 1.1925963747766148, "gen_logits_max": 4.061434268951416, "gen_logits_mean": -15.46939468383789, "gen_logits_min": -27.83099365234375, "gen_logits_std": 3.0273218154907227, "gen_loss": 0.28454840183258057, "grad_norm": 0.3613579105936077, "learning_rate": 2.3554105263157896e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9970034509897232, "mean_gen_accuracy": 0.8695765733718872, "mean_token_accuracy": 0.902330219745636, "num_tokens": 499586447.0, "sample_num_tokens": 8606.75, "step": 5839, "total_num_tokens": 499620874.0, "z_loss": 0.0006041008746251464 }, { "copy_logits_max": -3.586696147918701, "copy_logits_min": -750000064.0, "copy_num_tokens": 448.625, "epoch": 1.1928006127138115, "gen_logits_max": 3.3967125415802, "gen_logits_mean": -16.734268188476562, "gen_logits_min": -28.6397705078125, "gen_logits_std": 3.0509696006774902, "gen_loss": 0.2864728569984436, "grad_norm": 0.39943737915697536, "learning_rate": 2.355284210526316e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9962852150201797, "mean_gen_accuracy": 0.8760293573141098, "mean_token_accuracy": 0.9014876931905746, "num_tokens": 499849912.0, "sample_num_tokens": 8193.0, "step": 5840, "total_num_tokens": 499882684.0, "z_loss": 0.0005224901833571494 }, { "copy_logits_max": -2.4181361198425293, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.125, "epoch": 1.1930048506510085, "gen_logits_max": 5.67092752456665, "gen_logits_mean": -14.306547164916992, "gen_logits_min": -25.90087890625, "gen_logits_std": 2.977008819580078, "gen_loss": 0.28329724073410034, "grad_norm": 0.34779875292234236, "learning_rate": 2.355157894736842e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9959900826215744, "mean_gen_accuracy": 0.8806256800889969, "mean_token_accuracy": 0.9084298461675644, "num_tokens": 500142128.0, "sample_num_tokens": 9257.0, "step": 5841, "total_num_tokens": 500179156.0, "z_loss": 0.000536037259735167 }, { "copy_logits_max": -3.839369773864746, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.6875, "epoch": 1.1932090885882052, "gen_logits_max": 4.913745403289795, "gen_logits_mean": -14.457921028137207, "gen_logits_min": -25.826480865478516, "gen_logits_std": 2.943840742111206, "gen_loss": 0.29345959424972534, "grad_norm": 0.4253064149311936, "learning_rate": 2.3550315789473685e-05, "loss": 0.2756, "mean_copy_accuracy": 0.995202511548996, "mean_gen_accuracy": 0.8739991188049316, "mean_token_accuracy": 0.9056467562913895, "num_tokens": 500443209.0, "sample_num_tokens": 9464.25, "step": 5842, "total_num_tokens": 500481066.0, "z_loss": 0.0004969913279637694 }, { "copy_logits_max": -2.3409085273742676, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.5625, "epoch": 1.193413326525402, "gen_logits_max": 5.5046186447143555, "gen_logits_mean": -14.420310974121094, "gen_logits_min": -26.286842346191406, "gen_logits_std": 2.972964286804199, "gen_loss": 0.2758767008781433, "grad_norm": 0.39169231930412096, "learning_rate": 2.3549052631578946e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9953332096338272, "mean_gen_accuracy": 0.8805004507303238, "mean_token_accuracy": 0.9065579026937485, "num_tokens": 500703032.0, "sample_num_tokens": 8104.0, "step": 5843, "total_num_tokens": 500735448.0, "z_loss": 0.0005470330361276865 }, { "copy_logits_max": -0.5727599859237671, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.375, "epoch": 1.193617564462599, "gen_logits_max": 5.0547990798950195, "gen_logits_mean": -14.111185073852539, "gen_logits_min": -26.541767120361328, "gen_logits_std": 2.9809303283691406, "gen_loss": 0.2934969663619995, "grad_norm": 0.4022892323169637, "learning_rate": 2.354778947368421e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9956198930740356, "mean_gen_accuracy": 0.8644954562187195, "mean_token_accuracy": 0.8985545188188553, "num_tokens": 500952565.0, "sample_num_tokens": 7636.25, "step": 5844, "total_num_tokens": 500983110.0, "z_loss": 0.0005938654649071395 }, { "copy_logits_max": -1.6419146060943604, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.25, "epoch": 1.1938218023997957, "gen_logits_max": 6.8991827964782715, "gen_logits_mean": -13.32951545715332, "gen_logits_min": -24.776348114013672, "gen_logits_std": 2.927234411239624, "gen_loss": 0.32598602771759033, "grad_norm": 0.38943027087385906, "learning_rate": 2.354652631578947e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9948965311050415, "mean_gen_accuracy": 0.8800414502620697, "mean_token_accuracy": 0.8993677943944931, "num_tokens": 501200292.0, "sample_num_tokens": 7611.0, "step": 5845, "total_num_tokens": 501230736.0, "z_loss": 0.0006281224777922034 }, { "copy_logits_max": -3.7050909996032715, "copy_logits_min": -750000128.0, "copy_num_tokens": 467.0625, "epoch": 1.1940260403369927, "gen_logits_max": 4.159491062164307, "gen_logits_mean": -15.622111320495605, "gen_logits_min": -27.769760131835938, "gen_logits_std": 3.0103187561035156, "gen_loss": 0.2861822247505188, "grad_norm": 0.37176772928151036, "learning_rate": 2.3545263157894736e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9966596513986588, "mean_gen_accuracy": 0.8721015006303787, "mean_token_accuracy": 0.9077426195144653, "num_tokens": 501482525.0, "sample_num_tokens": 8676.25, "step": 5846, "total_num_tokens": 501517230.0, "z_loss": 0.0005231445538811386 }, { "copy_logits_max": -1.3734729290008545, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.5, "epoch": 1.1942302782741894, "gen_logits_max": 6.615626335144043, "gen_logits_mean": -13.153103828430176, "gen_logits_min": -25.269439697265625, "gen_logits_std": 2.957620143890381, "gen_loss": 0.29672566056251526, "grad_norm": 0.3703943895817011, "learning_rate": 2.3544000000000004e-05, "loss": 0.2866, "mean_copy_accuracy": 0.995046079158783, "mean_gen_accuracy": 0.8733823150396347, "mean_token_accuracy": 0.9030362665653229, "num_tokens": 501753360.0, "sample_num_tokens": 9607.0, "step": 5847, "total_num_tokens": 501791788.0, "z_loss": 0.0006217284826561809 }, { "copy_logits_max": -2.3174448013305664, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.3125, "epoch": 1.1944345162113863, "gen_logits_max": 4.762079238891602, "gen_logits_mean": -15.969968795776367, "gen_logits_min": -28.66636848449707, "gen_logits_std": 2.9960036277770996, "gen_loss": 0.2586551308631897, "grad_norm": 0.4178958497817634, "learning_rate": 2.3542736842105265e-05, "loss": 0.291, "mean_copy_accuracy": 0.9942900240421295, "mean_gen_accuracy": 0.8736318945884705, "mean_token_accuracy": 0.9019099622964859, "num_tokens": 502001412.0, "sample_num_tokens": 8148.5, "step": 5848, "total_num_tokens": 502034006.0, "z_loss": 0.0005448508309200406 }, { "copy_logits_max": -1.676997423171997, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.6875, "epoch": 1.194638754148583, "gen_logits_max": 6.162615776062012, "gen_logits_mean": -14.158987045288086, "gen_logits_min": -26.177576065063477, "gen_logits_std": 2.959296703338623, "gen_loss": 0.23693552613258362, "grad_norm": 0.3801321684742854, "learning_rate": 2.354147368421053e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9962234348058701, "mean_gen_accuracy": 0.8785105645656586, "mean_token_accuracy": 0.9068128913640976, "num_tokens": 502247567.0, "sample_num_tokens": 8782.75, "step": 5849, "total_num_tokens": 502282698.0, "z_loss": 0.0004653455107472837 }, { "copy_logits_max": -3.198852062225342, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.0625, "epoch": 1.19484299208578, "gen_logits_max": 3.9431886672973633, "gen_logits_mean": -17.038490295410156, "gen_logits_min": -28.87761116027832, "gen_logits_std": 3.020501136779785, "gen_loss": 0.26584872603416443, "grad_norm": 0.36633956214743896, "learning_rate": 2.354021052631579e-05, "loss": 0.2833, "mean_copy_accuracy": 0.996688961982727, "mean_gen_accuracy": 0.87174953520298, "mean_token_accuracy": 0.9051055014133453, "num_tokens": 502542049.0, "sample_num_tokens": 8373.75, "step": 5850, "total_num_tokens": 502575544.0, "z_loss": 0.0005471530603244901 }, { "copy_logits_max": -0.05449149012565613, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.625, "epoch": 1.1950472300229769, "gen_logits_max": 5.136209487915039, "gen_logits_mean": -14.363308906555176, "gen_logits_min": -26.420684814453125, "gen_logits_std": 3.009582757949829, "gen_loss": 0.26212137937545776, "grad_norm": 0.3745898807143811, "learning_rate": 2.3538947368421054e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9963026940822601, "mean_gen_accuracy": 0.8777059465646744, "mean_token_accuracy": 0.9071735292673111, "num_tokens": 502814594.0, "sample_num_tokens": 9809.0, "step": 5851, "total_num_tokens": 502853830.0, "z_loss": 0.0004901384236291051 }, { "copy_logits_max": -4.750769138336182, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.0, "epoch": 1.1952514679601736, "gen_logits_max": 3.1000261306762695, "gen_logits_mean": -17.965213775634766, "gen_logits_min": -30.003074645996094, "gen_logits_std": 3.0869340896606445, "gen_loss": 0.23461076617240906, "grad_norm": 0.38362441198623254, "learning_rate": 2.3537684210526315e-05, "loss": 0.275, "mean_copy_accuracy": 0.9965247809886932, "mean_gen_accuracy": 0.8745003938674927, "mean_token_accuracy": 0.907259926199913, "num_tokens": 503089907.0, "sample_num_tokens": 7194.25, "step": 5852, "total_num_tokens": 503118684.0, "z_loss": 0.00047151779290288687 }, { "copy_logits_max": -4.451560974121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.75, "epoch": 1.1954557058973705, "gen_logits_max": 3.651920795440674, "gen_logits_mean": -17.03199577331543, "gen_logits_min": -28.985694885253906, "gen_logits_std": 3.0557236671447754, "gen_loss": 0.3007246255874634, "grad_norm": 0.3655604007921416, "learning_rate": 2.353642105263158e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9955232441425323, "mean_gen_accuracy": 0.8805363178253174, "mean_token_accuracy": 0.9084025323390961, "num_tokens": 503374487.0, "sample_num_tokens": 8180.25, "step": 5853, "total_num_tokens": 503407208.0, "z_loss": 0.0005164593458175659 }, { "copy_logits_max": -2.997380256652832, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0625, "epoch": 1.1956599438345672, "gen_logits_max": 4.1436448097229, "gen_logits_mean": -16.588220596313477, "gen_logits_min": -28.097858428955078, "gen_logits_std": 3.0225188732147217, "gen_loss": 0.26069366931915283, "grad_norm": 0.4678387869949783, "learning_rate": 2.353515789473684e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9968084245920181, "mean_gen_accuracy": 0.8765982538461685, "mean_token_accuracy": 0.9050758332014084, "num_tokens": 503638239.0, "sample_num_tokens": 8213.75, "step": 5854, "total_num_tokens": 503671094.0, "z_loss": 0.0004280170542187989 }, { "copy_logits_max": -2.526181936264038, "copy_logits_min": -687500032.0, "copy_num_tokens": 563.4375, "epoch": 1.1958641817717641, "gen_logits_max": 2.7974185943603516, "gen_logits_mean": -17.694721221923828, "gen_logits_min": -29.776159286499023, "gen_logits_std": 3.096092462539673, "gen_loss": 0.2587844729423523, "grad_norm": 0.36951962775995634, "learning_rate": 2.3533894736842108e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9970358610153198, "mean_gen_accuracy": 0.8765058815479279, "mean_token_accuracy": 0.9090034663677216, "num_tokens": 503912799.0, "sample_num_tokens": 8835.75, "step": 5855, "total_num_tokens": 503948142.0, "z_loss": 0.00043351814383640885 }, { "copy_logits_max": -6.284958362579346, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.125, "epoch": 1.1960684197089608, "gen_logits_max": 4.565448760986328, "gen_logits_mean": -15.203665733337402, "gen_logits_min": -27.001041412353516, "gen_logits_std": 3.020327568054199, "gen_loss": 0.30981355905532837, "grad_norm": 0.3754590704696024, "learning_rate": 2.353263157894737e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9957419633865356, "mean_gen_accuracy": 0.8776126950979233, "mean_token_accuracy": 0.9057137817144394, "num_tokens": 504165687.0, "sample_num_tokens": 7622.75, "step": 5856, "total_num_tokens": 504196178.0, "z_loss": 0.0004788066435139626 }, { "copy_logits_max": -0.8391291499137878, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.3125, "epoch": 1.1962726576461578, "gen_logits_max": 5.1576361656188965, "gen_logits_mean": -13.648271560668945, "gen_logits_min": -25.547061920166016, "gen_logits_std": 2.9602394104003906, "gen_loss": 0.29495516419410706, "grad_norm": 0.4027062865067074, "learning_rate": 2.3531368421052633e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9953270852565765, "mean_gen_accuracy": 0.8763580173254013, "mean_token_accuracy": 0.9028040170669556, "num_tokens": 504429513.0, "sample_num_tokens": 8852.25, "step": 5857, "total_num_tokens": 504464922.0, "z_loss": 0.00046839378774166107 }, { "copy_logits_max": -2.5270349979400635, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.75, "epoch": 1.1964768955833547, "gen_logits_max": 4.108602523803711, "gen_logits_mean": -15.11323070526123, "gen_logits_min": -27.527305603027344, "gen_logits_std": 3.03251051902771, "gen_loss": 0.26769882440567017, "grad_norm": 0.409033077500414, "learning_rate": 2.3530105263157894e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9953128695487976, "mean_gen_accuracy": 0.8760156035423279, "mean_token_accuracy": 0.9049912244081497, "num_tokens": 504705535.0, "sample_num_tokens": 8296.25, "step": 5858, "total_num_tokens": 504738720.0, "z_loss": 0.0004796888097189367 }, { "copy_logits_max": -4.1069183349609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.125, "epoch": 1.1966811335205514, "gen_logits_max": 4.113098621368408, "gen_logits_mean": -15.600662231445312, "gen_logits_min": -27.395488739013672, "gen_logits_std": 3.012622356414795, "gen_loss": 0.2526246905326843, "grad_norm": 0.37197292043533914, "learning_rate": 2.352884210526316e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9962518960237503, "mean_gen_accuracy": 0.8775083869695663, "mean_token_accuracy": 0.905056431889534, "num_tokens": 504984760.0, "sample_num_tokens": 8798.5, "step": 5859, "total_num_tokens": 505019954.0, "z_loss": 0.0004682590952143073 }, { "copy_logits_max": -3.7323946952819824, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.0625, "epoch": 1.1968853714577483, "gen_logits_max": 3.707335948944092, "gen_logits_mean": -16.955358505249023, "gen_logits_min": -28.81041145324707, "gen_logits_std": 3.045140266418457, "gen_loss": 0.30397942662239075, "grad_norm": 0.3848787535756715, "learning_rate": 2.3527578947368423e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9959463179111481, "mean_gen_accuracy": 0.8795374184846878, "mean_token_accuracy": 0.9056861251592636, "num_tokens": 505244330.0, "sample_num_tokens": 7366.0, "step": 5860, "total_num_tokens": 505273794.0, "z_loss": 0.0005854981718584895 }, { "copy_logits_max": -2.4559621810913086, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.125, "epoch": 1.197089609394945, "gen_logits_max": 3.0926270484924316, "gen_logits_mean": -17.66185760498047, "gen_logits_min": -29.96160316467285, "gen_logits_std": 3.0953917503356934, "gen_loss": 0.27712368965148926, "grad_norm": 0.3781004878468475, "learning_rate": 2.3526315789473684e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9958283305168152, "mean_gen_accuracy": 0.8736677467823029, "mean_token_accuracy": 0.9071661233901978, "num_tokens": 505499615.0, "sample_num_tokens": 8385.25, "step": 5861, "total_num_tokens": 505533156.0, "z_loss": 0.0005566169857047498 }, { "copy_logits_max": -0.72138911485672, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.25, "epoch": 1.197293847332142, "gen_logits_max": 4.059191703796387, "gen_logits_mean": -15.069093704223633, "gen_logits_min": -27.55805778503418, "gen_logits_std": 3.0326590538024902, "gen_loss": 0.2625170350074768, "grad_norm": 0.4068873241785575, "learning_rate": 2.3525052631578948e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9971020519733429, "mean_gen_accuracy": 0.8758654296398163, "mean_token_accuracy": 0.9073770642280579, "num_tokens": 505761718.0, "sample_num_tokens": 9170.0, "step": 5862, "total_num_tokens": 505798398.0, "z_loss": 0.0006072340765967965 }, { "copy_logits_max": -2.1715962886810303, "copy_logits_min": -625000064.0, "copy_num_tokens": 381.0625, "epoch": 1.1974980852693389, "gen_logits_max": 4.970968246459961, "gen_logits_mean": -13.522005081176758, "gen_logits_min": -25.7122859954834, "gen_logits_std": 2.988621473312378, "gen_loss": 0.2714027464389801, "grad_norm": 0.36995326804267487, "learning_rate": 2.3523789473684212e-05, "loss": 0.2819, "mean_copy_accuracy": 0.996362954378128, "mean_gen_accuracy": 0.8768500685691833, "mean_token_accuracy": 0.9034889340400696, "num_tokens": 506014977.0, "sample_num_tokens": 7078.75, "step": 5863, "total_num_tokens": 506043292.0, "z_loss": 0.0006462084129452705 }, { "copy_logits_max": -2.6862478256225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.625, "epoch": 1.1977023232065356, "gen_logits_max": 4.160701751708984, "gen_logits_mean": -16.365081787109375, "gen_logits_min": -28.59368133544922, "gen_logits_std": 3.057818651199341, "gen_loss": 0.2931867837905884, "grad_norm": 0.38240758352219145, "learning_rate": 2.3522526315789477e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9973142892122269, "mean_gen_accuracy": 0.8714037388563156, "mean_token_accuracy": 0.9024402499198914, "num_tokens": 506279983.0, "sample_num_tokens": 8044.25, "step": 5864, "total_num_tokens": 506312160.0, "z_loss": 0.0006411270587705076 }, { "copy_logits_max": -4.885443210601807, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.75, "epoch": 1.1979065611437325, "gen_logits_max": 3.6982502937316895, "gen_logits_mean": -17.19989776611328, "gen_logits_min": -29.061250686645508, "gen_logits_std": 3.0707216262817383, "gen_loss": 0.2859124541282654, "grad_norm": 0.41386487623153384, "learning_rate": 2.3521263157894738e-05, "loss": 0.2889, "mean_copy_accuracy": 0.994875505566597, "mean_gen_accuracy": 0.8725282400846481, "mean_token_accuracy": 0.9010735005140305, "num_tokens": 506536255.0, "sample_num_tokens": 8075.75, "step": 5865, "total_num_tokens": 506568558.0, "z_loss": 0.00047815777361392975 }, { "copy_logits_max": -1.9753648042678833, "copy_logits_min": -562500096.0, "copy_num_tokens": 321.9375, "epoch": 1.1981107990809292, "gen_logits_max": 4.148818492889404, "gen_logits_mean": -16.241596221923828, "gen_logits_min": -28.553098678588867, "gen_logits_std": 3.053354263305664, "gen_loss": 0.30567532777786255, "grad_norm": 0.3821155327484872, "learning_rate": 2.3520000000000002e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9961434006690979, "mean_gen_accuracy": 0.8850501328706741, "mean_token_accuracy": 0.911172941327095, "num_tokens": 506796530.0, "sample_num_tokens": 6963.0, "step": 5866, "total_num_tokens": 506824382.0, "z_loss": 0.000590357172768563 }, { "copy_logits_max": -1.6963483095169067, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.75, "epoch": 1.1983150370181261, "gen_logits_max": 3.740769863128662, "gen_logits_mean": -15.97558879852295, "gen_logits_min": -28.393043518066406, "gen_logits_std": 3.074828624725342, "gen_loss": 0.2771914005279541, "grad_norm": 0.42091892058713926, "learning_rate": 2.3518736842105263e-05, "loss": 0.2908, "mean_copy_accuracy": 0.996074989438057, "mean_gen_accuracy": 0.8745221346616745, "mean_token_accuracy": 0.9030036479234695, "num_tokens": 507038306.0, "sample_num_tokens": 8481.5, "step": 5867, "total_num_tokens": 507072232.0, "z_loss": 0.0005936933448538184 }, { "copy_logits_max": -3.9711790084838867, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.75, "epoch": 1.198519274955323, "gen_logits_max": 3.8754653930664062, "gen_logits_mean": -16.831266403198242, "gen_logits_min": -28.809553146362305, "gen_logits_std": 3.072309970855713, "gen_loss": 0.2401612401008606, "grad_norm": 0.4019437808742045, "learning_rate": 2.3517473684210527e-05, "loss": 0.271, "mean_copy_accuracy": 0.9956842511892319, "mean_gen_accuracy": 0.880939856171608, "mean_token_accuracy": 0.9069955199956894, "num_tokens": 507314714.0, "sample_num_tokens": 8170.5, "step": 5868, "total_num_tokens": 507347396.0, "z_loss": 0.00042618787847459316 }, { "copy_logits_max": -3.2285234928131104, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.5, "epoch": 1.1987235128925198, "gen_logits_max": 4.35965633392334, "gen_logits_mean": -16.136199951171875, "gen_logits_min": -28.092636108398438, "gen_logits_std": 3.0491747856140137, "gen_loss": 0.26841461658477783, "grad_norm": 0.41837119202292067, "learning_rate": 2.3516210526315788e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9954956769943237, "mean_gen_accuracy": 0.8731834143400192, "mean_token_accuracy": 0.9001720249652863, "num_tokens": 507572238.0, "sample_num_tokens": 8256.0, "step": 5869, "total_num_tokens": 507605262.0, "z_loss": 0.000516505679115653 }, { "copy_logits_max": -2.8922295570373535, "copy_logits_min": -687500032.0, "copy_num_tokens": 375.25, "epoch": 1.1989277508297167, "gen_logits_max": 3.741910934448242, "gen_logits_mean": -17.082483291625977, "gen_logits_min": -29.041400909423828, "gen_logits_std": 3.0730621814727783, "gen_loss": 0.2687886357307434, "grad_norm": 0.3922078321938798, "learning_rate": 2.3514947368421052e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9947124719619751, "mean_gen_accuracy": 0.8753475844860077, "mean_token_accuracy": 0.9000149965286255, "num_tokens": 507808887.0, "sample_num_tokens": 7722.75, "step": 5870, "total_num_tokens": 507839778.0, "z_loss": 0.0005104351439513266 }, { "copy_logits_max": -2.7386159896850586, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.375, "epoch": 1.1991319887669134, "gen_logits_max": 4.853846549987793, "gen_logits_mean": -14.603498458862305, "gen_logits_min": -27.01457405090332, "gen_logits_std": 3.02724027633667, "gen_loss": 0.28261640667915344, "grad_norm": 0.3677573491268867, "learning_rate": 2.3513684210526317e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9954160898923874, "mean_gen_accuracy": 0.8807289004325867, "mean_token_accuracy": 0.9090457260608673, "num_tokens": 508092682.0, "sample_num_tokens": 7993.0, "step": 5871, "total_num_tokens": 508124654.0, "z_loss": 0.0005130878416821361 }, { "copy_logits_max": -0.6550686359405518, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.125, "epoch": 1.1993362267041103, "gen_logits_max": 3.735644817352295, "gen_logits_mean": -15.081533432006836, "gen_logits_min": -27.697179794311523, "gen_logits_std": 3.0414745807647705, "gen_loss": 0.2675977647304535, "grad_norm": 0.3810943860700304, "learning_rate": 2.351242105263158e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9957575500011444, "mean_gen_accuracy": 0.87871915102005, "mean_token_accuracy": 0.905621349811554, "num_tokens": 508350735.0, "sample_num_tokens": 9234.75, "step": 5872, "total_num_tokens": 508387674.0, "z_loss": 0.0004955814802087843 }, { "copy_logits_max": -2.2234315872192383, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.0, "epoch": 1.199540464641307, "gen_logits_max": 4.033249378204346, "gen_logits_mean": -15.76755428314209, "gen_logits_min": -28.168041229248047, "gen_logits_std": 3.041555166244507, "gen_loss": 0.3216917812824249, "grad_norm": 0.4877318601886045, "learning_rate": 2.3511157894736842e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9938575774431229, "mean_gen_accuracy": 0.8711233884096146, "mean_token_accuracy": 0.8982018828392029, "num_tokens": 508613032.0, "sample_num_tokens": 7610.0, "step": 5873, "total_num_tokens": 508643472.0, "z_loss": 0.0005933955544605851 }, { "copy_logits_max": -1.0158944129943848, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.875, "epoch": 1.199744702578504, "gen_logits_max": 3.0953564643859863, "gen_logits_mean": -15.941125869750977, "gen_logits_min": -28.978382110595703, "gen_logits_std": 3.0957603454589844, "gen_loss": 0.2455708384513855, "grad_norm": 0.39243989272433727, "learning_rate": 2.3509894736842106e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9959573894739151, "mean_gen_accuracy": 0.8743971139192581, "mean_token_accuracy": 0.9035457223653793, "num_tokens": 508867641.0, "sample_num_tokens": 7610.25, "step": 5874, "total_num_tokens": 508898082.0, "z_loss": 0.0004855714796576649 }, { "copy_logits_max": -1.8649977445602417, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.625, "epoch": 1.199948940515701, "gen_logits_max": 4.764889717102051, "gen_logits_mean": -15.151970863342285, "gen_logits_min": -27.52001953125, "gen_logits_std": 3.0273566246032715, "gen_loss": 0.2737428843975067, "grad_norm": 0.37555734382848754, "learning_rate": 2.350863157894737e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9968564957380295, "mean_gen_accuracy": 0.8718111664056778, "mean_token_accuracy": 0.8988038748502731, "num_tokens": 509104559.0, "sample_num_tokens": 8045.75, "step": 5875, "total_num_tokens": 509136742.0, "z_loss": 0.0004999157972633839 }, { "copy_logits_max": 0.5299894213676453, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.5625, "epoch": 1.2001531784528976, "gen_logits_max": 5.109591960906982, "gen_logits_mean": -13.868653297424316, "gen_logits_min": -27.37656593322754, "gen_logits_std": 3.038191556930542, "gen_loss": 0.21616533398628235, "grad_norm": 0.3669136188161712, "learning_rate": 2.350736842105263e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9965047687292099, "mean_gen_accuracy": 0.8806044459342957, "mean_token_accuracy": 0.9113890528678894, "num_tokens": 509385134.0, "sample_num_tokens": 8846.0, "step": 5876, "total_num_tokens": 509420518.0, "z_loss": 0.0004586591385304928 }, { "copy_logits_max": -1.686071753501892, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.0625, "epoch": 1.2003574163900945, "gen_logits_max": 4.816882133483887, "gen_logits_mean": -15.342279434204102, "gen_logits_min": -27.675140380859375, "gen_logits_std": 3.034899950027466, "gen_loss": 0.26950281858444214, "grad_norm": 0.3674319593301512, "learning_rate": 2.3506105263157896e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9957786798477173, "mean_gen_accuracy": 0.8796914666891098, "mean_token_accuracy": 0.9086419641971588, "num_tokens": 509661514.0, "sample_num_tokens": 7902.0, "step": 5877, "total_num_tokens": 509693122.0, "z_loss": 0.0005244515486992896 }, { "copy_logits_max": -3.617382049560547, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.8125, "epoch": 1.2005616543272912, "gen_logits_max": 4.117074966430664, "gen_logits_mean": -16.960735321044922, "gen_logits_min": -29.55141830444336, "gen_logits_std": 3.074859380722046, "gen_loss": 0.30532142519950867, "grad_norm": 0.3417170493756838, "learning_rate": 2.3504842105263157e-05, "loss": 0.272, "mean_copy_accuracy": 0.9955722540616989, "mean_gen_accuracy": 0.8818287551403046, "mean_token_accuracy": 0.9079828262329102, "num_tokens": 509938392.0, "sample_num_tokens": 8795.5, "step": 5878, "total_num_tokens": 509973574.0, "z_loss": 0.0006188732804730535 }, { "copy_logits_max": -3.7233214378356934, "copy_logits_min": -625000000.0, "copy_num_tokens": 490.5625, "epoch": 1.2007658922644882, "gen_logits_max": 5.0963029861450195, "gen_logits_mean": -14.702899932861328, "gen_logits_min": -27.075937271118164, "gen_logits_std": 3.03371000289917, "gen_loss": 0.2655174136161804, "grad_norm": 0.40265345111795536, "learning_rate": 2.350357894736842e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9953529536724091, "mean_gen_accuracy": 0.878346398472786, "mean_token_accuracy": 0.9064120799303055, "num_tokens": 510205478.0, "sample_num_tokens": 9244.5, "step": 5879, "total_num_tokens": 510242456.0, "z_loss": 0.0005267068045213819 }, { "copy_logits_max": -1.0204041004180908, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.0625, "epoch": 1.2009701302016849, "gen_logits_max": 5.009025573730469, "gen_logits_mean": -14.735353469848633, "gen_logits_min": -27.31787872314453, "gen_logits_std": 2.9998860359191895, "gen_loss": 0.30270516872406006, "grad_norm": 0.35667434816545235, "learning_rate": 2.3502315789473685e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9962114095687866, "mean_gen_accuracy": 0.8722266256809235, "mean_token_accuracy": 0.9040580093860626, "num_tokens": 510491306.0, "sample_num_tokens": 8337.0, "step": 5880, "total_num_tokens": 510524654.0, "z_loss": 0.0005920885596424341 }, { "copy_logits_max": -0.8415381908416748, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.75, "epoch": 1.2011743681388818, "gen_logits_max": 3.9296491146087646, "gen_logits_mean": -15.994646072387695, "gen_logits_min": -28.049728393554688, "gen_logits_std": 3.041224956512451, "gen_loss": 0.2754974961280823, "grad_norm": 0.3913088909396954, "learning_rate": 2.350105263157895e-05, "loss": 0.2902, "mean_copy_accuracy": 0.996825635433197, "mean_gen_accuracy": 0.870607927441597, "mean_token_accuracy": 0.9049136787652969, "num_tokens": 510759502.0, "sample_num_tokens": 8143.5, "step": 5881, "total_num_tokens": 510792076.0, "z_loss": 0.0005173314712010324 }, { "copy_logits_max": -2.1488118171691895, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.3125, "epoch": 1.2013786060760787, "gen_logits_max": 5.053921699523926, "gen_logits_mean": -15.22988510131836, "gen_logits_min": -27.597633361816406, "gen_logits_std": 3.0078344345092773, "gen_loss": 0.28893333673477173, "grad_norm": 0.3987877216077714, "learning_rate": 2.349978947368421e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9954713433980942, "mean_gen_accuracy": 0.8689569234848022, "mean_token_accuracy": 0.8990472853183746, "num_tokens": 511033038.0, "sample_num_tokens": 8247.0, "step": 5882, "total_num_tokens": 511066026.0, "z_loss": 0.0005428474978543818 }, { "copy_logits_max": -1.504930019378662, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.1875, "epoch": 1.2015828440132754, "gen_logits_max": 4.028683662414551, "gen_logits_mean": -16.275548934936523, "gen_logits_min": -28.596923828125, "gen_logits_std": 3.0411481857299805, "gen_loss": 0.2845688462257385, "grad_norm": 0.3512134909378932, "learning_rate": 2.3498526315789475e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9961471110582352, "mean_gen_accuracy": 0.8775640428066254, "mean_token_accuracy": 0.9098171144723892, "num_tokens": 511318392.0, "sample_num_tokens": 8393.5, "step": 5883, "total_num_tokens": 511351966.0, "z_loss": 0.0005454320926219225 }, { "copy_logits_max": -2.477349042892456, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.8125, "epoch": 1.2017870819504723, "gen_logits_max": 3.8067965507507324, "gen_logits_mean": -16.665721893310547, "gen_logits_min": -28.66029167175293, "gen_logits_std": 3.0362730026245117, "gen_loss": 0.3233814835548401, "grad_norm": 0.3782766129851802, "learning_rate": 2.3497263157894736e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9957123398780823, "mean_gen_accuracy": 0.8656722158193588, "mean_token_accuracy": 0.8960466235876083, "num_tokens": 511585915.0, "sample_num_tokens": 8116.75, "step": 5884, "total_num_tokens": 511618382.0, "z_loss": 0.0006347774178721011 }, { "copy_logits_max": -2.619755744934082, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.75, "epoch": 1.201991319887669, "gen_logits_max": 4.675027847290039, "gen_logits_mean": -15.231069564819336, "gen_logits_min": -27.371902465820312, "gen_logits_std": 3.0100672245025635, "gen_loss": 0.31992238759994507, "grad_norm": 0.36922032664843984, "learning_rate": 2.3496e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9959985613822937, "mean_gen_accuracy": 0.8771445006132126, "mean_token_accuracy": 0.9053562879562378, "num_tokens": 511876321.0, "sample_num_tokens": 7714.25, "step": 5885, "total_num_tokens": 511907178.0, "z_loss": 0.0006193246226757765 }, { "copy_logits_max": -2.1599674224853516, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.75, "epoch": 1.202195557824866, "gen_logits_max": 5.457729816436768, "gen_logits_mean": -15.134018898010254, "gen_logits_min": -27.129480361938477, "gen_logits_std": 2.9748098850250244, "gen_loss": 0.3008466958999634, "grad_norm": 0.411167842959301, "learning_rate": 2.349473684210526e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9952032417058945, "mean_gen_accuracy": 0.8737606853246689, "mean_token_accuracy": 0.9002794623374939, "num_tokens": 512125733.0, "sample_num_tokens": 8404.75, "step": 5886, "total_num_tokens": 512159352.0, "z_loss": 0.0005762105574831367 }, { "copy_logits_max": -1.5099213123321533, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.5625, "epoch": 1.2023997957620627, "gen_logits_max": 4.193849086761475, "gen_logits_mean": -15.863502502441406, "gen_logits_min": -27.97052001953125, "gen_logits_std": 3.060854911804199, "gen_loss": 0.23519781231880188, "grad_norm": 0.38052496761757515, "learning_rate": 2.3493473684210525e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9955766797065735, "mean_gen_accuracy": 0.8853249102830887, "mean_token_accuracy": 0.9123043864965439, "num_tokens": 512396574.0, "sample_num_tokens": 9837.5, "step": 5887, "total_num_tokens": 512435924.0, "z_loss": 0.0005127025069668889 }, { "copy_logits_max": -1.3355046510696411, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.1875, "epoch": 1.2026040336992596, "gen_logits_max": 3.6799845695495605, "gen_logits_mean": -16.984840393066406, "gen_logits_min": -28.980688095092773, "gen_logits_std": 3.0393338203430176, "gen_loss": 0.32145291566848755, "grad_norm": 0.3533700990301244, "learning_rate": 2.3492210526315793e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9966594725847244, "mean_gen_accuracy": 0.8685425221920013, "mean_token_accuracy": 0.9038287848234177, "num_tokens": 512691108.0, "sample_num_tokens": 9275.0, "step": 5888, "total_num_tokens": 512728208.0, "z_loss": 0.00063158298144117 }, { "copy_logits_max": 0.565576434135437, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.6875, "epoch": 1.2028082716364565, "gen_logits_max": 5.285930633544922, "gen_logits_mean": -14.103650093078613, "gen_logits_min": -26.80807113647461, "gen_logits_std": 3.024837017059326, "gen_loss": 0.26986098289489746, "grad_norm": 0.3792490880086034, "learning_rate": 2.3490947368421054e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9962033480405807, "mean_gen_accuracy": 0.873033195734024, "mean_token_accuracy": 0.9011449217796326, "num_tokens": 512971239.0, "sample_num_tokens": 7653.75, "step": 5889, "total_num_tokens": 513001854.0, "z_loss": 0.0005861919489689171 }, { "copy_logits_max": 1.4767541885375977, "copy_logits_min": -750000000.0, "copy_num_tokens": 797.75, "epoch": 1.2030125095736532, "gen_logits_max": 4.012642860412598, "gen_logits_mean": -15.311284065246582, "gen_logits_min": -27.85680389404297, "gen_logits_std": 3.063642978668213, "gen_loss": 0.25938719511032104, "grad_norm": 0.4004993779915354, "learning_rate": 2.3489684210526318e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9969509840011597, "mean_gen_accuracy": 0.8702051043510437, "mean_token_accuracy": 0.9049497544765472, "num_tokens": 513250882.0, "sample_num_tokens": 11241.0, "step": 5890, "total_num_tokens": 513295846.0, "z_loss": 0.0005833376199007034 }, { "copy_logits_max": -3.2600982189178467, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.0, "epoch": 1.2032167475108502, "gen_logits_max": 4.719611644744873, "gen_logits_mean": -16.78837776184082, "gen_logits_min": -28.59055519104004, "gen_logits_std": 3.0329833030700684, "gen_loss": 0.29227423667907715, "grad_norm": 0.3808551524986616, "learning_rate": 2.348842105263158e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9968589544296265, "mean_gen_accuracy": 0.8726016879081726, "mean_token_accuracy": 0.901546910405159, "num_tokens": 513516810.0, "sample_num_tokens": 7013.5, "step": 5891, "total_num_tokens": 513544864.0, "z_loss": 0.000557903025764972 }, { "copy_logits_max": -1.8902268409729004, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.5, "epoch": 1.203420985448047, "gen_logits_max": 4.986174583435059, "gen_logits_mean": -15.489377975463867, "gen_logits_min": -27.376163482666016, "gen_logits_std": 3.0435450077056885, "gen_loss": 0.27409231662750244, "grad_norm": 0.39855280408034305, "learning_rate": 2.3487157894736843e-05, "loss": 0.2955, "mean_copy_accuracy": 0.996209979057312, "mean_gen_accuracy": 0.8683027774095535, "mean_token_accuracy": 0.9001981765031815, "num_tokens": 513771857.0, "sample_num_tokens": 9380.75, "step": 5892, "total_num_tokens": 513809380.0, "z_loss": 0.0004944645334035158 }, { "copy_logits_max": 0.6545524597167969, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.0, "epoch": 1.2036252233852438, "gen_logits_max": 5.277283668518066, "gen_logits_mean": -15.584466934204102, "gen_logits_min": -27.977825164794922, "gen_logits_std": 3.0669960975646973, "gen_loss": 0.26642119884490967, "grad_norm": 0.3726435270319264, "learning_rate": 2.3485894736842104e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9964272826910019, "mean_gen_accuracy": 0.8785650879144669, "mean_token_accuracy": 0.9071093797683716, "num_tokens": 514029304.0, "sample_num_tokens": 7780.0, "step": 5893, "total_num_tokens": 514060424.0, "z_loss": 0.0004675182281062007 }, { "copy_logits_max": 3.560009002685547, "copy_logits_min": -750000000.0, "copy_num_tokens": 725.9375, "epoch": 1.2038294613224407, "gen_logits_max": 5.519045829772949, "gen_logits_mean": -13.247304916381836, "gen_logits_min": -25.562271118164062, "gen_logits_std": 3.0027661323547363, "gen_loss": 0.25432926416397095, "grad_norm": 0.4120728812693984, "learning_rate": 2.348463157894737e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9962543845176697, "mean_gen_accuracy": 0.8765302151441574, "mean_token_accuracy": 0.9017047435045242, "num_tokens": 514297202.0, "sample_num_tokens": 10706.5, "step": 5894, "total_num_tokens": 514340028.0, "z_loss": 0.0005068992613814771 }, { "copy_logits_max": -1.9810500144958496, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.5, "epoch": 1.2040336992596374, "gen_logits_max": 3.951249837875366, "gen_logits_mean": -16.799755096435547, "gen_logits_min": -29.235557556152344, "gen_logits_std": 3.0858006477355957, "gen_loss": 0.2684129476547241, "grad_norm": 0.3635463753922223, "learning_rate": 2.348336842105263e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9959607720375061, "mean_gen_accuracy": 0.8782535195350647, "mean_token_accuracy": 0.9045488089323044, "num_tokens": 514573042.0, "sample_num_tokens": 7950.0, "step": 5895, "total_num_tokens": 514604842.0, "z_loss": 0.0004588657757267356 }, { "copy_logits_max": -1.323089361190796, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.4375, "epoch": 1.2042379371968344, "gen_logits_max": 4.756621837615967, "gen_logits_mean": -16.37193489074707, "gen_logits_min": -28.544506072998047, "gen_logits_std": 3.0701053142547607, "gen_loss": 0.26399746537208557, "grad_norm": 0.40212300897084124, "learning_rate": 2.3482105263157897e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9963364452123642, "mean_gen_accuracy": 0.8779954463243484, "mean_token_accuracy": 0.9086444824934006, "num_tokens": 514873999.0, "sample_num_tokens": 8723.75, "step": 5896, "total_num_tokens": 514908894.0, "z_loss": 0.00047543461550958455 }, { "copy_logits_max": -1.240843653678894, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.8125, "epoch": 1.204442175134031, "gen_logits_max": 6.231517314910889, "gen_logits_mean": -14.193646430969238, "gen_logits_min": -26.192031860351562, "gen_logits_std": 2.969841957092285, "gen_loss": 0.3020533621311188, "grad_norm": 0.3830329560972697, "learning_rate": 2.3480842105263158e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9954207688570023, "mean_gen_accuracy": 0.8810674846172333, "mean_token_accuracy": 0.9055981934070587, "num_tokens": 515125505.0, "sample_num_tokens": 7759.25, "step": 5897, "total_num_tokens": 515156542.0, "z_loss": 0.0005359207279980183 }, { "copy_logits_max": -0.7270487546920776, "copy_logits_min": -750000000.0, "copy_num_tokens": 235.9375, "epoch": 1.204646413071228, "gen_logits_max": 6.860358238220215, "gen_logits_mean": -14.758549690246582, "gen_logits_min": -26.805980682373047, "gen_logits_std": 3.024614095687866, "gen_loss": 0.2591039836406708, "grad_norm": 0.376978990319773, "learning_rate": 2.3479578947368423e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9950000792741776, "mean_gen_accuracy": 0.8788139373064041, "mean_token_accuracy": 0.9056533724069595, "num_tokens": 515377773.0, "sample_num_tokens": 6436.25, "step": 5898, "total_num_tokens": 515403518.0, "z_loss": 0.0004807639052160084 }, { "copy_logits_max": -0.3036307692527771, "copy_logits_min": -750000064.0, "copy_num_tokens": 511.25, "epoch": 1.204850651008425, "gen_logits_max": 5.190759658813477, "gen_logits_mean": -14.2037353515625, "gen_logits_min": -26.64717674255371, "gen_logits_std": 3.0646824836730957, "gen_loss": 0.2701219916343689, "grad_norm": 0.4240777091379086, "learning_rate": 2.3478315789473683e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9966247081756592, "mean_gen_accuracy": 0.8749721199274063, "mean_token_accuracy": 0.9079526215791702, "num_tokens": 515652821.0, "sample_num_tokens": 8021.25, "step": 5899, "total_num_tokens": 515684906.0, "z_loss": 0.0004895677557215095 }, { "copy_logits_max": -3.2869486808776855, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.625, "epoch": 1.2050548889456216, "gen_logits_max": 4.084024429321289, "gen_logits_mean": -16.077632904052734, "gen_logits_min": -28.335582733154297, "gen_logits_std": 3.0573348999023438, "gen_loss": 0.2857602834701538, "grad_norm": 0.36879653582980154, "learning_rate": 2.3477052631578948e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9969868659973145, "mean_gen_accuracy": 0.8737421780824661, "mean_token_accuracy": 0.9047786146402359, "num_tokens": 515922976.0, "sample_num_tokens": 8604.5, "step": 5900, "total_num_tokens": 515957394.0, "z_loss": 0.0004645195440389216 }, { "copy_logits_max": 0.35266709327697754, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.1875, "epoch": 1.2052591268828186, "gen_logits_max": 7.045876502990723, "gen_logits_mean": -12.85543441772461, "gen_logits_min": -25.8306941986084, "gen_logits_std": 3.036813497543335, "gen_loss": 0.25608110427856445, "grad_norm": 0.4113958966394307, "learning_rate": 2.3475789473684212e-05, "loss": 0.283, "mean_copy_accuracy": 0.9956018626689911, "mean_gen_accuracy": 0.8778491914272308, "mean_token_accuracy": 0.9042983055114746, "num_tokens": 516174880.0, "sample_num_tokens": 7398.5, "step": 5901, "total_num_tokens": 516204474.0, "z_loss": 0.00047073536552488804 }, { "copy_logits_max": -2.193399429321289, "copy_logits_min": -625000064.0, "copy_num_tokens": 493.8125, "epoch": 1.2054633648200153, "gen_logits_max": 4.875914573669434, "gen_logits_mean": -15.470121383666992, "gen_logits_min": -28.036170959472656, "gen_logits_std": 3.072922706604004, "gen_loss": 0.2555811405181885, "grad_norm": 0.362953738963177, "learning_rate": 2.3474526315789473e-05, "loss": 0.2521, "mean_copy_accuracy": 0.9957668632268906, "mean_gen_accuracy": 0.8843951374292374, "mean_token_accuracy": 0.9137633293867111, "num_tokens": 516450463.0, "sample_num_tokens": 7937.75, "step": 5902, "total_num_tokens": 516482214.0, "z_loss": 0.00048320702626369894 }, { "copy_logits_max": -2.5166568756103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.75, "epoch": 1.2056676027572122, "gen_logits_max": 4.717296600341797, "gen_logits_mean": -15.437189102172852, "gen_logits_min": -28.36724090576172, "gen_logits_std": 3.0662801265716553, "gen_loss": 0.2669166922569275, "grad_norm": 0.3720458960683284, "learning_rate": 2.3473263157894737e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9970085024833679, "mean_gen_accuracy": 0.866146132349968, "mean_token_accuracy": 0.9048256725072861, "num_tokens": 516738407.0, "sample_num_tokens": 8662.25, "step": 5903, "total_num_tokens": 516773056.0, "z_loss": 0.0005681111942976713 }, { "copy_logits_max": -1.057389497756958, "copy_logits_min": -687500032.0, "copy_num_tokens": 652.3125, "epoch": 1.205871840694409, "gen_logits_max": 5.058049201965332, "gen_logits_mean": -14.219217300415039, "gen_logits_min": -27.40479850769043, "gen_logits_std": 3.0387635231018066, "gen_loss": 0.24489983916282654, "grad_norm": 0.37837333154329705, "learning_rate": 2.3472e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9972742199897766, "mean_gen_accuracy": 0.8706851750612259, "mean_token_accuracy": 0.9087205976247787, "num_tokens": 517019060.0, "sample_num_tokens": 9148.5, "step": 5904, "total_num_tokens": 517055654.0, "z_loss": 0.0005673514679074287 }, { "copy_logits_max": -1.3308217525482178, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.0625, "epoch": 1.2060760786316058, "gen_logits_max": 4.338020324707031, "gen_logits_mean": -15.690521240234375, "gen_logits_min": -28.258358001708984, "gen_logits_std": 3.046964168548584, "gen_loss": 0.24812650680541992, "grad_norm": 0.3747515045519073, "learning_rate": 2.3470736842105266e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9973770529031754, "mean_gen_accuracy": 0.8741616755723953, "mean_token_accuracy": 0.9079898595809937, "num_tokens": 517283812.0, "sample_num_tokens": 8639.5, "step": 5905, "total_num_tokens": 517318370.0, "z_loss": 0.0005447042640298605 }, { "copy_logits_max": -1.1796016693115234, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.875, "epoch": 1.2062803165688027, "gen_logits_max": 5.193244934082031, "gen_logits_mean": -14.266427040100098, "gen_logits_min": -27.22723388671875, "gen_logits_std": 3.0275206565856934, "gen_loss": 0.26460468769073486, "grad_norm": 0.37338338426463835, "learning_rate": 2.3469473684210527e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9963140338659286, "mean_gen_accuracy": 0.876706063747406, "mean_token_accuracy": 0.907551109790802, "num_tokens": 517539328.0, "sample_num_tokens": 8652.0, "step": 5906, "total_num_tokens": 517573936.0, "z_loss": 0.00047410259139724076 }, { "copy_logits_max": -2.6357593536376953, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.9375, "epoch": 1.2064845545059995, "gen_logits_max": 4.1065287590026855, "gen_logits_mean": -16.61553192138672, "gen_logits_min": -28.909944534301758, "gen_logits_std": 3.052852153778076, "gen_loss": 0.3177216053009033, "grad_norm": 0.3493395370534973, "learning_rate": 2.346821052631579e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9956071674823761, "mean_gen_accuracy": 0.8708002120256424, "mean_token_accuracy": 0.9007854163646698, "num_tokens": 517807417.0, "sample_num_tokens": 7942.25, "step": 5907, "total_num_tokens": 517839186.0, "z_loss": 0.0006241953233256936 }, { "copy_logits_max": -0.30575430393218994, "copy_logits_min": -750000064.0, "copy_num_tokens": 553.5, "epoch": 1.2066887924431964, "gen_logits_max": 5.612939357757568, "gen_logits_mean": -13.476314544677734, "gen_logits_min": -26.620954513549805, "gen_logits_std": 3.004924774169922, "gen_loss": 0.2758499085903168, "grad_norm": 0.41250894579280417, "learning_rate": 2.3466947368421052e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9959801882505417, "mean_gen_accuracy": 0.8786101043224335, "mean_token_accuracy": 0.9072339832782745, "num_tokens": 518087212.0, "sample_num_tokens": 8637.5, "step": 5908, "total_num_tokens": 518121762.0, "z_loss": 0.0005215514684095979 }, { "copy_logits_max": 0.3543827533721924, "copy_logits_min": -750000000.0, "copy_num_tokens": 767.5, "epoch": 1.206893030380393, "gen_logits_max": 4.20516300201416, "gen_logits_mean": -14.878883361816406, "gen_logits_min": -27.27779769897461, "gen_logits_std": 3.0706419944763184, "gen_loss": 0.24279555678367615, "grad_norm": 0.35469605409922517, "learning_rate": 2.3465684210526316e-05, "loss": 0.268, "mean_copy_accuracy": 0.9964344948530197, "mean_gen_accuracy": 0.8771986216306686, "mean_token_accuracy": 0.9094570130109787, "num_tokens": 518370034.0, "sample_num_tokens": 10160.0, "step": 5909, "total_num_tokens": 518410674.0, "z_loss": 0.00045423544361256063 }, { "copy_logits_max": -3.307267189025879, "copy_logits_min": -750000000.0, "copy_num_tokens": 225.3125, "epoch": 1.20709726831759, "gen_logits_max": 6.069178581237793, "gen_logits_mean": -15.430803298950195, "gen_logits_min": -27.288192749023438, "gen_logits_std": 3.0130162239074707, "gen_loss": 0.31779277324676514, "grad_norm": 0.4254092210148425, "learning_rate": 2.3464421052631577e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9951750636100769, "mean_gen_accuracy": 0.8763678669929504, "mean_token_accuracy": 0.900178462266922, "num_tokens": 518630020.0, "sample_num_tokens": 7365.5, "step": 5910, "total_num_tokens": 518659482.0, "z_loss": 0.0005849773297086358 }, { "copy_logits_max": 1.1565585136413574, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.25, "epoch": 1.2073015062547867, "gen_logits_max": 4.752323150634766, "gen_logits_mean": -14.741835594177246, "gen_logits_min": -26.818010330200195, "gen_logits_std": 3.0581679344177246, "gen_loss": 0.2676721215248108, "grad_norm": 0.3712051661354722, "learning_rate": 2.346315789473684e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9945406317710876, "mean_gen_accuracy": 0.879460796713829, "mean_token_accuracy": 0.9069160670042038, "num_tokens": 518882775.0, "sample_num_tokens": 7954.75, "step": 5911, "total_num_tokens": 518914594.0, "z_loss": 0.0005198096623644233 }, { "copy_logits_max": 2.15472674369812, "copy_logits_min": -750000064.0, "copy_num_tokens": 533.25, "epoch": 1.2075057441919836, "gen_logits_max": 4.625500679016113, "gen_logits_mean": -14.870849609375, "gen_logits_min": -27.106653213500977, "gen_logits_std": 3.0676448345184326, "gen_loss": 0.2510230541229248, "grad_norm": 0.4165303616875941, "learning_rate": 2.3461894736842106e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9956318885087967, "mean_gen_accuracy": 0.8859031051397324, "mean_token_accuracy": 0.912227064371109, "num_tokens": 519154760.0, "sample_num_tokens": 9104.5, "step": 5912, "total_num_tokens": 519191178.0, "z_loss": 0.0005055562360212207 }, { "copy_logits_max": 0.19127333164215088, "copy_logits_min": -750000064.0, "copy_num_tokens": 352.625, "epoch": 1.2077099821291806, "gen_logits_max": 4.682687759399414, "gen_logits_mean": -15.799503326416016, "gen_logits_min": -27.954242706298828, "gen_logits_std": 3.05148983001709, "gen_loss": 0.3374388515949249, "grad_norm": 0.3570956070956017, "learning_rate": 2.346063157894737e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9950689673423767, "mean_gen_accuracy": 0.8789350390434265, "mean_token_accuracy": 0.9073009788990021, "num_tokens": 519421336.0, "sample_num_tokens": 7045.5, "step": 5913, "total_num_tokens": 519449518.0, "z_loss": 0.0006546540535055101 }, { "copy_logits_max": -1.3313190937042236, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.3125, "epoch": 1.2079142200663773, "gen_logits_max": 4.049047470092773, "gen_logits_mean": -16.614404678344727, "gen_logits_min": -28.623767852783203, "gen_logits_std": 3.088794708251953, "gen_loss": 0.24439164996147156, "grad_norm": 0.39105476865552974, "learning_rate": 2.3459368421052635e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9971061795949936, "mean_gen_accuracy": 0.8779237866401672, "mean_token_accuracy": 0.9109455794095993, "num_tokens": 519709691.0, "sample_num_tokens": 8376.75, "step": 5914, "total_num_tokens": 519743198.0, "z_loss": 0.0005187917849980295 }, { "copy_logits_max": 4.007688522338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 702.625, "epoch": 1.2081184580035742, "gen_logits_max": 5.514339447021484, "gen_logits_mean": -13.202083587646484, "gen_logits_min": -25.527759552001953, "gen_logits_std": 3.0376663208007812, "gen_loss": 0.271466463804245, "grad_norm": 0.3679017475318617, "learning_rate": 2.3458105263157895e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9970104396343231, "mean_gen_accuracy": 0.8808996975421906, "mean_token_accuracy": 0.9115773439407349, "num_tokens": 519990186.0, "sample_num_tokens": 10060.0, "step": 5915, "total_num_tokens": 520030426.0, "z_loss": 0.0005066072917543352 }, { "copy_logits_max": 2.860957622528076, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.1875, "epoch": 1.208322695940771, "gen_logits_max": 4.38427734375, "gen_logits_mean": -14.842677116394043, "gen_logits_min": -26.975309371948242, "gen_logits_std": 3.0734329223632812, "gen_loss": 0.26482075452804565, "grad_norm": 0.3631746000121617, "learning_rate": 2.345684210526316e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9961154758930206, "mean_gen_accuracy": 0.8756573498249054, "mean_token_accuracy": 0.9065288454294205, "num_tokens": 520245407.0, "sample_num_tokens": 9010.75, "step": 5916, "total_num_tokens": 520281450.0, "z_loss": 0.0004998089279979467 }, { "copy_logits_max": 0.1948249489068985, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.3125, "epoch": 1.2085269338779678, "gen_logits_max": 5.23660945892334, "gen_logits_mean": -14.032377243041992, "gen_logits_min": -26.700773239135742, "gen_logits_std": 3.0416347980499268, "gen_loss": 0.26720061898231506, "grad_norm": 0.39708547340717826, "learning_rate": 2.345557894736842e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9961966723203659, "mean_gen_accuracy": 0.8746462315320969, "mean_token_accuracy": 0.9043641090393066, "num_tokens": 520520764.0, "sample_num_tokens": 8305.0, "step": 5917, "total_num_tokens": 520553984.0, "z_loss": 0.0005264633800834417 }, { "copy_logits_max": 0.022649645805358887, "copy_logits_min": -750000064.0, "copy_num_tokens": 387.8125, "epoch": 1.2087311718151648, "gen_logits_max": 5.361999988555908, "gen_logits_mean": -15.51010799407959, "gen_logits_min": -27.44843292236328, "gen_logits_std": 3.061230182647705, "gen_loss": 0.2786067724227905, "grad_norm": 0.39481299724884544, "learning_rate": 2.3454315789473685e-05, "loss": 0.3023, "mean_copy_accuracy": 0.9950798749923706, "mean_gen_accuracy": 0.8714409470558167, "mean_token_accuracy": 0.8978451192378998, "num_tokens": 520791072.0, "sample_num_tokens": 8913.0, "step": 5918, "total_num_tokens": 520826724.0, "z_loss": 0.0004833115090150386 }, { "copy_logits_max": 4.200950622558594, "copy_logits_min": -750000000.0, "copy_num_tokens": 766.4375, "epoch": 1.2089354097523615, "gen_logits_max": 4.875587463378906, "gen_logits_mean": -13.58370590209961, "gen_logits_min": -26.331668853759766, "gen_logits_std": 3.0747604370117188, "gen_loss": 0.24050454795360565, "grad_norm": 0.3377849714118729, "learning_rate": 2.3453052631578946e-05, "loss": 0.2492, "mean_copy_accuracy": 0.9972822070121765, "mean_gen_accuracy": 0.8803080916404724, "mean_token_accuracy": 0.9149175733327866, "num_tokens": 521079296.0, "sample_num_tokens": 10141.0, "step": 5919, "total_num_tokens": 521119860.0, "z_loss": 0.0004682737053371966 }, { "copy_logits_max": 2.251671075820923, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.9375, "epoch": 1.2091396476895584, "gen_logits_max": 5.094879150390625, "gen_logits_mean": -14.697628021240234, "gen_logits_min": -27.47256088256836, "gen_logits_std": 3.0616002082824707, "gen_loss": 0.30149051547050476, "grad_norm": 0.36933277206120474, "learning_rate": 2.3451789473684214e-05, "loss": 0.2755, "mean_copy_accuracy": 0.996335431933403, "mean_gen_accuracy": 0.8755444586277008, "mean_token_accuracy": 0.906761959195137, "num_tokens": 521368339.0, "sample_num_tokens": 8687.75, "step": 5920, "total_num_tokens": 521403090.0, "z_loss": 0.0006190422573126853 }, { "copy_logits_max": 4.593203067779541, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.3125, "epoch": 1.209343885626755, "gen_logits_max": 5.988069534301758, "gen_logits_mean": -12.734829902648926, "gen_logits_min": -24.955997467041016, "gen_logits_std": 3.0319042205810547, "gen_loss": 0.2539761960506439, "grad_norm": 0.39089621373065175, "learning_rate": 2.3450526315789475e-05, "loss": 0.2636, "mean_copy_accuracy": 0.99635249376297, "mean_gen_accuracy": 0.8772861063480377, "mean_token_accuracy": 0.9101068377494812, "num_tokens": 521653734.0, "sample_num_tokens": 9787.0, "step": 5921, "total_num_tokens": 521692882.0, "z_loss": 0.00043830706272274256 }, { "copy_logits_max": -0.3741191029548645, "copy_logits_min": -687500032.0, "copy_num_tokens": 407.5, "epoch": 1.209548123563952, "gen_logits_max": 5.060024261474609, "gen_logits_mean": -14.415441513061523, "gen_logits_min": -26.764217376708984, "gen_logits_std": 3.0289599895477295, "gen_loss": 0.33173513412475586, "grad_norm": 0.34392302912318684, "learning_rate": 2.344926315789474e-05, "loss": 0.2794, "mean_copy_accuracy": 0.996712937951088, "mean_gen_accuracy": 0.8728422820568085, "mean_token_accuracy": 0.9062458574771881, "num_tokens": 521954470.0, "sample_num_tokens": 7899.0, "step": 5922, "total_num_tokens": 521986066.0, "z_loss": 0.0006138360477052629 }, { "copy_logits_max": 3.8991570472717285, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.4375, "epoch": 1.209752361501149, "gen_logits_max": 4.893006324768066, "gen_logits_mean": -14.820634841918945, "gen_logits_min": -27.156957626342773, "gen_logits_std": 3.0715720653533936, "gen_loss": 0.2556130886077881, "grad_norm": 0.40977034174615395, "learning_rate": 2.3448e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9959815442562103, "mean_gen_accuracy": 0.871952623128891, "mean_token_accuracy": 0.9047942012548447, "num_tokens": 522228713.0, "sample_num_tokens": 9493.25, "step": 5923, "total_num_tokens": 522266686.0, "z_loss": 0.0005672331899404526 }, { "copy_logits_max": 4.106688022613525, "copy_logits_min": -750000064.0, "copy_num_tokens": 521.9375, "epoch": 1.2099565994383457, "gen_logits_max": 6.342307090759277, "gen_logits_mean": -12.507211685180664, "gen_logits_min": -24.818449020385742, "gen_logits_std": 3.0147266387939453, "gen_loss": 0.2959892153739929, "grad_norm": 0.40076331799703085, "learning_rate": 2.3446736842105264e-05, "loss": 0.293, "mean_copy_accuracy": 0.9966126978397369, "mean_gen_accuracy": 0.8713340908288956, "mean_token_accuracy": 0.8996234834194183, "num_tokens": 522487736.0, "sample_num_tokens": 9150.0, "step": 5924, "total_num_tokens": 522524336.0, "z_loss": 0.0005684271454811096 }, { "copy_logits_max": 0.9767284393310547, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.125, "epoch": 1.2101608373755426, "gen_logits_max": 5.048718452453613, "gen_logits_mean": -13.477611541748047, "gen_logits_min": -26.022052764892578, "gen_logits_std": 3.044529914855957, "gen_loss": 0.2543056011199951, "grad_norm": 0.4059260574370508, "learning_rate": 2.3445473684210525e-05, "loss": 0.2843, "mean_copy_accuracy": 0.996933251619339, "mean_gen_accuracy": 0.8719963282346725, "mean_token_accuracy": 0.9047699272632599, "num_tokens": 522754626.0, "sample_num_tokens": 8596.5, "step": 5925, "total_num_tokens": 522789012.0, "z_loss": 0.0004649974871426821 }, { "copy_logits_max": -1.8167389631271362, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.8125, "epoch": 1.2103650753127393, "gen_logits_max": 4.4186110496521, "gen_logits_mean": -15.580784797668457, "gen_logits_min": -27.72746467590332, "gen_logits_std": 3.0688669681549072, "gen_loss": 0.26408880949020386, "grad_norm": 0.36741072971277655, "learning_rate": 2.344421052631579e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9950329661369324, "mean_gen_accuracy": 0.8816881030797958, "mean_token_accuracy": 0.9073410034179688, "num_tokens": 523020299.0, "sample_num_tokens": 8330.25, "step": 5926, "total_num_tokens": 523053620.0, "z_loss": 0.0004705114697571844 }, { "copy_logits_max": -1.4327104091644287, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.875, "epoch": 1.2105693132499362, "gen_logits_max": 4.626657485961914, "gen_logits_mean": -15.448738098144531, "gen_logits_min": -27.94499969482422, "gen_logits_std": 3.0554006099700928, "gen_loss": 0.2953563928604126, "grad_norm": 0.4450221696749748, "learning_rate": 2.344294736842105e-05, "loss": 0.2995, "mean_copy_accuracy": 0.994206577539444, "mean_gen_accuracy": 0.8692377805709839, "mean_token_accuracy": 0.89838507771492, "num_tokens": 523277302.0, "sample_num_tokens": 8807.5, "step": 5927, "total_num_tokens": 523312532.0, "z_loss": 0.0005804019165225327 }, { "copy_logits_max": -0.6819347143173218, "copy_logits_min": -687500032.0, "copy_num_tokens": 485.875, "epoch": 1.210773551187133, "gen_logits_max": 5.074855327606201, "gen_logits_mean": -15.138017654418945, "gen_logits_min": -27.665386199951172, "gen_logits_std": 3.0616254806518555, "gen_loss": 0.2705720067024231, "grad_norm": 0.38480169811020637, "learning_rate": 2.3441684210526318e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9971758425235748, "mean_gen_accuracy": 0.8726377487182617, "mean_token_accuracy": 0.9043870717287064, "num_tokens": 523544965.0, "sample_num_tokens": 8926.25, "step": 5928, "total_num_tokens": 523580670.0, "z_loss": 0.00048322879592888057 }, { "copy_logits_max": 2.7914390563964844, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.9375, "epoch": 1.2109777891243299, "gen_logits_max": 5.803084373474121, "gen_logits_mean": -13.761241912841797, "gen_logits_min": -26.28256607055664, "gen_logits_std": 3.023435592651367, "gen_loss": 0.30468201637268066, "grad_norm": 0.3739714021638762, "learning_rate": 2.3440421052631582e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9963039755821228, "mean_gen_accuracy": 0.8824156373739243, "mean_token_accuracy": 0.9074372053146362, "num_tokens": 523800205.0, "sample_num_tokens": 7448.75, "step": 5929, "total_num_tokens": 523830000.0, "z_loss": 0.0006126005901023746 }, { "copy_logits_max": 0.47964000701904297, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.0, "epoch": 1.2111820270615268, "gen_logits_max": 5.002076148986816, "gen_logits_mean": -14.529487609863281, "gen_logits_min": -26.847871780395508, "gen_logits_std": 3.0289764404296875, "gen_loss": 0.2782503366470337, "grad_norm": 0.36091527610425456, "learning_rate": 2.3439157894736843e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9966463148593903, "mean_gen_accuracy": 0.8722682893276215, "mean_token_accuracy": 0.9047814905643463, "num_tokens": 524065026.0, "sample_num_tokens": 7281.0, "step": 5930, "total_num_tokens": 524094150.0, "z_loss": 0.0005552544025704265 }, { "copy_logits_max": -0.15657639503479004, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.375, "epoch": 1.2113862649987235, "gen_logits_max": 4.1155924797058105, "gen_logits_mean": -16.279296875, "gen_logits_min": -28.546958923339844, "gen_logits_std": 3.0688815116882324, "gen_loss": 0.2963261306285858, "grad_norm": 0.3974412892226565, "learning_rate": 2.3437894736842108e-05, "loss": 0.2798, "mean_copy_accuracy": 0.995215505361557, "mean_gen_accuracy": 0.8769758194684982, "mean_token_accuracy": 0.9064462929964066, "num_tokens": 524337659.0, "sample_num_tokens": 8851.25, "step": 5931, "total_num_tokens": 524373064.0, "z_loss": 0.0005868637817911804 }, { "copy_logits_max": -0.9328132271766663, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.625, "epoch": 1.2115905029359204, "gen_logits_max": 4.232908248901367, "gen_logits_mean": -15.909280776977539, "gen_logits_min": -28.28692626953125, "gen_logits_std": 3.0649266242980957, "gen_loss": 0.2791208028793335, "grad_norm": 0.4112392405797536, "learning_rate": 2.343663157894737e-05, "loss": 0.3043, "mean_copy_accuracy": 0.9949415028095245, "mean_gen_accuracy": 0.8676742762327194, "mean_token_accuracy": 0.8967840522527695, "num_tokens": 524590865.0, "sample_num_tokens": 9068.25, "step": 5932, "total_num_tokens": 524627138.0, "z_loss": 0.0006613960722461343 }, { "copy_logits_max": 5.877172470092773, "copy_logits_min": -750000064.0, "copy_num_tokens": 649.6875, "epoch": 1.2117947408731171, "gen_logits_max": 5.954443454742432, "gen_logits_mean": -13.49644660949707, "gen_logits_min": -26.042076110839844, "gen_logits_std": 3.047513723373413, "gen_loss": 0.267927348613739, "grad_norm": 0.37154175755376545, "learning_rate": 2.3435368421052633e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9964012950658798, "mean_gen_accuracy": 0.8711474984884262, "mean_token_accuracy": 0.9007823467254639, "num_tokens": 524867158.0, "sample_num_tokens": 9565.0, "step": 5933, "total_num_tokens": 524905418.0, "z_loss": 0.0006805878365412354 }, { "copy_logits_max": 1.7237956523895264, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.125, "epoch": 1.211998978810314, "gen_logits_max": 5.0827789306640625, "gen_logits_mean": -15.131425857543945, "gen_logits_min": -27.571857452392578, "gen_logits_std": 3.08927583694458, "gen_loss": 0.2569132447242737, "grad_norm": 0.398422172617992, "learning_rate": 2.3434105263157894e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9957422018051147, "mean_gen_accuracy": 0.8794472217559814, "mean_token_accuracy": 0.9047211110591888, "num_tokens": 525154349.0, "sample_num_tokens": 9351.25, "step": 5934, "total_num_tokens": 525191754.0, "z_loss": 0.0005770379211753607 }, { "copy_logits_max": -2.3237414360046387, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.5625, "epoch": 1.2122032167475107, "gen_logits_max": 5.3834452629089355, "gen_logits_mean": -15.072568893432617, "gen_logits_min": -27.28965950012207, "gen_logits_std": 3.03885555267334, "gen_loss": 0.29202646017074585, "grad_norm": 0.35064907710807375, "learning_rate": 2.3432842105263158e-05, "loss": 0.2715, "mean_copy_accuracy": 0.996665820479393, "mean_gen_accuracy": 0.8748108595609665, "mean_token_accuracy": 0.9075762033462524, "num_tokens": 525460250.0, "sample_num_tokens": 7514.5, "step": 5935, "total_num_tokens": 525490308.0, "z_loss": 0.0005514439544640481 }, { "copy_logits_max": -3.1595659255981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.6875, "epoch": 1.2124074546847077, "gen_logits_max": 4.001889228820801, "gen_logits_mean": -15.910234451293945, "gen_logits_min": -28.292966842651367, "gen_logits_std": 3.0789999961853027, "gen_loss": 0.2868192195892334, "grad_norm": 0.35971346348973654, "learning_rate": 2.343157894736842e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9956150054931641, "mean_gen_accuracy": 0.879620835185051, "mean_token_accuracy": 0.9066813290119171, "num_tokens": 525713487.0, "sample_num_tokens": 10203.75, "step": 5936, "total_num_tokens": 525754302.0, "z_loss": 0.0005492848576977849 }, { "copy_logits_max": -3.1680634021759033, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.875, "epoch": 1.2126116926219046, "gen_logits_max": 4.914220809936523, "gen_logits_mean": -15.872303009033203, "gen_logits_min": -28.172245025634766, "gen_logits_std": 3.0819129943847656, "gen_loss": 0.2871520519256592, "grad_norm": 0.4712415737310782, "learning_rate": 2.3430315789473687e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9953766465187073, "mean_gen_accuracy": 0.8721815049648285, "mean_token_accuracy": 0.9040434807538986, "num_tokens": 525985326.0, "sample_num_tokens": 7853.5, "step": 5937, "total_num_tokens": 526016740.0, "z_loss": 0.0005676323780789971 }, { "copy_logits_max": -1.6851948499679565, "copy_logits_min": -750000064.0, "copy_num_tokens": 291.0, "epoch": 1.2128159305591013, "gen_logits_max": 5.225717544555664, "gen_logits_mean": -15.056032180786133, "gen_logits_min": -26.998783111572266, "gen_logits_std": 3.0273194313049316, "gen_loss": 0.2963505983352661, "grad_norm": 0.3681019941924054, "learning_rate": 2.3429052631578948e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9951881468296051, "mean_gen_accuracy": 0.8786961883306503, "mean_token_accuracy": 0.9053702354431152, "num_tokens": 526232368.0, "sample_num_tokens": 7199.0, "step": 5938, "total_num_tokens": 526261164.0, "z_loss": 0.0005569289205595851 }, { "copy_logits_max": -3.5105326175689697, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.0, "epoch": 1.2130201684962982, "gen_logits_max": 4.804257392883301, "gen_logits_mean": -15.243528366088867, "gen_logits_min": -27.50229263305664, "gen_logits_std": 3.063709020614624, "gen_loss": 0.28155630826950073, "grad_norm": 0.3892270398400046, "learning_rate": 2.3427789473684212e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9959952980279922, "mean_gen_accuracy": 0.8762252479791641, "mean_token_accuracy": 0.906355232000351, "num_tokens": 526487312.0, "sample_num_tokens": 8213.5, "step": 5939, "total_num_tokens": 526520166.0, "z_loss": 0.0005585388862527907 }, { "copy_logits_max": -3.2747762203216553, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.9375, "epoch": 1.213224406433495, "gen_logits_max": 4.296382427215576, "gen_logits_mean": -16.394189834594727, "gen_logits_min": -28.11724090576172, "gen_logits_std": 3.0828135013580322, "gen_loss": 0.2613570988178253, "grad_norm": 0.3350495729690528, "learning_rate": 2.3426526315789473e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9970762431621552, "mean_gen_accuracy": 0.8817940503358841, "mean_token_accuracy": 0.9117723554372787, "num_tokens": 526784202.0, "sample_num_tokens": 8366.0, "step": 5940, "total_num_tokens": 526817666.0, "z_loss": 0.00048462380073033273 }, { "copy_logits_max": -4.246119022369385, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.5, "epoch": 1.2134286443706919, "gen_logits_max": 4.269904136657715, "gen_logits_mean": -15.222155570983887, "gen_logits_min": -27.412639617919922, "gen_logits_std": 3.0631842613220215, "gen_loss": 0.2581101953983307, "grad_norm": 0.36778781500435026, "learning_rate": 2.3425263157894737e-05, "loss": 0.269, "mean_copy_accuracy": 0.9962262809276581, "mean_gen_accuracy": 0.878040999174118, "mean_token_accuracy": 0.9076305776834488, "num_tokens": 527048480.0, "sample_num_tokens": 8496.5, "step": 5941, "total_num_tokens": 527082466.0, "z_loss": 0.0004530260630417615 }, { "copy_logits_max": -3.4256556034088135, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.3125, "epoch": 1.2136328823078886, "gen_logits_max": 4.554893970489502, "gen_logits_mean": -16.218334197998047, "gen_logits_min": -28.440067291259766, "gen_logits_std": 3.0495879650115967, "gen_loss": 0.31921517848968506, "grad_norm": 0.376527731957434, "learning_rate": 2.3424e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9963618218898773, "mean_gen_accuracy": 0.878997802734375, "mean_token_accuracy": 0.9091355353593826, "num_tokens": 527320818.0, "sample_num_tokens": 7948.5, "step": 5942, "total_num_tokens": 527352612.0, "z_loss": 0.0005923899589106441 }, { "copy_logits_max": -1.3090720176696777, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.5625, "epoch": 1.2138371202450855, "gen_logits_max": 4.283801078796387, "gen_logits_mean": -15.780564308166504, "gen_logits_min": -27.62986183166504, "gen_logits_std": 3.0633773803710938, "gen_loss": 0.297573983669281, "grad_norm": 0.3828097213477981, "learning_rate": 2.3422736842105262e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9960860460996628, "mean_gen_accuracy": 0.8752300143241882, "mean_token_accuracy": 0.9025016278028488, "num_tokens": 527584751.0, "sample_num_tokens": 8232.75, "step": 5943, "total_num_tokens": 527617682.0, "z_loss": 0.0005189460935071111 }, { "copy_logits_max": -4.947566986083984, "copy_logits_min": -687500032.0, "copy_num_tokens": 369.375, "epoch": 1.2140413581822824, "gen_logits_max": 3.3322834968566895, "gen_logits_mean": -17.924293518066406, "gen_logits_min": -29.922208786010742, "gen_logits_std": 3.1055004596710205, "gen_loss": 0.27000299096107483, "grad_norm": 0.3854529692345211, "learning_rate": 2.3421473684210527e-05, "loss": 0.3013, "mean_copy_accuracy": 0.994953915476799, "mean_gen_accuracy": 0.8692402243614197, "mean_token_accuracy": 0.8959164768457413, "num_tokens": 527847386.0, "sample_num_tokens": 7805.0, "step": 5944, "total_num_tokens": 527878606.0, "z_loss": 0.0004779142909683287 }, { "copy_logits_max": -3.046529531478882, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.1875, "epoch": 1.2142455961194791, "gen_logits_max": 4.433313369750977, "gen_logits_mean": -15.092509269714355, "gen_logits_min": -27.18043327331543, "gen_logits_std": 3.0583934783935547, "gen_loss": 0.28213077783584595, "grad_norm": 0.3755524886088128, "learning_rate": 2.342021052631579e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9959417283535004, "mean_gen_accuracy": 0.8703403770923615, "mean_token_accuracy": 0.9023648053407669, "num_tokens": 528124792.0, "sample_num_tokens": 8121.0, "step": 5945, "total_num_tokens": 528157276.0, "z_loss": 0.0005677782464772463 }, { "copy_logits_max": -5.488397121429443, "copy_logits_min": -687500032.0, "copy_num_tokens": 335.625, "epoch": 1.214449834056676, "gen_logits_max": 5.0896430015563965, "gen_logits_mean": -15.373476028442383, "gen_logits_min": -27.216064453125, "gen_logits_std": 3.018228530883789, "gen_loss": 0.3328021168708801, "grad_norm": 0.38335219668241005, "learning_rate": 2.3418947368421055e-05, "loss": 0.3019, "mean_copy_accuracy": 0.9967876374721527, "mean_gen_accuracy": 0.8664578050374985, "mean_token_accuracy": 0.897320881485939, "num_tokens": 528391333.0, "sample_num_tokens": 7934.75, "step": 5946, "total_num_tokens": 528423072.0, "z_loss": 0.0005698032910004258 }, { "copy_logits_max": -1.6925380229949951, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.0625, "epoch": 1.214654071993873, "gen_logits_max": 5.1123809814453125, "gen_logits_mean": -14.765960693359375, "gen_logits_min": -27.060699462890625, "gen_logits_std": 3.032763957977295, "gen_loss": 0.27866360545158386, "grad_norm": 0.3622354312342204, "learning_rate": 2.3417684210526316e-05, "loss": 0.2778, "mean_copy_accuracy": 0.997383177280426, "mean_gen_accuracy": 0.8758710473775864, "mean_token_accuracy": 0.9049130976200104, "num_tokens": 528663049.0, "sample_num_tokens": 8234.75, "step": 5947, "total_num_tokens": 528695988.0, "z_loss": 0.0005073497304692864 }, { "copy_logits_max": -3.24414324760437, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.625, "epoch": 1.2148583099310697, "gen_logits_max": 4.583757400512695, "gen_logits_mean": -15.038629531860352, "gen_logits_min": -26.936302185058594, "gen_logits_std": 3.0525476932525635, "gen_loss": 0.2830248177051544, "grad_norm": 0.3692418122991822, "learning_rate": 2.341642105263158e-05, "loss": 0.288, "mean_copy_accuracy": 0.9952280819416046, "mean_gen_accuracy": 0.8731378316879272, "mean_token_accuracy": 0.9012385308742523, "num_tokens": 528911711.0, "sample_num_tokens": 7363.75, "step": 5948, "total_num_tokens": 528941166.0, "z_loss": 0.0005163815803825855 }, { "copy_logits_max": -3.374469518661499, "copy_logits_min": -750000000.0, "copy_num_tokens": 717.875, "epoch": 1.2150625478682666, "gen_logits_max": 2.112217426300049, "gen_logits_mean": -17.738311767578125, "gen_logits_min": -30.016014099121094, "gen_logits_std": 3.1294829845428467, "gen_loss": 0.22083373367786407, "grad_norm": 0.4225392095771988, "learning_rate": 2.341515789473684e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9966380745172501, "mean_gen_accuracy": 0.8776343464851379, "mean_token_accuracy": 0.9101869761943817, "num_tokens": 529183093.0, "sample_num_tokens": 10285.25, "step": 5949, "total_num_tokens": 529224234.0, "z_loss": 0.00042546680197119713 }, { "copy_logits_max": -5.669583797454834, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.875, "epoch": 1.2152667858054633, "gen_logits_max": 3.969447374343872, "gen_logits_mean": -16.93842315673828, "gen_logits_min": -28.86280632019043, "gen_logits_std": 3.0595555305480957, "gen_loss": 0.26215970516204834, "grad_norm": 0.3592111149612394, "learning_rate": 2.3413894736842106e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9966579675674438, "mean_gen_accuracy": 0.8747091144323349, "mean_token_accuracy": 0.9025380313396454, "num_tokens": 529464659.0, "sample_num_tokens": 8932.25, "step": 5950, "total_num_tokens": 529500388.0, "z_loss": 0.0004455862217582762 }, { "copy_logits_max": -2.62074613571167, "copy_logits_min": -750000128.0, "copy_num_tokens": 474.5, "epoch": 1.2154710237426603, "gen_logits_max": 4.066442966461182, "gen_logits_mean": -15.791749954223633, "gen_logits_min": -27.79184913635254, "gen_logits_std": 3.0452795028686523, "gen_loss": 0.30166512727737427, "grad_norm": 0.3632872300581327, "learning_rate": 2.3412631578947367e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9973573237657547, "mean_gen_accuracy": 0.873183086514473, "mean_token_accuracy": 0.9069925397634506, "num_tokens": 529740479.0, "sample_num_tokens": 8843.75, "step": 5951, "total_num_tokens": 529775854.0, "z_loss": 0.0005630465457215905 }, { "copy_logits_max": -2.1958179473876953, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.4375, "epoch": 1.215675261679857, "gen_logits_max": 3.814579486846924, "gen_logits_mean": -15.923696517944336, "gen_logits_min": -28.162260055541992, "gen_logits_std": 3.0867984294891357, "gen_loss": 0.25090253353118896, "grad_norm": 0.38553451897179764, "learning_rate": 2.341136842105263e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9968386441469193, "mean_gen_accuracy": 0.8737154603004456, "mean_token_accuracy": 0.9059071242809296, "num_tokens": 530023086.0, "sample_num_tokens": 9124.0, "step": 5952, "total_num_tokens": 530059582.0, "z_loss": 0.00046541955089196563 }, { "copy_logits_max": -0.33849281072616577, "copy_logits_min": -750000128.0, "copy_num_tokens": 577.0625, "epoch": 1.2158794996170539, "gen_logits_max": 3.9754300117492676, "gen_logits_mean": -14.787015914916992, "gen_logits_min": -27.209247589111328, "gen_logits_std": 3.0436840057373047, "gen_loss": 0.2483963966369629, "grad_norm": 0.38452500410648854, "learning_rate": 2.3410105263157895e-05, "loss": 0.279, "mean_copy_accuracy": 0.9967681914567947, "mean_gen_accuracy": 0.8760885149240494, "mean_token_accuracy": 0.9063909351825714, "num_tokens": 530310111.0, "sample_num_tokens": 8827.75, "step": 5953, "total_num_tokens": 530345422.0, "z_loss": 0.0004894410958513618 }, { "copy_logits_max": -2.7242679595947266, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.4375, "epoch": 1.2160837375542508, "gen_logits_max": 4.232662200927734, "gen_logits_mean": -15.279077529907227, "gen_logits_min": -27.865461349487305, "gen_logits_std": 3.0192270278930664, "gen_loss": 0.2933334708213806, "grad_norm": 0.38186650782615544, "learning_rate": 2.340884210526316e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9962281733751297, "mean_gen_accuracy": 0.8686817735433578, "mean_token_accuracy": 0.898446574807167, "num_tokens": 530593837.0, "sample_num_tokens": 7677.25, "step": 5954, "total_num_tokens": 530624546.0, "z_loss": 0.0005397844361141324 }, { "copy_logits_max": -3.8280632495880127, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.5625, "epoch": 1.2162879754914475, "gen_logits_max": 4.059105396270752, "gen_logits_mean": -16.962331771850586, "gen_logits_min": -29.190946578979492, "gen_logits_std": 3.0738086700439453, "gen_loss": 0.29633814096450806, "grad_norm": 0.41208041586714933, "learning_rate": 2.3407578947368424e-05, "loss": 0.2851, "mean_copy_accuracy": 0.995983898639679, "mean_gen_accuracy": 0.8771275877952576, "mean_token_accuracy": 0.9051198810338974, "num_tokens": 530867969.0, "sample_num_tokens": 8433.25, "step": 5955, "total_num_tokens": 530901702.0, "z_loss": 0.0005942832212895155 }, { "copy_logits_max": -1.2746853828430176, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.625, "epoch": 1.2164922134286444, "gen_logits_max": 3.9415812492370605, "gen_logits_mean": -17.001094818115234, "gen_logits_min": -28.88182830810547, "gen_logits_std": 3.067012071609497, "gen_loss": 0.2805001735687256, "grad_norm": 0.39669928559464745, "learning_rate": 2.3406315789473685e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9964039176702499, "mean_gen_accuracy": 0.8769360780715942, "mean_token_accuracy": 0.9060434550046921, "num_tokens": 531140245.0, "sample_num_tokens": 8169.75, "step": 5956, "total_num_tokens": 531172924.0, "z_loss": 0.0005409686709754169 }, { "copy_logits_max": -4.520979404449463, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.9375, "epoch": 1.2166964513658411, "gen_logits_max": 4.190582275390625, "gen_logits_mean": -16.507360458374023, "gen_logits_min": -28.673004150390625, "gen_logits_std": 3.0688469409942627, "gen_loss": 0.24521559476852417, "grad_norm": 0.3771032931720077, "learning_rate": 2.340505263157895e-05, "loss": 0.2593, "mean_copy_accuracy": 0.996644526720047, "mean_gen_accuracy": 0.8847790509462357, "mean_token_accuracy": 0.909437358379364, "num_tokens": 531410214.0, "sample_num_tokens": 7598.0, "step": 5957, "total_num_tokens": 531440606.0, "z_loss": 0.000497535103932023 }, { "copy_logits_max": 0.28715360164642334, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.75, "epoch": 1.216900689303038, "gen_logits_max": 4.1288018226623535, "gen_logits_mean": -15.318743705749512, "gen_logits_min": -27.415245056152344, "gen_logits_std": 3.039714813232422, "gen_loss": 0.2667125463485718, "grad_norm": 0.3792392935564252, "learning_rate": 2.340378947368421e-05, "loss": 0.2831, "mean_copy_accuracy": 0.997227668762207, "mean_gen_accuracy": 0.8729140311479568, "mean_token_accuracy": 0.9040534049272537, "num_tokens": 531677391.0, "sample_num_tokens": 9554.25, "step": 5958, "total_num_tokens": 531715608.0, "z_loss": 0.0005464187124744058 }, { "copy_logits_max": -0.7667746543884277, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.6875, "epoch": 1.2171049272402348, "gen_logits_max": 4.122910499572754, "gen_logits_mean": -16.963001251220703, "gen_logits_min": -29.424671173095703, "gen_logits_std": 3.0795557498931885, "gen_loss": 0.28494662046432495, "grad_norm": 0.34743494407203707, "learning_rate": 2.3402526315789474e-05, "loss": 0.279, "mean_copy_accuracy": 0.9958668649196625, "mean_gen_accuracy": 0.8795673996210098, "mean_token_accuracy": 0.9052329063415527, "num_tokens": 531956322.0, "sample_num_tokens": 8525.0, "step": 5959, "total_num_tokens": 531990422.0, "z_loss": 0.0005387876299209893 }, { "copy_logits_max": 3.0182366371154785, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.125, "epoch": 1.2173091651774317, "gen_logits_max": 5.638696193695068, "gen_logits_mean": -13.92776107788086, "gen_logits_min": -27.02431297302246, "gen_logits_std": 3.034959554672241, "gen_loss": 0.31267470121383667, "grad_norm": 0.40947104685248337, "learning_rate": 2.3401263157894735e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9972349256277084, "mean_gen_accuracy": 0.86629319190979, "mean_token_accuracy": 0.9019366800785065, "num_tokens": 532240871.0, "sample_num_tokens": 8113.75, "step": 5960, "total_num_tokens": 532273326.0, "z_loss": 0.0006001095753163099 }, { "copy_logits_max": -0.5538198947906494, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0, "epoch": 1.2175134031146286, "gen_logits_max": 3.687483787536621, "gen_logits_mean": -16.285348892211914, "gen_logits_min": -28.31482696533203, "gen_logits_std": 3.0593652725219727, "gen_loss": 0.2802829444408417, "grad_norm": 0.39373132772272734, "learning_rate": 2.3400000000000003e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9964925050735474, "mean_gen_accuracy": 0.8794049471616745, "mean_token_accuracy": 0.9090377986431122, "num_tokens": 532539887.0, "sample_num_tokens": 9228.25, "step": 5961, "total_num_tokens": 532576800.0, "z_loss": 0.0004971413873136044 }, { "copy_logits_max": 5.666783332824707, "copy_logits_min": -687500032.0, "copy_num_tokens": 679.4375, "epoch": 1.2177176410518253, "gen_logits_max": 5.042046546936035, "gen_logits_mean": -13.699607849121094, "gen_logits_min": -26.23986053466797, "gen_logits_std": 3.04606556892395, "gen_loss": 0.2717534303665161, "grad_norm": 0.33471649444862, "learning_rate": 2.3398736842105264e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9973690807819366, "mean_gen_accuracy": 0.877068817615509, "mean_token_accuracy": 0.9128726720809937, "num_tokens": 532837238.0, "sample_num_tokens": 9595.5, "step": 5962, "total_num_tokens": 532875620.0, "z_loss": 0.0004853594582527876 }, { "copy_logits_max": -2.1766886711120605, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.75, "epoch": 1.2179218789890223, "gen_logits_max": 3.2708938121795654, "gen_logits_mean": -17.318674087524414, "gen_logits_min": -29.78326416015625, "gen_logits_std": 3.105299234390259, "gen_loss": 0.26042667031288147, "grad_norm": 0.3527042001548083, "learning_rate": 2.3397473684210528e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9967999160289764, "mean_gen_accuracy": 0.8731125742197037, "mean_token_accuracy": 0.9051330983638763, "num_tokens": 533135052.0, "sample_num_tokens": 9831.0, "step": 5963, "total_num_tokens": 533174376.0, "z_loss": 0.0004408263193909079 }, { "copy_logits_max": -2.7091410160064697, "copy_logits_min": -687500032.0, "copy_num_tokens": 329.1875, "epoch": 1.218126116926219, "gen_logits_max": 5.13163948059082, "gen_logits_mean": -15.624940872192383, "gen_logits_min": -27.670198440551758, "gen_logits_std": 3.0418717861175537, "gen_loss": 0.2890288233757019, "grad_norm": 0.3679655786695002, "learning_rate": 2.339621052631579e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9973151832818985, "mean_gen_accuracy": 0.8789459466934204, "mean_token_accuracy": 0.9103121012449265, "num_tokens": 533423770.0, "sample_num_tokens": 7823.5, "step": 5964, "total_num_tokens": 533455064.0, "z_loss": 0.0005203483160585165 }, { "copy_logits_max": 2.264484405517578, "copy_logits_min": -687500032.0, "copy_num_tokens": 616.75, "epoch": 1.218330354863416, "gen_logits_max": 4.862955570220947, "gen_logits_mean": -13.528809547424316, "gen_logits_min": -26.716938018798828, "gen_logits_std": 3.048473596572876, "gen_loss": 0.27226561307907104, "grad_norm": 0.35491029852519856, "learning_rate": 2.3394947368421053e-05, "loss": 0.262, "mean_copy_accuracy": 0.9970559179782867, "mean_gen_accuracy": 0.8793076872825623, "mean_token_accuracy": 0.9121941328048706, "num_tokens": 533712289.0, "sample_num_tokens": 8943.25, "step": 5965, "total_num_tokens": 533748062.0, "z_loss": 0.0005089543992653489 }, { "copy_logits_max": 0.8056365251541138, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0625, "epoch": 1.2185345928006126, "gen_logits_max": 4.899483680725098, "gen_logits_mean": -14.699967384338379, "gen_logits_min": -26.815265655517578, "gen_logits_std": 3.0419301986694336, "gen_loss": 0.302619069814682, "grad_norm": 0.36508355760483957, "learning_rate": 2.3393684210526314e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9960390627384186, "mean_gen_accuracy": 0.877797394990921, "mean_token_accuracy": 0.9051266312599182, "num_tokens": 533978088.0, "sample_num_tokens": 7408.0, "step": 5966, "total_num_tokens": 534007720.0, "z_loss": 0.0005246161017566919 }, { "copy_logits_max": -3.63437557220459, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.6875, "epoch": 1.2187388307378095, "gen_logits_max": 4.714320182800293, "gen_logits_mean": -15.975076675415039, "gen_logits_min": -28.506845474243164, "gen_logits_std": 3.0748348236083984, "gen_loss": 0.28270503878593445, "grad_norm": 0.6099576739234803, "learning_rate": 2.339242105263158e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9967341423034668, "mean_gen_accuracy": 0.8766730427742004, "mean_token_accuracy": 0.9084389507770538, "num_tokens": 534278931.0, "sample_num_tokens": 8504.75, "step": 5967, "total_num_tokens": 534312950.0, "z_loss": 0.00046080260653980076 }, { "copy_logits_max": -3.0629513263702393, "copy_logits_min": -687500032.0, "copy_num_tokens": 407.75, "epoch": 1.2189430686750065, "gen_logits_max": 4.484958648681641, "gen_logits_mean": -15.964012145996094, "gen_logits_min": -28.10079574584961, "gen_logits_std": 3.0454154014587402, "gen_loss": 0.27149832248687744, "grad_norm": 0.35269072997418094, "learning_rate": 2.3391157894736843e-05, "loss": 0.273, "mean_copy_accuracy": 0.9969872981309891, "mean_gen_accuracy": 0.8759273290634155, "mean_token_accuracy": 0.9071071445941925, "num_tokens": 534577104.0, "sample_num_tokens": 8294.0, "step": 5968, "total_num_tokens": 534610280.0, "z_loss": 0.0004865983210038394 }, { "copy_logits_max": 0.14035093784332275, "copy_logits_min": -687500032.0, "copy_num_tokens": 278.875, "epoch": 1.2191473066122032, "gen_logits_max": 5.0626726150512695, "gen_logits_mean": -14.789992332458496, "gen_logits_min": -27.700626373291016, "gen_logits_std": 3.0511441230773926, "gen_loss": 0.2772289514541626, "grad_norm": 0.37036267507202747, "learning_rate": 2.3389894736842107e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9958467185497284, "mean_gen_accuracy": 0.8836089372634888, "mean_token_accuracy": 0.9078581482172012, "num_tokens": 534821323.0, "sample_num_tokens": 7188.75, "step": 5969, "total_num_tokens": 534850078.0, "z_loss": 0.0004585105343721807 }, { "copy_logits_max": -4.595539093017578, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.1875, "epoch": 1.2193515445494, "gen_logits_max": 4.474675178527832, "gen_logits_mean": -15.791328430175781, "gen_logits_min": -27.90194320678711, "gen_logits_std": 3.0657246112823486, "gen_loss": 0.2803046703338623, "grad_norm": 0.36812759703013287, "learning_rate": 2.338863157894737e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9962578266859055, "mean_gen_accuracy": 0.8783504217863083, "mean_token_accuracy": 0.9067598730325699, "num_tokens": 535089928.0, "sample_num_tokens": 7382.5, "step": 5970, "total_num_tokens": 535119458.0, "z_loss": 0.00046088287490420043 }, { "copy_logits_max": -1.2191848754882812, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.625, "epoch": 1.2195557824865968, "gen_logits_max": 4.038496971130371, "gen_logits_mean": -16.91162109375, "gen_logits_min": -29.097396850585938, "gen_logits_std": 3.079554796218872, "gen_loss": 0.3071456253528595, "grad_norm": 0.41112298261184577, "learning_rate": 2.3387368421052633e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9971705079078674, "mean_gen_accuracy": 0.8788293302059174, "mean_token_accuracy": 0.9048654735088348, "num_tokens": 535339906.0, "sample_num_tokens": 8956.0, "step": 5971, "total_num_tokens": 535375730.0, "z_loss": 0.0005943287396803498 }, { "copy_logits_max": -4.101675510406494, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.1875, "epoch": 1.2197600204237937, "gen_logits_max": 4.002228736877441, "gen_logits_mean": -16.696685791015625, "gen_logits_min": -28.505142211914062, "gen_logits_std": 3.0612425804138184, "gen_loss": 0.2804819941520691, "grad_norm": 0.3775405831252556, "learning_rate": 2.3386105263157897e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9966096580028534, "mean_gen_accuracy": 0.8809877932071686, "mean_token_accuracy": 0.9086033999919891, "num_tokens": 535623826.0, "sample_num_tokens": 8385.5, "step": 5972, "total_num_tokens": 535657368.0, "z_loss": 0.0005050701438449323 }, { "copy_logits_max": 0.5297361612319946, "copy_logits_min": -687500032.0, "copy_num_tokens": 619.3125, "epoch": 1.2199642583609906, "gen_logits_max": 4.671759605407715, "gen_logits_mean": -13.042959213256836, "gen_logits_min": -26.007158279418945, "gen_logits_std": 3.029405355453491, "gen_loss": 0.2514208257198334, "grad_norm": 0.3781810884645313, "learning_rate": 2.3384842105263158e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9963282346725464, "mean_gen_accuracy": 0.870834544301033, "mean_token_accuracy": 0.9020272791385651, "num_tokens": 535884210.0, "sample_num_tokens": 8362.5, "step": 5973, "total_num_tokens": 535917660.0, "z_loss": 0.00046952112461440265 }, { "copy_logits_max": -1.8980798721313477, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.0, "epoch": 1.2201684962981874, "gen_logits_max": 3.3606150150299072, "gen_logits_mean": -16.283384323120117, "gen_logits_min": -28.565580368041992, "gen_logits_std": 3.0950522422790527, "gen_loss": 0.2809499204158783, "grad_norm": 0.36534435351485495, "learning_rate": 2.3383578947368422e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9963078647851944, "mean_gen_accuracy": 0.8792869001626968, "mean_token_accuracy": 0.9093852490186691, "num_tokens": 536147374.0, "sample_num_tokens": 8335.5, "step": 5974, "total_num_tokens": 536180716.0, "z_loss": 0.000507194665260613 }, { "copy_logits_max": -1.144370198249817, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.0, "epoch": 1.2203727342353843, "gen_logits_max": 4.252211570739746, "gen_logits_mean": -16.006946563720703, "gen_logits_min": -27.802640914916992, "gen_logits_std": 3.059648275375366, "gen_loss": 0.27551427483558655, "grad_norm": 0.40725005948981413, "learning_rate": 2.3382315789473683e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9960954040288925, "mean_gen_accuracy": 0.8808838278055191, "mean_token_accuracy": 0.9085600525140762, "num_tokens": 536411094.0, "sample_num_tokens": 8577.5, "step": 5975, "total_num_tokens": 536445404.0, "z_loss": 0.0005090793129056692 }, { "copy_logits_max": -0.9565008878707886, "copy_logits_min": -625000064.0, "copy_num_tokens": 553.0625, "epoch": 1.220576972172581, "gen_logits_max": 4.9935407638549805, "gen_logits_mean": -14.051351547241211, "gen_logits_min": -26.37926483154297, "gen_logits_std": 3.0167908668518066, "gen_loss": 0.296825647354126, "grad_norm": 0.4085969150676841, "learning_rate": 2.3381052631578947e-05, "loss": 0.3089, "mean_copy_accuracy": 0.99643374979496, "mean_gen_accuracy": 0.8668210208415985, "mean_token_accuracy": 0.8961376994848251, "num_tokens": 536691709.0, "sample_num_tokens": 9228.25, "step": 5976, "total_num_tokens": 536728622.0, "z_loss": 0.0006039638537913561 }, { "copy_logits_max": -1.5239713191986084, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.6875, "epoch": 1.220781210109778, "gen_logits_max": 4.279376983642578, "gen_logits_mean": -14.713970184326172, "gen_logits_min": -27.61670684814453, "gen_logits_std": 3.0591163635253906, "gen_loss": 0.2635686993598938, "grad_norm": 0.4231910488028734, "learning_rate": 2.337978947368421e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9964973032474518, "mean_gen_accuracy": 0.8692331612110138, "mean_token_accuracy": 0.9009673148393631, "num_tokens": 536961662.0, "sample_num_tokens": 8304.5, "step": 5977, "total_num_tokens": 536994880.0, "z_loss": 0.0005592925008386374 }, { "copy_logits_max": -1.1488568782806396, "copy_logits_min": -750000064.0, "copy_num_tokens": 651.9375, "epoch": 1.2209854480469748, "gen_logits_max": 3.74360990524292, "gen_logits_mean": -14.860895156860352, "gen_logits_min": -27.597732543945312, "gen_logits_std": 3.0666422843933105, "gen_loss": 0.23196843266487122, "grad_norm": 0.3759618470705339, "learning_rate": 2.3378526315789476e-05, "loss": 0.2544, "mean_copy_accuracy": 0.9960904866456985, "mean_gen_accuracy": 0.8850547671318054, "mean_token_accuracy": 0.913161888718605, "num_tokens": 537230217.0, "sample_num_tokens": 8815.25, "step": 5978, "total_num_tokens": 537265478.0, "z_loss": 0.00045025680446997285 }, { "copy_logits_max": -2.35109281539917, "copy_logits_min": -687500032.0, "copy_num_tokens": 489.625, "epoch": 1.2211896859841715, "gen_logits_max": 4.018598556518555, "gen_logits_mean": -15.909194946289062, "gen_logits_min": -28.945415496826172, "gen_logits_std": 3.0827505588531494, "gen_loss": 0.23078399896621704, "grad_norm": 0.363090943906939, "learning_rate": 2.3377263157894737e-05, "loss": 0.2602, "mean_copy_accuracy": 0.9959744513034821, "mean_gen_accuracy": 0.8834962099790573, "mean_token_accuracy": 0.9114523828029633, "num_tokens": 537522423.0, "sample_num_tokens": 8910.75, "step": 5979, "total_num_tokens": 537558066.0, "z_loss": 0.00044992624316364527 }, { "copy_logits_max": -3.5411291122436523, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.375, "epoch": 1.2213939239213685, "gen_logits_max": 4.5047807693481445, "gen_logits_mean": -15.07020378112793, "gen_logits_min": -28.092668533325195, "gen_logits_std": 3.0725040435791016, "gen_loss": 0.2713698744773865, "grad_norm": 0.36976176336282046, "learning_rate": 2.3376e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9957629144191742, "mean_gen_accuracy": 0.8726742565631866, "mean_token_accuracy": 0.9036329835653305, "num_tokens": 537814551.0, "sample_num_tokens": 8138.75, "step": 5980, "total_num_tokens": 537847106.0, "z_loss": 0.0004990349407307804 }, { "copy_logits_max": -4.062045574188232, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.125, "epoch": 1.2215981618585652, "gen_logits_max": 3.9935407638549805, "gen_logits_mean": -16.75648307800293, "gen_logits_min": -28.99777603149414, "gen_logits_std": 3.0739831924438477, "gen_loss": 0.2770649790763855, "grad_norm": 0.3978928169091932, "learning_rate": 2.3374736842105265e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9968069642782211, "mean_gen_accuracy": 0.8750084638595581, "mean_token_accuracy": 0.9025006890296936, "num_tokens": 538069985.0, "sample_num_tokens": 9224.75, "step": 5981, "total_num_tokens": 538106884.0, "z_loss": 0.0005554794333875179 }, { "copy_logits_max": -2.2925102710723877, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.0, "epoch": 1.221802399795762, "gen_logits_max": 4.548314094543457, "gen_logits_mean": -15.089681625366211, "gen_logits_min": -27.972057342529297, "gen_logits_std": 3.0812604427337646, "gen_loss": 0.23176272213459015, "grad_norm": 0.4349449961600069, "learning_rate": 2.3373473684210526e-05, "loss": 0.276, "mean_copy_accuracy": 0.9965183436870575, "mean_gen_accuracy": 0.8785342127084732, "mean_token_accuracy": 0.9057543277740479, "num_tokens": 538324882.0, "sample_num_tokens": 8790.0, "step": 5982, "total_num_tokens": 538360042.0, "z_loss": 0.00042355243931524456 }, { "copy_logits_max": -1.9981484413146973, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.625, "epoch": 1.2220066377329588, "gen_logits_max": 6.097820281982422, "gen_logits_mean": -13.97946548461914, "gen_logits_min": -26.612382888793945, "gen_logits_std": 3.032616138458252, "gen_loss": 0.2933279871940613, "grad_norm": 0.41743474187540625, "learning_rate": 2.337221052631579e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9966162592172623, "mean_gen_accuracy": 0.8699695318937302, "mean_token_accuracy": 0.9056238383054733, "num_tokens": 538603364.0, "sample_num_tokens": 9158.5, "step": 5983, "total_num_tokens": 538639998.0, "z_loss": 0.000517737353220582 }, { "copy_logits_max": -3.686619758605957, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.1875, "epoch": 1.2222108756701557, "gen_logits_max": 5.442770957946777, "gen_logits_mean": -14.751019477844238, "gen_logits_min": -27.43683624267578, "gen_logits_std": 3.0375194549560547, "gen_loss": 0.29366111755371094, "grad_norm": 0.3626973178275059, "learning_rate": 2.337094736842105e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9966252595186234, "mean_gen_accuracy": 0.8739284425973892, "mean_token_accuracy": 0.9016127735376358, "num_tokens": 538882782.0, "sample_num_tokens": 9510.0, "step": 5984, "total_num_tokens": 538920822.0, "z_loss": 0.0005006694700568914 }, { "copy_logits_max": -2.8699839115142822, "copy_logits_min": -750000000.0, "copy_num_tokens": 298.9375, "epoch": 1.2224151136073527, "gen_logits_max": 4.995627403259277, "gen_logits_mean": -15.512422561645508, "gen_logits_min": -27.66537857055664, "gen_logits_std": 3.042056083679199, "gen_loss": 0.3268136978149414, "grad_norm": 0.369531991126444, "learning_rate": 2.3369684210526316e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9955022782087326, "mean_gen_accuracy": 0.8767955005168915, "mean_token_accuracy": 0.9058272838592529, "num_tokens": 539174373.0, "sample_num_tokens": 7146.75, "step": 5985, "total_num_tokens": 539202960.0, "z_loss": 0.0005809116410091519 }, { "copy_logits_max": -3.6303932666778564, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.625, "epoch": 1.2226193515445494, "gen_logits_max": 5.758384704589844, "gen_logits_mean": -14.483642578125, "gen_logits_min": -27.472946166992188, "gen_logits_std": 3.053792953491211, "gen_loss": 0.3027004897594452, "grad_norm": 0.42398664927531216, "learning_rate": 2.336842105263158e-05, "loss": 0.2935, "mean_copy_accuracy": 0.9957370907068253, "mean_gen_accuracy": 0.872650533914566, "mean_token_accuracy": 0.901146188378334, "num_tokens": 539443089.0, "sample_num_tokens": 8810.25, "step": 5986, "total_num_tokens": 539478330.0, "z_loss": 0.0005892722401767969 }, { "copy_logits_max": -2.3271002769470215, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.25, "epoch": 1.2228235894817463, "gen_logits_max": 4.805605888366699, "gen_logits_mean": -15.925954818725586, "gen_logits_min": -28.302425384521484, "gen_logits_std": 3.094068765640259, "gen_loss": 0.26800400018692017, "grad_norm": 0.4203814767241387, "learning_rate": 2.3367157894736845e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9966957420110703, "mean_gen_accuracy": 0.8808156847953796, "mean_token_accuracy": 0.908204048871994, "num_tokens": 539707261.0, "sample_num_tokens": 9049.25, "step": 5987, "total_num_tokens": 539743458.0, "z_loss": 0.0005747542018070817 }, { "copy_logits_max": -1.8013789653778076, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.8125, "epoch": 1.223027827418943, "gen_logits_max": 4.883896827697754, "gen_logits_mean": -14.715435981750488, "gen_logits_min": -28.069900512695312, "gen_logits_std": 3.0551092624664307, "gen_loss": 0.27446484565734863, "grad_norm": 0.3810507732860667, "learning_rate": 2.3365894736842105e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9960073679685593, "mean_gen_accuracy": 0.8809810131788254, "mean_token_accuracy": 0.908133789896965, "num_tokens": 539979396.0, "sample_num_tokens": 9627.5, "step": 5988, "total_num_tokens": 540017906.0, "z_loss": 0.0004986034473404288 }, { "copy_logits_max": -5.272119522094727, "copy_logits_min": -750000064.0, "copy_num_tokens": 487.9375, "epoch": 1.22323206535614, "gen_logits_max": 3.468027114868164, "gen_logits_mean": -17.091150283813477, "gen_logits_min": -29.457412719726562, "gen_logits_std": 3.0954599380493164, "gen_loss": 0.3160553574562073, "grad_norm": 0.3449647825410466, "learning_rate": 2.336463157894737e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9963486939668655, "mean_gen_accuracy": 0.87888965010643, "mean_token_accuracy": 0.9094414263963699, "num_tokens": 540273388.0, "sample_num_tokens": 9294.5, "step": 5989, "total_num_tokens": 540310566.0, "z_loss": 0.0005244479980319738 }, { "copy_logits_max": -1.3342171907424927, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0, "epoch": 1.2234363032933366, "gen_logits_max": 4.337306976318359, "gen_logits_mean": -15.960296630859375, "gen_logits_min": -28.346893310546875, "gen_logits_std": 3.0743319988250732, "gen_loss": 0.2828199863433838, "grad_norm": 0.41041664868529854, "learning_rate": 2.336336842105263e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9956893026828766, "mean_gen_accuracy": 0.8785209953784943, "mean_token_accuracy": 0.9069791287183762, "num_tokens": 540562409.0, "sample_num_tokens": 8436.25, "step": 5990, "total_num_tokens": 540596154.0, "z_loss": 0.000505625328514725 }, { "copy_logits_max": -4.0135087966918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.75, "epoch": 1.2236405412305336, "gen_logits_max": 3.390624523162842, "gen_logits_mean": -18.39472198486328, "gen_logits_min": -30.416933059692383, "gen_logits_std": 3.1296768188476562, "gen_loss": 0.2771364748477936, "grad_norm": 0.4462035776575685, "learning_rate": 2.3362105263157895e-05, "loss": 0.3129, "mean_copy_accuracy": 0.9961899071931839, "mean_gen_accuracy": 0.8665729910135269, "mean_token_accuracy": 0.8939114212989807, "num_tokens": 540827199.0, "sample_num_tokens": 8066.25, "step": 5991, "total_num_tokens": 540859464.0, "z_loss": 0.0004946955014020205 }, { "copy_logits_max": -1.4337588548660278, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.0, "epoch": 1.2238447791677305, "gen_logits_max": 5.02020788192749, "gen_logits_mean": -15.121665000915527, "gen_logits_min": -28.457054138183594, "gen_logits_std": 3.0511856079101562, "gen_loss": 0.36981096863746643, "grad_norm": 0.3795609921696701, "learning_rate": 2.3360842105263156e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9970766603946686, "mean_gen_accuracy": 0.8667851686477661, "mean_token_accuracy": 0.9013914912939072, "num_tokens": 541087188.0, "sample_num_tokens": 7207.5, "step": 5992, "total_num_tokens": 541116018.0, "z_loss": 0.0006752575282007456 }, { "copy_logits_max": -3.2525248527526855, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.5625, "epoch": 1.2240490171049272, "gen_logits_max": 5.341306209564209, "gen_logits_mean": -15.41786003112793, "gen_logits_min": -28.68810272216797, "gen_logits_std": 3.081258773803711, "gen_loss": 0.24790194630622864, "grad_norm": 0.432011159066154, "learning_rate": 2.335957894736842e-05, "loss": 0.294, "mean_copy_accuracy": 0.9951635748147964, "mean_gen_accuracy": 0.868763267993927, "mean_token_accuracy": 0.9009864181280136, "num_tokens": 541348931.0, "sample_num_tokens": 8011.75, "step": 5993, "total_num_tokens": 541380978.0, "z_loss": 0.0005010142922401428 }, { "copy_logits_max": -2.5325655937194824, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.0625, "epoch": 1.2242532550421241, "gen_logits_max": 5.503277778625488, "gen_logits_mean": -14.709172248840332, "gen_logits_min": -28.003755569458008, "gen_logits_std": 3.0360960960388184, "gen_loss": 0.28792959451675415, "grad_norm": 0.4215619974684884, "learning_rate": 2.3358315789473685e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9958683550357819, "mean_gen_accuracy": 0.8754472881555557, "mean_token_accuracy": 0.9053025096654892, "num_tokens": 541603978.0, "sample_num_tokens": 7381.0, "step": 5994, "total_num_tokens": 541633502.0, "z_loss": 0.0005109731573611498 }, { "copy_logits_max": -3.4996156692504883, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.9375, "epoch": 1.2244574929793208, "gen_logits_max": 5.358492851257324, "gen_logits_mean": -15.585235595703125, "gen_logits_min": -27.559154510498047, "gen_logits_std": 3.033348560333252, "gen_loss": 0.31362369656562805, "grad_norm": 0.37441342064673905, "learning_rate": 2.335705263157895e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9969208240509033, "mean_gen_accuracy": 0.869470864534378, "mean_token_accuracy": 0.9023271054029465, "num_tokens": 541886956.0, "sample_num_tokens": 7527.5, "step": 5995, "total_num_tokens": 541917066.0, "z_loss": 0.0005654084379784763 }, { "copy_logits_max": -1.2993570566177368, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.9375, "epoch": 1.2246617309165178, "gen_logits_max": 4.394115447998047, "gen_logits_mean": -15.512228012084961, "gen_logits_min": -27.784210205078125, "gen_logits_std": 3.0686094760894775, "gen_loss": 0.32287734746932983, "grad_norm": 0.3663577650605052, "learning_rate": 2.3355789473684213e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9957337379455566, "mean_gen_accuracy": 0.8756955415010452, "mean_token_accuracy": 0.9029585868120193, "num_tokens": 542151296.0, "sample_num_tokens": 7885.5, "step": 5996, "total_num_tokens": 542182838.0, "z_loss": 0.0005549835041165352 }, { "copy_logits_max": -1.5125538110733032, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.5, "epoch": 1.2248659688537145, "gen_logits_max": 5.250916481018066, "gen_logits_mean": -15.291881561279297, "gen_logits_min": -27.519859313964844, "gen_logits_std": 3.061042308807373, "gen_loss": 0.2677776515483856, "grad_norm": 0.384220717327685, "learning_rate": 2.3354526315789474e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9964209645986557, "mean_gen_accuracy": 0.8729303777217865, "mean_token_accuracy": 0.9032662659883499, "num_tokens": 542415338.0, "sample_num_tokens": 8370.5, "step": 5997, "total_num_tokens": 542448820.0, "z_loss": 0.0004953294992446899 }, { "copy_logits_max": -1.4351601600646973, "copy_logits_min": -750000000.0, "copy_num_tokens": 883.1875, "epoch": 1.2250702067909114, "gen_logits_max": 4.609550952911377, "gen_logits_mean": -14.453962326049805, "gen_logits_min": -28.001876831054688, "gen_logits_std": 3.0805702209472656, "gen_loss": 0.2391470968723297, "grad_norm": 0.4615287239934967, "learning_rate": 2.335326315789474e-05, "loss": 0.2565, "mean_copy_accuracy": 0.9958671182394028, "mean_gen_accuracy": 0.8829302489757538, "mean_token_accuracy": 0.9130971282720566, "num_tokens": 542696483.0, "sample_num_tokens": 11673.25, "step": 5998, "total_num_tokens": 542743176.0, "z_loss": 0.00048317055916413665 }, { "copy_logits_max": -1.704542875289917, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.0, "epoch": 1.2252744447281083, "gen_logits_max": 5.660741806030273, "gen_logits_mean": -15.458257675170898, "gen_logits_min": -27.816251754760742, "gen_logits_std": 3.033797025680542, "gen_loss": 0.3072138726711273, "grad_norm": 0.35111469620599567, "learning_rate": 2.3352e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9970872849225998, "mean_gen_accuracy": 0.8776353150606155, "mean_token_accuracy": 0.9036479890346527, "num_tokens": 542972363.0, "sample_num_tokens": 7720.25, "step": 5999, "total_num_tokens": 543003244.0, "z_loss": 0.0005860908422619104 }, { "epoch": 1.225478682665305, "grad_norm": 0.36513723429706973, "learning_rate": 2.3350736842105264e-05, "loss": 0.2584, "step": 6000 }, { "epoch": 1.225478682665305, "eval_copy_logits_max": -7.696052551269531, "eval_copy_logits_min": -82.64293670654297, "eval_gen_logits_max": 3.2515339851379395, "eval_gen_logits_mean": -20.581865310668945, "eval_gen_logits_min": -31.59259033203125, "eval_gen_logits_std": 3.068466901779175, "eval_gen_loss": 0.32169878482818604, "eval_loss": 0.3072347640991211, "eval_mean_copy_accuracy": 0.9941314458847046, "eval_mean_gen_accuracy": 0.8799310326576233, "eval_mean_token_accuracy": 0.8946388363838196, "eval_num_tokens": 543282140.0, "eval_runtime": 0.6877, "eval_samples_per_second": 11.633, "eval_steps_per_second": 2.908, "eval_total_num_tokens": 543282140.0, "eval_z_loss": 0.0005538344266824424, "step": 6000 }, { "copy_logits_max": -4.8922295570373535, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.1875, "epoch": 1.225682920602502, "gen_logits_max": 5.740653038024902, "gen_logits_mean": -15.595389366149902, "gen_logits_min": -28.472267150878906, "gen_logits_std": 3.056936264038086, "gen_loss": 0.28375673294067383, "grad_norm": 0.3938507667255133, "learning_rate": 2.3349473684210525e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9962208345532417, "mean_gen_accuracy": 0.8792668506503105, "mean_token_accuracy": 0.9100908488035202, "num_tokens": 543528273.0, "sample_num_tokens": 8105.25, "step": 6001, "total_num_tokens": 543560694.0, "z_loss": 0.0005505927256308496 }, { "copy_logits_max": -4.6562042236328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.8125, "epoch": 1.2258871585396989, "gen_logits_max": 5.0493011474609375, "gen_logits_mean": -16.99417495727539, "gen_logits_min": -29.38364028930664, "gen_logits_std": 3.064016580581665, "gen_loss": 0.2850254476070404, "grad_norm": 0.41742080810822746, "learning_rate": 2.3348210526315792e-05, "loss": 0.282, "mean_copy_accuracy": 0.9949400126934052, "mean_gen_accuracy": 0.8773791342973709, "mean_token_accuracy": 0.9045190513134003, "num_tokens": 543782346.0, "sample_num_tokens": 6765.5, "step": 6002, "total_num_tokens": 543809408.0, "z_loss": 0.0005660424940288067 }, { "copy_logits_max": -2.873664379119873, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.0, "epoch": 1.2260913964768956, "gen_logits_max": 5.906895160675049, "gen_logits_mean": -14.784337997436523, "gen_logits_min": -27.622526168823242, "gen_logits_std": 3.0407328605651855, "gen_loss": 0.2795606851577759, "grad_norm": 0.3705408326490824, "learning_rate": 2.3346947368421053e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9963204860687256, "mean_gen_accuracy": 0.879424974322319, "mean_token_accuracy": 0.9044936895370483, "num_tokens": 544050133.0, "sample_num_tokens": 9886.25, "step": 6003, "total_num_tokens": 544089678.0, "z_loss": 0.0005741557688452303 }, { "copy_logits_max": -0.8552037477493286, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.625, "epoch": 1.2262956344140925, "gen_logits_max": 7.566359519958496, "gen_logits_mean": -13.234724998474121, "gen_logits_min": -26.45517349243164, "gen_logits_std": 3.0238890647888184, "gen_loss": 0.2930227518081665, "grad_norm": 0.37560467347970183, "learning_rate": 2.3345684210526318e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9971076846122742, "mean_gen_accuracy": 0.8763962239027023, "mean_token_accuracy": 0.9044604450464249, "num_tokens": 544312753.0, "sample_num_tokens": 8091.75, "step": 6004, "total_num_tokens": 544345120.0, "z_loss": 0.0006416758988052607 }, { "copy_logits_max": -3.853213310241699, "copy_logits_min": -750000064.0, "copy_num_tokens": 327.4375, "epoch": 1.2264998723512892, "gen_logits_max": 4.555783271789551, "gen_logits_mean": -16.860633850097656, "gen_logits_min": -29.07140350341797, "gen_logits_std": 3.052015781402588, "gen_loss": 0.3101917803287506, "grad_norm": 0.3907385794729183, "learning_rate": 2.334442105263158e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9962857514619827, "mean_gen_accuracy": 0.8682495653629303, "mean_token_accuracy": 0.9036110490560532, "num_tokens": 544575702.0, "sample_num_tokens": 7233.0, "step": 6005, "total_num_tokens": 544604634.0, "z_loss": 0.0006061462918296456 }, { "copy_logits_max": -0.9940756559371948, "copy_logits_min": -687500032.0, "copy_num_tokens": 794.75, "epoch": 1.2267041102884861, "gen_logits_max": 4.3687639236450195, "gen_logits_mean": -14.771819114685059, "gen_logits_min": -28.22526741027832, "gen_logits_std": 3.068039894104004, "gen_loss": 0.24096283316612244, "grad_norm": 0.3952352530173569, "learning_rate": 2.3343157894736843e-05, "loss": 0.2616, "mean_copy_accuracy": 0.996627077460289, "mean_gen_accuracy": 0.8751329779624939, "mean_token_accuracy": 0.9114865958690643, "num_tokens": 544862141.0, "sample_num_tokens": 10569.75, "step": 6006, "total_num_tokens": 544904420.0, "z_loss": 0.0004923942033201456 }, { "copy_logits_max": -1.3484230041503906, "copy_logits_min": -625000000.0, "copy_num_tokens": 778.125, "epoch": 1.2269083482256828, "gen_logits_max": 4.522794723510742, "gen_logits_mean": -15.791400909423828, "gen_logits_min": -29.02835464477539, "gen_logits_std": 3.0982561111450195, "gen_loss": 0.23209738731384277, "grad_norm": 0.36965681628513364, "learning_rate": 2.3341894736842104e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9962844997644424, "mean_gen_accuracy": 0.8814473897218704, "mean_token_accuracy": 0.9090257734060287, "num_tokens": 545148058.0, "sample_num_tokens": 10652.5, "step": 6007, "total_num_tokens": 545190668.0, "z_loss": 0.00042239195317961276 }, { "copy_logits_max": -3.0048184394836426, "copy_logits_min": -750000000.0, "copy_num_tokens": 256.75, "epoch": 1.2271125861628798, "gen_logits_max": 5.766094207763672, "gen_logits_mean": -15.447282791137695, "gen_logits_min": -28.078157424926758, "gen_logits_std": 3.05350661277771, "gen_loss": 0.27909713983535767, "grad_norm": 0.3835395518074681, "learning_rate": 2.3340631578947368e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9955486506223679, "mean_gen_accuracy": 0.8811819404363632, "mean_token_accuracy": 0.9039130955934525, "num_tokens": 545410510.0, "sample_num_tokens": 7121.5, "step": 6008, "total_num_tokens": 545438996.0, "z_loss": 0.00048284183139912784 }, { "copy_logits_max": -3.468250036239624, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.5625, "epoch": 1.2273168241000767, "gen_logits_max": 4.770267009735107, "gen_logits_mean": -16.259052276611328, "gen_logits_min": -28.575031280517578, "gen_logits_std": 3.0920212268829346, "gen_loss": 0.2660563588142395, "grad_norm": 0.3867054505453717, "learning_rate": 2.3339368421052632e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9966584295034409, "mean_gen_accuracy": 0.8754885494709015, "mean_token_accuracy": 0.9075109362602234, "num_tokens": 545681935.0, "sample_num_tokens": 7346.25, "step": 6009, "total_num_tokens": 545711320.0, "z_loss": 0.0004308882635086775 }, { "copy_logits_max": -3.954585552215576, "copy_logits_min": -625000064.0, "copy_num_tokens": 725.9375, "epoch": 1.2275210620372734, "gen_logits_max": 5.475754737854004, "gen_logits_mean": -14.84225845336914, "gen_logits_min": -26.866912841796875, "gen_logits_std": 3.0414364337921143, "gen_loss": 0.2632567286491394, "grad_norm": 0.42149414422076503, "learning_rate": 2.3338105263157897e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9973619878292084, "mean_gen_accuracy": 0.8675229102373123, "mean_token_accuracy": 0.9039535224437714, "num_tokens": 545967182.0, "sample_num_tokens": 9610.5, "step": 6010, "total_num_tokens": 546005624.0, "z_loss": 0.0004872330173384398 }, { "copy_logits_max": -3.0436697006225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.6875, "epoch": 1.2277252999744703, "gen_logits_max": 3.699798583984375, "gen_logits_mean": -16.849609375, "gen_logits_min": -29.171524047851562, "gen_logits_std": 3.081670045852661, "gen_loss": 0.31182152032852173, "grad_norm": 0.3565722791382748, "learning_rate": 2.333684210526316e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9960315227508545, "mean_gen_accuracy": 0.8696626722812653, "mean_token_accuracy": 0.9029953628778458, "num_tokens": 546250465.0, "sample_num_tokens": 7342.75, "step": 6011, "total_num_tokens": 546279836.0, "z_loss": 0.0005179613945074379 }, { "copy_logits_max": 0.25308141112327576, "copy_logits_min": -750000000.0, "copy_num_tokens": 740.375, "epoch": 1.227929537911667, "gen_logits_max": 4.757465362548828, "gen_logits_mean": -15.305121421813965, "gen_logits_min": -27.686683654785156, "gen_logits_std": 3.080756664276123, "gen_loss": 0.22788304090499878, "grad_norm": 0.38044933826436567, "learning_rate": 2.3335578947368422e-05, "loss": 0.251, "mean_copy_accuracy": 0.9966101348400116, "mean_gen_accuracy": 0.8841720074415207, "mean_token_accuracy": 0.9144705086946487, "num_tokens": 546537584.0, "sample_num_tokens": 11077.5, "step": 6012, "total_num_tokens": 546581894.0, "z_loss": 0.0004095432232134044 }, { "copy_logits_max": -0.9150810837745667, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.0, "epoch": 1.228133775848864, "gen_logits_max": 4.260216236114502, "gen_logits_mean": -15.198575973510742, "gen_logits_min": -27.590667724609375, "gen_logits_std": 3.059969902038574, "gen_loss": 0.27010825276374817, "grad_norm": 0.33870142767113837, "learning_rate": 2.3334315789473686e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9966093152761459, "mean_gen_accuracy": 0.8753571808338165, "mean_token_accuracy": 0.9074266701936722, "num_tokens": 546849222.0, "sample_num_tokens": 8958.0, "step": 6013, "total_num_tokens": 546885054.0, "z_loss": 0.000493425817694515 }, { "copy_logits_max": -4.212098121643066, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.0, "epoch": 1.2283380137860607, "gen_logits_max": 4.181548595428467, "gen_logits_mean": -17.379518508911133, "gen_logits_min": -29.244277954101562, "gen_logits_std": 3.0667712688446045, "gen_loss": 0.2706378102302551, "grad_norm": 0.3637179319112146, "learning_rate": 2.3333052631578947e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9959552586078644, "mean_gen_accuracy": 0.8754065483808517, "mean_token_accuracy": 0.9009090960025787, "num_tokens": 547105990.0, "sample_num_tokens": 6747.0, "step": 6014, "total_num_tokens": 547132978.0, "z_loss": 0.0004817588487640023 }, { "copy_logits_max": -1.0276685953140259, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.9375, "epoch": 1.2285422517232576, "gen_logits_max": 4.58194637298584, "gen_logits_mean": -15.118654251098633, "gen_logits_min": -27.473875045776367, "gen_logits_std": 3.040915012359619, "gen_loss": 0.30388858914375305, "grad_norm": 0.3765664921569115, "learning_rate": 2.333178947368421e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9963682293891907, "mean_gen_accuracy": 0.8716873526573181, "mean_token_accuracy": 0.9014010429382324, "num_tokens": 547353638.0, "sample_num_tokens": 8061.0, "step": 6015, "total_num_tokens": 547385882.0, "z_loss": 0.0005301277851685882 }, { "copy_logits_max": -0.9093195199966431, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.3125, "epoch": 1.2287464896604545, "gen_logits_max": 4.546041488647461, "gen_logits_mean": -15.586777687072754, "gen_logits_min": -27.93355369567871, "gen_logits_std": 3.0688962936401367, "gen_loss": 0.24471522867679596, "grad_norm": 0.39654560703954883, "learning_rate": 2.3330526315789472e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9962384700775146, "mean_gen_accuracy": 0.8802183419466019, "mean_token_accuracy": 0.9064896106719971, "num_tokens": 547615962.0, "sample_num_tokens": 9208.5, "step": 6016, "total_num_tokens": 547652796.0, "z_loss": 0.0004450384003575891 }, { "copy_logits_max": -2.7670044898986816, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.9375, "epoch": 1.2289507275976512, "gen_logits_max": 5.759860992431641, "gen_logits_mean": -14.939525604248047, "gen_logits_min": -26.851856231689453, "gen_logits_std": 3.0321292877197266, "gen_loss": 0.28222036361694336, "grad_norm": 0.36772800273262674, "learning_rate": 2.3329263157894737e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9967112094163895, "mean_gen_accuracy": 0.8815790414810181, "mean_token_accuracy": 0.9072691947221756, "num_tokens": 547893971.0, "sample_num_tokens": 8380.75, "step": 6017, "total_num_tokens": 547927494.0, "z_loss": 0.0005273887072689831 }, { "copy_logits_max": -0.13581353425979614, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.6875, "epoch": 1.2291549655348482, "gen_logits_max": 4.82774543762207, "gen_logits_mean": -14.41415023803711, "gen_logits_min": -26.769920349121094, "gen_logits_std": 3.0167531967163086, "gen_loss": 0.26961982250213623, "grad_norm": 0.37577671216988173, "learning_rate": 2.3328e-05, "loss": 0.2979, "mean_copy_accuracy": 0.996871754527092, "mean_gen_accuracy": 0.8745783269405365, "mean_token_accuracy": 0.9006572216749191, "num_tokens": 548147690.0, "sample_num_tokens": 8253.0, "step": 6018, "total_num_tokens": 548180702.0, "z_loss": 0.00048582430463284254 }, { "copy_logits_max": -0.22641456127166748, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.625, "epoch": 1.2293592034720449, "gen_logits_max": 6.1873979568481445, "gen_logits_mean": -12.953653335571289, "gen_logits_min": -24.97369384765625, "gen_logits_std": 2.935847759246826, "gen_loss": 0.3110617995262146, "grad_norm": 0.41626768520135227, "learning_rate": 2.3326736842105265e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9960993528366089, "mean_gen_accuracy": 0.8698357194662094, "mean_token_accuracy": 0.8955481201410294, "num_tokens": 548413067.0, "sample_num_tokens": 8191.25, "step": 6019, "total_num_tokens": 548445832.0, "z_loss": 0.000550043594557792 }, { "copy_logits_max": -2.709256649017334, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.6875, "epoch": 1.2295634414092418, "gen_logits_max": 4.707307815551758, "gen_logits_mean": -15.578627586364746, "gen_logits_min": -27.486064910888672, "gen_logits_std": 3.041658878326416, "gen_loss": 0.2656640112400055, "grad_norm": 0.7479940017377313, "learning_rate": 2.3325473684210526e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9961210340261459, "mean_gen_accuracy": 0.8781532496213913, "mean_token_accuracy": 0.9043526649475098, "num_tokens": 548675330.0, "sample_num_tokens": 7339.0, "step": 6020, "total_num_tokens": 548704686.0, "z_loss": 0.0004845529911108315 }, { "copy_logits_max": -1.474723219871521, "copy_logits_min": -687500032.0, "copy_num_tokens": 614.875, "epoch": 1.2297676793464385, "gen_logits_max": 3.1087543964385986, "gen_logits_mean": -17.00052833557129, "gen_logits_min": -28.983741760253906, "gen_logits_std": 3.0728049278259277, "gen_loss": 0.2654028534889221, "grad_norm": 0.35808996181704333, "learning_rate": 2.332421052631579e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9962553381919861, "mean_gen_accuracy": 0.8764861077070236, "mean_token_accuracy": 0.9078536480665207, "num_tokens": 548952967.0, "sample_num_tokens": 9855.75, "step": 6021, "total_num_tokens": 548992390.0, "z_loss": 0.0005362966912798584 }, { "copy_logits_max": -0.7777395248413086, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.5625, "epoch": 1.2299719172836354, "gen_logits_max": 4.749701499938965, "gen_logits_mean": -14.363922119140625, "gen_logits_min": -26.56291961669922, "gen_logits_std": 3.022706985473633, "gen_loss": 0.27835822105407715, "grad_norm": 0.4130252764837197, "learning_rate": 2.3322947368421055e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9957587867975235, "mean_gen_accuracy": 0.8727993369102478, "mean_token_accuracy": 0.9041820615530014, "num_tokens": 549210566.0, "sample_num_tokens": 8249.5, "step": 6022, "total_num_tokens": 549243564.0, "z_loss": 0.0005403448594734073 }, { "copy_logits_max": 1.544944167137146, "copy_logits_min": -750000000.0, "copy_num_tokens": 671.75, "epoch": 1.2301761552208323, "gen_logits_max": 4.062956809997559, "gen_logits_mean": -14.488992691040039, "gen_logits_min": -26.929092407226562, "gen_logits_std": 3.062530517578125, "gen_loss": 0.22457700967788696, "grad_norm": 0.42540717210601464, "learning_rate": 2.3321684210526316e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9953407943248749, "mean_gen_accuracy": 0.8693917393684387, "mean_token_accuracy": 0.8993054181337357, "num_tokens": 549470001.0, "sample_num_tokens": 9128.25, "step": 6023, "total_num_tokens": 549506514.0, "z_loss": 0.00048352929297834635 }, { "copy_logits_max": -1.638953447341919, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.6875, "epoch": 1.230380393158029, "gen_logits_max": 5.89207649230957, "gen_logits_mean": -13.94454574584961, "gen_logits_min": -25.949607849121094, "gen_logits_std": 3.011432647705078, "gen_loss": 0.2521953880786896, "grad_norm": 0.3526814272335859, "learning_rate": 2.332042105263158e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9969276338815689, "mean_gen_accuracy": 0.8845187127590179, "mean_token_accuracy": 0.9115399122238159, "num_tokens": 549746086.0, "sample_num_tokens": 8086.5, "step": 6024, "total_num_tokens": 549778432.0, "z_loss": 0.0004547579155769199 }, { "copy_logits_max": -0.7268871665000916, "copy_logits_min": -687500032.0, "copy_num_tokens": 321.5625, "epoch": 1.230584631095226, "gen_logits_max": 4.191323757171631, "gen_logits_mean": -15.902861595153809, "gen_logits_min": -27.94773292541504, "gen_logits_std": 3.035623550415039, "gen_loss": 0.2584759593009949, "grad_norm": 0.42071553121268473, "learning_rate": 2.331915789473684e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9961603432893753, "mean_gen_accuracy": 0.8757656961679459, "mean_token_accuracy": 0.9057135581970215, "num_tokens": 550010303.0, "sample_num_tokens": 7456.25, "step": 6025, "total_num_tokens": 550040128.0, "z_loss": 0.0005042722914367914 }, { "copy_logits_max": -0.7788113951683044, "copy_logits_min": -750000000.0, "copy_num_tokens": 689.9375, "epoch": 1.2307888690324227, "gen_logits_max": 2.977688789367676, "gen_logits_mean": -16.859092712402344, "gen_logits_min": -29.096403121948242, "gen_logits_std": 3.085454225540161, "gen_loss": 0.2306489795446396, "grad_norm": 0.37920240617283774, "learning_rate": 2.3317894736842105e-05, "loss": 0.278, "mean_copy_accuracy": 0.9959384649991989, "mean_gen_accuracy": 0.8766115605831146, "mean_token_accuracy": 0.906289353966713, "num_tokens": 550282924.0, "sample_num_tokens": 9886.5, "step": 6026, "total_num_tokens": 550322470.0, "z_loss": 0.00045677670277655125 }, { "copy_logits_max": -1.3601555824279785, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.25, "epoch": 1.2309931069696196, "gen_logits_max": 4.031545162200928, "gen_logits_mean": -14.992796897888184, "gen_logits_min": -26.733165740966797, "gen_logits_std": 2.9640069007873535, "gen_loss": 0.2848551571369171, "grad_norm": 0.3367950292572025, "learning_rate": 2.331663157894737e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9966763406991959, "mean_gen_accuracy": 0.8751351237297058, "mean_token_accuracy": 0.9049209207296371, "num_tokens": 550565762.0, "sample_num_tokens": 8157.0, "step": 6027, "total_num_tokens": 550598390.0, "z_loss": 0.0004820466274395585 }, { "copy_logits_max": -1.8751885890960693, "copy_logits_min": -687500032.0, "copy_num_tokens": 388.5, "epoch": 1.2311973449068165, "gen_logits_max": 4.372377872467041, "gen_logits_mean": -15.883974075317383, "gen_logits_min": -28.148996353149414, "gen_logits_std": 3.043694496154785, "gen_loss": 0.2823793292045593, "grad_norm": 0.43073777973780325, "learning_rate": 2.3315368421052634e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9945631474256516, "mean_gen_accuracy": 0.879486545920372, "mean_token_accuracy": 0.9071517139673233, "num_tokens": 550808034.0, "sample_num_tokens": 7734.0, "step": 6028, "total_num_tokens": 550838970.0, "z_loss": 0.0005377686466090381 }, { "copy_logits_max": -1.105679988861084, "copy_logits_min": -687500032.0, "copy_num_tokens": 680.4375, "epoch": 1.2314015828440132, "gen_logits_max": 4.0554890632629395, "gen_logits_mean": -15.636658668518066, "gen_logits_min": -27.423255920410156, "gen_logits_std": 3.028348684310913, "gen_loss": 0.23761412501335144, "grad_norm": 0.3958173501686608, "learning_rate": 2.3314105263157895e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9958333075046539, "mean_gen_accuracy": 0.8741311132907867, "mean_token_accuracy": 0.9066044092178345, "num_tokens": 551093679.0, "sample_num_tokens": 9075.75, "step": 6029, "total_num_tokens": 551129982.0, "z_loss": 0.00043920561438426375 }, { "copy_logits_max": -2.9063644409179688, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.4375, "epoch": 1.2316058207812102, "gen_logits_max": 3.4628305435180664, "gen_logits_mean": -18.03632354736328, "gen_logits_min": -29.868850708007812, "gen_logits_std": 3.095205068588257, "gen_loss": 0.2584681510925293, "grad_norm": 0.36524959060589296, "learning_rate": 2.331284210526316e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9963656067848206, "mean_gen_accuracy": 0.8803714364767075, "mean_token_accuracy": 0.907919317483902, "num_tokens": 551366812.0, "sample_num_tokens": 8294.0, "step": 6030, "total_num_tokens": 551399988.0, "z_loss": 0.0004941246006637812 }, { "copy_logits_max": -2.614899158477783, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5625, "epoch": 1.2318100587184069, "gen_logits_max": 3.92263126373291, "gen_logits_mean": -16.791080474853516, "gen_logits_min": -28.74252700805664, "gen_logits_std": 3.0530786514282227, "gen_loss": 0.29928791522979736, "grad_norm": 0.37378354641157124, "learning_rate": 2.331157894736842e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9958738684654236, "mean_gen_accuracy": 0.8762568533420563, "mean_token_accuracy": 0.905006930232048, "num_tokens": 551637276.0, "sample_num_tokens": 8451.0, "step": 6031, "total_num_tokens": 551671080.0, "z_loss": 0.0005574591923505068 }, { "copy_logits_max": -2.6347222328186035, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.125, "epoch": 1.2320142966556038, "gen_logits_max": 4.585521697998047, "gen_logits_mean": -16.491241455078125, "gen_logits_min": -28.4017391204834, "gen_logits_std": 3.051013946533203, "gen_loss": 0.27209144830703735, "grad_norm": 0.37407563591041204, "learning_rate": 2.3310315789473684e-05, "loss": 0.2819, "mean_copy_accuracy": 0.996228888630867, "mean_gen_accuracy": 0.8781125098466873, "mean_token_accuracy": 0.9059119671583176, "num_tokens": 551909801.0, "sample_num_tokens": 8813.75, "step": 6032, "total_num_tokens": 551945056.0, "z_loss": 0.0005021400284022093 }, { "copy_logits_max": -1.5055439472198486, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.4375, "epoch": 1.2322185345928007, "gen_logits_max": 5.830620288848877, "gen_logits_mean": -14.069273948669434, "gen_logits_min": -26.440542221069336, "gen_logits_std": 3.0387511253356934, "gen_loss": 0.25205153226852417, "grad_norm": 0.38365018737705464, "learning_rate": 2.3309052631578945e-05, "loss": 0.2696, "mean_copy_accuracy": 0.995952308177948, "mean_gen_accuracy": 0.8803755044937134, "mean_token_accuracy": 0.9081750065088272, "num_tokens": 552166174.0, "sample_num_tokens": 7785.5, "step": 6033, "total_num_tokens": 552197316.0, "z_loss": 0.0004137776850257069 }, { "copy_logits_max": -3.855288028717041, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.4375, "epoch": 1.2324227725299974, "gen_logits_max": 5.170716762542725, "gen_logits_mean": -15.965709686279297, "gen_logits_min": -28.055675506591797, "gen_logits_std": 3.0406441688537598, "gen_loss": 0.30849188566207886, "grad_norm": 0.42815170197459795, "learning_rate": 2.330778947368421e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9952649623155594, "mean_gen_accuracy": 0.8760210573673248, "mean_token_accuracy": 0.9031712263822556, "num_tokens": 552418011.0, "sample_num_tokens": 7694.75, "step": 6034, "total_num_tokens": 552448790.0, "z_loss": 0.0005907205049879849 }, { "copy_logits_max": -4.996922492980957, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.125, "epoch": 1.2326270104671944, "gen_logits_max": 4.892911911010742, "gen_logits_mean": -16.190940856933594, "gen_logits_min": -27.795658111572266, "gen_logits_std": 3.0115129947662354, "gen_loss": 0.29902979731559753, "grad_norm": 0.37651270716849944, "learning_rate": 2.3306526315789477e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9954947680234909, "mean_gen_accuracy": 0.8745011985301971, "mean_token_accuracy": 0.9050100445747375, "num_tokens": 552690357.0, "sample_num_tokens": 7141.25, "step": 6035, "total_num_tokens": 552718922.0, "z_loss": 0.0005137774278409779 }, { "copy_logits_max": -3.547173261642456, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.75, "epoch": 1.232831248404391, "gen_logits_max": 4.141276836395264, "gen_logits_mean": -16.156267166137695, "gen_logits_min": -27.974750518798828, "gen_logits_std": 3.0114645957946777, "gen_loss": 0.29025110602378845, "grad_norm": 0.3982399300084979, "learning_rate": 2.3305263157894738e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9963896572589874, "mean_gen_accuracy": 0.873339906334877, "mean_token_accuracy": 0.9023424237966537, "num_tokens": 552954781.0, "sample_num_tokens": 8588.25, "step": 6036, "total_num_tokens": 552989134.0, "z_loss": 0.0005423655966296792 }, { "copy_logits_max": 0.5401319265365601, "copy_logits_min": -750000000.0, "copy_num_tokens": 753.4375, "epoch": 1.233035486341588, "gen_logits_max": 3.669372081756592, "gen_logits_mean": -14.986328125, "gen_logits_min": -27.74163055419922, "gen_logits_std": 3.0533242225646973, "gen_loss": 0.24939611554145813, "grad_norm": 0.35148761099235787, "learning_rate": 2.3304000000000003e-05, "loss": 0.2533, "mean_copy_accuracy": 0.9971786886453629, "mean_gen_accuracy": 0.876385360956192, "mean_token_accuracy": 0.9140525460243225, "num_tokens": 553243613.0, "sample_num_tokens": 10006.25, "step": 6037, "total_num_tokens": 553283638.0, "z_loss": 0.00046722296974621713 }, { "copy_logits_max": -3.3034815788269043, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.25, "epoch": 1.2332397242787847, "gen_logits_max": 5.033946990966797, "gen_logits_mean": -15.218066215515137, "gen_logits_min": -26.90084457397461, "gen_logits_std": 3.0016860961914062, "gen_loss": 0.2886239290237427, "grad_norm": 0.3393811256259939, "learning_rate": 2.3302736842105263e-05, "loss": 0.2797, "mean_copy_accuracy": 0.995935708284378, "mean_gen_accuracy": 0.8719385266304016, "mean_token_accuracy": 0.9034424722194672, "num_tokens": 553523501.0, "sample_num_tokens": 7559.75, "step": 6038, "total_num_tokens": 553553740.0, "z_loss": 0.0005295047303661704 }, { "copy_logits_max": -2.4862685203552246, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.625, "epoch": 1.2334439622159816, "gen_logits_max": 5.378467559814453, "gen_logits_mean": -14.85543441772461, "gen_logits_min": -26.884984970092773, "gen_logits_std": 3.018622875213623, "gen_loss": 0.29831528663635254, "grad_norm": 0.3912551458037881, "learning_rate": 2.3301473684210528e-05, "loss": 0.286, "mean_copy_accuracy": 0.9963333308696747, "mean_gen_accuracy": 0.8716633319854736, "mean_token_accuracy": 0.9024574160575867, "num_tokens": 553792116.0, "sample_num_tokens": 8835.5, "step": 6039, "total_num_tokens": 553827458.0, "z_loss": 0.0005862437537871301 }, { "copy_logits_max": -3.6625967025756836, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.125, "epoch": 1.2336482001531786, "gen_logits_max": 4.802979946136475, "gen_logits_mean": -15.343367576599121, "gen_logits_min": -27.664264678955078, "gen_logits_std": 3.0242042541503906, "gen_loss": 0.269700288772583, "grad_norm": 0.3561107794227838, "learning_rate": 2.330021052631579e-05, "loss": 0.2528, "mean_copy_accuracy": 0.9963645190000534, "mean_gen_accuracy": 0.8804207593202591, "mean_token_accuracy": 0.9137638062238693, "num_tokens": 554081821.0, "sample_num_tokens": 7928.75, "step": 6040, "total_num_tokens": 554113536.0, "z_loss": 0.0005594827234745026 }, { "copy_logits_max": -3.128469467163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.5625, "epoch": 1.2338524380903753, "gen_logits_max": 3.7179174423217773, "gen_logits_mean": -16.97564697265625, "gen_logits_min": -29.428531646728516, "gen_logits_std": 3.091890811920166, "gen_loss": 0.2644796073436737, "grad_norm": 0.3851847423417539, "learning_rate": 2.3298947368421053e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9970293492078781, "mean_gen_accuracy": 0.8755261600017548, "mean_token_accuracy": 0.9075059145689011, "num_tokens": 554334603.0, "sample_num_tokens": 8963.75, "step": 6041, "total_num_tokens": 554370458.0, "z_loss": 0.0005832000751979649 }, { "copy_logits_max": -1.897188663482666, "copy_logits_min": -750000000.0, "copy_num_tokens": 730.625, "epoch": 1.2340566760275722, "gen_logits_max": 3.7694778442382812, "gen_logits_mean": -15.680512428283691, "gen_logits_min": -27.92721176147461, "gen_logits_std": 3.0669684410095215, "gen_loss": 0.25494584441185, "grad_norm": 0.3911987846956618, "learning_rate": 2.3297684210526314e-05, "loss": 0.2702, "mean_copy_accuracy": 0.996053472161293, "mean_gen_accuracy": 0.8724953979253769, "mean_token_accuracy": 0.9078986793756485, "num_tokens": 554621351.0, "sample_num_tokens": 9707.75, "step": 6042, "total_num_tokens": 554660182.0, "z_loss": 0.0004943667445331812 }, { "copy_logits_max": -2.249605894088745, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.875, "epoch": 1.234260913964769, "gen_logits_max": 4.794841766357422, "gen_logits_mean": -14.430338859558105, "gen_logits_min": -26.427438735961914, "gen_logits_std": 3.012207269668579, "gen_loss": 0.2828625738620758, "grad_norm": 0.37701865397912715, "learning_rate": 2.329642105263158e-05, "loss": 0.287, "mean_copy_accuracy": 0.9953453093767166, "mean_gen_accuracy": 0.8758098632097244, "mean_token_accuracy": 0.9033672660589218, "num_tokens": 554897265.0, "sample_num_tokens": 8531.75, "step": 6043, "total_num_tokens": 554931392.0, "z_loss": 0.0005669449456036091 }, { "copy_logits_max": -5.576342582702637, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.1875, "epoch": 1.2344651519019658, "gen_logits_max": 4.633211135864258, "gen_logits_mean": -16.697635650634766, "gen_logits_min": -28.36212921142578, "gen_logits_std": 3.0602071285247803, "gen_loss": 0.266737699508667, "grad_norm": 0.36919947896844485, "learning_rate": 2.3295157894736843e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9952343702316284, "mean_gen_accuracy": 0.8857624530792236, "mean_token_accuracy": 0.9106120616197586, "num_tokens": 555170617.0, "sample_num_tokens": 7962.25, "step": 6044, "total_num_tokens": 555202466.0, "z_loss": 0.0005435559432953596 }, { "copy_logits_max": -3.2561702728271484, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.3125, "epoch": 1.2346693898391625, "gen_logits_max": 5.084149360656738, "gen_logits_mean": -15.174104690551758, "gen_logits_min": -26.93917465209961, "gen_logits_std": 3.005345106124878, "gen_loss": 0.28727176785469055, "grad_norm": 0.3700142106516348, "learning_rate": 2.3293894736842107e-05, "loss": 0.2754, "mean_copy_accuracy": 0.995099663734436, "mean_gen_accuracy": 0.8788249790668488, "mean_token_accuracy": 0.9057627469301224, "num_tokens": 555440291.0, "sample_num_tokens": 7214.75, "step": 6045, "total_num_tokens": 555469150.0, "z_loss": 0.000546370109077543 }, { "copy_logits_max": -3.2449138164520264, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5, "epoch": 1.2348736277763595, "gen_logits_max": 4.269136905670166, "gen_logits_mean": -15.738652229309082, "gen_logits_min": -27.313629150390625, "gen_logits_std": 3.026763439178467, "gen_loss": 0.28631791472435, "grad_norm": 0.42049803064009195, "learning_rate": 2.3292631578947368e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9963332712650299, "mean_gen_accuracy": 0.8737311512231827, "mean_token_accuracy": 0.9032943397760391, "num_tokens": 555686865.0, "sample_num_tokens": 7445.75, "step": 6046, "total_num_tokens": 555716648.0, "z_loss": 0.0005861152894794941 }, { "copy_logits_max": -2.341545820236206, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.4375, "epoch": 1.2350778657135564, "gen_logits_max": 4.51137113571167, "gen_logits_mean": -15.605091094970703, "gen_logits_min": -27.340106964111328, "gen_logits_std": 3.033660411834717, "gen_loss": 0.2766641080379486, "grad_norm": 0.37486341706816373, "learning_rate": 2.3291368421052632e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9969314187765121, "mean_gen_accuracy": 0.879127562046051, "mean_token_accuracy": 0.9052638560533524, "num_tokens": 555953045.0, "sample_num_tokens": 7987.25, "step": 6047, "total_num_tokens": 555984994.0, "z_loss": 0.0005469312309287488 }, { "copy_logits_max": -3.398650884628296, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.625, "epoch": 1.235282103650753, "gen_logits_max": 3.9396889209747314, "gen_logits_mean": -17.01085090637207, "gen_logits_min": -28.550678253173828, "gen_logits_std": 3.0370192527770996, "gen_loss": 0.2814640700817108, "grad_norm": 0.3990789864499103, "learning_rate": 2.3290105263157893e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9950812757015228, "mean_gen_accuracy": 0.8781802505254745, "mean_token_accuracy": 0.9015648663043976, "num_tokens": 556192715.0, "sample_num_tokens": 7364.25, "step": 6048, "total_num_tokens": 556222172.0, "z_loss": 0.0005198084400035441 }, { "copy_logits_max": -1.7768964767456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.6875, "epoch": 1.23548634158795, "gen_logits_max": 4.6597490310668945, "gen_logits_mean": -13.720296859741211, "gen_logits_min": -25.358057022094727, "gen_logits_std": 2.9009759426116943, "gen_loss": 0.31169456243515015, "grad_norm": 0.38311305073949137, "learning_rate": 2.3288842105263157e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9951805770397186, "mean_gen_accuracy": 0.8721775710582733, "mean_token_accuracy": 0.9023447632789612, "num_tokens": 556453892.0, "sample_num_tokens": 9023.5, "step": 6049, "total_num_tokens": 556489986.0, "z_loss": 0.0005951966741122305 }, { "copy_logits_max": -0.6809424161911011, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.625, "epoch": 1.2356905795251467, "gen_logits_max": 3.9965031147003174, "gen_logits_mean": -15.495841979980469, "gen_logits_min": -27.242616653442383, "gen_logits_std": 2.986172676086426, "gen_loss": 0.2618798017501831, "grad_norm": 0.4085542362222445, "learning_rate": 2.328757894736842e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9950894713401794, "mean_gen_accuracy": 0.8816315531730652, "mean_token_accuracy": 0.9071182906627655, "num_tokens": 556711256.0, "sample_num_tokens": 7900.0, "step": 6050, "total_num_tokens": 556742856.0, "z_loss": 0.0005626970669254661 }, { "copy_logits_max": -6.247155666351318, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.4375, "epoch": 1.2358948174623436, "gen_logits_max": 2.649143934249878, "gen_logits_mean": -19.22549819946289, "gen_logits_min": -30.871639251708984, "gen_logits_std": 3.1223766803741455, "gen_loss": 0.25283998250961304, "grad_norm": 0.43078214882681304, "learning_rate": 2.3286315789473686e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9955342411994934, "mean_gen_accuracy": 0.8758931010961533, "mean_token_accuracy": 0.903002068400383, "num_tokens": 556967420.0, "sample_num_tokens": 6986.0, "step": 6051, "total_num_tokens": 556995364.0, "z_loss": 0.000430254585808143 }, { "copy_logits_max": -4.361106872558594, "copy_logits_min": -687500032.0, "copy_num_tokens": 600.25, "epoch": 1.2360990553995403, "gen_logits_max": 3.653651475906372, "gen_logits_mean": -15.743719100952148, "gen_logits_min": -27.62042999267578, "gen_logits_std": 3.0497593879699707, "gen_loss": 0.244209885597229, "grad_norm": 0.37760319306455126, "learning_rate": 2.328505263157895e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9971576780080795, "mean_gen_accuracy": 0.8749575912952423, "mean_token_accuracy": 0.9053677618503571, "num_tokens": 557227505.0, "sample_num_tokens": 8655.75, "step": 6052, "total_num_tokens": 557262128.0, "z_loss": 0.0004571331955958158 }, { "copy_logits_max": -6.285122871398926, "copy_logits_min": -750000064.0, "copy_num_tokens": 370.25, "epoch": 1.2363032933367373, "gen_logits_max": 3.9447944164276123, "gen_logits_mean": -16.856924057006836, "gen_logits_min": -28.46680450439453, "gen_logits_std": 3.0545201301574707, "gen_loss": 0.27411675453186035, "grad_norm": 0.36411782424458544, "learning_rate": 2.328378947368421e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9964001625776291, "mean_gen_accuracy": 0.8811097145080566, "mean_token_accuracy": 0.9096938371658325, "num_tokens": 557504008.0, "sample_num_tokens": 8528.0, "step": 6053, "total_num_tokens": 557538120.0, "z_loss": 0.00047460809582844377 }, { "copy_logits_max": -2.8827178478240967, "copy_logits_min": -750000064.0, "copy_num_tokens": 487.125, "epoch": 1.2365075312739342, "gen_logits_max": 4.191679000854492, "gen_logits_mean": -15.390035629272461, "gen_logits_min": -27.41977310180664, "gen_logits_std": 3.039280891418457, "gen_loss": 0.26748064160346985, "grad_norm": 0.43089846287673544, "learning_rate": 2.3282526315789475e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9959330856800079, "mean_gen_accuracy": 0.8703575730323792, "mean_token_accuracy": 0.9003954976797104, "num_tokens": 557785820.0, "sample_num_tokens": 8468.5, "step": 6054, "total_num_tokens": 557819694.0, "z_loss": 0.0005075426306575537 }, { "copy_logits_max": -2.6360554695129395, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.5, "epoch": 1.236711769211131, "gen_logits_max": 3.5884244441986084, "gen_logits_mean": -15.006218910217285, "gen_logits_min": -27.220874786376953, "gen_logits_std": 2.982539176940918, "gen_loss": 0.26065152883529663, "grad_norm": 0.4488525808617154, "learning_rate": 2.3281263157894736e-05, "loss": 0.2614, "mean_copy_accuracy": 0.9967684000730515, "mean_gen_accuracy": 0.8790227472782135, "mean_token_accuracy": 0.9121280014514923, "num_tokens": 558064633.0, "sample_num_tokens": 8726.25, "step": 6055, "total_num_tokens": 558099538.0, "z_loss": 0.0004913450102321804 }, { "copy_logits_max": -4.043402194976807, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.9375, "epoch": 1.2369160071483278, "gen_logits_max": 4.082531452178955, "gen_logits_mean": -15.4662446975708, "gen_logits_min": -27.397310256958008, "gen_logits_std": 2.9746251106262207, "gen_loss": 0.30986741185188293, "grad_norm": 0.42811048824456005, "learning_rate": 2.328e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9959395825862885, "mean_gen_accuracy": 0.8699349015951157, "mean_token_accuracy": 0.897222712635994, "num_tokens": 558318434.0, "sample_num_tokens": 8319.5, "step": 6056, "total_num_tokens": 558351712.0, "z_loss": 0.0005942300194874406 }, { "copy_logits_max": -4.199563026428223, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.5625, "epoch": 1.2371202450855245, "gen_logits_max": 6.132693290710449, "gen_logits_mean": -12.776742935180664, "gen_logits_min": -25.27082633972168, "gen_logits_std": 2.9098024368286133, "gen_loss": 0.3211326599121094, "grad_norm": 0.39024768557940914, "learning_rate": 2.327873684210526e-05, "loss": 0.295, "mean_copy_accuracy": 0.9963351935148239, "mean_gen_accuracy": 0.8691359758377075, "mean_token_accuracy": 0.9007034450769424, "num_tokens": 558581279.0, "sample_num_tokens": 8668.75, "step": 6057, "total_num_tokens": 558615954.0, "z_loss": 0.0005506908055394888 }, { "copy_logits_max": -4.053474426269531, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.1875, "epoch": 1.2373244830227215, "gen_logits_max": 3.683743953704834, "gen_logits_mean": -15.918784141540527, "gen_logits_min": -28.349897384643555, "gen_logits_std": 3.0243561267852783, "gen_loss": 0.27768975496292114, "grad_norm": 0.38843139392673204, "learning_rate": 2.3277473684210526e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9960851073265076, "mean_gen_accuracy": 0.8702274709939957, "mean_token_accuracy": 0.9013248085975647, "num_tokens": 558847397.0, "sample_num_tokens": 7991.25, "step": 6058, "total_num_tokens": 558879362.0, "z_loss": 0.00048702373169362545 }, { "copy_logits_max": -3.5024261474609375, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.0625, "epoch": 1.2375287209599184, "gen_logits_max": 3.0893502235412598, "gen_logits_mean": -16.3204345703125, "gen_logits_min": -28.653953552246094, "gen_logits_std": 3.0753865242004395, "gen_loss": 0.22575736045837402, "grad_norm": 0.40282370150789354, "learning_rate": 2.327621052631579e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9948910474777222, "mean_gen_accuracy": 0.882186770439148, "mean_token_accuracy": 0.9112509787082672, "num_tokens": 559113168.0, "sample_num_tokens": 7751.5, "step": 6059, "total_num_tokens": 559144174.0, "z_loss": 0.0004491729778237641 }, { "copy_logits_max": -3.039315700531006, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5625, "epoch": 1.237732958897115, "gen_logits_max": 5.33792781829834, "gen_logits_mean": -14.002384185791016, "gen_logits_min": -26.0008544921875, "gen_logits_std": 3.003419876098633, "gen_loss": 0.32436439394950867, "grad_norm": 0.4168639642436691, "learning_rate": 2.3274947368421055e-05, "loss": 0.3065, "mean_copy_accuracy": 0.995868131518364, "mean_gen_accuracy": 0.86799556016922, "mean_token_accuracy": 0.8958686739206314, "num_tokens": 559372873.0, "sample_num_tokens": 7944.75, "step": 6060, "total_num_tokens": 559404652.0, "z_loss": 0.0006298718508332968 }, { "copy_logits_max": -0.9793180227279663, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.9375, "epoch": 1.237937196834312, "gen_logits_max": 3.5274863243103027, "gen_logits_mean": -16.800743103027344, "gen_logits_min": -29.064517974853516, "gen_logits_std": 3.090981960296631, "gen_loss": 0.29579558968544006, "grad_norm": 0.37807128205690027, "learning_rate": 2.3273684210526316e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9961758553981781, "mean_gen_accuracy": 0.8703705072402954, "mean_token_accuracy": 0.905259758234024, "num_tokens": 559668381.0, "sample_num_tokens": 8385.75, "step": 6061, "total_num_tokens": 559701924.0, "z_loss": 0.0006481509190052748 }, { "copy_logits_max": -1.697068691253662, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.25, "epoch": 1.2381414347715087, "gen_logits_max": 3.787811756134033, "gen_logits_mean": -15.174999237060547, "gen_logits_min": -27.565536499023438, "gen_logits_std": 3.043407917022705, "gen_loss": 0.2743462026119232, "grad_norm": 0.4004927489225364, "learning_rate": 2.327242105263158e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9965941309928894, "mean_gen_accuracy": 0.8767872303724289, "mean_token_accuracy": 0.9068106561899185, "num_tokens": 559951278.0, "sample_num_tokens": 8303.0, "step": 6062, "total_num_tokens": 559984490.0, "z_loss": 0.0005192068638280034 }, { "copy_logits_max": -3.4196059703826904, "copy_logits_min": -625000000.0, "copy_num_tokens": 448.8125, "epoch": 1.2383456727087057, "gen_logits_max": 3.926501512527466, "gen_logits_mean": -15.983345031738281, "gen_logits_min": -28.305728912353516, "gen_logits_std": 3.047776222229004, "gen_loss": 0.333026647567749, "grad_norm": 0.3603708426405039, "learning_rate": 2.3271157894736844e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9966831505298615, "mean_gen_accuracy": 0.8684262633323669, "mean_token_accuracy": 0.9004011899232864, "num_tokens": 560227842.0, "sample_num_tokens": 9149.5, "step": 6063, "total_num_tokens": 560264440.0, "z_loss": 0.0005854426417499781 }, { "copy_logits_max": 3.709012508392334, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.9375, "epoch": 1.2385499106459026, "gen_logits_max": 4.878239631652832, "gen_logits_mean": -12.871288299560547, "gen_logits_min": -25.651531219482422, "gen_logits_std": 2.9734530448913574, "gen_loss": 0.26213324069976807, "grad_norm": 0.4046365459285615, "learning_rate": 2.3269894736842105e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9951466619968414, "mean_gen_accuracy": 0.8652344346046448, "mean_token_accuracy": 0.8961114883422852, "num_tokens": 560484332.0, "sample_num_tokens": 9234.5, "step": 6064, "total_num_tokens": 560521270.0, "z_loss": 0.0004452848224900663 }, { "copy_logits_max": -0.8843837976455688, "copy_logits_min": -625000000.0, "copy_num_tokens": 406.75, "epoch": 1.2387541485830993, "gen_logits_max": 3.5114552974700928, "gen_logits_mean": -16.831092834472656, "gen_logits_min": -29.221481323242188, "gen_logits_std": 3.094820022583008, "gen_loss": 0.28031808137893677, "grad_norm": 0.44624075116951173, "learning_rate": 2.326863157894737e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9951201528310776, "mean_gen_accuracy": 0.8745661377906799, "mean_token_accuracy": 0.9061671942472458, "num_tokens": 560762663.0, "sample_num_tokens": 7972.75, "step": 6065, "total_num_tokens": 560794554.0, "z_loss": 0.0005541258724406362 }, { "copy_logits_max": -0.21390095353126526, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.375, "epoch": 1.2389583865202962, "gen_logits_max": 3.9327619075775146, "gen_logits_mean": -15.91883659362793, "gen_logits_min": -28.585500717163086, "gen_logits_std": 3.0752763748168945, "gen_loss": 0.3126756548881531, "grad_norm": 0.3721225710493869, "learning_rate": 2.326736842105263e-05, "loss": 0.3089, "mean_copy_accuracy": 0.9958380162715912, "mean_gen_accuracy": 0.8653378486633301, "mean_token_accuracy": 0.89592245221138, "num_tokens": 561026345.0, "sample_num_tokens": 7011.25, "step": 6066, "total_num_tokens": 561054390.0, "z_loss": 0.0006625703535974026 }, { "copy_logits_max": 0.38886353373527527, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.25, "epoch": 1.239162624457493, "gen_logits_max": 3.3035144805908203, "gen_logits_mean": -16.658775329589844, "gen_logits_min": -28.542970657348633, "gen_logits_std": 3.0691070556640625, "gen_loss": 0.2819596529006958, "grad_norm": 0.4228102410605227, "learning_rate": 2.3266105263157898e-05, "loss": 0.2712, "mean_copy_accuracy": 0.995281845331192, "mean_gen_accuracy": 0.8844880759716034, "mean_token_accuracy": 0.9079845994710922, "num_tokens": 561293908.0, "sample_num_tokens": 8074.5, "step": 6067, "total_num_tokens": 561326206.0, "z_loss": 0.0005833144532516599 }, { "copy_logits_max": -3.0120503902435303, "copy_logits_min": -687500032.0, "copy_num_tokens": 432.75, "epoch": 1.2393668623946898, "gen_logits_max": 3.855717658996582, "gen_logits_mean": -16.550045013427734, "gen_logits_min": -28.52348518371582, "gen_logits_std": 3.099252700805664, "gen_loss": 0.24337975680828094, "grad_norm": 0.3766034971083243, "learning_rate": 2.326484210526316e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9962677359580994, "mean_gen_accuracy": 0.8822640031576157, "mean_token_accuracy": 0.9072750955820084, "num_tokens": 561556124.0, "sample_num_tokens": 7798.0, "step": 6068, "total_num_tokens": 561587316.0, "z_loss": 0.00048702972708269954 }, { "copy_logits_max": 0.890026330947876, "copy_logits_min": -750000000.0, "copy_num_tokens": 809.625, "epoch": 1.2395711003318866, "gen_logits_max": 4.357346057891846, "gen_logits_mean": -14.335210800170898, "gen_logits_min": -27.017906188964844, "gen_logits_std": 3.074274778366089, "gen_loss": 0.2315395623445511, "grad_norm": 0.399543559306836, "learning_rate": 2.3263578947368423e-05, "loss": 0.2843, "mean_copy_accuracy": 0.994699627161026, "mean_gen_accuracy": 0.8738089799880981, "mean_token_accuracy": 0.9023090302944183, "num_tokens": 561817468.0, "sample_num_tokens": 10549.5, "step": 6069, "total_num_tokens": 561859666.0, "z_loss": 0.00043118844041600823 }, { "copy_logits_max": 1.3076317310333252, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.0625, "epoch": 1.2397753382690835, "gen_logits_max": 5.049355506896973, "gen_logits_mean": -14.943475723266602, "gen_logits_min": -26.929651260375977, "gen_logits_std": 3.0248353481292725, "gen_loss": 0.292206734418869, "grad_norm": 0.39307336775507185, "learning_rate": 2.3262315789473684e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9945269674062729, "mean_gen_accuracy": 0.8740198612213135, "mean_token_accuracy": 0.9043920189142227, "num_tokens": 562095960.0, "sample_num_tokens": 8479.0, "step": 6070, "total_num_tokens": 562129876.0, "z_loss": 0.0005225329659879208 }, { "copy_logits_max": -0.2871865928173065, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.3125, "epoch": 1.2399795762062804, "gen_logits_max": 3.3462791442871094, "gen_logits_mean": -16.192747116088867, "gen_logits_min": -28.081363677978516, "gen_logits_std": 3.069612503051758, "gen_loss": 0.2742782235145569, "grad_norm": 0.3553255557320012, "learning_rate": 2.326105263157895e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9977877736091614, "mean_gen_accuracy": 0.8761268705129623, "mean_token_accuracy": 0.9106549024581909, "num_tokens": 562399425.0, "sample_num_tokens": 9840.75, "step": 6071, "total_num_tokens": 562438788.0, "z_loss": 0.00048278304166160524 }, { "copy_logits_max": -0.5683977603912354, "copy_logits_min": -750000064.0, "copy_num_tokens": 622.125, "epoch": 1.2401838141434771, "gen_logits_max": 4.081655979156494, "gen_logits_mean": -15.994804382324219, "gen_logits_min": -27.69266128540039, "gen_logits_std": 3.1046433448791504, "gen_loss": 0.2457149475812912, "grad_norm": 0.4091907559732124, "learning_rate": 2.325978947368421e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9958392679691315, "mean_gen_accuracy": 0.8715554177761078, "mean_token_accuracy": 0.9009855389595032, "num_tokens": 562683202.0, "sample_num_tokens": 10946.0, "step": 6072, "total_num_tokens": 562726986.0, "z_loss": 0.00046103179920464754 }, { "copy_logits_max": -0.649741530418396, "copy_logits_min": -687500032.0, "copy_num_tokens": 709.1875, "epoch": 1.240388052080674, "gen_logits_max": 4.936551094055176, "gen_logits_mean": -14.540157318115234, "gen_logits_min": -26.571216583251953, "gen_logits_std": 3.01422381401062, "gen_loss": 0.2921670079231262, "grad_norm": 0.41314149195535493, "learning_rate": 2.3258526315789474e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9960240125656128, "mean_gen_accuracy": 0.8718267381191254, "mean_token_accuracy": 0.9009163975715637, "num_tokens": 562944038.0, "sample_num_tokens": 11066.0, "step": 6073, "total_num_tokens": 562988302.0, "z_loss": 0.0004891208955086768 }, { "copy_logits_max": -3.1580376625061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.4375, "epoch": 1.2405922900178707, "gen_logits_max": 3.983004570007324, "gen_logits_mean": -16.844932556152344, "gen_logits_min": -28.877853393554688, "gen_logits_std": 3.0958871841430664, "gen_loss": 0.330249160528183, "grad_norm": 0.36734872018362963, "learning_rate": 2.3257263157894735e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9969126880168915, "mean_gen_accuracy": 0.8757143169641495, "mean_token_accuracy": 0.9017836898565292, "num_tokens": 563235123.0, "sample_num_tokens": 7435.25, "step": 6074, "total_num_tokens": 563264864.0, "z_loss": 0.0005645955679938197 }, { "copy_logits_max": -4.221133232116699, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.6875, "epoch": 1.2407965279550677, "gen_logits_max": 2.867541551589966, "gen_logits_mean": -18.212602615356445, "gen_logits_min": -30.371990203857422, "gen_logits_std": 3.1293740272521973, "gen_loss": 0.2685965597629547, "grad_norm": 0.3853999711899004, "learning_rate": 2.3256e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9970504492521286, "mean_gen_accuracy": 0.8755476325750351, "mean_token_accuracy": 0.9080604761838913, "num_tokens": 563509120.0, "sample_num_tokens": 7520.0, "step": 6075, "total_num_tokens": 563539200.0, "z_loss": 0.0004518427886068821 }, { "copy_logits_max": -2.5764389038085938, "copy_logits_min": -750000064.0, "copy_num_tokens": 469.8125, "epoch": 1.2410007658922644, "gen_logits_max": 4.793062210083008, "gen_logits_mean": -15.987841606140137, "gen_logits_min": -28.36935806274414, "gen_logits_std": 3.0738673210144043, "gen_loss": 0.2605871558189392, "grad_norm": 0.37903429354729223, "learning_rate": 2.3254736842105267e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9962105602025986, "mean_gen_accuracy": 0.8745771199464798, "mean_token_accuracy": 0.9043395817279816, "num_tokens": 563754230.0, "sample_num_tokens": 7989.0, "step": 6076, "total_num_tokens": 563786186.0, "z_loss": 0.0004724980390165001 }, { "copy_logits_max": -3.4202938079833984, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.375, "epoch": 1.2412050038294613, "gen_logits_max": 4.872489929199219, "gen_logits_mean": -16.766929626464844, "gen_logits_min": -28.6492919921875, "gen_logits_std": 3.080862045288086, "gen_loss": 0.27130627632141113, "grad_norm": 0.3657314509686361, "learning_rate": 2.3253473684210528e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9950384348630905, "mean_gen_accuracy": 0.8748100697994232, "mean_token_accuracy": 0.9001383036375046, "num_tokens": 564025636.0, "sample_num_tokens": 8831.5, "step": 6077, "total_num_tokens": 564060962.0, "z_loss": 0.0005616728449240327 }, { "copy_logits_max": 0.49424490332603455, "copy_logits_min": -750000064.0, "copy_num_tokens": 467.5625, "epoch": 1.2414092417666582, "gen_logits_max": 5.155851364135742, "gen_logits_mean": -14.674826622009277, "gen_logits_min": -26.897602081298828, "gen_logits_std": 3.029550552368164, "gen_loss": 0.2851060628890991, "grad_norm": 0.3944699341591369, "learning_rate": 2.3252210526315792e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9969644397497177, "mean_gen_accuracy": 0.8693777322769165, "mean_token_accuracy": 0.9016537964344025, "num_tokens": 564294863.0, "sample_num_tokens": 7815.75, "step": 6078, "total_num_tokens": 564326126.0, "z_loss": 0.0005097693065181375 }, { "copy_logits_max": -0.6225771307945251, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.125, "epoch": 1.241613479703855, "gen_logits_max": 4.063966751098633, "gen_logits_mean": -16.440717697143555, "gen_logits_min": -28.67324447631836, "gen_logits_std": 3.0844335556030273, "gen_loss": 0.2894766628742218, "grad_norm": 0.4060354780801386, "learning_rate": 2.3250947368421053e-05, "loss": 0.2931, "mean_copy_accuracy": 0.9954796880483627, "mean_gen_accuracy": 0.8720715939998627, "mean_token_accuracy": 0.9003511816263199, "num_tokens": 564558746.0, "sample_num_tokens": 8194.5, "step": 6079, "total_num_tokens": 564591524.0, "z_loss": 0.0005539081757888198 }, { "copy_logits_max": -0.9291794300079346, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.875, "epoch": 1.2418177176410519, "gen_logits_max": 3.6083595752716064, "gen_logits_mean": -16.54315185546875, "gen_logits_min": -28.538171768188477, "gen_logits_std": 3.0981605052948, "gen_loss": 0.25104081630706787, "grad_norm": 0.4156278224556402, "learning_rate": 2.3249684210526317e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9958906918764114, "mean_gen_accuracy": 0.8850101977586746, "mean_token_accuracy": 0.9096305072307587, "num_tokens": 564797085.0, "sample_num_tokens": 8127.25, "step": 6080, "total_num_tokens": 564829594.0, "z_loss": 0.0005221226019784808 }, { "copy_logits_max": 1.1705528497695923, "copy_logits_min": -750000000.0, "copy_num_tokens": 257.0, "epoch": 1.2420219555782486, "gen_logits_max": 5.624270439147949, "gen_logits_mean": -13.95613956451416, "gen_logits_min": -25.952661514282227, "gen_logits_std": 3.0434677600860596, "gen_loss": 0.2965126037597656, "grad_norm": 0.3694903419457298, "learning_rate": 2.3248421052631578e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9961543828248978, "mean_gen_accuracy": 0.8730453252792358, "mean_token_accuracy": 0.9001723229885101, "num_tokens": 565059001.0, "sample_num_tokens": 6833.25, "step": 6081, "total_num_tokens": 565086334.0, "z_loss": 0.000565725436899811 }, { "copy_logits_max": 0.44726163148880005, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.375, "epoch": 1.2422261935154455, "gen_logits_max": 4.493177890777588, "gen_logits_mean": -14.933975219726562, "gen_logits_min": -26.67919158935547, "gen_logits_std": 3.045572280883789, "gen_loss": 0.34224045276641846, "grad_norm": 0.3901286798666595, "learning_rate": 2.3247157894736842e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9969190508127213, "mean_gen_accuracy": 0.8658905774354935, "mean_token_accuracy": 0.8983692675828934, "num_tokens": 565320737.0, "sample_num_tokens": 8699.25, "step": 6082, "total_num_tokens": 565355534.0, "z_loss": 0.0006076520076021552 }, { "copy_logits_max": -2.960277557373047, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.0625, "epoch": 1.2424304314526424, "gen_logits_max": 4.070135116577148, "gen_logits_mean": -16.176197052001953, "gen_logits_min": -28.021442413330078, "gen_logits_std": 3.075399160385132, "gen_loss": 0.2759057283401489, "grad_norm": 0.3926199946231945, "learning_rate": 2.3245894736842103e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9965086728334427, "mean_gen_accuracy": 0.8741943836212158, "mean_token_accuracy": 0.9046498537063599, "num_tokens": 565605720.0, "sample_num_tokens": 9153.5, "step": 6083, "total_num_tokens": 565642334.0, "z_loss": 0.0005161286098882556 }, { "copy_logits_max": 3.706923484802246, "copy_logits_min": -687500032.0, "copy_num_tokens": 365.4375, "epoch": 1.2426346693898391, "gen_logits_max": 5.152359485626221, "gen_logits_mean": -14.507011413574219, "gen_logits_min": -27.278362274169922, "gen_logits_std": 3.077566385269165, "gen_loss": 0.2515454888343811, "grad_norm": 0.3908203856909015, "learning_rate": 2.324463157894737e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9958236515522003, "mean_gen_accuracy": 0.8750571459531784, "mean_token_accuracy": 0.9002908021211624, "num_tokens": 565868828.0, "sample_num_tokens": 7475.5, "step": 6084, "total_num_tokens": 565898730.0, "z_loss": 0.0004469625127967447 }, { "copy_logits_max": -0.6554112434387207, "copy_logits_min": -687500032.0, "copy_num_tokens": 482.6875, "epoch": 1.242838907327036, "gen_logits_max": 3.7782511711120605, "gen_logits_mean": -15.994036674499512, "gen_logits_min": -28.141460418701172, "gen_logits_std": 3.088028907775879, "gen_loss": 0.29233676195144653, "grad_norm": 0.370453229808798, "learning_rate": 2.3243368421052632e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9960760325193405, "mean_gen_accuracy": 0.8758990466594696, "mean_token_accuracy": 0.9058694541454315, "num_tokens": 566137075.0, "sample_num_tokens": 8714.75, "step": 6085, "total_num_tokens": 566171934.0, "z_loss": 0.0005581493023782969 }, { "copy_logits_max": -1.9307302236557007, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.5625, "epoch": 1.2430431452642328, "gen_logits_max": 4.481484413146973, "gen_logits_mean": -15.516426086425781, "gen_logits_min": -27.624591827392578, "gen_logits_std": 3.0694689750671387, "gen_loss": 0.32289043068885803, "grad_norm": 0.40062690035648735, "learning_rate": 2.3242105263157896e-05, "loss": 0.3046, "mean_copy_accuracy": 0.9966566860675812, "mean_gen_accuracy": 0.871598094701767, "mean_token_accuracy": 0.8980899900197983, "num_tokens": 566410420.0, "sample_num_tokens": 7404.5, "step": 6086, "total_num_tokens": 566440038.0, "z_loss": 0.0006362220738083124 }, { "copy_logits_max": 0.6050245761871338, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.9375, "epoch": 1.2432473832014297, "gen_logits_max": 5.101776123046875, "gen_logits_mean": -14.668787956237793, "gen_logits_min": -26.620784759521484, "gen_logits_std": 3.0709376335144043, "gen_loss": 0.274760901927948, "grad_norm": 0.42226084833535277, "learning_rate": 2.3240842105263157e-05, "loss": 0.3044, "mean_copy_accuracy": 0.9966062307357788, "mean_gen_accuracy": 0.8711559921503067, "mean_token_accuracy": 0.8991336673498154, "num_tokens": 566661915.0, "sample_num_tokens": 8288.75, "step": 6087, "total_num_tokens": 566695070.0, "z_loss": 0.0005773366428911686 }, { "copy_logits_max": 0.9023761749267578, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.375, "epoch": 1.2434516211386266, "gen_logits_max": 3.7151622772216797, "gen_logits_mean": -16.6275634765625, "gen_logits_min": -29.147790908813477, "gen_logits_std": 3.0948071479797363, "gen_loss": 0.2975994348526001, "grad_norm": 0.39866117131066386, "learning_rate": 2.323957894736842e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9965944588184357, "mean_gen_accuracy": 0.8782176971435547, "mean_token_accuracy": 0.9069216102361679, "num_tokens": 566931352.0, "sample_num_tokens": 7971.5, "step": 6088, "total_num_tokens": 566963238.0, "z_loss": 0.0006738209631294012 }, { "copy_logits_max": 0.03586382418870926, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.1875, "epoch": 1.2436558590758233, "gen_logits_max": 4.077841758728027, "gen_logits_mean": -16.07124900817871, "gen_logits_min": -28.4965877532959, "gen_logits_std": 3.0881690979003906, "gen_loss": 0.2905403971672058, "grad_norm": 0.3908378454750267, "learning_rate": 2.3238315789473686e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9972282499074936, "mean_gen_accuracy": 0.8772521466016769, "mean_token_accuracy": 0.9062813967466354, "num_tokens": 567220139.0, "sample_num_tokens": 8348.25, "step": 6089, "total_num_tokens": 567253532.0, "z_loss": 0.00055452820379287 }, { "copy_logits_max": -2.749354124069214, "copy_logits_min": -687500032.0, "copy_num_tokens": 300.9375, "epoch": 1.2438600970130202, "gen_logits_max": 3.8820533752441406, "gen_logits_mean": -17.071733474731445, "gen_logits_min": -29.3070068359375, "gen_logits_std": 3.1018424034118652, "gen_loss": 0.29447141289711, "grad_norm": 0.38681816736510116, "learning_rate": 2.3237052631578947e-05, "loss": 0.2879, "mean_copy_accuracy": 0.994557335972786, "mean_gen_accuracy": 0.8740902245044708, "mean_token_accuracy": 0.8995294272899628, "num_tokens": 567481473.0, "sample_num_tokens": 7635.75, "step": 6090, "total_num_tokens": 567512016.0, "z_loss": 0.0005573495291173458 }, { "copy_logits_max": -3.61692476272583, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.1875, "epoch": 1.244064334950217, "gen_logits_max": 3.8856465816497803, "gen_logits_mean": -17.011131286621094, "gen_logits_min": -29.057662963867188, "gen_logits_std": 3.099569797515869, "gen_loss": 0.3054850995540619, "grad_norm": 0.42316618853180005, "learning_rate": 2.323578947368421e-05, "loss": 0.3002, "mean_copy_accuracy": 0.9952885061502457, "mean_gen_accuracy": 0.8719021081924438, "mean_token_accuracy": 0.8987168073654175, "num_tokens": 567735572.0, "sample_num_tokens": 7341.0, "step": 6091, "total_num_tokens": 567764936.0, "z_loss": 0.0005632168613374233 }, { "copy_logits_max": -0.7218275666236877, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5, "epoch": 1.2442685728874139, "gen_logits_max": 3.7720210552215576, "gen_logits_mean": -15.404350280761719, "gen_logits_min": -27.700180053710938, "gen_logits_std": 3.0911779403686523, "gen_loss": 0.2487468272447586, "grad_norm": 0.37676573722781315, "learning_rate": 2.3234526315789475e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9967435002326965, "mean_gen_accuracy": 0.8803338259458542, "mean_token_accuracy": 0.9105178117752075, "num_tokens": 567999141.0, "sample_num_tokens": 7478.75, "step": 6092, "total_num_tokens": 568029056.0, "z_loss": 0.0004690903006121516 }, { "copy_logits_max": 0.3403458595275879, "copy_logits_min": -750000064.0, "copy_num_tokens": 526.625, "epoch": 1.2444728108246106, "gen_logits_max": 3.1884818077087402, "gen_logits_mean": -16.09776496887207, "gen_logits_min": -28.66417694091797, "gen_logits_std": 3.0889341831207275, "gen_loss": 0.27753299474716187, "grad_norm": 0.34089846025115456, "learning_rate": 2.323326315789474e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9968782663345337, "mean_gen_accuracy": 0.879106268286705, "mean_token_accuracy": 0.9120180308818817, "num_tokens": 568285142.0, "sample_num_tokens": 8615.0, "step": 6093, "total_num_tokens": 568319602.0, "z_loss": 0.0005149008939042687 }, { "copy_logits_max": 2.3438589572906494, "copy_logits_min": -750000000.0, "copy_num_tokens": 751.75, "epoch": 1.2446770487618075, "gen_logits_max": 4.097866535186768, "gen_logits_mean": -14.468929290771484, "gen_logits_min": -26.815086364746094, "gen_logits_std": 3.090940475463867, "gen_loss": 0.21736308932304382, "grad_norm": 0.425672924691622, "learning_rate": 2.3232e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9959272593259811, "mean_gen_accuracy": 0.8809837698936462, "mean_token_accuracy": 0.9111468940973282, "num_tokens": 568568245.0, "sample_num_tokens": 10853.25, "step": 6094, "total_num_tokens": 568611658.0, "z_loss": 0.0004070915747433901 }, { "copy_logits_max": 2.087445020675659, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.8125, "epoch": 1.2448812866990044, "gen_logits_max": 4.741147994995117, "gen_logits_mean": -14.204301834106445, "gen_logits_min": -26.48078155517578, "gen_logits_std": 3.0551881790161133, "gen_loss": 0.2690601050853729, "grad_norm": 0.3642357360079047, "learning_rate": 2.3230736842105265e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9966813027858734, "mean_gen_accuracy": 0.8710779547691345, "mean_token_accuracy": 0.9038762748241425, "num_tokens": 568854647.0, "sample_num_tokens": 9703.75, "step": 6095, "total_num_tokens": 568893462.0, "z_loss": 0.00045641622273251414 }, { "copy_logits_max": -0.4033792018890381, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.4375, "epoch": 1.2450855246362011, "gen_logits_max": 4.850460052490234, "gen_logits_mean": -14.766027450561523, "gen_logits_min": -26.42557144165039, "gen_logits_std": 3.0458076000213623, "gen_loss": 0.2848251163959503, "grad_norm": 0.40225745907677524, "learning_rate": 2.3229473684210526e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9951716214418411, "mean_gen_accuracy": 0.8763332813978195, "mean_token_accuracy": 0.9054099172353745, "num_tokens": 569144047.0, "sample_num_tokens": 9887.75, "step": 6096, "total_num_tokens": 569183598.0, "z_loss": 0.0005288817919790745 }, { "copy_logits_max": 2.1899805068969727, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.125, "epoch": 1.245289762573398, "gen_logits_max": 3.8526194095611572, "gen_logits_mean": -15.455424308776855, "gen_logits_min": -28.064533233642578, "gen_logits_std": 3.0979037284851074, "gen_loss": 0.2748350501060486, "grad_norm": 0.37268518720835114, "learning_rate": 2.322821052631579e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9954872280359268, "mean_gen_accuracy": 0.8716729134321213, "mean_token_accuracy": 0.9014343470335007, "num_tokens": 569415193.0, "sample_num_tokens": 8476.75, "step": 6097, "total_num_tokens": 569449100.0, "z_loss": 0.000596223515458405 }, { "copy_logits_max": -1.7276835441589355, "copy_logits_min": -625000064.0, "copy_num_tokens": 339.0, "epoch": 1.2454940005105948, "gen_logits_max": 4.5881829261779785, "gen_logits_mean": -15.967630386352539, "gen_logits_min": -27.581275939941406, "gen_logits_std": 3.0730578899383545, "gen_loss": 0.2667488753795624, "grad_norm": 0.35610781645425593, "learning_rate": 2.322694736842105e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9970235377550125, "mean_gen_accuracy": 0.8735434114933014, "mean_token_accuracy": 0.9070748090744019, "num_tokens": 569691671.0, "sample_num_tokens": 8506.25, "step": 6098, "total_num_tokens": 569725696.0, "z_loss": 0.0005625829799100757 }, { "copy_logits_max": 1.5302138328552246, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.8125, "epoch": 1.2456982384477917, "gen_logits_max": 6.025148868560791, "gen_logits_mean": -13.144998550415039, "gen_logits_min": -25.080575942993164, "gen_logits_std": 3.0040316581726074, "gen_loss": 0.33098626136779785, "grad_norm": 0.4353998539835849, "learning_rate": 2.3225684210526315e-05, "loss": 0.3144, "mean_copy_accuracy": 0.9955173283815384, "mean_gen_accuracy": 0.8652030527591705, "mean_token_accuracy": 0.8940174728631973, "num_tokens": 569942358.0, "sample_num_tokens": 7206.5, "step": 6099, "total_num_tokens": 569971184.0, "z_loss": 0.0006972883129492402 }, { "copy_logits_max": -1.9303255081176758, "copy_logits_min": -687500096.0, "copy_num_tokens": 649.25, "epoch": 1.2459024763849884, "gen_logits_max": 3.1255109310150146, "gen_logits_mean": -17.22130584716797, "gen_logits_min": -29.156539916992188, "gen_logits_std": 3.1112239360809326, "gen_loss": 0.23548434674739838, "grad_norm": 0.4012951357635505, "learning_rate": 2.322442105263158e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9954162389039993, "mean_gen_accuracy": 0.8810827881097794, "mean_token_accuracy": 0.9096870422363281, "num_tokens": 570216404.0, "sample_num_tokens": 9822.0, "step": 6100, "total_num_tokens": 570255692.0, "z_loss": 0.0005284084472805262 }, { "copy_logits_max": -0.5610975027084351, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.25, "epoch": 1.2461067143221853, "gen_logits_max": 5.457884788513184, "gen_logits_mean": -14.775035858154297, "gen_logits_min": -26.864439010620117, "gen_logits_std": 3.033967971801758, "gen_loss": 0.3205242156982422, "grad_norm": 0.34453801649880733, "learning_rate": 2.3223157894736844e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9961246848106384, "mean_gen_accuracy": 0.8746381103992462, "mean_token_accuracy": 0.9054199755191803, "num_tokens": 570498023.0, "sample_num_tokens": 6854.25, "step": 6101, "total_num_tokens": 570525440.0, "z_loss": 0.000686726882122457 }, { "copy_logits_max": 1.5118683576583862, "copy_logits_min": -750000000.0, "copy_num_tokens": 694.75, "epoch": 1.2463109522593823, "gen_logits_max": 4.406961441040039, "gen_logits_mean": -14.275077819824219, "gen_logits_min": -26.532772064208984, "gen_logits_std": 3.0632176399230957, "gen_loss": 0.2810603976249695, "grad_norm": 0.41951436360383926, "learning_rate": 2.3221894736842108e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9954514652490616, "mean_gen_accuracy": 0.8704937696456909, "mean_token_accuracy": 0.9001022279262543, "num_tokens": 570756920.0, "sample_num_tokens": 10022.0, "step": 6102, "total_num_tokens": 570797008.0, "z_loss": 0.0005874639609828591 }, { "copy_logits_max": -2.420659065246582, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.125, "epoch": 1.246515190196579, "gen_logits_max": 4.575103759765625, "gen_logits_mean": -15.267341613769531, "gen_logits_min": -27.467323303222656, "gen_logits_std": 3.089747190475464, "gen_loss": 0.24714510142803192, "grad_norm": 0.3933941551097248, "learning_rate": 2.322063157894737e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9966060370206833, "mean_gen_accuracy": 0.8770018666982651, "mean_token_accuracy": 0.9062233418226242, "num_tokens": 571034899.0, "sample_num_tokens": 9511.25, "step": 6103, "total_num_tokens": 571072944.0, "z_loss": 0.0005102198920212686 }, { "copy_logits_max": -4.404085636138916, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.8125, "epoch": 1.246719428133776, "gen_logits_max": 4.786785125732422, "gen_logits_mean": -15.300640106201172, "gen_logits_min": -27.33375358581543, "gen_logits_std": 3.0419540405273438, "gen_loss": 0.28735384345054626, "grad_norm": 0.44695895158238075, "learning_rate": 2.3219368421052633e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9955178797245026, "mean_gen_accuracy": 0.8765261620283127, "mean_token_accuracy": 0.9091673195362091, "num_tokens": 571309264.0, "sample_num_tokens": 7536.0, "step": 6104, "total_num_tokens": 571339408.0, "z_loss": 0.0005380455404520035 }, { "copy_logits_max": -2.5087413787841797, "copy_logits_min": -687500032.0, "copy_num_tokens": 693.5, "epoch": 1.2469236660709726, "gen_logits_max": 4.034053325653076, "gen_logits_mean": -14.833341598510742, "gen_logits_min": -27.249164581298828, "gen_logits_std": 3.07088041305542, "gen_loss": 0.262359619140625, "grad_norm": 0.4580393268891628, "learning_rate": 2.3218105263157894e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9959087520837784, "mean_gen_accuracy": 0.8723747283220291, "mean_token_accuracy": 0.9048545658588409, "num_tokens": 571568789.0, "sample_num_tokens": 9700.75, "step": 6105, "total_num_tokens": 571607592.0, "z_loss": 0.00048577541019767523 }, { "copy_logits_max": -3.366590738296509, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.6875, "epoch": 1.2471279040081695, "gen_logits_max": 3.1134777069091797, "gen_logits_mean": -16.419296264648438, "gen_logits_min": -27.8714656829834, "gen_logits_std": 3.044039487838745, "gen_loss": 0.2574787437915802, "grad_norm": 0.3860800034362473, "learning_rate": 2.321684210526316e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9961835145950317, "mean_gen_accuracy": 0.8737462908029556, "mean_token_accuracy": 0.9032573848962784, "num_tokens": 571823031.0, "sample_num_tokens": 7752.75, "step": 6106, "total_num_tokens": 571854042.0, "z_loss": 0.0005110771162435412 }, { "copy_logits_max": -4.132355690002441, "copy_logits_min": -687500032.0, "copy_num_tokens": 655.0625, "epoch": 1.2473321419453662, "gen_logits_max": 3.826202392578125, "gen_logits_mean": -16.17753791809082, "gen_logits_min": -27.651775360107422, "gen_logits_std": 3.020516872406006, "gen_loss": 0.25909489393234253, "grad_norm": 0.36435993324966276, "learning_rate": 2.321557894736842e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9970267564058304, "mean_gen_accuracy": 0.8850922733545303, "mean_token_accuracy": 0.914155051112175, "num_tokens": 572098333.0, "sample_num_tokens": 11201.25, "step": 6107, "total_num_tokens": 572143138.0, "z_loss": 0.0005165874608792365 }, { "copy_logits_max": -4.5927629470825195, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.75, "epoch": 1.2475363798825632, "gen_logits_max": 3.599220037460327, "gen_logits_mean": -17.521554946899414, "gen_logits_min": -29.20826530456543, "gen_logits_std": 3.072784900665283, "gen_loss": 0.25571057200431824, "grad_norm": 0.3896955989801391, "learning_rate": 2.3214315789473687e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9956341981887817, "mean_gen_accuracy": 0.8760920763015747, "mean_token_accuracy": 0.903546154499054, "num_tokens": 572359413.0, "sample_num_tokens": 8632.75, "step": 6108, "total_num_tokens": 572393944.0, "z_loss": 0.0004935848992317915 }, { "copy_logits_max": -4.208622932434082, "copy_logits_min": -687500032.0, "copy_num_tokens": 533.0, "epoch": 1.24774061781976, "gen_logits_max": 4.693884372711182, "gen_logits_mean": -15.720388412475586, "gen_logits_min": -27.17702865600586, "gen_logits_std": 2.9714488983154297, "gen_loss": 0.26986944675445557, "grad_norm": 0.38858099634519366, "learning_rate": 2.3213052631578948e-05, "loss": 0.2885, "mean_copy_accuracy": 0.996669352054596, "mean_gen_accuracy": 0.8691699206829071, "mean_token_accuracy": 0.9021712392568588, "num_tokens": 572632673.0, "sample_num_tokens": 8930.75, "step": 6109, "total_num_tokens": 572668396.0, "z_loss": 0.0005063667776994407 }, { "copy_logits_max": -2.134866714477539, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.8125, "epoch": 1.2479448557569568, "gen_logits_max": 4.704257011413574, "gen_logits_mean": -15.299399375915527, "gen_logits_min": -26.93662452697754, "gen_logits_std": 3.0314300060272217, "gen_loss": 0.3032207190990448, "grad_norm": 0.3685236060588404, "learning_rate": 2.3211789473684213e-05, "loss": 0.273, "mean_copy_accuracy": 0.9957948178052902, "mean_gen_accuracy": 0.8753211498260498, "mean_token_accuracy": 0.9063567072153091, "num_tokens": 572924377.0, "sample_num_tokens": 7911.25, "step": 6110, "total_num_tokens": 572956022.0, "z_loss": 0.000527810538187623 }, { "copy_logits_max": -4.243497848510742, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.875, "epoch": 1.2481490936941537, "gen_logits_max": 3.934133291244507, "gen_logits_mean": -16.48161506652832, "gen_logits_min": -28.237197875976562, "gen_logits_std": 3.0665841102600098, "gen_loss": 0.2776791453361511, "grad_norm": 0.3967513230631523, "learning_rate": 2.3210526315789473e-05, "loss": 0.3046, "mean_copy_accuracy": 0.9955088645219803, "mean_gen_accuracy": 0.8676120638847351, "mean_token_accuracy": 0.8968537598848343, "num_tokens": 573182611.0, "sample_num_tokens": 7632.75, "step": 6111, "total_num_tokens": 573213142.0, "z_loss": 0.0004900348139926791 }, { "copy_logits_max": -4.159884452819824, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.9375, "epoch": 1.2483533316313504, "gen_logits_max": 4.740856647491455, "gen_logits_mean": -15.999823570251465, "gen_logits_min": -28.240537643432617, "gen_logits_std": 3.0540478229522705, "gen_loss": 0.295591801404953, "grad_norm": 0.3893008741870045, "learning_rate": 2.3209263157894738e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9962269216775894, "mean_gen_accuracy": 0.8776694536209106, "mean_token_accuracy": 0.9067325741052628, "num_tokens": 573456875.0, "sample_num_tokens": 7709.25, "step": 6112, "total_num_tokens": 573487712.0, "z_loss": 0.0005747579270973802 }, { "copy_logits_max": -4.143402099609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.5, "epoch": 1.2485575695685474, "gen_logits_max": 3.7307140827178955, "gen_logits_mean": -16.39346694946289, "gen_logits_min": -28.18344497680664, "gen_logits_std": 3.104886293411255, "gen_loss": 0.2527381181716919, "grad_norm": 0.4855718493346016, "learning_rate": 2.3208e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9967695325613022, "mean_gen_accuracy": 0.8818417191505432, "mean_token_accuracy": 0.9076375216245651, "num_tokens": 573724483.0, "sample_num_tokens": 9140.75, "step": 6113, "total_num_tokens": 573761046.0, "z_loss": 0.00044673538650386035 }, { "copy_logits_max": -3.1393351554870605, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.375, "epoch": 1.2487618075057443, "gen_logits_max": 3.5784499645233154, "gen_logits_mean": -17.018943786621094, "gen_logits_min": -28.939544677734375, "gen_logits_std": 3.106588363647461, "gen_loss": 0.30499324202537537, "grad_norm": 0.41040206146186464, "learning_rate": 2.3206736842105263e-05, "loss": 0.286, "mean_copy_accuracy": 0.9968142211437225, "mean_gen_accuracy": 0.870301753282547, "mean_token_accuracy": 0.9044817984104156, "num_tokens": 573998575.0, "sample_num_tokens": 9603.25, "step": 6114, "total_num_tokens": 574036988.0, "z_loss": 0.0005660982569679618 }, { "copy_logits_max": -5.779984474182129, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.9375, "epoch": 1.248966045442941, "gen_logits_max": 2.9437172412872314, "gen_logits_mean": -17.705102920532227, "gen_logits_min": -29.627517700195312, "gen_logits_std": 3.1212620735168457, "gen_loss": 0.24102187156677246, "grad_norm": 0.3757668695279877, "learning_rate": 2.3205473684210524e-05, "loss": 0.2608, "mean_copy_accuracy": 0.9963963627815247, "mean_gen_accuracy": 0.8792992830276489, "mean_token_accuracy": 0.9099655449390411, "num_tokens": 574286167.0, "sample_num_tokens": 9191.25, "step": 6115, "total_num_tokens": 574322932.0, "z_loss": 0.000432511733379215 }, { "copy_logits_max": -3.9953179359436035, "copy_logits_min": -687500032.0, "copy_num_tokens": 760.875, "epoch": 1.249170283380138, "gen_logits_max": 3.5181736946105957, "gen_logits_mean": -15.23006534576416, "gen_logits_min": -27.0640926361084, "gen_logits_std": 3.0507636070251465, "gen_loss": 0.23212501406669617, "grad_norm": 0.392823577381009, "learning_rate": 2.320421052631579e-05, "loss": 0.263, "mean_copy_accuracy": 0.996313601732254, "mean_gen_accuracy": 0.8812005817890167, "mean_token_accuracy": 0.9114023298025131, "num_tokens": 574574899.0, "sample_num_tokens": 10175.25, "step": 6116, "total_num_tokens": 574615600.0, "z_loss": 0.0004029430856462568 }, { "copy_logits_max": -3.248661518096924, "copy_logits_min": -625000000.0, "copy_num_tokens": 584.625, "epoch": 1.2493745213173346, "gen_logits_max": 4.505051136016846, "gen_logits_mean": -14.486495971679688, "gen_logits_min": -26.600332260131836, "gen_logits_std": 3.0350961685180664, "gen_loss": 0.2605776786804199, "grad_norm": 0.39222879634362434, "learning_rate": 2.3202947368421056e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9961139559745789, "mean_gen_accuracy": 0.8813980370759964, "mean_token_accuracy": 0.9116380363702774, "num_tokens": 574852575.0, "sample_num_tokens": 9079.25, "step": 6117, "total_num_tokens": 574888892.0, "z_loss": 0.0005010035238228738 }, { "copy_logits_max": -3.2275593280792236, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.0, "epoch": 1.2495787592545315, "gen_logits_max": 3.5676677227020264, "gen_logits_mean": -16.53954315185547, "gen_logits_min": -28.423717498779297, "gen_logits_std": 3.082892894744873, "gen_loss": 0.30034470558166504, "grad_norm": 0.35147681398566766, "learning_rate": 2.3201684210526317e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9962295740842819, "mean_gen_accuracy": 0.8769983053207397, "mean_token_accuracy": 0.9080936312675476, "num_tokens": 575149443.0, "sample_num_tokens": 8116.75, "step": 6118, "total_num_tokens": 575181910.0, "z_loss": 0.0006282669492065907 }, { "copy_logits_max": -3.101548194885254, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.9375, "epoch": 1.2497829971917285, "gen_logits_max": 3.984421968460083, "gen_logits_mean": -16.09939193725586, "gen_logits_min": -27.752765655517578, "gen_logits_std": 3.0584652423858643, "gen_loss": 0.2703094780445099, "grad_norm": 0.40086969811513207, "learning_rate": 2.320042105263158e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9966021031141281, "mean_gen_accuracy": 0.8828325569629669, "mean_token_accuracy": 0.9125641733407974, "num_tokens": 575455577.0, "sample_num_tokens": 9161.75, "step": 6119, "total_num_tokens": 575492224.0, "z_loss": 0.000490889884531498 }, { "copy_logits_max": -5.94661808013916, "copy_logits_min": -750000000.0, "copy_num_tokens": 271.4375, "epoch": 1.2499872351289252, "gen_logits_max": 5.997563362121582, "gen_logits_mean": -14.514424324035645, "gen_logits_min": -26.898286819458008, "gen_logits_std": 3.0259647369384766, "gen_loss": 0.3479648232460022, "grad_norm": 0.5658282022839203, "learning_rate": 2.3199157894736842e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9964474588632584, "mean_gen_accuracy": 0.868061363697052, "mean_token_accuracy": 0.89930459856987, "num_tokens": 575719790.0, "sample_num_tokens": 6831.5, "step": 6120, "total_num_tokens": 575747116.0, "z_loss": 0.0005917287198826671 }, { "copy_logits_max": -2.19246768951416, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.9375, "epoch": 1.250191473066122, "gen_logits_max": 3.925800323486328, "gen_logits_mean": -15.161602973937988, "gen_logits_min": -27.78019142150879, "gen_logits_std": 3.105307102203369, "gen_loss": 0.2553933262825012, "grad_norm": 0.39962101828972657, "learning_rate": 2.3197894736842106e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9959394782781601, "mean_gen_accuracy": 0.8729030191898346, "mean_token_accuracy": 0.9054628908634186, "num_tokens": 575993498.0, "sample_num_tokens": 8440.0, "step": 6121, "total_num_tokens": 576027258.0, "z_loss": 0.00047229547635652125 }, { "copy_logits_max": -4.710555076599121, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.125, "epoch": 1.2503957110033188, "gen_logits_max": 4.438922882080078, "gen_logits_mean": -16.68290138244629, "gen_logits_min": -28.955230712890625, "gen_logits_std": 3.113210439682007, "gen_loss": 0.25312840938568115, "grad_norm": 0.3938561004083089, "learning_rate": 2.3196631578947367e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9952485114336014, "mean_gen_accuracy": 0.8789013773202896, "mean_token_accuracy": 0.9076796919107437, "num_tokens": 576271459.0, "sample_num_tokens": 8635.75, "step": 6122, "total_num_tokens": 576306002.0, "z_loss": 0.0005118410335853696 }, { "copy_logits_max": -5.26420259475708, "copy_logits_min": -750000064.0, "copy_num_tokens": 315.6875, "epoch": 1.2505999489405157, "gen_logits_max": 4.2680583000183105, "gen_logits_mean": -17.431621551513672, "gen_logits_min": -29.80831527709961, "gen_logits_std": 3.1127378940582275, "gen_loss": 0.29679393768310547, "grad_norm": 0.3986447244472812, "learning_rate": 2.319536842105263e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9959132224321365, "mean_gen_accuracy": 0.8784656673669815, "mean_token_accuracy": 0.9051967263221741, "num_tokens": 576528615.0, "sample_num_tokens": 7381.75, "step": 6123, "total_num_tokens": 576558142.0, "z_loss": 0.0005586990737356246 }, { "copy_logits_max": -2.540815830230713, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.1875, "epoch": 1.2508041868777124, "gen_logits_max": 4.757834434509277, "gen_logits_mean": -14.223855018615723, "gen_logits_min": -27.175716400146484, "gen_logits_std": 3.0830917358398438, "gen_loss": 0.2705796957015991, "grad_norm": 0.38157413550027225, "learning_rate": 2.3194105263157893e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9974032193422318, "mean_gen_accuracy": 0.8738025575876236, "mean_token_accuracy": 0.9091883897781372, "num_tokens": 576824099.0, "sample_num_tokens": 8913.25, "step": 6124, "total_num_tokens": 576859752.0, "z_loss": 0.0004886173992417753 }, { "copy_logits_max": -3.7862842082977295, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.5, "epoch": 1.2510084248149094, "gen_logits_max": 4.171343803405762, "gen_logits_mean": -16.72987937927246, "gen_logits_min": -28.725067138671875, "gen_logits_std": 3.101907253265381, "gen_loss": 0.2896406650543213, "grad_norm": 0.38073134169783557, "learning_rate": 2.319284210526316e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9974128603935242, "mean_gen_accuracy": 0.8793092221021652, "mean_token_accuracy": 0.9076907634735107, "num_tokens": 577106891.0, "sample_num_tokens": 9475.75, "step": 6125, "total_num_tokens": 577144794.0, "z_loss": 0.000517416512593627 }, { "copy_logits_max": -3.6184022426605225, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.5, "epoch": 1.2512126627521063, "gen_logits_max": 5.020956039428711, "gen_logits_mean": -15.259231567382812, "gen_logits_min": -27.85779571533203, "gen_logits_std": 3.0788497924804688, "gen_loss": 0.31540828943252563, "grad_norm": 0.3795923715398715, "learning_rate": 2.319157894736842e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9960793405771255, "mean_gen_accuracy": 0.8725207895040512, "mean_token_accuracy": 0.9046174734830856, "num_tokens": 577383577.0, "sample_num_tokens": 6790.75, "step": 6126, "total_num_tokens": 577410740.0, "z_loss": 0.0005595592665486038 }, { "copy_logits_max": -5.104433059692383, "copy_logits_min": -687500032.0, "copy_num_tokens": 352.125, "epoch": 1.251416900689303, "gen_logits_max": 5.75778341293335, "gen_logits_mean": -14.36889934539795, "gen_logits_min": -27.18062973022461, "gen_logits_std": 3.054136276245117, "gen_loss": 0.32587534189224243, "grad_norm": 0.40646485751178724, "learning_rate": 2.3190315789473686e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9957964718341827, "mean_gen_accuracy": 0.8766966462135315, "mean_token_accuracy": 0.9010209143161774, "num_tokens": 577628349.0, "sample_num_tokens": 7739.75, "step": 6127, "total_num_tokens": 577659308.0, "z_loss": 0.0006188797997310758 }, { "copy_logits_max": -3.534593105316162, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.25, "epoch": 1.2516211386265, "gen_logits_max": 3.8198533058166504, "gen_logits_mean": -17.326026916503906, "gen_logits_min": -29.195323944091797, "gen_logits_std": 3.0745227336883545, "gen_loss": 0.3099830746650696, "grad_norm": 0.4356852735498088, "learning_rate": 2.3189052631578946e-05, "loss": 0.2997, "mean_copy_accuracy": 0.994263306260109, "mean_gen_accuracy": 0.8689055740833282, "mean_token_accuracy": 0.8984036296606064, "num_tokens": 577896616.0, "sample_num_tokens": 8401.0, "step": 6128, "total_num_tokens": 577930220.0, "z_loss": 0.0006452876841649413 }, { "copy_logits_max": -3.723456859588623, "copy_logits_min": -750000064.0, "copy_num_tokens": 534.25, "epoch": 1.2518253765636966, "gen_logits_max": 3.8210253715515137, "gen_logits_mean": -16.35116958618164, "gen_logits_min": -28.59705352783203, "gen_logits_std": 3.1077957153320312, "gen_loss": 0.26582035422325134, "grad_norm": 0.39065196523547, "learning_rate": 2.318778947368421e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9962381273508072, "mean_gen_accuracy": 0.8696330934762955, "mean_token_accuracy": 0.9002145528793335, "num_tokens": 578161890.0, "sample_num_tokens": 9033.0, "step": 6129, "total_num_tokens": 578198022.0, "z_loss": 0.0005079910042695701 }, { "copy_logits_max": -0.6470716595649719, "copy_logits_min": -625000064.0, "copy_num_tokens": 396.5625, "epoch": 1.2520296145008936, "gen_logits_max": 4.417831897735596, "gen_logits_mean": -15.068914413452148, "gen_logits_min": -27.213834762573242, "gen_logits_std": 3.069739818572998, "gen_loss": 0.3047611713409424, "grad_norm": 0.454223270513102, "learning_rate": 2.3186526315789475e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9963852167129517, "mean_gen_accuracy": 0.8718996196985245, "mean_token_accuracy": 0.9007344394922256, "num_tokens": 578430464.0, "sample_num_tokens": 8149.0, "step": 6130, "total_num_tokens": 578463060.0, "z_loss": 0.0005663208430632949 }, { "copy_logits_max": -5.840057849884033, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.375, "epoch": 1.2522338524380903, "gen_logits_max": 3.429565906524658, "gen_logits_mean": -15.837498664855957, "gen_logits_min": -28.22688102722168, "gen_logits_std": 3.0776472091674805, "gen_loss": 0.2728269100189209, "grad_norm": 0.42064315674258823, "learning_rate": 2.3185263157894736e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9961857795715332, "mean_gen_accuracy": 0.8795948773622513, "mean_token_accuracy": 0.9093445837497711, "num_tokens": 578705669.0, "sample_num_tokens": 8268.25, "step": 6131, "total_num_tokens": 578738742.0, "z_loss": 0.0005078842514194548 }, { "copy_logits_max": -5.703138828277588, "copy_logits_min": -687500032.0, "copy_num_tokens": 285.5625, "epoch": 1.2524380903752872, "gen_logits_max": 4.246598243713379, "gen_logits_mean": -16.076541900634766, "gen_logits_min": -28.393444061279297, "gen_logits_std": 3.0559661388397217, "gen_loss": 0.2953210175037384, "grad_norm": 0.40345759143784005, "learning_rate": 2.3184e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9944463968276978, "mean_gen_accuracy": 0.8775080442428589, "mean_token_accuracy": 0.9024537801742554, "num_tokens": 578964526.0, "sample_num_tokens": 6864.0, "step": 6132, "total_num_tokens": 578991982.0, "z_loss": 0.0005216336576268077 }, { "copy_logits_max": -4.383880138397217, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.5625, "epoch": 1.2526423283124841, "gen_logits_max": 3.4008588790893555, "gen_logits_mean": -17.05632781982422, "gen_logits_min": -28.968538284301758, "gen_logits_std": 3.085956335067749, "gen_loss": 0.28099507093429565, "grad_norm": 0.3816589638971535, "learning_rate": 2.3182736842105265e-05, "loss": 0.2805, "mean_copy_accuracy": 0.996256560087204, "mean_gen_accuracy": 0.8763289749622345, "mean_token_accuracy": 0.9053830951452255, "num_tokens": 579228659.0, "sample_num_tokens": 8675.75, "step": 6133, "total_num_tokens": 579263362.0, "z_loss": 0.0005221483297646046 }, { "copy_logits_max": -6.127987861633301, "copy_logits_min": -750000000.0, "copy_num_tokens": 684.5625, "epoch": 1.2528465662496808, "gen_logits_max": 4.854320526123047, "gen_logits_mean": -14.354610443115234, "gen_logits_min": -27.707345962524414, "gen_logits_std": 3.0603017807006836, "gen_loss": 0.2520054876804352, "grad_norm": 0.39929009398589355, "learning_rate": 2.318147368421053e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9962111413478851, "mean_gen_accuracy": 0.8734917342662811, "mean_token_accuracy": 0.9035747200250626, "num_tokens": 579502467.0, "sample_num_tokens": 10284.25, "step": 6134, "total_num_tokens": 579543604.0, "z_loss": 0.00044291617814451456 }, { "copy_logits_max": -5.704833030700684, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5, "epoch": 1.2530508041868778, "gen_logits_max": 3.820746898651123, "gen_logits_mean": -15.534696578979492, "gen_logits_min": -27.39122772216797, "gen_logits_std": 3.0245208740234375, "gen_loss": 0.269813597202301, "grad_norm": 0.36543189213153887, "learning_rate": 2.318021052631579e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9972579628229141, "mean_gen_accuracy": 0.8760888278484344, "mean_token_accuracy": 0.9084934890270233, "num_tokens": 579796294.0, "sample_num_tokens": 8498.5, "step": 6135, "total_num_tokens": 579830288.0, "z_loss": 0.000497771892696619 }, { "copy_logits_max": -3.827105760574341, "copy_logits_min": -687500032.0, "copy_num_tokens": 641.1875, "epoch": 1.2532550421240747, "gen_logits_max": 3.565864324569702, "gen_logits_mean": -15.50518798828125, "gen_logits_min": -28.313518524169922, "gen_logits_std": 3.074326515197754, "gen_loss": 0.25467175245285034, "grad_norm": 0.40362250460614746, "learning_rate": 2.3178947368421054e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9961168766021729, "mean_gen_accuracy": 0.8706439733505249, "mean_token_accuracy": 0.8993178158998489, "num_tokens": 580045106.0, "sample_num_tokens": 9587.0, "step": 6136, "total_num_tokens": 580083454.0, "z_loss": 0.0004715960822068155 }, { "copy_logits_max": -5.169751167297363, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.5625, "epoch": 1.2534592800612714, "gen_logits_max": 3.674225330352783, "gen_logits_mean": -15.840888977050781, "gen_logits_min": -28.072086334228516, "gen_logits_std": 3.02605938911438, "gen_loss": 0.2774279713630676, "grad_norm": 0.37469890881482376, "learning_rate": 2.3177684210526315e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9965769201517105, "mean_gen_accuracy": 0.8724478185176849, "mean_token_accuracy": 0.9005583226680756, "num_tokens": 580302488.0, "sample_num_tokens": 7913.5, "step": 6137, "total_num_tokens": 580334142.0, "z_loss": 0.0005137039115652442 }, { "copy_logits_max": -4.323252201080322, "copy_logits_min": -750000064.0, "copy_num_tokens": 475.625, "epoch": 1.253663517998468, "gen_logits_max": 3.6309642791748047, "gen_logits_mean": -16.916797637939453, "gen_logits_min": -29.08629035949707, "gen_logits_std": 3.1107797622680664, "gen_loss": 0.29004520177841187, "grad_norm": 0.4050911915165158, "learning_rate": 2.317642105263158e-05, "loss": 0.295, "mean_copy_accuracy": 0.9961200654506683, "mean_gen_accuracy": 0.8670525997877121, "mean_token_accuracy": 0.900783896446228, "num_tokens": 580566967.0, "sample_num_tokens": 8140.25, "step": 6138, "total_num_tokens": 580599528.0, "z_loss": 0.000520323752425611 }, { "copy_logits_max": -4.758793830871582, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.625, "epoch": 1.253867755935665, "gen_logits_max": 4.612082481384277, "gen_logits_mean": -16.398666381835938, "gen_logits_min": -28.94849967956543, "gen_logits_std": 3.0599703788757324, "gen_loss": 0.2934767007827759, "grad_norm": 0.41462604303483064, "learning_rate": 2.317515789473684e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9959154725074768, "mean_gen_accuracy": 0.8712558448314667, "mean_token_accuracy": 0.9037672281265259, "num_tokens": 580822512.0, "sample_num_tokens": 9023.5, "step": 6139, "total_num_tokens": 580858606.0, "z_loss": 0.0005791588337160647 }, { "copy_logits_max": -2.0430850982666016, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.9375, "epoch": 1.254071993872862, "gen_logits_max": 5.655306816101074, "gen_logits_mean": -13.821554183959961, "gen_logits_min": -26.731670379638672, "gen_logits_std": 3.041036605834961, "gen_loss": 0.2697869539260864, "grad_norm": 0.38746604341739116, "learning_rate": 2.3173894736842105e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9967675358057022, "mean_gen_accuracy": 0.8755261152982712, "mean_token_accuracy": 0.9054321199655533, "num_tokens": 581089797.0, "sample_num_tokens": 7794.25, "step": 6140, "total_num_tokens": 581120974.0, "z_loss": 0.000551254372112453 }, { "copy_logits_max": -4.534599304199219, "copy_logits_min": -687500032.0, "copy_num_tokens": 552.0625, "epoch": 1.2542762318100587, "gen_logits_max": 2.6160902976989746, "gen_logits_mean": -17.52234649658203, "gen_logits_min": -29.425199508666992, "gen_logits_std": 3.11879825592041, "gen_loss": 0.25069642066955566, "grad_norm": 0.3674068667938634, "learning_rate": 2.317263157894737e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9969617575407028, "mean_gen_accuracy": 0.8739786297082901, "mean_token_accuracy": 0.9060457199811935, "num_tokens": 581364132.0, "sample_num_tokens": 8272.0, "step": 6141, "total_num_tokens": 581397220.0, "z_loss": 0.0005015183123759925 }, { "copy_logits_max": -5.234587669372559, "copy_logits_min": -750000000.0, "copy_num_tokens": 654.9375, "epoch": 1.2544804697472556, "gen_logits_max": 2.766176223754883, "gen_logits_mean": -17.807552337646484, "gen_logits_min": -29.96501922607422, "gen_logits_std": 3.1064887046813965, "gen_loss": 0.2598695755004883, "grad_norm": 0.38027599984830385, "learning_rate": 2.3171368421052633e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9969952702522278, "mean_gen_accuracy": 0.8800021260976791, "mean_token_accuracy": 0.9093623608350754, "num_tokens": 581631947.0, "sample_num_tokens": 10054.25, "step": 6142, "total_num_tokens": 581672164.0, "z_loss": 0.0005203994223847985 }, { "copy_logits_max": -6.518708229064941, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.5, "epoch": 1.2546847076844525, "gen_logits_max": 5.11744499206543, "gen_logits_mean": -15.743766784667969, "gen_logits_min": -27.70953369140625, "gen_logits_std": 3.071429967880249, "gen_loss": 0.30137544870376587, "grad_norm": 0.3858961549265777, "learning_rate": 2.3170105263157898e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9970587641000748, "mean_gen_accuracy": 0.8751524388790131, "mean_token_accuracy": 0.9009916633367538, "num_tokens": 581886019.0, "sample_num_tokens": 8377.75, "step": 6143, "total_num_tokens": 581919530.0, "z_loss": 0.0006054771365597844 }, { "copy_logits_max": -5.812507152557373, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.3125, "epoch": 1.2548889456216492, "gen_logits_max": 4.411781311035156, "gen_logits_mean": -15.433610916137695, "gen_logits_min": -27.558359146118164, "gen_logits_std": 3.0721869468688965, "gen_loss": 0.28128886222839355, "grad_norm": 0.3977814470738423, "learning_rate": 2.316884210526316e-05, "loss": 0.291, "mean_copy_accuracy": 0.9967881292104721, "mean_gen_accuracy": 0.8707015365362167, "mean_token_accuracy": 0.901884600520134, "num_tokens": 582175434.0, "sample_num_tokens": 9134.5, "step": 6144, "total_num_tokens": 582211972.0, "z_loss": 0.0005112963845022023 }, { "copy_logits_max": -5.500204086303711, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.8125, "epoch": 1.2550931835588461, "gen_logits_max": 4.5477614402771, "gen_logits_mean": -14.78232192993164, "gen_logits_min": -27.417570114135742, "gen_logits_std": 3.0741260051727295, "gen_loss": 0.3087431788444519, "grad_norm": 0.39872742460019356, "learning_rate": 2.3167578947368423e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9954483062028885, "mean_gen_accuracy": 0.875917986035347, "mean_token_accuracy": 0.9052599966526031, "num_tokens": 582473974.0, "sample_num_tokens": 8326.0, "step": 6145, "total_num_tokens": 582507278.0, "z_loss": 0.0006159169133752584 }, { "copy_logits_max": -4.477337837219238, "copy_logits_min": -687500032.0, "copy_num_tokens": 719.75, "epoch": 1.2552974214960428, "gen_logits_max": 3.2605233192443848, "gen_logits_mean": -16.84573745727539, "gen_logits_min": -29.253219604492188, "gen_logits_std": 3.1287717819213867, "gen_loss": 0.2575434446334839, "grad_norm": 0.3889767470250874, "learning_rate": 2.3166315789473684e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9966793358325958, "mean_gen_accuracy": 0.8731091618537903, "mean_token_accuracy": 0.9079251438379288, "num_tokens": 582764249.0, "sample_num_tokens": 10610.75, "step": 6146, "total_num_tokens": 582806692.0, "z_loss": 0.0004971427842974663 }, { "copy_logits_max": -6.567838191986084, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.1875, "epoch": 1.2555016594332398, "gen_logits_max": 4.127372741699219, "gen_logits_mean": -16.787307739257812, "gen_logits_min": -29.0716552734375, "gen_logits_std": 3.093855857849121, "gen_loss": 0.3011901378631592, "grad_norm": 0.39381472206293794, "learning_rate": 2.3165052631578948e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9965103417634964, "mean_gen_accuracy": 0.8785018920898438, "mean_token_accuracy": 0.9085859507322311, "num_tokens": 583045940.0, "sample_num_tokens": 7374.5, "step": 6147, "total_num_tokens": 583075438.0, "z_loss": 0.0005926095182076097 }, { "copy_logits_max": -5.041898727416992, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.0625, "epoch": 1.2557058973704365, "gen_logits_max": 3.783085346221924, "gen_logits_mean": -16.072202682495117, "gen_logits_min": -28.284988403320312, "gen_logits_std": 3.059446334838867, "gen_loss": 0.33172088861465454, "grad_norm": 0.413204949292288, "learning_rate": 2.316378947368421e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9966453164815903, "mean_gen_accuracy": 0.8718849420547485, "mean_token_accuracy": 0.9011227786540985, "num_tokens": 583310983.0, "sample_num_tokens": 8356.75, "step": 6148, "total_num_tokens": 583344410.0, "z_loss": 0.000592184835113585 }, { "copy_logits_max": -3.735995292663574, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.75, "epoch": 1.2559101353076334, "gen_logits_max": 4.352020263671875, "gen_logits_mean": -14.997042655944824, "gen_logits_min": -27.303359985351562, "gen_logits_std": 3.0837509632110596, "gen_loss": 0.3213144540786743, "grad_norm": 0.39806720558306596, "learning_rate": 2.3162526315789477e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9965524226427078, "mean_gen_accuracy": 0.8705465346574783, "mean_token_accuracy": 0.9006955772638321, "num_tokens": 583588894.0, "sample_num_tokens": 9766.5, "step": 6149, "total_num_tokens": 583627960.0, "z_loss": 0.0005678517045453191 }, { "copy_logits_max": -4.961268424987793, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.8125, "epoch": 1.2561143732448303, "gen_logits_max": 4.489500045776367, "gen_logits_mean": -15.774452209472656, "gen_logits_min": -27.865324020385742, "gen_logits_std": 3.080455780029297, "gen_loss": 0.2894231677055359, "grad_norm": 0.3766383552990367, "learning_rate": 2.3161263157894738e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9961297363042831, "mean_gen_accuracy": 0.8810044080018997, "mean_token_accuracy": 0.9086859077215195, "num_tokens": 583875728.0, "sample_num_tokens": 8527.0, "step": 6150, "total_num_tokens": 583909836.0, "z_loss": 0.0005313052097335458 }, { "copy_logits_max": -6.664155960083008, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.4375, "epoch": 1.256318611182027, "gen_logits_max": 4.539299488067627, "gen_logits_mean": -14.8367919921875, "gen_logits_min": -27.026079177856445, "gen_logits_std": 3.069167375564575, "gen_loss": 0.2560088634490967, "grad_norm": 0.3443313054393155, "learning_rate": 2.3160000000000002e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9967062175273895, "mean_gen_accuracy": 0.8809923678636551, "mean_token_accuracy": 0.9118723273277283, "num_tokens": 584166090.0, "sample_num_tokens": 8824.5, "step": 6151, "total_num_tokens": 584201388.0, "z_loss": 0.000445826182840392 }, { "copy_logits_max": -7.272220611572266, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.8125, "epoch": 1.256522849119224, "gen_logits_max": 4.351337909698486, "gen_logits_mean": -15.822738647460938, "gen_logits_min": -27.763505935668945, "gen_logits_std": 3.0740764141082764, "gen_loss": 0.28655362129211426, "grad_norm": 0.4176818858503252, "learning_rate": 2.3158736842105263e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9959040880203247, "mean_gen_accuracy": 0.8767624050378799, "mean_token_accuracy": 0.9046360999345779, "num_tokens": 584462926.0, "sample_num_tokens": 9191.0, "step": 6152, "total_num_tokens": 584499690.0, "z_loss": 0.0004994770861230791 }, { "copy_logits_max": -6.015781402587891, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.3125, "epoch": 1.2567270870564207, "gen_logits_max": 3.014606237411499, "gen_logits_mean": -16.919626235961914, "gen_logits_min": -28.980289459228516, "gen_logits_std": 3.106438159942627, "gen_loss": 0.29378312826156616, "grad_norm": 0.37062405737801074, "learning_rate": 2.3157473684210527e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9971591830253601, "mean_gen_accuracy": 0.8752710819244385, "mean_token_accuracy": 0.9059780687093735, "num_tokens": 584736219.0, "sample_num_tokens": 9024.25, "step": 6153, "total_num_tokens": 584772316.0, "z_loss": 0.0005146213807165623 }, { "copy_logits_max": -6.185407638549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.5625, "epoch": 1.2569313249936176, "gen_logits_max": 5.322595596313477, "gen_logits_mean": -15.174938201904297, "gen_logits_min": -27.059593200683594, "gen_logits_std": 3.069420337677002, "gen_loss": 0.28615617752075195, "grad_norm": 0.3554018215839077, "learning_rate": 2.3156210526315788e-05, "loss": 0.2579, "mean_copy_accuracy": 0.9963844865560532, "mean_gen_accuracy": 0.8870032280683517, "mean_token_accuracy": 0.9108896851539612, "num_tokens": 585017743.0, "sample_num_tokens": 6952.25, "step": 6154, "total_num_tokens": 585045552.0, "z_loss": 0.0005156465340405703 }, { "copy_logits_max": -5.444100379943848, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.8125, "epoch": 1.2571355629308143, "gen_logits_max": 3.3061413764953613, "gen_logits_mean": -17.02271270751953, "gen_logits_min": -29.245941162109375, "gen_logits_std": 3.1060900688171387, "gen_loss": 0.2826175391674042, "grad_norm": 0.3563863532857684, "learning_rate": 2.3154947368421052e-05, "loss": 0.283, "mean_copy_accuracy": 0.9965325742959976, "mean_gen_accuracy": 0.8739875257015228, "mean_token_accuracy": 0.9040154665708542, "num_tokens": 585282547.0, "sample_num_tokens": 7550.25, "step": 6155, "total_num_tokens": 585312748.0, "z_loss": 0.0005668206140398979 }, { "copy_logits_max": -7.059967994689941, "copy_logits_min": -750000000.0, "copy_num_tokens": 228.0, "epoch": 1.2573398008680112, "gen_logits_max": 4.894964694976807, "gen_logits_mean": -17.16199493408203, "gen_logits_min": -28.77138900756836, "gen_logits_std": 3.080606460571289, "gen_loss": 0.32358109951019287, "grad_norm": 0.4430672698596838, "learning_rate": 2.3153684210526317e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9964320510625839, "mean_gen_accuracy": 0.8753086626529694, "mean_token_accuracy": 0.9039419293403625, "num_tokens": 585551851.0, "sample_num_tokens": 6989.25, "step": 6156, "total_num_tokens": 585579808.0, "z_loss": 0.0006096623837947845 }, { "copy_logits_max": -3.6791000366210938, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.375, "epoch": 1.2575440388052082, "gen_logits_max": 3.7080163955688477, "gen_logits_mean": -16.121746063232422, "gen_logits_min": -28.739227294921875, "gen_logits_std": 3.093273162841797, "gen_loss": 0.26567918062210083, "grad_norm": 0.4135823315344684, "learning_rate": 2.315242105263158e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9957298189401627, "mean_gen_accuracy": 0.8708186596632004, "mean_token_accuracy": 0.9011041224002838, "num_tokens": 585802615.0, "sample_num_tokens": 8237.75, "step": 6157, "total_num_tokens": 585835566.0, "z_loss": 0.0005426227580755949 }, { "copy_logits_max": -5.0904364585876465, "copy_logits_min": -625000000.0, "copy_num_tokens": 436.6875, "epoch": 1.2577482767424049, "gen_logits_max": 3.1438345909118652, "gen_logits_mean": -17.53342056274414, "gen_logits_min": -29.677143096923828, "gen_logits_std": 3.1202313899993896, "gen_loss": 0.26818451285362244, "grad_norm": 0.4014417123355899, "learning_rate": 2.3151157894736845e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9959394037723541, "mean_gen_accuracy": 0.8784584850072861, "mean_token_accuracy": 0.9074269086122513, "num_tokens": 586083434.0, "sample_num_tokens": 8449.5, "step": 6158, "total_num_tokens": 586117232.0, "z_loss": 0.000505301053635776 }, { "copy_logits_max": -0.5194457769393921, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.125, "epoch": 1.2579525146796018, "gen_logits_max": 5.587594985961914, "gen_logits_mean": -12.410842895507812, "gen_logits_min": -25.074420928955078, "gen_logits_std": 3.0379934310913086, "gen_loss": 0.2915783226490021, "grad_norm": 0.34402427225366067, "learning_rate": 2.3149894736842106e-05, "loss": 0.269, "mean_copy_accuracy": 0.9965327382087708, "mean_gen_accuracy": 0.8762949705123901, "mean_token_accuracy": 0.9087942242622375, "num_tokens": 586363766.0, "sample_num_tokens": 9112.0, "step": 6159, "total_num_tokens": 586400214.0, "z_loss": 0.0005141013534739614 }, { "copy_logits_max": -5.13279914855957, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.0625, "epoch": 1.2581567526167985, "gen_logits_max": 3.778167724609375, "gen_logits_mean": -15.62668228149414, "gen_logits_min": -28.240488052368164, "gen_logits_std": 3.1133627891540527, "gen_loss": 0.2293596863746643, "grad_norm": 0.41141962487203854, "learning_rate": 2.314863157894737e-05, "loss": 0.274, "mean_copy_accuracy": 0.9970309734344482, "mean_gen_accuracy": 0.8718769699335098, "mean_token_accuracy": 0.9037551283836365, "num_tokens": 586635277.0, "sample_num_tokens": 9125.75, "step": 6160, "total_num_tokens": 586671780.0, "z_loss": 0.0004068126145284623 }, { "copy_logits_max": -6.596562385559082, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.8125, "epoch": 1.2583609905539954, "gen_logits_max": 3.3449859619140625, "gen_logits_mean": -16.62135887145996, "gen_logits_min": -28.99067497253418, "gen_logits_std": 3.080002784729004, "gen_loss": 0.25501757860183716, "grad_norm": 0.35482820317079783, "learning_rate": 2.314736842105263e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9959470331668854, "mean_gen_accuracy": 0.8821280896663666, "mean_token_accuracy": 0.9101417362689972, "num_tokens": 586910720.0, "sample_num_tokens": 9607.5, "step": 6161, "total_num_tokens": 586949150.0, "z_loss": 0.000447983096819371 }, { "copy_logits_max": -6.900272369384766, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.8125, "epoch": 1.2585652284911921, "gen_logits_max": 3.23860764503479, "gen_logits_mean": -16.277538299560547, "gen_logits_min": -28.236854553222656, "gen_logits_std": 3.081937551498413, "gen_loss": 0.2637590169906616, "grad_norm": 0.3468562089133219, "learning_rate": 2.3146105263157896e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9957805573940277, "mean_gen_accuracy": 0.8826543986797333, "mean_token_accuracy": 0.9106037616729736, "num_tokens": 587191659.0, "sample_num_tokens": 8741.75, "step": 6162, "total_num_tokens": 587226626.0, "z_loss": 0.00047036883188411593 }, { "copy_logits_max": -6.95463752746582, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.25, "epoch": 1.258769466428389, "gen_logits_max": 4.4334893226623535, "gen_logits_mean": -16.84465789794922, "gen_logits_min": -28.380874633789062, "gen_logits_std": 3.0439538955688477, "gen_loss": 0.36721736192703247, "grad_norm": 0.3517142697542519, "learning_rate": 2.3144842105263157e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9958736598491669, "mean_gen_accuracy": 0.875472217798233, "mean_token_accuracy": 0.904779851436615, "num_tokens": 587473982.0, "sample_num_tokens": 8117.0, "step": 6163, "total_num_tokens": 587506450.0, "z_loss": 0.0006935772835277021 }, { "copy_logits_max": -5.3447065353393555, "copy_logits_min": -750000064.0, "copy_num_tokens": 344.5, "epoch": 1.258973704365586, "gen_logits_max": 3.951594114303589, "gen_logits_mean": -15.573732376098633, "gen_logits_min": -27.635025024414062, "gen_logits_std": 3.0831234455108643, "gen_loss": 0.2926514148712158, "grad_norm": 0.3767884626378704, "learning_rate": 2.314357894736842e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9956270605325699, "mean_gen_accuracy": 0.8817114531993866, "mean_token_accuracy": 0.9092180132865906, "num_tokens": 587724276.0, "sample_num_tokens": 6900.5, "step": 6164, "total_num_tokens": 587751878.0, "z_loss": 0.0005521284183487296 }, { "copy_logits_max": -5.239778518676758, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.375, "epoch": 1.2591779423027827, "gen_logits_max": 4.602935791015625, "gen_logits_mean": -15.29667854309082, "gen_logits_min": -27.72356414794922, "gen_logits_std": 3.0713050365448, "gen_loss": 0.31043243408203125, "grad_norm": 0.40353750718223796, "learning_rate": 2.3142315789473685e-05, "loss": 0.291, "mean_copy_accuracy": 0.9961147308349609, "mean_gen_accuracy": 0.8718825727701187, "mean_token_accuracy": 0.9024020135402679, "num_tokens": 587980482.0, "sample_num_tokens": 8135.0, "step": 6165, "total_num_tokens": 588013022.0, "z_loss": 0.0006445904728025198 }, { "copy_logits_max": -6.229224681854248, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.0625, "epoch": 1.2593821802399796, "gen_logits_max": 2.4103705883026123, "gen_logits_mean": -17.75155258178711, "gen_logits_min": -29.99695587158203, "gen_logits_std": 3.1378512382507324, "gen_loss": 0.2443317174911499, "grad_norm": 0.39113814302159244, "learning_rate": 2.314105263157895e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9967624992132187, "mean_gen_accuracy": 0.8797815591096878, "mean_token_accuracy": 0.9077659845352173, "num_tokens": 588265123.0, "sample_num_tokens": 9253.25, "step": 6166, "total_num_tokens": 588302136.0, "z_loss": 0.00048285123193636537 }, { "copy_logits_max": -6.789213180541992, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.375, "epoch": 1.2595864181771765, "gen_logits_max": 3.5941195487976074, "gen_logits_mean": -16.539138793945312, "gen_logits_min": -29.012378692626953, "gen_logits_std": 3.1093103885650635, "gen_loss": 0.292557954788208, "grad_norm": 0.3398033738745889, "learning_rate": 2.313978947368421e-05, "loss": 0.288, "mean_copy_accuracy": 0.9966747909784317, "mean_gen_accuracy": 0.8758108168840408, "mean_token_accuracy": 0.9033284336328506, "num_tokens": 588539693.0, "sample_num_tokens": 7771.25, "step": 6167, "total_num_tokens": 588570778.0, "z_loss": 0.0005704766372218728 }, { "copy_logits_max": -6.757600784301758, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.375, "epoch": 1.2597906561143732, "gen_logits_max": 2.9916203022003174, "gen_logits_mean": -17.660619735717773, "gen_logits_min": -29.74976921081543, "gen_logits_std": 3.1180126667022705, "gen_loss": 0.31214970350265503, "grad_norm": 0.41001625101467937, "learning_rate": 2.3138526315789475e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9972413182258606, "mean_gen_accuracy": 0.882540762424469, "mean_token_accuracy": 0.9110961854457855, "num_tokens": 588803924.0, "sample_num_tokens": 8181.0, "step": 6168, "total_num_tokens": 588836648.0, "z_loss": 0.0005397405475378036 }, { "copy_logits_max": -4.428299903869629, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.25, "epoch": 1.25999489405157, "gen_logits_max": 3.429248332977295, "gen_logits_mean": -15.96688461303711, "gen_logits_min": -28.519678115844727, "gen_logits_std": 3.0876777172088623, "gen_loss": 0.26589542627334595, "grad_norm": 0.39755849941988597, "learning_rate": 2.3137263157894736e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9963670372962952, "mean_gen_accuracy": 0.8759927153587341, "mean_token_accuracy": 0.9064965546131134, "num_tokens": 589052941.0, "sample_num_tokens": 7653.25, "step": 6169, "total_num_tokens": 589083554.0, "z_loss": 0.0005100355483591557 }, { "copy_logits_max": -4.532179355621338, "copy_logits_min": -687500032.0, "copy_num_tokens": 564.625, "epoch": 1.2601991319887669, "gen_logits_max": 4.391165733337402, "gen_logits_mean": -14.657451629638672, "gen_logits_min": -27.147380828857422, "gen_logits_std": 3.0643796920776367, "gen_loss": 0.28723129630088806, "grad_norm": 0.4002397220351473, "learning_rate": 2.3136e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9966577291488647, "mean_gen_accuracy": 0.8711663633584976, "mean_token_accuracy": 0.9025948941707611, "num_tokens": 589318341.0, "sample_num_tokens": 8648.25, "step": 6170, "total_num_tokens": 589352934.0, "z_loss": 0.0005621715681627393 }, { "copy_logits_max": -6.447753429412842, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.375, "epoch": 1.2604033699259638, "gen_logits_max": 3.89461088180542, "gen_logits_mean": -16.87163543701172, "gen_logits_min": -28.944564819335938, "gen_logits_std": 3.099297046661377, "gen_loss": 0.3093421161174774, "grad_norm": 0.34896134409847723, "learning_rate": 2.3134736842105264e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9970288574695587, "mean_gen_accuracy": 0.8769620060920715, "mean_token_accuracy": 0.9079127162694931, "num_tokens": 589599411.0, "sample_num_tokens": 8403.25, "step": 6171, "total_num_tokens": 589633024.0, "z_loss": 0.0006209282437339425 }, { "copy_logits_max": -5.232870578765869, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.375, "epoch": 1.2606076078631605, "gen_logits_max": 5.311741828918457, "gen_logits_mean": -15.248842239379883, "gen_logits_min": -27.357343673706055, "gen_logits_std": 3.0881874561309814, "gen_loss": 0.30003565549850464, "grad_norm": 0.43503330188567557, "learning_rate": 2.3133473684210525e-05, "loss": 0.29, "mean_copy_accuracy": 0.9953998029232025, "mean_gen_accuracy": 0.8756974339485168, "mean_token_accuracy": 0.9030579030513763, "num_tokens": 589861534.0, "sample_num_tokens": 7599.0, "step": 6172, "total_num_tokens": 589891930.0, "z_loss": 0.0005853454349562526 }, { "copy_logits_max": -4.6652421951293945, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.1875, "epoch": 1.2608118458003574, "gen_logits_max": 4.519083499908447, "gen_logits_mean": -14.97690200805664, "gen_logits_min": -27.011112213134766, "gen_logits_std": 3.0779852867126465, "gen_loss": 0.28445965051651, "grad_norm": 0.41532896141832093, "learning_rate": 2.313221052631579e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9956513792276382, "mean_gen_accuracy": 0.8726619929075241, "mean_token_accuracy": 0.9025439023971558, "num_tokens": 590124951.0, "sample_num_tokens": 9843.25, "step": 6173, "total_num_tokens": 590164324.0, "z_loss": 0.0005555481766350567 }, { "copy_logits_max": -3.01155161857605, "copy_logits_min": -687500032.0, "copy_num_tokens": 736.875, "epoch": 1.2610160837375544, "gen_logits_max": 3.9346389770507812, "gen_logits_mean": -16.184814453125, "gen_logits_min": -28.530702590942383, "gen_logits_std": 3.084740161895752, "gen_loss": 0.2746783494949341, "grad_norm": 0.578678876575047, "learning_rate": 2.3130947368421054e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9964731484651566, "mean_gen_accuracy": 0.8698745965957642, "mean_token_accuracy": 0.9032756090164185, "num_tokens": 590408341.0, "sample_num_tokens": 10384.25, "step": 6174, "total_num_tokens": 590449878.0, "z_loss": 0.0005732172867283225 }, { "copy_logits_max": -6.019696235656738, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.125, "epoch": 1.261220321674751, "gen_logits_max": 4.286289215087891, "gen_logits_mean": -17.13401222229004, "gen_logits_min": -29.61715316772461, "gen_logits_std": 3.087048053741455, "gen_loss": 0.2633536458015442, "grad_norm": 0.44152700109663856, "learning_rate": 2.3129684210526318e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9952931702136993, "mean_gen_accuracy": 0.872099980711937, "mean_token_accuracy": 0.9002506583929062, "num_tokens": 590662782.0, "sample_num_tokens": 7623.5, "step": 6175, "total_num_tokens": 590693276.0, "z_loss": 0.0005180616863071918 }, { "copy_logits_max": -5.7412872314453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.125, "epoch": 1.261424559611948, "gen_logits_max": 4.770297050476074, "gen_logits_mean": -15.697014808654785, "gen_logits_min": -28.01626968383789, "gen_logits_std": 3.0376052856445312, "gen_loss": 0.29112863540649414, "grad_norm": 0.3379855494759831, "learning_rate": 2.312842105263158e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9968895614147186, "mean_gen_accuracy": 0.8770940899848938, "mean_token_accuracy": 0.9051344394683838, "num_tokens": 590927427.0, "sample_num_tokens": 7245.25, "step": 6176, "total_num_tokens": 590956408.0, "z_loss": 0.0006119189201854169 }, { "copy_logits_max": -5.193053722381592, "copy_logits_min": -562500096.0, "copy_num_tokens": 509.5, "epoch": 1.2616287975491447, "gen_logits_max": 4.6850738525390625, "gen_logits_mean": -15.763246536254883, "gen_logits_min": -27.966108322143555, "gen_logits_std": 3.082026243209839, "gen_loss": 0.2872994840145111, "grad_norm": 0.3788697646475101, "learning_rate": 2.3127157894736843e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9958204925060272, "mean_gen_accuracy": 0.8774764537811279, "mean_token_accuracy": 0.9077946692705154, "num_tokens": 591203438.0, "sample_num_tokens": 8822.0, "step": 6177, "total_num_tokens": 591238726.0, "z_loss": 0.00062962481752038 }, { "copy_logits_max": -5.8405890464782715, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.25, "epoch": 1.2618330354863416, "gen_logits_max": 4.1291913986206055, "gen_logits_mean": -15.582197189331055, "gen_logits_min": -27.970500946044922, "gen_logits_std": 3.087845802307129, "gen_loss": 0.2741324305534363, "grad_norm": 0.39849145832800126, "learning_rate": 2.3125894736842104e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9948052912950516, "mean_gen_accuracy": 0.8787464499473572, "mean_token_accuracy": 0.9086595773696899, "num_tokens": 591469503.0, "sample_num_tokens": 9713.25, "step": 6178, "total_num_tokens": 591508356.0, "z_loss": 0.0005182047025300562 }, { "copy_logits_max": -4.480182647705078, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.4375, "epoch": 1.2620372734235383, "gen_logits_max": 4.180078506469727, "gen_logits_mean": -15.96377944946289, "gen_logits_min": -27.968822479248047, "gen_logits_std": 2.9927940368652344, "gen_loss": 0.32841184735298157, "grad_norm": 0.3473816556175869, "learning_rate": 2.312463157894737e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9969021379947662, "mean_gen_accuracy": 0.8742877393960953, "mean_token_accuracy": 0.9056505113840103, "num_tokens": 591748434.0, "sample_num_tokens": 8219.5, "step": 6179, "total_num_tokens": 591781312.0, "z_loss": 0.0006067259237170219 }, { "copy_logits_max": -6.269162178039551, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.75, "epoch": 1.2622415113607353, "gen_logits_max": 5.108336925506592, "gen_logits_mean": -15.442495346069336, "gen_logits_min": -27.8282413482666, "gen_logits_std": 2.9896740913391113, "gen_loss": 0.3328206539154053, "grad_norm": 0.4123031340751323, "learning_rate": 2.312336842105263e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9962113946676254, "mean_gen_accuracy": 0.8748624175786972, "mean_token_accuracy": 0.9001116156578064, "num_tokens": 591991232.0, "sample_num_tokens": 7087.0, "step": 6180, "total_num_tokens": 592019580.0, "z_loss": 0.0005982076399959624 }, { "copy_logits_max": -3.850668430328369, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.5, "epoch": 1.2624457492979322, "gen_logits_max": 4.494490623474121, "gen_logits_mean": -15.470841407775879, "gen_logits_min": -27.772865295410156, "gen_logits_std": 2.9683837890625, "gen_loss": 0.2743197977542877, "grad_norm": 0.4174945748904808, "learning_rate": 2.3122105263157894e-05, "loss": 0.294, "mean_copy_accuracy": 0.9956512600183487, "mean_gen_accuracy": 0.8796710819005966, "mean_token_accuracy": 0.9010172039270401, "num_tokens": 592239769.0, "sample_num_tokens": 8879.25, "step": 6181, "total_num_tokens": 592275286.0, "z_loss": 0.0005083933938294649 }, { "copy_logits_max": -4.26201057434082, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.0625, "epoch": 1.262649987235129, "gen_logits_max": 3.9247312545776367, "gen_logits_mean": -16.63311767578125, "gen_logits_min": -28.949750900268555, "gen_logits_std": 3.072650671005249, "gen_loss": 0.2720915973186493, "grad_norm": 0.3721655216084104, "learning_rate": 2.3120842105263158e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9965820461511612, "mean_gen_accuracy": 0.8717878311872482, "mean_token_accuracy": 0.9050005972385406, "num_tokens": 592525776.0, "sample_num_tokens": 9360.0, "step": 6182, "total_num_tokens": 592563216.0, "z_loss": 0.0005167516646906734 }, { "copy_logits_max": -4.676684856414795, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.8125, "epoch": 1.2628542251723258, "gen_logits_max": 3.8996856212615967, "gen_logits_mean": -16.24344253540039, "gen_logits_min": -28.00788116455078, "gen_logits_std": 2.984522819519043, "gen_loss": 0.319463849067688, "grad_norm": 0.38001504705628075, "learning_rate": 2.3119578947368423e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9960404932498932, "mean_gen_accuracy": 0.872187927365303, "mean_token_accuracy": 0.9015622287988663, "num_tokens": 592784503.0, "sample_num_tokens": 7977.25, "step": 6183, "total_num_tokens": 592816412.0, "z_loss": 0.0005818188074044883 }, { "copy_logits_max": -5.041809558868408, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.25, "epoch": 1.2630584631095225, "gen_logits_max": 4.959975242614746, "gen_logits_mean": -15.857519149780273, "gen_logits_min": -27.542558670043945, "gen_logits_std": 3.0310935974121094, "gen_loss": 0.2691026031970978, "grad_norm": 0.39668173445756977, "learning_rate": 2.3118315789473687e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9966535717248917, "mean_gen_accuracy": 0.8844244033098221, "mean_token_accuracy": 0.9096416234970093, "num_tokens": 593041473.0, "sample_num_tokens": 10096.25, "step": 6184, "total_num_tokens": 593081858.0, "z_loss": 0.0004945409018546343 }, { "copy_logits_max": -5.949540615081787, "copy_logits_min": -687500096.0, "copy_num_tokens": 492.0, "epoch": 1.2632627010467194, "gen_logits_max": 5.136702537536621, "gen_logits_mean": -14.460090637207031, "gen_logits_min": -27.147674560546875, "gen_logits_std": 2.9775795936584473, "gen_loss": 0.33275577425956726, "grad_norm": 0.3751557073581768, "learning_rate": 2.3117052631578948e-05, "loss": 0.289, "mean_copy_accuracy": 0.9953498840332031, "mean_gen_accuracy": 0.8733018934726715, "mean_token_accuracy": 0.9015449583530426, "num_tokens": 593307472.0, "sample_num_tokens": 9374.0, "step": 6185, "total_num_tokens": 593344968.0, "z_loss": 0.000689366483129561 }, { "copy_logits_max": -4.224803447723389, "copy_logits_min": -687500032.0, "copy_num_tokens": 779.9375, "epoch": 1.2634669389839162, "gen_logits_max": 3.594341278076172, "gen_logits_mean": -15.52915096282959, "gen_logits_min": -28.811851501464844, "gen_logits_std": 3.084152936935425, "gen_loss": 0.23237952589988708, "grad_norm": 0.3855968866782483, "learning_rate": 2.3115789473684212e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9964398592710495, "mean_gen_accuracy": 0.8717831820249557, "mean_token_accuracy": 0.9073124676942825, "num_tokens": 593570734.0, "sample_num_tokens": 9447.5, "step": 6186, "total_num_tokens": 593608524.0, "z_loss": 0.0004059626080561429 }, { "copy_logits_max": -6.465954303741455, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.375, "epoch": 1.263671176921113, "gen_logits_max": 4.789422035217285, "gen_logits_mean": -17.097991943359375, "gen_logits_min": -28.791248321533203, "gen_logits_std": 3.02907657623291, "gen_loss": 0.30613020062446594, "grad_norm": 0.34403357762112047, "learning_rate": 2.3114526315789473e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9965084493160248, "mean_gen_accuracy": 0.8761768788099289, "mean_token_accuracy": 0.9006150513887405, "num_tokens": 593850896.0, "sample_num_tokens": 7864.0, "step": 6187, "total_num_tokens": 593882352.0, "z_loss": 0.0005557286785915494 }, { "copy_logits_max": -5.366438865661621, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.375, "epoch": 1.26387541485831, "gen_logits_max": 3.366298198699951, "gen_logits_mean": -16.595783233642578, "gen_logits_min": -29.008119583129883, "gen_logits_std": 3.0198545455932617, "gen_loss": 0.2837046980857849, "grad_norm": 0.37696999165824846, "learning_rate": 2.3113263157894737e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9962617605924606, "mean_gen_accuracy": 0.8788706213235855, "mean_token_accuracy": 0.9082191586494446, "num_tokens": 594106551.0, "sample_num_tokens": 9038.25, "step": 6188, "total_num_tokens": 594142704.0, "z_loss": 0.0005151959485374391 }, { "copy_logits_max": -4.971901893615723, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.875, "epoch": 1.2640796527955067, "gen_logits_max": 5.800015449523926, "gen_logits_mean": -14.24152946472168, "gen_logits_min": -27.005382537841797, "gen_logits_std": 2.9467263221740723, "gen_loss": 0.3281640410423279, "grad_norm": 0.3935945409665246, "learning_rate": 2.3111999999999998e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9971374124288559, "mean_gen_accuracy": 0.8760030269622803, "mean_token_accuracy": 0.9045409858226776, "num_tokens": 594370754.0, "sample_num_tokens": 7281.0, "step": 6189, "total_num_tokens": 594399878.0, "z_loss": 0.0005781312938779593 }, { "copy_logits_max": -4.602034568786621, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.8125, "epoch": 1.2642838907327036, "gen_logits_max": 4.150795936584473, "gen_logits_mean": -15.670076370239258, "gen_logits_min": -29.218185424804688, "gen_logits_std": 3.070934295654297, "gen_loss": 0.2554902136325836, "grad_norm": 0.38366382383280967, "learning_rate": 2.3110736842105266e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9966583698987961, "mean_gen_accuracy": 0.8719758242368698, "mean_token_accuracy": 0.9061292260885239, "num_tokens": 594636263.0, "sample_num_tokens": 9068.25, "step": 6190, "total_num_tokens": 594672536.0, "z_loss": 0.0005585827166214585 }, { "copy_logits_max": -5.391643524169922, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.125, "epoch": 1.2644881286699006, "gen_logits_max": 4.228245735168457, "gen_logits_mean": -16.540237426757812, "gen_logits_min": -28.851354598999023, "gen_logits_std": 3.02945876121521, "gen_loss": 0.28467100858688354, "grad_norm": 0.3649036191652743, "learning_rate": 2.3109473684210527e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9960262030363083, "mean_gen_accuracy": 0.8698276579380035, "mean_token_accuracy": 0.8990372270345688, "num_tokens": 594904955.0, "sample_num_tokens": 8594.75, "step": 6191, "total_num_tokens": 594939334.0, "z_loss": 0.0005253534764051437 }, { "copy_logits_max": -6.450162887573242, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.9375, "epoch": 1.2646923666070973, "gen_logits_max": 4.291835308074951, "gen_logits_mean": -15.952342987060547, "gen_logits_min": -29.374784469604492, "gen_logits_std": 3.053584337234497, "gen_loss": 0.24508774280548096, "grad_norm": 0.39142318534922005, "learning_rate": 2.310821052631579e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9961951673030853, "mean_gen_accuracy": 0.8744449317455292, "mean_token_accuracy": 0.907869428396225, "num_tokens": 595176495.0, "sample_num_tokens": 8774.75, "step": 6192, "total_num_tokens": 595211594.0, "z_loss": 0.00046831194777041674 }, { "copy_logits_max": -4.325366973876953, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.875, "epoch": 1.264896604544294, "gen_logits_max": 4.125080108642578, "gen_logits_mean": -15.252874374389648, "gen_logits_min": -28.32830047607422, "gen_logits_std": 3.052245616912842, "gen_loss": 0.26610225439071655, "grad_norm": 0.3718905157885517, "learning_rate": 2.3106947368421052e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9949728101491928, "mean_gen_accuracy": 0.8722654432058334, "mean_token_accuracy": 0.9010459184646606, "num_tokens": 595446052.0, "sample_num_tokens": 9378.0, "step": 6193, "total_num_tokens": 595483564.0, "z_loss": 0.0004939649370498955 }, { "copy_logits_max": -4.718225955963135, "copy_logits_min": -750000064.0, "copy_num_tokens": 679.9375, "epoch": 1.265100842481491, "gen_logits_max": 3.4317402839660645, "gen_logits_mean": -16.7496280670166, "gen_logits_min": -29.584049224853516, "gen_logits_std": 3.087815761566162, "gen_loss": 0.31831827759742737, "grad_norm": 0.380113604377219, "learning_rate": 2.3105684210526316e-05, "loss": 0.3037, "mean_copy_accuracy": 0.9968107342720032, "mean_gen_accuracy": 0.8691680580377579, "mean_token_accuracy": 0.898491844534874, "num_tokens": 595732031.0, "sample_num_tokens": 10539.75, "step": 6194, "total_num_tokens": 595774190.0, "z_loss": 0.0005796565674245358 }, { "copy_logits_max": -5.471688270568848, "copy_logits_min": -750000064.0, "copy_num_tokens": 460.375, "epoch": 1.2653050804186878, "gen_logits_max": 4.113907814025879, "gen_logits_mean": -16.013931274414062, "gen_logits_min": -28.353641510009766, "gen_logits_std": 3.055002450942993, "gen_loss": 0.31001925468444824, "grad_norm": 0.3538590222132964, "learning_rate": 2.3104421052631577e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9947110116481781, "mean_gen_accuracy": 0.8801192492246628, "mean_token_accuracy": 0.9035216867923737, "num_tokens": 596008026.0, "sample_num_tokens": 8382.5, "step": 6195, "total_num_tokens": 596041556.0, "z_loss": 0.0005748452967964113 }, { "copy_logits_max": -5.405755996704102, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.75, "epoch": 1.2655093183558845, "gen_logits_max": 4.563790798187256, "gen_logits_mean": -15.433086395263672, "gen_logits_min": -27.752574920654297, "gen_logits_std": 3.021951675415039, "gen_loss": 0.2992563843727112, "grad_norm": 0.3687808130010306, "learning_rate": 2.310315789473684e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9967507719993591, "mean_gen_accuracy": 0.8726979941129684, "mean_token_accuracy": 0.903832197189331, "num_tokens": 596289429.0, "sample_num_tokens": 8845.25, "step": 6196, "total_num_tokens": 596324810.0, "z_loss": 0.0005547610344365239 }, { "copy_logits_max": -5.752330780029297, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.1875, "epoch": 1.2657135562930815, "gen_logits_max": 4.0664191246032715, "gen_logits_mean": -16.131450653076172, "gen_logits_min": -27.819746017456055, "gen_logits_std": 3.0631825923919678, "gen_loss": 0.3075076639652252, "grad_norm": 0.3855759930805347, "learning_rate": 2.3101894736842106e-05, "loss": 0.3165, "mean_copy_accuracy": 0.9955869168043137, "mean_gen_accuracy": 0.866474449634552, "mean_token_accuracy": 0.8938502222299576, "num_tokens": 596562588.0, "sample_num_tokens": 8331.5, "step": 6197, "total_num_tokens": 596595914.0, "z_loss": 0.0005374865140765905 }, { "copy_logits_max": -6.822589874267578, "copy_logits_min": -687500032.0, "copy_num_tokens": 419.5, "epoch": 1.2659177942302784, "gen_logits_max": 4.253534317016602, "gen_logits_mean": -16.096698760986328, "gen_logits_min": -28.21974754333496, "gen_logits_std": 3.087538242340088, "gen_loss": 0.27985307574272156, "grad_norm": 0.38680581436991096, "learning_rate": 2.310063157894737e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9950534254312515, "mean_gen_accuracy": 0.8731842935085297, "mean_token_accuracy": 0.9005659818649292, "num_tokens": 596809852.0, "sample_num_tokens": 8141.0, "step": 6198, "total_num_tokens": 596842416.0, "z_loss": 0.0005296287126839161 }, { "copy_logits_max": -6.235972881317139, "copy_logits_min": -750000064.0, "copy_num_tokens": 309.4375, "epoch": 1.266122032167475, "gen_logits_max": 4.747882843017578, "gen_logits_mean": -15.381746292114258, "gen_logits_min": -27.39845085144043, "gen_logits_std": 3.047895908355713, "gen_loss": 0.32916873693466187, "grad_norm": 0.3734906565532466, "learning_rate": 2.3099368421052635e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9954929500818253, "mean_gen_accuracy": 0.8734419792890549, "mean_token_accuracy": 0.9039752781391144, "num_tokens": 597071809.0, "sample_num_tokens": 7149.75, "step": 6199, "total_num_tokens": 597100408.0, "z_loss": 0.0005871171015314758 }, { "copy_logits_max": -5.662553787231445, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.4375, "epoch": 1.266326270104672, "gen_logits_max": 3.4902448654174805, "gen_logits_mean": -17.369972229003906, "gen_logits_min": -29.464677810668945, "gen_logits_std": 3.1060214042663574, "gen_loss": 0.30258023738861084, "grad_norm": 0.3701099250935915, "learning_rate": 2.3098105263157896e-05, "loss": 0.2762, "mean_copy_accuracy": 0.996122196316719, "mean_gen_accuracy": 0.8790938854217529, "mean_token_accuracy": 0.9085091054439545, "num_tokens": 597334157.0, "sample_num_tokens": 8460.75, "step": 6200, "total_num_tokens": 597368000.0, "z_loss": 0.0005299961194396019 }, { "copy_logits_max": -6.481354236602783, "copy_logits_min": -750000000.0, "copy_num_tokens": 234.25, "epoch": 1.2665305080418687, "gen_logits_max": 4.907719612121582, "gen_logits_mean": -16.49658966064453, "gen_logits_min": -28.53034210205078, "gen_logits_std": 3.102541446685791, "gen_loss": 0.28674420714378357, "grad_norm": 0.36628296702464846, "learning_rate": 2.309684210526316e-05, "loss": 0.2786, "mean_copy_accuracy": 0.994633749127388, "mean_gen_accuracy": 0.8822142034769058, "mean_token_accuracy": 0.9070734083652496, "num_tokens": 597611880.0, "sample_num_tokens": 6818.5, "step": 6201, "total_num_tokens": 597639154.0, "z_loss": 0.0005081327399238944 }, { "copy_logits_max": -5.054377555847168, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.8125, "epoch": 1.2667347459790657, "gen_logits_max": 5.123150825500488, "gen_logits_mean": -15.3363676071167, "gen_logits_min": -27.740028381347656, "gen_logits_std": 3.0947391986846924, "gen_loss": 0.35431164503097534, "grad_norm": 0.4015257849021991, "learning_rate": 2.309557894736842e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9968997985124588, "mean_gen_accuracy": 0.8642250895500183, "mean_token_accuracy": 0.8994677662849426, "num_tokens": 597895963.0, "sample_num_tokens": 8865.25, "step": 6202, "total_num_tokens": 597931424.0, "z_loss": 0.0005446411669254303 }, { "copy_logits_max": -5.219826698303223, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.4375, "epoch": 1.2669389839162624, "gen_logits_max": 3.6049747467041016, "gen_logits_mean": -17.737430572509766, "gen_logits_min": -29.605735778808594, "gen_logits_std": 3.133251428604126, "gen_loss": 0.2697395086288452, "grad_norm": 0.36707504104904976, "learning_rate": 2.3094315789473685e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9956215918064117, "mean_gen_accuracy": 0.8777119219303131, "mean_token_accuracy": 0.9037415832281113, "num_tokens": 598178818.0, "sample_num_tokens": 8417.5, "step": 6203, "total_num_tokens": 598212488.0, "z_loss": 0.0004961080849170685 }, { "copy_logits_max": -5.219409465789795, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.9375, "epoch": 1.2671432218534593, "gen_logits_max": 4.110665798187256, "gen_logits_mean": -15.672060012817383, "gen_logits_min": -28.1004581451416, "gen_logits_std": 3.1065220832824707, "gen_loss": 0.26158463954925537, "grad_norm": 0.6218781735829302, "learning_rate": 2.3093052631578946e-05, "loss": 0.2847, "mean_copy_accuracy": 0.995313286781311, "mean_gen_accuracy": 0.8744666427373886, "mean_token_accuracy": 0.9044131934642792, "num_tokens": 598443957.0, "sample_num_tokens": 8821.75, "step": 6204, "total_num_tokens": 598479244.0, "z_loss": 0.0004913895390927792 }, { "copy_logits_max": -5.275704383850098, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.125, "epoch": 1.2673474597906562, "gen_logits_max": 2.7617275714874268, "gen_logits_mean": -18.548683166503906, "gen_logits_min": -30.669090270996094, "gen_logits_std": 3.1761419773101807, "gen_loss": 0.23865099251270294, "grad_norm": 0.34756453671062487, "learning_rate": 2.309178947368421e-05, "loss": 0.2602, "mean_copy_accuracy": 0.9963385462760925, "mean_gen_accuracy": 0.8781540244817734, "mean_token_accuracy": 0.9105590879917145, "num_tokens": 598728032.0, "sample_num_tokens": 7919.5, "step": 6205, "total_num_tokens": 598759710.0, "z_loss": 0.0004931394360028207 }, { "copy_logits_max": -5.523674011230469, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.9375, "epoch": 1.267551697727853, "gen_logits_max": 3.639369010925293, "gen_logits_mean": -16.467723846435547, "gen_logits_min": -28.511028289794922, "gen_logits_std": 3.1093902587890625, "gen_loss": 0.2848730683326721, "grad_norm": 0.35134165335382955, "learning_rate": 2.3090526315789475e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9967236965894699, "mean_gen_accuracy": 0.8739040940999985, "mean_token_accuracy": 0.9026983827352524, "num_tokens": 598988220.0, "sample_num_tokens": 8153.0, "step": 6206, "total_num_tokens": 599020832.0, "z_loss": 0.0005489004543051124 }, { "copy_logits_max": -3.527682065963745, "copy_logits_min": -687500032.0, "copy_num_tokens": 545.875, "epoch": 1.2677559356650498, "gen_logits_max": 2.779466152191162, "gen_logits_mean": -17.477535247802734, "gen_logits_min": -30.17938995361328, "gen_logits_std": 3.1445412635803223, "gen_loss": 0.2665620446205139, "grad_norm": 0.3875792398572224, "learning_rate": 2.308926315789474e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9959527254104614, "mean_gen_accuracy": 0.8652544319629669, "mean_token_accuracy": 0.8976985812187195, "num_tokens": 599265907.0, "sample_num_tokens": 8725.25, "step": 6207, "total_num_tokens": 599300808.0, "z_loss": 0.0005522293504327536 }, { "copy_logits_max": -3.951082706451416, "copy_logits_min": -750000000.0, "copy_num_tokens": 814.0625, "epoch": 1.2679601736022466, "gen_logits_max": 3.923201560974121, "gen_logits_mean": -15.031332969665527, "gen_logits_min": -27.80513572692871, "gen_logits_std": 3.1320009231567383, "gen_loss": 0.2524726986885071, "grad_norm": 0.35580753624891365, "learning_rate": 2.3088e-05, "loss": 0.2464, "mean_copy_accuracy": 0.9964216947555542, "mean_gen_accuracy": 0.883599266409874, "mean_token_accuracy": 0.9161645174026489, "num_tokens": 599558440.0, "sample_num_tokens": 10552.0, "step": 6208, "total_num_tokens": 599600648.0, "z_loss": 0.0005195415578782558 }, { "copy_logits_max": -5.62950325012207, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.25, "epoch": 1.2681644115394435, "gen_logits_max": 3.3332290649414062, "gen_logits_mean": -17.916162490844727, "gen_logits_min": -29.811939239501953, "gen_logits_std": 3.129589557647705, "gen_loss": 0.27354639768600464, "grad_norm": 0.3550875036181976, "learning_rate": 2.3086736842105264e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9959857761859894, "mean_gen_accuracy": 0.8780122101306915, "mean_token_accuracy": 0.9074039161205292, "num_tokens": 599831251.0, "sample_num_tokens": 7201.25, "step": 6209, "total_num_tokens": 599860056.0, "z_loss": 0.0005638051079586148 }, { "copy_logits_max": -3.989776134490967, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.125, "epoch": 1.2683686494766402, "gen_logits_max": 3.540304660797119, "gen_logits_mean": -16.363380432128906, "gen_logits_min": -28.634658813476562, "gen_logits_std": 3.106365203857422, "gen_loss": 0.2614544630050659, "grad_norm": 0.39590509952907055, "learning_rate": 2.308547368421053e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9971085488796234, "mean_gen_accuracy": 0.8735825270414352, "mean_token_accuracy": 0.9040822833776474, "num_tokens": 600093593.0, "sample_num_tokens": 10148.75, "step": 6210, "total_num_tokens": 600134188.0, "z_loss": 0.0005111514474265277 }, { "copy_logits_max": -4.79651403427124, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5, "epoch": 1.2685728874138371, "gen_logits_max": 4.626718521118164, "gen_logits_mean": -15.58659553527832, "gen_logits_min": -27.576766967773438, "gen_logits_std": 3.074676036834717, "gen_loss": 0.2915419638156891, "grad_norm": 0.35776372048998845, "learning_rate": 2.308421052631579e-05, "loss": 0.298, "mean_copy_accuracy": 0.9969086349010468, "mean_gen_accuracy": 0.8679961413145065, "mean_token_accuracy": 0.8989726155996323, "num_tokens": 600355516.0, "sample_num_tokens": 6967.5, "step": 6211, "total_num_tokens": 600383386.0, "z_loss": 0.0005659377202391624 }, { "copy_logits_max": -3.1622915267944336, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.125, "epoch": 1.268777125351034, "gen_logits_max": 4.556817054748535, "gen_logits_mean": -14.315217971801758, "gen_logits_min": -26.849058151245117, "gen_logits_std": 3.061112403869629, "gen_loss": 0.2795505225658417, "grad_norm": 0.3837443482955593, "learning_rate": 2.3082947368421054e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9942969977855682, "mean_gen_accuracy": 0.8765642791986465, "mean_token_accuracy": 0.8998718708753586, "num_tokens": 600610557.0, "sample_num_tokens": 8181.75, "step": 6212, "total_num_tokens": 600643284.0, "z_loss": 0.0005304138176143169 }, { "copy_logits_max": -3.616542100906372, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.375, "epoch": 1.2689813632882307, "gen_logits_max": 4.909578323364258, "gen_logits_mean": -14.979290008544922, "gen_logits_min": -27.442270278930664, "gen_logits_std": 3.0548362731933594, "gen_loss": 0.30141860246658325, "grad_norm": 0.35789467363189853, "learning_rate": 2.3081684210526315e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9962984621524811, "mean_gen_accuracy": 0.8736328780651093, "mean_token_accuracy": 0.9032957851886749, "num_tokens": 600885819.0, "sample_num_tokens": 9651.25, "step": 6213, "total_num_tokens": 600924424.0, "z_loss": 0.0005322148790583014 }, { "copy_logits_max": -3.4422097206115723, "copy_logits_min": -687500032.0, "copy_num_tokens": 415.6875, "epoch": 1.2691856012254277, "gen_logits_max": 3.2572240829467773, "gen_logits_mean": -17.39144515991211, "gen_logits_min": -29.560047149658203, "gen_logits_std": 3.1283583641052246, "gen_loss": 0.2957783341407776, "grad_norm": 0.3621468472206401, "learning_rate": 2.3080421052631582e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9966199845075607, "mean_gen_accuracy": 0.8806284666061401, "mean_token_accuracy": 0.9082957804203033, "num_tokens": 601161696.0, "sample_num_tokens": 7356.5, "step": 6214, "total_num_tokens": 601191122.0, "z_loss": 0.0004930646391585469 }, { "copy_logits_max": -4.4326276779174805, "copy_logits_min": -750000000.0, "copy_num_tokens": 643.5, "epoch": 1.2693898391626244, "gen_logits_max": 3.297180652618408, "gen_logits_mean": -16.390684127807617, "gen_logits_min": -28.65548324584961, "gen_logits_std": 3.122797966003418, "gen_loss": 0.24684587121009827, "grad_norm": 0.3947177721072735, "learning_rate": 2.3079157894736843e-05, "loss": 0.262, "mean_copy_accuracy": 0.9972720444202423, "mean_gen_accuracy": 0.8774468004703522, "mean_token_accuracy": 0.9098166525363922, "num_tokens": 601433883.0, "sample_num_tokens": 9696.25, "step": 6215, "total_num_tokens": 601472668.0, "z_loss": 0.00048140381113626063 }, { "copy_logits_max": -5.744885444641113, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.5625, "epoch": 1.2695940770998213, "gen_logits_max": 3.114698886871338, "gen_logits_mean": -17.374088287353516, "gen_logits_min": -29.3403377532959, "gen_logits_std": 3.143608570098877, "gen_loss": 0.25134801864624023, "grad_norm": 0.3253242782807144, "learning_rate": 2.3077894736842108e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9963479340076447, "mean_gen_accuracy": 0.880162388086319, "mean_token_accuracy": 0.9093370288610458, "num_tokens": 601722289.0, "sample_num_tokens": 9195.25, "step": 6216, "total_num_tokens": 601759070.0, "z_loss": 0.0004587032599374652 }, { "copy_logits_max": -3.397561550140381, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.5, "epoch": 1.269798315037018, "gen_logits_max": 3.2270233631134033, "gen_logits_mean": -16.0928897857666, "gen_logits_min": -28.536785125732422, "gen_logits_std": 3.1085898876190186, "gen_loss": 0.23211631178855896, "grad_norm": 0.3398776209930286, "learning_rate": 2.307663157894737e-05, "loss": 0.2562, "mean_copy_accuracy": 0.9963225573301315, "mean_gen_accuracy": 0.8794544786214828, "mean_token_accuracy": 0.9115583151578903, "num_tokens": 602002145.0, "sample_num_tokens": 8683.25, "step": 6217, "total_num_tokens": 602036878.0, "z_loss": 0.00040307562449015677 }, { "copy_logits_max": -3.800705909729004, "copy_logits_min": -750000064.0, "copy_num_tokens": 577.125, "epoch": 1.270002552974215, "gen_logits_max": 4.080630779266357, "gen_logits_mean": -15.59093189239502, "gen_logits_min": -28.233436584472656, "gen_logits_std": 3.0786914825439453, "gen_loss": 0.28578388690948486, "grad_norm": 0.38172024180908726, "learning_rate": 2.3075368421052633e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9962426573038101, "mean_gen_accuracy": 0.880159392952919, "mean_token_accuracy": 0.9101277887821198, "num_tokens": 602284218.0, "sample_num_tokens": 9294.0, "step": 6218, "total_num_tokens": 602321394.0, "z_loss": 0.0005788629641756415 }, { "copy_logits_max": -6.587677955627441, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.8125, "epoch": 1.2702067909114119, "gen_logits_max": 3.2744345664978027, "gen_logits_mean": -18.226795196533203, "gen_logits_min": -30.293764114379883, "gen_logits_std": 3.107748508453369, "gen_loss": 0.27261170744895935, "grad_norm": 0.39090620446820107, "learning_rate": 2.3074105263157894e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9961458742618561, "mean_gen_accuracy": 0.8744630664587021, "mean_token_accuracy": 0.9032200574874878, "num_tokens": 602546135.0, "sample_num_tokens": 7189.25, "step": 6219, "total_num_tokens": 602574892.0, "z_loss": 0.00045413931366056204 }, { "copy_logits_max": -5.363956451416016, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.0, "epoch": 1.2704110288486086, "gen_logits_max": 3.4505372047424316, "gen_logits_mean": -17.119138717651367, "gen_logits_min": -29.190319061279297, "gen_logits_std": 3.1027567386627197, "gen_loss": 0.2674197256565094, "grad_norm": 0.38685969054344077, "learning_rate": 2.3072842105263158e-05, "loss": 0.2571, "mean_copy_accuracy": 0.9963147640228271, "mean_gen_accuracy": 0.8844293802976608, "mean_token_accuracy": 0.9111050814390182, "num_tokens": 602797037.0, "sample_num_tokens": 7862.25, "step": 6220, "total_num_tokens": 602828486.0, "z_loss": 0.000505784759297967 }, { "copy_logits_max": -5.896796226501465, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.75, "epoch": 1.2706152667858055, "gen_logits_max": 3.5207138061523438, "gen_logits_mean": -16.081897735595703, "gen_logits_min": -28.791702270507812, "gen_logits_std": 3.069730758666992, "gen_loss": 0.28049612045288086, "grad_norm": 0.39512672332368065, "learning_rate": 2.307157894736842e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9966655969619751, "mean_gen_accuracy": 0.8785867691040039, "mean_token_accuracy": 0.9103658646345139, "num_tokens": 603089502.0, "sample_num_tokens": 8792.0, "step": 6221, "total_num_tokens": 603124670.0, "z_loss": 0.0005000025266781449 }, { "copy_logits_max": -5.796878814697266, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.875, "epoch": 1.2708195047230024, "gen_logits_max": 4.189441680908203, "gen_logits_mean": -15.724589347839355, "gen_logits_min": -28.038637161254883, "gen_logits_std": 3.0763585567474365, "gen_loss": 0.29043641686439514, "grad_norm": 0.3846626865594802, "learning_rate": 2.3070315789473683e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9950799942016602, "mean_gen_accuracy": 0.877061516046524, "mean_token_accuracy": 0.9051089137792587, "num_tokens": 603353876.0, "sample_num_tokens": 8478.0, "step": 6222, "total_num_tokens": 603387788.0, "z_loss": 0.0004840384644921869 }, { "copy_logits_max": -7.514253616333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.75, "epoch": 1.2710237426601991, "gen_logits_max": 4.400226593017578, "gen_logits_mean": -14.720069885253906, "gen_logits_min": -26.908618927001953, "gen_logits_std": 3.0204131603240967, "gen_loss": 0.27651530504226685, "grad_norm": 0.41857118824662626, "learning_rate": 2.306905263157895e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9957560300827026, "mean_gen_accuracy": 0.8788959383964539, "mean_token_accuracy": 0.904951810836792, "num_tokens": 603615503.0, "sample_num_tokens": 9619.75, "step": 6223, "total_num_tokens": 603653982.0, "z_loss": 0.0005207168869674206 }, { "copy_logits_max": -5.176666259765625, "copy_logits_min": -687500032.0, "copy_num_tokens": 683.875, "epoch": 1.2712279805973958, "gen_logits_max": 3.645432949066162, "gen_logits_mean": -16.14710235595703, "gen_logits_min": -28.18313980102539, "gen_logits_std": 3.0957112312316895, "gen_loss": 0.26605138182640076, "grad_norm": 0.39813104773805574, "learning_rate": 2.3067789473684212e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9959321022033691, "mean_gen_accuracy": 0.8673943877220154, "mean_token_accuracy": 0.9026141911745071, "num_tokens": 603883412.0, "sample_num_tokens": 9470.5, "step": 6224, "total_num_tokens": 603921294.0, "z_loss": 0.0005291068227961659 }, { "copy_logits_max": -5.6014275550842285, "copy_logits_min": -687500032.0, "copy_num_tokens": 588.0625, "epoch": 1.2714322185345928, "gen_logits_max": 3.392444133758545, "gen_logits_mean": -17.034038543701172, "gen_logits_min": -29.343048095703125, "gen_logits_std": 3.1062681674957275, "gen_loss": 0.28333529829978943, "grad_norm": 0.35724001076409045, "learning_rate": 2.3066526315789476e-05, "loss": 0.277, "mean_copy_accuracy": 0.9962728023529053, "mean_gen_accuracy": 0.8785250931978226, "mean_token_accuracy": 0.907073900103569, "num_tokens": 604174051.0, "sample_num_tokens": 10073.75, "step": 6225, "total_num_tokens": 604214346.0, "z_loss": 0.0006330255419015884 }, { "copy_logits_max": -5.536409378051758, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.8125, "epoch": 1.2716364564717897, "gen_logits_max": 3.958986282348633, "gen_logits_mean": -15.51094913482666, "gen_logits_min": -27.79405975341797, "gen_logits_std": 3.091008186340332, "gen_loss": 0.24512124061584473, "grad_norm": 0.37986532594875305, "learning_rate": 2.3065263157894737e-05, "loss": 0.2727, "mean_copy_accuracy": 0.996984213590622, "mean_gen_accuracy": 0.8789512068033218, "mean_token_accuracy": 0.9067376405000687, "num_tokens": 604433830.0, "sample_num_tokens": 8039.5, "step": 6226, "total_num_tokens": 604465988.0, "z_loss": 0.0004964030813425779 }, { "copy_logits_max": -4.325815677642822, "copy_logits_min": -687500032.0, "copy_num_tokens": 808.1875, "epoch": 1.2718406944089864, "gen_logits_max": 2.432298421859741, "gen_logits_mean": -16.617145538330078, "gen_logits_min": -29.04718589782715, "gen_logits_std": 3.1177022457122803, "gen_loss": 0.2625383138656616, "grad_norm": 0.3981918038037573, "learning_rate": 2.3064e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9965464472770691, "mean_gen_accuracy": 0.8710830360651016, "mean_token_accuracy": 0.9070810228586197, "num_tokens": 604713331.0, "sample_num_tokens": 10104.75, "step": 6227, "total_num_tokens": 604753750.0, "z_loss": 0.000533689686562866 }, { "copy_logits_max": -7.196044445037842, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.375, "epoch": 1.2720449323461833, "gen_logits_max": 4.108574390411377, "gen_logits_mean": -16.258121490478516, "gen_logits_min": -28.536846160888672, "gen_logits_std": 3.094101905822754, "gen_loss": 0.26010817289352417, "grad_norm": 0.34440572339856873, "learning_rate": 2.3062736842105262e-05, "loss": 0.2614, "mean_copy_accuracy": 0.9963653534650803, "mean_gen_accuracy": 0.8859568536281586, "mean_token_accuracy": 0.911800816655159, "num_tokens": 605014302.0, "sample_num_tokens": 8494.0, "step": 6228, "total_num_tokens": 605048278.0, "z_loss": 0.0005217192228883505 }, { "copy_logits_max": -6.886828899383545, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.1875, "epoch": 1.2722491702833802, "gen_logits_max": 4.0994486808776855, "gen_logits_mean": -15.859989166259766, "gen_logits_min": -28.190221786499023, "gen_logits_std": 3.060041666030884, "gen_loss": 0.3073708713054657, "grad_norm": 0.42929050653091483, "learning_rate": 2.3061473684210527e-05, "loss": 0.308, "mean_copy_accuracy": 0.9951381236314774, "mean_gen_accuracy": 0.8660966753959656, "mean_token_accuracy": 0.8967801332473755, "num_tokens": 605259255.0, "sample_num_tokens": 7632.75, "step": 6229, "total_num_tokens": 605289786.0, "z_loss": 0.0005772329168394208 }, { "copy_logits_max": -6.8465118408203125, "copy_logits_min": -687500032.0, "copy_num_tokens": 488.0, "epoch": 1.272453408220577, "gen_logits_max": 4.3921427726745605, "gen_logits_mean": -16.48104476928711, "gen_logits_min": -28.680952072143555, "gen_logits_std": 3.0928146839141846, "gen_loss": 0.2691802978515625, "grad_norm": 0.36849694287217444, "learning_rate": 2.3060210526315788e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9966800510883331, "mean_gen_accuracy": 0.8803378939628601, "mean_token_accuracy": 0.9101801067590714, "num_tokens": 605521239.0, "sample_num_tokens": 8811.75, "step": 6230, "total_num_tokens": 605556486.0, "z_loss": 0.0005369047867134213 }, { "copy_logits_max": -7.018021106719971, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.8125, "epoch": 1.2726576461577739, "gen_logits_max": 5.100225448608398, "gen_logits_mean": -15.661052703857422, "gen_logits_min": -27.76447296142578, "gen_logits_std": 3.0677437782287598, "gen_loss": 0.29615023732185364, "grad_norm": 0.3697316388498363, "learning_rate": 2.3058947368421055e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9963863343000412, "mean_gen_accuracy": 0.8780664056539536, "mean_token_accuracy": 0.9059527963399887, "num_tokens": 605794949.0, "sample_num_tokens": 7530.75, "step": 6231, "total_num_tokens": 605825072.0, "z_loss": 0.0005522688152268529 }, { "copy_logits_max": -6.914295196533203, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.6875, "epoch": 1.2728618840949706, "gen_logits_max": 3.4757251739501953, "gen_logits_mean": -16.647619247436523, "gen_logits_min": -28.923545837402344, "gen_logits_std": 3.0946359634399414, "gen_loss": 0.23394301533699036, "grad_norm": 0.4202957394964271, "learning_rate": 2.3057684210526316e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9962990283966064, "mean_gen_accuracy": 0.8819601088762283, "mean_token_accuracy": 0.9076494723558426, "num_tokens": 606051431.0, "sample_num_tokens": 7918.25, "step": 6232, "total_num_tokens": 606083104.0, "z_loss": 0.00042891682824119925 }, { "copy_logits_max": -5.164377212524414, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.5625, "epoch": 1.2730661220321675, "gen_logits_max": 4.666441917419434, "gen_logits_mean": -14.541960716247559, "gen_logits_min": -26.781543731689453, "gen_logits_std": 3.035191535949707, "gen_loss": 0.29047292470932007, "grad_norm": 0.46232551889468027, "learning_rate": 2.305642105263158e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9963661879301071, "mean_gen_accuracy": 0.8695939630270004, "mean_token_accuracy": 0.9014599025249481, "num_tokens": 606324481.0, "sample_num_tokens": 9352.75, "step": 6233, "total_num_tokens": 606361892.0, "z_loss": 0.0005293336580507457 }, { "copy_logits_max": -5.528675079345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.6875, "epoch": 1.2732703599693642, "gen_logits_max": 4.441618919372559, "gen_logits_mean": -14.73770523071289, "gen_logits_min": -27.050661087036133, "gen_logits_std": 3.033712387084961, "gen_loss": 0.2743605375289917, "grad_norm": 0.3657666757613112, "learning_rate": 2.305515789473684e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9966602325439453, "mean_gen_accuracy": 0.8745696246623993, "mean_token_accuracy": 0.9038237780332565, "num_tokens": 606597613.0, "sample_num_tokens": 10518.25, "step": 6234, "total_num_tokens": 606639686.0, "z_loss": 0.0005007339059375226 }, { "copy_logits_max": -6.16165828704834, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.125, "epoch": 1.2734745979065611, "gen_logits_max": 4.266700744628906, "gen_logits_mean": -16.441781997680664, "gen_logits_min": -28.459333419799805, "gen_logits_std": 3.095425844192505, "gen_loss": 0.2265528291463852, "grad_norm": 0.33438569233639465, "learning_rate": 2.3053894736842106e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9968727678060532, "mean_gen_accuracy": 0.8791078627109528, "mean_token_accuracy": 0.9088215976953506, "num_tokens": 606886566.0, "sample_num_tokens": 7009.0, "step": 6235, "total_num_tokens": 606914602.0, "z_loss": 0.00043373569496907294 }, { "copy_logits_max": -3.141040563583374, "copy_logits_min": -750000000.0, "copy_num_tokens": 753.25, "epoch": 1.273678835843758, "gen_logits_max": 4.017948150634766, "gen_logits_mean": -15.968724250793457, "gen_logits_min": -28.833627700805664, "gen_logits_std": 3.135080575942993, "gen_loss": 0.2183813452720642, "grad_norm": 0.3757010922108557, "learning_rate": 2.3052631578947367e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9966188371181488, "mean_gen_accuracy": 0.8730209320783615, "mean_token_accuracy": 0.908107340335846, "num_tokens": 607183072.0, "sample_num_tokens": 10030.5, "step": 6236, "total_num_tokens": 607223194.0, "z_loss": 0.00048685044748708606 }, { "copy_logits_max": -4.785179138183594, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 1.2738830737809548, "gen_logits_max": 4.705444812774658, "gen_logits_mean": -15.50307846069336, "gen_logits_min": -27.462791442871094, "gen_logits_std": 3.0756568908691406, "gen_loss": 0.26568907499313354, "grad_norm": 0.3910543182308888, "learning_rate": 2.305136842105263e-05, "loss": 0.276, "mean_copy_accuracy": 0.9949545562267303, "mean_gen_accuracy": 0.8849720805883408, "mean_token_accuracy": 0.9069832414388657, "num_tokens": 607434138.0, "sample_num_tokens": 8182.5, "step": 6237, "total_num_tokens": 607466868.0, "z_loss": 0.0005706733209080994 }, { "copy_logits_max": -1.9326633214950562, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.5625, "epoch": 1.2740873117181517, "gen_logits_max": 4.47138786315918, "gen_logits_mean": -15.325082778930664, "gen_logits_min": -27.374935150146484, "gen_logits_std": 3.0676846504211426, "gen_loss": 0.3233978748321533, "grad_norm": 0.3804706753084739, "learning_rate": 2.3050105263157895e-05, "loss": 0.3068, "mean_copy_accuracy": 0.9950735121965408, "mean_gen_accuracy": 0.8647709488868713, "mean_token_accuracy": 0.8954419195652008, "num_tokens": 607712403.0, "sample_num_tokens": 8144.25, "step": 6238, "total_num_tokens": 607744980.0, "z_loss": 0.0006313394987955689 }, { "copy_logits_max": -3.317708730697632, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.1875, "epoch": 1.2742915496553484, "gen_logits_max": 3.9325666427612305, "gen_logits_mean": -14.800786972045898, "gen_logits_min": -26.972652435302734, "gen_logits_std": 3.052110195159912, "gen_loss": 0.24370144307613373, "grad_norm": 0.40708857619191685, "learning_rate": 2.304884210526316e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9968220442533493, "mean_gen_accuracy": 0.8748605400323868, "mean_token_accuracy": 0.9041376560926437, "num_tokens": 607962597.0, "sample_num_tokens": 8109.75, "step": 6239, "total_num_tokens": 607995036.0, "z_loss": 0.0004556708154268563 }, { "copy_logits_max": -4.86740255355835, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.4375, "epoch": 1.2744957875925453, "gen_logits_max": 3.3650076389312744, "gen_logits_mean": -16.64532470703125, "gen_logits_min": -28.71428680419922, "gen_logits_std": 3.104243278503418, "gen_loss": 0.2883749008178711, "grad_norm": 0.34495089876819734, "learning_rate": 2.3047578947368424e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9958388805389404, "mean_gen_accuracy": 0.8805449306964874, "mean_token_accuracy": 0.9086426347494125, "num_tokens": 608248776.0, "sample_num_tokens": 7897.5, "step": 6240, "total_num_tokens": 608280366.0, "z_loss": 0.0005472637130878866 }, { "copy_logits_max": -3.5075464248657227, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.1875, "epoch": 1.274700025529742, "gen_logits_max": 3.867232084274292, "gen_logits_mean": -15.465396881103516, "gen_logits_min": -28.132762908935547, "gen_logits_std": 3.09238338470459, "gen_loss": 0.24094948172569275, "grad_norm": 0.39177346171089655, "learning_rate": 2.3046315789473685e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9963044226169586, "mean_gen_accuracy": 0.8782270848751068, "mean_token_accuracy": 0.9100334495306015, "num_tokens": 608522939.0, "sample_num_tokens": 7760.25, "step": 6241, "total_num_tokens": 608553980.0, "z_loss": 0.0004387497319839895 }, { "copy_logits_max": -4.748795032501221, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.6875, "epoch": 1.274904263466939, "gen_logits_max": 4.190165042877197, "gen_logits_mean": -16.09176254272461, "gen_logits_min": -28.488845825195312, "gen_logits_std": 3.0658321380615234, "gen_loss": 0.2909879684448242, "grad_norm": 0.3838052134540305, "learning_rate": 2.304505263157895e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9965088218450546, "mean_gen_accuracy": 0.8755525201559067, "mean_token_accuracy": 0.906066507101059, "num_tokens": 608793322.0, "sample_num_tokens": 6971.5, "step": 6242, "total_num_tokens": 608821208.0, "z_loss": 0.0004844490613322705 }, { "copy_logits_max": -6.420568943023682, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.3125, "epoch": 1.275108501404136, "gen_logits_max": 3.932838201522827, "gen_logits_mean": -15.838526725769043, "gen_logits_min": -27.77214813232422, "gen_logits_std": 3.0775609016418457, "gen_loss": 0.28484460711479187, "grad_norm": 0.42749625112807843, "learning_rate": 2.304378947368421e-05, "loss": 0.3025, "mean_copy_accuracy": 0.9963403940200806, "mean_gen_accuracy": 0.8700473457574844, "mean_token_accuracy": 0.8961988240480423, "num_tokens": 609047698.0, "sample_num_tokens": 7947.0, "step": 6243, "total_num_tokens": 609079486.0, "z_loss": 0.00048582852468825877 }, { "copy_logits_max": -7.3167724609375, "copy_logits_min": -750000064.0, "copy_num_tokens": 318.375, "epoch": 1.2753127393413326, "gen_logits_max": 3.44828200340271, "gen_logits_mean": -17.645957946777344, "gen_logits_min": -29.186946868896484, "gen_logits_std": 3.1061947345733643, "gen_loss": 0.2805956304073334, "grad_norm": 0.34785835516016506, "learning_rate": 2.3042526315789474e-05, "loss": 0.2691, "mean_copy_accuracy": 0.9970064461231232, "mean_gen_accuracy": 0.8756027370691299, "mean_token_accuracy": 0.9067709147930145, "num_tokens": 609324143.0, "sample_num_tokens": 6935.75, "step": 6244, "total_num_tokens": 609351886.0, "z_loss": 0.0004579500528052449 }, { "copy_logits_max": -7.635179042816162, "copy_logits_min": -750000000.0, "copy_num_tokens": 227.0, "epoch": 1.2755169772785295, "gen_logits_max": 6.198745250701904, "gen_logits_mean": -15.32522201538086, "gen_logits_min": -27.381406784057617, "gen_logits_std": 3.0799453258514404, "gen_loss": 0.2593350410461426, "grad_norm": 0.37383061985905164, "learning_rate": 2.3041263157894735e-05, "loss": 0.2867, "mean_copy_accuracy": 0.996068924665451, "mean_gen_accuracy": 0.8803983330726624, "mean_token_accuracy": 0.9025406092405319, "num_tokens": 609591933.0, "sample_num_tokens": 6487.25, "step": 6245, "total_num_tokens": 609617882.0, "z_loss": 0.00047505402471870184 }, { "copy_logits_max": -4.925449371337891, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.4375, "epoch": 1.2757212152157265, "gen_logits_max": 3.6681623458862305, "gen_logits_mean": -16.242958068847656, "gen_logits_min": -28.324003219604492, "gen_logits_std": 3.059962749481201, "gen_loss": 0.28341835737228394, "grad_norm": 0.34192029695677284, "learning_rate": 2.304e-05, "loss": 0.2565, "mean_copy_accuracy": 0.9970692545175552, "mean_gen_accuracy": 0.8823494166135788, "mean_token_accuracy": 0.9126093089580536, "num_tokens": 609873963.0, "sample_num_tokens": 8484.75, "step": 6246, "total_num_tokens": 609907902.0, "z_loss": 0.0005592094967141747 }, { "copy_logits_max": -5.822381019592285, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.5, "epoch": 1.2759254531529232, "gen_logits_max": 3.529359817504883, "gen_logits_mean": -17.60162353515625, "gen_logits_min": -29.692081451416016, "gen_logits_std": 3.1469950675964355, "gen_loss": 0.26931774616241455, "grad_norm": 0.37905098576166063, "learning_rate": 2.3038736842105264e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9962574690580368, "mean_gen_accuracy": 0.8723836988210678, "mean_token_accuracy": 0.9054690152406693, "num_tokens": 610145816.0, "sample_num_tokens": 8686.5, "step": 6247, "total_num_tokens": 610180562.0, "z_loss": 0.0006113287527114153 }, { "copy_logits_max": -2.3087120056152344, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.5, "epoch": 1.2761296910901199, "gen_logits_max": 4.1823530197143555, "gen_logits_mean": -16.166383743286133, "gen_logits_min": -28.097957611083984, "gen_logits_std": 3.082008123397827, "gen_loss": 0.3198063373565674, "grad_norm": 0.3481047191859449, "learning_rate": 2.3037473684210528e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9961087852716446, "mean_gen_accuracy": 0.8759984225034714, "mean_token_accuracy": 0.9066032469272614, "num_tokens": 610423021.0, "sample_num_tokens": 7663.75, "step": 6248, "total_num_tokens": 610453676.0, "z_loss": 0.0006393744261004031 }, { "copy_logits_max": -2.5780680179595947, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.75, "epoch": 1.2763339290273168, "gen_logits_max": 4.4722490310668945, "gen_logits_mean": -15.49843692779541, "gen_logits_min": -27.69725227355957, "gen_logits_std": 3.0675997734069824, "gen_loss": 0.2679262161254883, "grad_norm": 0.40887447460041965, "learning_rate": 2.303621052631579e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9967092871665955, "mean_gen_accuracy": 0.8750581294298172, "mean_token_accuracy": 0.9012622982263565, "num_tokens": 610677563.0, "sample_num_tokens": 8338.25, "step": 6249, "total_num_tokens": 610710916.0, "z_loss": 0.0005414404440671206 }, { "copy_logits_max": -4.662718772888184, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.5625, "epoch": 1.2765381669645137, "gen_logits_max": 3.9335649013519287, "gen_logits_mean": -16.829830169677734, "gen_logits_min": -28.98162269592285, "gen_logits_std": 3.0843071937561035, "gen_loss": 0.2861916124820709, "grad_norm": 0.3694520005120793, "learning_rate": 2.3034947368421053e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9967564344406128, "mean_gen_accuracy": 0.8732591867446899, "mean_token_accuracy": 0.9035551995038986, "num_tokens": 610950603.0, "sample_num_tokens": 8339.25, "step": 6250, "total_num_tokens": 610983960.0, "z_loss": 0.0005273744463920593 }, { "copy_logits_max": -7.533012390136719, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.4375, "epoch": 1.2767424049017104, "gen_logits_max": 5.16055154800415, "gen_logits_mean": -16.57537841796875, "gen_logits_min": -28.459983825683594, "gen_logits_std": 3.078566074371338, "gen_loss": 0.285698264837265, "grad_norm": 0.38531954589870643, "learning_rate": 2.3033684210526318e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9955673962831497, "mean_gen_accuracy": 0.877587765455246, "mean_token_accuracy": 0.9052073359489441, "num_tokens": 611234576.0, "sample_num_tokens": 7808.5, "step": 6251, "total_num_tokens": 611265810.0, "z_loss": 0.0004468101542443037 }, { "copy_logits_max": -5.940513610839844, "copy_logits_min": -750000064.0, "copy_num_tokens": 584.1875, "epoch": 1.2769466428389074, "gen_logits_max": 2.9312424659729004, "gen_logits_mean": -17.928428649902344, "gen_logits_min": -30.1766357421875, "gen_logits_std": 3.129706859588623, "gen_loss": 0.24529510736465454, "grad_norm": 0.4583751890568927, "learning_rate": 2.303242105263158e-05, "loss": 0.2469, "mean_copy_accuracy": 0.9969880282878876, "mean_gen_accuracy": 0.8885687440633774, "mean_token_accuracy": 0.9154553711414337, "num_tokens": 611497803.0, "sample_num_tokens": 9691.25, "step": 6252, "total_num_tokens": 611536568.0, "z_loss": 0.0004208640893921256 }, { "copy_logits_max": -2.905318260192871, "copy_logits_min": -750000064.0, "copy_num_tokens": 449.3125, "epoch": 1.2771508807761043, "gen_logits_max": 3.5010933876037598, "gen_logits_mean": -16.71115493774414, "gen_logits_min": -28.94056510925293, "gen_logits_std": 3.1054372787475586, "gen_loss": 0.3100596070289612, "grad_norm": 0.4010454613682843, "learning_rate": 2.3031157894736843e-05, "loss": 0.3043, "mean_copy_accuracy": 0.9953648597002029, "mean_gen_accuracy": 0.8690285831689835, "mean_token_accuracy": 0.8975147008895874, "num_tokens": 611749194.0, "sample_num_tokens": 7568.5, "step": 6253, "total_num_tokens": 611779468.0, "z_loss": 0.000528528296854347 }, { "copy_logits_max": -4.44008731842041, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.375, "epoch": 1.277355118713301, "gen_logits_max": 3.5029938220977783, "gen_logits_mean": -17.283584594726562, "gen_logits_min": -29.50751304626465, "gen_logits_std": 3.1305060386657715, "gen_loss": 0.29359525442123413, "grad_norm": 0.3799583835145937, "learning_rate": 2.3029894736842104e-05, "loss": 0.289, "mean_copy_accuracy": 0.9955603331327438, "mean_gen_accuracy": 0.8747880458831787, "mean_token_accuracy": 0.9016250371932983, "num_tokens": 612008751.0, "sample_num_tokens": 7790.75, "step": 6254, "total_num_tokens": 612039914.0, "z_loss": 0.0005360044306144118 }, { "copy_logits_max": -5.140963077545166, "copy_logits_min": -750000064.0, "copy_num_tokens": 563.4375, "epoch": 1.277559356650498, "gen_logits_max": 3.9694161415100098, "gen_logits_mean": -16.544918060302734, "gen_logits_min": -29.05048370361328, "gen_logits_std": 3.116386890411377, "gen_loss": 0.2770587205886841, "grad_norm": 0.3618565103099737, "learning_rate": 2.302863157894737e-05, "loss": 0.267, "mean_copy_accuracy": 0.9968844056129456, "mean_gen_accuracy": 0.8810919970273972, "mean_token_accuracy": 0.9082980453968048, "num_tokens": 612289269.0, "sample_num_tokens": 10386.25, "step": 6255, "total_num_tokens": 612330814.0, "z_loss": 0.0005001031095162034 }, { "copy_logits_max": -3.6171813011169434, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.6875, "epoch": 1.2777635945876946, "gen_logits_max": 4.351621627807617, "gen_logits_mean": -15.63820743560791, "gen_logits_min": -28.05792999267578, "gen_logits_std": 3.1155123710632324, "gen_loss": 0.2745549976825714, "grad_norm": 0.37108320367903364, "learning_rate": 2.3027368421052633e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9970106333494186, "mean_gen_accuracy": 0.8739499151706696, "mean_token_accuracy": 0.9036286622285843, "num_tokens": 612569063.0, "sample_num_tokens": 8225.25, "step": 6256, "total_num_tokens": 612601964.0, "z_loss": 0.0005687373923137784 }, { "copy_logits_max": -5.573636531829834, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.9375, "epoch": 1.2779678325248915, "gen_logits_max": 6.147666931152344, "gen_logits_mean": -14.110713958740234, "gen_logits_min": -26.03848648071289, "gen_logits_std": 3.0530905723571777, "gen_loss": 0.3024507164955139, "grad_norm": 0.3544353991009004, "learning_rate": 2.3026105263157897e-05, "loss": 0.2894, "mean_copy_accuracy": 0.996528759598732, "mean_gen_accuracy": 0.876460999250412, "mean_token_accuracy": 0.9016155302524567, "num_tokens": 612832131.0, "sample_num_tokens": 7092.75, "step": 6257, "total_num_tokens": 612860502.0, "z_loss": 0.0006429035565815866 }, { "copy_logits_max": -6.696051597595215, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 1.2781720704620883, "gen_logits_max": 4.322446823120117, "gen_logits_mean": -16.802249908447266, "gen_logits_min": -28.506589889526367, "gen_logits_std": 3.096691131591797, "gen_loss": 0.3089791536331177, "grad_norm": 0.37831524085351226, "learning_rate": 2.3024842105263158e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9965565353631973, "mean_gen_accuracy": 0.8674996048212051, "mean_token_accuracy": 0.8972359895706177, "num_tokens": 613097988.0, "sample_num_tokens": 8615.5, "step": 6258, "total_num_tokens": 613132450.0, "z_loss": 0.0005366526311263442 }, { "copy_logits_max": -6.803554058074951, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.125, "epoch": 1.2783763083992852, "gen_logits_max": 5.122941017150879, "gen_logits_mean": -16.084144592285156, "gen_logits_min": -28.296630859375, "gen_logits_std": 3.1020472049713135, "gen_loss": 0.3080049753189087, "grad_norm": 0.37121601800116716, "learning_rate": 2.3023578947368422e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9962101131677628, "mean_gen_accuracy": 0.8728267550468445, "mean_token_accuracy": 0.9046269208192825, "num_tokens": 613366035.0, "sample_num_tokens": 7960.25, "step": 6259, "total_num_tokens": 613397876.0, "z_loss": 0.000642231316305697 }, { "copy_logits_max": -5.527937889099121, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.5625, "epoch": 1.278580546336482, "gen_logits_max": 3.7044837474823, "gen_logits_mean": -17.131505966186523, "gen_logits_min": -29.057451248168945, "gen_logits_std": 3.106109142303467, "gen_loss": 0.2881092429161072, "grad_norm": 0.387436229685494, "learning_rate": 2.3022315789473683e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9950285404920578, "mean_gen_accuracy": 0.877162903547287, "mean_token_accuracy": 0.9048573821783066, "num_tokens": 613644260.0, "sample_num_tokens": 8196.5, "step": 6260, "total_num_tokens": 613677046.0, "z_loss": 0.0005550950299948454 }, { "copy_logits_max": -6.888518333435059, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.25, "epoch": 1.2787847842736788, "gen_logits_max": 4.626734256744385, "gen_logits_mean": -15.576400756835938, "gen_logits_min": -27.686330795288086, "gen_logits_std": 3.0752015113830566, "gen_loss": 0.3323955833911896, "grad_norm": 0.3914724914522144, "learning_rate": 2.3021052631578947e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9965357631444931, "mean_gen_accuracy": 0.8724091798067093, "mean_token_accuracy": 0.8980817198753357, "num_tokens": 613905213.0, "sample_num_tokens": 6758.75, "step": 6261, "total_num_tokens": 613932248.0, "z_loss": 0.0006175730959512293 }, { "copy_logits_max": -5.76558256149292, "copy_logits_min": -750000064.0, "copy_num_tokens": 267.125, "epoch": 1.2789890222108757, "gen_logits_max": 5.181825637817383, "gen_logits_mean": -15.539316177368164, "gen_logits_min": -27.58731460571289, "gen_logits_std": 3.0361328125, "gen_loss": 0.3430202007293701, "grad_norm": 0.4064864546722412, "learning_rate": 2.3019789473684208e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9958063811063766, "mean_gen_accuracy": 0.8760956972837448, "mean_token_accuracy": 0.9010879546403885, "num_tokens": 614148962.0, "sample_num_tokens": 6999.0, "step": 6262, "total_num_tokens": 614176958.0, "z_loss": 0.000643073464743793 }, { "copy_logits_max": -6.639573574066162, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.625, "epoch": 1.2791932601480724, "gen_logits_max": 4.382380962371826, "gen_logits_mean": -15.192225456237793, "gen_logits_min": -27.317676544189453, "gen_logits_std": 2.988049030303955, "gen_loss": 0.25152939558029175, "grad_norm": 0.472693825797102, "learning_rate": 2.3018526315789476e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9963919520378113, "mean_gen_accuracy": 0.8782642036676407, "mean_token_accuracy": 0.909434974193573, "num_tokens": 614432821.0, "sample_num_tokens": 10007.25, "step": 6263, "total_num_tokens": 614472850.0, "z_loss": 0.0004525087424553931 }, { "copy_logits_max": -5.611623764038086, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.6875, "epoch": 1.2793974980852694, "gen_logits_max": 3.5705320835113525, "gen_logits_mean": -16.851905822753906, "gen_logits_min": -29.33907699584961, "gen_logits_std": 3.0971391201019287, "gen_loss": 0.2657322287559509, "grad_norm": 0.39110106926248556, "learning_rate": 2.301726315789474e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9961863905191422, "mean_gen_accuracy": 0.8797065764665604, "mean_token_accuracy": 0.9055353552103043, "num_tokens": 614712100.0, "sample_num_tokens": 9329.5, "step": 6264, "total_num_tokens": 614749418.0, "z_loss": 0.0005305691156536341 }, { "copy_logits_max": -4.905142307281494, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.1875, "epoch": 1.279601736022466, "gen_logits_max": 3.3101272583007812, "gen_logits_mean": -16.898456573486328, "gen_logits_min": -28.748844146728516, "gen_logits_std": 3.0618629455566406, "gen_loss": 0.2890070676803589, "grad_norm": 0.40405422727979273, "learning_rate": 2.3016e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9966579675674438, "mean_gen_accuracy": 0.8712680488824844, "mean_token_accuracy": 0.9013949185609818, "num_tokens": 614974488.0, "sample_num_tokens": 8738.5, "step": 6265, "total_num_tokens": 615009442.0, "z_loss": 0.0005332103464752436 }, { "copy_logits_max": -5.131765842437744, "copy_logits_min": -687500032.0, "copy_num_tokens": 369.875, "epoch": 1.279805973959663, "gen_logits_max": 4.869373321533203, "gen_logits_mean": -15.207836151123047, "gen_logits_min": -27.125516891479492, "gen_logits_std": 3.00797700881958, "gen_loss": 0.29123127460479736, "grad_norm": 0.3610551813941566, "learning_rate": 2.3014736842105266e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9963840544223785, "mean_gen_accuracy": 0.8732459396123886, "mean_token_accuracy": 0.904133602976799, "num_tokens": 615250663.0, "sample_num_tokens": 8082.25, "step": 6266, "total_num_tokens": 615282992.0, "z_loss": 0.0005716418963856995 }, { "copy_logits_max": -6.624868869781494, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.125, "epoch": 1.28001021189686, "gen_logits_max": 3.7842624187469482, "gen_logits_mean": -16.579450607299805, "gen_logits_min": -28.536136627197266, "gen_logits_std": 3.084721088409424, "gen_loss": 0.2761440873146057, "grad_norm": 0.3828030886037395, "learning_rate": 2.3013473684210526e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9962607026100159, "mean_gen_accuracy": 0.8735407292842865, "mean_token_accuracy": 0.9021811932325363, "num_tokens": 615512280.0, "sample_num_tokens": 8991.5, "step": 6267, "total_num_tokens": 615548246.0, "z_loss": 0.0005156247643753886 }, { "copy_logits_max": -5.775361061096191, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.0, "epoch": 1.2802144498340566, "gen_logits_max": 5.411093711853027, "gen_logits_mean": -14.830582618713379, "gen_logits_min": -27.13920021057129, "gen_logits_std": 3.0355541706085205, "gen_loss": 0.3083399534225464, "grad_norm": 0.36832384918997024, "learning_rate": 2.301221052631579e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9962061792612076, "mean_gen_accuracy": 0.8704970479011536, "mean_token_accuracy": 0.9007778465747833, "num_tokens": 615811978.0, "sample_num_tokens": 8127.5, "step": 6268, "total_num_tokens": 615844488.0, "z_loss": 0.0004921000800095499 }, { "copy_logits_max": -6.733202934265137, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.3125, "epoch": 1.2804186877712536, "gen_logits_max": 4.149435520172119, "gen_logits_mean": -16.290864944458008, "gen_logits_min": -28.510189056396484, "gen_logits_std": 3.114259719848633, "gen_loss": 0.2911764979362488, "grad_norm": 0.39223064148163894, "learning_rate": 2.301094736842105e-05, "loss": 0.2994, "mean_copy_accuracy": 0.9959698617458344, "mean_gen_accuracy": 0.870924785733223, "mean_token_accuracy": 0.9008840620517731, "num_tokens": 616071302.0, "sample_num_tokens": 8818.0, "step": 6269, "total_num_tokens": 616106574.0, "z_loss": 0.0005358766065910459 }, { "copy_logits_max": -3.869539737701416, "copy_logits_min": -750000128.0, "copy_num_tokens": 607.9375, "epoch": 1.2806229257084503, "gen_logits_max": 3.3278141021728516, "gen_logits_mean": -15.916640281677246, "gen_logits_min": -28.4364013671875, "gen_logits_std": 3.104785442352295, "gen_loss": 0.2304273396730423, "grad_norm": 0.3966776011601316, "learning_rate": 2.3009684210526316e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9966942965984344, "mean_gen_accuracy": 0.8796731978654861, "mean_token_accuracy": 0.9084133207798004, "num_tokens": 616328908.0, "sample_num_tokens": 8694.5, "step": 6270, "total_num_tokens": 616363686.0, "z_loss": 0.00046287314034998417 }, { "copy_logits_max": -5.404136657714844, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.3125, "epoch": 1.2808271636456472, "gen_logits_max": 4.159698486328125, "gen_logits_mean": -16.24364471435547, "gen_logits_min": -28.480436325073242, "gen_logits_std": 3.106569290161133, "gen_loss": 0.26042574644088745, "grad_norm": 0.3477133567498367, "learning_rate": 2.3008421052631577e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9968554675579071, "mean_gen_accuracy": 0.8803726434707642, "mean_token_accuracy": 0.9071919769048691, "num_tokens": 616595548.0, "sample_num_tokens": 8340.5, "step": 6271, "total_num_tokens": 616628910.0, "z_loss": 0.0005161272711120546 }, { "copy_logits_max": -4.090299606323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.5625, "epoch": 1.281031401582844, "gen_logits_max": 3.4360053539276123, "gen_logits_mean": -16.637866973876953, "gen_logits_min": -28.831924438476562, "gen_logits_std": 3.128756046295166, "gen_loss": 0.25721192359924316, "grad_norm": 0.396445429381801, "learning_rate": 2.3007157894736845e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9969265013933182, "mean_gen_accuracy": 0.8713475465774536, "mean_token_accuracy": 0.9041656702756882, "num_tokens": 616865066.0, "sample_num_tokens": 8162.0, "step": 6272, "total_num_tokens": 616897714.0, "z_loss": 0.0004603127599693835 }, { "copy_logits_max": -7.327569007873535, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.6875, "epoch": 1.2812356395200408, "gen_logits_max": 4.591832160949707, "gen_logits_mean": -16.158302307128906, "gen_logits_min": -28.533130645751953, "gen_logits_std": 3.0989198684692383, "gen_loss": 0.27837279438972473, "grad_norm": 0.3622414779631956, "learning_rate": 2.3005894736842106e-05, "loss": 0.2768, "mean_copy_accuracy": 0.996183380484581, "mean_gen_accuracy": 0.878599613904953, "mean_token_accuracy": 0.90669284760952, "num_tokens": 617131932.0, "sample_num_tokens": 6851.0, "step": 6273, "total_num_tokens": 617159336.0, "z_loss": 0.0005223978660069406 }, { "copy_logits_max": -1.5004808902740479, "copy_logits_min": -750000000.0, "copy_num_tokens": 583.75, "epoch": 1.2814398774572378, "gen_logits_max": 3.8420865535736084, "gen_logits_mean": -15.402878761291504, "gen_logits_min": -27.81315803527832, "gen_logits_std": 3.0885438919067383, "gen_loss": 0.2693478763103485, "grad_norm": 0.4626093961116295, "learning_rate": 2.300463157894737e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9939181208610535, "mean_gen_accuracy": 0.8695363700389862, "mean_token_accuracy": 0.9000700414180756, "num_tokens": 617397502.0, "sample_num_tokens": 8995.5, "step": 6274, "total_num_tokens": 617433484.0, "z_loss": 0.0004595615610014647 }, { "copy_logits_max": -4.350430488586426, "copy_logits_min": -750000064.0, "copy_num_tokens": 524.875, "epoch": 1.2816441153944345, "gen_logits_max": 3.4460275173187256, "gen_logits_mean": -16.149707794189453, "gen_logits_min": -28.76982879638672, "gen_logits_std": 3.1050333976745605, "gen_loss": 0.2850196361541748, "grad_norm": 0.3758304811270749, "learning_rate": 2.300336842105263e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9962225258350372, "mean_gen_accuracy": 0.8664807379245758, "mean_token_accuracy": 0.9004770219326019, "num_tokens": 617677202.0, "sample_num_tokens": 8596.5, "step": 6275, "total_num_tokens": 617711588.0, "z_loss": 0.0005015817005187273 }, { "copy_logits_max": -2.960686683654785, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.75, "epoch": 1.2818483533316314, "gen_logits_max": 2.984294891357422, "gen_logits_mean": -17.636938095092773, "gen_logits_min": -29.865211486816406, "gen_logits_std": 3.158322334289551, "gen_loss": 0.29195213317871094, "grad_norm": 0.39425404009460263, "learning_rate": 2.3002105263157895e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9971487522125244, "mean_gen_accuracy": 0.8693124800920486, "mean_token_accuracy": 0.902296170592308, "num_tokens": 617962265.0, "sample_num_tokens": 8446.75, "step": 6276, "total_num_tokens": 617996052.0, "z_loss": 0.00046127449604682624 }, { "copy_logits_max": -2.7835450172424316, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.8125, "epoch": 1.2820525912688283, "gen_logits_max": 3.5598371028900146, "gen_logits_mean": -16.44353485107422, "gen_logits_min": -28.840015411376953, "gen_logits_std": 3.1490466594696045, "gen_loss": 0.2502340078353882, "grad_norm": 0.3887956620561108, "learning_rate": 2.300084210526316e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9972385615110397, "mean_gen_accuracy": 0.8812234252691269, "mean_token_accuracy": 0.9119446277618408, "num_tokens": 618244001.0, "sample_num_tokens": 8511.25, "step": 6277, "total_num_tokens": 618278046.0, "z_loss": 0.0004601959662977606 }, { "copy_logits_max": -5.054720878601074, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.75, "epoch": 1.282256829206025, "gen_logits_max": 3.1461257934570312, "gen_logits_mean": -18.03658676147461, "gen_logits_min": -30.186861038208008, "gen_logits_std": 3.1429412364959717, "gen_loss": 0.2841934263706207, "grad_norm": 0.37317357324566675, "learning_rate": 2.299957894736842e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9964732825756073, "mean_gen_accuracy": 0.8776329159736633, "mean_token_accuracy": 0.9064958244562149, "num_tokens": 618519805.0, "sample_num_tokens": 8159.25, "step": 6278, "total_num_tokens": 618552442.0, "z_loss": 0.0004817901353817433 }, { "copy_logits_max": -4.803238868713379, "copy_logits_min": -750000064.0, "copy_num_tokens": 455.1875, "epoch": 1.2824610671432217, "gen_logits_max": 3.680795192718506, "gen_logits_mean": -16.714427947998047, "gen_logits_min": -29.41132354736328, "gen_logits_std": 3.1415085792541504, "gen_loss": 0.2380252480506897, "grad_norm": 0.3877299880448578, "learning_rate": 2.2998315789473685e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9971576035022736, "mean_gen_accuracy": 0.878868043422699, "mean_token_accuracy": 0.911756306886673, "num_tokens": 618793393.0, "sample_num_tokens": 7857.75, "step": 6279, "total_num_tokens": 618824824.0, "z_loss": 0.000390248722396791 }, { "copy_logits_max": -6.522370338439941, "copy_logits_min": -750000000.0, "copy_num_tokens": 251.9375, "epoch": 1.2826653050804186, "gen_logits_max": 4.070901393890381, "gen_logits_mean": -16.98116683959961, "gen_logits_min": -29.024452209472656, "gen_logits_std": 3.117615222930908, "gen_loss": 0.2897997498512268, "grad_norm": 0.3904519359487309, "learning_rate": 2.299705263157895e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9960435479879379, "mean_gen_accuracy": 0.8816128671169281, "mean_token_accuracy": 0.9086011499166489, "num_tokens": 619050204.0, "sample_num_tokens": 6559.0, "step": 6280, "total_num_tokens": 619076440.0, "z_loss": 0.0005075057270005345 }, { "copy_logits_max": -3.9638776779174805, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.8125, "epoch": 1.2828695430176156, "gen_logits_max": 3.2938506603240967, "gen_logits_mean": -17.08121109008789, "gen_logits_min": -29.759479522705078, "gen_logits_std": 3.139303684234619, "gen_loss": 0.30067023634910583, "grad_norm": 0.3967186052709206, "learning_rate": 2.2995789473684213e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9957806468009949, "mean_gen_accuracy": 0.864568829536438, "mean_token_accuracy": 0.8983200639486313, "num_tokens": 619304104.0, "sample_num_tokens": 9167.0, "step": 6281, "total_num_tokens": 619340772.0, "z_loss": 0.0005393215687945485 }, { "copy_logits_max": -5.549570083618164, "copy_logits_min": -750000128.0, "copy_num_tokens": 589.6875, "epoch": 1.2830737809548123, "gen_logits_max": 4.238630294799805, "gen_logits_mean": -15.801848411560059, "gen_logits_min": -28.285924911499023, "gen_logits_std": 3.120500087738037, "gen_loss": 0.2792397141456604, "grad_norm": 0.3909168200934939, "learning_rate": 2.2994526315789474e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9962187111377716, "mean_gen_accuracy": 0.876327320933342, "mean_token_accuracy": 0.9052970707416534, "num_tokens": 619580975.0, "sample_num_tokens": 9592.75, "step": 6282, "total_num_tokens": 619619346.0, "z_loss": 0.0005219988524913788 }, { "copy_logits_max": -4.745830535888672, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.625, "epoch": 1.2832780188920092, "gen_logits_max": 3.661013126373291, "gen_logits_mean": -17.157176971435547, "gen_logits_min": -29.14974594116211, "gen_logits_std": 3.1108710765838623, "gen_loss": 0.312364399433136, "grad_norm": 0.39337471543361335, "learning_rate": 2.299326315789474e-05, "loss": 0.2774, "mean_copy_accuracy": 0.996539443731308, "mean_gen_accuracy": 0.8743695914745331, "mean_token_accuracy": 0.9063709825277328, "num_tokens": 619867106.0, "sample_num_tokens": 8630.0, "step": 6283, "total_num_tokens": 619901626.0, "z_loss": 0.0006148719694465399 }, { "copy_logits_max": -4.742023468017578, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.875, "epoch": 1.2834822568292061, "gen_logits_max": 3.645751476287842, "gen_logits_mean": -17.003910064697266, "gen_logits_min": -29.042451858520508, "gen_logits_std": 3.112332344055176, "gen_loss": 0.27074897289276123, "grad_norm": 0.37598649765238207, "learning_rate": 2.2992e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9971125721931458, "mean_gen_accuracy": 0.8768827468156815, "mean_token_accuracy": 0.9073138236999512, "num_tokens": 620144386.0, "sample_num_tokens": 8418.5, "step": 6284, "total_num_tokens": 620178060.0, "z_loss": 0.00048344366950914264 }, { "copy_logits_max": -4.273883819580078, "copy_logits_min": -687500032.0, "copy_num_tokens": 500.5625, "epoch": 1.2836864947664028, "gen_logits_max": 3.3951854705810547, "gen_logits_mean": -16.882862091064453, "gen_logits_min": -29.43938446044922, "gen_logits_std": 3.1245555877685547, "gen_loss": 0.25835901498794556, "grad_norm": 0.33598395113866214, "learning_rate": 2.2990736842105264e-05, "loss": 0.2643, "mean_copy_accuracy": 0.995809480547905, "mean_gen_accuracy": 0.8832370638847351, "mean_token_accuracy": 0.9112928956747055, "num_tokens": 620434789.0, "sample_num_tokens": 8856.25, "step": 6285, "total_num_tokens": 620470214.0, "z_loss": 0.00046638568164780736 }, { "copy_logits_max": -4.762965202331543, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.0, "epoch": 1.2838907327035998, "gen_logits_max": 3.7985353469848633, "gen_logits_mean": -15.473767280578613, "gen_logits_min": -28.412132263183594, "gen_logits_std": 3.1078896522521973, "gen_loss": 0.2769101858139038, "grad_norm": 0.4267125936923811, "learning_rate": 2.2989473684210525e-05, "loss": 0.3084, "mean_copy_accuracy": 0.9956074804067612, "mean_gen_accuracy": 0.865091398358345, "mean_token_accuracy": 0.8962405920028687, "num_tokens": 620713274.0, "sample_num_tokens": 9792.5, "step": 6286, "total_num_tokens": 620752444.0, "z_loss": 0.0004970190348103642 }, { "copy_logits_max": -4.518726825714111, "copy_logits_min": -687500032.0, "copy_num_tokens": 561.125, "epoch": 1.2840949706407965, "gen_logits_max": 3.494007110595703, "gen_logits_mean": -16.76943016052246, "gen_logits_min": -28.764652252197266, "gen_logits_std": 3.0914254188537598, "gen_loss": 0.303753525018692, "grad_norm": 0.37227752711877843, "learning_rate": 2.298821052631579e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9962936192750931, "mean_gen_accuracy": 0.8666508346796036, "mean_token_accuracy": 0.8974372297525406, "num_tokens": 621008742.0, "sample_num_tokens": 9486.0, "step": 6287, "total_num_tokens": 621046686.0, "z_loss": 0.0005648346850648522 }, { "copy_logits_max": -7.130998611450195, "copy_logits_min": -687500032.0, "copy_num_tokens": 260.3125, "epoch": 1.2842992085779934, "gen_logits_max": 4.2094879150390625, "gen_logits_mean": -16.649866104125977, "gen_logits_min": -28.547517776489258, "gen_logits_std": 3.100004196166992, "gen_loss": 0.28598329424858093, "grad_norm": 0.3474641599896819, "learning_rate": 2.2986947368421053e-05, "loss": 0.254, "mean_copy_accuracy": 0.9964318126440048, "mean_gen_accuracy": 0.8837893605232239, "mean_token_accuracy": 0.9148798882961273, "num_tokens": 621311003.0, "sample_num_tokens": 7742.25, "step": 6288, "total_num_tokens": 621341972.0, "z_loss": 0.0005196089623495936 }, { "copy_logits_max": -7.78466796875, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.25, "epoch": 1.28450344651519, "gen_logits_max": 4.480332851409912, "gen_logits_mean": -15.63039779663086, "gen_logits_min": -27.552616119384766, "gen_logits_std": 3.0557632446289062, "gen_loss": 0.30104464292526245, "grad_norm": 0.3612221599821513, "learning_rate": 2.2985684210526318e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9964865893125534, "mean_gen_accuracy": 0.8693637400865555, "mean_token_accuracy": 0.9029349535703659, "num_tokens": 621602418.0, "sample_num_tokens": 8384.0, "step": 6289, "total_num_tokens": 621635954.0, "z_loss": 0.0005676108412444592 }, { "copy_logits_max": -4.57219123840332, "copy_logits_min": -750000064.0, "copy_num_tokens": 539.6875, "epoch": 1.284707684452387, "gen_logits_max": 3.444080114364624, "gen_logits_mean": -15.87220287322998, "gen_logits_min": -28.64604377746582, "gen_logits_std": 3.0975470542907715, "gen_loss": 0.29813480377197266, "grad_norm": 0.3643891526456334, "learning_rate": 2.298442105263158e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9977600574493408, "mean_gen_accuracy": 0.8739590644836426, "mean_token_accuracy": 0.9069788306951523, "num_tokens": 621868896.0, "sample_num_tokens": 8297.5, "step": 6290, "total_num_tokens": 621902086.0, "z_loss": 0.0005719582550227642 }, { "copy_logits_max": -6.375297546386719, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.375, "epoch": 1.284911922389584, "gen_logits_max": 3.315943479537964, "gen_logits_mean": -16.93458366394043, "gen_logits_min": -29.068674087524414, "gen_logits_std": 3.1155738830566406, "gen_loss": 0.26071980595588684, "grad_norm": 0.41879764944273257, "learning_rate": 2.2983157894736843e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9957749545574188, "mean_gen_accuracy": 0.8735955953598022, "mean_token_accuracy": 0.9007980674505234, "num_tokens": 622135979.0, "sample_num_tokens": 8718.75, "step": 6291, "total_num_tokens": 622170854.0, "z_loss": 0.0004939592909067869 }, { "copy_logits_max": -6.2353315353393555, "copy_logits_min": -687500032.0, "copy_num_tokens": 316.8125, "epoch": 1.2851161603267807, "gen_logits_max": 4.096787452697754, "gen_logits_mean": -16.059715270996094, "gen_logits_min": -28.075397491455078, "gen_logits_std": 3.069669485092163, "gen_loss": 0.2694931626319885, "grad_norm": 0.3771123903692841, "learning_rate": 2.2981894736842107e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9958492070436478, "mean_gen_accuracy": 0.8795870095491409, "mean_token_accuracy": 0.9039167016744614, "num_tokens": 622379297.0, "sample_num_tokens": 7486.25, "step": 6292, "total_num_tokens": 622409242.0, "z_loss": 0.0004756659909617156 }, { "copy_logits_max": -6.533402442932129, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.3125, "epoch": 1.2853203982639776, "gen_logits_max": 3.5683083534240723, "gen_logits_mean": -16.43522071838379, "gen_logits_min": -28.41732406616211, "gen_logits_std": 3.0997378826141357, "gen_loss": 0.27031052112579346, "grad_norm": 0.37014157084704297, "learning_rate": 2.2980631578947368e-05, "loss": 0.264, "mean_copy_accuracy": 0.9960635006427765, "mean_gen_accuracy": 0.8836508095264435, "mean_token_accuracy": 0.9119488298892975, "num_tokens": 622655554.0, "sample_num_tokens": 8956.0, "step": 6293, "total_num_tokens": 622691378.0, "z_loss": 0.0005065284203737974 }, { "copy_logits_max": -6.373425483703613, "copy_logits_min": -750000000.0, "copy_num_tokens": 863.0625, "epoch": 1.2855246362011743, "gen_logits_max": 2.529651641845703, "gen_logits_mean": -16.46442985534668, "gen_logits_min": -29.268070220947266, "gen_logits_std": 3.133479356765747, "gen_loss": 0.21818454563617706, "grad_norm": 0.3561512662189123, "learning_rate": 2.2979368421052632e-05, "loss": 0.2623, "mean_copy_accuracy": 0.9967000037431717, "mean_gen_accuracy": 0.8809759467840195, "mean_token_accuracy": 0.9114408195018768, "num_tokens": 622939442.0, "sample_num_tokens": 10695.5, "step": 6294, "total_num_tokens": 622982224.0, "z_loss": 0.000425872509367764 }, { "copy_logits_max": -5.181546211242676, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.875, "epoch": 1.2857288741383712, "gen_logits_max": 3.730278253555298, "gen_logits_mean": -16.026683807373047, "gen_logits_min": -28.340091705322266, "gen_logits_std": 3.0999035835266113, "gen_loss": 0.31537193059921265, "grad_norm": 0.3500914451467799, "learning_rate": 2.2978105263157893e-05, "loss": 0.278, "mean_copy_accuracy": 0.9967265129089355, "mean_gen_accuracy": 0.8722089231014252, "mean_token_accuracy": 0.9058108925819397, "num_tokens": 623224064.0, "sample_num_tokens": 7552.5, "step": 6295, "total_num_tokens": 623254274.0, "z_loss": 0.0005829357542097569 }, { "copy_logits_max": -6.846879482269287, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.75, "epoch": 1.285933112075568, "gen_logits_max": 2.865297794342041, "gen_logits_mean": -17.747594833374023, "gen_logits_min": -29.89153289794922, "gen_logits_std": 3.1415233612060547, "gen_loss": 0.286277174949646, "grad_norm": 0.4047776464254386, "learning_rate": 2.297684210526316e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9963629841804504, "mean_gen_accuracy": 0.8796870112419128, "mean_token_accuracy": 0.9098365008831024, "num_tokens": 623488668.0, "sample_num_tokens": 8646.0, "step": 6296, "total_num_tokens": 623523252.0, "z_loss": 0.00048713776050135493 }, { "copy_logits_max": -7.090641975402832, "copy_logits_min": -687500032.0, "copy_num_tokens": 611.1875, "epoch": 1.2861373500127649, "gen_logits_max": 3.1483376026153564, "gen_logits_mean": -16.243854522705078, "gen_logits_min": -28.793136596679688, "gen_logits_std": 3.1090526580810547, "gen_loss": 0.22925516963005066, "grad_norm": 0.3742258634139963, "learning_rate": 2.2975578947368422e-05, "loss": 0.266, "mean_copy_accuracy": 0.9964412748813629, "mean_gen_accuracy": 0.8875250667333603, "mean_token_accuracy": 0.9113015383481979, "num_tokens": 623760909.0, "sample_num_tokens": 9764.75, "step": 6297, "total_num_tokens": 623799968.0, "z_loss": 0.0004087358247488737 }, { "copy_logits_max": -5.607972145080566, "copy_logits_min": -625000064.0, "copy_num_tokens": 324.875, "epoch": 1.2863415879499618, "gen_logits_max": 4.093552589416504, "gen_logits_mean": -16.21192169189453, "gen_logits_min": -28.604076385498047, "gen_logits_std": 3.097477912902832, "gen_loss": 0.2900826334953308, "grad_norm": 0.39958646022541394, "learning_rate": 2.2974315789473686e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9959550499916077, "mean_gen_accuracy": 0.8794300556182861, "mean_token_accuracy": 0.9064658135175705, "num_tokens": 624037669.0, "sample_num_tokens": 7816.75, "step": 6298, "total_num_tokens": 624068936.0, "z_loss": 0.0005017956718802452 }, { "copy_logits_max": -3.8041224479675293, "copy_logits_min": -750000064.0, "copy_num_tokens": 435.875, "epoch": 1.2865458258871585, "gen_logits_max": 3.115647315979004, "gen_logits_mean": -17.268409729003906, "gen_logits_min": -29.42899513244629, "gen_logits_std": 3.110926628112793, "gen_loss": 0.3090735673904419, "grad_norm": 0.3666903354415713, "learning_rate": 2.2973052631578947e-05, "loss": 0.2774, "mean_copy_accuracy": 0.997405469417572, "mean_gen_accuracy": 0.8743117302656174, "mean_token_accuracy": 0.9062848538160324, "num_tokens": 624301085.0, "sample_num_tokens": 7741.75, "step": 6299, "total_num_tokens": 624332052.0, "z_loss": 0.0006142485653981566 }, { "copy_logits_max": -4.007079601287842, "copy_logits_min": -750000064.0, "copy_num_tokens": 565.1875, "epoch": 1.2867500638243554, "gen_logits_max": 3.062192916870117, "gen_logits_mean": -17.18025016784668, "gen_logits_min": -29.538528442382812, "gen_logits_std": 3.142853021621704, "gen_loss": 0.262504905462265, "grad_norm": 0.39661637557081636, "learning_rate": 2.297178947368421e-05, "loss": 0.2853, "mean_copy_accuracy": 0.996117502450943, "mean_gen_accuracy": 0.8739066869020462, "mean_token_accuracy": 0.9033048450946808, "num_tokens": 624588056.0, "sample_num_tokens": 9058.5, "step": 6300, "total_num_tokens": 624624290.0, "z_loss": 0.0005400929367169738 }, { "copy_logits_max": -3.5696256160736084, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.75, "epoch": 1.2869543017615523, "gen_logits_max": 3.9951868057250977, "gen_logits_mean": -14.464742660522461, "gen_logits_min": -26.804744720458984, "gen_logits_std": 3.085756540298462, "gen_loss": 0.23634353280067444, "grad_norm": 0.36074276582074, "learning_rate": 2.2970526315789472e-05, "loss": 0.2502, "mean_copy_accuracy": 0.9968391358852386, "mean_gen_accuracy": 0.8836649656295776, "mean_token_accuracy": 0.9150915741920471, "num_tokens": 624875363.0, "sample_num_tokens": 8470.25, "step": 6301, "total_num_tokens": 624909244.0, "z_loss": 0.0004924132372252643 }, { "copy_logits_max": -6.193984031677246, "copy_logits_min": -750000000.0, "copy_num_tokens": 225.375, "epoch": 1.287158539698749, "gen_logits_max": 4.59325647354126, "gen_logits_mean": -17.053604125976562, "gen_logits_min": -29.232070922851562, "gen_logits_std": 3.100346565246582, "gen_loss": 0.30003756284713745, "grad_norm": 0.3430042979701814, "learning_rate": 2.2969263157894737e-05, "loss": 0.269, "mean_copy_accuracy": 0.9966754615306854, "mean_gen_accuracy": 0.8795505166053772, "mean_token_accuracy": 0.9089347869157791, "num_tokens": 625146464.0, "sample_num_tokens": 6229.0, "step": 6302, "total_num_tokens": 625171380.0, "z_loss": 0.0005331399152055383 }, { "copy_logits_max": -6.123264312744141, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.125, "epoch": 1.2873627776359458, "gen_logits_max": 4.8863525390625, "gen_logits_mean": -16.084083557128906, "gen_logits_min": -28.07129669189453, "gen_logits_std": 3.0891382694244385, "gen_loss": 0.27355825901031494, "grad_norm": 0.42148740316980865, "learning_rate": 2.2967999999999998e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9954091012477875, "mean_gen_accuracy": 0.8754458725452423, "mean_token_accuracy": 0.9060861170291901, "num_tokens": 625402353.0, "sample_num_tokens": 8032.75, "step": 6303, "total_num_tokens": 625434484.0, "z_loss": 0.0005197622813284397 }, { "copy_logits_max": -5.855632781982422, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.5625, "epoch": 1.2875670155731427, "gen_logits_max": 4.219049453735352, "gen_logits_mean": -15.552669525146484, "gen_logits_min": -27.880035400390625, "gen_logits_std": 3.0507750511169434, "gen_loss": 0.2704772353172302, "grad_norm": 0.3620511325499886, "learning_rate": 2.2966736842105265e-05, "loss": 0.259, "mean_copy_accuracy": 0.9969953745603561, "mean_gen_accuracy": 0.8853555619716644, "mean_token_accuracy": 0.9135311394929886, "num_tokens": 625665939.0, "sample_num_tokens": 8905.25, "step": 6304, "total_num_tokens": 625701560.0, "z_loss": 0.0005282631027512252 }, { "copy_logits_max": -6.33537483215332, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.1875, "epoch": 1.2877712535103396, "gen_logits_max": 4.464301109313965, "gen_logits_mean": -15.33064079284668, "gen_logits_min": -27.494579315185547, "gen_logits_std": 3.067549705505371, "gen_loss": 0.25383827090263367, "grad_norm": 0.38294594669659976, "learning_rate": 2.296547368421053e-05, "loss": 0.295, "mean_copy_accuracy": 0.9969709366559982, "mean_gen_accuracy": 0.8729076534509659, "mean_token_accuracy": 0.9013479650020599, "num_tokens": 625936346.0, "sample_num_tokens": 7587.5, "step": 6305, "total_num_tokens": 625966696.0, "z_loss": 0.0004620960680767894 }, { "copy_logits_max": -6.920846939086914, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.3125, "epoch": 1.2879754914475363, "gen_logits_max": 3.4225196838378906, "gen_logits_mean": -17.2506103515625, "gen_logits_min": -29.48731231689453, "gen_logits_std": 3.119387626647949, "gen_loss": 0.26005667448043823, "grad_norm": 0.37029597492323163, "learning_rate": 2.296421052631579e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9954130947589874, "mean_gen_accuracy": 0.8732432276010513, "mean_token_accuracy": 0.9039375334978104, "num_tokens": 626225246.0, "sample_num_tokens": 9554.0, "step": 6306, "total_num_tokens": 626263462.0, "z_loss": 0.0005258647724986076 }, { "copy_logits_max": -6.580859661102295, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.0, "epoch": 1.2881797293847332, "gen_logits_max": 4.273724555969238, "gen_logits_mean": -15.31248664855957, "gen_logits_min": -27.579021453857422, "gen_logits_std": 3.06805682182312, "gen_loss": 0.2827228307723999, "grad_norm": 0.41720855769112664, "learning_rate": 2.2962947368421055e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9960732311010361, "mean_gen_accuracy": 0.8697646409273148, "mean_token_accuracy": 0.8968947678804398, "num_tokens": 626482516.0, "sample_num_tokens": 8960.0, "step": 6307, "total_num_tokens": 626518356.0, "z_loss": 0.000524193630553782 }, { "copy_logits_max": -6.891130447387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.25, "epoch": 1.2883839673219302, "gen_logits_max": 4.120582580566406, "gen_logits_mean": -15.990495681762695, "gen_logits_min": -27.922880172729492, "gen_logits_std": 3.072155237197876, "gen_loss": 0.27820920944213867, "grad_norm": 0.39608060723667954, "learning_rate": 2.2961684210526316e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9978944659233093, "mean_gen_accuracy": 0.869723841547966, "mean_token_accuracy": 0.9038026928901672, "num_tokens": 626748711.0, "sample_num_tokens": 8569.25, "step": 6308, "total_num_tokens": 626782988.0, "z_loss": 0.00048213748959824443 }, { "copy_logits_max": -7.364963531494141, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.4375, "epoch": 1.2885882052591269, "gen_logits_max": 4.623425483703613, "gen_logits_mean": -14.382465362548828, "gen_logits_min": -27.6936092376709, "gen_logits_std": 3.0657596588134766, "gen_loss": 0.23630672693252563, "grad_norm": 0.40833280090197904, "learning_rate": 2.296042105263158e-05, "loss": 0.273, "mean_copy_accuracy": 0.9960871785879135, "mean_gen_accuracy": 0.8802794814109802, "mean_token_accuracy": 0.9087677448987961, "num_tokens": 627014548.0, "sample_num_tokens": 9824.5, "step": 6309, "total_num_tokens": 627053846.0, "z_loss": 0.0004413963761180639 }, { "copy_logits_max": -5.531317234039307, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.25, "epoch": 1.2887924431963238, "gen_logits_max": 3.5604641437530518, "gen_logits_mean": -16.651325225830078, "gen_logits_min": -29.16839599609375, "gen_logits_std": 3.1267497539520264, "gen_loss": 0.26315149664878845, "grad_norm": 0.36472208071970985, "learning_rate": 2.295915789473684e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9959989637136459, "mean_gen_accuracy": 0.8777696937322617, "mean_token_accuracy": 0.9096690863370895, "num_tokens": 627293439.0, "sample_num_tokens": 9096.25, "step": 6310, "total_num_tokens": 627329824.0, "z_loss": 0.00047781129251234233 }, { "copy_logits_max": -5.75063419342041, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.375, "epoch": 1.2889966811335205, "gen_logits_max": 4.047275066375732, "gen_logits_mean": -17.59549903869629, "gen_logits_min": -29.70332145690918, "gen_logits_std": 3.131516456604004, "gen_loss": 0.300321102142334, "grad_norm": 0.35737037231339785, "learning_rate": 2.2957894736842105e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9961948096752167, "mean_gen_accuracy": 0.8727187067270279, "mean_token_accuracy": 0.9073526561260223, "num_tokens": 627591110.0, "sample_num_tokens": 9077.0, "step": 6311, "total_num_tokens": 627627418.0, "z_loss": 0.0005619374569505453 }, { "copy_logits_max": -5.382096290588379, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.5, "epoch": 1.2892009190707174, "gen_logits_max": 4.038199424743652, "gen_logits_mean": -16.864099502563477, "gen_logits_min": -29.107860565185547, "gen_logits_std": 3.118959903717041, "gen_loss": 0.2662174105644226, "grad_norm": 0.44117270296915584, "learning_rate": 2.295663157894737e-05, "loss": 0.302, "mean_copy_accuracy": 0.9961446076631546, "mean_gen_accuracy": 0.8720020353794098, "mean_token_accuracy": 0.8972876518964767, "num_tokens": 627841225.0, "sample_num_tokens": 6753.25, "step": 6312, "total_num_tokens": 627868238.0, "z_loss": 0.0005060543771833181 }, { "copy_logits_max": -5.2388434410095215, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.6875, "epoch": 1.2894051570079141, "gen_logits_max": 4.602134704589844, "gen_logits_mean": -15.61361312866211, "gen_logits_min": -28.032974243164062, "gen_logits_std": 3.0788958072662354, "gen_loss": 0.33400779962539673, "grad_norm": 0.36398811000379244, "learning_rate": 2.2955368421052634e-05, "loss": 0.2919, "mean_copy_accuracy": 0.9971299022436142, "mean_gen_accuracy": 0.8695391267538071, "mean_token_accuracy": 0.9029363691806793, "num_tokens": 628125906.0, "sample_num_tokens": 7934.5, "step": 6313, "total_num_tokens": 628157644.0, "z_loss": 0.0005817639757879078 }, { "copy_logits_max": -7.087425708770752, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.9375, "epoch": 1.289609394945111, "gen_logits_max": 2.601357936859131, "gen_logits_mean": -17.897228240966797, "gen_logits_min": -30.185874938964844, "gen_logits_std": 3.140353202819824, "gen_loss": 0.2556137442588806, "grad_norm": 0.37693331946160374, "learning_rate": 2.2954105263157895e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9968633055686951, "mean_gen_accuracy": 0.883290633559227, "mean_token_accuracy": 0.9125234186649323, "num_tokens": 628420308.0, "sample_num_tokens": 8941.0, "step": 6314, "total_num_tokens": 628456072.0, "z_loss": 0.0004625776200555265 }, { "copy_logits_max": -5.259289741516113, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.9375, "epoch": 1.289813632882308, "gen_logits_max": 3.853367567062378, "gen_logits_mean": -16.669769287109375, "gen_logits_min": -29.052818298339844, "gen_logits_std": 3.1306848526000977, "gen_loss": 0.2703333795070648, "grad_norm": 0.4388371245003792, "learning_rate": 2.295284210526316e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9964478462934494, "mean_gen_accuracy": 0.8679723888635635, "mean_token_accuracy": 0.8994512856006622, "num_tokens": 628670102.0, "sample_num_tokens": 6873.5, "step": 6315, "total_num_tokens": 628697596.0, "z_loss": 0.0005483545828610659 }, { "copy_logits_max": -6.601645469665527, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.375, "epoch": 1.2900178708195047, "gen_logits_max": 3.7440927028656006, "gen_logits_mean": -17.326759338378906, "gen_logits_min": -29.524919509887695, "gen_logits_std": 3.1185667514801025, "gen_loss": 0.2906346321105957, "grad_norm": 0.3803920748674032, "learning_rate": 2.295157894736842e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9950840324163437, "mean_gen_accuracy": 0.8837522864341736, "mean_token_accuracy": 0.9101158082485199, "num_tokens": 628920572.0, "sample_num_tokens": 7411.5, "step": 6316, "total_num_tokens": 628950218.0, "z_loss": 0.000512616359628737 }, { "copy_logits_max": -5.43431282043457, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.75, "epoch": 1.2902221087567016, "gen_logits_max": 4.636500358581543, "gen_logits_mean": -15.565592765808105, "gen_logits_min": -27.540512084960938, "gen_logits_std": 3.0826895236968994, "gen_loss": 0.2802843749523163, "grad_norm": 0.4562920807594839, "learning_rate": 2.2950315789473684e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9950232803821564, "mean_gen_accuracy": 0.8731478750705719, "mean_token_accuracy": 0.900819256901741, "num_tokens": 629173638.0, "sample_num_tokens": 7951.0, "step": 6317, "total_num_tokens": 629205442.0, "z_loss": 0.0005116451065987349 }, { "copy_logits_max": -6.239436626434326, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.5625, "epoch": 1.2904263466938983, "gen_logits_max": 3.733567237854004, "gen_logits_mean": -16.169010162353516, "gen_logits_min": -28.76798439025879, "gen_logits_std": 3.109193801879883, "gen_loss": 0.2741413712501526, "grad_norm": 0.4395027452341433, "learning_rate": 2.294905263157895e-05, "loss": 0.2748, "mean_copy_accuracy": 0.996945708990097, "mean_gen_accuracy": 0.8771469593048096, "mean_token_accuracy": 0.9059110432863235, "num_tokens": 629436862.0, "sample_num_tokens": 9071.0, "step": 6318, "total_num_tokens": 629473146.0, "z_loss": 0.0005245206994004548 }, { "copy_logits_max": -5.7246503829956055, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5625, "epoch": 1.2906305846310953, "gen_logits_max": 4.123134136199951, "gen_logits_mean": -14.964445114135742, "gen_logits_min": -27.149250030517578, "gen_logits_std": 3.0870399475097656, "gen_loss": 0.2409515380859375, "grad_norm": 0.41075513017757515, "learning_rate": 2.294778947368421e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9956603050231934, "mean_gen_accuracy": 0.8778534233570099, "mean_token_accuracy": 0.903121292591095, "num_tokens": 629695420.0, "sample_num_tokens": 8380.0, "step": 6319, "total_num_tokens": 629728940.0, "z_loss": 0.0005169567884877324 }, { "copy_logits_max": -5.804742813110352, "copy_logits_min": -687500032.0, "copy_num_tokens": 435.25, "epoch": 1.290834822568292, "gen_logits_max": 3.9866626262664795, "gen_logits_mean": -16.43073272705078, "gen_logits_min": -28.69251251220703, "gen_logits_std": 3.11686372756958, "gen_loss": 0.2655832767486572, "grad_norm": 0.3607218603973543, "learning_rate": 2.2946526315789474e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9962476938962936, "mean_gen_accuracy": 0.8786383271217346, "mean_token_accuracy": 0.9090922921895981, "num_tokens": 629957054.0, "sample_num_tokens": 7997.5, "step": 6320, "total_num_tokens": 629989044.0, "z_loss": 0.0005242618499323726 }, { "copy_logits_max": -5.026151657104492, "copy_logits_min": -750000064.0, "copy_num_tokens": 410.875, "epoch": 1.291039060505489, "gen_logits_max": 4.2946624755859375, "gen_logits_mean": -15.833026885986328, "gen_logits_min": -27.901248931884766, "gen_logits_std": 3.1052680015563965, "gen_loss": 0.2617671489715576, "grad_norm": 0.38306437859574194, "learning_rate": 2.2945263157894738e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9963075071573257, "mean_gen_accuracy": 0.8686594516038895, "mean_token_accuracy": 0.9015359878540039, "num_tokens": 630230959.0, "sample_num_tokens": 7705.25, "step": 6321, "total_num_tokens": 630261780.0, "z_loss": 0.0004970314912497997 }, { "copy_logits_max": -5.75967264175415, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.25, "epoch": 1.2912432984426858, "gen_logits_max": 3.960542678833008, "gen_logits_mean": -16.362651824951172, "gen_logits_min": -28.652652740478516, "gen_logits_std": 3.1037282943725586, "gen_loss": 0.2884069085121155, "grad_norm": 0.39739793262757145, "learning_rate": 2.2944000000000003e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9948505312204361, "mean_gen_accuracy": 0.873664066195488, "mean_token_accuracy": 0.9015550464391708, "num_tokens": 630494687.0, "sample_num_tokens": 8083.25, "step": 6322, "total_num_tokens": 630527020.0, "z_loss": 0.0005564967868849635 }, { "copy_logits_max": -6.080810070037842, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.4375, "epoch": 1.2914475363798825, "gen_logits_max": 4.61102294921875, "gen_logits_mean": -16.320669174194336, "gen_logits_min": -28.537384033203125, "gen_logits_std": 3.0858094692230225, "gen_loss": 0.31908220052719116, "grad_norm": 0.41555662904196383, "learning_rate": 2.2942736842105264e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9956490844488144, "mean_gen_accuracy": 0.8668714314699173, "mean_token_accuracy": 0.8988700062036514, "num_tokens": 630733327.0, "sample_num_tokens": 7646.25, "step": 6323, "total_num_tokens": 630763912.0, "z_loss": 0.0005885834107175469 }, { "copy_logits_max": -7.528194427490234, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.0625, "epoch": 1.2916517743170794, "gen_logits_max": 3.416478395462036, "gen_logits_mean": -17.05622100830078, "gen_logits_min": -29.06374740600586, "gen_logits_std": 3.1166367530822754, "gen_loss": 0.27083754539489746, "grad_norm": 0.41204442320342727, "learning_rate": 2.2941473684210528e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9968773126602173, "mean_gen_accuracy": 0.8763132095336914, "mean_token_accuracy": 0.9085155874490738, "num_tokens": 631000868.0, "sample_num_tokens": 8586.5, "step": 6324, "total_num_tokens": 631035214.0, "z_loss": 0.0004848870448768139 }, { "copy_logits_max": -6.609121799468994, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.75, "epoch": 1.2918560122542762, "gen_logits_max": 5.477710723876953, "gen_logits_mean": -13.520923614501953, "gen_logits_min": -25.945762634277344, "gen_logits_std": 3.0599303245544434, "gen_loss": 0.28075262904167175, "grad_norm": 0.4102031717213565, "learning_rate": 2.294021052631579e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9956127256155014, "mean_gen_accuracy": 0.8770160377025604, "mean_token_accuracy": 0.9082922339439392, "num_tokens": 631269879.0, "sample_num_tokens": 9537.75, "step": 6325, "total_num_tokens": 631308030.0, "z_loss": 0.0005124505842104554 }, { "copy_logits_max": -6.172704696655273, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.4375, "epoch": 1.292060250191473, "gen_logits_max": 4.192643165588379, "gen_logits_mean": -15.921924591064453, "gen_logits_min": -27.797908782958984, "gen_logits_std": 3.101353883743286, "gen_loss": 0.27863726019859314, "grad_norm": 0.36059697167514343, "learning_rate": 2.2938947368421053e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9970535784959793, "mean_gen_accuracy": 0.8712106943130493, "mean_token_accuracy": 0.9070835113525391, "num_tokens": 631559641.0, "sample_num_tokens": 9591.25, "step": 6326, "total_num_tokens": 631598006.0, "z_loss": 0.00046822233707644045 }, { "copy_logits_max": -6.550551891326904, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.375, "epoch": 1.2922644881286698, "gen_logits_max": 3.8545033931732178, "gen_logits_mean": -16.5460205078125, "gen_logits_min": -28.942724227905273, "gen_logits_std": 3.1293134689331055, "gen_loss": 0.2822798490524292, "grad_norm": 0.3808988119351702, "learning_rate": 2.2937684210526314e-05, "loss": 0.292, "mean_copy_accuracy": 0.9958460628986359, "mean_gen_accuracy": 0.8734534084796906, "mean_token_accuracy": 0.9018635749816895, "num_tokens": 631841303.0, "sample_num_tokens": 9479.25, "step": 6327, "total_num_tokens": 631879220.0, "z_loss": 0.0005348888225853443 }, { "copy_logits_max": -7.046078205108643, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.625, "epoch": 1.2924687260658667, "gen_logits_max": 4.780309677124023, "gen_logits_mean": -15.684630393981934, "gen_logits_min": -27.553848266601562, "gen_logits_std": 3.0936849117279053, "gen_loss": 0.291410356760025, "grad_norm": 0.38697361808639796, "learning_rate": 2.2936421052631578e-05, "loss": 0.281, "mean_copy_accuracy": 0.996651828289032, "mean_gen_accuracy": 0.8749237954616547, "mean_token_accuracy": 0.9054602384567261, "num_tokens": 632119519.0, "sample_num_tokens": 8033.25, "step": 6328, "total_num_tokens": 632151652.0, "z_loss": 0.0005152398953214288 }, { "copy_logits_max": -5.902552604675293, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.375, "epoch": 1.2926729640030636, "gen_logits_max": 5.479448318481445, "gen_logits_mean": -14.620414733886719, "gen_logits_min": -26.76397705078125, "gen_logits_std": 3.0773587226867676, "gen_loss": 0.284415602684021, "grad_norm": 0.35480227158472505, "learning_rate": 2.2935157894736843e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9962586611509323, "mean_gen_accuracy": 0.8751490563154221, "mean_token_accuracy": 0.903067409992218, "num_tokens": 632381566.0, "sample_num_tokens": 7327.0, "step": 6329, "total_num_tokens": 632410874.0, "z_loss": 0.0005454538622871041 }, { "copy_logits_max": -3.7162418365478516, "copy_logits_min": -750000000.0, "copy_num_tokens": 778.1875, "epoch": 1.2928772019402603, "gen_logits_max": 3.456566333770752, "gen_logits_mean": -15.438173294067383, "gen_logits_min": -27.981124877929688, "gen_logits_std": 3.1094372272491455, "gen_loss": 0.2622019350528717, "grad_norm": 0.43846309892555374, "learning_rate": 2.2933894736842107e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9975838661193848, "mean_gen_accuracy": 0.8709389120340347, "mean_token_accuracy": 0.9097846895456314, "num_tokens": 632671768.0, "sample_num_tokens": 10071.5, "step": 6330, "total_num_tokens": 632712054.0, "z_loss": 0.000533462327439338 }, { "copy_logits_max": -3.082047939300537, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.0, "epoch": 1.2930814398774573, "gen_logits_max": 4.298637866973877, "gen_logits_mean": -15.823406219482422, "gen_logits_min": -28.00422477722168, "gen_logits_std": 3.090325355529785, "gen_loss": 0.31472158432006836, "grad_norm": 0.35883506424279715, "learning_rate": 2.293263157894737e-05, "loss": 0.3174, "mean_copy_accuracy": 0.9963039457798004, "mean_gen_accuracy": 0.8613482564687729, "mean_token_accuracy": 0.8913405239582062, "num_tokens": 632946544.0, "sample_num_tokens": 8269.0, "step": 6331, "total_num_tokens": 632979620.0, "z_loss": 0.0006144173676148057 }, { "copy_logits_max": -4.541650772094727, "copy_logits_min": -750000064.0, "copy_num_tokens": 447.125, "epoch": 1.2932856778146542, "gen_logits_max": 3.5455899238586426, "gen_logits_mean": -16.080322265625, "gen_logits_min": -28.25177764892578, "gen_logits_std": 3.1091842651367188, "gen_loss": 0.2837573289871216, "grad_norm": 0.3675254235381496, "learning_rate": 2.2931368421052632e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9965691268444061, "mean_gen_accuracy": 0.8734202831983566, "mean_token_accuracy": 0.9023084342479706, "num_tokens": 633211099.0, "sample_num_tokens": 8050.75, "step": 6332, "total_num_tokens": 633243302.0, "z_loss": 0.00046675271005369723 }, { "copy_logits_max": -6.1703290939331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.75, "epoch": 1.293489915751851, "gen_logits_max": 4.9518632888793945, "gen_logits_mean": -15.788923263549805, "gen_logits_min": -27.766841888427734, "gen_logits_std": 3.09805965423584, "gen_loss": 0.2659400403499603, "grad_norm": 0.4090652753303035, "learning_rate": 2.2930105263157896e-05, "loss": 0.288, "mean_copy_accuracy": 0.9954351186752319, "mean_gen_accuracy": 0.8761603683233261, "mean_token_accuracy": 0.9038664400577545, "num_tokens": 633467460.0, "sample_num_tokens": 7918.0, "step": 6333, "total_num_tokens": 633499132.0, "z_loss": 0.0004487474507186562 }, { "copy_logits_max": -5.4369893074035645, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.625, "epoch": 1.2936941536890476, "gen_logits_max": 3.8715970516204834, "gen_logits_mean": -16.94841766357422, "gen_logits_min": -28.764583587646484, "gen_logits_std": 3.11664080619812, "gen_loss": 0.2991413176059723, "grad_norm": 0.4290522916908032, "learning_rate": 2.2928842105263157e-05, "loss": 0.275, "mean_copy_accuracy": 0.994353637099266, "mean_gen_accuracy": 0.8820118010044098, "mean_token_accuracy": 0.9056848585605621, "num_tokens": 633720764.0, "sample_num_tokens": 6912.5, "step": 6334, "total_num_tokens": 633748414.0, "z_loss": 0.0005233902484178543 }, { "copy_logits_max": -3.4329566955566406, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.625, "epoch": 1.2938983916262445, "gen_logits_max": 4.195626258850098, "gen_logits_mean": -15.660123825073242, "gen_logits_min": -27.570581436157227, "gen_logits_std": 3.1056957244873047, "gen_loss": 0.28845107555389404, "grad_norm": 0.3851869008610926, "learning_rate": 2.292757894736842e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9959535747766495, "mean_gen_accuracy": 0.878464013338089, "mean_token_accuracy": 0.9082736968994141, "num_tokens": 633994467.0, "sample_num_tokens": 8361.75, "step": 6335, "total_num_tokens": 634027914.0, "z_loss": 0.0004980975645594299 }, { "copy_logits_max": -4.984535217285156, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.0625, "epoch": 1.2941026295634415, "gen_logits_max": 3.9491398334503174, "gen_logits_mean": -17.415800094604492, "gen_logits_min": -29.27346420288086, "gen_logits_std": 3.1151132583618164, "gen_loss": 0.2700723707675934, "grad_norm": 0.42064632460635687, "learning_rate": 2.2926315789473683e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9957072734832764, "mean_gen_accuracy": 0.8791788220405579, "mean_token_accuracy": 0.9015633910894394, "num_tokens": 634252504.0, "sample_num_tokens": 8605.5, "step": 6336, "total_num_tokens": 634286926.0, "z_loss": 0.0004254792584106326 }, { "copy_logits_max": -5.465752601623535, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.875, "epoch": 1.2943068675006382, "gen_logits_max": 4.587869644165039, "gen_logits_mean": -15.969975471496582, "gen_logits_min": -28.001375198364258, "gen_logits_std": 3.0956108570098877, "gen_loss": 0.30170369148254395, "grad_norm": 0.3847170742104799, "learning_rate": 2.292505263157895e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9967450499534607, "mean_gen_accuracy": 0.8715079426765442, "mean_token_accuracy": 0.9017874896526337, "num_tokens": 634496371.0, "sample_num_tokens": 6698.25, "step": 6337, "total_num_tokens": 634523164.0, "z_loss": 0.0005105945747345686 }, { "copy_logits_max": -3.160851001739502, "copy_logits_min": -750000000.0, "copy_num_tokens": 679.0, "epoch": 1.294511105437835, "gen_logits_max": 3.4307360649108887, "gen_logits_mean": -16.27289581298828, "gen_logits_min": -29.119850158691406, "gen_logits_std": 3.1419386863708496, "gen_loss": 0.2305433601140976, "grad_norm": 0.42462721443598933, "learning_rate": 2.292378947368421e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9968356788158417, "mean_gen_accuracy": 0.8781275451183319, "mean_token_accuracy": 0.9049075394868851, "num_tokens": 634743204.0, "sample_num_tokens": 10504.5, "step": 6338, "total_num_tokens": 634785222.0, "z_loss": 0.0004145569109823555 }, { "copy_logits_max": -3.017498016357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.5625, "epoch": 1.294715343375032, "gen_logits_max": 4.266911029815674, "gen_logits_mean": -15.9327974319458, "gen_logits_min": -28.11353874206543, "gen_logits_std": 3.075061798095703, "gen_loss": 0.3500671982765198, "grad_norm": 0.37503060197218163, "learning_rate": 2.2922526315789476e-05, "loss": 0.2885, "mean_copy_accuracy": 0.995476633310318, "mean_gen_accuracy": 0.8724630177021027, "mean_token_accuracy": 0.9021289944648743, "num_tokens": 635014036.0, "sample_num_tokens": 7028.0, "step": 6339, "total_num_tokens": 635042148.0, "z_loss": 0.0006069866940379143 }, { "copy_logits_max": -3.1291327476501465, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.125, "epoch": 1.2949195813122287, "gen_logits_max": 3.244906425476074, "gen_logits_mean": -17.58477210998535, "gen_logits_min": -30.013328552246094, "gen_logits_std": 3.158909559249878, "gen_loss": 0.2449243664741516, "grad_norm": 0.3955934984518425, "learning_rate": 2.2921263157894736e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9974247366189957, "mean_gen_accuracy": 0.877426266670227, "mean_token_accuracy": 0.9111202657222748, "num_tokens": 635305734.0, "sample_num_tokens": 8215.5, "step": 6340, "total_num_tokens": 635338596.0, "z_loss": 0.0004795967251993716 }, { "copy_logits_max": -2.813563108444214, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.9375, "epoch": 1.2951238192494257, "gen_logits_max": 5.061114311218262, "gen_logits_mean": -15.585457801818848, "gen_logits_min": -28.130434036254883, "gen_logits_std": 3.0832715034484863, "gen_loss": 0.2632039189338684, "grad_norm": 0.3508510261336399, "learning_rate": 2.292e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9974236637353897, "mean_gen_accuracy": 0.8828938156366348, "mean_token_accuracy": 0.9115988910198212, "num_tokens": 635592653.0, "sample_num_tokens": 7499.25, "step": 6341, "total_num_tokens": 635622650.0, "z_loss": 0.000508682569488883 }, { "copy_logits_max": -3.381502628326416, "copy_logits_min": -750000128.0, "copy_num_tokens": 382.875, "epoch": 1.2953280571866224, "gen_logits_max": 4.0599775314331055, "gen_logits_mean": -16.329334259033203, "gen_logits_min": -28.27373504638672, "gen_logits_std": 3.085235118865967, "gen_loss": 0.3109825551509857, "grad_norm": 0.36572730843230855, "learning_rate": 2.2918736842105262e-05, "loss": 0.2715, "mean_copy_accuracy": 0.994950920343399, "mean_gen_accuracy": 0.8820364326238632, "mean_token_accuracy": 0.9068818986415863, "num_tokens": 635857949.0, "sample_num_tokens": 8159.25, "step": 6342, "total_num_tokens": 635890586.0, "z_loss": 0.0005827443674206734 }, { "copy_logits_max": -0.3684842884540558, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.9375, "epoch": 1.2955322951238193, "gen_logits_max": 5.459719657897949, "gen_logits_mean": -13.325811386108398, "gen_logits_min": -26.281288146972656, "gen_logits_std": 3.03252911567688, "gen_loss": 0.2795936167240143, "grad_norm": 0.38178278185741304, "learning_rate": 2.2917473684210526e-05, "loss": 0.2703, "mean_copy_accuracy": 0.996158093214035, "mean_gen_accuracy": 0.8787597566843033, "mean_token_accuracy": 0.9095665365457535, "num_tokens": 636142783.0, "sample_num_tokens": 7554.75, "step": 6343, "total_num_tokens": 636173002.0, "z_loss": 0.0004898409824818373 }, { "copy_logits_max": -1.525742769241333, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.625, "epoch": 1.295736533061016, "gen_logits_max": 4.288846969604492, "gen_logits_mean": -16.20923614501953, "gen_logits_min": -28.672142028808594, "gen_logits_std": 3.076094627380371, "gen_loss": 0.3297864496707916, "grad_norm": 0.42381136793388174, "learning_rate": 2.2916210526315787e-05, "loss": 0.3188, "mean_copy_accuracy": 0.9962025284767151, "mean_gen_accuracy": 0.8628970384597778, "mean_token_accuracy": 0.8945983797311783, "num_tokens": 636417388.0, "sample_num_tokens": 8500.0, "step": 6344, "total_num_tokens": 636451388.0, "z_loss": 0.0006046814378350973 }, { "copy_logits_max": -3.7649827003479004, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.5, "epoch": 1.295940770998213, "gen_logits_max": 3.7234480381011963, "gen_logits_mean": -17.92960548400879, "gen_logits_min": -29.895648956298828, "gen_logits_std": 3.1360902786254883, "gen_loss": 0.27588748931884766, "grad_norm": 0.36105377267608335, "learning_rate": 2.2914947368421055e-05, "loss": 0.281, "mean_copy_accuracy": 0.9964146614074707, "mean_gen_accuracy": 0.8743862807750702, "mean_token_accuracy": 0.9034574925899506, "num_tokens": 636697130.0, "sample_num_tokens": 8039.0, "step": 6345, "total_num_tokens": 636729286.0, "z_loss": 0.0005401438102126122 }, { "copy_logits_max": -0.7396335601806641, "copy_logits_min": -687500032.0, "copy_num_tokens": 405.625, "epoch": 1.2961450089354098, "gen_logits_max": 3.95457124710083, "gen_logits_mean": -15.939912796020508, "gen_logits_min": -28.3103084564209, "gen_logits_std": 3.0744690895080566, "gen_loss": 0.32346200942993164, "grad_norm": 0.38954141543350046, "learning_rate": 2.291368421052632e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9959248006343842, "mean_gen_accuracy": 0.8699054569005966, "mean_token_accuracy": 0.9011470228433609, "num_tokens": 636948635.0, "sample_num_tokens": 7371.75, "step": 6346, "total_num_tokens": 636978122.0, "z_loss": 0.0005688283708877861 }, { "copy_logits_max": -1.31777024269104, "copy_logits_min": -750000000.0, "copy_num_tokens": 657.0, "epoch": 1.2963492468726066, "gen_logits_max": 4.054572582244873, "gen_logits_mean": -14.92467212677002, "gen_logits_min": -27.441497802734375, "gen_logits_std": 3.0554862022399902, "gen_loss": 0.24823768436908722, "grad_norm": 0.39593052525871003, "learning_rate": 2.291242105263158e-05, "loss": 0.2719, "mean_copy_accuracy": 0.996786430478096, "mean_gen_accuracy": 0.8778851628303528, "mean_token_accuracy": 0.9076252728700638, "num_tokens": 637204288.0, "sample_num_tokens": 9634.5, "step": 6347, "total_num_tokens": 637242826.0, "z_loss": 0.000515036634169519 }, { "copy_logits_max": -0.6784060001373291, "copy_logits_min": -750000064.0, "copy_num_tokens": 347.0625, "epoch": 1.2965534848098035, "gen_logits_max": 4.447640419006348, "gen_logits_mean": -15.256206512451172, "gen_logits_min": -27.299482345581055, "gen_logits_std": 3.0146586894989014, "gen_loss": 0.25530195236206055, "grad_norm": 0.40241875976572095, "learning_rate": 2.2911157894736844e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9950644075870514, "mean_gen_accuracy": 0.8778753578662872, "mean_token_accuracy": 0.9051853269338608, "num_tokens": 637464220.0, "sample_num_tokens": 8039.0, "step": 6348, "total_num_tokens": 637496376.0, "z_loss": 0.000528179865796119 }, { "copy_logits_max": -2.3374111652374268, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.6875, "epoch": 1.2967577227470002, "gen_logits_max": 3.9641051292419434, "gen_logits_mean": -16.42740249633789, "gen_logits_min": -28.30485725402832, "gen_logits_std": 3.0569863319396973, "gen_loss": 0.27240628004074097, "grad_norm": 0.3731573495768238, "learning_rate": 2.2909894736842105e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9967027902603149, "mean_gen_accuracy": 0.8783323615789413, "mean_token_accuracy": 0.9068388044834137, "num_tokens": 637714725.0, "sample_num_tokens": 8429.25, "step": 6349, "total_num_tokens": 637748442.0, "z_loss": 0.0006120826583355665 }, { "copy_logits_max": -3.2995848655700684, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.25, "epoch": 1.2969619606841971, "gen_logits_max": 4.494475841522217, "gen_logits_mean": -16.408992767333984, "gen_logits_min": -28.15157699584961, "gen_logits_std": 3.0413928031921387, "gen_loss": 0.3134092092514038, "grad_norm": 0.3504164542289458, "learning_rate": 2.290863157894737e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9966145157814026, "mean_gen_accuracy": 0.8767201602458954, "mean_token_accuracy": 0.9058196693658829, "num_tokens": 637994196.0, "sample_num_tokens": 7165.0, "step": 6350, "total_num_tokens": 638022856.0, "z_loss": 0.0006337352097034454 }, { "copy_logits_max": -0.283600777387619, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.8125, "epoch": 1.2971661986213938, "gen_logits_max": 4.133914947509766, "gen_logits_mean": -14.5529146194458, "gen_logits_min": -27.225704193115234, "gen_logits_std": 3.0237462520599365, "gen_loss": 0.3196684718132019, "grad_norm": 0.38761919667160605, "learning_rate": 2.290736842105263e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9969556629657745, "mean_gen_accuracy": 0.8679115474224091, "mean_token_accuracy": 0.9014817327260971, "num_tokens": 638248558.0, "sample_num_tokens": 7507.0, "step": 6351, "total_num_tokens": 638278586.0, "z_loss": 0.0006222722586244345 }, { "copy_logits_max": -3.870830774307251, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.8125, "epoch": 1.2973704365585907, "gen_logits_max": 4.798477649688721, "gen_logits_mean": -15.152955055236816, "gen_logits_min": -27.015071868896484, "gen_logits_std": 3.005983352661133, "gen_loss": 0.30220556259155273, "grad_norm": 0.3709263489057086, "learning_rate": 2.2906105263157895e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9953763782978058, "mean_gen_accuracy": 0.8767590522766113, "mean_token_accuracy": 0.9054085463285446, "num_tokens": 638520059.0, "sample_num_tokens": 7873.75, "step": 6352, "total_num_tokens": 638551554.0, "z_loss": 0.0005622797179967165 }, { "copy_logits_max": -2.2994418144226074, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.875, "epoch": 1.2975746744957877, "gen_logits_max": 4.275840759277344, "gen_logits_mean": -15.798725128173828, "gen_logits_min": -27.56985092163086, "gen_logits_std": 3.0364081859588623, "gen_loss": 0.28417977690696716, "grad_norm": 0.37346321217932554, "learning_rate": 2.290484210526316e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9952055066823959, "mean_gen_accuracy": 0.8807202726602554, "mean_token_accuracy": 0.9048594981431961, "num_tokens": 638763399.0, "sample_num_tokens": 7727.75, "step": 6353, "total_num_tokens": 638794310.0, "z_loss": 0.0005220860475674272 }, { "copy_logits_max": -2.23781156539917, "copy_logits_min": -750000064.0, "copy_num_tokens": 592.375, "epoch": 1.2977789124329844, "gen_logits_max": 3.910419464111328, "gen_logits_mean": -15.070653915405273, "gen_logits_min": -27.163066864013672, "gen_logits_std": 3.009871244430542, "gen_loss": 0.2748872637748718, "grad_norm": 0.3952089300099463, "learning_rate": 2.2903578947368423e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9964391887187958, "mean_gen_accuracy": 0.8770652562379837, "mean_token_accuracy": 0.9054225832223892, "num_tokens": 639026974.0, "sample_num_tokens": 8883.0, "step": 6354, "total_num_tokens": 639062506.0, "z_loss": 0.000465963501483202 }, { "copy_logits_max": -2.253448963165283, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.375, "epoch": 1.2979831503701813, "gen_logits_max": 3.3351998329162598, "gen_logits_mean": -15.337696075439453, "gen_logits_min": -27.15816879272461, "gen_logits_std": 3.0107338428497314, "gen_loss": 0.24810919165611267, "grad_norm": 0.4193883808686888, "learning_rate": 2.2902315789473684e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9961742162704468, "mean_gen_accuracy": 0.8712707757949829, "mean_token_accuracy": 0.9018279612064362, "num_tokens": 639303316.0, "sample_num_tokens": 9666.0, "step": 6355, "total_num_tokens": 639341980.0, "z_loss": 0.00041572973714210093 }, { "copy_logits_max": -4.043800354003906, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.5625, "epoch": 1.2981873883073782, "gen_logits_max": 3.3105897903442383, "gen_logits_mean": -16.652782440185547, "gen_logits_min": -28.278902053833008, "gen_logits_std": 3.036583423614502, "gen_loss": 0.309242308139801, "grad_norm": 0.3777944239127976, "learning_rate": 2.290105263157895e-05, "loss": 0.3055, "mean_copy_accuracy": 0.9962591975927353, "mean_gen_accuracy": 0.8682104796171188, "mean_token_accuracy": 0.8953498005867004, "num_tokens": 639571860.0, "sample_num_tokens": 8007.0, "step": 6356, "total_num_tokens": 639603888.0, "z_loss": 0.0004842875059694052 }, { "copy_logits_max": -2.490851402282715, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.125, "epoch": 1.298391626244575, "gen_logits_max": 3.891972064971924, "gen_logits_mean": -16.644079208374023, "gen_logits_min": -28.31020164489746, "gen_logits_std": 3.057562828063965, "gen_loss": 0.2809792160987854, "grad_norm": 0.35925826710884257, "learning_rate": 2.289978947368421e-05, "loss": 0.2692, "mean_copy_accuracy": 0.9965447038412094, "mean_gen_accuracy": 0.880207359790802, "mean_token_accuracy": 0.9085501581430435, "num_tokens": 639843415.0, "sample_num_tokens": 8203.25, "step": 6357, "total_num_tokens": 639876228.0, "z_loss": 0.0004901321954093874 }, { "copy_logits_max": -4.37684440612793, "copy_logits_min": -750000064.0, "copy_num_tokens": 277.25, "epoch": 1.2985958641817716, "gen_logits_max": 4.131810188293457, "gen_logits_mean": -16.040367126464844, "gen_logits_min": -28.409568786621094, "gen_logits_std": 3.046227216720581, "gen_loss": 0.30617159605026245, "grad_norm": 0.4084074603774227, "learning_rate": 2.2898526315789474e-05, "loss": 0.3019, "mean_copy_accuracy": 0.9959539026021957, "mean_gen_accuracy": 0.8704459518194199, "mean_token_accuracy": 0.8969789147377014, "num_tokens": 640102385.0, "sample_num_tokens": 7316.75, "step": 6358, "total_num_tokens": 640131652.0, "z_loss": 0.0004887069808319211 }, { "copy_logits_max": -3.500084161758423, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.625, "epoch": 1.2988001021189686, "gen_logits_max": 3.761375904083252, "gen_logits_mean": -16.774974822998047, "gen_logits_min": -28.933916091918945, "gen_logits_std": 3.074035882949829, "gen_loss": 0.2989862263202667, "grad_norm": 0.37902105129519115, "learning_rate": 2.2897263157894738e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9958540201187134, "mean_gen_accuracy": 0.868761882185936, "mean_token_accuracy": 0.900116816163063, "num_tokens": 640380872.0, "sample_num_tokens": 7855.0, "step": 6359, "total_num_tokens": 640412292.0, "z_loss": 0.0005022227996960282 }, { "copy_logits_max": -3.5203213691711426, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.9375, "epoch": 1.2990043400561655, "gen_logits_max": 2.434410333633423, "gen_logits_mean": -18.08420753479004, "gen_logits_min": -30.08338165283203, "gen_logits_std": 3.1203269958496094, "gen_loss": 0.2558039426803589, "grad_norm": 0.37745842705170646, "learning_rate": 2.2896e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9961636513471603, "mean_gen_accuracy": 0.8770206421613693, "mean_token_accuracy": 0.9073342978954315, "num_tokens": 640651288.0, "sample_num_tokens": 9210.0, "step": 6360, "total_num_tokens": 640688128.0, "z_loss": 0.0004546709533315152 }, { "copy_logits_max": -4.060641288757324, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.0625, "epoch": 1.2992085779933622, "gen_logits_max": 3.64516282081604, "gen_logits_mean": -16.24182891845703, "gen_logits_min": -29.14737319946289, "gen_logits_std": 3.0485777854919434, "gen_loss": 0.28968411684036255, "grad_norm": 0.3504690140814914, "learning_rate": 2.2894736842105263e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9972385764122009, "mean_gen_accuracy": 0.8756003975868225, "mean_token_accuracy": 0.9088824391365051, "num_tokens": 640931248.0, "sample_num_tokens": 8700.0, "step": 6361, "total_num_tokens": 640966048.0, "z_loss": 0.0005447000730782747 }, { "copy_logits_max": -3.6072657108306885, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.3125, "epoch": 1.2994128159305591, "gen_logits_max": 4.66205358505249, "gen_logits_mean": -15.755013465881348, "gen_logits_min": -27.558765411376953, "gen_logits_std": 3.0267205238342285, "gen_loss": 0.3066060245037079, "grad_norm": 0.35797888217238943, "learning_rate": 2.2893473684210528e-05, "loss": 0.2984, "mean_copy_accuracy": 0.996892049908638, "mean_gen_accuracy": 0.8688599318265915, "mean_token_accuracy": 0.8982479274272919, "num_tokens": 641212673.0, "sample_num_tokens": 7615.75, "step": 6362, "total_num_tokens": 641243136.0, "z_loss": 0.0006468254723586142 }, { "copy_logits_max": -1.1010266542434692, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.3125, "epoch": 1.299617053867756, "gen_logits_max": 4.280213356018066, "gen_logits_mean": -15.624919891357422, "gen_logits_min": -27.45915985107422, "gen_logits_std": 3.033780097961426, "gen_loss": 0.3158976435661316, "grad_norm": 0.37197440733784476, "learning_rate": 2.2892210526315792e-05, "loss": 0.268, "mean_copy_accuracy": 0.9962359666824341, "mean_gen_accuracy": 0.8813731968402863, "mean_token_accuracy": 0.9083608239889145, "num_tokens": 641462384.0, "sample_num_tokens": 8238.5, "step": 6363, "total_num_tokens": 641495338.0, "z_loss": 0.0006364218425005674 }, { "copy_logits_max": -0.8914439678192139, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.0625, "epoch": 1.2998212918049528, "gen_logits_max": 4.523652076721191, "gen_logits_mean": -15.57042121887207, "gen_logits_min": -27.34736442565918, "gen_logits_std": 3.051051139831543, "gen_loss": 0.2699201703071594, "grad_norm": 0.38062548124823053, "learning_rate": 2.2890947368421053e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9964240938425064, "mean_gen_accuracy": 0.8797940760850906, "mean_token_accuracy": 0.9084722101688385, "num_tokens": 641751601.0, "sample_num_tokens": 9224.25, "step": 6364, "total_num_tokens": 641788498.0, "z_loss": 0.0005624338518828154 }, { "copy_logits_max": -5.548998832702637, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.3125, "epoch": 1.3000255297421495, "gen_logits_max": 4.157385349273682, "gen_logits_mean": -16.888267517089844, "gen_logits_min": -28.909704208374023, "gen_logits_std": 3.05765438079834, "gen_loss": 0.3402899503707886, "grad_norm": 0.403642333047354, "learning_rate": 2.2889684210526317e-05, "loss": 0.289, "mean_copy_accuracy": 0.9939437061548233, "mean_gen_accuracy": 0.8701701164245605, "mean_token_accuracy": 0.9009932726621628, "num_tokens": 642038752.0, "sample_num_tokens": 7161.5, "step": 6365, "total_num_tokens": 642067398.0, "z_loss": 0.0005968943587504327 }, { "copy_logits_max": -4.053332328796387, "copy_logits_min": -750000064.0, "copy_num_tokens": 395.0, "epoch": 1.3002297676793464, "gen_logits_max": 3.1204514503479004, "gen_logits_mean": -17.413188934326172, "gen_logits_min": -29.312728881835938, "gen_logits_std": 3.114121437072754, "gen_loss": 0.26983457803726196, "grad_norm": 0.3838714535746377, "learning_rate": 2.2888421052631578e-05, "loss": 0.2691, "mean_copy_accuracy": 0.9961099326610565, "mean_gen_accuracy": 0.8778945803642273, "mean_token_accuracy": 0.9069753289222717, "num_tokens": 642297401.0, "sample_num_tokens": 7445.75, "step": 6366, "total_num_tokens": 642327184.0, "z_loss": 0.0005450048483908176 }, { "copy_logits_max": -3.3726806640625, "copy_logits_min": -687500032.0, "copy_num_tokens": 330.4375, "epoch": 1.3004340056165433, "gen_logits_max": 3.9910385608673096, "gen_logits_mean": -16.26654052734375, "gen_logits_min": -28.258495330810547, "gen_logits_std": 3.0485386848449707, "gen_loss": 0.3244703710079193, "grad_norm": 0.4270439831739471, "learning_rate": 2.2887157894736842e-05, "loss": 0.3056, "mean_copy_accuracy": 0.9965542107820511, "mean_gen_accuracy": 0.8661701083183289, "mean_token_accuracy": 0.8959906548261642, "num_tokens": 642542985.0, "sample_num_tokens": 7001.25, "step": 6367, "total_num_tokens": 642570990.0, "z_loss": 0.0006306055001914501 }, { "copy_logits_max": -3.3034377098083496, "copy_logits_min": -750000064.0, "copy_num_tokens": 615.0, "epoch": 1.30063824355374, "gen_logits_max": 4.360536575317383, "gen_logits_mean": -15.066971778869629, "gen_logits_min": -27.426456451416016, "gen_logits_std": 3.0436036586761475, "gen_loss": 0.2274336814880371, "grad_norm": 0.3921984894465305, "learning_rate": 2.2885894736842103e-05, "loss": 0.2571, "mean_copy_accuracy": 0.9971435368061066, "mean_gen_accuracy": 0.8803799450397491, "mean_token_accuracy": 0.9129524976015091, "num_tokens": 642824142.0, "sample_num_tokens": 9610.5, "step": 6368, "total_num_tokens": 642862584.0, "z_loss": 0.0004919599741697311 }, { "copy_logits_max": -3.850834608078003, "copy_logits_min": -687500032.0, "copy_num_tokens": 655.625, "epoch": 1.300842481490937, "gen_logits_max": 3.225088119506836, "gen_logits_mean": -16.96853256225586, "gen_logits_min": -28.824674606323242, "gen_logits_std": 3.1005377769470215, "gen_loss": 0.2729145586490631, "grad_norm": 0.3510422200410392, "learning_rate": 2.2884631578947368e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9969664961099625, "mean_gen_accuracy": 0.872879832983017, "mean_token_accuracy": 0.9049165695905685, "num_tokens": 643113699.0, "sample_num_tokens": 9550.25, "step": 6369, "total_num_tokens": 643151900.0, "z_loss": 0.0005234827403910458 }, { "copy_logits_max": -1.5200161933898926, "copy_logits_min": -625000064.0, "copy_num_tokens": 641.5, "epoch": 1.3010467194281339, "gen_logits_max": 4.411957740783691, "gen_logits_mean": -14.652917861938477, "gen_logits_min": -26.959373474121094, "gen_logits_std": 3.0424742698669434, "gen_loss": 0.249203622341156, "grad_norm": 0.39509884893893815, "learning_rate": 2.2883368421052632e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9962314069271088, "mean_gen_accuracy": 0.8764637261629105, "mean_token_accuracy": 0.9091017991304398, "num_tokens": 643404856.0, "sample_num_tokens": 10123.5, "step": 6370, "total_num_tokens": 643445350.0, "z_loss": 0.0004776420828420669 }, { "copy_logits_max": -4.273135662078857, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.25, "epoch": 1.3012509573653306, "gen_logits_max": 4.498635768890381, "gen_logits_mean": -16.173694610595703, "gen_logits_min": -28.039154052734375, "gen_logits_std": 3.040771007537842, "gen_loss": 0.2931845188140869, "grad_norm": 0.3837043303248106, "learning_rate": 2.2882105263157896e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9962881058454514, "mean_gen_accuracy": 0.876945972442627, "mean_token_accuracy": 0.9046511203050613, "num_tokens": 643679098.0, "sample_num_tokens": 8674.5, "step": 6371, "total_num_tokens": 643713796.0, "z_loss": 0.0005574612878262997 }, { "copy_logits_max": -4.47955322265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.875, "epoch": 1.3014551953025275, "gen_logits_max": 4.037502765655518, "gen_logits_mean": -16.81155776977539, "gen_logits_min": -28.906753540039062, "gen_logits_std": 3.1028013229370117, "gen_loss": 0.2687975764274597, "grad_norm": 0.3906662660625327, "learning_rate": 2.288084210526316e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9968272298574448, "mean_gen_accuracy": 0.8790693134069443, "mean_token_accuracy": 0.9039613157510757, "num_tokens": 643952668.0, "sample_num_tokens": 8363.0, "step": 6372, "total_num_tokens": 643986120.0, "z_loss": 0.0005653304397128522 }, { "copy_logits_max": -2.5119285583496094, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.6875, "epoch": 1.3016594332397242, "gen_logits_max": 5.018392562866211, "gen_logits_mean": -14.42756462097168, "gen_logits_min": -26.429794311523438, "gen_logits_std": 2.981949806213379, "gen_loss": 0.3279026448726654, "grad_norm": 0.368356626233968, "learning_rate": 2.287957894736842e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9964166134595871, "mean_gen_accuracy": 0.8721098154783249, "mean_token_accuracy": 0.8976640701293945, "num_tokens": 644217009.0, "sample_num_tokens": 9647.25, "step": 6373, "total_num_tokens": 644255598.0, "z_loss": 0.0006183824152685702 }, { "copy_logits_max": -2.230330228805542, "copy_logits_min": -750000128.0, "copy_num_tokens": 474.4375, "epoch": 1.3018636711769211, "gen_logits_max": 4.503616809844971, "gen_logits_mean": -15.274553298950195, "gen_logits_min": -27.796762466430664, "gen_logits_std": 3.031121253967285, "gen_loss": 0.31029245257377625, "grad_norm": 0.37163953109174275, "learning_rate": 2.2878315789473686e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9965958893299103, "mean_gen_accuracy": 0.8708498626947403, "mean_token_accuracy": 0.9013354331254959, "num_tokens": 644505539.0, "sample_num_tokens": 9062.25, "step": 6374, "total_num_tokens": 644541788.0, "z_loss": 0.0006213319720700383 }, { "copy_logits_max": -2.517711639404297, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0625, "epoch": 1.3020679091141178, "gen_logits_max": 4.068953037261963, "gen_logits_mean": -16.330101013183594, "gen_logits_min": -28.344776153564453, "gen_logits_std": 3.0525360107421875, "gen_loss": 0.28547802567481995, "grad_norm": 0.3701583613888743, "learning_rate": 2.2877052631578947e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9973320662975311, "mean_gen_accuracy": 0.872904822230339, "mean_token_accuracy": 0.9047359675168991, "num_tokens": 644770977.0, "sample_num_tokens": 8159.75, "step": 6375, "total_num_tokens": 644803616.0, "z_loss": 0.0005757440812885761 }, { "copy_logits_max": -4.785328388214111, "copy_logits_min": -750000064.0, "copy_num_tokens": 655.25, "epoch": 1.3022721470513148, "gen_logits_max": 2.570998191833496, "gen_logits_mean": -18.1092529296875, "gen_logits_min": -29.91042709350586, "gen_logits_std": 3.121838092803955, "gen_loss": 0.26941806077957153, "grad_norm": 0.3609701739754994, "learning_rate": 2.287578947368421e-05, "loss": 0.2638, "mean_copy_accuracy": 0.997161015868187, "mean_gen_accuracy": 0.8791305869817734, "mean_token_accuracy": 0.9097515791654587, "num_tokens": 645054624.0, "sample_num_tokens": 9852.5, "step": 6376, "total_num_tokens": 645094034.0, "z_loss": 0.00048510776832699776 }, { "copy_logits_max": -4.381281852722168, "copy_logits_min": -750000064.0, "copy_num_tokens": 455.0625, "epoch": 1.3024763849885117, "gen_logits_max": 4.150539875030518, "gen_logits_mean": -16.802562713623047, "gen_logits_min": -28.575550079345703, "gen_logits_std": 3.07574462890625, "gen_loss": 0.28679826855659485, "grad_norm": 0.35589173792857154, "learning_rate": 2.2874526315789472e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9957557022571564, "mean_gen_accuracy": 0.8781449496746063, "mean_token_accuracy": 0.910266324877739, "num_tokens": 645319579.0, "sample_num_tokens": 8584.75, "step": 6377, "total_num_tokens": 645353918.0, "z_loss": 0.0005660606548190117 }, { "copy_logits_max": -2.76947283744812, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.0625, "epoch": 1.3026806229257084, "gen_logits_max": 3.176440715789795, "gen_logits_mean": -17.425012588500977, "gen_logits_min": -29.653226852416992, "gen_logits_std": 3.1318931579589844, "gen_loss": 0.2564944922924042, "grad_norm": 0.36223573083853366, "learning_rate": 2.287326315789474e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9970849752426147, "mean_gen_accuracy": 0.8791421055793762, "mean_token_accuracy": 0.9087399989366531, "num_tokens": 645585084.0, "sample_num_tokens": 7319.5, "step": 6378, "total_num_tokens": 645614362.0, "z_loss": 0.0004925555549561977 }, { "copy_logits_max": -5.856985092163086, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.0, "epoch": 1.3028848608629053, "gen_logits_max": 4.366372585296631, "gen_logits_mean": -17.16795539855957, "gen_logits_min": -29.11900520324707, "gen_logits_std": 3.0587029457092285, "gen_loss": 0.30005037784576416, "grad_norm": 0.36264932838338687, "learning_rate": 2.2872e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9960856586694717, "mean_gen_accuracy": 0.8696358352899551, "mean_token_accuracy": 0.8989241570234299, "num_tokens": 645865134.0, "sample_num_tokens": 7240.0, "step": 6379, "total_num_tokens": 645894094.0, "z_loss": 0.0005720455083064735 }, { "copy_logits_max": -4.655797481536865, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0, "epoch": 1.303089098800102, "gen_logits_max": 4.621913909912109, "gen_logits_mean": -17.226938247680664, "gen_logits_min": -29.49249267578125, "gen_logits_std": 3.0796422958374023, "gen_loss": 0.26737916469573975, "grad_norm": 0.401059461917517, "learning_rate": 2.2870736842105265e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9960011541843414, "mean_gen_accuracy": 0.8688157796859741, "mean_token_accuracy": 0.8977733999490738, "num_tokens": 646118412.0, "sample_num_tokens": 7655.0, "step": 6380, "total_num_tokens": 646149032.0, "z_loss": 0.00048509216867387295 }, { "copy_logits_max": -5.975478172302246, "copy_logits_min": -750000064.0, "copy_num_tokens": 433.1875, "epoch": 1.303293336737299, "gen_logits_max": 4.140753269195557, "gen_logits_mean": -17.57381248474121, "gen_logits_min": -29.733139038085938, "gen_logits_std": 3.1206910610198975, "gen_loss": 0.28341883420944214, "grad_norm": 0.3624802393349816, "learning_rate": 2.2869473684210526e-05, "loss": 0.2757, "mean_copy_accuracy": 0.996700257062912, "mean_gen_accuracy": 0.8752427697181702, "mean_token_accuracy": 0.9071636348962784, "num_tokens": 646403874.0, "sample_num_tokens": 8246.5, "step": 6381, "total_num_tokens": 646436860.0, "z_loss": 0.00048772135050967336 }, { "copy_logits_max": -6.796782493591309, "copy_logits_min": -750000128.0, "copy_num_tokens": 557.25, "epoch": 1.3034975746744957, "gen_logits_max": 3.440612316131592, "gen_logits_mean": -16.940410614013672, "gen_logits_min": -29.213973999023438, "gen_logits_std": 3.125105381011963, "gen_loss": 0.30036211013793945, "grad_norm": 0.37955073454794436, "learning_rate": 2.286821052631579e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9965122491121292, "mean_gen_accuracy": 0.8678408563137054, "mean_token_accuracy": 0.9052297323942184, "num_tokens": 646670012.0, "sample_num_tokens": 9100.0, "step": 6382, "total_num_tokens": 646706412.0, "z_loss": 0.0004985015839338303 }, { "copy_logits_max": -3.3250820636749268, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.3125, "epoch": 1.3037018126116926, "gen_logits_max": 4.011943817138672, "gen_logits_mean": -15.970873832702637, "gen_logits_min": -28.576129913330078, "gen_logits_std": 3.0677003860473633, "gen_loss": 0.28275826573371887, "grad_norm": 0.4008695097708252, "learning_rate": 2.286694736842105e-05, "loss": 0.2966, "mean_copy_accuracy": 0.9961199015378952, "mean_gen_accuracy": 0.8670143336057663, "mean_token_accuracy": 0.8987606763839722, "num_tokens": 646932662.0, "sample_num_tokens": 8643.0, "step": 6383, "total_num_tokens": 646967234.0, "z_loss": 0.00046106657828204334 }, { "copy_logits_max": -6.278396129608154, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.8125, "epoch": 1.3039060505488895, "gen_logits_max": 3.536548376083374, "gen_logits_mean": -17.351028442382812, "gen_logits_min": -29.855947494506836, "gen_logits_std": 3.0789289474487305, "gen_loss": 0.29519879817962646, "grad_norm": 0.39446019249016456, "learning_rate": 2.2865684210526315e-05, "loss": 0.2927, "mean_copy_accuracy": 0.996049702167511, "mean_gen_accuracy": 0.8680603057146072, "mean_token_accuracy": 0.9001889228820801, "num_tokens": 647206463.0, "sample_num_tokens": 9186.75, "step": 6384, "total_num_tokens": 647243210.0, "z_loss": 0.00048546300968155265 }, { "copy_logits_max": -4.297040939331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.3125, "epoch": 1.3041102884860862, "gen_logits_max": 4.126012802124023, "gen_logits_mean": -16.806400299072266, "gen_logits_min": -29.266944885253906, "gen_logits_std": 3.0638480186462402, "gen_loss": 0.290404349565506, "grad_norm": 0.38239757989740447, "learning_rate": 2.286442105263158e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9958649575710297, "mean_gen_accuracy": 0.8746679574251175, "mean_token_accuracy": 0.9034847468137741, "num_tokens": 647466669.0, "sample_num_tokens": 8639.25, "step": 6385, "total_num_tokens": 647501226.0, "z_loss": 0.0004953944589942694 }, { "copy_logits_max": -5.2683258056640625, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.375, "epoch": 1.3043145264232832, "gen_logits_max": 4.955496311187744, "gen_logits_mean": -16.347562789916992, "gen_logits_min": -28.931364059448242, "gen_logits_std": 3.0504140853881836, "gen_loss": 0.2726282477378845, "grad_norm": 0.3795104118707684, "learning_rate": 2.2863157894736844e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9957527816295624, "mean_gen_accuracy": 0.8771374523639679, "mean_token_accuracy": 0.9041097313165665, "num_tokens": 647719692.0, "sample_num_tokens": 7869.0, "step": 6386, "total_num_tokens": 647751168.0, "z_loss": 0.0005097012035548687 }, { "copy_logits_max": -3.52868390083313, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.25, "epoch": 1.30451876436048, "gen_logits_max": 5.685333251953125, "gen_logits_mean": -14.499868392944336, "gen_logits_min": -26.983997344970703, "gen_logits_std": 2.970801830291748, "gen_loss": 0.3359223008155823, "grad_norm": 0.3872837796527604, "learning_rate": 2.2861894736842108e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9971201121807098, "mean_gen_accuracy": 0.8702082931995392, "mean_token_accuracy": 0.900273323059082, "num_tokens": 647979178.0, "sample_num_tokens": 8656.5, "step": 6387, "total_num_tokens": 648013804.0, "z_loss": 0.0006495376001112163 }, { "copy_logits_max": -3.204298973083496, "copy_logits_min": -687500032.0, "copy_num_tokens": 612.75, "epoch": 1.3047230022976768, "gen_logits_max": 4.009346008300781, "gen_logits_mean": -16.676494598388672, "gen_logits_min": -29.3089599609375, "gen_logits_std": 3.064624309539795, "gen_loss": 0.2738821506500244, "grad_norm": 0.38303332397333756, "learning_rate": 2.286063157894737e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9957316368818283, "mean_gen_accuracy": 0.8793655335903168, "mean_token_accuracy": 0.9072212129831314, "num_tokens": 648263738.0, "sample_num_tokens": 9668.0, "step": 6388, "total_num_tokens": 648302410.0, "z_loss": 0.0005119290435686707 }, { "copy_logits_max": -5.968334197998047, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.25, "epoch": 1.3049272402348735, "gen_logits_max": 4.0492963790893555, "gen_logits_mean": -16.630531311035156, "gen_logits_min": -28.594009399414062, "gen_logits_std": 3.046074867248535, "gen_loss": 0.2572360038757324, "grad_norm": 0.36207802458253785, "learning_rate": 2.2859368421052634e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9958374351263046, "mean_gen_accuracy": 0.8744038641452789, "mean_token_accuracy": 0.9044522643089294, "num_tokens": 648541667.0, "sample_num_tokens": 7602.25, "step": 6389, "total_num_tokens": 648572076.0, "z_loss": 0.0005128895863890648 }, { "copy_logits_max": -5.298479080200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.4375, "epoch": 1.3051314781720704, "gen_logits_max": 6.195754051208496, "gen_logits_mean": -14.198184967041016, "gen_logits_min": -26.364479064941406, "gen_logits_std": 2.9823596477508545, "gen_loss": 0.36599788069725037, "grad_norm": 0.3335171115503523, "learning_rate": 2.2858105263157894e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9959153980016708, "mean_gen_accuracy": 0.8799784481525421, "mean_token_accuracy": 0.9060351699590683, "num_tokens": 648835908.0, "sample_num_tokens": 9328.0, "step": 6390, "total_num_tokens": 648873220.0, "z_loss": 0.0006856803083792329 }, { "copy_logits_max": -4.008635520935059, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.375, "epoch": 1.3053357161092674, "gen_logits_max": 3.551278829574585, "gen_logits_mean": -16.91119956970215, "gen_logits_min": -28.850051879882812, "gen_logits_std": 3.0418546199798584, "gen_loss": 0.30490273237228394, "grad_norm": 0.3946340064691242, "learning_rate": 2.285684210526316e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9966104626655579, "mean_gen_accuracy": 0.8714984059333801, "mean_token_accuracy": 0.9035933315753937, "num_tokens": 649106198.0, "sample_num_tokens": 8088.5, "step": 6391, "total_num_tokens": 649138552.0, "z_loss": 0.0005855878698639572 }, { "copy_logits_max": -6.051806449890137, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.375, "epoch": 1.305539954046464, "gen_logits_max": 4.316579341888428, "gen_logits_mean": -16.313011169433594, "gen_logits_min": -28.356557846069336, "gen_logits_std": 3.039550542831421, "gen_loss": 0.2797098755836487, "grad_norm": 0.34960696712215344, "learning_rate": 2.285557894736842e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9960641115903854, "mean_gen_accuracy": 0.8756754845380783, "mean_token_accuracy": 0.9069485068321228, "num_tokens": 649400110.0, "sample_num_tokens": 8250.5, "step": 6392, "total_num_tokens": 649433112.0, "z_loss": 0.0005265497602522373 }, { "copy_logits_max": -3.78602933883667, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.3125, "epoch": 1.305744191983661, "gen_logits_max": 4.204809188842773, "gen_logits_mean": -16.800382614135742, "gen_logits_min": -28.852657318115234, "gen_logits_std": 3.070638656616211, "gen_loss": 0.32805687189102173, "grad_norm": 0.3792304555100728, "learning_rate": 2.2854315789473684e-05, "loss": 0.3003, "mean_copy_accuracy": 0.995582640171051, "mean_gen_accuracy": 0.8745233118534088, "mean_token_accuracy": 0.8987340778112411, "num_tokens": 649657411.0, "sample_num_tokens": 7499.75, "step": 6393, "total_num_tokens": 649687410.0, "z_loss": 0.0006479834555648267 }, { "copy_logits_max": -7.171685218811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 251.25, "epoch": 1.305948429920858, "gen_logits_max": 3.2392587661743164, "gen_logits_mean": -18.672775268554688, "gen_logits_min": -30.336130142211914, "gen_logits_std": 3.130244731903076, "gen_loss": 0.24055331945419312, "grad_norm": 0.3860967764523069, "learning_rate": 2.2853052631578948e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9957308322191238, "mean_gen_accuracy": 0.8805100321769714, "mean_token_accuracy": 0.9067169576883316, "num_tokens": 649914505.0, "sample_num_tokens": 6083.25, "step": 6394, "total_num_tokens": 649938838.0, "z_loss": 0.00044397974852472544 }, { "copy_logits_max": -2.6202502250671387, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.75, "epoch": 1.3061526678580546, "gen_logits_max": 3.3263180255889893, "gen_logits_mean": -15.531964302062988, "gen_logits_min": -28.056900024414062, "gen_logits_std": 3.006873846054077, "gen_loss": 0.29422175884246826, "grad_norm": 0.3886001414160006, "learning_rate": 2.2851789473684213e-05, "loss": 0.276, "mean_copy_accuracy": 0.9969203472137451, "mean_gen_accuracy": 0.8722607642412186, "mean_token_accuracy": 0.90603868663311, "num_tokens": 650176633.0, "sample_num_tokens": 8570.75, "step": 6395, "total_num_tokens": 650210916.0, "z_loss": 0.0005460734246298671 }, { "copy_logits_max": -5.288735866546631, "copy_logits_min": -687500032.0, "copy_num_tokens": 363.1875, "epoch": 1.3063569057952515, "gen_logits_max": 4.674942493438721, "gen_logits_mean": -16.54894256591797, "gen_logits_min": -28.28351402282715, "gen_logits_std": 3.0582127571105957, "gen_loss": 0.2836865782737732, "grad_norm": 0.5227916363675096, "learning_rate": 2.2850526315789474e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9955435395240784, "mean_gen_accuracy": 0.8772406876087189, "mean_token_accuracy": 0.9014606326818466, "num_tokens": 650455700.0, "sample_num_tokens": 8269.5, "step": 6396, "total_num_tokens": 650488778.0, "z_loss": 0.0005304461810737848 }, { "copy_logits_max": -3.662137508392334, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.0, "epoch": 1.3065611437324482, "gen_logits_max": 3.2188920974731445, "gen_logits_mean": -17.30707550048828, "gen_logits_min": -29.12073516845703, "gen_logits_std": 3.0613341331481934, "gen_loss": 0.3072658181190491, "grad_norm": 0.3630068023300661, "learning_rate": 2.2849263157894738e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9956577867269516, "mean_gen_accuracy": 0.8715314269065857, "mean_token_accuracy": 0.901416152715683, "num_tokens": 650725492.0, "sample_num_tokens": 8904.5, "step": 6397, "total_num_tokens": 650761110.0, "z_loss": 0.0006080353050492704 }, { "copy_logits_max": -0.9577308893203735, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.5625, "epoch": 1.3067653816696452, "gen_logits_max": 4.26565408706665, "gen_logits_mean": -15.603126525878906, "gen_logits_min": -27.527177810668945, "gen_logits_std": 2.9772932529449463, "gen_loss": 0.3455696702003479, "grad_norm": 0.39215877777050867, "learning_rate": 2.2848000000000002e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9972472935914993, "mean_gen_accuracy": 0.8756951540708542, "mean_token_accuracy": 0.903285413980484, "num_tokens": 650987664.0, "sample_num_tokens": 8193.5, "step": 6398, "total_num_tokens": 651020438.0, "z_loss": 0.0006882348097860813 }, { "copy_logits_max": -2.283292293548584, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.625, "epoch": 1.3069696196068419, "gen_logits_max": 4.02669620513916, "gen_logits_mean": -16.64775848388672, "gen_logits_min": -28.39891815185547, "gen_logits_std": 3.0812931060791016, "gen_loss": 0.27658790349960327, "grad_norm": 0.4441501652115246, "learning_rate": 2.2846736842105263e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9974974393844604, "mean_gen_accuracy": 0.8783183395862579, "mean_token_accuracy": 0.908745065331459, "num_tokens": 651255198.0, "sample_num_tokens": 7807.0, "step": 6399, "total_num_tokens": 651286426.0, "z_loss": 0.000563178095035255 }, { "copy_logits_max": -4.0831298828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.5, "epoch": 1.3071738575440388, "gen_logits_max": 3.638272762298584, "gen_logits_mean": -16.77848243713379, "gen_logits_min": -28.592802047729492, "gen_logits_std": 3.047895669937134, "gen_loss": 0.3280273675918579, "grad_norm": 0.34577273571470324, "learning_rate": 2.2845473684210527e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9969307631254196, "mean_gen_accuracy": 0.8746385127305984, "mean_token_accuracy": 0.9010367393493652, "num_tokens": 651522407.0, "sample_num_tokens": 9841.75, "step": 6400, "total_num_tokens": 651561774.0, "z_loss": 0.000619297381490469 }, { "copy_logits_max": -3.942561149597168, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.625, "epoch": 1.3073780954812357, "gen_logits_max": 3.262385368347168, "gen_logits_mean": -15.987415313720703, "gen_logits_min": -28.302635192871094, "gen_logits_std": 3.0397446155548096, "gen_loss": 0.2413199245929718, "grad_norm": 0.36718095157934527, "learning_rate": 2.284421052631579e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9951655864715576, "mean_gen_accuracy": 0.8830353766679764, "mean_token_accuracy": 0.9080914407968521, "num_tokens": 651787149.0, "sample_num_tokens": 9564.25, "step": 6401, "total_num_tokens": 651825406.0, "z_loss": 0.000442157790530473 }, { "copy_logits_max": -3.4144108295440674, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.5, "epoch": 1.3075823334184324, "gen_logits_max": 3.126397132873535, "gen_logits_mean": -16.69127655029297, "gen_logits_min": -28.572834014892578, "gen_logits_std": 3.0595974922180176, "gen_loss": 0.2865760624408722, "grad_norm": 0.3693529211728135, "learning_rate": 2.2842947368421056e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9961436241865158, "mean_gen_accuracy": 0.8703542649745941, "mean_token_accuracy": 0.9005661457777023, "num_tokens": 652057441.0, "sample_num_tokens": 9552.25, "step": 6402, "total_num_tokens": 652095650.0, "z_loss": 0.0005353789310902357 }, { "copy_logits_max": -5.532444477081299, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.375, "epoch": 1.3077865713556294, "gen_logits_max": 3.647726058959961, "gen_logits_mean": -17.36859893798828, "gen_logits_min": -29.21806526184082, "gen_logits_std": 3.1137306690216064, "gen_loss": 0.3032372295856476, "grad_norm": 0.37855565649004863, "learning_rate": 2.2841684210526317e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9938030242919922, "mean_gen_accuracy": 0.8761787563562393, "mean_token_accuracy": 0.9027100205421448, "num_tokens": 652324395.0, "sample_num_tokens": 8788.75, "step": 6403, "total_num_tokens": 652359550.0, "z_loss": 0.00056447897804901 }, { "copy_logits_max": -1.1540733575820923, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.5, "epoch": 1.307990809292826, "gen_logits_max": 2.7278242111206055, "gen_logits_mean": -16.979990005493164, "gen_logits_min": -29.221086502075195, "gen_logits_std": 3.095107078552246, "gen_loss": 0.24993057548999786, "grad_norm": 0.37303519078786085, "learning_rate": 2.284042105263158e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9967737942934036, "mean_gen_accuracy": 0.879610002040863, "mean_token_accuracy": 0.9074907898902893, "num_tokens": 652599429.0, "sample_num_tokens": 8402.25, "step": 6404, "total_num_tokens": 652633038.0, "z_loss": 0.0005034086061641574 }, { "copy_logits_max": -0.2558481693267822, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.0, "epoch": 1.308195047230023, "gen_logits_max": 3.211688280105591, "gen_logits_mean": -16.208927154541016, "gen_logits_min": -28.77639389038086, "gen_logits_std": 3.116504669189453, "gen_loss": 0.2801535427570343, "grad_norm": 0.38714626096032767, "learning_rate": 2.2839157894736842e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9972953200340271, "mean_gen_accuracy": 0.8705197870731354, "mean_token_accuracy": 0.9031452834606171, "num_tokens": 652865640.0, "sample_num_tokens": 7894.5, "step": 6405, "total_num_tokens": 652897218.0, "z_loss": 0.0005229862872511148 }, { "copy_logits_max": -2.529709815979004, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.125, "epoch": 1.3083992851672197, "gen_logits_max": 2.7642791271209717, "gen_logits_mean": -16.773597717285156, "gen_logits_min": -28.949491500854492, "gen_logits_std": 3.1327733993530273, "gen_loss": 0.2746557593345642, "grad_norm": 0.35274864828442715, "learning_rate": 2.2837894736842106e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9957620948553085, "mean_gen_accuracy": 0.8734586834907532, "mean_token_accuracy": 0.901690661907196, "num_tokens": 653123775.0, "sample_num_tokens": 8928.75, "step": 6406, "total_num_tokens": 653159490.0, "z_loss": 0.0004673404910136014 }, { "copy_logits_max": -2.6259055137634277, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.8125, "epoch": 1.3086035231044166, "gen_logits_max": 2.99760103225708, "gen_logits_mean": -17.49880027770996, "gen_logits_min": -29.72774314880371, "gen_logits_std": 3.1472620964050293, "gen_loss": 0.2721986174583435, "grad_norm": 0.4429498131619958, "learning_rate": 2.2836631578947367e-05, "loss": 0.2993, "mean_copy_accuracy": 0.9954172670841217, "mean_gen_accuracy": 0.8694076836109161, "mean_token_accuracy": 0.8980424702167511, "num_tokens": 653382671.0, "sample_num_tokens": 8121.25, "step": 6407, "total_num_tokens": 653415156.0, "z_loss": 0.00045287038665264845 }, { "copy_logits_max": -3.0729053020477295, "copy_logits_min": -750000064.0, "copy_num_tokens": 365.1875, "epoch": 1.3088077610416136, "gen_logits_max": 3.422549247741699, "gen_logits_mean": -16.9862060546875, "gen_logits_min": -28.91558837890625, "gen_logits_std": 3.1047983169555664, "gen_loss": 0.29923155903816223, "grad_norm": 0.3831474238373313, "learning_rate": 2.2835368421052632e-05, "loss": 0.295, "mean_copy_accuracy": 0.9958149939775467, "mean_gen_accuracy": 0.8690595924854279, "mean_token_accuracy": 0.8972921669483185, "num_tokens": 653637614.0, "sample_num_tokens": 7820.0, "step": 6408, "total_num_tokens": 653668894.0, "z_loss": 0.000528945354744792 }, { "copy_logits_max": -3.792534828186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.4375, "epoch": 1.3090119989788103, "gen_logits_max": 3.136521816253662, "gen_logits_mean": -17.094493865966797, "gen_logits_min": -29.42652130126953, "gen_logits_std": 3.134800910949707, "gen_loss": 0.2522318661212921, "grad_norm": 0.3850519621897941, "learning_rate": 2.2834105263157893e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9957582503557205, "mean_gen_accuracy": 0.873811274766922, "mean_token_accuracy": 0.9000911116600037, "num_tokens": 653900700.0, "sample_num_tokens": 8889.0, "step": 6409, "total_num_tokens": 653936256.0, "z_loss": 0.00047841272316873074 }, { "copy_logits_max": -2.943389892578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.625, "epoch": 1.3092162369160072, "gen_logits_max": 3.042503833770752, "gen_logits_mean": -16.923751831054688, "gen_logits_min": -29.175308227539062, "gen_logits_std": 3.1433773040771484, "gen_loss": 0.29167455434799194, "grad_norm": 0.41931606396591375, "learning_rate": 2.2832842105263157e-05, "loss": 0.3069, "mean_copy_accuracy": 0.9951158314943314, "mean_gen_accuracy": 0.8668720424175262, "mean_token_accuracy": 0.8968516737222672, "num_tokens": 654162725.0, "sample_num_tokens": 9198.75, "step": 6410, "total_num_tokens": 654199520.0, "z_loss": 0.0005482150008901954 }, { "copy_logits_max": -2.52392578125, "copy_logits_min": -625000064.0, "copy_num_tokens": 530.125, "epoch": 1.3094204748532041, "gen_logits_max": 3.227952718734741, "gen_logits_mean": -17.48557472229004, "gen_logits_min": -29.897315979003906, "gen_logits_std": 3.160231351852417, "gen_loss": 0.2778083384037018, "grad_norm": 0.3720198643137878, "learning_rate": 2.283157894736842e-05, "loss": 0.268, "mean_copy_accuracy": 0.9963792264461517, "mean_gen_accuracy": 0.8807911276817322, "mean_token_accuracy": 0.9100304991006851, "num_tokens": 654461330.0, "sample_num_tokens": 9785.5, "step": 6411, "total_num_tokens": 654500472.0, "z_loss": 0.00048377501661889255 }, { "copy_logits_max": -4.4012627601623535, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.8125, "epoch": 1.3096247127904008, "gen_logits_max": 3.4402241706848145, "gen_logits_mean": -17.986228942871094, "gen_logits_min": -30.282501220703125, "gen_logits_std": 3.158884286880493, "gen_loss": 0.26543039083480835, "grad_norm": 0.38046108114204547, "learning_rate": 2.2830315789473686e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9963715821504593, "mean_gen_accuracy": 0.8758283257484436, "mean_token_accuracy": 0.9051400423049927, "num_tokens": 654757177.0, "sample_num_tokens": 9826.75, "step": 6412, "total_num_tokens": 654796484.0, "z_loss": 0.000546229537576437 }, { "copy_logits_max": -3.9828405380249023, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.3125, "epoch": 1.3098289507275975, "gen_logits_max": 3.6812171936035156, "gen_logits_mean": -16.619731903076172, "gen_logits_min": -29.21371078491211, "gen_logits_std": 3.099910259246826, "gen_loss": 0.26705631613731384, "grad_norm": 0.37073304215997044, "learning_rate": 2.282905263157895e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9961667358875275, "mean_gen_accuracy": 0.8764966279268265, "mean_token_accuracy": 0.9036884754896164, "num_tokens": 655024686.0, "sample_num_tokens": 8246.5, "step": 6413, "total_num_tokens": 655057672.0, "z_loss": 0.0005639346782118082 }, { "copy_logits_max": -0.2815646529197693, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.125, "epoch": 1.3100331886647945, "gen_logits_max": 4.185425758361816, "gen_logits_mean": -15.270015716552734, "gen_logits_min": -27.714473724365234, "gen_logits_std": 3.083951234817505, "gen_loss": 0.276813805103302, "grad_norm": 0.4023548960086138, "learning_rate": 2.282778947368421e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9968229383230209, "mean_gen_accuracy": 0.8786925375461578, "mean_token_accuracy": 0.9086016565561295, "num_tokens": 655286331.0, "sample_num_tokens": 8541.75, "step": 6414, "total_num_tokens": 655320498.0, "z_loss": 0.0005446954164654016 }, { "copy_logits_max": -2.205843925476074, "copy_logits_min": -687500032.0, "copy_num_tokens": 390.375, "epoch": 1.3102374266019914, "gen_logits_max": 3.8981246948242188, "gen_logits_mean": -17.17049789428711, "gen_logits_min": -29.337818145751953, "gen_logits_std": 3.134153366088867, "gen_loss": 0.29556527733802795, "grad_norm": 0.4258813361152089, "learning_rate": 2.2826526315789475e-05, "loss": 0.2919, "mean_copy_accuracy": 0.9953693896532059, "mean_gen_accuracy": 0.8737048655748367, "mean_token_accuracy": 0.9007563441991806, "num_tokens": 655532148.0, "sample_num_tokens": 8216.5, "step": 6415, "total_num_tokens": 655565014.0, "z_loss": 0.0006117132143117487 }, { "copy_logits_max": 1.9814680814743042, "copy_logits_min": -750000064.0, "copy_num_tokens": 979.5, "epoch": 1.310441664539188, "gen_logits_max": 2.375997543334961, "gen_logits_mean": -16.476028442382812, "gen_logits_min": -29.129587173461914, "gen_logits_std": 3.150214672088623, "gen_loss": 0.22887666523456573, "grad_norm": 0.4193863999775378, "learning_rate": 2.2825263157894736e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9956449419260025, "mean_gen_accuracy": 0.8660627901554108, "mean_token_accuracy": 0.9003751873970032, "num_tokens": 655810295.0, "sample_num_tokens": 11982.25, "step": 6416, "total_num_tokens": 655858224.0, "z_loss": 0.0004281268338672817 }, { "copy_logits_max": -1.125255823135376, "copy_logits_min": -687500032.0, "copy_num_tokens": 592.3125, "epoch": 1.310645902476385, "gen_logits_max": 3.9398107528686523, "gen_logits_mean": -15.243223190307617, "gen_logits_min": -27.86139678955078, "gen_logits_std": 3.1226601600646973, "gen_loss": 0.27334022521972656, "grad_norm": 0.4047104652843229, "learning_rate": 2.2824e-05, "loss": 0.2808, "mean_copy_accuracy": 0.995390772819519, "mean_gen_accuracy": 0.8785655796527863, "mean_token_accuracy": 0.9047006517648697, "num_tokens": 656084187.0, "sample_num_tokens": 9226.25, "step": 6417, "total_num_tokens": 656121092.0, "z_loss": 0.0005015340866521001 }, { "copy_logits_max": -2.817115306854248, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.0, "epoch": 1.310850140413582, "gen_logits_max": 3.424959659576416, "gen_logits_mean": -16.953107833862305, "gen_logits_min": -28.620105743408203, "gen_logits_std": 3.0847349166870117, "gen_loss": 0.24648535251617432, "grad_norm": 0.3722738033052672, "learning_rate": 2.282273684210526e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9965233504772186, "mean_gen_accuracy": 0.8768454790115356, "mean_token_accuracy": 0.9025845229625702, "num_tokens": 656357539.0, "sample_num_tokens": 8819.75, "step": 6418, "total_num_tokens": 656392818.0, "z_loss": 0.0004191633779555559 }, { "copy_logits_max": -1.95663583278656, "copy_logits_min": -750000064.0, "copy_num_tokens": 543.0625, "epoch": 1.3110543783507786, "gen_logits_max": 1.9367716312408447, "gen_logits_mean": -18.542125701904297, "gen_logits_min": -30.712221145629883, "gen_logits_std": 3.1628050804138184, "gen_loss": 0.2471456080675125, "grad_norm": 0.4156393238319385, "learning_rate": 2.282147368421053e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9968316853046417, "mean_gen_accuracy": 0.8760962933301926, "mean_token_accuracy": 0.9064209759235382, "num_tokens": 656629820.0, "sample_num_tokens": 8458.0, "step": 6419, "total_num_tokens": 656663652.0, "z_loss": 0.00039465015288442373 }, { "copy_logits_max": -2.253119945526123, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.125, "epoch": 1.3112586162879754, "gen_logits_max": 3.0796658992767334, "gen_logits_mean": -17.841827392578125, "gen_logits_min": -29.532241821289062, "gen_logits_std": 3.1289985179901123, "gen_loss": 0.2884175777435303, "grad_norm": 0.3938434415952454, "learning_rate": 2.282021052631579e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9964098632335663, "mean_gen_accuracy": 0.8726061880588531, "mean_token_accuracy": 0.9023105204105377, "num_tokens": 656904875.0, "sample_num_tokens": 8183.75, "step": 6420, "total_num_tokens": 656937610.0, "z_loss": 0.00047784202615730464 }, { "copy_logits_max": -2.1118264198303223, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.625, "epoch": 1.3114628542251723, "gen_logits_max": 3.24960994720459, "gen_logits_mean": -17.91750717163086, "gen_logits_min": -29.80184555053711, "gen_logits_std": 3.109229326248169, "gen_loss": 0.2742498815059662, "grad_norm": 0.3791527864102359, "learning_rate": 2.2818947368421054e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9955642223358154, "mean_gen_accuracy": 0.8768990486860275, "mean_token_accuracy": 0.9066118448972702, "num_tokens": 657186340.0, "sample_num_tokens": 8736.0, "step": 6421, "total_num_tokens": 657221284.0, "z_loss": 0.0004918272607028484 }, { "copy_logits_max": -1.540804386138916, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.5, "epoch": 1.3116670921623692, "gen_logits_max": 3.0418224334716797, "gen_logits_mean": -17.824440002441406, "gen_logits_min": -30.121917724609375, "gen_logits_std": 3.1059179306030273, "gen_loss": 0.2847636938095093, "grad_norm": 0.36510667216791637, "learning_rate": 2.2817684210526315e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9968246519565582, "mean_gen_accuracy": 0.8765775859355927, "mean_token_accuracy": 0.9080182462930679, "num_tokens": 657484216.0, "sample_num_tokens": 8953.5, "step": 6422, "total_num_tokens": 657520030.0, "z_loss": 0.0005135164246894419 }, { "copy_logits_max": -2.322991132736206, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.5625, "epoch": 1.311871330099566, "gen_logits_max": 3.76986026763916, "gen_logits_mean": -16.060672760009766, "gen_logits_min": -27.444509506225586, "gen_logits_std": 2.9926276206970215, "gen_loss": 0.27832353115081787, "grad_norm": 0.37281569641564516, "learning_rate": 2.281642105263158e-05, "loss": 0.2962, "mean_copy_accuracy": 0.9962847530841827, "mean_gen_accuracy": 0.8676262199878693, "mean_token_accuracy": 0.8997863084077835, "num_tokens": 657756524.0, "sample_num_tokens": 7750.0, "step": 6423, "total_num_tokens": 657787524.0, "z_loss": 0.0005361154908314347 }, { "copy_logits_max": -2.7484846115112305, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.0625, "epoch": 1.3120755680367628, "gen_logits_max": 3.2522730827331543, "gen_logits_mean": -16.859622955322266, "gen_logits_min": -28.729978561401367, "gen_logits_std": 3.0801339149475098, "gen_loss": 0.2858009338378906, "grad_norm": 0.39013060171627745, "learning_rate": 2.281515789473684e-05, "loss": 0.288, "mean_copy_accuracy": 0.9962224662303925, "mean_gen_accuracy": 0.8758474290370941, "mean_token_accuracy": 0.9032191783189774, "num_tokens": 658016425.0, "sample_num_tokens": 8211.75, "step": 6424, "total_num_tokens": 658049272.0, "z_loss": 0.0005114362575113773 }, { "copy_logits_max": -4.064975261688232, "copy_logits_min": -687500032.0, "copy_num_tokens": 343.375, "epoch": 1.3122798059739598, "gen_logits_max": 4.519480228424072, "gen_logits_mean": -16.351652145385742, "gen_logits_min": -28.338634490966797, "gen_logits_std": 3.1110141277313232, "gen_loss": 0.3265065848827362, "grad_norm": 0.3720644741286295, "learning_rate": 2.2813894736842105e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9964558780193329, "mean_gen_accuracy": 0.8754412978887558, "mean_token_accuracy": 0.9042022675275803, "num_tokens": 658290299.0, "sample_num_tokens": 7692.25, "step": 6425, "total_num_tokens": 658321068.0, "z_loss": 0.0005958775291219354 }, { "copy_logits_max": -3.8585002422332764, "copy_logits_min": -687500032.0, "copy_num_tokens": 325.8125, "epoch": 1.3124840439111565, "gen_logits_max": 3.330057144165039, "gen_logits_mean": -17.004793167114258, "gen_logits_min": -29.086029052734375, "gen_logits_std": 3.059570789337158, "gen_loss": 0.2774244248867035, "grad_norm": 0.39513357563215873, "learning_rate": 2.281263157894737e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9960833042860031, "mean_gen_accuracy": 0.8777344673871994, "mean_token_accuracy": 0.9068561047315598, "num_tokens": 658550382.0, "sample_num_tokens": 7138.0, "step": 6426, "total_num_tokens": 658578934.0, "z_loss": 0.00047349208034574986 }, { "copy_logits_max": -3.7304301261901855, "copy_logits_min": -750000064.0, "copy_num_tokens": 407.625, "epoch": 1.3126882818483534, "gen_logits_max": 3.3187155723571777, "gen_logits_mean": -17.742023468017578, "gen_logits_min": -29.531635284423828, "gen_logits_std": 3.0673866271972656, "gen_loss": 0.33052000403404236, "grad_norm": 0.4001320972047835, "learning_rate": 2.2811368421052633e-05, "loss": 0.3098, "mean_copy_accuracy": 0.9961316883563995, "mean_gen_accuracy": 0.8658522069454193, "mean_token_accuracy": 0.8970289975404739, "num_tokens": 658813182.0, "sample_num_tokens": 8102.0, "step": 6427, "total_num_tokens": 658845590.0, "z_loss": 0.0006039631552994251 }, { "copy_logits_max": -4.560672283172607, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.3125, "epoch": 1.31289251978555, "gen_logits_max": 3.5027356147766113, "gen_logits_mean": -17.210174560546875, "gen_logits_min": -28.875015258789062, "gen_logits_std": 3.054060459136963, "gen_loss": 0.2904227077960968, "grad_norm": 0.3620294992481321, "learning_rate": 2.2810105263157898e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9956611692905426, "mean_gen_accuracy": 0.8779281079769135, "mean_token_accuracy": 0.9056670069694519, "num_tokens": 659102845.0, "sample_num_tokens": 8786.75, "step": 6428, "total_num_tokens": 659137992.0, "z_loss": 0.0005391660961322486 }, { "copy_logits_max": -3.082408905029297, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.9375, "epoch": 1.313096757722747, "gen_logits_max": 3.20689058303833, "gen_logits_mean": -17.88768768310547, "gen_logits_min": -29.574893951416016, "gen_logits_std": 3.0867903232574463, "gen_loss": 0.28158196806907654, "grad_norm": 0.4026004882933017, "learning_rate": 2.280884210526316e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9970332682132721, "mean_gen_accuracy": 0.8716804832220078, "mean_token_accuracy": 0.9025816023349762, "num_tokens": 659361842.0, "sample_num_tokens": 8482.0, "step": 6429, "total_num_tokens": 659395770.0, "z_loss": 0.0005117640248499811 }, { "copy_logits_max": -3.250727891921997, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.125, "epoch": 1.3133009956599437, "gen_logits_max": 3.6776340007781982, "gen_logits_mean": -16.20267105102539, "gen_logits_min": -27.812847137451172, "gen_logits_std": 2.9890289306640625, "gen_loss": 0.2694576382637024, "grad_norm": 0.44710547112682664, "learning_rate": 2.2807578947368423e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9953485727310181, "mean_gen_accuracy": 0.8839543461799622, "mean_token_accuracy": 0.9106174260377884, "num_tokens": 659640137.0, "sample_num_tokens": 9322.25, "step": 6430, "total_num_tokens": 659677426.0, "z_loss": 0.0005087419413030148 }, { "copy_logits_max": -4.021427154541016, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.625, "epoch": 1.3135052335971407, "gen_logits_max": 3.194941282272339, "gen_logits_mean": -17.23247528076172, "gen_logits_min": -28.942184448242188, "gen_logits_std": 3.068906307220459, "gen_loss": 0.2834972143173218, "grad_norm": 0.37019780696528254, "learning_rate": 2.2806315789473684e-05, "loss": 0.2851, "mean_copy_accuracy": 0.995664119720459, "mean_gen_accuracy": 0.8754668831825256, "mean_token_accuracy": 0.903469979763031, "num_tokens": 659911972.0, "sample_num_tokens": 7546.5, "step": 6431, "total_num_tokens": 659942158.0, "z_loss": 0.0005543904844671488 }, { "copy_logits_max": -5.537420272827148, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.25, "epoch": 1.3137094715343376, "gen_logits_max": 3.0504634380340576, "gen_logits_mean": -18.337881088256836, "gen_logits_min": -29.596284866333008, "gen_logits_std": 3.043030023574829, "gen_loss": 0.29008176922798157, "grad_norm": 0.3535895937949304, "learning_rate": 2.2805052631578948e-05, "loss": 0.2576, "mean_copy_accuracy": 0.9958228468894958, "mean_gen_accuracy": 0.8824181705713272, "mean_token_accuracy": 0.9124791473150253, "num_tokens": 660194698.0, "sample_num_tokens": 7787.0, "step": 6432, "total_num_tokens": 660225846.0, "z_loss": 0.0005302201025187969 }, { "copy_logits_max": -5.303663730621338, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.9375, "epoch": 1.3139137094715343, "gen_logits_max": 3.45967698097229, "gen_logits_mean": -17.264936447143555, "gen_logits_min": -28.903995513916016, "gen_logits_std": 3.043686628341675, "gen_loss": 0.29628244042396545, "grad_norm": 0.35010270956104167, "learning_rate": 2.280378947368421e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9950826466083527, "mean_gen_accuracy": 0.8777670711278915, "mean_token_accuracy": 0.9000042527914047, "num_tokens": 660451392.0, "sample_num_tokens": 6935.5, "step": 6433, "total_num_tokens": 660479134.0, "z_loss": 0.0005792242009192705 }, { "copy_logits_max": -5.403048515319824, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.3125, "epoch": 1.3141179474087312, "gen_logits_max": 3.2175164222717285, "gen_logits_mean": -18.225082397460938, "gen_logits_min": -29.752151489257812, "gen_logits_std": 3.0591933727264404, "gen_loss": 0.28981316089630127, "grad_norm": 0.4087318953527928, "learning_rate": 2.2802526315789473e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9969082474708557, "mean_gen_accuracy": 0.8708691596984863, "mean_token_accuracy": 0.9010849744081497, "num_tokens": 660717841.0, "sample_num_tokens": 7790.25, "step": 6434, "total_num_tokens": 660749002.0, "z_loss": 0.0005636144778691232 }, { "copy_logits_max": -6.610768795013428, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.8125, "epoch": 1.314322185345928, "gen_logits_max": 4.117105007171631, "gen_logits_mean": -16.840248107910156, "gen_logits_min": -28.483123779296875, "gen_logits_std": 3.010634660720825, "gen_loss": 0.2835031747817993, "grad_norm": 0.4039020270331417, "learning_rate": 2.2801263157894738e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9955371469259262, "mean_gen_accuracy": 0.8774177879095078, "mean_token_accuracy": 0.902382954955101, "num_tokens": 660976455.0, "sample_num_tokens": 9394.25, "step": 6435, "total_num_tokens": 661014032.0, "z_loss": 0.0005468805902637541 }, { "copy_logits_max": -7.005588531494141, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.25, "epoch": 1.3145264232831249, "gen_logits_max": 2.8621034622192383, "gen_logits_mean": -18.859050750732422, "gen_logits_min": -30.14679718017578, "gen_logits_std": 3.09334659576416, "gen_loss": 0.26732176542282104, "grad_norm": 0.38495046646748843, "learning_rate": 2.2800000000000002e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9956939518451691, "mean_gen_accuracy": 0.8855527490377426, "mean_token_accuracy": 0.9095670282840729, "num_tokens": 661246978.0, "sample_num_tokens": 9306.0, "step": 6436, "total_num_tokens": 661284202.0, "z_loss": 0.0005099493428133428 }, { "copy_logits_max": -4.341952323913574, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.9375, "epoch": 1.3147306612203216, "gen_logits_max": 2.9323060512542725, "gen_logits_mean": -17.91140365600586, "gen_logits_min": -29.516220092773438, "gen_logits_std": 3.0602314472198486, "gen_loss": 0.31997495889663696, "grad_norm": 0.36936749288390924, "learning_rate": 2.2798736842105263e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9966082870960236, "mean_gen_accuracy": 0.8666804879903793, "mean_token_accuracy": 0.897380530834198, "num_tokens": 661519888.0, "sample_num_tokens": 7707.5, "step": 6437, "total_num_tokens": 661550718.0, "z_loss": 0.0006077471189200878 }, { "copy_logits_max": -3.8688607215881348, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.875, "epoch": 1.3149348991575185, "gen_logits_max": 3.676499366760254, "gen_logits_mean": -16.31751251220703, "gen_logits_min": -27.84798240661621, "gen_logits_std": 2.9278836250305176, "gen_loss": 0.27854710817337036, "grad_norm": 0.38164798884186407, "learning_rate": 2.2797473684210527e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9963106065988541, "mean_gen_accuracy": 0.8799701631069183, "mean_token_accuracy": 0.9098369181156158, "num_tokens": 661796448.0, "sample_num_tokens": 8735.5, "step": 6438, "total_num_tokens": 661831390.0, "z_loss": 0.00047674885718151927 }, { "copy_logits_max": -4.103650093078613, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.625, "epoch": 1.3151391370947154, "gen_logits_max": 2.723065137863159, "gen_logits_mean": -16.999624252319336, "gen_logits_min": -28.929574966430664, "gen_logits_std": 3.0264129638671875, "gen_loss": 0.25567248463630676, "grad_norm": 0.3687640778175935, "learning_rate": 2.279621052631579e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9971236288547516, "mean_gen_accuracy": 0.8717888444662094, "mean_token_accuracy": 0.9080156534910202, "num_tokens": 662077589.0, "sample_num_tokens": 9051.75, "step": 6439, "total_num_tokens": 662113796.0, "z_loss": 0.0004507827979978174 }, { "copy_logits_max": -4.426947116851807, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.9375, "epoch": 1.3153433750319121, "gen_logits_max": 3.666938543319702, "gen_logits_mean": -16.618938446044922, "gen_logits_min": -28.2406063079834, "gen_logits_std": 2.9982974529266357, "gen_loss": 0.285163551568985, "grad_norm": 0.3577958782159971, "learning_rate": 2.2794947368421052e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9972621202468872, "mean_gen_accuracy": 0.8762117922306061, "mean_token_accuracy": 0.9079022109508514, "num_tokens": 662372309.0, "sample_num_tokens": 8340.75, "step": 6440, "total_num_tokens": 662405672.0, "z_loss": 0.0004414469876792282 }, { "copy_logits_max": -6.086442947387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 231.4375, "epoch": 1.315547612969109, "gen_logits_max": 4.213342189788818, "gen_logits_mean": -18.03908920288086, "gen_logits_min": -29.417518615722656, "gen_logits_std": 3.0932278633117676, "gen_loss": 0.2729341983795166, "grad_norm": 0.3538383963120715, "learning_rate": 2.2793684210526317e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9958384782075882, "mean_gen_accuracy": 0.8737560659646988, "mean_token_accuracy": 0.9015560895204544, "num_tokens": 662640151.0, "sample_num_tokens": 7074.75, "step": 6441, "total_num_tokens": 662668450.0, "z_loss": 0.0004521518712863326 }, { "copy_logits_max": -2.1159722805023193, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.6875, "epoch": 1.315751850906306, "gen_logits_max": 3.4457061290740967, "gen_logits_mean": -16.724872589111328, "gen_logits_min": -28.965438842773438, "gen_logits_std": 3.051649570465088, "gen_loss": 0.26520049571990967, "grad_norm": 0.3547661024792077, "learning_rate": 2.2792421052631578e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9977297484874725, "mean_gen_accuracy": 0.8793499171733856, "mean_token_accuracy": 0.9101866185665131, "num_tokens": 662904136.0, "sample_num_tokens": 8820.5, "step": 6442, "total_num_tokens": 662939418.0, "z_loss": 0.0004354377742856741 }, { "copy_logits_max": -2.6719260215759277, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.9375, "epoch": 1.3159560888435027, "gen_logits_max": 3.444009780883789, "gen_logits_mean": -16.206649780273438, "gen_logits_min": -27.776277542114258, "gen_logits_std": 2.990213394165039, "gen_loss": 0.2577051818370819, "grad_norm": 0.4414815828311958, "learning_rate": 2.2791157894736845e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9960137754678726, "mean_gen_accuracy": 0.875578984618187, "mean_token_accuracy": 0.9013108760118484, "num_tokens": 663151790.0, "sample_num_tokens": 7640.0, "step": 6443, "total_num_tokens": 663182350.0, "z_loss": 0.000491282669827342 }, { "copy_logits_max": -2.0104568004608154, "copy_logits_min": -750000064.0, "copy_num_tokens": 582.75, "epoch": 1.3161603267806994, "gen_logits_max": 2.5946950912475586, "gen_logits_mean": -17.756404876708984, "gen_logits_min": -30.000917434692383, "gen_logits_std": 3.1359434127807617, "gen_loss": 0.2653484344482422, "grad_norm": 0.36043615878843965, "learning_rate": 2.2789894736842106e-05, "loss": 0.2736, "mean_copy_accuracy": 0.996869757771492, "mean_gen_accuracy": 0.8748522251844406, "mean_token_accuracy": 0.9081123918294907, "num_tokens": 663420946.0, "sample_num_tokens": 8785.5, "step": 6444, "total_num_tokens": 663456088.0, "z_loss": 0.0004849213291890919 }, { "copy_logits_max": -2.709397792816162, "copy_logits_min": -687500032.0, "copy_num_tokens": 605.0, "epoch": 1.3163645647178963, "gen_logits_max": 1.8495111465454102, "gen_logits_mean": -18.74825096130371, "gen_logits_min": -31.02432632446289, "gen_logits_std": 3.143951416015625, "gen_loss": 0.2507537007331848, "grad_norm": 0.3772761931076994, "learning_rate": 2.278863157894737e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9960192143917084, "mean_gen_accuracy": 0.873405784368515, "mean_token_accuracy": 0.9056319147348404, "num_tokens": 663684265.0, "sample_num_tokens": 8807.25, "step": 6445, "total_num_tokens": 663719494.0, "z_loss": 0.0004562817339319736 }, { "copy_logits_max": -7.41419792175293, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.0, "epoch": 1.3165688026550932, "gen_logits_max": 3.342808246612549, "gen_logits_mean": -18.382396697998047, "gen_logits_min": -29.562002182006836, "gen_logits_std": 3.0479235649108887, "gen_loss": 0.2904038429260254, "grad_norm": 0.381364288508455, "learning_rate": 2.278736842105263e-05, "loss": 0.3027, "mean_copy_accuracy": 0.995834693312645, "mean_gen_accuracy": 0.8675701469182968, "mean_token_accuracy": 0.895880326628685, "num_tokens": 663944841.0, "sample_num_tokens": 7693.75, "step": 6446, "total_num_tokens": 663975616.0, "z_loss": 0.0004956048214808106 }, { "copy_logits_max": -2.5498147010803223, "copy_logits_min": -750000000.0, "copy_num_tokens": 764.25, "epoch": 1.31677304059229, "gen_logits_max": 3.1374168395996094, "gen_logits_mean": -16.82526969909668, "gen_logits_min": -29.056766510009766, "gen_logits_std": 3.0675442218780518, "gen_loss": 0.2509465515613556, "grad_norm": 0.3882132728971122, "learning_rate": 2.2786105263157896e-05, "loss": 0.2532, "mean_copy_accuracy": 0.9965442717075348, "mean_gen_accuracy": 0.8855504095554352, "mean_token_accuracy": 0.9170043617486954, "num_tokens": 664219342.0, "sample_num_tokens": 10074.5, "step": 6447, "total_num_tokens": 664259640.0, "z_loss": 0.0004811251419596374 }, { "copy_logits_max": -1.600394368171692, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.4375, "epoch": 1.3169772785294869, "gen_logits_max": 3.867905855178833, "gen_logits_mean": -16.461755752563477, "gen_logits_min": -28.674171447753906, "gen_logits_std": 3.0150372982025146, "gen_loss": 0.25872936844825745, "grad_norm": 0.38895479945771444, "learning_rate": 2.2784842105263157e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9960030615329742, "mean_gen_accuracy": 0.8723223954439163, "mean_token_accuracy": 0.9031723737716675, "num_tokens": 664482571.0, "sample_num_tokens": 8593.75, "step": 6448, "total_num_tokens": 664516946.0, "z_loss": 0.0004754556866828352 }, { "copy_logits_max": -1.8488456010818481, "copy_logits_min": -687500032.0, "copy_num_tokens": 675.9375, "epoch": 1.3171815164666838, "gen_logits_max": 3.6564643383026123, "gen_logits_mean": -16.192224502563477, "gen_logits_min": -28.479536056518555, "gen_logits_std": 3.0146560668945312, "gen_loss": 0.22626709938049316, "grad_norm": 0.3589064323497755, "learning_rate": 2.278357894736842e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9958101063966751, "mean_gen_accuracy": 0.8802231252193451, "mean_token_accuracy": 0.9105450212955475, "num_tokens": 664760833.0, "sample_num_tokens": 10072.75, "step": 6449, "total_num_tokens": 664801124.0, "z_loss": 0.00045457648229785264 }, { "copy_logits_max": -3.399746894836426, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.25, "epoch": 1.3173857544038805, "gen_logits_max": 3.245199680328369, "gen_logits_mean": -16.448673248291016, "gen_logits_min": -28.270889282226562, "gen_logits_std": 2.983031988143921, "gen_loss": 0.2964898943901062, "grad_norm": 0.4004567811613137, "learning_rate": 2.2782315789473682e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9964923560619354, "mean_gen_accuracy": 0.8794053047895432, "mean_token_accuracy": 0.9090558141469955, "num_tokens": 665049748.0, "sample_num_tokens": 8470.0, "step": 6450, "total_num_tokens": 665083628.0, "z_loss": 0.0005507877795025706 }, { "copy_logits_max": -4.248456001281738, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.8125, "epoch": 1.3175899923410774, "gen_logits_max": 3.6769890785217285, "gen_logits_mean": -16.882875442504883, "gen_logits_min": -28.635944366455078, "gen_logits_std": 2.9863524436950684, "gen_loss": 0.30369770526885986, "grad_norm": 0.3581952022876289, "learning_rate": 2.278105263157895e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9961525946855545, "mean_gen_accuracy": 0.8759283274412155, "mean_token_accuracy": 0.9018469303846359, "num_tokens": 665319609.0, "sample_num_tokens": 9987.25, "step": 6451, "total_num_tokens": 665359558.0, "z_loss": 0.000549013027921319 }, { "copy_logits_max": -4.394762992858887, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.6875, "epoch": 1.3177942302782741, "gen_logits_max": 3.5100109577178955, "gen_logits_mean": -17.195255279541016, "gen_logits_min": -28.439250946044922, "gen_logits_std": 3.004246234893799, "gen_loss": 0.28833991289138794, "grad_norm": 0.38520602863428527, "learning_rate": 2.2779789473684214e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9950042217969894, "mean_gen_accuracy": 0.8777158707380295, "mean_token_accuracy": 0.9055917859077454, "num_tokens": 665580951.0, "sample_num_tokens": 7653.25, "step": 6452, "total_num_tokens": 665611564.0, "z_loss": 0.000544171198271215 }, { "copy_logits_max": -3.570371150970459, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.9375, "epoch": 1.317998468215471, "gen_logits_max": 4.3647613525390625, "gen_logits_mean": -17.219493865966797, "gen_logits_min": -28.465742111206055, "gen_logits_std": 2.9940104484558105, "gen_loss": 0.2788822054862976, "grad_norm": 0.3733843544978953, "learning_rate": 2.2778526315789475e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9960573464632034, "mean_gen_accuracy": 0.8722332864999771, "mean_token_accuracy": 0.8990339189767838, "num_tokens": 665859071.0, "sample_num_tokens": 7188.25, "step": 6453, "total_num_tokens": 665887824.0, "z_loss": 0.0005343393422663212 }, { "copy_logits_max": -5.033372402191162, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.0625, "epoch": 1.3182027061526678, "gen_logits_max": 3.227046251296997, "gen_logits_mean": -18.384201049804688, "gen_logits_min": -29.63603973388672, "gen_logits_std": 3.0590133666992188, "gen_loss": 0.28184595704078674, "grad_norm": 0.3697601160440128, "learning_rate": 2.277726315789474e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9963383227586746, "mean_gen_accuracy": 0.8809799253940582, "mean_token_accuracy": 0.9083084315061569, "num_tokens": 666118446.0, "sample_num_tokens": 7990.5, "step": 6454, "total_num_tokens": 666150408.0, "z_loss": 0.0005398915964178741 }, { "copy_logits_max": -2.960763454437256, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.9375, "epoch": 1.3184069440898647, "gen_logits_max": 2.829897403717041, "gen_logits_mean": -18.527130126953125, "gen_logits_min": -30.389572143554688, "gen_logits_std": 3.1121368408203125, "gen_loss": 0.2597900331020355, "grad_norm": 0.3665977080233491, "learning_rate": 2.2776e-05, "loss": 0.262, "mean_copy_accuracy": 0.9962023794651031, "mean_gen_accuracy": 0.8812851011753082, "mean_token_accuracy": 0.9101238995790482, "num_tokens": 666380205.0, "sample_num_tokens": 7324.25, "step": 6455, "total_num_tokens": 666409502.0, "z_loss": 0.000534012564457953 }, { "copy_logits_max": -2.4157605171203613, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.0, "epoch": 1.3186111820270616, "gen_logits_max": 4.3158063888549805, "gen_logits_mean": -16.181163787841797, "gen_logits_min": -27.88226318359375, "gen_logits_std": 3.031803846359253, "gen_loss": 0.31754884123802185, "grad_norm": 0.3818073849642953, "learning_rate": 2.2774736842105264e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9961739927530289, "mean_gen_accuracy": 0.8689303547143936, "mean_token_accuracy": 0.9012309014797211, "num_tokens": 666640514.0, "sample_num_tokens": 8580.5, "step": 6456, "total_num_tokens": 666674836.0, "z_loss": 0.0005814704345539212 }, { "copy_logits_max": -2.2780113220214844, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.0, "epoch": 1.3188154199642583, "gen_logits_max": 4.047121047973633, "gen_logits_mean": -16.271995544433594, "gen_logits_min": -27.794151306152344, "gen_logits_std": 2.9982919692993164, "gen_loss": 0.3138791024684906, "grad_norm": 0.3720443610660483, "learning_rate": 2.2773473684210525e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9957224428653717, "mean_gen_accuracy": 0.8688721209764481, "mean_token_accuracy": 0.8980752527713776, "num_tokens": 666912408.0, "sample_num_tokens": 8464.0, "step": 6457, "total_num_tokens": 666946264.0, "z_loss": 0.0005775857134722173 }, { "copy_logits_max": -1.647669792175293, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.125, "epoch": 1.3190196579014553, "gen_logits_max": 3.8676586151123047, "gen_logits_mean": -15.6054048538208, "gen_logits_min": -27.34486961364746, "gen_logits_std": 2.988231658935547, "gen_loss": 0.2659026086330414, "grad_norm": 0.3820131356800197, "learning_rate": 2.277221052631579e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9953740537166595, "mean_gen_accuracy": 0.8770132213830948, "mean_token_accuracy": 0.9023186564445496, "num_tokens": 667164219.0, "sample_num_tokens": 7625.75, "step": 6458, "total_num_tokens": 667194722.0, "z_loss": 0.0004939428763464093 }, { "copy_logits_max": -6.221911430358887, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.3125, "epoch": 1.319223895838652, "gen_logits_max": 3.697089433670044, "gen_logits_mean": -17.30795669555664, "gen_logits_min": -28.36680793762207, "gen_logits_std": 3.0070714950561523, "gen_loss": 0.2527283728122711, "grad_norm": 0.36917254659593807, "learning_rate": 2.277094736842105e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9967217296361923, "mean_gen_accuracy": 0.8792423605918884, "mean_token_accuracy": 0.9078027755022049, "num_tokens": 667424180.0, "sample_num_tokens": 8953.5, "step": 6459, "total_num_tokens": 667459994.0, "z_loss": 0.0004100618534721434 }, { "copy_logits_max": -3.154209852218628, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.25, "epoch": 1.3194281337758489, "gen_logits_max": 2.666616439819336, "gen_logits_mean": -16.930206298828125, "gen_logits_min": -28.634174346923828, "gen_logits_std": 3.003403425216675, "gen_loss": 0.2834612727165222, "grad_norm": 0.37898790490536766, "learning_rate": 2.2769684210526318e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9967022687196732, "mean_gen_accuracy": 0.8714364171028137, "mean_token_accuracy": 0.9018947780132294, "num_tokens": 667679098.0, "sample_num_tokens": 7364.0, "step": 6460, "total_num_tokens": 667708554.0, "z_loss": 0.00045131586375646293 }, { "copy_logits_max": -1.6413705348968506, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.875, "epoch": 1.3196323717130456, "gen_logits_max": 3.4094624519348145, "gen_logits_mean": -16.16122817993164, "gen_logits_min": -28.44198989868164, "gen_logits_std": 3.039848804473877, "gen_loss": 0.24697458744049072, "grad_norm": 0.4000871390869676, "learning_rate": 2.276842105263158e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9951939731836319, "mean_gen_accuracy": 0.8758277595043182, "mean_token_accuracy": 0.9051250070333481, "num_tokens": 667935443.0, "sample_num_tokens": 9293.25, "step": 6461, "total_num_tokens": 667972616.0, "z_loss": 0.00043688341975212097 }, { "copy_logits_max": -2.867464542388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.6875, "epoch": 1.3198366096502425, "gen_logits_max": 3.0872793197631836, "gen_logits_mean": -16.108308792114258, "gen_logits_min": -28.127676010131836, "gen_logits_std": 3.0021812915802, "gen_loss": 0.2924842834472656, "grad_norm": 0.348439734556052, "learning_rate": 2.2767157894736844e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9970801472663879, "mean_gen_accuracy": 0.8752074837684631, "mean_token_accuracy": 0.9075889587402344, "num_tokens": 668232251.0, "sample_num_tokens": 8471.25, "step": 6462, "total_num_tokens": 668266136.0, "z_loss": 0.0004890597192570567 }, { "copy_logits_max": -1.7979421615600586, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.625, "epoch": 1.3200408475874394, "gen_logits_max": 2.424294948577881, "gen_logits_mean": -16.532482147216797, "gen_logits_min": -28.942729949951172, "gen_logits_std": 3.064924716949463, "gen_loss": 0.27468955516815186, "grad_norm": 0.39447356025785496, "learning_rate": 2.2765894736842104e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9959512799978256, "mean_gen_accuracy": 0.8733518123626709, "mean_token_accuracy": 0.9086120873689651, "num_tokens": 668516732.0, "sample_num_tokens": 8352.5, "step": 6463, "total_num_tokens": 668550142.0, "z_loss": 0.00045472197234630585 }, { "copy_logits_max": -1.9459642171859741, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.1875, "epoch": 1.3202450855246362, "gen_logits_max": 2.7938990592956543, "gen_logits_mean": -16.340896606445312, "gen_logits_min": -28.546775817871094, "gen_logits_std": 3.0542452335357666, "gen_loss": 0.26639068126678467, "grad_norm": 0.39983417327643533, "learning_rate": 2.276463157894737e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9961787462234497, "mean_gen_accuracy": 0.86789770424366, "mean_token_accuracy": 0.8982237875461578, "num_tokens": 668771664.0, "sample_num_tokens": 9667.0, "step": 6464, "total_num_tokens": 668810332.0, "z_loss": 0.00045301340287551284 }, { "copy_logits_max": -2.712883949279785, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.0625, "epoch": 1.320449323461833, "gen_logits_max": 2.8658623695373535, "gen_logits_mean": -17.230817794799805, "gen_logits_min": -29.020109176635742, "gen_logits_std": 3.0665273666381836, "gen_loss": 0.2939305007457733, "grad_norm": 0.38346119530506384, "learning_rate": 2.276336842105263e-05, "loss": 0.2965, "mean_copy_accuracy": 0.9956567287445068, "mean_gen_accuracy": 0.8712999671697617, "mean_token_accuracy": 0.9000956863164902, "num_tokens": 669021516.0, "sample_num_tokens": 8357.0, "step": 6465, "total_num_tokens": 669054944.0, "z_loss": 0.0005735390004701912 }, { "copy_logits_max": -4.50337028503418, "copy_logits_min": -687500032.0, "copy_num_tokens": 398.9375, "epoch": 1.32065356139903, "gen_logits_max": 2.8706235885620117, "gen_logits_mean": -17.962623596191406, "gen_logits_min": -29.702007293701172, "gen_logits_std": 3.082350492477417, "gen_loss": 0.3072463274002075, "grad_norm": 0.3888122679827419, "learning_rate": 2.2762105263157894e-05, "loss": 0.274, "mean_copy_accuracy": 0.9962266981601715, "mean_gen_accuracy": 0.8789082318544388, "mean_token_accuracy": 0.9080261886119843, "num_tokens": 669283721.0, "sample_num_tokens": 8136.25, "step": 6466, "total_num_tokens": 669316266.0, "z_loss": 0.0005224436754360795 }, { "copy_logits_max": -2.566586494445801, "copy_logits_min": -750000064.0, "copy_num_tokens": 522.5625, "epoch": 1.3208577993362267, "gen_logits_max": 2.3828957080841064, "gen_logits_mean": -17.527008056640625, "gen_logits_min": -29.34566879272461, "gen_logits_std": 3.069648265838623, "gen_loss": 0.27880197763442993, "grad_norm": 0.3728163601897504, "learning_rate": 2.276084210526316e-05, "loss": 0.271, "mean_copy_accuracy": 0.9958463311195374, "mean_gen_accuracy": 0.8784314095973969, "mean_token_accuracy": 0.9068543016910553, "num_tokens": 669557379.0, "sample_num_tokens": 9004.75, "step": 6467, "total_num_tokens": 669593398.0, "z_loss": 0.00047934427857398987 }, { "copy_logits_max": -3.6464428901672363, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.375, "epoch": 1.3210620372734234, "gen_logits_max": 2.7792232036590576, "gen_logits_mean": -16.835174560546875, "gen_logits_min": -28.597553253173828, "gen_logits_std": 3.019866943359375, "gen_loss": 0.29331251978874207, "grad_norm": 0.42280904042100054, "learning_rate": 2.2759578947368423e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9964099079370499, "mean_gen_accuracy": 0.8712028414011002, "mean_token_accuracy": 0.9003288447856903, "num_tokens": 669797021.0, "sample_num_tokens": 8209.25, "step": 6468, "total_num_tokens": 669829858.0, "z_loss": 0.0004889731062576175 }, { "copy_logits_max": -3.1769778728485107, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.9375, "epoch": 1.3212662752106203, "gen_logits_max": 3.747966766357422, "gen_logits_mean": -16.17129135131836, "gen_logits_min": -27.77210235595703, "gen_logits_std": 3.0018506050109863, "gen_loss": 0.27925488352775574, "grad_norm": 0.38803937614631473, "learning_rate": 2.2758315789473687e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9959921985864639, "mean_gen_accuracy": 0.8795956075191498, "mean_token_accuracy": 0.9099982976913452, "num_tokens": 670085246.0, "sample_num_tokens": 8742.5, "step": 6469, "total_num_tokens": 670120216.0, "z_loss": 0.0005273784627206624 }, { "copy_logits_max": -3.187690258026123, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.625, "epoch": 1.3214705131478173, "gen_logits_max": 2.819113254547119, "gen_logits_mean": -17.366403579711914, "gen_logits_min": -29.045055389404297, "gen_logits_std": 3.044414520263672, "gen_loss": 0.2974800169467926, "grad_norm": 0.37093668894986, "learning_rate": 2.2757052631578948e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9969807267189026, "mean_gen_accuracy": 0.8690271526575089, "mean_token_accuracy": 0.9030216187238693, "num_tokens": 670364021.0, "sample_num_tokens": 8656.25, "step": 6470, "total_num_tokens": 670398646.0, "z_loss": 0.0005587012856267393 }, { "copy_logits_max": -3.6798934936523438, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.75, "epoch": 1.321674751085014, "gen_logits_max": 3.0724730491638184, "gen_logits_mean": -16.113101959228516, "gen_logits_min": -27.843276977539062, "gen_logits_std": 3.005096912384033, "gen_loss": 0.2813335955142975, "grad_norm": 0.3649546748703314, "learning_rate": 2.2755789473684212e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9962827265262604, "mean_gen_accuracy": 0.8765042424201965, "mean_token_accuracy": 0.9048061519861221, "num_tokens": 670629497.0, "sample_num_tokens": 8370.25, "step": 6471, "total_num_tokens": 670662978.0, "z_loss": 0.0005083527648821473 }, { "copy_logits_max": -5.908378601074219, "copy_logits_min": -687500032.0, "copy_num_tokens": 579.1875, "epoch": 1.321878989022211, "gen_logits_max": 3.513221263885498, "gen_logits_mean": -15.836180686950684, "gen_logits_min": -27.522960662841797, "gen_logits_std": 2.986907958984375, "gen_loss": 0.27470123767852783, "grad_norm": 0.4087620554025341, "learning_rate": 2.2754526315789473e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9970621764659882, "mean_gen_accuracy": 0.874138206243515, "mean_token_accuracy": 0.9039219319820404, "num_tokens": 670886030.0, "sample_num_tokens": 10610.0, "step": 6472, "total_num_tokens": 670928470.0, "z_loss": 0.0005300340708345175 }, { "copy_logits_max": -3.7931931018829346, "copy_logits_min": -625000064.0, "copy_num_tokens": 803.75, "epoch": 1.3220832269594078, "gen_logits_max": 2.102926731109619, "gen_logits_mean": -16.452634811401367, "gen_logits_min": -28.68692970275879, "gen_logits_std": 3.0017850399017334, "gen_loss": 0.261519193649292, "grad_norm": 0.362404996211535, "learning_rate": 2.2753263157894737e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9971507787704468, "mean_gen_accuracy": 0.8758811950683594, "mean_token_accuracy": 0.9062799364328384, "num_tokens": 671177901.0, "sample_num_tokens": 10818.75, "step": 6473, "total_num_tokens": 671221176.0, "z_loss": 0.0004689493216574192 }, { "copy_logits_max": -5.119051933288574, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.8125, "epoch": 1.3222874648966045, "gen_logits_max": 4.2056684494018555, "gen_logits_mean": -15.093080520629883, "gen_logits_min": -27.048694610595703, "gen_logits_std": 2.9949326515197754, "gen_loss": 0.25582873821258545, "grad_norm": 0.34431002880873257, "learning_rate": 2.2752e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9959201812744141, "mean_gen_accuracy": 0.8794451653957367, "mean_token_accuracy": 0.909605085849762, "num_tokens": 671466341.0, "sample_num_tokens": 9098.25, "step": 6474, "total_num_tokens": 671502734.0, "z_loss": 0.0004873988509643823 }, { "copy_logits_max": -5.550461769104004, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.125, "epoch": 1.3224917028338012, "gen_logits_max": 2.4086503982543945, "gen_logits_mean": -16.956539154052734, "gen_logits_min": -28.87749481201172, "gen_logits_std": 3.076310634613037, "gen_loss": 0.24491679668426514, "grad_norm": 0.38135262725856556, "learning_rate": 2.2750736842105263e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9967555105686188, "mean_gen_accuracy": 0.8856403082609177, "mean_token_accuracy": 0.9122473299503326, "num_tokens": 671718055.0, "sample_num_tokens": 7294.75, "step": 6475, "total_num_tokens": 671747234.0, "z_loss": 0.00048674270510673523 }, { "copy_logits_max": -3.415019989013672, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.125, "epoch": 1.3226959407709982, "gen_logits_max": 3.4417684078216553, "gen_logits_mean": -16.370075225830078, "gen_logits_min": -28.154830932617188, "gen_logits_std": 3.018355369567871, "gen_loss": 0.2818446755409241, "grad_norm": 0.35194706494224254, "learning_rate": 2.2749473684210527e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9966091066598892, "mean_gen_accuracy": 0.8713272362947464, "mean_token_accuracy": 0.9013002663850784, "num_tokens": 671995715.0, "sample_num_tokens": 8670.25, "step": 6476, "total_num_tokens": 672030396.0, "z_loss": 0.0005725410883314908 }, { "copy_logits_max": -3.1121675968170166, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.625, "epoch": 1.322900178708195, "gen_logits_max": 4.288273811340332, "gen_logits_mean": -14.852827072143555, "gen_logits_min": -26.204242706298828, "gen_logits_std": 2.9144253730773926, "gen_loss": 0.31429433822631836, "grad_norm": 0.4526774538352907, "learning_rate": 2.274821052631579e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9967236965894699, "mean_gen_accuracy": 0.8681982010602951, "mean_token_accuracy": 0.8989256918430328, "num_tokens": 672277144.0, "sample_num_tokens": 8278.0, "step": 6477, "total_num_tokens": 672310256.0, "z_loss": 0.0006734146154485643 }, { "copy_logits_max": -5.467451095581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.0, "epoch": 1.3231044166453918, "gen_logits_max": 3.658907413482666, "gen_logits_mean": -16.194292068481445, "gen_logits_min": -27.81866455078125, "gen_logits_std": 2.999951124191284, "gen_loss": 0.31053850054740906, "grad_norm": 0.35280045036105967, "learning_rate": 2.2746947368421052e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9966307878494263, "mean_gen_accuracy": 0.8717483580112457, "mean_token_accuracy": 0.9033254235982895, "num_tokens": 672550456.0, "sample_num_tokens": 8929.5, "step": 6478, "total_num_tokens": 672586174.0, "z_loss": 0.0005749354604631662 }, { "copy_logits_max": -6.9947509765625, "copy_logits_min": -750000064.0, "copy_num_tokens": 332.8125, "epoch": 1.3233086545825887, "gen_logits_max": 3.3948144912719727, "gen_logits_mean": -17.629119873046875, "gen_logits_min": -28.857084274291992, "gen_logits_std": 3.021366596221924, "gen_loss": 0.31455063819885254, "grad_norm": 0.3509436883362236, "learning_rate": 2.2745684210526316e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9970346689224243, "mean_gen_accuracy": 0.875310555100441, "mean_token_accuracy": 0.9064602106809616, "num_tokens": 672854681.0, "sample_num_tokens": 7705.25, "step": 6479, "total_num_tokens": 672885502.0, "z_loss": 0.0005346969701349735 }, { "copy_logits_max": -2.1854686737060547, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.5, "epoch": 1.3235128925197857, "gen_logits_max": 4.48374080657959, "gen_logits_mean": -15.636756896972656, "gen_logits_min": -27.427677154541016, "gen_logits_std": 3.0005922317504883, "gen_loss": 0.3197041153907776, "grad_norm": 0.40556980515772195, "learning_rate": 2.274442105263158e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9943819344043732, "mean_gen_accuracy": 0.8728205263614655, "mean_token_accuracy": 0.8998350650072098, "num_tokens": 673109030.0, "sample_num_tokens": 7728.5, "step": 6480, "total_num_tokens": 673139944.0, "z_loss": 0.0006067149806767702 }, { "copy_logits_max": -6.280679702758789, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.0, "epoch": 1.3237171304569824, "gen_logits_max": 3.2134132385253906, "gen_logits_mean": -17.14425277709961, "gen_logits_min": -28.90515899658203, "gen_logits_std": 3.06451153755188, "gen_loss": 0.27963119745254517, "grad_norm": 0.41037832254267054, "learning_rate": 2.2743157894736842e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9959095418453217, "mean_gen_accuracy": 0.8760102838277817, "mean_token_accuracy": 0.9054408669471741, "num_tokens": 673375493.0, "sample_num_tokens": 8569.75, "step": 6481, "total_num_tokens": 673409772.0, "z_loss": 0.0005171520169824362 }, { "copy_logits_max": -6.727799415588379, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.0, "epoch": 1.3239213683941793, "gen_logits_max": 3.4224154949188232, "gen_logits_mean": -17.623144149780273, "gen_logits_min": -28.95775604248047, "gen_logits_std": 3.0486412048339844, "gen_loss": 0.2817143499851227, "grad_norm": 0.3654907418520429, "learning_rate": 2.2741894736842106e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9966654181480408, "mean_gen_accuracy": 0.8770259916782379, "mean_token_accuracy": 0.904169037938118, "num_tokens": 673639660.0, "sample_num_tokens": 7371.5, "step": 6482, "total_num_tokens": 673669146.0, "z_loss": 0.0004737996496260166 }, { "copy_logits_max": -5.270899772644043, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.875, "epoch": 1.324125606331376, "gen_logits_max": 3.921065330505371, "gen_logits_mean": -14.863044738769531, "gen_logits_min": -26.714895248413086, "gen_logits_std": 2.98626971244812, "gen_loss": 0.2761420011520386, "grad_norm": 0.3948260707436295, "learning_rate": 2.2740631578947367e-05, "loss": 0.287, "mean_copy_accuracy": 0.9961767643690109, "mean_gen_accuracy": 0.8703688234090805, "mean_token_accuracy": 0.9025828987360001, "num_tokens": 673893754.0, "sample_num_tokens": 7920.5, "step": 6483, "total_num_tokens": 673925436.0, "z_loss": 0.0004771957464981824 }, { "copy_logits_max": -3.3881678581237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.375, "epoch": 1.324329844268573, "gen_logits_max": 3.305612087249756, "gen_logits_mean": -15.889429092407227, "gen_logits_min": -27.889713287353516, "gen_logits_std": 3.0263047218322754, "gen_loss": 0.29031407833099365, "grad_norm": 0.3559689620941674, "learning_rate": 2.2739368421052635e-05, "loss": 0.2529, "mean_copy_accuracy": 0.9961346387863159, "mean_gen_accuracy": 0.8839749097824097, "mean_token_accuracy": 0.9135407954454422, "num_tokens": 674168875.0, "sample_num_tokens": 8782.75, "step": 6484, "total_num_tokens": 674204006.0, "z_loss": 0.0005267648957669735 }, { "copy_logits_max": -6.24226188659668, "copy_logits_min": -687499968.0, "copy_num_tokens": 367.625, "epoch": 1.3245340822057696, "gen_logits_max": 3.634615898132324, "gen_logits_mean": -16.45466423034668, "gen_logits_min": -27.825828552246094, "gen_logits_std": 2.996814489364624, "gen_loss": 0.27263012528419495, "grad_norm": 0.40735744473316027, "learning_rate": 2.2738105263157896e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9968875050544739, "mean_gen_accuracy": 0.8784564137458801, "mean_token_accuracy": 0.9049209803342819, "num_tokens": 674438835.0, "sample_num_tokens": 8355.25, "step": 6485, "total_num_tokens": 674472256.0, "z_loss": 0.0004359024460427463 }, { "copy_logits_max": -1.8649325370788574, "copy_logits_min": -750000064.0, "copy_num_tokens": 561.1875, "epoch": 1.3247383201429666, "gen_logits_max": 3.6595146656036377, "gen_logits_mean": -15.3326997756958, "gen_logits_min": -27.137699127197266, "gen_logits_std": 3.0118212699890137, "gen_loss": 0.27990061044692993, "grad_norm": 0.38456446624257595, "learning_rate": 2.273684210526316e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9963032454252243, "mean_gen_accuracy": 0.8677933514118195, "mean_token_accuracy": 0.8997442126274109, "num_tokens": 674720468.0, "sample_num_tokens": 8891.0, "step": 6486, "total_num_tokens": 674756032.0, "z_loss": 0.000506310723721981 }, { "copy_logits_max": -5.76756477355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.375, "epoch": 1.3249425580801635, "gen_logits_max": 3.505927562713623, "gen_logits_mean": -17.576847076416016, "gen_logits_min": -29.03184700012207, "gen_logits_std": 3.065471649169922, "gen_loss": 0.2974517047405243, "grad_norm": 0.3884494159338841, "learning_rate": 2.273557894736842e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9969240576028824, "mean_gen_accuracy": 0.8790250569581985, "mean_token_accuracy": 0.9076800495386124, "num_tokens": 675004755.0, "sample_num_tokens": 8961.25, "step": 6487, "total_num_tokens": 675040600.0, "z_loss": 0.000555607199203223 }, { "copy_logits_max": -5.262560844421387, "copy_logits_min": -687500032.0, "copy_num_tokens": 394.125, "epoch": 1.3251467960173602, "gen_logits_max": 3.528468132019043, "gen_logits_mean": -16.818666458129883, "gen_logits_min": -28.366661071777344, "gen_logits_std": 3.0748071670532227, "gen_loss": 0.2779405117034912, "grad_norm": 0.413115171260718, "learning_rate": 2.2734315789473685e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9957057982683182, "mean_gen_accuracy": 0.8736582249403, "mean_token_accuracy": 0.9016677588224411, "num_tokens": 675261002.0, "sample_num_tokens": 7750.5, "step": 6488, "total_num_tokens": 675292004.0, "z_loss": 0.0004829663666896522 }, { "copy_logits_max": -3.669656276702881, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.625, "epoch": 1.3253510339545571, "gen_logits_max": 3.2413406372070312, "gen_logits_mean": -16.980199813842773, "gen_logits_min": -29.187585830688477, "gen_logits_std": 3.1382665634155273, "gen_loss": 0.26520639657974243, "grad_norm": 0.3553506483407723, "learning_rate": 2.2733052631578946e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9954899698495865, "mean_gen_accuracy": 0.8790755867958069, "mean_token_accuracy": 0.9082776010036469, "num_tokens": 675529836.0, "sample_num_tokens": 8585.5, "step": 6489, "total_num_tokens": 675564178.0, "z_loss": 0.0005437443032860756 }, { "copy_logits_max": -4.6949262619018555, "copy_logits_min": -687500096.0, "copy_num_tokens": 476.875, "epoch": 1.3255552718917538, "gen_logits_max": 3.874929428100586, "gen_logits_mean": -16.17723274230957, "gen_logits_min": -28.42599868774414, "gen_logits_std": 3.058920383453369, "gen_loss": 0.2861521244049072, "grad_norm": 0.45690185151340773, "learning_rate": 2.273178947368421e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9959787577390671, "mean_gen_accuracy": 0.8749029189348221, "mean_token_accuracy": 0.9058769345283508, "num_tokens": 675794936.0, "sample_num_tokens": 8623.5, "step": 6490, "total_num_tokens": 675829430.0, "z_loss": 0.0005228989175520837 }, { "copy_logits_max": -4.141627311706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.4375, "epoch": 1.3257595098289507, "gen_logits_max": 4.59370231628418, "gen_logits_mean": -14.930572509765625, "gen_logits_min": -26.957626342773438, "gen_logits_std": 3.0505521297454834, "gen_loss": 0.28247570991516113, "grad_norm": 0.36574400970302184, "learning_rate": 2.273052631578947e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9972254484891891, "mean_gen_accuracy": 0.8785753101110458, "mean_token_accuracy": 0.9140863120555878, "num_tokens": 676093892.0, "sample_num_tokens": 8794.0, "step": 6491, "total_num_tokens": 676129068.0, "z_loss": 0.0005024533020332456 }, { "copy_logits_max": -5.9752326011657715, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.0, "epoch": 1.3259637477661474, "gen_logits_max": 2.692869186401367, "gen_logits_mean": -18.53624725341797, "gen_logits_min": -30.37433433532715, "gen_logits_std": 3.1502132415771484, "gen_loss": 0.264279842376709, "grad_norm": 0.44550833068674905, "learning_rate": 2.272926315789474e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9963847249746323, "mean_gen_accuracy": 0.8749326169490814, "mean_token_accuracy": 0.9084220081567764, "num_tokens": 676375555.0, "sample_num_tokens": 7806.75, "step": 6492, "total_num_tokens": 676406782.0, "z_loss": 0.00047144401469267905 }, { "copy_logits_max": -5.013096332550049, "copy_logits_min": -750000064.0, "copy_num_tokens": 454.1875, "epoch": 1.3261679857033444, "gen_logits_max": 4.065736770629883, "gen_logits_mean": -15.328566551208496, "gen_logits_min": -26.711402893066406, "gen_logits_std": 3.017805576324463, "gen_loss": 0.28751829266548157, "grad_norm": 0.4026373852147372, "learning_rate": 2.2728000000000003e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9957043826580048, "mean_gen_accuracy": 0.8717405498027802, "mean_token_accuracy": 0.9008971005678177, "num_tokens": 676639081.0, "sample_num_tokens": 8719.75, "step": 6493, "total_num_tokens": 676673960.0, "z_loss": 0.0004628981405403465 }, { "copy_logits_max": -5.425621032714844, "copy_logits_min": -750000000.0, "copy_num_tokens": 296.9375, "epoch": 1.3263722236405413, "gen_logits_max": 4.398741722106934, "gen_logits_mean": -15.135366439819336, "gen_logits_min": -26.659149169921875, "gen_logits_std": 2.9806809425354004, "gen_loss": 0.28313755989074707, "grad_norm": 0.4227685623269179, "learning_rate": 2.2726736842105264e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9948443025350571, "mean_gen_accuracy": 0.8789307773113251, "mean_token_accuracy": 0.8995323181152344, "num_tokens": 676885038.0, "sample_num_tokens": 7863.0, "step": 6494, "total_num_tokens": 676916490.0, "z_loss": 0.0004754909605253488 }, { "copy_logits_max": -2.2342734336853027, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.3125, "epoch": 1.326576461577738, "gen_logits_max": 4.369914531707764, "gen_logits_mean": -14.137195587158203, "gen_logits_min": -26.374374389648438, "gen_logits_std": 3.008957862854004, "gen_loss": 0.29493749141693115, "grad_norm": 0.3765113193734542, "learning_rate": 2.272547368421053e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9956322461366653, "mean_gen_accuracy": 0.8752052634954453, "mean_token_accuracy": 0.9038743376731873, "num_tokens": 677132939.0, "sample_num_tokens": 8608.25, "step": 6495, "total_num_tokens": 677167372.0, "z_loss": 0.0006631819996982813 }, { "copy_logits_max": -2.987776279449463, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.25, "epoch": 1.326780699514935, "gen_logits_max": 3.7371256351470947, "gen_logits_mean": -14.566993713378906, "gen_logits_min": -26.341384887695312, "gen_logits_std": 3.0070858001708984, "gen_loss": 0.2907249927520752, "grad_norm": 0.3657662650442264, "learning_rate": 2.272421052631579e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9974697828292847, "mean_gen_accuracy": 0.8740136623382568, "mean_token_accuracy": 0.908838614821434, "num_tokens": 677411707.0, "sample_num_tokens": 9022.25, "step": 6496, "total_num_tokens": 677447796.0, "z_loss": 0.000702973804436624 }, { "copy_logits_max": -3.5342912673950195, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.4375, "epoch": 1.3269849374521319, "gen_logits_max": 5.125273704528809, "gen_logits_mean": -13.493946075439453, "gen_logits_min": -25.902847290039062, "gen_logits_std": 3.0099985599517822, "gen_loss": 0.2784954905509949, "grad_norm": 0.36968147336906804, "learning_rate": 2.2722947368421054e-05, "loss": 0.2703, "mean_copy_accuracy": 0.996692955493927, "mean_gen_accuracy": 0.8773521333932877, "mean_token_accuracy": 0.9088519662618637, "num_tokens": 677690329.0, "sample_num_tokens": 8693.25, "step": 6497, "total_num_tokens": 677725102.0, "z_loss": 0.0006615051534026861 }, { "copy_logits_max": -5.031960487365723, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.9375, "epoch": 1.3271891753893286, "gen_logits_max": 4.663747310638428, "gen_logits_mean": -15.337799072265625, "gen_logits_min": -26.53217315673828, "gen_logits_std": 3.010298728942871, "gen_loss": 0.30365073680877686, "grad_norm": 0.3913723445839969, "learning_rate": 2.2721684210526315e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9957020431756973, "mean_gen_accuracy": 0.8718939870595932, "mean_token_accuracy": 0.9002253860235214, "num_tokens": 677936944.0, "sample_num_tokens": 8793.0, "step": 6498, "total_num_tokens": 677972116.0, "z_loss": 0.0005995614337734878 }, { "copy_logits_max": -5.081512928009033, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.3125, "epoch": 1.3273934133265253, "gen_logits_max": 4.744961738586426, "gen_logits_mean": -15.016948699951172, "gen_logits_min": -26.84502410888672, "gen_logits_std": 3.071899890899658, "gen_loss": 0.3070247173309326, "grad_norm": 0.35936559171340077, "learning_rate": 2.272042105263158e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9954283982515335, "mean_gen_accuracy": 0.876071110367775, "mean_token_accuracy": 0.9059441238641739, "num_tokens": 678242433.0, "sample_num_tokens": 8750.75, "step": 6499, "total_num_tokens": 678277436.0, "z_loss": 0.0006095331627875566 }, { "epoch": 1.3275976512637222, "grad_norm": 0.35949934771525366, "learning_rate": 2.2719157894736843e-05, "loss": 0.2623, "step": 6500 }, { "epoch": 1.3275976512637222, "eval_copy_logits_max": -7.368526458740234, "eval_copy_logits_min": -81.79364013671875, "eval_gen_logits_max": 2.8069934844970703, "eval_gen_logits_mean": -21.171184539794922, "eval_gen_logits_min": -32.21269607543945, "eval_gen_logits_std": 3.161245822906494, "eval_gen_loss": 0.32552847266197205, "eval_loss": 0.3006582260131836, "eval_mean_copy_accuracy": 0.9933502078056335, "eval_mean_gen_accuracy": 0.8827762603759766, "eval_mean_token_accuracy": 0.8971404433250427, "eval_num_tokens": 678550946.0, "eval_runtime": 0.6839, "eval_samples_per_second": 11.698, "eval_steps_per_second": 2.924, "eval_total_num_tokens": 678550946.0, "eval_z_loss": 0.0005272179841995239, "step": 6500 }, { "copy_logits_max": -4.757665634155273, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.375, "epoch": 1.3278018892009191, "gen_logits_max": 4.526881694793701, "gen_logits_mean": -15.047636032104492, "gen_logits_min": -26.800586700439453, "gen_logits_std": 3.064532518386841, "gen_loss": 0.28583860397338867, "grad_norm": 0.3656883499376238, "learning_rate": 2.2717894736842108e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9960514307022095, "mean_gen_accuracy": 0.879428543150425, "mean_token_accuracy": 0.9078773260116577, "num_tokens": 678779067.0, "sample_num_tokens": 8147.25, "step": 6501, "total_num_tokens": 678811656.0, "z_loss": 0.0005160103319212794 }, { "copy_logits_max": -3.9322926998138428, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.4375, "epoch": 1.3280061271381158, "gen_logits_max": 3.5806355476379395, "gen_logits_mean": -16.898500442504883, "gen_logits_min": -28.734355926513672, "gen_logits_std": 3.0891501903533936, "gen_loss": 0.29319795966148376, "grad_norm": 0.37841934239263103, "learning_rate": 2.271663157894737e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9959165751934052, "mean_gen_accuracy": 0.8808063119649887, "mean_token_accuracy": 0.9106682538986206, "num_tokens": 679033601.0, "sample_num_tokens": 8464.75, "step": 6502, "total_num_tokens": 679067460.0, "z_loss": 0.0005410578451119363 }, { "copy_logits_max": -4.988562107086182, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.75, "epoch": 1.3282103650753128, "gen_logits_max": 4.5731377601623535, "gen_logits_mean": -16.088788986206055, "gen_logits_min": -27.399301528930664, "gen_logits_std": 3.0305752754211426, "gen_loss": 0.2818956673145294, "grad_norm": 0.35982332916485166, "learning_rate": 2.2715368421052633e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9973507821559906, "mean_gen_accuracy": 0.8821029514074326, "mean_token_accuracy": 0.9098737835884094, "num_tokens": 679327340.0, "sample_num_tokens": 9287.0, "step": 6503, "total_num_tokens": 679364488.0, "z_loss": 0.0004902890650555491 }, { "copy_logits_max": -1.3904709815979004, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.25, "epoch": 1.3284146030125097, "gen_logits_max": 2.946704864501953, "gen_logits_mean": -16.581911087036133, "gen_logits_min": -29.39461898803711, "gen_logits_std": 3.14727783203125, "gen_loss": 0.2590431571006775, "grad_norm": 0.3490690300604483, "learning_rate": 2.2714105263157894e-05, "loss": 0.2442, "mean_copy_accuracy": 0.9973829686641693, "mean_gen_accuracy": 0.8843802511692047, "mean_token_accuracy": 0.9176937490701675, "num_tokens": 679603618.0, "sample_num_tokens": 8314.5, "step": 6504, "total_num_tokens": 679636876.0, "z_loss": 0.000491464277729392 }, { "copy_logits_max": -4.973177909851074, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.5625, "epoch": 1.3286188409497064, "gen_logits_max": 4.850909233093262, "gen_logits_mean": -15.495803833007812, "gen_logits_min": -27.259071350097656, "gen_logits_std": 3.0403428077697754, "gen_loss": 0.29660338163375854, "grad_norm": 0.3532040201104138, "learning_rate": 2.2712842105263158e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9961091130971909, "mean_gen_accuracy": 0.8796968758106232, "mean_token_accuracy": 0.9060664772987366, "num_tokens": 679866500.0, "sample_num_tokens": 7756.0, "step": 6505, "total_num_tokens": 679897524.0, "z_loss": 0.0004902351647615433 }, { "copy_logits_max": -3.0145070552825928, "copy_logits_min": -750000064.0, "copy_num_tokens": 584.875, "epoch": 1.3288230788869033, "gen_logits_max": 3.282115936279297, "gen_logits_mean": -16.107219696044922, "gen_logits_min": -28.544063568115234, "gen_logits_std": 3.073655128479004, "gen_loss": 0.28948521614074707, "grad_norm": 0.35966949411925897, "learning_rate": 2.2711578947368422e-05, "loss": 0.272, "mean_copy_accuracy": 0.9962029606103897, "mean_gen_accuracy": 0.8770421147346497, "mean_token_accuracy": 0.9066374897956848, "num_tokens": 680160450.0, "sample_num_tokens": 8811.5, "step": 6506, "total_num_tokens": 680195696.0, "z_loss": 0.0005345428362488747 }, { "copy_logits_max": -4.053772449493408, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.5, "epoch": 1.3290273168241, "gen_logits_max": 4.002148151397705, "gen_logits_mean": -15.028857231140137, "gen_logits_min": -27.307432174682617, "gen_logits_std": 2.98476505279541, "gen_loss": 0.2885063886642456, "grad_norm": 0.44071782873548243, "learning_rate": 2.2710315789473683e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9958391487598419, "mean_gen_accuracy": 0.8743336945772171, "mean_token_accuracy": 0.9006942212581635, "num_tokens": 680426364.0, "sample_num_tokens": 8557.0, "step": 6507, "total_num_tokens": 680460592.0, "z_loss": 0.0005593409878201783 }, { "copy_logits_max": -5.143159866333008, "copy_logits_min": -625000064.0, "copy_num_tokens": 554.5625, "epoch": 1.329231554761297, "gen_logits_max": 2.8078982830047607, "gen_logits_mean": -17.345394134521484, "gen_logits_min": -29.57792854309082, "gen_logits_std": 3.100524425506592, "gen_loss": 0.23486466705799103, "grad_norm": 0.38746869765133046, "learning_rate": 2.2709052631578948e-05, "loss": 0.275, "mean_copy_accuracy": 0.9961296021938324, "mean_gen_accuracy": 0.8803047984838486, "mean_token_accuracy": 0.9073480367660522, "num_tokens": 680693903.0, "sample_num_tokens": 8848.75, "step": 6508, "total_num_tokens": 680729298.0, "z_loss": 0.0005205453489907086 }, { "copy_logits_max": -5.186015605926514, "copy_logits_min": -687500032.0, "copy_num_tokens": 652.75, "epoch": 1.3294357926984937, "gen_logits_max": 2.6840269565582275, "gen_logits_mean": -17.219478607177734, "gen_logits_min": -29.213457107543945, "gen_logits_std": 3.1126363277435303, "gen_loss": 0.26319485902786255, "grad_norm": 0.3918980374311222, "learning_rate": 2.2707789473684212e-05, "loss": 0.2563, "mean_copy_accuracy": 0.9973415285348892, "mean_gen_accuracy": 0.8738577216863632, "mean_token_accuracy": 0.9126044362783432, "num_tokens": 680985781.0, "sample_num_tokens": 9996.25, "step": 6509, "total_num_tokens": 681025766.0, "z_loss": 0.00048812024760991335 }, { "copy_logits_max": -4.6545209884643555, "copy_logits_min": -562500032.0, "copy_num_tokens": 614.6875, "epoch": 1.3296400306356906, "gen_logits_max": 5.2620320320129395, "gen_logits_mean": -14.05929183959961, "gen_logits_min": -26.64950180053711, "gen_logits_std": 3.043783187866211, "gen_loss": 0.2745548188686371, "grad_norm": 0.3682170360148779, "learning_rate": 2.2706526315789476e-05, "loss": 0.2756, "mean_copy_accuracy": 0.996647983789444, "mean_gen_accuracy": 0.8748040199279785, "mean_token_accuracy": 0.9063196182250977, "num_tokens": 681260892.0, "sample_num_tokens": 9774.5, "step": 6510, "total_num_tokens": 681299990.0, "z_loss": 0.0005453816847875714 }, { "copy_logits_max": -5.351572036743164, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.3125, "epoch": 1.3298442685728875, "gen_logits_max": 5.584977149963379, "gen_logits_mean": -15.35543441772461, "gen_logits_min": -27.085763931274414, "gen_logits_std": 3.0819485187530518, "gen_loss": 0.31084591150283813, "grad_norm": 0.41096322127671436, "learning_rate": 2.2705263157894737e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9953965097665787, "mean_gen_accuracy": 0.8724645674228668, "mean_token_accuracy": 0.9001214802265167, "num_tokens": 681530334.0, "sample_num_tokens": 7743.5, "step": 6511, "total_num_tokens": 681561308.0, "z_loss": 0.0005886688595637679 }, { "copy_logits_max": -3.811373710632324, "copy_logits_min": -687500032.0, "copy_num_tokens": 570.75, "epoch": 1.3300485065100842, "gen_logits_max": 2.2189927101135254, "gen_logits_mean": -18.436664581298828, "gen_logits_min": -30.70475196838379, "gen_logits_std": 3.176903247833252, "gen_loss": 0.2784687280654907, "grad_norm": 0.44893838070447006, "learning_rate": 2.2704e-05, "loss": 0.3138, "mean_copy_accuracy": 0.9954449385404587, "mean_gen_accuracy": 0.8610031455755234, "mean_token_accuracy": 0.8936209976673126, "num_tokens": 681801991.0, "sample_num_tokens": 8926.25, "step": 6512, "total_num_tokens": 681837696.0, "z_loss": 0.0005573880043812096 }, { "copy_logits_max": -4.578321933746338, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.8125, "epoch": 1.3302527444472811, "gen_logits_max": 5.489400863647461, "gen_logits_mean": -13.141764640808105, "gen_logits_min": -25.225481033325195, "gen_logits_std": 3.029691696166992, "gen_loss": 0.2952173352241516, "grad_norm": 0.40673223580708573, "learning_rate": 2.2702736842105262e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9967741519212723, "mean_gen_accuracy": 0.8792298138141632, "mean_token_accuracy": 0.9067609906196594, "num_tokens": 682066258.0, "sample_num_tokens": 8752.5, "step": 6513, "total_num_tokens": 682101268.0, "z_loss": 0.0005607841303572059 }, { "copy_logits_max": -3.6587915420532227, "copy_logits_min": -750000064.0, "copy_num_tokens": 500.1875, "epoch": 1.3304569823844778, "gen_logits_max": 3.100343704223633, "gen_logits_mean": -17.144702911376953, "gen_logits_min": -29.171409606933594, "gen_logits_std": 3.1398768424987793, "gen_loss": 0.22887755930423737, "grad_norm": 0.4042117188507484, "learning_rate": 2.2701473684210527e-05, "loss": 0.276, "mean_copy_accuracy": 0.9963089674711227, "mean_gen_accuracy": 0.874392181634903, "mean_token_accuracy": 0.9067694395780563, "num_tokens": 682323334.0, "sample_num_tokens": 7968.5, "step": 6514, "total_num_tokens": 682355208.0, "z_loss": 0.0005277713062241673 }, { "copy_logits_max": -4.748689651489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.1875, "epoch": 1.3306612203216748, "gen_logits_max": 4.245330810546875, "gen_logits_mean": -15.291238784790039, "gen_logits_min": -27.371994018554688, "gen_logits_std": 3.0749411582946777, "gen_loss": 0.2583833932876587, "grad_norm": 0.37310680885551495, "learning_rate": 2.2700210526315788e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9958234131336212, "mean_gen_accuracy": 0.8805978745222092, "mean_token_accuracy": 0.9049879610538483, "num_tokens": 682587838.0, "sample_num_tokens": 7242.5, "step": 6515, "total_num_tokens": 682616808.0, "z_loss": 0.0004815561987925321 }, { "copy_logits_max": -5.849756717681885, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.375, "epoch": 1.3308654582588715, "gen_logits_max": 4.804593086242676, "gen_logits_mean": -15.431109428405762, "gen_logits_min": -27.33053207397461, "gen_logits_std": 3.0882022380828857, "gen_loss": 0.2784704566001892, "grad_norm": 0.38602868975306165, "learning_rate": 2.2698947368421052e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9962056130170822, "mean_gen_accuracy": 0.8728995025157928, "mean_token_accuracy": 0.9034039378166199, "num_tokens": 682865878.0, "sample_num_tokens": 9209.0, "step": 6516, "total_num_tokens": 682902714.0, "z_loss": 0.0004970533773303032 }, { "copy_logits_max": -6.929206848144531, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.25, "epoch": 1.3310696961960684, "gen_logits_max": 4.023047924041748, "gen_logits_mean": -16.755586624145508, "gen_logits_min": -28.895856857299805, "gen_logits_std": 3.1366519927978516, "gen_loss": 0.267422080039978, "grad_norm": 0.36127388536581945, "learning_rate": 2.2697684210526316e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9963438510894775, "mean_gen_accuracy": 0.8785978406667709, "mean_token_accuracy": 0.9076039046049118, "num_tokens": 683134303.0, "sample_num_tokens": 8479.75, "step": 6517, "total_num_tokens": 683168222.0, "z_loss": 0.0005170410731807351 }, { "copy_logits_max": -4.269261360168457, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.1875, "epoch": 1.3312739341332653, "gen_logits_max": 4.986596584320068, "gen_logits_mean": -15.356328964233398, "gen_logits_min": -27.48329734802246, "gen_logits_std": 3.0799708366394043, "gen_loss": 0.268951416015625, "grad_norm": 0.3345334853888955, "learning_rate": 2.269642105263158e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9962048530578613, "mean_gen_accuracy": 0.8828834742307663, "mean_token_accuracy": 0.9101562052965164, "num_tokens": 683410091.0, "sample_num_tokens": 8846.75, "step": 6518, "total_num_tokens": 683445478.0, "z_loss": 0.0005645040655508637 }, { "copy_logits_max": -6.245822906494141, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.6875, "epoch": 1.331478172070462, "gen_logits_max": 5.35463809967041, "gen_logits_mean": -15.058645248413086, "gen_logits_min": -27.906972885131836, "gen_logits_std": 3.1072006225585938, "gen_loss": 0.2550458610057831, "grad_norm": 0.35579239580947536, "learning_rate": 2.2695157894736845e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9954910427331924, "mean_gen_accuracy": 0.8758040368556976, "mean_token_accuracy": 0.905053436756134, "num_tokens": 683680408.0, "sample_num_tokens": 7064.5, "step": 6519, "total_num_tokens": 683708666.0, "z_loss": 0.000549547839909792 }, { "copy_logits_max": -8.319578170776367, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.0, "epoch": 1.331682410007659, "gen_logits_max": 6.480235576629639, "gen_logits_mean": -13.818412780761719, "gen_logits_min": -26.198741912841797, "gen_logits_std": 3.0381150245666504, "gen_loss": 0.31511402130126953, "grad_norm": 0.3875229177779439, "learning_rate": 2.2693894736842106e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9947658777236938, "mean_gen_accuracy": 0.8743275701999664, "mean_token_accuracy": 0.9007097631692886, "num_tokens": 683936779.0, "sample_num_tokens": 7302.75, "step": 6520, "total_num_tokens": 683965990.0, "z_loss": 0.0006141390185803175 }, { "copy_logits_max": -7.090958595275879, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.125, "epoch": 1.331886647944856, "gen_logits_max": 4.607579231262207, "gen_logits_mean": -15.402414321899414, "gen_logits_min": -27.61888885498047, "gen_logits_std": 3.1031007766723633, "gen_loss": 0.280856728553772, "grad_norm": 0.37566741117576014, "learning_rate": 2.269263157894737e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9954180866479874, "mean_gen_accuracy": 0.8771000653505325, "mean_token_accuracy": 0.9052578061819077, "num_tokens": 684188782.0, "sample_num_tokens": 9497.5, "step": 6521, "total_num_tokens": 684226772.0, "z_loss": 0.0005310840788297355 }, { "copy_logits_max": -6.867443561553955, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.1875, "epoch": 1.3320908858820526, "gen_logits_max": 5.384953498840332, "gen_logits_mean": -14.111778259277344, "gen_logits_min": -26.36353302001953, "gen_logits_std": 3.0639562606811523, "gen_loss": 0.2561212182044983, "grad_norm": 0.37867833502467463, "learning_rate": 2.269136842105263e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9967107027769089, "mean_gen_accuracy": 0.8763161599636078, "mean_token_accuracy": 0.9056411683559418, "num_tokens": 684481254.0, "sample_num_tokens": 8613.5, "step": 6522, "total_num_tokens": 684515708.0, "z_loss": 0.0004902094369754195 }, { "copy_logits_max": -6.856880187988281, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.75, "epoch": 1.3322951238192493, "gen_logits_max": 4.601212501525879, "gen_logits_mean": -15.388681411743164, "gen_logits_min": -27.69567108154297, "gen_logits_std": 3.107163667678833, "gen_loss": 0.27243661880493164, "grad_norm": 0.3756273256125328, "learning_rate": 2.2690105263157895e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9959441870450974, "mean_gen_accuracy": 0.8755082935094833, "mean_token_accuracy": 0.9030013233423233, "num_tokens": 684735468.0, "sample_num_tokens": 7742.0, "step": 6523, "total_num_tokens": 684766436.0, "z_loss": 0.0005162870511412621 }, { "copy_logits_max": -5.711567401885986, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.4375, "epoch": 1.3324993617564462, "gen_logits_max": 4.733748435974121, "gen_logits_mean": -14.134599685668945, "gen_logits_min": -26.526731491088867, "gen_logits_std": 3.079829454421997, "gen_loss": 0.2997073233127594, "grad_norm": 0.3509881919255131, "learning_rate": 2.2688842105263156e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9973210990428925, "mean_gen_accuracy": 0.8774093985557556, "mean_token_accuracy": 0.9076132476329803, "num_tokens": 685012935.0, "sample_num_tokens": 8524.25, "step": 6524, "total_num_tokens": 685047032.0, "z_loss": 0.0006156090530566871 }, { "copy_logits_max": -5.127912998199463, "copy_logits_min": -750000064.0, "copy_num_tokens": 461.0, "epoch": 1.3327035996936432, "gen_logits_max": 5.0535173416137695, "gen_logits_mean": -14.770322799682617, "gen_logits_min": -26.894386291503906, "gen_logits_std": 3.0873360633850098, "gen_loss": 0.31547972559928894, "grad_norm": 0.37562616293242146, "learning_rate": 2.2687578947368424e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9975084513425827, "mean_gen_accuracy": 0.876486212015152, "mean_token_accuracy": 0.9074946939945221, "num_tokens": 685285743.0, "sample_num_tokens": 9562.75, "step": 6525, "total_num_tokens": 685323994.0, "z_loss": 0.0005953480722382665 }, { "copy_logits_max": -5.728702545166016, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.8125, "epoch": 1.3329078376308399, "gen_logits_max": 3.7013425827026367, "gen_logits_mean": -16.806245803833008, "gen_logits_min": -29.19507598876953, "gen_logits_std": 3.136991500854492, "gen_loss": 0.28171011805534363, "grad_norm": 0.3657291399459368, "learning_rate": 2.2686315789473685e-05, "loss": 0.2931, "mean_copy_accuracy": 0.9967691600322723, "mean_gen_accuracy": 0.873898908495903, "mean_token_accuracy": 0.900763675570488, "num_tokens": 685558848.0, "sample_num_tokens": 9360.5, "step": 6526, "total_num_tokens": 685596290.0, "z_loss": 0.0005119962734170258 }, { "copy_logits_max": -7.459989547729492, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.1875, "epoch": 1.3331120755680368, "gen_logits_max": 4.970292091369629, "gen_logits_mean": -14.44936752319336, "gen_logits_min": -26.40689468383789, "gen_logits_std": 3.0728201866149902, "gen_loss": 0.2640431225299835, "grad_norm": 0.3652075087314215, "learning_rate": 2.268505263157895e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9959330260753632, "mean_gen_accuracy": 0.8778659701347351, "mean_token_accuracy": 0.9063519090414047, "num_tokens": 685828976.0, "sample_num_tokens": 7754.0, "step": 6527, "total_num_tokens": 685859992.0, "z_loss": 0.0004916819743812084 }, { "copy_logits_max": -6.967914581298828, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.8125, "epoch": 1.3333163135052337, "gen_logits_max": 4.569777488708496, "gen_logits_mean": -15.775062561035156, "gen_logits_min": -27.877168655395508, "gen_logits_std": 3.1236720085144043, "gen_loss": 0.3031843900680542, "grad_norm": 0.36003375804965276, "learning_rate": 2.268378947368421e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9961051493883133, "mean_gen_accuracy": 0.8715856224298477, "mean_token_accuracy": 0.9008388221263885, "num_tokens": 686104781.0, "sample_num_tokens": 8413.75, "step": 6528, "total_num_tokens": 686138436.0, "z_loss": 0.0005362315569072962 }, { "copy_logits_max": -4.524280548095703, "copy_logits_min": -750000064.0, "copy_num_tokens": 459.25, "epoch": 1.3335205514424304, "gen_logits_max": 5.902299880981445, "gen_logits_mean": -12.771431922912598, "gen_logits_min": -25.300559997558594, "gen_logits_std": 3.0785903930664062, "gen_loss": 0.2695392072200775, "grad_norm": 0.3697147665036643, "learning_rate": 2.2682526315789474e-05, "loss": 0.2836, "mean_copy_accuracy": 0.996126651763916, "mean_gen_accuracy": 0.8736391961574554, "mean_token_accuracy": 0.9016827791929245, "num_tokens": 686384442.0, "sample_num_tokens": 8270.0, "step": 6529, "total_num_tokens": 686417522.0, "z_loss": 0.00046770379412919283 }, { "copy_logits_max": -4.897780418395996, "copy_logits_min": -750000000.0, "copy_num_tokens": 696.4375, "epoch": 1.3337247893796271, "gen_logits_max": 3.8052382469177246, "gen_logits_mean": -15.381935119628906, "gen_logits_min": -27.977169036865234, "gen_logits_std": 3.131444215774536, "gen_loss": 0.28396695852279663, "grad_norm": 0.3764613370005059, "learning_rate": 2.2681263157894735e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9970397502183914, "mean_gen_accuracy": 0.8746695071458817, "mean_token_accuracy": 0.9100570678710938, "num_tokens": 686662109.0, "sample_num_tokens": 10185.25, "step": 6530, "total_num_tokens": 686702850.0, "z_loss": 0.0004609891911968589 }, { "copy_logits_max": -7.138371467590332, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.125, "epoch": 1.333929027316824, "gen_logits_max": 4.780501842498779, "gen_logits_mean": -14.674287796020508, "gen_logits_min": -27.016582489013672, "gen_logits_std": 3.080503463745117, "gen_loss": 0.2958548665046692, "grad_norm": 0.3869002231186963, "learning_rate": 2.268e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9970269948244095, "mean_gen_accuracy": 0.8733604848384857, "mean_token_accuracy": 0.9039544463157654, "num_tokens": 686935306.0, "sample_num_tokens": 7510.0, "step": 6531, "total_num_tokens": 686965346.0, "z_loss": 0.00048816160415299237 }, { "copy_logits_max": -5.7585649490356445, "copy_logits_min": -687500032.0, "copy_num_tokens": 401.0625, "epoch": 1.334133265254021, "gen_logits_max": 3.371023654937744, "gen_logits_mean": -16.485151290893555, "gen_logits_min": -28.906885147094727, "gen_logits_std": 3.164388656616211, "gen_loss": 0.2441735565662384, "grad_norm": 0.37990436411421286, "learning_rate": 2.267873684210526e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9956367909908295, "mean_gen_accuracy": 0.8785832524299622, "mean_token_accuracy": 0.9080619215965271, "num_tokens": 687203245.0, "sample_num_tokens": 7573.25, "step": 6532, "total_num_tokens": 687233538.0, "z_loss": 0.00036982481833547354 }, { "copy_logits_max": -4.716609954833984, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.4375, "epoch": 1.3343375031912177, "gen_logits_max": 4.6503705978393555, "gen_logits_mean": -13.864476203918457, "gen_logits_min": -26.365245819091797, "gen_logits_std": 3.0684304237365723, "gen_loss": 0.25349265336990356, "grad_norm": 0.37587372236454, "learning_rate": 2.267747368421053e-05, "loss": 0.264, "mean_copy_accuracy": 0.9962109476327896, "mean_gen_accuracy": 0.8808545768260956, "mean_token_accuracy": 0.9084265381097794, "num_tokens": 687468600.0, "sample_num_tokens": 8877.0, "step": 6533, "total_num_tokens": 687504108.0, "z_loss": 0.00041881599463522434 }, { "copy_logits_max": -4.952017784118652, "copy_logits_min": -687500032.0, "copy_num_tokens": 398.125, "epoch": 1.3345417411284146, "gen_logits_max": 4.178117752075195, "gen_logits_mean": -16.02187156677246, "gen_logits_min": -28.102693557739258, "gen_logits_std": 3.106132984161377, "gen_loss": 0.30896660685539246, "grad_norm": 0.3568976145994992, "learning_rate": 2.2676210526315793e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9960793554782867, "mean_gen_accuracy": 0.8770337402820587, "mean_token_accuracy": 0.9059410989284515, "num_tokens": 687728252.0, "sample_num_tokens": 7344.0, "step": 6534, "total_num_tokens": 687757628.0, "z_loss": 0.00048766544205136597 }, { "copy_logits_max": -5.079489231109619, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.9375, "epoch": 1.3347459790656115, "gen_logits_max": 4.747935771942139, "gen_logits_mean": -15.235383987426758, "gen_logits_min": -27.52696990966797, "gen_logits_std": 3.073991537094116, "gen_loss": 0.30190154910087585, "grad_norm": 0.35887097782162, "learning_rate": 2.2674947368421054e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9971035420894623, "mean_gen_accuracy": 0.8750435709953308, "mean_token_accuracy": 0.906242161989212, "num_tokens": 688009028.0, "sample_num_tokens": 9469.5, "step": 6535, "total_num_tokens": 688046906.0, "z_loss": 0.0004974277690052986 }, { "copy_logits_max": -5.36057186126709, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.1875, "epoch": 1.3349502170028082, "gen_logits_max": 4.83197021484375, "gen_logits_mean": -15.571178436279297, "gen_logits_min": -28.003347396850586, "gen_logits_std": 3.0992238521575928, "gen_loss": 0.29134243726730347, "grad_norm": 0.41210971810983316, "learning_rate": 2.2673684210526318e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9967472553253174, "mean_gen_accuracy": 0.877034917473793, "mean_token_accuracy": 0.9090335071086884, "num_tokens": 688275015.0, "sample_num_tokens": 7430.75, "step": 6536, "total_num_tokens": 688304738.0, "z_loss": 0.0004813397245015949 }, { "copy_logits_max": -4.0724382400512695, "copy_logits_min": -687500032.0, "copy_num_tokens": 348.0, "epoch": 1.3351544549400052, "gen_logits_max": 4.136858940124512, "gen_logits_mean": -15.46328067779541, "gen_logits_min": -28.02105140686035, "gen_logits_std": 3.076063632965088, "gen_loss": 0.30251210927963257, "grad_norm": 0.36802191659311256, "learning_rate": 2.267242105263158e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9958111494779587, "mean_gen_accuracy": 0.8741345107555389, "mean_token_accuracy": 0.9018101841211319, "num_tokens": 688546631.0, "sample_num_tokens": 7533.75, "step": 6537, "total_num_tokens": 688576766.0, "z_loss": 0.00048807586426846683 }, { "copy_logits_max": -5.216493129730225, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.3125, "epoch": 1.3353586928772019, "gen_logits_max": 3.534820079803467, "gen_logits_mean": -17.15069007873535, "gen_logits_min": -29.435073852539062, "gen_logits_std": 3.1411824226379395, "gen_loss": 0.2762809991836548, "grad_norm": 0.3682997008062138, "learning_rate": 2.2671157894736843e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9955760538578033, "mean_gen_accuracy": 0.8817828446626663, "mean_token_accuracy": 0.9048227071762085, "num_tokens": 688812289.0, "sample_num_tokens": 8327.25, "step": 6538, "total_num_tokens": 688845598.0, "z_loss": 0.0004958497011102736 }, { "copy_logits_max": -4.449268817901611, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.8125, "epoch": 1.3355629308143988, "gen_logits_max": 4.345654010772705, "gen_logits_mean": -15.749171257019043, "gen_logits_min": -27.89474105834961, "gen_logits_std": 3.112100124359131, "gen_loss": 0.24783523380756378, "grad_norm": 0.3694271497913292, "learning_rate": 2.2669894736842104e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9938654601573944, "mean_gen_accuracy": 0.8833395540714264, "mean_token_accuracy": 0.9038106054067612, "num_tokens": 689057575.0, "sample_num_tokens": 7311.25, "step": 6539, "total_num_tokens": 689086820.0, "z_loss": 0.00045094231609255075 }, { "copy_logits_max": -4.717523574829102, "copy_logits_min": -687500032.0, "copy_num_tokens": 715.5, "epoch": 1.3357671687515955, "gen_logits_max": 3.8908743858337402, "gen_logits_mean": -15.26559829711914, "gen_logits_min": -27.820804595947266, "gen_logits_std": 3.1148571968078613, "gen_loss": 0.24435283243656158, "grad_norm": 0.3701476305659666, "learning_rate": 2.266863157894737e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9971834719181061, "mean_gen_accuracy": 0.8799430280923843, "mean_token_accuracy": 0.9096531718969345, "num_tokens": 689326426.0, "sample_num_tokens": 9512.0, "step": 6540, "total_num_tokens": 689364474.0, "z_loss": 0.00041560549288988113 }, { "copy_logits_max": -2.5640010833740234, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.5, "epoch": 1.3359714066887924, "gen_logits_max": 3.9081616401672363, "gen_logits_mean": -15.599532127380371, "gen_logits_min": -28.282482147216797, "gen_logits_std": 3.0882601737976074, "gen_loss": 0.3320801854133606, "grad_norm": 0.4233067965839664, "learning_rate": 2.2667368421052633e-05, "loss": 0.3151, "mean_copy_accuracy": 0.9967197775840759, "mean_gen_accuracy": 0.8623377680778503, "mean_token_accuracy": 0.8945158272981644, "num_tokens": 689582270.0, "sample_num_tokens": 7556.5, "step": 6541, "total_num_tokens": 689612496.0, "z_loss": 0.0005727006355300546 }, { "copy_logits_max": -5.864243030548096, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.25, "epoch": 1.3361756446259894, "gen_logits_max": 3.714928150177002, "gen_logits_mean": -17.668474197387695, "gen_logits_min": -29.413368225097656, "gen_logits_std": 3.129871129989624, "gen_loss": 0.30630433559417725, "grad_norm": 0.3429170787886011, "learning_rate": 2.2666105263157897e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9973420351743698, "mean_gen_accuracy": 0.8757561147212982, "mean_token_accuracy": 0.9059264212846756, "num_tokens": 689863523.0, "sample_num_tokens": 7911.25, "step": 6542, "total_num_tokens": 689895168.0, "z_loss": 0.0005616797134280205 }, { "copy_logits_max": -4.166878700256348, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.0, "epoch": 1.336379882563186, "gen_logits_max": 4.897828578948975, "gen_logits_mean": -15.718338012695312, "gen_logits_min": -27.686424255371094, "gen_logits_std": 3.0762319564819336, "gen_loss": 0.27791082859039307, "grad_norm": 0.39452998947398443, "learning_rate": 2.2664842105263158e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9961133152246475, "mean_gen_accuracy": 0.8718757182359695, "mean_token_accuracy": 0.8990839868783951, "num_tokens": 690113691.0, "sample_num_tokens": 7627.75, "step": 6543, "total_num_tokens": 690144202.0, "z_loss": 0.00047646183520555496 }, { "copy_logits_max": -6.040648460388184, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.6875, "epoch": 1.336584120500383, "gen_logits_max": 4.9998369216918945, "gen_logits_mean": -16.374210357666016, "gen_logits_min": -28.362924575805664, "gen_logits_std": 3.095752239227295, "gen_loss": 0.3261202573776245, "grad_norm": 0.3795858393979685, "learning_rate": 2.2663578947368422e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9968602657318115, "mean_gen_accuracy": 0.8685210645198822, "mean_token_accuracy": 0.8988070785999298, "num_tokens": 690379796.0, "sample_num_tokens": 7504.5, "step": 6544, "total_num_tokens": 690409814.0, "z_loss": 0.0005954502848908305 }, { "copy_logits_max": -5.501623630523682, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.25, "epoch": 1.3367883584375797, "gen_logits_max": 3.1961331367492676, "gen_logits_mean": -17.28050994873047, "gen_logits_min": -29.325748443603516, "gen_logits_std": 3.1193995475769043, "gen_loss": 0.2976296842098236, "grad_norm": 0.3926646428560517, "learning_rate": 2.2662315789473683e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9962773472070694, "mean_gen_accuracy": 0.8758163452148438, "mean_token_accuracy": 0.9097090810537338, "num_tokens": 690671062.0, "sample_num_tokens": 8224.0, "step": 6545, "total_num_tokens": 690703958.0, "z_loss": 0.0005158460699021816 }, { "copy_logits_max": -1.8210028409957886, "copy_logits_min": -750000064.0, "copy_num_tokens": 499.75, "epoch": 1.3369925963747766, "gen_logits_max": 5.034602165222168, "gen_logits_mean": -14.250682830810547, "gen_logits_min": -26.70755386352539, "gen_logits_std": 3.0745978355407715, "gen_loss": 0.2486979365348816, "grad_norm": 0.3745311386911792, "learning_rate": 2.2661052631578947e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9974281638860703, "mean_gen_accuracy": 0.8670500367879868, "mean_token_accuracy": 0.9013993293046951, "num_tokens": 690941503.0, "sample_num_tokens": 8696.25, "step": 6546, "total_num_tokens": 690976288.0, "z_loss": 0.00045007828157395124 }, { "copy_logits_max": -4.601472854614258, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.875, "epoch": 1.3371968343119733, "gen_logits_max": 3.623206615447998, "gen_logits_mean": -16.291393280029297, "gen_logits_min": -28.328466415405273, "gen_logits_std": 3.1312859058380127, "gen_loss": 0.2577621340751648, "grad_norm": 0.3230236158290719, "learning_rate": 2.2659789473684212e-05, "loss": 0.2602, "mean_copy_accuracy": 0.9969214797019958, "mean_gen_accuracy": 0.8772715926170349, "mean_token_accuracy": 0.9108264744281769, "num_tokens": 691244474.0, "sample_num_tokens": 7803.5, "step": 6547, "total_num_tokens": 691275688.0, "z_loss": 0.0004648921312764287 }, { "copy_logits_max": -5.332694053649902, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.9375, "epoch": 1.3374010722491703, "gen_logits_max": 3.6644301414489746, "gen_logits_mean": -16.694080352783203, "gen_logits_min": -28.652917861938477, "gen_logits_std": 3.148925304412842, "gen_loss": 0.2290116548538208, "grad_norm": 0.3477547186103829, "learning_rate": 2.2658526315789473e-05, "loss": 0.2522, "mean_copy_accuracy": 0.9967665821313858, "mean_gen_accuracy": 0.8835034966468811, "mean_token_accuracy": 0.9143939912319183, "num_tokens": 691509333.0, "sample_num_tokens": 8287.75, "step": 6548, "total_num_tokens": 691542484.0, "z_loss": 0.00036476663080975413 }, { "copy_logits_max": -4.3952250480651855, "copy_logits_min": -687500096.0, "copy_num_tokens": 667.0625, "epoch": 1.3376053101863672, "gen_logits_max": 3.7502431869506836, "gen_logits_mean": -15.171131134033203, "gen_logits_min": -27.513568878173828, "gen_logits_std": 3.1171884536743164, "gen_loss": 0.22964075207710266, "grad_norm": 0.3587169597343015, "learning_rate": 2.265726315789474e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9960758984088898, "mean_gen_accuracy": 0.8750161677598953, "mean_token_accuracy": 0.908477395772934, "num_tokens": 691785976.0, "sample_num_tokens": 9133.5, "step": 6549, "total_num_tokens": 691822510.0, "z_loss": 0.00040406128391623497 }, { "copy_logits_max": -6.656392574310303, "copy_logits_min": -687500160.0, "copy_num_tokens": 341.3125, "epoch": 1.337809548123564, "gen_logits_max": 4.2026777267456055, "gen_logits_mean": -17.47176742553711, "gen_logits_min": -29.37325096130371, "gen_logits_std": 3.151782274246216, "gen_loss": 0.3012023866176605, "grad_norm": 0.3739474610282505, "learning_rate": 2.2656e-05, "loss": 0.261, "mean_copy_accuracy": 0.9973768889904022, "mean_gen_accuracy": 0.8777131140232086, "mean_token_accuracy": 0.9111738055944443, "num_tokens": 692060088.0, "sample_num_tokens": 7300.0, "step": 6550, "total_num_tokens": 692089288.0, "z_loss": 0.0005445080460049212 }, { "copy_logits_max": -2.3940844535827637, "copy_logits_min": -750000000.0, "copy_num_tokens": 746.0625, "epoch": 1.3380137860607608, "gen_logits_max": 4.184319972991943, "gen_logits_mean": -14.141093254089355, "gen_logits_min": -26.77008628845215, "gen_logits_std": 3.0801053047180176, "gen_loss": 0.2575203776359558, "grad_norm": 0.41428420608595995, "learning_rate": 2.2654736842105266e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9963724911212921, "mean_gen_accuracy": 0.8705417066812515, "mean_token_accuracy": 0.899170309305191, "num_tokens": 692318434.0, "sample_num_tokens": 9938.0, "step": 6551, "total_num_tokens": 692358186.0, "z_loss": 0.0004909130511805415 }, { "copy_logits_max": -3.948671579360962, "copy_logits_min": -750000064.0, "copy_num_tokens": 369.9375, "epoch": 1.3382180239979578, "gen_logits_max": 3.908740997314453, "gen_logits_mean": -16.636138916015625, "gen_logits_min": -28.487289428710938, "gen_logits_std": 3.1003684997558594, "gen_loss": 0.2824297845363617, "grad_norm": 0.41687490199767036, "learning_rate": 2.2653473684210527e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9948525428771973, "mean_gen_accuracy": 0.8776852935552597, "mean_token_accuracy": 0.9047926366329193, "num_tokens": 692589703.0, "sample_num_tokens": 7554.25, "step": 6552, "total_num_tokens": 692619920.0, "z_loss": 0.0005019316449761391 }, { "copy_logits_max": 0.25568389892578125, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.25, "epoch": 1.3384222619351545, "gen_logits_max": 5.328355312347412, "gen_logits_mean": -14.230243682861328, "gen_logits_min": -26.185876846313477, "gen_logits_std": 3.0582656860351562, "gen_loss": 0.2524243891239166, "grad_norm": 0.36079966155948634, "learning_rate": 2.265221052631579e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9972047656774521, "mean_gen_accuracy": 0.8779831230640411, "mean_token_accuracy": 0.9067858010530472, "num_tokens": 692876700.0, "sample_num_tokens": 8786.5, "step": 6553, "total_num_tokens": 692911846.0, "z_loss": 0.0005122155416756868 }, { "copy_logits_max": -2.936673641204834, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.9375, "epoch": 1.3386264998723512, "gen_logits_max": 4.086385250091553, "gen_logits_mean": -16.262142181396484, "gen_logits_min": -28.402767181396484, "gen_logits_std": 3.0990688800811768, "gen_loss": 0.301348477602005, "grad_norm": 0.33770478013252075, "learning_rate": 2.2650947368421052e-05, "loss": 0.267, "mean_copy_accuracy": 0.9968129992485046, "mean_gen_accuracy": 0.8838911056518555, "mean_token_accuracy": 0.909474328160286, "num_tokens": 693152919.0, "sample_num_tokens": 7417.75, "step": 6554, "total_num_tokens": 693182590.0, "z_loss": 0.0006539457244798541 }, { "copy_logits_max": -2.7673850059509277, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.3125, "epoch": 1.338830737809548, "gen_logits_max": 3.34293270111084, "gen_logits_mean": -17.08495330810547, "gen_logits_min": -29.258989334106445, "gen_logits_std": 3.1277332305908203, "gen_loss": 0.23313233256340027, "grad_norm": 0.38964428532407724, "learning_rate": 2.2649684210526316e-05, "loss": 0.289, "mean_copy_accuracy": 0.9968315809965134, "mean_gen_accuracy": 0.8735257685184479, "mean_token_accuracy": 0.9009996354579926, "num_tokens": 693433682.0, "sample_num_tokens": 8433.5, "step": 6555, "total_num_tokens": 693467416.0, "z_loss": 0.0004790381935890764 }, { "copy_logits_max": -5.020920753479004, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.8125, "epoch": 1.339034975746745, "gen_logits_max": 4.197521686553955, "gen_logits_mean": -15.38209342956543, "gen_logits_min": -27.611209869384766, "gen_logits_std": 3.0984835624694824, "gen_loss": 0.23651662468910217, "grad_norm": 0.35108700104246, "learning_rate": 2.2648421052631577e-05, "loss": 0.2554, "mean_copy_accuracy": 0.995691254734993, "mean_gen_accuracy": 0.8865728825330734, "mean_token_accuracy": 0.9147219061851501, "num_tokens": 693706399.0, "sample_num_tokens": 8910.75, "step": 6556, "total_num_tokens": 693742042.0, "z_loss": 0.00044384237844496965 }, { "copy_logits_max": -7.064850807189941, "copy_logits_min": -687500032.0, "copy_num_tokens": 370.8125, "epoch": 1.3392392136839417, "gen_logits_max": 3.8711557388305664, "gen_logits_mean": -17.98271369934082, "gen_logits_min": -29.60474967956543, "gen_logits_std": 3.122136116027832, "gen_loss": 0.2587553858757019, "grad_norm": 0.38295675795561895, "learning_rate": 2.264715789473684e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9959115236997604, "mean_gen_accuracy": 0.8775358647108078, "mean_token_accuracy": 0.9057979583740234, "num_tokens": 693980844.0, "sample_num_tokens": 8562.5, "step": 6557, "total_num_tokens": 694015094.0, "z_loss": 0.00047307118074968457 }, { "copy_logits_max": -5.462160587310791, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.5, "epoch": 1.3394434516211386, "gen_logits_max": 3.6792733669281006, "gen_logits_mean": -16.610870361328125, "gen_logits_min": -28.384809494018555, "gen_logits_std": 3.1001086235046387, "gen_loss": 0.29724574089050293, "grad_norm": 0.3687235329382155, "learning_rate": 2.2645894736842106e-05, "loss": 0.2801, "mean_copy_accuracy": 0.995788961648941, "mean_gen_accuracy": 0.8751515001058578, "mean_token_accuracy": 0.9027807712554932, "num_tokens": 694237419.0, "sample_num_tokens": 7777.75, "step": 6558, "total_num_tokens": 694268530.0, "z_loss": 0.0005367798148654401 }, { "copy_logits_max": -4.292899131774902, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.0, "epoch": 1.3396476895583356, "gen_logits_max": 4.300265312194824, "gen_logits_mean": -16.619857788085938, "gen_logits_min": -28.8175048828125, "gen_logits_std": 3.131612539291382, "gen_loss": 0.27391481399536133, "grad_norm": 0.37481084722340957, "learning_rate": 2.264463157894737e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9952320605516434, "mean_gen_accuracy": 0.8701293915510178, "mean_token_accuracy": 0.9013227373361588, "num_tokens": 694507402.0, "sample_num_tokens": 8064.5, "step": 6559, "total_num_tokens": 694539660.0, "z_loss": 0.0005812240997329354 }, { "copy_logits_max": -4.114984512329102, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.4375, "epoch": 1.3398519274955323, "gen_logits_max": 4.2325544357299805, "gen_logits_mean": -15.641206741333008, "gen_logits_min": -28.038118362426758, "gen_logits_std": 3.091742992401123, "gen_loss": 0.28615355491638184, "grad_norm": 0.374369428002252, "learning_rate": 2.2643368421052634e-05, "loss": 0.2667, "mean_copy_accuracy": 0.996897280216217, "mean_gen_accuracy": 0.8800756931304932, "mean_token_accuracy": 0.9100717455148697, "num_tokens": 694785673.0, "sample_num_tokens": 10465.25, "step": 6560, "total_num_tokens": 694827534.0, "z_loss": 0.0005545002641156316 }, { "copy_logits_max": -5.069094657897949, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.0625, "epoch": 1.3400561654327292, "gen_logits_max": 3.0364251136779785, "gen_logits_mean": -17.160980224609375, "gen_logits_min": -29.406177520751953, "gen_logits_std": 3.1449451446533203, "gen_loss": 0.2791061997413635, "grad_norm": 0.3793236140744814, "learning_rate": 2.2642105263157895e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9961660504341125, "mean_gen_accuracy": 0.8743676990270615, "mean_token_accuracy": 0.9059399366378784, "num_tokens": 695071514.0, "sample_num_tokens": 7588.0, "step": 6561, "total_num_tokens": 695101866.0, "z_loss": 0.0005240606842562556 }, { "copy_logits_max": -4.450784206390381, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.4375, "epoch": 1.340260403369926, "gen_logits_max": 4.385562896728516, "gen_logits_mean": -16.26511573791504, "gen_logits_min": -28.657512664794922, "gen_logits_std": 3.1247239112854004, "gen_loss": 0.2955741286277771, "grad_norm": 0.42252501622878785, "learning_rate": 2.264084210526316e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9954512566328049, "mean_gen_accuracy": 0.870649442076683, "mean_token_accuracy": 0.9004447609186172, "num_tokens": 695352808.0, "sample_num_tokens": 8430.5, "step": 6562, "total_num_tokens": 695386530.0, "z_loss": 0.0006028305506333709 }, { "copy_logits_max": -5.3709564208984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.1875, "epoch": 1.3404646413071228, "gen_logits_max": 5.064457893371582, "gen_logits_mean": -15.331857681274414, "gen_logits_min": -27.102935791015625, "gen_logits_std": 3.081709861755371, "gen_loss": 0.2858986258506775, "grad_norm": 0.3806278583399033, "learning_rate": 2.263957894736842e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9956945031881332, "mean_gen_accuracy": 0.8724417239427567, "mean_token_accuracy": 0.8980338275432587, "num_tokens": 695617769.0, "sample_num_tokens": 7614.75, "step": 6563, "total_num_tokens": 695648228.0, "z_loss": 0.0005203754408285022 }, { "copy_logits_max": -4.591614723205566, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.25, "epoch": 1.3406688792443195, "gen_logits_max": 3.9898574352264404, "gen_logits_mean": -16.089067459106445, "gen_logits_min": -28.310474395751953, "gen_logits_std": 3.1487298011779785, "gen_loss": 0.21820397675037384, "grad_norm": 0.37807623327876955, "learning_rate": 2.2638315789473685e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9959382861852646, "mean_gen_accuracy": 0.8865178674459457, "mean_token_accuracy": 0.9132457375526428, "num_tokens": 695884353.0, "sample_num_tokens": 8754.25, "step": 6564, "total_num_tokens": 695919370.0, "z_loss": 0.0004222468996886164 }, { "copy_logits_max": -6.6397199630737305, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.75, "epoch": 1.3408731171815165, "gen_logits_max": 4.110089302062988, "gen_logits_mean": -16.537416458129883, "gen_logits_min": -28.855358123779297, "gen_logits_std": 3.1372878551483154, "gen_loss": 0.2798440456390381, "grad_norm": 0.36569484711263694, "learning_rate": 2.2637052631578946e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9965171068906784, "mean_gen_accuracy": 0.8812166005373001, "mean_token_accuracy": 0.9086077511310577, "num_tokens": 696145351.0, "sample_num_tokens": 7949.75, "step": 6565, "total_num_tokens": 696177150.0, "z_loss": 0.0005179460858926177 }, { "copy_logits_max": -4.089846611022949, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.25, "epoch": 1.3410773551187134, "gen_logits_max": 3.255779981613159, "gen_logits_mean": -16.216522216796875, "gen_logits_min": -28.386974334716797, "gen_logits_std": 3.100248098373413, "gen_loss": 0.2683025598526001, "grad_norm": 0.3757226197620017, "learning_rate": 2.2635789473684213e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9959108233451843, "mean_gen_accuracy": 0.875513032078743, "mean_token_accuracy": 0.9041671752929688, "num_tokens": 696387217.0, "sample_num_tokens": 8765.25, "step": 6566, "total_num_tokens": 696422278.0, "z_loss": 0.00048596804845146835 }, { "copy_logits_max": -4.47865104675293, "copy_logits_min": -750000064.0, "copy_num_tokens": 482.25, "epoch": 1.34128159305591, "gen_logits_max": 3.2525110244750977, "gen_logits_mean": -17.293689727783203, "gen_logits_min": -29.71551513671875, "gen_logits_std": 3.1635522842407227, "gen_loss": 0.29740482568740845, "grad_norm": 0.3797933619764729, "learning_rate": 2.2634526315789474e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9960908591747284, "mean_gen_accuracy": 0.8775043338537216, "mean_token_accuracy": 0.9110359847545624, "num_tokens": 696686907.0, "sample_num_tokens": 8253.75, "step": 6567, "total_num_tokens": 696719922.0, "z_loss": 0.0005233107367530465 }, { "copy_logits_max": -4.870865345001221, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.6875, "epoch": 1.341485830993107, "gen_logits_max": 3.36918306350708, "gen_logits_mean": -16.700468063354492, "gen_logits_min": -29.005157470703125, "gen_logits_std": 3.1503822803497314, "gen_loss": 0.27058130502700806, "grad_norm": 0.38671153697597926, "learning_rate": 2.263326315789474e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9961851388216019, "mean_gen_accuracy": 0.8818233907222748, "mean_token_accuracy": 0.9091347754001617, "num_tokens": 696935348.0, "sample_num_tokens": 7713.0, "step": 6568, "total_num_tokens": 696966200.0, "z_loss": 0.00048130418872460723 }, { "copy_logits_max": -3.857123374938965, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.5, "epoch": 1.3416900689303037, "gen_logits_max": 4.098745346069336, "gen_logits_mean": -16.17101287841797, "gen_logits_min": -28.545257568359375, "gen_logits_std": 3.144167900085449, "gen_loss": 0.31199419498443604, "grad_norm": 0.3631880530839184, "learning_rate": 2.2632e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9961895644664764, "mean_gen_accuracy": 0.869824007153511, "mean_token_accuracy": 0.8988121002912521, "num_tokens": 697197751.0, "sample_num_tokens": 8250.25, "step": 6569, "total_num_tokens": 697230752.0, "z_loss": 0.0005835675401613116 }, { "copy_logits_max": -6.745532035827637, "copy_logits_min": -687500032.0, "copy_num_tokens": 470.25, "epoch": 1.3418943068675007, "gen_logits_max": 4.338259696960449, "gen_logits_mean": -15.47946834564209, "gen_logits_min": -27.817386627197266, "gen_logits_std": 3.1041316986083984, "gen_loss": 0.3010963797569275, "grad_norm": 0.39174354159422015, "learning_rate": 2.2630736842105264e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9959051758050919, "mean_gen_accuracy": 0.8721776604652405, "mean_token_accuracy": 0.8987876921892166, "num_tokens": 697456427.0, "sample_num_tokens": 9187.75, "step": 6570, "total_num_tokens": 697493178.0, "z_loss": 0.0004971501766704023 }, { "copy_logits_max": -5.465533256530762, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.625, "epoch": 1.3420985448046974, "gen_logits_max": 3.552600383758545, "gen_logits_mean": -16.986160278320312, "gen_logits_min": -29.253963470458984, "gen_logits_std": 3.1359100341796875, "gen_loss": 0.27192485332489014, "grad_norm": 0.4316804051385579, "learning_rate": 2.2629473684210525e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9948982000350952, "mean_gen_accuracy": 0.8736418336629868, "mean_token_accuracy": 0.9003600925207138, "num_tokens": 697732046.0, "sample_num_tokens": 8307.0, "step": 6571, "total_num_tokens": 697765274.0, "z_loss": 0.00047723253373987973 }, { "copy_logits_max": -4.03240442276001, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.375, "epoch": 1.3423027827418943, "gen_logits_max": 3.354966163635254, "gen_logits_mean": -16.164417266845703, "gen_logits_min": -28.456172943115234, "gen_logits_std": 3.128063678741455, "gen_loss": 0.25525468587875366, "grad_norm": 0.42061602186878955, "learning_rate": 2.262821052631579e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9957025498151779, "mean_gen_accuracy": 0.8780168741941452, "mean_token_accuracy": 0.9076466262340546, "num_tokens": 697987083.0, "sample_num_tokens": 9633.75, "step": 6572, "total_num_tokens": 698025618.0, "z_loss": 0.00048598405555821955 }, { "copy_logits_max": -4.626946449279785, "copy_logits_min": -625000064.0, "copy_num_tokens": 393.875, "epoch": 1.3425070206790912, "gen_logits_max": 4.113490104675293, "gen_logits_mean": -16.847293853759766, "gen_logits_min": -28.860008239746094, "gen_logits_std": 3.1110668182373047, "gen_loss": 0.3213393986225128, "grad_norm": 0.39160539456372107, "learning_rate": 2.2626947368421053e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9956508725881577, "mean_gen_accuracy": 0.8676755875349045, "mean_token_accuracy": 0.8940568715333939, "num_tokens": 698240245.0, "sample_num_tokens": 8327.25, "step": 6573, "total_num_tokens": 698273554.0, "z_loss": 0.00063035060884431 }, { "copy_logits_max": -2.2967846393585205, "copy_logits_min": -750000000.0, "copy_num_tokens": 721.125, "epoch": 1.342711258616288, "gen_logits_max": 5.008350372314453, "gen_logits_mean": -15.609908103942871, "gen_logits_min": -27.908124923706055, "gen_logits_std": 3.111912965774536, "gen_loss": 0.22971853613853455, "grad_norm": 0.42294944058251605, "learning_rate": 2.2625684210526318e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9971075057983398, "mean_gen_accuracy": 0.8772002160549164, "mean_token_accuracy": 0.90708427131176, "num_tokens": 698526633.0, "sample_num_tokens": 10972.25, "step": 6574, "total_num_tokens": 698570522.0, "z_loss": 0.0004802424809895456 }, { "copy_logits_max": -6.060097694396973, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.375, "epoch": 1.3429154965534849, "gen_logits_max": 4.614135265350342, "gen_logits_mean": -16.097692489624023, "gen_logits_min": -28.27446174621582, "gen_logits_std": 3.1177053451538086, "gen_loss": 0.28757116198539734, "grad_norm": 0.33828229619882744, "learning_rate": 2.2624421052631582e-05, "loss": 0.2546, "mean_copy_accuracy": 0.9965378791093826, "mean_gen_accuracy": 0.8876163512468338, "mean_token_accuracy": 0.9126801639795303, "num_tokens": 698796897.0, "sample_num_tokens": 8246.75, "step": 6575, "total_num_tokens": 698829884.0, "z_loss": 0.0005202849861234426 }, { "copy_logits_max": -4.481562614440918, "copy_logits_min": -750000064.0, "copy_num_tokens": 526.6875, "epoch": 1.3431197344906818, "gen_logits_max": 4.71485710144043, "gen_logits_mean": -14.791129112243652, "gen_logits_min": -26.984619140625, "gen_logits_std": 3.1096882820129395, "gen_loss": 0.2688264548778534, "grad_norm": 0.3777702297444188, "learning_rate": 2.2623157894736843e-05, "loss": 0.2576, "mean_copy_accuracy": 0.9970473796129227, "mean_gen_accuracy": 0.8834833204746246, "mean_token_accuracy": 0.9140583276748657, "num_tokens": 699067940.0, "sample_num_tokens": 8626.5, "step": 6576, "total_num_tokens": 699102446.0, "z_loss": 0.00044646774767898023 }, { "copy_logits_max": -1.0449787378311157, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.125, "epoch": 1.3433239724278785, "gen_logits_max": 4.985639572143555, "gen_logits_mean": -14.533087730407715, "gen_logits_min": -26.76013946533203, "gen_logits_std": 3.0994341373443604, "gen_loss": 0.2971891164779663, "grad_norm": 0.37891332033150116, "learning_rate": 2.2621894736842107e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9964220225811005, "mean_gen_accuracy": 0.8732371926307678, "mean_token_accuracy": 0.9050422608852386, "num_tokens": 699354658.0, "sample_num_tokens": 7719.5, "step": 6577, "total_num_tokens": 699385536.0, "z_loss": 0.0005430266028270125 }, { "copy_logits_max": -6.460738182067871, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.875, "epoch": 1.3435282103650752, "gen_logits_max": 5.077446937561035, "gen_logits_mean": -15.619842529296875, "gen_logits_min": -27.49538230895996, "gen_logits_std": 3.0872962474823, "gen_loss": 0.26587721705436707, "grad_norm": 0.3871284987424396, "learning_rate": 2.2620631578947368e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9955501407384872, "mean_gen_accuracy": 0.8815285861492157, "mean_token_accuracy": 0.9063916057348251, "num_tokens": 699622044.0, "sample_num_tokens": 9175.5, "step": 6578, "total_num_tokens": 699658746.0, "z_loss": 0.0004328516952227801 }, { "copy_logits_max": -3.525268316268921, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0625, "epoch": 1.3437324483022721, "gen_logits_max": 4.463117599487305, "gen_logits_mean": -15.15412712097168, "gen_logits_min": -27.766525268554688, "gen_logits_std": 3.0752742290496826, "gen_loss": 0.2660726308822632, "grad_norm": 0.3543855827256659, "learning_rate": 2.2619368421052632e-05, "loss": 0.2745, "mean_copy_accuracy": 0.99654321372509, "mean_gen_accuracy": 0.8771426230669022, "mean_token_accuracy": 0.9065282940864563, "num_tokens": 699892320.0, "sample_num_tokens": 7700.5, "step": 6579, "total_num_tokens": 699923122.0, "z_loss": 0.0005405659321695566 }, { "copy_logits_max": -2.9629006385803223, "copy_logits_min": -687500032.0, "copy_num_tokens": 692.0, "epoch": 1.343936686239469, "gen_logits_max": 4.378481388092041, "gen_logits_mean": -14.312214851379395, "gen_logits_min": -26.80682373046875, "gen_logits_std": 3.080716133117676, "gen_loss": 0.28347161412239075, "grad_norm": 0.37638271709435484, "learning_rate": 2.2618105263157893e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9964499473571777, "mean_gen_accuracy": 0.8762281537055969, "mean_token_accuracy": 0.9058569669723511, "num_tokens": 700150963.0, "sample_num_tokens": 10427.25, "step": 6580, "total_num_tokens": 700192672.0, "z_loss": 0.0004927248228341341 }, { "copy_logits_max": -4.247468948364258, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.1875, "epoch": 1.3441409241766658, "gen_logits_max": 4.672722816467285, "gen_logits_mean": -14.925655364990234, "gen_logits_min": -26.678861618041992, "gen_logits_std": 3.0393457412719727, "gen_loss": 0.2709924280643463, "grad_norm": 0.3844922419848696, "learning_rate": 2.2616842105263158e-05, "loss": 0.2948, "mean_copy_accuracy": 0.995379164814949, "mean_gen_accuracy": 0.8751912117004395, "mean_token_accuracy": 0.8967534750699997, "num_tokens": 700406928.0, "sample_num_tokens": 7784.5, "step": 6581, "total_num_tokens": 700438066.0, "z_loss": 0.00044481089571490884 }, { "copy_logits_max": -4.148784160614014, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.125, "epoch": 1.3443451621138627, "gen_logits_max": 3.925609827041626, "gen_logits_mean": -15.206514358520508, "gen_logits_min": -28.299942016601562, "gen_logits_std": 3.11661958694458, "gen_loss": 0.2522197365760803, "grad_norm": 0.5642809550405921, "learning_rate": 2.2615578947368422e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9950543344020844, "mean_gen_accuracy": 0.8685343861579895, "mean_token_accuracy": 0.9035215675830841, "num_tokens": 700658172.0, "sample_num_tokens": 8035.0, "step": 6582, "total_num_tokens": 700690312.0, "z_loss": 0.0004891870776191354 }, { "copy_logits_max": -3.277348279953003, "copy_logits_min": -750000000.0, "copy_num_tokens": 831.5625, "epoch": 1.3445494000510596, "gen_logits_max": 2.771749496459961, "gen_logits_mean": -16.629262924194336, "gen_logits_min": -28.934282302856445, "gen_logits_std": 3.1394667625427246, "gen_loss": 0.23893237113952637, "grad_norm": 0.36187984205849605, "learning_rate": 2.2614315789473686e-05, "loss": 0.266, "mean_copy_accuracy": 0.9962148517370224, "mean_gen_accuracy": 0.8791764080524445, "mean_token_accuracy": 0.9103375375270844, "num_tokens": 700932433.0, "sample_num_tokens": 10110.25, "step": 6583, "total_num_tokens": 700972874.0, "z_loss": 0.0004749794607050717 }, { "copy_logits_max": -5.431003570556641, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.25, "epoch": 1.3447536379882563, "gen_logits_max": 5.445555686950684, "gen_logits_mean": -14.899906158447266, "gen_logits_min": -27.02765655517578, "gen_logits_std": 3.081214427947998, "gen_loss": 0.3483583927154541, "grad_norm": 0.414791318669763, "learning_rate": 2.2613052631578947e-05, "loss": 0.293, "mean_copy_accuracy": 0.9946922957897186, "mean_gen_accuracy": 0.8772843927145004, "mean_token_accuracy": 0.9001785814762115, "num_tokens": 701180987.0, "sample_num_tokens": 7337.25, "step": 6584, "total_num_tokens": 701210336.0, "z_loss": 0.0007017937605269253 }, { "copy_logits_max": -3.7447781562805176, "copy_logits_min": -750000128.0, "copy_num_tokens": 557.4375, "epoch": 1.344957875925453, "gen_logits_max": 3.334526300430298, "gen_logits_mean": -15.780515670776367, "gen_logits_min": -28.357479095458984, "gen_logits_std": 3.0890188217163086, "gen_loss": 0.2877032160758972, "grad_norm": 0.3758012200532889, "learning_rate": 2.261178947368421e-05, "loss": 0.2631, "mean_copy_accuracy": 0.9972344040870667, "mean_gen_accuracy": 0.875405877828598, "mean_token_accuracy": 0.9107231497764587, "num_tokens": 701472068.0, "sample_num_tokens": 8662.5, "step": 6585, "total_num_tokens": 701506718.0, "z_loss": 0.0005587043706327677 }, { "copy_logits_max": -4.28460168838501, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.3125, "epoch": 1.34516211386265, "gen_logits_max": 2.4215168952941895, "gen_logits_mean": -16.796894073486328, "gen_logits_min": -28.732582092285156, "gen_logits_std": 3.1314311027526855, "gen_loss": 0.2563071548938751, "grad_norm": 0.3754769963446143, "learning_rate": 2.2610526315789472e-05, "loss": 0.261, "mean_copy_accuracy": 0.9970604926347733, "mean_gen_accuracy": 0.877518817782402, "mean_token_accuracy": 0.9096876084804535, "num_tokens": 701758976.0, "sample_num_tokens": 8143.5, "step": 6586, "total_num_tokens": 701791550.0, "z_loss": 0.0004473417648114264 }, { "copy_logits_max": -5.381012916564941, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.375, "epoch": 1.3453663517998469, "gen_logits_max": 3.614835739135742, "gen_logits_mean": -16.713726043701172, "gen_logits_min": -28.520870208740234, "gen_logits_std": 3.0984396934509277, "gen_loss": 0.30397409200668335, "grad_norm": 0.38851561176961524, "learning_rate": 2.2609263157894737e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9964972138404846, "mean_gen_accuracy": 0.8696035593748093, "mean_token_accuracy": 0.900561973452568, "num_tokens": 702046451.0, "sample_num_tokens": 8422.75, "step": 6587, "total_num_tokens": 702080142.0, "z_loss": 0.0005648258375003934 }, { "copy_logits_max": -4.645376205444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.625, "epoch": 1.3455705897370436, "gen_logits_max": 4.923797130584717, "gen_logits_mean": -14.77065658569336, "gen_logits_min": -26.95559310913086, "gen_logits_std": 3.1195437908172607, "gen_loss": 0.27825111150741577, "grad_norm": 0.39902841510097925, "learning_rate": 2.2608e-05, "loss": 0.2942, "mean_copy_accuracy": 0.9961119294166565, "mean_gen_accuracy": 0.8677746653556824, "mean_token_accuracy": 0.9020951986312866, "num_tokens": 702319854.0, "sample_num_tokens": 8401.0, "step": 6588, "total_num_tokens": 702353458.0, "z_loss": 0.0005035764188505709 }, { "copy_logits_max": -4.367063522338867, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.5625, "epoch": 1.3457748276742405, "gen_logits_max": 3.008553981781006, "gen_logits_mean": -16.60494613647461, "gen_logits_min": -28.46135902404785, "gen_logits_std": 3.0913941860198975, "gen_loss": 0.26947012543678284, "grad_norm": 0.3921165210745269, "learning_rate": 2.2606736842105262e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9976738095283508, "mean_gen_accuracy": 0.8781252354383469, "mean_token_accuracy": 0.9100062847137451, "num_tokens": 702602991.0, "sample_num_tokens": 9049.25, "step": 6589, "total_num_tokens": 702639188.0, "z_loss": 0.00044379502651281655 }, { "copy_logits_max": -6.788955211639404, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.125, "epoch": 1.3459790656114374, "gen_logits_max": 3.8343987464904785, "gen_logits_mean": -16.369470596313477, "gen_logits_min": -28.500858306884766, "gen_logits_std": 3.100588321685791, "gen_loss": 0.312553733587265, "grad_norm": 0.3454915823663676, "learning_rate": 2.260547368421053e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9963783770799637, "mean_gen_accuracy": 0.8731798082590103, "mean_token_accuracy": 0.9023858308792114, "num_tokens": 702866650.0, "sample_num_tokens": 8679.5, "step": 6590, "total_num_tokens": 702901368.0, "z_loss": 0.0005274752038531005 }, { "copy_logits_max": -5.508486747741699, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.9375, "epoch": 1.3461833035486341, "gen_logits_max": 3.001905918121338, "gen_logits_mean": -17.44777488708496, "gen_logits_min": -29.906347274780273, "gen_logits_std": 3.1620559692382812, "gen_loss": 0.2693502902984619, "grad_norm": 0.39393749773317865, "learning_rate": 2.260421052631579e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9960215240716934, "mean_gen_accuracy": 0.8760631233453751, "mean_token_accuracy": 0.903042808175087, "num_tokens": 703124997.0, "sample_num_tokens": 8599.25, "step": 6591, "total_num_tokens": 703159394.0, "z_loss": 0.0005283001228235662 }, { "copy_logits_max": -4.8757500648498535, "copy_logits_min": -750000064.0, "copy_num_tokens": 507.0625, "epoch": 1.346387541485831, "gen_logits_max": 3.5839622020721436, "gen_logits_mean": -16.070209503173828, "gen_logits_min": -29.104076385498047, "gen_logits_std": 3.1195621490478516, "gen_loss": 0.268368661403656, "grad_norm": 0.4618820393949135, "learning_rate": 2.2602947368421055e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9956637471914291, "mean_gen_accuracy": 0.877029225230217, "mean_token_accuracy": 0.9051517993211746, "num_tokens": 703394484.0, "sample_num_tokens": 8434.0, "step": 6592, "total_num_tokens": 703428220.0, "z_loss": 0.0004902899963781238 }, { "copy_logits_max": -6.358151435852051, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.5625, "epoch": 1.3465917794230278, "gen_logits_max": 4.855190753936768, "gen_logits_mean": -15.165945053100586, "gen_logits_min": -28.24167251586914, "gen_logits_std": 3.0920839309692383, "gen_loss": 0.2688225507736206, "grad_norm": 0.4032586245509992, "learning_rate": 2.2601684210526316e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9971485435962677, "mean_gen_accuracy": 0.8763281852006912, "mean_token_accuracy": 0.9035099595785141, "num_tokens": 703649661.0, "sample_num_tokens": 7816.75, "step": 6593, "total_num_tokens": 703680928.0, "z_loss": 0.000466890080133453 }, { "copy_logits_max": -5.84097146987915, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.25, "epoch": 1.3467960173602247, "gen_logits_max": 3.639209747314453, "gen_logits_mean": -16.329530715942383, "gen_logits_min": -29.090513229370117, "gen_logits_std": 3.1224451065063477, "gen_loss": 0.2750547528266907, "grad_norm": 0.4000233879267909, "learning_rate": 2.260042105263158e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9966974407434464, "mean_gen_accuracy": 0.877548336982727, "mean_token_accuracy": 0.9082216918468475, "num_tokens": 703913473.0, "sample_num_tokens": 7875.25, "step": 6594, "total_num_tokens": 703944974.0, "z_loss": 0.0004823396448045969 }, { "copy_logits_max": -4.8012542724609375, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.0625, "epoch": 1.3470002552974214, "gen_logits_max": 5.2385053634643555, "gen_logits_mean": -13.899843215942383, "gen_logits_min": -26.754053115844727, "gen_logits_std": 3.0751373767852783, "gen_loss": 0.2949255406856537, "grad_norm": 0.3839108013880036, "learning_rate": 2.259915789473684e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9958759546279907, "mean_gen_accuracy": 0.8762904107570648, "mean_token_accuracy": 0.9045742005109787, "num_tokens": 704179582.0, "sample_num_tokens": 7804.0, "step": 6595, "total_num_tokens": 704210798.0, "z_loss": 0.0005128620541654527 }, { "copy_logits_max": -3.8489584922790527, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.3125, "epoch": 1.3472044932346183, "gen_logits_max": 4.247929096221924, "gen_logits_mean": -15.846481323242188, "gen_logits_min": -28.32238006591797, "gen_logits_std": 3.1428747177124023, "gen_loss": 0.26457929611206055, "grad_norm": 0.41160824052595235, "learning_rate": 2.2597894736842105e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9961304664611816, "mean_gen_accuracy": 0.8781538605690002, "mean_token_accuracy": 0.9095191955566406, "num_tokens": 704447479.0, "sample_num_tokens": 8610.75, "step": 6596, "total_num_tokens": 704481922.0, "z_loss": 0.00046640148502774537 }, { "copy_logits_max": -3.2385926246643066, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.4375, "epoch": 1.3474087311718153, "gen_logits_max": 4.773658752441406, "gen_logits_mean": -14.227507591247559, "gen_logits_min": -27.831541061401367, "gen_logits_std": 3.116696834564209, "gen_loss": 0.2822266221046448, "grad_norm": 0.35045733785470157, "learning_rate": 2.2596631578947366e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9966490864753723, "mean_gen_accuracy": 0.8716793060302734, "mean_token_accuracy": 0.9060752242803574, "num_tokens": 704756671.0, "sample_num_tokens": 9476.75, "step": 6597, "total_num_tokens": 704794578.0, "z_loss": 0.0005741209024563432 }, { "copy_logits_max": -2.323092460632324, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.6875, "epoch": 1.347612969109012, "gen_logits_max": 6.191499710083008, "gen_logits_mean": -13.45680046081543, "gen_logits_min": -26.429052352905273, "gen_logits_std": 3.0759525299072266, "gen_loss": 0.32238972187042236, "grad_norm": 0.38962460564675044, "learning_rate": 2.2595368421052634e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9960283190011978, "mean_gen_accuracy": 0.8683695495128632, "mean_token_accuracy": 0.8997458219528198, "num_tokens": 705018835.0, "sample_num_tokens": 7392.25, "step": 6598, "total_num_tokens": 705048404.0, "z_loss": 0.0006190167041495442 }, { "copy_logits_max": -4.323176860809326, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.3125, "epoch": 1.3478172070462089, "gen_logits_max": 5.6257219314575195, "gen_logits_mean": -14.838632583618164, "gen_logits_min": -27.25581932067871, "gen_logits_std": 3.076404571533203, "gen_loss": 0.29237598180770874, "grad_norm": 0.42321066959687326, "learning_rate": 2.2594105263157895e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9964706152677536, "mean_gen_accuracy": 0.8757417052984238, "mean_token_accuracy": 0.9044989198446274, "num_tokens": 705284503.0, "sample_num_tokens": 7973.25, "step": 6599, "total_num_tokens": 705316396.0, "z_loss": 0.0005518762627616525 }, { "copy_logits_max": -2.622628688812256, "copy_logits_min": -750000128.0, "copy_num_tokens": 557.125, "epoch": 1.3480214449834056, "gen_logits_max": 3.784954071044922, "gen_logits_mean": -15.97028923034668, "gen_logits_min": -28.555530548095703, "gen_logits_std": 3.1398816108703613, "gen_loss": 0.2765182554721832, "grad_norm": 0.3948567902887177, "learning_rate": 2.259284210526316e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9963947087526321, "mean_gen_accuracy": 0.8709908872842789, "mean_token_accuracy": 0.9077162146568298, "num_tokens": 705552907.0, "sample_num_tokens": 8219.75, "step": 6600, "total_num_tokens": 705585786.0, "z_loss": 0.00047719920985400677 }, { "copy_logits_max": -1.969114065170288, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.375, "epoch": 1.3482256829206025, "gen_logits_max": 5.286192417144775, "gen_logits_mean": -14.444003105163574, "gen_logits_min": -26.566307067871094, "gen_logits_std": 3.1495327949523926, "gen_loss": 0.31049302220344543, "grad_norm": 0.3865216836948732, "learning_rate": 2.2591578947368424e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9965060204267502, "mean_gen_accuracy": 0.8732784539461136, "mean_token_accuracy": 0.9059888422489166, "num_tokens": 705830588.0, "sample_num_tokens": 7761.5, "step": 6601, "total_num_tokens": 705861634.0, "z_loss": 0.0005790800205431879 }, { "copy_logits_max": -4.796323299407959, "copy_logits_min": -750000128.0, "copy_num_tokens": 654.5625, "epoch": 1.3484299208577992, "gen_logits_max": 4.100314617156982, "gen_logits_mean": -16.34419822692871, "gen_logits_min": -28.520408630371094, "gen_logits_std": 3.162240505218506, "gen_loss": 0.26202088594436646, "grad_norm": 0.37043304684225853, "learning_rate": 2.2590315789473684e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9963973015546799, "mean_gen_accuracy": 0.8819075375795364, "mean_token_accuracy": 0.9112242460250854, "num_tokens": 706098707.0, "sample_num_tokens": 9572.25, "step": 6602, "total_num_tokens": 706136996.0, "z_loss": 0.000504880677908659 }, { "copy_logits_max": -4.765923500061035, "copy_logits_min": -625000064.0, "copy_num_tokens": 530.875, "epoch": 1.3486341587949962, "gen_logits_max": 4.26362419128418, "gen_logits_mean": -16.0592041015625, "gen_logits_min": -28.490943908691406, "gen_logits_std": 3.1541600227355957, "gen_loss": 0.2857798635959625, "grad_norm": 0.35022089680693413, "learning_rate": 2.258905263157895e-05, "loss": 0.2854, "mean_copy_accuracy": 0.996914267539978, "mean_gen_accuracy": 0.8725209683179855, "mean_token_accuracy": 0.9022127985954285, "num_tokens": 706368353.0, "sample_num_tokens": 8562.25, "step": 6603, "total_num_tokens": 706402602.0, "z_loss": 0.0005827454151585698 }, { "copy_logits_max": -4.797074317932129, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.25, "epoch": 1.348838396732193, "gen_logits_max": 7.03125, "gen_logits_mean": -12.351236343383789, "gen_logits_min": -25.378192901611328, "gen_logits_std": 3.06711483001709, "gen_loss": 0.28902697563171387, "grad_norm": 0.3944317254408973, "learning_rate": 2.258778947368421e-05, "loss": 0.278, "mean_copy_accuracy": 0.996360570192337, "mean_gen_accuracy": 0.8788669109344482, "mean_token_accuracy": 0.907183051109314, "num_tokens": 706640464.0, "sample_num_tokens": 6938.5, "step": 6604, "total_num_tokens": 706668218.0, "z_loss": 0.0005712012061849236 }, { "copy_logits_max": -3.7312097549438477, "copy_logits_min": -687500032.0, "copy_num_tokens": 511.4375, "epoch": 1.3490426346693898, "gen_logits_max": 5.144283771514893, "gen_logits_mean": -15.492618560791016, "gen_logits_min": -27.93830108642578, "gen_logits_std": 3.1603221893310547, "gen_loss": 0.2438502162694931, "grad_norm": 0.3854725387391419, "learning_rate": 2.2586526315789474e-05, "loss": 0.2541, "mean_copy_accuracy": 0.995824933052063, "mean_gen_accuracy": 0.8821417987346649, "mean_token_accuracy": 0.9124131202697754, "num_tokens": 706908300.0, "sample_num_tokens": 8711.0, "step": 6605, "total_num_tokens": 706943144.0, "z_loss": 0.0004490095598157495 }, { "copy_logits_max": -4.972557067871094, "copy_logits_min": -750000064.0, "copy_num_tokens": 406.625, "epoch": 1.3492468726065867, "gen_logits_max": 4.844810962677002, "gen_logits_mean": -16.656719207763672, "gen_logits_min": -28.95571517944336, "gen_logits_std": 3.124917507171631, "gen_loss": 0.28108060359954834, "grad_norm": 0.38909841458509153, "learning_rate": 2.2585263157894735e-05, "loss": 0.271, "mean_copy_accuracy": 0.9966705590486526, "mean_gen_accuracy": 0.8752633035182953, "mean_token_accuracy": 0.9085641652345657, "num_tokens": 707194135.0, "sample_num_tokens": 8289.25, "step": 6606, "total_num_tokens": 707227292.0, "z_loss": 0.0005228708032518625 }, { "copy_logits_max": -3.696070432662964, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.5, "epoch": 1.3494511105437836, "gen_logits_max": 5.5664873123168945, "gen_logits_mean": -13.472064971923828, "gen_logits_min": -25.61737823486328, "gen_logits_std": 3.0493416786193848, "gen_loss": 0.27654722332954407, "grad_norm": 0.4102504083910517, "learning_rate": 2.2584000000000003e-05, "loss": 0.2945, "mean_copy_accuracy": 0.995606392621994, "mean_gen_accuracy": 0.8679730594158173, "mean_token_accuracy": 0.9013810753822327, "num_tokens": 707463621.0, "sample_num_tokens": 8313.25, "step": 6607, "total_num_tokens": 707496874.0, "z_loss": 0.0005055044312030077 }, { "copy_logits_max": -2.353780746459961, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.625, "epoch": 1.3496553484809803, "gen_logits_max": 6.03280782699585, "gen_logits_mean": -14.00021743774414, "gen_logits_min": -26.00029754638672, "gen_logits_std": 3.0531110763549805, "gen_loss": 0.30540183186531067, "grad_norm": 0.3594851239240252, "learning_rate": 2.2582736842105264e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9965906590223312, "mean_gen_accuracy": 0.8743969649076462, "mean_token_accuracy": 0.9045481234788895, "num_tokens": 707750973.0, "sample_num_tokens": 9247.25, "step": 6608, "total_num_tokens": 707787962.0, "z_loss": 0.0006371081108227372 }, { "copy_logits_max": -1.4484648704528809, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.9375, "epoch": 1.349859586418177, "gen_logits_max": 4.6241278648376465, "gen_logits_mean": -15.862903594970703, "gen_logits_min": -28.682199478149414, "gen_logits_std": 3.1056480407714844, "gen_loss": 0.26287803053855896, "grad_norm": 0.3717969525796113, "learning_rate": 2.2581473684210528e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9969907999038696, "mean_gen_accuracy": 0.8780302703380585, "mean_token_accuracy": 0.905949741601944, "num_tokens": 708035095.0, "sample_num_tokens": 8445.75, "step": 6609, "total_num_tokens": 708068878.0, "z_loss": 0.0005903149722144008 }, { "copy_logits_max": -4.550788879394531, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.1875, "epoch": 1.350063824355374, "gen_logits_max": 5.225378513336182, "gen_logits_mean": -16.12430191040039, "gen_logits_min": -28.013751983642578, "gen_logits_std": 3.063772201538086, "gen_loss": 0.3101939558982849, "grad_norm": 0.36801867285324286, "learning_rate": 2.258021052631579e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9962897747755051, "mean_gen_accuracy": 0.8735284060239792, "mean_token_accuracy": 0.9019923806190491, "num_tokens": 708308236.0, "sample_num_tokens": 7168.0, "step": 6610, "total_num_tokens": 708336908.0, "z_loss": 0.0005557135445997119 }, { "copy_logits_max": -4.065349578857422, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.375, "epoch": 1.350268062292571, "gen_logits_max": 4.99685001373291, "gen_logits_mean": -15.51380729675293, "gen_logits_min": -27.696887969970703, "gen_logits_std": 3.0829222202301025, "gen_loss": 0.30538177490234375, "grad_norm": 0.3453128183564572, "learning_rate": 2.2578947368421053e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9963653385639191, "mean_gen_accuracy": 0.8723086416721344, "mean_token_accuracy": 0.9058740735054016, "num_tokens": 708582810.0, "sample_num_tokens": 6994.5, "step": 6611, "total_num_tokens": 708610788.0, "z_loss": 0.0005759877385571599 }, { "copy_logits_max": -4.958034992218018, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.75, "epoch": 1.3504723002297676, "gen_logits_max": 4.616260528564453, "gen_logits_mean": -16.39522933959961, "gen_logits_min": -28.34312629699707, "gen_logits_std": 3.1105058193206787, "gen_loss": 0.2782607674598694, "grad_norm": 0.4133310171311806, "learning_rate": 2.2577684210526314e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9957248419523239, "mean_gen_accuracy": 0.8760724514722824, "mean_token_accuracy": 0.903967097401619, "num_tokens": 708858656.0, "sample_num_tokens": 7864.5, "step": 6612, "total_num_tokens": 708890114.0, "z_loss": 0.0005558353732340038 }, { "copy_logits_max": -4.991137504577637, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.0, "epoch": 1.3506765381669645, "gen_logits_max": 5.577351093292236, "gen_logits_mean": -15.07274341583252, "gen_logits_min": -26.84024429321289, "gen_logits_std": 3.0385255813598633, "gen_loss": 0.3007494807243347, "grad_norm": 0.3691328364691026, "learning_rate": 2.257642105263158e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9962779134511948, "mean_gen_accuracy": 0.8766535669565201, "mean_token_accuracy": 0.9043204188346863, "num_tokens": 709126109.0, "sample_num_tokens": 7249.25, "step": 6613, "total_num_tokens": 709155106.0, "z_loss": 0.0005691249389201403 }, { "copy_logits_max": -4.218428134918213, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.5, "epoch": 1.3508807761041615, "gen_logits_max": 3.6874427795410156, "gen_logits_mean": -17.32286834716797, "gen_logits_min": -29.235694885253906, "gen_logits_std": 3.1436820030212402, "gen_loss": 0.2754760682582855, "grad_norm": 0.3804238365754374, "learning_rate": 2.2575157894736843e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9971265643835068, "mean_gen_accuracy": 0.877986267209053, "mean_token_accuracy": 0.9090783298015594, "num_tokens": 709386206.0, "sample_num_tokens": 8260.5, "step": 6614, "total_num_tokens": 709419248.0, "z_loss": 0.0005699669709429145 }, { "copy_logits_max": -5.150092601776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.0, "epoch": 1.3510850140413582, "gen_logits_max": 3.4275293350219727, "gen_logits_mean": -17.12405014038086, "gen_logits_min": -28.931610107421875, "gen_logits_std": 3.127763271331787, "gen_loss": 0.28609126806259155, "grad_norm": 0.3389926867935178, "learning_rate": 2.2573894736842107e-05, "loss": 0.272, "mean_copy_accuracy": 0.9958193600177765, "mean_gen_accuracy": 0.8802086710929871, "mean_token_accuracy": 0.9056361168622971, "num_tokens": 709643790.0, "sample_num_tokens": 8038.5, "step": 6615, "total_num_tokens": 709675944.0, "z_loss": 0.0005040167598053813 }, { "copy_logits_max": -4.221590042114258, "copy_logits_min": -750000064.0, "copy_num_tokens": 544.125, "epoch": 1.351289251978555, "gen_logits_max": 5.137780666351318, "gen_logits_mean": -15.14169692993164, "gen_logits_min": -27.48555564880371, "gen_logits_std": 3.0981764793395996, "gen_loss": 0.2954048812389374, "grad_norm": 0.3967334277637265, "learning_rate": 2.257263157894737e-05, "loss": 0.298, "mean_copy_accuracy": 0.9960026443004608, "mean_gen_accuracy": 0.8699855506420135, "mean_token_accuracy": 0.9004493802785873, "num_tokens": 709902124.0, "sample_num_tokens": 8689.5, "step": 6616, "total_num_tokens": 709936882.0, "z_loss": 0.0005648852093145251 }, { "copy_logits_max": -4.298313617706299, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.6875, "epoch": 1.3514934899157518, "gen_logits_max": 5.039809703826904, "gen_logits_mean": -15.066380500793457, "gen_logits_min": -27.545381546020508, "gen_logits_std": 3.1069958209991455, "gen_loss": 0.2966623306274414, "grad_norm": 0.38624247970219555, "learning_rate": 2.2571368421052632e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9964136034250259, "mean_gen_accuracy": 0.8798983544111252, "mean_token_accuracy": 0.9063805490732193, "num_tokens": 710154290.0, "sample_num_tokens": 8597.5, "step": 6617, "total_num_tokens": 710188680.0, "z_loss": 0.0005690880352631211 }, { "copy_logits_max": -3.7806272506713867, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.625, "epoch": 1.3516977278529487, "gen_logits_max": 3.8309202194213867, "gen_logits_mean": -16.620323181152344, "gen_logits_min": -29.030689239501953, "gen_logits_std": 3.137775182723999, "gen_loss": 0.27599048614501953, "grad_norm": 0.35769558471005475, "learning_rate": 2.2570105263157897e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9969552159309387, "mean_gen_accuracy": 0.8786876797676086, "mean_token_accuracy": 0.9085773676633835, "num_tokens": 710426652.0, "sample_num_tokens": 9293.0, "step": 6618, "total_num_tokens": 710463824.0, "z_loss": 0.0005557488184422255 }, { "copy_logits_max": -5.312859058380127, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.1875, "epoch": 1.3519019657901454, "gen_logits_max": 4.127449989318848, "gen_logits_mean": -16.304025650024414, "gen_logits_min": -28.24002456665039, "gen_logits_std": 3.1011219024658203, "gen_loss": 0.2878493666648865, "grad_norm": 0.36871200293670314, "learning_rate": 2.2568842105263157e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9965890944004059, "mean_gen_accuracy": 0.8732901066541672, "mean_token_accuracy": 0.8999338895082474, "num_tokens": 710690019.0, "sample_num_tokens": 8165.75, "step": 6619, "total_num_tokens": 710722682.0, "z_loss": 0.0005262251943349838 }, { "copy_logits_max": -3.2570555210113525, "copy_logits_min": -687500032.0, "copy_num_tokens": 787.5, "epoch": 1.3521062037273424, "gen_logits_max": 2.2922310829162598, "gen_logits_mean": -17.4913330078125, "gen_logits_min": -29.993507385253906, "gen_logits_std": 3.1879520416259766, "gen_loss": 0.20774675905704498, "grad_norm": 0.3783908762535702, "learning_rate": 2.2567578947368422e-05, "loss": 0.2589, "mean_copy_accuracy": 0.9975176602602005, "mean_gen_accuracy": 0.876856729388237, "mean_token_accuracy": 0.9116477817296982, "num_tokens": 710978197.0, "sample_num_tokens": 10153.75, "step": 6620, "total_num_tokens": 711018812.0, "z_loss": 0.00041066535050049424 }, { "copy_logits_max": -4.097302436828613, "copy_logits_min": -750000064.0, "copy_num_tokens": 634.4375, "epoch": 1.3523104416645393, "gen_logits_max": 3.7504634857177734, "gen_logits_mean": -16.105327606201172, "gen_logits_min": -28.945110321044922, "gen_logits_std": 3.1431336402893066, "gen_loss": 0.24523255228996277, "grad_norm": 0.3586487785020147, "learning_rate": 2.2566315789473683e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9958395510911942, "mean_gen_accuracy": 0.8798253387212753, "mean_token_accuracy": 0.9104786813259125, "num_tokens": 711257790.0, "sample_num_tokens": 9509.0, "step": 6621, "total_num_tokens": 711295826.0, "z_loss": 0.0004424436192493886 }, { "copy_logits_max": -2.9663455486297607, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.625, "epoch": 1.352514679601736, "gen_logits_max": 4.232634544372559, "gen_logits_mean": -16.176551818847656, "gen_logits_min": -28.175182342529297, "gen_logits_std": 3.0974583625793457, "gen_loss": 0.29601922631263733, "grad_norm": 0.34625881124709207, "learning_rate": 2.2565052631578947e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9955965131521225, "mean_gen_accuracy": 0.8818720877170563, "mean_token_accuracy": 0.9066212624311447, "num_tokens": 711530092.0, "sample_num_tokens": 7213.0, "step": 6622, "total_num_tokens": 711558944.0, "z_loss": 0.0005021011456847191 }, { "copy_logits_max": -2.9487476348876953, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.5, "epoch": 1.352718917538933, "gen_logits_max": 3.8944356441497803, "gen_logits_mean": -15.43732738494873, "gen_logits_min": -27.433181762695312, "gen_logits_std": 3.0922610759735107, "gen_loss": 0.2891206741333008, "grad_norm": 0.35072866830129257, "learning_rate": 2.256378947368421e-05, "loss": 0.276, "mean_copy_accuracy": 0.9965911209583282, "mean_gen_accuracy": 0.8734014630317688, "mean_token_accuracy": 0.9073550999164581, "num_tokens": 711802966.0, "sample_num_tokens": 7639.5, "step": 6623, "total_num_tokens": 711833524.0, "z_loss": 0.0005226583452895284 }, { "copy_logits_max": -1.4525172710418701, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.6875, "epoch": 1.3529231554761296, "gen_logits_max": 3.978698968887329, "gen_logits_mean": -14.891117095947266, "gen_logits_min": -26.78664779663086, "gen_logits_std": 3.075629711151123, "gen_loss": 0.2778109014034271, "grad_norm": 0.48882270552961804, "learning_rate": 2.2562526315789476e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9963964819908142, "mean_gen_accuracy": 0.875340610742569, "mean_token_accuracy": 0.9059530794620514, "num_tokens": 712075141.0, "sample_num_tokens": 8711.25, "step": 6624, "total_num_tokens": 712109986.0, "z_loss": 0.0004857431922573596 }, { "copy_logits_max": -6.282149314880371, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.125, "epoch": 1.3531273934133266, "gen_logits_max": 4.715468406677246, "gen_logits_mean": -16.658157348632812, "gen_logits_min": -28.50595474243164, "gen_logits_std": 3.1285719871520996, "gen_loss": 0.27779799699783325, "grad_norm": 0.42455906151212847, "learning_rate": 2.2561263157894737e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9949012100696564, "mean_gen_accuracy": 0.8792372792959213, "mean_token_accuracy": 0.9045569598674774, "num_tokens": 712316909.0, "sample_num_tokens": 7145.25, "step": 6625, "total_num_tokens": 712345490.0, "z_loss": 0.00048601021990180016 }, { "copy_logits_max": -4.67999267578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.875, "epoch": 1.3533316313505233, "gen_logits_max": 4.78920841217041, "gen_logits_mean": -15.175260543823242, "gen_logits_min": -27.27406883239746, "gen_logits_std": 3.1181845664978027, "gen_loss": 0.2803862392902374, "grad_norm": 0.37754693889817414, "learning_rate": 2.256e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9953054338693619, "mean_gen_accuracy": 0.8751752972602844, "mean_token_accuracy": 0.9039188623428345, "num_tokens": 712583214.0, "sample_num_tokens": 7284.5, "step": 6626, "total_num_tokens": 712612352.0, "z_loss": 0.0004987471038475633 }, { "copy_logits_max": -4.284308433532715, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.625, "epoch": 1.3535358692877202, "gen_logits_max": 3.847517490386963, "gen_logits_mean": -16.755603790283203, "gen_logits_min": -29.266101837158203, "gen_logits_std": 3.1296677589416504, "gen_loss": 0.25512802600860596, "grad_norm": 0.41034316334072, "learning_rate": 2.2558736842105265e-05, "loss": 0.289, "mean_copy_accuracy": 0.9964649230241776, "mean_gen_accuracy": 0.8681544959545135, "mean_token_accuracy": 0.9020579010248184, "num_tokens": 712843017.0, "sample_num_tokens": 9542.75, "step": 6627, "total_num_tokens": 712881188.0, "z_loss": 0.0004267616895958781 }, { "copy_logits_max": -2.4359264373779297, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.0625, "epoch": 1.353740107224917, "gen_logits_max": 4.110434532165527, "gen_logits_mean": -14.993138313293457, "gen_logits_min": -27.28650665283203, "gen_logits_std": 3.1144766807556152, "gen_loss": 0.2554873824119568, "grad_norm": 0.601917052822124, "learning_rate": 2.2557473684210526e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9961721003055573, "mean_gen_accuracy": 0.8796310871839523, "mean_token_accuracy": 0.9095077216625214, "num_tokens": 713109917.0, "sample_num_tokens": 7960.75, "step": 6628, "total_num_tokens": 713141760.0, "z_loss": 0.000519297260325402 }, { "copy_logits_max": -2.9983925819396973, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.3125, "epoch": 1.3539443451621138, "gen_logits_max": 2.9621009826660156, "gen_logits_mean": -16.862506866455078, "gen_logits_min": -29.092634201049805, "gen_logits_std": 3.1490976810455322, "gen_loss": 0.26588648557662964, "grad_norm": 0.3694012299729659, "learning_rate": 2.255621052631579e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9965268820524216, "mean_gen_accuracy": 0.8675872832536697, "mean_token_accuracy": 0.9023443460464478, "num_tokens": 713382437.0, "sample_num_tokens": 8378.25, "step": 6629, "total_num_tokens": 713415950.0, "z_loss": 0.0004934320459142327 }, { "copy_logits_max": -1.89067804813385, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.125, "epoch": 1.3541485830993107, "gen_logits_max": 4.924556732177734, "gen_logits_mean": -15.27966022491455, "gen_logits_min": -27.740148544311523, "gen_logits_std": 3.1130764484405518, "gen_loss": 0.2976369261741638, "grad_norm": 0.3858764423546118, "learning_rate": 2.255494736842105e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9951572269201279, "mean_gen_accuracy": 0.8797646164894104, "mean_token_accuracy": 0.907648429274559, "num_tokens": 713643582.0, "sample_num_tokens": 8039.0, "step": 6630, "total_num_tokens": 713675738.0, "z_loss": 0.0005434167687781155 }, { "copy_logits_max": -4.601577281951904, "copy_logits_min": -750000128.0, "copy_num_tokens": 471.625, "epoch": 1.3543528210365074, "gen_logits_max": 3.744415521621704, "gen_logits_mean": -17.43560028076172, "gen_logits_min": -29.413610458374023, "gen_logits_std": 3.1697981357574463, "gen_loss": 0.2714846134185791, "grad_norm": 0.41947782063756894, "learning_rate": 2.255368421052632e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9952280968427658, "mean_gen_accuracy": 0.8797276169061661, "mean_token_accuracy": 0.9068891406059265, "num_tokens": 713925788.0, "sample_num_tokens": 8834.5, "step": 6631, "total_num_tokens": 713961126.0, "z_loss": 0.0004949760041199625 }, { "copy_logits_max": -5.966753005981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.0, "epoch": 1.3545570589737044, "gen_logits_max": 4.158638000488281, "gen_logits_mean": -17.49386215209961, "gen_logits_min": -29.178783416748047, "gen_logits_std": 3.1591315269470215, "gen_loss": 0.3199715316295624, "grad_norm": 0.3403552951335617, "learning_rate": 2.255242105263158e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9960136264562607, "mean_gen_accuracy": 0.8744706362485886, "mean_token_accuracy": 0.9042335450649261, "num_tokens": 714216175.0, "sample_num_tokens": 7755.25, "step": 6632, "total_num_tokens": 714247196.0, "z_loss": 0.000533365469891578 }, { "copy_logits_max": -2.0017905235290527, "copy_logits_min": -750000064.0, "copy_num_tokens": 453.625, "epoch": 1.354761296910901, "gen_logits_max": 5.624463081359863, "gen_logits_mean": -13.7667236328125, "gen_logits_min": -25.86236572265625, "gen_logits_std": 3.1005325317382812, "gen_loss": 0.2603071928024292, "grad_norm": 0.48148061404297093, "learning_rate": 2.2551157894736844e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9954682737588882, "mean_gen_accuracy": 0.8811904191970825, "mean_token_accuracy": 0.9103541374206543, "num_tokens": 714480617.0, "sample_num_tokens": 8861.25, "step": 6633, "total_num_tokens": 714516062.0, "z_loss": 0.00044190441258251667 }, { "copy_logits_max": -4.072749137878418, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.4375, "epoch": 1.354965534848098, "gen_logits_max": 4.285348892211914, "gen_logits_mean": -16.777645111083984, "gen_logits_min": -28.692811965942383, "gen_logits_std": 3.1619555950164795, "gen_loss": 0.27667272090911865, "grad_norm": 0.39517536850553536, "learning_rate": 2.2549894736842105e-05, "loss": 0.3052, "mean_copy_accuracy": 0.9954137951135635, "mean_gen_accuracy": 0.8701167404651642, "mean_token_accuracy": 0.8968197256326675, "num_tokens": 714743099.0, "sample_num_tokens": 7718.75, "step": 6634, "total_num_tokens": 714773974.0, "z_loss": 0.0004837414890062064 }, { "copy_logits_max": -5.385848045349121, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.6875, "epoch": 1.355169772785295, "gen_logits_max": 3.737156867980957, "gen_logits_mean": -17.09929656982422, "gen_logits_min": -29.422035217285156, "gen_logits_std": 3.1823911666870117, "gen_loss": 0.24673227965831757, "grad_norm": 0.5357706051107679, "learning_rate": 2.254863157894737e-05, "loss": 0.2849, "mean_copy_accuracy": 0.995289072394371, "mean_gen_accuracy": 0.875541165471077, "mean_token_accuracy": 0.9032907336950302, "num_tokens": 715006100.0, "sample_num_tokens": 7970.5, "step": 6635, "total_num_tokens": 715037982.0, "z_loss": 0.000428046565502882 }, { "copy_logits_max": -6.535282611846924, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.5625, "epoch": 1.3553740107224916, "gen_logits_max": 4.919768333435059, "gen_logits_mean": -16.2067928314209, "gen_logits_min": -27.872562408447266, "gen_logits_std": 3.105017900466919, "gen_loss": 0.3324267864227295, "grad_norm": 0.37108343042527786, "learning_rate": 2.254736842105263e-05, "loss": 0.284, "mean_copy_accuracy": 0.99522565305233, "mean_gen_accuracy": 0.8802679777145386, "mean_token_accuracy": 0.9048431813716888, "num_tokens": 715271526.0, "sample_num_tokens": 8539.0, "step": 6636, "total_num_tokens": 715305682.0, "z_loss": 0.0005597671261057258 }, { "copy_logits_max": -2.6530256271362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.25, "epoch": 1.3555782486596886, "gen_logits_max": 4.16256046295166, "gen_logits_mean": -15.84969711303711, "gen_logits_min": -28.169328689575195, "gen_logits_std": 3.110382556915283, "gen_loss": 0.2962539792060852, "grad_norm": 0.3637756699523105, "learning_rate": 2.2546105263157895e-05, "loss": 0.2587, "mean_copy_accuracy": 0.9961210340261459, "mean_gen_accuracy": 0.8828293979167938, "mean_token_accuracy": 0.9126508235931396, "num_tokens": 715560893.0, "sample_num_tokens": 10241.25, "step": 6637, "total_num_tokens": 715601858.0, "z_loss": 0.0005126570467837155 }, { "copy_logits_max": -4.983453750610352, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.125, "epoch": 1.3557824865968855, "gen_logits_max": 4.272617340087891, "gen_logits_mean": -15.434104919433594, "gen_logits_min": -28.111196517944336, "gen_logits_std": 3.0981392860412598, "gen_loss": 0.3166011571884155, "grad_norm": 0.4242131822373163, "learning_rate": 2.2544842105263156e-05, "loss": 0.2817, "mean_copy_accuracy": 0.995845764875412, "mean_gen_accuracy": 0.8715979903936386, "mean_token_accuracy": 0.9043365269899368, "num_tokens": 715840743.0, "sample_num_tokens": 8386.25, "step": 6638, "total_num_tokens": 715874288.0, "z_loss": 0.0005952100618742406 }, { "copy_logits_max": -5.177067756652832, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.9375, "epoch": 1.3559867245340822, "gen_logits_max": 5.307662487030029, "gen_logits_mean": -15.300222396850586, "gen_logits_min": -27.420501708984375, "gen_logits_std": 3.0866122245788574, "gen_loss": 0.2975255846977234, "grad_norm": 0.36133585269083335, "learning_rate": 2.2543578947368423e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9960954189300537, "mean_gen_accuracy": 0.8778451979160309, "mean_token_accuracy": 0.9040098339319229, "num_tokens": 716110404.0, "sample_num_tokens": 8633.5, "step": 6639, "total_num_tokens": 716144938.0, "z_loss": 0.0005615847767330706 }, { "copy_logits_max": -3.7792890071868896, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.375, "epoch": 1.356190962471279, "gen_logits_max": 5.044198989868164, "gen_logits_mean": -15.89494800567627, "gen_logits_min": -28.420251846313477, "gen_logits_std": 3.106309413909912, "gen_loss": 0.2956967353820801, "grad_norm": 0.37578700381577446, "learning_rate": 2.2542315789473688e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9954387694597244, "mean_gen_accuracy": 0.8775175511837006, "mean_token_accuracy": 0.904790997505188, "num_tokens": 716373580.0, "sample_num_tokens": 6774.0, "step": 6640, "total_num_tokens": 716400676.0, "z_loss": 0.0005438015214167535 }, { "copy_logits_max": -3.7083089351654053, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.375, "epoch": 1.3563952004084758, "gen_logits_max": 3.7749533653259277, "gen_logits_mean": -17.14308738708496, "gen_logits_min": -29.137752532958984, "gen_logits_std": 3.1564319133758545, "gen_loss": 0.2826381027698517, "grad_norm": 0.3957910239255912, "learning_rate": 2.254105263157895e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9959261566400528, "mean_gen_accuracy": 0.8745098412036896, "mean_token_accuracy": 0.904051885008812, "num_tokens": 716635240.0, "sample_num_tokens": 8555.5, "step": 6641, "total_num_tokens": 716669462.0, "z_loss": 0.0005486337468028069 }, { "copy_logits_max": -3.91888689994812, "copy_logits_min": -687500032.0, "copy_num_tokens": 570.4375, "epoch": 1.3565994383456728, "gen_logits_max": 4.304311275482178, "gen_logits_mean": -14.952722549438477, "gen_logits_min": -27.47805404663086, "gen_logits_std": 3.119976043701172, "gen_loss": 0.26641783118247986, "grad_norm": 0.35638844852061247, "learning_rate": 2.2539789473684213e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9963260889053345, "mean_gen_accuracy": 0.8834995478391647, "mean_token_accuracy": 0.9111249446868896, "num_tokens": 716895618.0, "sample_num_tokens": 8723.0, "step": 6642, "total_num_tokens": 716930510.0, "z_loss": 0.0004737335257232189 }, { "copy_logits_max": -6.787599086761475, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.4375, "epoch": 1.3568036762828695, "gen_logits_max": 4.947273254394531, "gen_logits_mean": -16.022415161132812, "gen_logits_min": -27.92443084716797, "gen_logits_std": 3.1095285415649414, "gen_loss": 0.3291565477848053, "grad_norm": 0.4608031122570247, "learning_rate": 2.2538526315789474e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9951083064079285, "mean_gen_accuracy": 0.8775566071271896, "mean_token_accuracy": 0.9015550315380096, "num_tokens": 717156330.0, "sample_num_tokens": 7528.5, "step": 6643, "total_num_tokens": 717186444.0, "z_loss": 0.0005208880174905062 }, { "copy_logits_max": -5.663548946380615, "copy_logits_min": -625000064.0, "copy_num_tokens": 423.5, "epoch": 1.3570079142200664, "gen_logits_max": 3.2247724533081055, "gen_logits_mean": -18.440399169921875, "gen_logits_min": -30.579580307006836, "gen_logits_std": 3.1908464431762695, "gen_loss": 0.28134816884994507, "grad_norm": 0.38811906501592613, "learning_rate": 2.2537263157894738e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9957192838191986, "mean_gen_accuracy": 0.8764749467372894, "mean_token_accuracy": 0.9058379679918289, "num_tokens": 717425919.0, "sample_num_tokens": 7923.75, "step": 6644, "total_num_tokens": 717457614.0, "z_loss": 0.00047007339890114963 }, { "copy_logits_max": -3.3330483436584473, "copy_logits_min": -750000000.0, "copy_num_tokens": 733.9375, "epoch": 1.3572121521572633, "gen_logits_max": 4.534646034240723, "gen_logits_mean": -15.103662490844727, "gen_logits_min": -27.819347381591797, "gen_logits_std": 3.1082704067230225, "gen_loss": 0.2770802974700928, "grad_norm": 0.3810200987654045, "learning_rate": 2.2536e-05, "loss": 0.278, "mean_copy_accuracy": 0.9969188421964645, "mean_gen_accuracy": 0.8676041066646576, "mean_token_accuracy": 0.905165046453476, "num_tokens": 717716493.0, "sample_num_tokens": 10867.75, "step": 6645, "total_num_tokens": 717759964.0, "z_loss": 0.0005205664783716202 }, { "copy_logits_max": -1.2083382606506348, "copy_logits_min": -687500032.0, "copy_num_tokens": 696.625, "epoch": 1.35741639009446, "gen_logits_max": 5.083560943603516, "gen_logits_mean": -14.724870681762695, "gen_logits_min": -27.34084701538086, "gen_logits_std": 3.1508514881134033, "gen_loss": 0.2587871551513672, "grad_norm": 0.3409607054571048, "learning_rate": 2.2534736842105263e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9973271638154984, "mean_gen_accuracy": 0.8753196895122528, "mean_token_accuracy": 0.9064807891845703, "num_tokens": 718007686.0, "sample_num_tokens": 10614.5, "step": 6646, "total_num_tokens": 718050144.0, "z_loss": 0.00044914812315255404 }, { "copy_logits_max": -3.589038372039795, "copy_logits_min": -687500032.0, "copy_num_tokens": 664.625, "epoch": 1.357620628031657, "gen_logits_max": 4.391099452972412, "gen_logits_mean": -15.102351188659668, "gen_logits_min": -27.781381607055664, "gen_logits_std": 3.1458685398101807, "gen_loss": 0.2589870095252991, "grad_norm": 0.36233884124711085, "learning_rate": 2.2533473684210528e-05, "loss": 0.2772, "mean_copy_accuracy": 0.997261717915535, "mean_gen_accuracy": 0.8715384602546692, "mean_token_accuracy": 0.906690776348114, "num_tokens": 718290339.0, "sample_num_tokens": 9165.25, "step": 6647, "total_num_tokens": 718327000.0, "z_loss": 0.0004294469254091382 }, { "copy_logits_max": -6.200822830200195, "copy_logits_min": -562500032.0, "copy_num_tokens": 373.75, "epoch": 1.3578248659688537, "gen_logits_max": 3.480154275894165, "gen_logits_mean": -18.003952026367188, "gen_logits_min": -30.099458694458008, "gen_logits_std": 3.1709632873535156, "gen_loss": 0.31622105836868286, "grad_norm": 0.3555038634702506, "learning_rate": 2.2532210526315792e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9970754235982895, "mean_gen_accuracy": 0.8768699765205383, "mean_token_accuracy": 0.9109059274196625, "num_tokens": 718563749.0, "sample_num_tokens": 8025.75, "step": 6648, "total_num_tokens": 718595852.0, "z_loss": 0.0005982829025015235 }, { "copy_logits_max": -3.865574836730957, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.8125, "epoch": 1.3580291039060506, "gen_logits_max": 4.641631126403809, "gen_logits_mean": -15.477605819702148, "gen_logits_min": -27.715591430664062, "gen_logits_std": 3.1125447750091553, "gen_loss": 0.30710136890411377, "grad_norm": 0.37888228798018136, "learning_rate": 2.2530947368421053e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9958891719579697, "mean_gen_accuracy": 0.8766444325447083, "mean_token_accuracy": 0.9051828533411026, "num_tokens": 718818388.0, "sample_num_tokens": 7637.5, "step": 6649, "total_num_tokens": 718848938.0, "z_loss": 0.0005611821543425322 }, { "copy_logits_max": -7.781553745269775, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.125, "epoch": 1.3582333418432473, "gen_logits_max": 3.281994342803955, "gen_logits_mean": -18.073410034179688, "gen_logits_min": -30.0479793548584, "gen_logits_std": 3.176752805709839, "gen_loss": 0.28385621309280396, "grad_norm": 0.36059103228196027, "learning_rate": 2.2529684210526317e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9971283823251724, "mean_gen_accuracy": 0.8781016319990158, "mean_token_accuracy": 0.9057621508836746, "num_tokens": 719101929.0, "sample_num_tokens": 8937.75, "step": 6650, "total_num_tokens": 719137680.0, "z_loss": 0.0005156385595910251 }, { "copy_logits_max": -7.348992824554443, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.3125, "epoch": 1.3584375797804442, "gen_logits_max": 4.031431198120117, "gen_logits_mean": -17.38863754272461, "gen_logits_min": -29.30229377746582, "gen_logits_std": 3.165433883666992, "gen_loss": 0.2651907205581665, "grad_norm": 0.36656372951430766, "learning_rate": 2.2528421052631578e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9958066046237946, "mean_gen_accuracy": 0.8782907724380493, "mean_token_accuracy": 0.9046655595302582, "num_tokens": 719364948.0, "sample_num_tokens": 8634.0, "step": 6651, "total_num_tokens": 719399484.0, "z_loss": 0.0004603066190611571 }, { "copy_logits_max": -1.644360065460205, "copy_logits_min": -750000000.0, "copy_num_tokens": 735.1875, "epoch": 1.3586418177176411, "gen_logits_max": 3.776893138885498, "gen_logits_mean": -15.135193824768066, "gen_logits_min": -27.81026840209961, "gen_logits_std": 3.133758068084717, "gen_loss": 0.2601202130317688, "grad_norm": 0.36909255626560666, "learning_rate": 2.2527157894736842e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9967376887798309, "mean_gen_accuracy": 0.8682196736335754, "mean_token_accuracy": 0.9021879434585571, "num_tokens": 719639849.0, "sample_num_tokens": 10389.75, "step": 6652, "total_num_tokens": 719681408.0, "z_loss": 0.0005078667309135199 }, { "copy_logits_max": -4.198904991149902, "copy_logits_min": -750000128.0, "copy_num_tokens": 493.0, "epoch": 1.3588460556548378, "gen_logits_max": 4.431790351867676, "gen_logits_mean": -15.247424125671387, "gen_logits_min": -27.655397415161133, "gen_logits_std": 3.1086506843566895, "gen_loss": 0.2860344648361206, "grad_norm": 0.4108393238122567, "learning_rate": 2.2525894736842103e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9957975000143051, "mean_gen_accuracy": 0.8717337846755981, "mean_token_accuracy": 0.8982463628053665, "num_tokens": 719895764.0, "sample_num_tokens": 8606.0, "step": 6653, "total_num_tokens": 719930188.0, "z_loss": 0.000524211791343987 }, { "copy_logits_max": -4.159760475158691, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.0625, "epoch": 1.3590502935920348, "gen_logits_max": 3.5502848625183105, "gen_logits_mean": -16.689498901367188, "gen_logits_min": -28.92373275756836, "gen_logits_std": 3.1533589363098145, "gen_loss": 0.2617841958999634, "grad_norm": 0.3447787410016047, "learning_rate": 2.2524631578947368e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9967454820871353, "mean_gen_accuracy": 0.8777129352092743, "mean_token_accuracy": 0.908251017332077, "num_tokens": 720172822.0, "sample_num_tokens": 8818.0, "step": 6654, "total_num_tokens": 720208094.0, "z_loss": 0.0005213127005845308 }, { "copy_logits_max": -3.772653579711914, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.9375, "epoch": 1.3592545315292315, "gen_logits_max": 3.754470109939575, "gen_logits_mean": -15.854806900024414, "gen_logits_min": -29.370845794677734, "gen_logits_std": 3.1453309059143066, "gen_loss": 0.25295132398605347, "grad_norm": 0.35193625879849055, "learning_rate": 2.2523368421052632e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9959709942340851, "mean_gen_accuracy": 0.8742179125547409, "mean_token_accuracy": 0.9070441424846649, "num_tokens": 720458372.0, "sample_num_tokens": 9313.5, "step": 6655, "total_num_tokens": 720495626.0, "z_loss": 0.0004976376658305526 }, { "copy_logits_max": -6.985464096069336, "copy_logits_min": -750000064.0, "copy_num_tokens": 426.8125, "epoch": 1.3594587694664284, "gen_logits_max": 3.8910601139068604, "gen_logits_mean": -17.59284210205078, "gen_logits_min": -29.687150955200195, "gen_logits_std": 3.145630359649658, "gen_loss": 0.2677841782569885, "grad_norm": 0.3665213746509756, "learning_rate": 2.2522105263157896e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9975918531417847, "mean_gen_accuracy": 0.86876580119133, "mean_token_accuracy": 0.9019687920808792, "num_tokens": 720735242.0, "sample_num_tokens": 8496.5, "step": 6656, "total_num_tokens": 720769228.0, "z_loss": 0.0005168431671336293 }, { "copy_logits_max": -3.454153299331665, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.0625, "epoch": 1.3596630074036251, "gen_logits_max": 3.887295722961426, "gen_logits_mean": -16.001440048217773, "gen_logits_min": -28.801101684570312, "gen_logits_std": 3.1331822872161865, "gen_loss": 0.27517592906951904, "grad_norm": 0.33591814877495857, "learning_rate": 2.252084210526316e-05, "loss": 0.2894, "mean_copy_accuracy": 0.99697645008564, "mean_gen_accuracy": 0.8705555647611618, "mean_token_accuracy": 0.9042970091104507, "num_tokens": 721019891.0, "sample_num_tokens": 8593.25, "step": 6657, "total_num_tokens": 721054264.0, "z_loss": 0.0005426681600511074 }, { "copy_logits_max": -3.502906560897827, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.375, "epoch": 1.359867245340822, "gen_logits_max": 3.927496910095215, "gen_logits_mean": -15.56906509399414, "gen_logits_min": -28.391571044921875, "gen_logits_std": 3.1083712577819824, "gen_loss": 0.2715173661708832, "grad_norm": 0.36639133509842076, "learning_rate": 2.251957894736842e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9966132789850235, "mean_gen_accuracy": 0.8760325014591217, "mean_token_accuracy": 0.9053077399730682, "num_tokens": 721275509.0, "sample_num_tokens": 8157.25, "step": 6658, "total_num_tokens": 721308138.0, "z_loss": 0.000538280641194433 }, { "copy_logits_max": -3.643085479736328, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.625, "epoch": 1.360071483278019, "gen_logits_max": 4.326369285583496, "gen_logits_mean": -16.352420806884766, "gen_logits_min": -28.777694702148438, "gen_logits_std": 3.115987777709961, "gen_loss": 0.2778889536857605, "grad_norm": 0.3505444765335311, "learning_rate": 2.2518315789473686e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9972202330827713, "mean_gen_accuracy": 0.8741369247436523, "mean_token_accuracy": 0.9065338522195816, "num_tokens": 721557416.0, "sample_num_tokens": 8608.0, "step": 6659, "total_num_tokens": 721591848.0, "z_loss": 0.000514105660840869 }, { "copy_logits_max": -6.307525157928467, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.375, "epoch": 1.3602757212152157, "gen_logits_max": 3.8871066570281982, "gen_logits_mean": -16.922645568847656, "gen_logits_min": -30.051513671875, "gen_logits_std": 3.162508726119995, "gen_loss": 0.24711847305297852, "grad_norm": 0.37410143499889886, "learning_rate": 2.2517052631578947e-05, "loss": 0.28, "mean_copy_accuracy": 0.9962007850408554, "mean_gen_accuracy": 0.8778407424688339, "mean_token_accuracy": 0.9046425670385361, "num_tokens": 721824141.0, "sample_num_tokens": 9582.75, "step": 6660, "total_num_tokens": 721862472.0, "z_loss": 0.0004628024762496352 }, { "copy_logits_max": -3.850137948989868, "copy_logits_min": -687500032.0, "copy_num_tokens": 719.4375, "epoch": 1.3604799591524126, "gen_logits_max": 1.9730985164642334, "gen_logits_mean": -17.982803344726562, "gen_logits_min": -30.74039077758789, "gen_logits_std": 3.2117373943328857, "gen_loss": 0.24624769389629364, "grad_norm": 0.35335759024170105, "learning_rate": 2.251578947368421e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9956184923648834, "mean_gen_accuracy": 0.8716677725315094, "mean_token_accuracy": 0.906463235616684, "num_tokens": 722099678.0, "sample_num_tokens": 9580.5, "step": 6661, "total_num_tokens": 722138000.0, "z_loss": 0.00043680856470018625 }, { "copy_logits_max": -4.710018157958984, "copy_logits_min": -687500032.0, "copy_num_tokens": 648.5625, "epoch": 1.3606841970896095, "gen_logits_max": 2.9541521072387695, "gen_logits_mean": -16.574052810668945, "gen_logits_min": -28.822175979614258, "gen_logits_std": 3.1479313373565674, "gen_loss": 0.3028041422367096, "grad_norm": 0.3575421786793956, "learning_rate": 2.2514526315789472e-05, "loss": 0.2863, "mean_copy_accuracy": 0.99619260430336, "mean_gen_accuracy": 0.872075229883194, "mean_token_accuracy": 0.9020488113164902, "num_tokens": 722372214.0, "sample_num_tokens": 9483.0, "step": 6662, "total_num_tokens": 722410146.0, "z_loss": 0.0005646409117616713 }, { "copy_logits_max": -4.068718433380127, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.6875, "epoch": 1.3608884350268062, "gen_logits_max": 3.5694901943206787, "gen_logits_mean": -15.832986831665039, "gen_logits_min": -28.60950469970703, "gen_logits_std": 3.1504158973693848, "gen_loss": 0.2702138125896454, "grad_norm": 0.33748569036590126, "learning_rate": 2.2513263157894736e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9975331872701645, "mean_gen_accuracy": 0.8739650249481201, "mean_token_accuracy": 0.9076776802539825, "num_tokens": 722662292.0, "sample_num_tokens": 8908.5, "step": 6663, "total_num_tokens": 722697926.0, "z_loss": 0.0005158773274160922 }, { "copy_logits_max": -4.051566123962402, "copy_logits_min": -750000000.0, "copy_num_tokens": 606.0625, "epoch": 1.361092672964003, "gen_logits_max": 3.87581729888916, "gen_logits_mean": -16.007709503173828, "gen_logits_min": -28.363903045654297, "gen_logits_std": 3.1777889728546143, "gen_loss": 0.22567695379257202, "grad_norm": 0.40352697367443835, "learning_rate": 2.2512e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9968287497758865, "mean_gen_accuracy": 0.874870628118515, "mean_token_accuracy": 0.9086024165153503, "num_tokens": 722938998.0, "sample_num_tokens": 9486.0, "step": 6664, "total_num_tokens": 722976942.0, "z_loss": 0.0004428987158462405 }, { "copy_logits_max": -3.5087099075317383, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.125, "epoch": 1.3612969109011999, "gen_logits_max": 4.105895042419434, "gen_logits_mean": -15.92288589477539, "gen_logits_min": -28.884963989257812, "gen_logits_std": 3.1548073291778564, "gen_loss": 0.29412218928337097, "grad_norm": 0.35086625873105276, "learning_rate": 2.2510736842105265e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9973838329315186, "mean_gen_accuracy": 0.8744665384292603, "mean_token_accuracy": 0.9061763137578964, "num_tokens": 723199767.0, "sample_num_tokens": 9192.75, "step": 6665, "total_num_tokens": 723236538.0, "z_loss": 0.000512638536747545 }, { "copy_logits_max": -7.813241958618164, "copy_logits_min": -750000064.0, "copy_num_tokens": 278.5, "epoch": 1.3615011488383968, "gen_logits_max": 4.227568626403809, "gen_logits_mean": -17.316051483154297, "gen_logits_min": -28.958744049072266, "gen_logits_std": 3.106579303741455, "gen_loss": 0.3078685700893402, "grad_norm": 0.36531455609420405, "learning_rate": 2.2509473684210526e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9959277361631393, "mean_gen_accuracy": 0.8762384504079819, "mean_token_accuracy": 0.9045360088348389, "num_tokens": 723483984.0, "sample_num_tokens": 7532.5, "step": 6666, "total_num_tokens": 723514114.0, "z_loss": 0.0005475750658661127 }, { "copy_logits_max": -5.941356182098389, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.875, "epoch": 1.3617053867755935, "gen_logits_max": 4.448907852172852, "gen_logits_mean": -16.381206512451172, "gen_logits_min": -28.536989212036133, "gen_logits_std": 3.1227262020111084, "gen_loss": 0.3198128640651703, "grad_norm": 0.3787353903746446, "learning_rate": 2.250821052631579e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9970910102128983, "mean_gen_accuracy": 0.870667427778244, "mean_token_accuracy": 0.902007058262825, "num_tokens": 723761915.0, "sample_num_tokens": 9331.75, "step": 6667, "total_num_tokens": 723799242.0, "z_loss": 0.0005092587089166045 }, { "copy_logits_max": -5.0859761238098145, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.75, "epoch": 1.3619096247127904, "gen_logits_max": 4.121916770935059, "gen_logits_mean": -15.226972579956055, "gen_logits_min": -27.377700805664062, "gen_logits_std": 3.1194610595703125, "gen_loss": 0.2816511392593384, "grad_norm": 0.367052760121144, "learning_rate": 2.2506947368421054e-05, "loss": 0.267, "mean_copy_accuracy": 0.9965843111276627, "mean_gen_accuracy": 0.8824636191129684, "mean_token_accuracy": 0.9095101803541183, "num_tokens": 724033075.0, "sample_num_tokens": 8570.75, "step": 6668, "total_num_tokens": 724067358.0, "z_loss": 0.00045562104787677526 }, { "copy_logits_max": -2.4144368171691895, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.8125, "epoch": 1.3621138626499874, "gen_logits_max": 2.7596147060394287, "gen_logits_mean": -17.23944854736328, "gen_logits_min": -29.840574264526367, "gen_logits_std": 3.1854310035705566, "gen_loss": 0.2820202112197876, "grad_norm": 0.3522714082726356, "learning_rate": 2.2505684210526315e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9978459775447845, "mean_gen_accuracy": 0.8699697703123093, "mean_token_accuracy": 0.9061638861894608, "num_tokens": 724304988.0, "sample_num_tokens": 8642.0, "step": 6669, "total_num_tokens": 724339556.0, "z_loss": 0.00044180682743899524 }, { "copy_logits_max": -4.0663957595825195, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.0, "epoch": 1.362318100587184, "gen_logits_max": 3.5604286193847656, "gen_logits_mean": -16.240802764892578, "gen_logits_min": -28.90036392211914, "gen_logits_std": 3.1624629497528076, "gen_loss": 0.2769002616405487, "grad_norm": 0.405237490475953, "learning_rate": 2.250442105263158e-05, "loss": 0.29, "mean_copy_accuracy": 0.9955936819314957, "mean_gen_accuracy": 0.8734539598226547, "mean_token_accuracy": 0.901422768831253, "num_tokens": 724562131.0, "sample_num_tokens": 7501.75, "step": 6670, "total_num_tokens": 724592138.0, "z_loss": 0.00044367153896018863 }, { "copy_logits_max": -5.100955009460449, "copy_logits_min": -687500032.0, "copy_num_tokens": 650.9375, "epoch": 1.362522338524381, "gen_logits_max": 3.070953845977783, "gen_logits_mean": -16.534683227539062, "gen_logits_min": -29.89683723449707, "gen_logits_std": 3.1909146308898926, "gen_loss": 0.24658243358135223, "grad_norm": 0.34547897508742387, "learning_rate": 2.250315789473684e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9977240562438965, "mean_gen_accuracy": 0.8772598206996918, "mean_token_accuracy": 0.9114086776971817, "num_tokens": 724853714.0, "sample_num_tokens": 9585.0, "step": 6671, "total_num_tokens": 724892054.0, "z_loss": 0.00033939018612727523 }, { "copy_logits_max": -6.3002848625183105, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.1875, "epoch": 1.3627265764615777, "gen_logits_max": 3.372256278991699, "gen_logits_mean": -17.299917221069336, "gen_logits_min": -30.05364227294922, "gen_logits_std": 3.187592029571533, "gen_loss": 0.2851800322532654, "grad_norm": 0.3426163650222573, "learning_rate": 2.250189473684211e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9968057423830032, "mean_gen_accuracy": 0.8805464208126068, "mean_token_accuracy": 0.9071144461631775, "num_tokens": 725130807.0, "sample_num_tokens": 8285.75, "step": 6672, "total_num_tokens": 725163950.0, "z_loss": 0.0004204904253128916 }, { "copy_logits_max": -6.568063259124756, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.75, "epoch": 1.3629308143987746, "gen_logits_max": 3.3062026500701904, "gen_logits_mean": -16.535919189453125, "gen_logits_min": -29.141826629638672, "gen_logits_std": 3.167712926864624, "gen_loss": 0.24541796743869781, "grad_norm": 0.41348339786050303, "learning_rate": 2.250063157894737e-05, "loss": 0.2527, "mean_copy_accuracy": 0.9956091940402985, "mean_gen_accuracy": 0.8810471296310425, "mean_token_accuracy": 0.9141294360160828, "num_tokens": 725400192.0, "sample_num_tokens": 8528.5, "step": 6673, "total_num_tokens": 725434306.0, "z_loss": 0.0003875723632518202 }, { "copy_logits_max": -8.185948371887207, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.5, "epoch": 1.3631350523359713, "gen_logits_max": 4.158441543579102, "gen_logits_mean": -16.963924407958984, "gen_logits_min": -29.684295654296875, "gen_logits_std": 3.145915985107422, "gen_loss": 0.28141966462135315, "grad_norm": 0.34103971352429824, "learning_rate": 2.2499368421052634e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9968682080507278, "mean_gen_accuracy": 0.8799070864915848, "mean_token_accuracy": 0.9094413667917252, "num_tokens": 725668698.0, "sample_num_tokens": 7709.0, "step": 6674, "total_num_tokens": 725699534.0, "z_loss": 0.0004646744055207819 }, { "copy_logits_max": -6.679182529449463, "copy_logits_min": -750000064.0, "copy_num_tokens": 337.5, "epoch": 1.3633392902731682, "gen_logits_max": 4.895130157470703, "gen_logits_mean": -15.556300163269043, "gen_logits_min": -27.747982025146484, "gen_logits_std": 3.120511531829834, "gen_loss": 0.29392096400260925, "grad_norm": 0.36014082031784755, "learning_rate": 2.2498105263157894e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9954004436731339, "mean_gen_accuracy": 0.8793338239192963, "mean_token_accuracy": 0.9053733944892883, "num_tokens": 725937478.0, "sample_num_tokens": 7084.5, "step": 6675, "total_num_tokens": 725965816.0, "z_loss": 0.0004796982975676656 }, { "copy_logits_max": -4.3631062507629395, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.9375, "epoch": 1.3635435282103652, "gen_logits_max": 4.27632474899292, "gen_logits_mean": -16.135316848754883, "gen_logits_min": -28.480796813964844, "gen_logits_std": 3.1384127140045166, "gen_loss": 0.28171873092651367, "grad_norm": 0.4225727679209952, "learning_rate": 2.249684210526316e-05, "loss": 0.2789, "mean_copy_accuracy": 0.996525228023529, "mean_gen_accuracy": 0.8781036287546158, "mean_token_accuracy": 0.9067995399236679, "num_tokens": 726210209.0, "sample_num_tokens": 7287.25, "step": 6676, "total_num_tokens": 726239358.0, "z_loss": 0.0005672978004440665 }, { "copy_logits_max": -5.161996364593506, "copy_logits_min": -625000064.0, "copy_num_tokens": 590.375, "epoch": 1.3637477661475619, "gen_logits_max": 2.7810089588165283, "gen_logits_mean": -17.830116271972656, "gen_logits_min": -30.23281478881836, "gen_logits_std": 3.195183515548706, "gen_loss": 0.251263290643692, "grad_norm": 0.38046048179112224, "learning_rate": 2.249557894736842e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9973293542861938, "mean_gen_accuracy": 0.8665001690387726, "mean_token_accuracy": 0.8999163210391998, "num_tokens": 726483131.0, "sample_num_tokens": 9478.75, "step": 6677, "total_num_tokens": 726521046.0, "z_loss": 0.0005006137071177363 }, { "copy_logits_max": -6.653578758239746, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.125, "epoch": 1.3639520040847588, "gen_logits_max": 3.6722519397735596, "gen_logits_mean": -16.64240264892578, "gen_logits_min": -29.263898849487305, "gen_logits_std": 3.1573851108551025, "gen_loss": 0.29588836431503296, "grad_norm": 0.3944399116855449, "learning_rate": 2.2494315789473684e-05, "loss": 0.285, "mean_copy_accuracy": 0.9966230690479279, "mean_gen_accuracy": 0.8779450207948685, "mean_token_accuracy": 0.9038202166557312, "num_tokens": 726723679.0, "sample_num_tokens": 7224.75, "step": 6678, "total_num_tokens": 726752578.0, "z_loss": 0.0005042780539952219 }, { "copy_logits_max": -5.109284400939941, "copy_logits_min": -750000064.0, "copy_num_tokens": 510.5, "epoch": 1.3641562420219555, "gen_logits_max": 5.294972896575928, "gen_logits_mean": -15.108895301818848, "gen_logits_min": -27.662216186523438, "gen_logits_std": 3.143439531326294, "gen_loss": 0.26721319556236267, "grad_norm": 0.3918569213524061, "learning_rate": 2.2493052631578945e-05, "loss": 0.276, "mean_copy_accuracy": 0.9958977848291397, "mean_gen_accuracy": 0.8779753297567368, "mean_token_accuracy": 0.9066581726074219, "num_tokens": 727007922.0, "sample_num_tokens": 8423.5, "step": 6679, "total_num_tokens": 727041616.0, "z_loss": 0.0004517938068602234 }, { "copy_logits_max": -3.0648257732391357, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.5625, "epoch": 1.3643604799591524, "gen_logits_max": 3.7057361602783203, "gen_logits_mean": -16.566740036010742, "gen_logits_min": -29.500118255615234, "gen_logits_std": 3.188292980194092, "gen_loss": 0.25372424721717834, "grad_norm": 0.3948684683611697, "learning_rate": 2.2491789473684213e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9959991723299026, "mean_gen_accuracy": 0.8690597862005234, "mean_token_accuracy": 0.9013424813747406, "num_tokens": 727269864.0, "sample_num_tokens": 9209.5, "step": 6680, "total_num_tokens": 727306702.0, "z_loss": 0.0005100027192384005 }, { "copy_logits_max": -6.071366310119629, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.4375, "epoch": 1.3645647178963491, "gen_logits_max": 5.133767127990723, "gen_logits_mean": -14.696762084960938, "gen_logits_min": -27.125911712646484, "gen_logits_std": 3.0943360328674316, "gen_loss": 0.278494656085968, "grad_norm": 0.34634684955716405, "learning_rate": 2.2490526315789477e-05, "loss": 0.2808, "mean_copy_accuracy": 0.997021421790123, "mean_gen_accuracy": 0.8742827475070953, "mean_token_accuracy": 0.9058976322412491, "num_tokens": 727559518.0, "sample_num_tokens": 8255.5, "step": 6681, "total_num_tokens": 727592540.0, "z_loss": 0.0005610206862911582 }, { "copy_logits_max": -0.8745014071464539, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.8125, "epoch": 1.364768955833546, "gen_logits_max": 3.4596915245056152, "gen_logits_mean": -16.696002960205078, "gen_logits_min": -29.465599060058594, "gen_logits_std": 3.158743143081665, "gen_loss": 0.2773420214653015, "grad_norm": 0.35993912989321486, "learning_rate": 2.2489263157894738e-05, "loss": 0.2714, "mean_copy_accuracy": 0.997153714299202, "mean_gen_accuracy": 0.874594122171402, "mean_token_accuracy": 0.9069768935441971, "num_tokens": 727823676.0, "sample_num_tokens": 7931.0, "step": 6682, "total_num_tokens": 727855400.0, "z_loss": 0.0006066864589229226 }, { "copy_logits_max": -1.0855793952941895, "copy_logits_min": -750000000.0, "copy_num_tokens": 651.0, "epoch": 1.364973193770743, "gen_logits_max": 3.6471152305603027, "gen_logits_mean": -16.056781768798828, "gen_logits_min": -29.319318771362305, "gen_logits_std": 3.1807444095611572, "gen_loss": 0.2603886127471924, "grad_norm": 0.37000165914510796, "learning_rate": 2.2488000000000002e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9970084428787231, "mean_gen_accuracy": 0.8713469803333282, "mean_token_accuracy": 0.9045733511447906, "num_tokens": 728123738.0, "sample_num_tokens": 9043.0, "step": 6683, "total_num_tokens": 728159910.0, "z_loss": 0.0006134625873528421 }, { "copy_logits_max": -5.119220733642578, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.375, "epoch": 1.3651774317079397, "gen_logits_max": 4.346584320068359, "gen_logits_mean": -17.206693649291992, "gen_logits_min": -29.39435577392578, "gen_logits_std": 3.1352627277374268, "gen_loss": 0.3142400085926056, "grad_norm": 0.3637798278427341, "learning_rate": 2.2486736842105263e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9958250373601913, "mean_gen_accuracy": 0.8769606947898865, "mean_token_accuracy": 0.9006487280130386, "num_tokens": 728378418.0, "sample_num_tokens": 7099.5, "step": 6684, "total_num_tokens": 728406816.0, "z_loss": 0.0006315921200439334 }, { "copy_logits_max": -4.016844272613525, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.125, "epoch": 1.3653816696451366, "gen_logits_max": 4.4887614250183105, "gen_logits_mean": -16.022550582885742, "gen_logits_min": -28.546783447265625, "gen_logits_std": 3.1311957836151123, "gen_loss": 0.2984945774078369, "grad_norm": 0.37378627424114247, "learning_rate": 2.2485473684210527e-05, "loss": 0.267, "mean_copy_accuracy": 0.9954561591148376, "mean_gen_accuracy": 0.8777254223823547, "mean_token_accuracy": 0.9081571847200394, "num_tokens": 728638072.0, "sample_num_tokens": 8219.5, "step": 6685, "total_num_tokens": 728670950.0, "z_loss": 0.0006172485882416368 }, { "copy_logits_max": -5.0651044845581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.3125, "epoch": 1.3655859075823333, "gen_logits_max": 4.858393669128418, "gen_logits_mean": -16.228683471679688, "gen_logits_min": -29.299060821533203, "gen_logits_std": 3.1563076972961426, "gen_loss": 0.2508464455604553, "grad_norm": 0.4266164671102773, "learning_rate": 2.248421052631579e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9959604442119598, "mean_gen_accuracy": 0.8759787976741791, "mean_token_accuracy": 0.9034961611032486, "num_tokens": 728902388.0, "sample_num_tokens": 8718.0, "step": 6686, "total_num_tokens": 728937260.0, "z_loss": 0.0005435456987470388 }, { "copy_logits_max": -3.5618979930877686, "copy_logits_min": -750000064.0, "copy_num_tokens": 554.625, "epoch": 1.3657901455195303, "gen_logits_max": 4.15750789642334, "gen_logits_mean": -15.86384391784668, "gen_logits_min": -29.113483428955078, "gen_logits_std": 3.166091203689575, "gen_loss": 0.2263135015964508, "grad_norm": 0.3823564184511836, "learning_rate": 2.2482947368421053e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9966264516115189, "mean_gen_accuracy": 0.8788603842258453, "mean_token_accuracy": 0.9111826419830322, "num_tokens": 729162394.0, "sample_num_tokens": 7688.0, "step": 6687, "total_num_tokens": 729193146.0, "z_loss": 0.0004662997380364686 }, { "copy_logits_max": -4.77948522567749, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.875, "epoch": 1.365994383456727, "gen_logits_max": 4.958887577056885, "gen_logits_mean": -16.980979919433594, "gen_logits_min": -28.948589324951172, "gen_logits_std": 3.1487016677856445, "gen_loss": 0.2802588939666748, "grad_norm": 0.3865492433218408, "learning_rate": 2.2481684210526317e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9967930018901825, "mean_gen_accuracy": 0.8819806277751923, "mean_token_accuracy": 0.904808297753334, "num_tokens": 729419542.0, "sample_num_tokens": 8511.5, "step": 6688, "total_num_tokens": 729453588.0, "z_loss": 0.0004887895192950964 }, { "copy_logits_max": -3.8350656032562256, "copy_logits_min": -687500032.0, "copy_num_tokens": 473.9375, "epoch": 1.366198621393924, "gen_logits_max": 4.28092098236084, "gen_logits_mean": -16.768693923950195, "gen_logits_min": -29.568078994750977, "gen_logits_std": 3.215728282928467, "gen_loss": 0.23716993629932404, "grad_norm": 0.38285479842566794, "learning_rate": 2.248042105263158e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9968631267547607, "mean_gen_accuracy": 0.8794556707143784, "mean_token_accuracy": 0.9095176756381989, "num_tokens": 729683132.0, "sample_num_tokens": 7699.0, "step": 6689, "total_num_tokens": 729713928.0, "z_loss": 0.00047607708256691694 }, { "copy_logits_max": -5.269695281982422, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.75, "epoch": 1.3664028593311208, "gen_logits_max": 4.292314529418945, "gen_logits_mean": -16.889846801757812, "gen_logits_min": -29.241323471069336, "gen_logits_std": 3.2234420776367188, "gen_loss": 0.27190208435058594, "grad_norm": 0.40218181688608845, "learning_rate": 2.2479157894736842e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9969985336065292, "mean_gen_accuracy": 0.8713219314813614, "mean_token_accuracy": 0.9036803692579269, "num_tokens": 729948859.0, "sample_num_tokens": 9087.75, "step": 6690, "total_num_tokens": 729985210.0, "z_loss": 0.00046004768228158355 }, { "copy_logits_max": -5.418791770935059, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.375, "epoch": 1.3666070972683175, "gen_logits_max": 4.577645778656006, "gen_logits_mean": -15.384021759033203, "gen_logits_min": -27.576950073242188, "gen_logits_std": 3.0936219692230225, "gen_loss": 0.27835798263549805, "grad_norm": 0.41281994475689837, "learning_rate": 2.2477894736842107e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9960556030273438, "mean_gen_accuracy": 0.8720668852329254, "mean_token_accuracy": 0.9012719243764877, "num_tokens": 730213065.0, "sample_num_tokens": 9005.25, "step": 6691, "total_num_tokens": 730249086.0, "z_loss": 0.00044279833673499525 }, { "copy_logits_max": -6.364692687988281, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.9375, "epoch": 1.3668113352055145, "gen_logits_max": 2.897069215774536, "gen_logits_mean": -18.401988983154297, "gen_logits_min": -30.417652130126953, "gen_logits_std": 3.2011375427246094, "gen_loss": 0.27024155855178833, "grad_norm": 0.436603862387537, "learning_rate": 2.2476631578947367e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9965919852256775, "mean_gen_accuracy": 0.8767849504947662, "mean_token_accuracy": 0.9068358093500137, "num_tokens": 730482482.0, "sample_num_tokens": 9367.0, "step": 6692, "total_num_tokens": 730519950.0, "z_loss": 0.00040818919660523534 }, { "copy_logits_max": -2.2912960052490234, "copy_logits_min": -687500032.0, "copy_num_tokens": 645.8125, "epoch": 1.3670155731427114, "gen_logits_max": 3.7035603523254395, "gen_logits_mean": -15.865401268005371, "gen_logits_min": -29.165096282958984, "gen_logits_std": 3.18401837348938, "gen_loss": 0.2728267014026642, "grad_norm": 0.3630771171310604, "learning_rate": 2.2475368421052632e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9970572292804718, "mean_gen_accuracy": 0.8713343441486359, "mean_token_accuracy": 0.9068786948919296, "num_tokens": 730775330.0, "sample_num_tokens": 9569.0, "step": 6693, "total_num_tokens": 730813606.0, "z_loss": 0.0005500412080436945 }, { "copy_logits_max": -2.4372568130493164, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.3125, "epoch": 1.367219811079908, "gen_logits_max": 3.3005623817443848, "gen_logits_mean": -17.250078201293945, "gen_logits_min": -29.489837646484375, "gen_logits_std": 3.2048912048339844, "gen_loss": 0.276702880859375, "grad_norm": 0.38487537304734454, "learning_rate": 2.2474105263157896e-05, "loss": 0.272, "mean_copy_accuracy": 0.9952654838562012, "mean_gen_accuracy": 0.881240576505661, "mean_token_accuracy": 0.9073967337608337, "num_tokens": 731044989.0, "sample_num_tokens": 7674.25, "step": 6694, "total_num_tokens": 731075686.0, "z_loss": 0.0005697881570085883 }, { "copy_logits_max": -0.583114504814148, "copy_logits_min": -750000064.0, "copy_num_tokens": 348.375, "epoch": 1.3674240490171048, "gen_logits_max": 4.394961357116699, "gen_logits_mean": -15.383079528808594, "gen_logits_min": -27.687450408935547, "gen_logits_std": 3.1586568355560303, "gen_loss": 0.2695106267929077, "grad_norm": 0.39309356900803477, "learning_rate": 2.2472842105263157e-05, "loss": 0.2706, "mean_copy_accuracy": 0.997284546494484, "mean_gen_accuracy": 0.879443809390068, "mean_token_accuracy": 0.9073254466056824, "num_tokens": 731294936.0, "sample_num_tokens": 6735.0, "step": 6695, "total_num_tokens": 731321876.0, "z_loss": 0.0004939897335134447 }, { "copy_logits_max": -4.212913513183594, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.1875, "epoch": 1.3676282869543017, "gen_logits_max": 3.0131235122680664, "gen_logits_mean": -18.2382869720459, "gen_logits_min": -30.643707275390625, "gen_logits_std": 3.2199127674102783, "gen_loss": 0.2740252912044525, "grad_norm": 0.3835839620337205, "learning_rate": 2.247157894736842e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9968830049037933, "mean_gen_accuracy": 0.8757100105285645, "mean_token_accuracy": 0.9074010998010635, "num_tokens": 731544567.0, "sample_num_tokens": 7126.75, "step": 6696, "total_num_tokens": 731573074.0, "z_loss": 0.0004908213741146028 }, { "copy_logits_max": -5.51947021484375, "copy_logits_min": -750000064.0, "copy_num_tokens": 560.625, "epoch": 1.3678325248914986, "gen_logits_max": 3.325237274169922, "gen_logits_mean": -17.943706512451172, "gen_logits_min": -29.973594665527344, "gen_logits_std": 3.186958074569702, "gen_loss": 0.27464255690574646, "grad_norm": 0.3551664248061298, "learning_rate": 2.2470315789473686e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9961473792791367, "mean_gen_accuracy": 0.879219576716423, "mean_token_accuracy": 0.9077452421188354, "num_tokens": 731803606.0, "sample_num_tokens": 8984.5, "step": 6697, "total_num_tokens": 731839544.0, "z_loss": 0.0004662882420234382 }, { "copy_logits_max": -3.6566050052642822, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.8125, "epoch": 1.3680367628286954, "gen_logits_max": 1.5679556131362915, "gen_logits_mean": -19.02338218688965, "gen_logits_min": -30.853595733642578, "gen_logits_std": 3.241008758544922, "gen_loss": 0.2506922483444214, "grad_norm": 0.36136941990488586, "learning_rate": 2.246905263157895e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9967627972364426, "mean_gen_accuracy": 0.8760139048099518, "mean_token_accuracy": 0.9079925268888474, "num_tokens": 732092782.0, "sample_num_tokens": 8882.5, "step": 6698, "total_num_tokens": 732128312.0, "z_loss": 0.00042196031427010894 }, { "copy_logits_max": -4.391933441162109, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.75, "epoch": 1.3682410007658923, "gen_logits_max": 4.125832557678223, "gen_logits_mean": -16.097665786743164, "gen_logits_min": -28.106544494628906, "gen_logits_std": 3.1597900390625, "gen_loss": 0.28018444776535034, "grad_norm": 0.36475075194233814, "learning_rate": 2.246778947368421e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9964814633131027, "mean_gen_accuracy": 0.8741450160741806, "mean_token_accuracy": 0.9090965986251831, "num_tokens": 732400799.0, "sample_num_tokens": 7688.25, "step": 6699, "total_num_tokens": 732431552.0, "z_loss": 0.0004742882738355547 }, { "copy_logits_max": -3.305042028427124, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.0625, "epoch": 1.3684452387030892, "gen_logits_max": 5.531525611877441, "gen_logits_mean": -13.459856033325195, "gen_logits_min": -25.77132797241211, "gen_logits_std": 3.1157655715942383, "gen_loss": 0.2609197199344635, "grad_norm": 0.3904789397052703, "learning_rate": 2.2466526315789475e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9972957372665405, "mean_gen_accuracy": 0.8786014467477798, "mean_token_accuracy": 0.9094521403312683, "num_tokens": 732666992.0, "sample_num_tokens": 9421.0, "step": 6700, "total_num_tokens": 732704676.0, "z_loss": 0.00042762368684634566 }, { "copy_logits_max": -6.226408004760742, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.5625, "epoch": 1.368649476640286, "gen_logits_max": 3.8858375549316406, "gen_logits_mean": -16.254859924316406, "gen_logits_min": -28.712839126586914, "gen_logits_std": 3.1700663566589355, "gen_loss": 0.2761697769165039, "grad_norm": 0.3605419483481687, "learning_rate": 2.2465263157894736e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9968390464782715, "mean_gen_accuracy": 0.873927965760231, "mean_token_accuracy": 0.9019447267055511, "num_tokens": 732929828.0, "sample_num_tokens": 8621.5, "step": 6701, "total_num_tokens": 732964314.0, "z_loss": 0.00043139472836628556 }, { "copy_logits_max": -6.149443626403809, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.0625, "epoch": 1.3688537145774828, "gen_logits_max": 4.284529685974121, "gen_logits_mean": -17.27135467529297, "gen_logits_min": -29.61418914794922, "gen_logits_std": 3.1880016326904297, "gen_loss": 0.2486991137266159, "grad_norm": 0.3637711054542257, "learning_rate": 2.2464e-05, "loss": 0.267, "mean_copy_accuracy": 0.9957831054925919, "mean_gen_accuracy": 0.8846425563097, "mean_token_accuracy": 0.9084748178720474, "num_tokens": 733200976.0, "sample_num_tokens": 8768.5, "step": 6702, "total_num_tokens": 733236050.0, "z_loss": 0.00043389189522713423 }, { "copy_logits_max": -4.491937637329102, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.25, "epoch": 1.3690579525146795, "gen_logits_max": 3.5314207077026367, "gen_logits_mean": -17.03321075439453, "gen_logits_min": -30.439565658569336, "gen_logits_std": 3.203822612762451, "gen_loss": 0.2691025733947754, "grad_norm": 0.392961301238933, "learning_rate": 2.246273684210526e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9964902400970459, "mean_gen_accuracy": 0.87358358502388, "mean_token_accuracy": 0.9024982750415802, "num_tokens": 733459475.0, "sample_num_tokens": 9121.75, "step": 6703, "total_num_tokens": 733495962.0, "z_loss": 0.00043530817492865026 }, { "copy_logits_max": -5.054203987121582, "copy_logits_min": -750000000.0, "copy_num_tokens": 655.0, "epoch": 1.3692621904518765, "gen_logits_max": 2.983395576477051, "gen_logits_mean": -16.79422950744629, "gen_logits_min": -29.196807861328125, "gen_logits_std": 3.2048323154449463, "gen_loss": 0.24774211645126343, "grad_norm": 0.3587442218436, "learning_rate": 2.2461473684210526e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9978997856378555, "mean_gen_accuracy": 0.8782128244638443, "mean_token_accuracy": 0.9118143618106842, "num_tokens": 733730613.0, "sample_num_tokens": 9549.75, "step": 6704, "total_num_tokens": 733768812.0, "z_loss": 0.00042868792661465704 }, { "copy_logits_max": -5.791421413421631, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.875, "epoch": 1.3694664283890732, "gen_logits_max": 4.595706939697266, "gen_logits_mean": -15.811979293823242, "gen_logits_min": -28.27846908569336, "gen_logits_std": 3.1471078395843506, "gen_loss": 0.3025254011154175, "grad_norm": 0.36211063158951745, "learning_rate": 2.246021052631579e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9962785392999649, "mean_gen_accuracy": 0.8705796003341675, "mean_token_accuracy": 0.8997969180345535, "num_tokens": 734009127.0, "sample_num_tokens": 7321.75, "step": 6705, "total_num_tokens": 734038414.0, "z_loss": 0.00048078232794068754 }, { "copy_logits_max": -3.4471030235290527, "copy_logits_min": -750000000.0, "copy_num_tokens": 730.375, "epoch": 1.36967066632627, "gen_logits_max": 3.2013020515441895, "gen_logits_mean": -16.4896183013916, "gen_logits_min": -29.38298797607422, "gen_logits_std": 3.2172129154205322, "gen_loss": 0.2310936599969864, "grad_norm": 0.3705061998597006, "learning_rate": 2.2458947368421054e-05, "loss": 0.2515, "mean_copy_accuracy": 0.9964343905448914, "mean_gen_accuracy": 0.8823576271533966, "mean_token_accuracy": 0.9146556258201599, "num_tokens": 734279397.0, "sample_num_tokens": 9314.75, "step": 6706, "total_num_tokens": 734316656.0, "z_loss": 0.00040598976192995906 }, { "copy_logits_max": -5.081070899963379, "copy_logits_min": -750000064.0, "copy_num_tokens": 714.0, "epoch": 1.369874904263467, "gen_logits_max": 3.6922292709350586, "gen_logits_mean": -16.13949966430664, "gen_logits_min": -29.13705825805664, "gen_logits_std": 3.194657802581787, "gen_loss": 0.23790888488292694, "grad_norm": 0.3579979415480822, "learning_rate": 2.2457684210526315e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9963680058717728, "mean_gen_accuracy": 0.8722442984580994, "mean_token_accuracy": 0.9040418565273285, "num_tokens": 734566833.0, "sample_num_tokens": 9539.25, "step": 6707, "total_num_tokens": 734604990.0, "z_loss": 0.00046965613728389144 }, { "copy_logits_max": -5.335805416107178, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.3125, "epoch": 1.3700791422006637, "gen_logits_max": 4.0088653564453125, "gen_logits_mean": -16.34162139892578, "gen_logits_min": -28.76031494140625, "gen_logits_std": 3.196164846420288, "gen_loss": 0.2648160755634308, "grad_norm": 0.3730453495074888, "learning_rate": 2.245642105263158e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9965103268623352, "mean_gen_accuracy": 0.8778677582740784, "mean_token_accuracy": 0.9052699357271194, "num_tokens": 734813514.0, "sample_num_tokens": 9035.5, "step": 6708, "total_num_tokens": 734849656.0, "z_loss": 0.0004270537756383419 }, { "copy_logits_max": -4.2284932136535645, "copy_logits_min": -750000064.0, "copy_num_tokens": 503.8125, "epoch": 1.3702833801378607, "gen_logits_max": 5.088080883026123, "gen_logits_mean": -15.504565238952637, "gen_logits_min": -28.749740600585938, "gen_logits_std": 3.1985249519348145, "gen_loss": 0.2625012695789337, "grad_norm": 0.3837683368069266, "learning_rate": 2.2455157894736844e-05, "loss": 0.2743, "mean_copy_accuracy": 0.996484637260437, "mean_gen_accuracy": 0.8748973608016968, "mean_token_accuracy": 0.9067651778459549, "num_tokens": 735079008.0, "sample_num_tokens": 7748.5, "step": 6709, "total_num_tokens": 735110002.0, "z_loss": 0.0004437660682015121 }, { "copy_logits_max": -6.405638217926025, "copy_logits_min": -750000064.0, "copy_num_tokens": 434.375, "epoch": 1.3704876180750574, "gen_logits_max": 4.622840881347656, "gen_logits_mean": -15.318316459655762, "gen_logits_min": -27.39086151123047, "gen_logits_std": 3.138180732727051, "gen_loss": 0.27919286489486694, "grad_norm": 0.39952701360598386, "learning_rate": 2.2453894736842105e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9962769448757172, "mean_gen_accuracy": 0.8746105879545212, "mean_token_accuracy": 0.9051224738359451, "num_tokens": 735323508.0, "sample_num_tokens": 7688.5, "step": 6710, "total_num_tokens": 735354262.0, "z_loss": 0.00045592949027195573 }, { "copy_logits_max": -5.650670051574707, "copy_logits_min": -750000064.0, "copy_num_tokens": 532.3125, "epoch": 1.3706918560122543, "gen_logits_max": 5.173572540283203, "gen_logits_mean": -14.83300495147705, "gen_logits_min": -27.37139129638672, "gen_logits_std": 3.1438851356506348, "gen_loss": 0.26850202679634094, "grad_norm": 0.4321133593557741, "learning_rate": 2.245263157894737e-05, "loss": 0.2651, "mean_copy_accuracy": 0.9974299967288971, "mean_gen_accuracy": 0.8778633326292038, "mean_token_accuracy": 0.90756855905056, "num_tokens": 735575041.0, "sample_num_tokens": 8669.75, "step": 6711, "total_num_tokens": 735609720.0, "z_loss": 0.0004638243990484625 }, { "copy_logits_max": -5.558497428894043, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.625, "epoch": 1.370896093949451, "gen_logits_max": 5.0767412185668945, "gen_logits_mean": -16.35609245300293, "gen_logits_min": -29.12122917175293, "gen_logits_std": 3.136707305908203, "gen_loss": 0.30912333726882935, "grad_norm": 0.359876471976868, "learning_rate": 2.245136842105263e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9966216534376144, "mean_gen_accuracy": 0.8786201626062393, "mean_token_accuracy": 0.9053513556718826, "num_tokens": 735845689.0, "sample_num_tokens": 8602.25, "step": 6712, "total_num_tokens": 735880098.0, "z_loss": 0.0005160621367394924 }, { "copy_logits_max": -5.528069972991943, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.8125, "epoch": 1.371100331886648, "gen_logits_max": 4.038203239440918, "gen_logits_mean": -16.476604461669922, "gen_logits_min": -30.136293411254883, "gen_logits_std": 3.179231882095337, "gen_loss": 0.25380629301071167, "grad_norm": 0.3602784622220922, "learning_rate": 2.2450105263157898e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9968759566545486, "mean_gen_accuracy": 0.8741002082824707, "mean_token_accuracy": 0.909705862402916, "num_tokens": 736140238.0, "sample_num_tokens": 9093.0, "step": 6713, "total_num_tokens": 736176610.0, "z_loss": 0.00048748342669568956 }, { "copy_logits_max": -7.085352897644043, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.0625, "epoch": 1.3713045698238449, "gen_logits_max": 4.11213493347168, "gen_logits_mean": -16.938091278076172, "gen_logits_min": -30.107624053955078, "gen_logits_std": 3.1619062423706055, "gen_loss": 0.28251874446868896, "grad_norm": 0.38716555732673075, "learning_rate": 2.244884210526316e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9965365082025528, "mean_gen_accuracy": 0.8656614124774933, "mean_token_accuracy": 0.8974844366312027, "num_tokens": 736432142.0, "sample_num_tokens": 8498.5, "step": 6714, "total_num_tokens": 736466136.0, "z_loss": 0.0005200974410399795 }, { "copy_logits_max": -6.097019672393799, "copy_logits_min": -687500032.0, "copy_num_tokens": 422.9375, "epoch": 1.3715088077610416, "gen_logits_max": 3.962350368499756, "gen_logits_mean": -16.72512435913086, "gen_logits_min": -29.61334991455078, "gen_logits_std": 3.169900894165039, "gen_loss": 0.2891843318939209, "grad_norm": 0.3836344868491448, "learning_rate": 2.2447578947368423e-05, "loss": 0.3028, "mean_copy_accuracy": 0.9960516691207886, "mean_gen_accuracy": 0.8665148466825485, "mean_token_accuracy": 0.8981138914823532, "num_tokens": 736693999.0, "sample_num_tokens": 8176.25, "step": 6715, "total_num_tokens": 736726704.0, "z_loss": 0.0005588968051597476 }, { "copy_logits_max": -6.986621856689453, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.4375, "epoch": 1.3717130456982385, "gen_logits_max": 3.866499900817871, "gen_logits_mean": -15.992600440979004, "gen_logits_min": -29.451290130615234, "gen_logits_std": 3.1356301307678223, "gen_loss": 0.29853248596191406, "grad_norm": 0.39151530033233106, "learning_rate": 2.2446315789473684e-05, "loss": 0.3073, "mean_copy_accuracy": 0.9969941526651382, "mean_gen_accuracy": 0.8666238635778427, "mean_token_accuracy": 0.8976694643497467, "num_tokens": 736954851.0, "sample_num_tokens": 7683.25, "step": 6716, "total_num_tokens": 736985584.0, "z_loss": 0.0005310600972734392 }, { "copy_logits_max": -7.809229850769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.75, "epoch": 1.3719172836354354, "gen_logits_max": 5.10123348236084, "gen_logits_mean": -14.185486793518066, "gen_logits_min": -28.553489685058594, "gen_logits_std": 3.1208934783935547, "gen_loss": 0.2407979518175125, "grad_norm": 0.3846234981088392, "learning_rate": 2.2445052631578948e-05, "loss": 0.28, "mean_copy_accuracy": 0.9964329153299332, "mean_gen_accuracy": 0.8736184686422348, "mean_token_accuracy": 0.9050099551677704, "num_tokens": 737234973.0, "sample_num_tokens": 9023.75, "step": 6717, "total_num_tokens": 737271068.0, "z_loss": 0.0004459256597328931 }, { "copy_logits_max": -8.395092010498047, "copy_logits_min": -750000000.0, "copy_num_tokens": 290.3125, "epoch": 1.3721215215726321, "gen_logits_max": 4.422150135040283, "gen_logits_mean": -16.590709686279297, "gen_logits_min": -29.303524017333984, "gen_logits_std": 3.126812219619751, "gen_loss": 0.30620336532592773, "grad_norm": 0.43433526560634866, "learning_rate": 2.244378947368421e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9963182061910629, "mean_gen_accuracy": 0.8749524652957916, "mean_token_accuracy": 0.9014682173728943, "num_tokens": 737496805.0, "sample_num_tokens": 6823.25, "step": 6718, "total_num_tokens": 737524098.0, "z_loss": 0.0005071592167951167 }, { "copy_logits_max": -7.522749900817871, "copy_logits_min": -750000064.0, "copy_num_tokens": 358.0, "epoch": 1.3723257595098288, "gen_logits_max": 4.492124557495117, "gen_logits_mean": -16.634693145751953, "gen_logits_min": -29.445636749267578, "gen_logits_std": 3.152099847793579, "gen_loss": 0.30283892154693604, "grad_norm": 0.3888989185551086, "learning_rate": 2.2442526315789473e-05, "loss": 0.2728, "mean_copy_accuracy": 0.9969287216663361, "mean_gen_accuracy": 0.8791804611682892, "mean_token_accuracy": 0.9077293127775192, "num_tokens": 737745793.0, "sample_num_tokens": 7246.75, "step": 6719, "total_num_tokens": 737774780.0, "z_loss": 0.0005820424994453788 }, { "copy_logits_max": -6.569192409515381, "copy_logits_min": -750000000.0, "copy_num_tokens": 687.9375, "epoch": 1.3725299974470258, "gen_logits_max": 2.44356107711792, "gen_logits_mean": -17.2882022857666, "gen_logits_min": -30.50365447998047, "gen_logits_std": 3.2147045135498047, "gen_loss": 0.21608875691890717, "grad_norm": 0.37594000375045916, "learning_rate": 2.2441263157894734e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9973011463880539, "mean_gen_accuracy": 0.879218652844429, "mean_token_accuracy": 0.9074610024690628, "num_tokens": 738006608.0, "sample_num_tokens": 9942.0, "step": 6720, "total_num_tokens": 738046376.0, "z_loss": 0.00041519670048728585 }, { "copy_logits_max": -7.3903303146362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.625, "epoch": 1.3727342353842227, "gen_logits_max": 4.0717291831970215, "gen_logits_mean": -16.158000946044922, "gen_logits_min": -28.61241912841797, "gen_logits_std": 3.1842079162597656, "gen_loss": 0.2476559281349182, "grad_norm": 0.33923025780269245, "learning_rate": 2.2440000000000002e-05, "loss": 0.255, "mean_copy_accuracy": 0.9961644858121872, "mean_gen_accuracy": 0.8906560838222504, "mean_token_accuracy": 0.9131584763526917, "num_tokens": 738278699.0, "sample_num_tokens": 7379.75, "step": 6721, "total_num_tokens": 738308218.0, "z_loss": 0.00047573045594617724 }, { "copy_logits_max": -7.136875152587891, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.5, "epoch": 1.3729384733214194, "gen_logits_max": 3.9067063331604004, "gen_logits_mean": -17.09090232849121, "gen_logits_min": -29.32300567626953, "gen_logits_std": 3.1666336059570312, "gen_loss": 0.30319979786872864, "grad_norm": 0.33120382248800284, "learning_rate": 2.2438736842105266e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9972143024206161, "mean_gen_accuracy": 0.877022460103035, "mean_token_accuracy": 0.9088491350412369, "num_tokens": 738572667.0, "sample_num_tokens": 7359.25, "step": 6722, "total_num_tokens": 738602104.0, "z_loss": 0.0005079189431853592 }, { "copy_logits_max": -4.500680923461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.8125, "epoch": 1.3731427112586163, "gen_logits_max": 3.2289681434631348, "gen_logits_mean": -17.261980056762695, "gen_logits_min": -29.816543579101562, "gen_logits_std": 3.2026824951171875, "gen_loss": 0.2593463063240051, "grad_norm": 0.3876586041902117, "learning_rate": 2.2437473684210527e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9966641962528229, "mean_gen_accuracy": 0.8756909221410751, "mean_token_accuracy": 0.9056698828935623, "num_tokens": 738838818.0, "sample_num_tokens": 8786.0, "step": 6723, "total_num_tokens": 738873962.0, "z_loss": 0.0004796079592779279 }, { "copy_logits_max": -4.910567283630371, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.1875, "epoch": 1.3733469491958132, "gen_logits_max": 4.212125778198242, "gen_logits_mean": -15.44566535949707, "gen_logits_min": -27.899330139160156, "gen_logits_std": 3.1631035804748535, "gen_loss": 0.2710823118686676, "grad_norm": 0.37647780474316445, "learning_rate": 2.243621052631579e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9952948093414307, "mean_gen_accuracy": 0.8837287425994873, "mean_token_accuracy": 0.9095996469259262, "num_tokens": 739130233.0, "sample_num_tokens": 8052.25, "step": 6724, "total_num_tokens": 739162442.0, "z_loss": 0.0004986291751265526 }, { "copy_logits_max": -6.819965362548828, "copy_logits_min": -687500096.0, "copy_num_tokens": 422.875, "epoch": 1.37355118713301, "gen_logits_max": 3.558461904525757, "gen_logits_mean": -17.215015411376953, "gen_logits_min": -29.354209899902344, "gen_logits_std": 3.1656174659729004, "gen_loss": 0.28860706090927124, "grad_norm": 0.33208898687507155, "learning_rate": 2.2434947368421052e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9970503598451614, "mean_gen_accuracy": 0.8770656734704971, "mean_token_accuracy": 0.9062646329402924, "num_tokens": 739414246.0, "sample_num_tokens": 7987.0, "step": 6725, "total_num_tokens": 739446194.0, "z_loss": 0.000504153489600867 }, { "copy_logits_max": -6.035181999206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.25, "epoch": 1.3737554250702069, "gen_logits_max": 3.756838321685791, "gen_logits_mean": -16.802635192871094, "gen_logits_min": -29.147602081298828, "gen_logits_std": 3.1761369705200195, "gen_loss": 0.276664137840271, "grad_norm": 0.3641663872669676, "learning_rate": 2.2433684210526317e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9966200590133667, "mean_gen_accuracy": 0.8733881860971451, "mean_token_accuracy": 0.9043193757534027, "num_tokens": 739684096.0, "sample_num_tokens": 8291.0, "step": 6726, "total_num_tokens": 739717260.0, "z_loss": 0.0004743371391668916 }, { "copy_logits_max": -7.724941730499268, "copy_logits_min": -750000000.0, "copy_num_tokens": 265.1875, "epoch": 1.3739596630074036, "gen_logits_max": 4.189522743225098, "gen_logits_mean": -16.985675811767578, "gen_logits_min": -28.750812530517578, "gen_logits_std": 3.1415762901306152, "gen_loss": 0.3002029061317444, "grad_norm": 0.34179292095632563, "learning_rate": 2.2432421052631578e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9954678118228912, "mean_gen_accuracy": 0.8770405054092407, "mean_token_accuracy": 0.9034305810928345, "num_tokens": 739961059.0, "sample_num_tokens": 7040.25, "step": 6727, "total_num_tokens": 739989220.0, "z_loss": 0.0005585027392953634 }, { "copy_logits_max": -4.643694877624512, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.1875, "epoch": 1.3741639009446005, "gen_logits_max": 4.526923179626465, "gen_logits_mean": -15.45306396484375, "gen_logits_min": -27.746492385864258, "gen_logits_std": 3.1313376426696777, "gen_loss": 0.2909697890281677, "grad_norm": 0.3755272675456094, "learning_rate": 2.2431157894736842e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9961089491844177, "mean_gen_accuracy": 0.8714676797389984, "mean_token_accuracy": 0.9023859351873398, "num_tokens": 740227951.0, "sample_num_tokens": 7182.75, "step": 6728, "total_num_tokens": 740256682.0, "z_loss": 0.0005075822700746357 }, { "copy_logits_max": -4.959494590759277, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.5, "epoch": 1.3743681388817972, "gen_logits_max": 3.596668004989624, "gen_logits_mean": -16.37520408630371, "gen_logits_min": -28.88525390625, "gen_logits_std": 3.18072509765625, "gen_loss": 0.24683615565299988, "grad_norm": 0.34057746414877377, "learning_rate": 2.2429894736842106e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9969318509101868, "mean_gen_accuracy": 0.8787406086921692, "mean_token_accuracy": 0.9117754548788071, "num_tokens": 740522898.0, "sample_num_tokens": 8952.0, "step": 6729, "total_num_tokens": 740558706.0, "z_loss": 0.00048686418449506164 }, { "copy_logits_max": -2.9232335090637207, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.8125, "epoch": 1.3745723768189941, "gen_logits_max": 2.8960092067718506, "gen_logits_mean": -16.865150451660156, "gen_logits_min": -29.164093017578125, "gen_logits_std": 3.160226821899414, "gen_loss": 0.26103484630584717, "grad_norm": 0.34719389567873565, "learning_rate": 2.242863157894737e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9970841109752655, "mean_gen_accuracy": 0.8752236664295197, "mean_token_accuracy": 0.9056509882211685, "num_tokens": 740793154.0, "sample_num_tokens": 8645.0, "step": 6730, "total_num_tokens": 740827734.0, "z_loss": 0.0005208937218412757 }, { "copy_logits_max": -5.152361869812012, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.6875, "epoch": 1.374776614756191, "gen_logits_max": 3.613924503326416, "gen_logits_mean": -16.00303077697754, "gen_logits_min": -28.05029296875, "gen_logits_std": 3.156282901763916, "gen_loss": 0.27059119939804077, "grad_norm": 0.3674858807940542, "learning_rate": 2.242736842105263e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9965323507785797, "mean_gen_accuracy": 0.8693500906229019, "mean_token_accuracy": 0.905803993344307, "num_tokens": 741086147.0, "sample_num_tokens": 8700.25, "step": 6731, "total_num_tokens": 741120948.0, "z_loss": 0.0004796416615135968 }, { "copy_logits_max": -3.723686695098877, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.3125, "epoch": 1.3749808526933878, "gen_logits_max": 3.2131426334381104, "gen_logits_mean": -17.30658721923828, "gen_logits_min": -29.674549102783203, "gen_logits_std": 3.176115036010742, "gen_loss": 0.26598769426345825, "grad_norm": 0.3837731645119143, "learning_rate": 2.2426105263157896e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9963806122541428, "mean_gen_accuracy": 0.8766217082738876, "mean_token_accuracy": 0.9054749608039856, "num_tokens": 741344513.0, "sample_num_tokens": 8144.75, "step": 6732, "total_num_tokens": 741377092.0, "z_loss": 0.00045152485836297274 }, { "copy_logits_max": -6.148826599121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.1875, "epoch": 1.3751850906305847, "gen_logits_max": 3.7558937072753906, "gen_logits_mean": -17.70126724243164, "gen_logits_min": -29.99243927001953, "gen_logits_std": 3.1830711364746094, "gen_loss": 0.28342509269714355, "grad_norm": 0.37462343823601374, "learning_rate": 2.2424842105263157e-05, "loss": 0.277, "mean_copy_accuracy": 0.9954623430967331, "mean_gen_accuracy": 0.8771186769008636, "mean_token_accuracy": 0.9058061242103577, "num_tokens": 741612339.0, "sample_num_tokens": 7539.25, "step": 6733, "total_num_tokens": 741642496.0, "z_loss": 0.0004802684998139739 }, { "copy_logits_max": -4.3613128662109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 222.75, "epoch": 1.3753893285677814, "gen_logits_max": 5.4510393142700195, "gen_logits_mean": -15.402676582336426, "gen_logits_min": -27.52483367919922, "gen_logits_std": 3.076221466064453, "gen_loss": 0.3398609459400177, "grad_norm": 0.3687936315675416, "learning_rate": 2.242357894736842e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9965278506278992, "mean_gen_accuracy": 0.8718041628599167, "mean_token_accuracy": 0.9007742702960968, "num_tokens": 741874123.0, "sample_num_tokens": 7399.25, "step": 6734, "total_num_tokens": 741903720.0, "z_loss": 0.0005892777116969228 }, { "copy_logits_max": -5.801309108734131, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.1875, "epoch": 1.3755935665049783, "gen_logits_max": 5.506138801574707, "gen_logits_mean": -13.90164566040039, "gen_logits_min": -25.612079620361328, "gen_logits_std": 3.0614840984344482, "gen_loss": 0.3294057548046112, "grad_norm": 0.39861038062962556, "learning_rate": 2.2422315789473685e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9966601133346558, "mean_gen_accuracy": 0.8771848827600479, "mean_token_accuracy": 0.9067059308290482, "num_tokens": 742143397.0, "sample_num_tokens": 6533.75, "step": 6735, "total_num_tokens": 742169532.0, "z_loss": 0.0005575702525675297 }, { "copy_logits_max": -4.694509029388428, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.0625, "epoch": 1.375797804442175, "gen_logits_max": 3.64577579498291, "gen_logits_mean": -16.59856605529785, "gen_logits_min": -28.825092315673828, "gen_logits_std": 3.140650749206543, "gen_loss": 0.31343138217926025, "grad_norm": 0.373809618612669, "learning_rate": 2.2421052631578946e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9969045370817184, "mean_gen_accuracy": 0.8774971663951874, "mean_token_accuracy": 0.9066435545682907, "num_tokens": 742411523.0, "sample_num_tokens": 8447.75, "step": 6736, "total_num_tokens": 742445314.0, "z_loss": 0.0005672161933034658 }, { "copy_logits_max": -4.577505111694336, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.9375, "epoch": 1.376002042379372, "gen_logits_max": 3.5524067878723145, "gen_logits_mean": -16.65311622619629, "gen_logits_min": -28.98436737060547, "gen_logits_std": 3.1289286613464355, "gen_loss": 0.3022346794605255, "grad_norm": 0.39286189519766107, "learning_rate": 2.2419789473684214e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9967681616544724, "mean_gen_accuracy": 0.8764403313398361, "mean_token_accuracy": 0.9056687653064728, "num_tokens": 742683525.0, "sample_num_tokens": 9171.25, "step": 6737, "total_num_tokens": 742720210.0, "z_loss": 0.0005087223835289478 }, { "copy_logits_max": -3.3810691833496094, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.125, "epoch": 1.3762062803165689, "gen_logits_max": 3.1284713745117188, "gen_logits_mean": -16.152481079101562, "gen_logits_min": -28.672616958618164, "gen_logits_std": 3.149974822998047, "gen_loss": 0.25643569231033325, "grad_norm": 0.37830117150690035, "learning_rate": 2.2418526315789475e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9966547340154648, "mean_gen_accuracy": 0.8737176656723022, "mean_token_accuracy": 0.9069837182760239, "num_tokens": 742964389.0, "sample_num_tokens": 9485.25, "step": 6738, "total_num_tokens": 743002330.0, "z_loss": 0.0004187833401374519 }, { "copy_logits_max": -3.410806179046631, "copy_logits_min": -750000128.0, "copy_num_tokens": 606.6875, "epoch": 1.3764105182537656, "gen_logits_max": 3.370730400085449, "gen_logits_mean": -16.635507583618164, "gen_logits_min": -28.936172485351562, "gen_logits_std": 3.18386173248291, "gen_loss": 0.23230712115764618, "grad_norm": 0.3430613336910581, "learning_rate": 2.241726315789474e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9965257346630096, "mean_gen_accuracy": 0.8795971125364304, "mean_token_accuracy": 0.9080201536417007, "num_tokens": 743244809.0, "sample_num_tokens": 9256.75, "step": 6739, "total_num_tokens": 743281836.0, "z_loss": 0.000397166091715917 }, { "copy_logits_max": -4.0785675048828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.25, "epoch": 1.3766147561909625, "gen_logits_max": 2.768681049346924, "gen_logits_mean": -17.40145492553711, "gen_logits_min": -29.740131378173828, "gen_logits_std": 3.1764254570007324, "gen_loss": 0.26240456104278564, "grad_norm": 0.37612304989313555, "learning_rate": 2.2416e-05, "loss": 0.282, "mean_copy_accuracy": 0.996732622385025, "mean_gen_accuracy": 0.8746226280927658, "mean_token_accuracy": 0.9057618677616119, "num_tokens": 743517941.0, "sample_num_tokens": 8627.25, "step": 6740, "total_num_tokens": 743552450.0, "z_loss": 0.0004398859746288508 }, { "copy_logits_max": 0.44681257009506226, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.375, "epoch": 1.3768189941281592, "gen_logits_max": 5.007102012634277, "gen_logits_mean": -14.452338218688965, "gen_logits_min": -26.9719181060791, "gen_logits_std": 3.1002793312072754, "gen_loss": 0.27217382192611694, "grad_norm": 0.35208593235908475, "learning_rate": 2.2414736842105264e-05, "loss": 0.2726, "mean_copy_accuracy": 0.996202826499939, "mean_gen_accuracy": 0.8787113726139069, "mean_token_accuracy": 0.9068233221769333, "num_tokens": 743798136.0, "sample_num_tokens": 8770.5, "step": 6741, "total_num_tokens": 743833218.0, "z_loss": 0.000507851131260395 }, { "copy_logits_max": -3.428466796875, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.3125, "epoch": 1.3770232320653562, "gen_logits_max": 4.951489448547363, "gen_logits_mean": -14.509468078613281, "gen_logits_min": -26.83725929260254, "gen_logits_std": 3.1032135486602783, "gen_loss": 0.25818923115730286, "grad_norm": 0.37748314027721863, "learning_rate": 2.2413473684210525e-05, "loss": 0.272, "mean_copy_accuracy": 0.9971991628408432, "mean_gen_accuracy": 0.874809980392456, "mean_token_accuracy": 0.9085682481527328, "num_tokens": 744077058.0, "sample_num_tokens": 8536.5, "step": 6742, "total_num_tokens": 744111204.0, "z_loss": 0.0004559827211778611 }, { "copy_logits_max": 1.47831392288208, "copy_logits_min": -750000000.0, "copy_num_tokens": 640.5625, "epoch": 1.3772274700025529, "gen_logits_max": 4.322309494018555, "gen_logits_mean": -14.005226135253906, "gen_logits_min": -27.738487243652344, "gen_logits_std": 3.111656665802002, "gen_loss": 0.25468555092811584, "grad_norm": 0.37899321554167303, "learning_rate": 2.241221052631579e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9971333593130112, "mean_gen_accuracy": 0.8711785972118378, "mean_token_accuracy": 0.9049816578626633, "num_tokens": 744330789.0, "sample_num_tokens": 8735.75, "step": 6743, "total_num_tokens": 744365732.0, "z_loss": 0.0004887086106464267 }, { "copy_logits_max": -3.125448703765869, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.8125, "epoch": 1.3774317079397498, "gen_logits_max": 4.601547718048096, "gen_logits_mean": -15.40653133392334, "gen_logits_min": -28.462135314941406, "gen_logits_std": 3.135953903198242, "gen_loss": 0.22979548573493958, "grad_norm": 0.38099126746652545, "learning_rate": 2.241094736842105e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9948477149009705, "mean_gen_accuracy": 0.8801922500133514, "mean_token_accuracy": 0.9083299338817596, "num_tokens": 744598491.0, "sample_num_tokens": 8417.75, "step": 6744, "total_num_tokens": 744632162.0, "z_loss": 0.00042521863360889256 }, { "copy_logits_max": -1.2019675970077515, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.0, "epoch": 1.3776359458769467, "gen_logits_max": 4.179912567138672, "gen_logits_mean": -15.628687858581543, "gen_logits_min": -28.32903289794922, "gen_logits_std": 3.1436519622802734, "gen_loss": 0.2729402482509613, "grad_norm": 0.3751308130127733, "learning_rate": 2.240968421052632e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9960879534482956, "mean_gen_accuracy": 0.8712495714426041, "mean_token_accuracy": 0.9040148109197617, "num_tokens": 744850735.0, "sample_num_tokens": 7847.75, "step": 6745, "total_num_tokens": 744882126.0, "z_loss": 0.0004983608960174024 }, { "copy_logits_max": -2.3949832916259766, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.75, "epoch": 1.3778401838141434, "gen_logits_max": 5.621647834777832, "gen_logits_mean": -14.541868209838867, "gen_logits_min": -26.399303436279297, "gen_logits_std": 3.1190078258514404, "gen_loss": 0.27928459644317627, "grad_norm": 0.3638564509530958, "learning_rate": 2.240842105263158e-05, "loss": 0.2587, "mean_copy_accuracy": 0.9954987615346909, "mean_gen_accuracy": 0.8828122615814209, "mean_token_accuracy": 0.909721702337265, "num_tokens": 745103001.0, "sample_num_tokens": 7137.25, "step": 6746, "total_num_tokens": 745131550.0, "z_loss": 0.0005464341375045478 }, { "copy_logits_max": -2.2162296772003174, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.375, "epoch": 1.3780444217513403, "gen_logits_max": 5.638149738311768, "gen_logits_mean": -13.968549728393555, "gen_logits_min": -25.859012603759766, "gen_logits_std": 3.112466812133789, "gen_loss": 0.2785934805870056, "grad_norm": 0.5247371553810429, "learning_rate": 2.2407157894736844e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9966911226511002, "mean_gen_accuracy": 0.8738131523132324, "mean_token_accuracy": 0.9070219546556473, "num_tokens": 745377486.0, "sample_num_tokens": 8328.5, "step": 6747, "total_num_tokens": 745410800.0, "z_loss": 0.0005451049655675888 }, { "copy_logits_max": -4.670249938964844, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.375, "epoch": 1.3782486596885373, "gen_logits_max": 4.477740287780762, "gen_logits_mean": -16.119272232055664, "gen_logits_min": -28.424345016479492, "gen_logits_std": 3.1614880561828613, "gen_loss": 0.26032471656799316, "grad_norm": 0.38552147479865034, "learning_rate": 2.2405894736842108e-05, "loss": 0.2859, "mean_copy_accuracy": 0.995631217956543, "mean_gen_accuracy": 0.8748925179243088, "mean_token_accuracy": 0.9031338840723038, "num_tokens": 745644258.0, "sample_num_tokens": 7851.5, "step": 6748, "total_num_tokens": 745675664.0, "z_loss": 0.0004933325108140707 }, { "copy_logits_max": 0.9194406270980835, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.5, "epoch": 1.378452897625734, "gen_logits_max": 5.731821060180664, "gen_logits_mean": -13.577215194702148, "gen_logits_min": -25.6717472076416, "gen_logits_std": 3.08050537109375, "gen_loss": 0.36388784646987915, "grad_norm": 0.3508778373568323, "learning_rate": 2.240463157894737e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9966390132904053, "mean_gen_accuracy": 0.8738158792257309, "mean_token_accuracy": 0.9055102914571762, "num_tokens": 745920463.0, "sample_num_tokens": 8132.25, "step": 6749, "total_num_tokens": 745952992.0, "z_loss": 0.000673145754262805 }, { "copy_logits_max": -3.039531707763672, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.5625, "epoch": 1.3786571355629307, "gen_logits_max": 5.522926330566406, "gen_logits_mean": -14.119215965270996, "gen_logits_min": -26.08087921142578, "gen_logits_std": 3.1139702796936035, "gen_loss": 0.2939522862434387, "grad_norm": 0.40503933348651916, "learning_rate": 2.2403368421052633e-05, "loss": 0.2972, "mean_copy_accuracy": 0.9962988495826721, "mean_gen_accuracy": 0.8715111315250397, "mean_token_accuracy": 0.9004198610782623, "num_tokens": 746201728.0, "sample_num_tokens": 8811.5, "step": 6750, "total_num_tokens": 746236974.0, "z_loss": 0.0005853300681337714 }, { "copy_logits_max": 2.1160073280334473, "copy_logits_min": -750000128.0, "copy_num_tokens": 631.5, "epoch": 1.3788613735001276, "gen_logits_max": 5.545828819274902, "gen_logits_mean": -13.511604309082031, "gen_logits_min": -25.73890495300293, "gen_logits_std": 3.1054065227508545, "gen_loss": 0.2587425708770752, "grad_norm": 0.3712702451878523, "learning_rate": 2.2402105263157894e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9967808872461319, "mean_gen_accuracy": 0.8714424520730972, "mean_token_accuracy": 0.8985806554555893, "num_tokens": 746462229.0, "sample_num_tokens": 9538.25, "step": 6751, "total_num_tokens": 746500382.0, "z_loss": 0.0004909363924525678 }, { "copy_logits_max": -2.1251380443573, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.3125, "epoch": 1.3790656114373245, "gen_logits_max": 3.573106527328491, "gen_logits_mean": -16.9285831451416, "gen_logits_min": -29.03423309326172, "gen_logits_std": 3.2044992446899414, "gen_loss": 0.26719188690185547, "grad_norm": 0.35334109928444196, "learning_rate": 2.240084210526316e-05, "loss": 0.272, "mean_copy_accuracy": 0.9957294166088104, "mean_gen_accuracy": 0.8784822672605515, "mean_token_accuracy": 0.9068983048200607, "num_tokens": 746733371.0, "sample_num_tokens": 8222.75, "step": 6752, "total_num_tokens": 746766262.0, "z_loss": 0.0005117229884490371 }, { "copy_logits_max": -1.6026711463928223, "copy_logits_min": -625000064.0, "copy_num_tokens": 636.1875, "epoch": 1.3792698493745212, "gen_logits_max": 4.997579097747803, "gen_logits_mean": -14.752979278564453, "gen_logits_min": -26.830211639404297, "gen_logits_std": 3.1539478302001953, "gen_loss": 0.22074785828590393, "grad_norm": 0.3695418329701395, "learning_rate": 2.239957894736842e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9969100803136826, "mean_gen_accuracy": 0.8861645460128784, "mean_token_accuracy": 0.9135522693395615, "num_tokens": 747017724.0, "sample_num_tokens": 8756.0, "step": 6753, "total_num_tokens": 747052748.0, "z_loss": 0.00042154514812864363 }, { "copy_logits_max": -1.3462493419647217, "copy_logits_min": -625000064.0, "copy_num_tokens": 558.9375, "epoch": 1.3794740873117182, "gen_logits_max": 5.102278709411621, "gen_logits_mean": -14.4714994430542, "gen_logits_min": -26.60820960998535, "gen_logits_std": 3.144322395324707, "gen_loss": 0.2705319821834564, "grad_norm": 0.34049087295662983, "learning_rate": 2.2398315789473687e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9969932734966278, "mean_gen_accuracy": 0.8824436366558075, "mean_token_accuracy": 0.911901518702507, "num_tokens": 747296577.0, "sample_num_tokens": 8322.75, "step": 6754, "total_num_tokens": 747329868.0, "z_loss": 0.0005295224254950881 }, { "copy_logits_max": -2.228328227996826, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.3125, "epoch": 1.379678325248915, "gen_logits_max": 5.82587194442749, "gen_logits_mean": -13.452512741088867, "gen_logits_min": -25.674701690673828, "gen_logits_std": 3.100696563720703, "gen_loss": 0.25921985507011414, "grad_norm": 0.3815866369564244, "learning_rate": 2.2397052631578948e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9967806190252304, "mean_gen_accuracy": 0.881200984120369, "mean_token_accuracy": 0.9102692753076553, "num_tokens": 747577635.0, "sample_num_tokens": 10991.75, "step": 6755, "total_num_tokens": 747621602.0, "z_loss": 0.00044055416947230697 }, { "copy_logits_max": -0.36508333683013916, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.875, "epoch": 1.3798825631861118, "gen_logits_max": 6.2512969970703125, "gen_logits_mean": -12.638368606567383, "gen_logits_min": -25.082195281982422, "gen_logits_std": 3.090155601501465, "gen_loss": 0.27530765533447266, "grad_norm": 0.3732978023783164, "learning_rate": 2.2395789473684212e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9964307099580765, "mean_gen_accuracy": 0.8709866255521774, "mean_token_accuracy": 0.9024176299571991, "num_tokens": 747858433.0, "sample_num_tokens": 8995.75, "step": 6756, "total_num_tokens": 747894416.0, "z_loss": 0.0004719743155874312 }, { "copy_logits_max": -3.9057912826538086, "copy_logits_min": -750000064.0, "copy_num_tokens": 241.75, "epoch": 1.3800868011233087, "gen_logits_max": 5.514004707336426, "gen_logits_mean": -15.827342987060547, "gen_logits_min": -27.62565040588379, "gen_logits_std": 3.107027530670166, "gen_loss": 0.2844182252883911, "grad_norm": 0.3693369120718976, "learning_rate": 2.2394526315789473e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9964983612298965, "mean_gen_accuracy": 0.8844162821769714, "mean_token_accuracy": 0.9086768925189972, "num_tokens": 748125724.0, "sample_num_tokens": 6958.5, "step": 6757, "total_num_tokens": 748153558.0, "z_loss": 0.0005188616341911256 }, { "copy_logits_max": -0.7085018157958984, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.3125, "epoch": 1.3802910390605054, "gen_logits_max": 4.456455230712891, "gen_logits_mean": -16.337501525878906, "gen_logits_min": -28.81161117553711, "gen_logits_std": 3.1986382007598877, "gen_loss": 0.24645860493183136, "grad_norm": 0.4184345181561386, "learning_rate": 2.2393263157894737e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9957267493009567, "mean_gen_accuracy": 0.8763441145420074, "mean_token_accuracy": 0.9039812684059143, "num_tokens": 748372691.0, "sample_num_tokens": 8108.75, "step": 6758, "total_num_tokens": 748405126.0, "z_loss": 0.00043170497519895434 }, { "copy_logits_max": -0.41592633724212646, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.0625, "epoch": 1.3804952769977024, "gen_logits_max": 5.812438011169434, "gen_logits_mean": -13.949148178100586, "gen_logits_min": -26.1456298828125, "gen_logits_std": 3.1012649536132812, "gen_loss": 0.2928716838359833, "grad_norm": 0.44619666010958603, "learning_rate": 2.2392e-05, "loss": 0.298, "mean_copy_accuracy": 0.9949943572282791, "mean_gen_accuracy": 0.8730659633874893, "mean_token_accuracy": 0.9010110944509506, "num_tokens": 748626414.0, "sample_num_tokens": 8543.5, "step": 6759, "total_num_tokens": 748660588.0, "z_loss": 0.00048786550178192556 }, { "copy_logits_max": -0.40130504965782166, "copy_logits_min": -687500032.0, "copy_num_tokens": 543.0, "epoch": 1.380699514934899, "gen_logits_max": 5.54988956451416, "gen_logits_mean": -13.504009246826172, "gen_logits_min": -26.113340377807617, "gen_logits_std": 3.099207878112793, "gen_loss": 0.29016855359077454, "grad_norm": 0.37834449033650525, "learning_rate": 2.2390736842105263e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9951380789279938, "mean_gen_accuracy": 0.8733556419610977, "mean_token_accuracy": 0.9020422995090485, "num_tokens": 748890864.0, "sample_num_tokens": 9477.0, "step": 6760, "total_num_tokens": 748928772.0, "z_loss": 0.0005441589746624231 }, { "copy_logits_max": -4.488212585449219, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.6875, "epoch": 1.380903752872096, "gen_logits_max": 3.4354913234710693, "gen_logits_mean": -16.784454345703125, "gen_logits_min": -29.127761840820312, "gen_logits_std": 3.192403793334961, "gen_loss": 0.25840744376182556, "grad_norm": 0.39521219837906424, "learning_rate": 2.2389473684210527e-05, "loss": 0.2629, "mean_copy_accuracy": 0.995670884847641, "mean_gen_accuracy": 0.8779814541339874, "mean_token_accuracy": 0.9093920737504959, "num_tokens": 749178827.0, "sample_num_tokens": 10086.75, "step": 6761, "total_num_tokens": 749219174.0, "z_loss": 0.00044056103797629476 }, { "copy_logits_max": -1.4099977016448975, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.0625, "epoch": 1.381107990809293, "gen_logits_max": 4.092823028564453, "gen_logits_mean": -15.870792388916016, "gen_logits_min": -28.074201583862305, "gen_logits_std": 3.1633691787719727, "gen_loss": 0.2534031569957733, "grad_norm": 0.38777137252531096, "learning_rate": 2.238821052631579e-05, "loss": 0.2676, "mean_copy_accuracy": 0.996819868683815, "mean_gen_accuracy": 0.8782208263874054, "mean_token_accuracy": 0.9104832857847214, "num_tokens": 749458327.0, "sample_num_tokens": 8099.25, "step": 6762, "total_num_tokens": 749490724.0, "z_loss": 0.00040655455086380243 }, { "copy_logits_max": -5.848640441894531, "copy_logits_min": -687500032.0, "copy_num_tokens": 275.0, "epoch": 1.3813122287464896, "gen_logits_max": 4.491262435913086, "gen_logits_mean": -17.148130416870117, "gen_logits_min": -29.04482650756836, "gen_logits_std": 3.1690659523010254, "gen_loss": 0.2879742383956909, "grad_norm": 0.3559037687407405, "learning_rate": 2.2386947368421056e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9966059327125549, "mean_gen_accuracy": 0.8751577436923981, "mean_token_accuracy": 0.9045419692993164, "num_tokens": 749753856.0, "sample_num_tokens": 7302.0, "step": 6763, "total_num_tokens": 749783064.0, "z_loss": 0.0004422614292707294 }, { "copy_logits_max": -5.810689926147461, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.0, "epoch": 1.3815164666836866, "gen_logits_max": 4.507828712463379, "gen_logits_mean": -16.72679901123047, "gen_logits_min": -28.574031829833984, "gen_logits_std": 3.160196304321289, "gen_loss": 0.2818678915500641, "grad_norm": 0.3610478109515147, "learning_rate": 2.2385684210526317e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9947133213281631, "mean_gen_accuracy": 0.8806889206171036, "mean_token_accuracy": 0.9068120270967484, "num_tokens": 750020810.0, "sample_num_tokens": 8629.0, "step": 6764, "total_num_tokens": 750055326.0, "z_loss": 0.0004493814194574952 }, { "copy_logits_max": -4.355772495269775, "copy_logits_min": -687500032.0, "copy_num_tokens": 522.75, "epoch": 1.3817207046208833, "gen_logits_max": 4.953548908233643, "gen_logits_mean": -14.159868240356445, "gen_logits_min": -26.895221710205078, "gen_logits_std": 3.104487657546997, "gen_loss": 0.2614936828613281, "grad_norm": 0.41601274136921085, "learning_rate": 2.238442105263158e-05, "loss": 0.2494, "mean_copy_accuracy": 0.996523842215538, "mean_gen_accuracy": 0.8872400373220444, "mean_token_accuracy": 0.9171198755502701, "num_tokens": 750311245.0, "sample_num_tokens": 8684.25, "step": 6765, "total_num_tokens": 750345982.0, "z_loss": 0.00045114915701560676 }, { "copy_logits_max": -2.6065964698791504, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.8125, "epoch": 1.3819249425580802, "gen_logits_max": 6.237252235412598, "gen_logits_mean": -13.144800186157227, "gen_logits_min": -25.521583557128906, "gen_logits_std": 3.075456380844116, "gen_loss": 0.31050097942352295, "grad_norm": 0.34636931773017293, "learning_rate": 2.2383157894736842e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9970021694898605, "mean_gen_accuracy": 0.870862677693367, "mean_token_accuracy": 0.9032147079706192, "num_tokens": 750591589.0, "sample_num_tokens": 8787.75, "step": 6766, "total_num_tokens": 750626740.0, "z_loss": 0.0005730840493924916 }, { "copy_logits_max": -3.2751827239990234, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.0, "epoch": 1.3821291804952769, "gen_logits_max": 5.735003471374512, "gen_logits_mean": -14.444788932800293, "gen_logits_min": -26.270694732666016, "gen_logits_std": 3.091189384460449, "gen_loss": 0.2922211289405823, "grad_norm": 0.383553482680371, "learning_rate": 2.2381894736842106e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9959712326526642, "mean_gen_accuracy": 0.8783803731203079, "mean_token_accuracy": 0.9086375385522842, "num_tokens": 750855311.0, "sample_num_tokens": 8622.25, "step": 6767, "total_num_tokens": 750889800.0, "z_loss": 0.0005873057525604963 }, { "copy_logits_max": -3.026233673095703, "copy_logits_min": -750000000.0, "copy_num_tokens": 699.125, "epoch": 1.3823334184324738, "gen_logits_max": 3.6928446292877197, "gen_logits_mean": -15.426750183105469, "gen_logits_min": -28.200416564941406, "gen_logits_std": 3.158931255340576, "gen_loss": 0.24640803039073944, "grad_norm": 0.37028081366590165, "learning_rate": 2.2380631578947367e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9970792979001999, "mean_gen_accuracy": 0.874025747179985, "mean_token_accuracy": 0.9096671938896179, "num_tokens": 751161263.0, "sample_num_tokens": 10621.75, "step": 6768, "total_num_tokens": 751203750.0, "z_loss": 0.0005158059066161513 }, { "copy_logits_max": -4.046804904937744, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.3125, "epoch": 1.3825376563696707, "gen_logits_max": 5.006048679351807, "gen_logits_mean": -15.508146286010742, "gen_logits_min": -27.27545738220215, "gen_logits_std": 3.1092708110809326, "gen_loss": 0.2923586368560791, "grad_norm": 0.3765606786758572, "learning_rate": 2.237936842105263e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9941657483577728, "mean_gen_accuracy": 0.8812392354011536, "mean_token_accuracy": 0.90464948117733, "num_tokens": 751425008.0, "sample_num_tokens": 8787.0, "step": 6769, "total_num_tokens": 751460156.0, "z_loss": 0.000589469273108989 }, { "copy_logits_max": 0.10408180952072144, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.1875, "epoch": 1.3827418943068674, "gen_logits_max": 4.900836944580078, "gen_logits_mean": -14.977540969848633, "gen_logits_min": -27.000389099121094, "gen_logits_std": 3.095952272415161, "gen_loss": 0.2866979241371155, "grad_norm": 0.45296425310938543, "learning_rate": 2.2378105263157896e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9956959187984467, "mean_gen_accuracy": 0.8789138942956924, "mean_token_accuracy": 0.9051593840122223, "num_tokens": 751675040.0, "sample_num_tokens": 7419.5, "step": 6770, "total_num_tokens": 751704718.0, "z_loss": 0.0005788760026916862 }, { "copy_logits_max": -2.705933094024658, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.8125, "epoch": 1.3829461322440644, "gen_logits_max": 4.15038537979126, "gen_logits_mean": -16.085403442382812, "gen_logits_min": -28.644441604614258, "gen_logits_std": 3.1453185081481934, "gen_loss": 0.2865369915962219, "grad_norm": 0.3908920452386088, "learning_rate": 2.237684210526316e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9963371306657791, "mean_gen_accuracy": 0.8669870942831039, "mean_token_accuracy": 0.9034639298915863, "num_tokens": 751941058.0, "sample_num_tokens": 7082.5, "step": 6771, "total_num_tokens": 751969388.0, "z_loss": 0.0005503186257556081 }, { "copy_logits_max": 0.24983370304107666, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.6875, "epoch": 1.3831503701812613, "gen_logits_max": 6.075589656829834, "gen_logits_mean": -13.398870468139648, "gen_logits_min": -26.15109634399414, "gen_logits_std": 3.1048591136932373, "gen_loss": 0.26177629828453064, "grad_norm": 0.4204410249206091, "learning_rate": 2.237557894736842e-05, "loss": 0.286, "mean_copy_accuracy": 0.9936328530311584, "mean_gen_accuracy": 0.8809751868247986, "mean_token_accuracy": 0.9022528976202011, "num_tokens": 752192328.0, "sample_num_tokens": 9312.0, "step": 6772, "total_num_tokens": 752229576.0, "z_loss": 0.00047326748608611524 }, { "copy_logits_max": -4.685758590698242, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.375, "epoch": 1.383354608118458, "gen_logits_max": 4.325854301452637, "gen_logits_mean": -16.350669860839844, "gen_logits_min": -28.35869789123535, "gen_logits_std": 3.1315362453460693, "gen_loss": 0.2793060541152954, "grad_norm": 0.358163760675796, "learning_rate": 2.2374315789473685e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9953548908233643, "mean_gen_accuracy": 0.8800778985023499, "mean_token_accuracy": 0.9084460735321045, "num_tokens": 752462885.0, "sample_num_tokens": 8896.75, "step": 6773, "total_num_tokens": 752498472.0, "z_loss": 0.0005036619841121137 }, { "copy_logits_max": -4.145723342895508, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.375, "epoch": 1.3835588460556547, "gen_logits_max": 5.17551851272583, "gen_logits_mean": -15.869620323181152, "gen_logits_min": -28.03643226623535, "gen_logits_std": 3.1391518115997314, "gen_loss": 0.30432263016700745, "grad_norm": 0.35859104229746835, "learning_rate": 2.2373052631578946e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9945583194494247, "mean_gen_accuracy": 0.8745501637458801, "mean_token_accuracy": 0.9052378982305527, "num_tokens": 752749390.0, "sample_num_tokens": 7716.0, "step": 6774, "total_num_tokens": 752780254.0, "z_loss": 0.0005327618564479053 }, { "copy_logits_max": -4.8831915855407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 201.375, "epoch": 1.3837630839928516, "gen_logits_max": 4.279065132141113, "gen_logits_mean": -17.05846405029297, "gen_logits_min": -29.088871002197266, "gen_logits_std": 3.1523420810699463, "gen_loss": 0.2709524929523468, "grad_norm": 0.37835680715582637, "learning_rate": 2.237178947368421e-05, "loss": 0.269, "mean_copy_accuracy": 0.9942638278007507, "mean_gen_accuracy": 0.8822461664676666, "mean_token_accuracy": 0.9095665216445923, "num_tokens": 753036334.0, "sample_num_tokens": 6391.5, "step": 6775, "total_num_tokens": 753061900.0, "z_loss": 0.0005087228491902351 }, { "copy_logits_max": -1.829177975654602, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.25, "epoch": 1.3839673219300486, "gen_logits_max": 3.424996852874756, "gen_logits_mean": -16.628942489624023, "gen_logits_min": -29.049171447753906, "gen_logits_std": 3.188915729522705, "gen_loss": 0.2720297574996948, "grad_norm": 0.3986330653004179, "learning_rate": 2.2370526315789475e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9965249449014664, "mean_gen_accuracy": 0.8727771490812302, "mean_token_accuracy": 0.9030524343252182, "num_tokens": 753289238.0, "sample_num_tokens": 9772.5, "step": 6776, "total_num_tokens": 753328328.0, "z_loss": 0.0005691704573109746 }, { "copy_logits_max": -5.166869163513184, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.625, "epoch": 1.3841715598672453, "gen_logits_max": 3.9872546195983887, "gen_logits_mean": -16.970046997070312, "gen_logits_min": -29.0380916595459, "gen_logits_std": 3.175896167755127, "gen_loss": 0.23654653131961823, "grad_norm": 0.39317501567835883, "learning_rate": 2.2369263157894736e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9962518215179443, "mean_gen_accuracy": 0.8781292289495468, "mean_token_accuracy": 0.9058746695518494, "num_tokens": 753565784.0, "sample_num_tokens": 7998.0, "step": 6777, "total_num_tokens": 753597776.0, "z_loss": 0.0004517053603194654 }, { "copy_logits_max": -5.258376121520996, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.75, "epoch": 1.3843757978044422, "gen_logits_max": 5.522473335266113, "gen_logits_mean": -14.851835250854492, "gen_logits_min": -26.827011108398438, "gen_logits_std": 3.0990281105041504, "gen_loss": 0.31232964992523193, "grad_norm": 0.3578539029613578, "learning_rate": 2.2368000000000003e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9962330162525177, "mean_gen_accuracy": 0.8762571066617966, "mean_token_accuracy": 0.9033529162406921, "num_tokens": 753831178.0, "sample_num_tokens": 7828.5, "step": 6778, "total_num_tokens": 753862492.0, "z_loss": 0.0005536285461857915 }, { "copy_logits_max": -0.07457435131072998, "copy_logits_min": -687500032.0, "copy_num_tokens": 732.375, "epoch": 1.3845800357416391, "gen_logits_max": 4.366524696350098, "gen_logits_mean": -14.473753929138184, "gen_logits_min": -26.521135330200195, "gen_logits_std": 3.1197378635406494, "gen_loss": 0.30652010440826416, "grad_norm": 0.3915069524886662, "learning_rate": 2.2366736842105264e-05, "loss": 0.3001, "mean_copy_accuracy": 0.9958098232746124, "mean_gen_accuracy": 0.8665487468242645, "mean_token_accuracy": 0.8992595821619034, "num_tokens": 754104828.0, "sample_num_tokens": 10445.0, "step": 6779, "total_num_tokens": 754146608.0, "z_loss": 0.0005554328672587872 }, { "copy_logits_max": -3.2401862144470215, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.5625, "epoch": 1.3847842736788358, "gen_logits_max": 4.999039649963379, "gen_logits_mean": -16.35919189453125, "gen_logits_min": -28.27398681640625, "gen_logits_std": 3.1386685371398926, "gen_loss": 0.3221248388290405, "grad_norm": 0.39679648096844183, "learning_rate": 2.236547368421053e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9955744445323944, "mean_gen_accuracy": 0.8680853396654129, "mean_token_accuracy": 0.8985501229763031, "num_tokens": 754403732.0, "sample_num_tokens": 8602.0, "step": 6780, "total_num_tokens": 754438140.0, "z_loss": 0.0005621266318485141 }, { "copy_logits_max": -2.6243577003479004, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.1875, "epoch": 1.3849885116160328, "gen_logits_max": 2.5656747817993164, "gen_logits_mean": -18.475208282470703, "gen_logits_min": -30.404735565185547, "gen_logits_std": 3.2274765968322754, "gen_loss": 0.24943111836910248, "grad_norm": 0.38386402367838757, "learning_rate": 2.236421052631579e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9958015978336334, "mean_gen_accuracy": 0.8742525428533554, "mean_token_accuracy": 0.9047620892524719, "num_tokens": 754655824.0, "sample_num_tokens": 8373.5, "step": 6781, "total_num_tokens": 754689318.0, "z_loss": 0.0005410547601059079 }, { "copy_logits_max": -1.239478349685669, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.375, "epoch": 1.3851927495532295, "gen_logits_max": 4.738534927368164, "gen_logits_mean": -14.159445762634277, "gen_logits_min": -27.28217887878418, "gen_logits_std": 3.1062676906585693, "gen_loss": 0.2873566448688507, "grad_norm": 0.36042059131086096, "learning_rate": 2.2362947368421054e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9959726929664612, "mean_gen_accuracy": 0.875321701169014, "mean_token_accuracy": 0.9061255902051926, "num_tokens": 754939742.0, "sample_num_tokens": 9098.0, "step": 6782, "total_num_tokens": 754976134.0, "z_loss": 0.0006105726351961493 }, { "copy_logits_max": -2.556307315826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.0625, "epoch": 1.3853969874904264, "gen_logits_max": 3.5496468544006348, "gen_logits_mean": -15.774646759033203, "gen_logits_min": -29.043703079223633, "gen_logits_std": 3.167057514190674, "gen_loss": 0.28811001777648926, "grad_norm": 0.3841394970301571, "learning_rate": 2.2361684210526315e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9962971210479736, "mean_gen_accuracy": 0.8705709129571915, "mean_token_accuracy": 0.9042296856641769, "num_tokens": 755216769.0, "sample_num_tokens": 8658.25, "step": 6783, "total_num_tokens": 755251402.0, "z_loss": 0.0006228031124919653 }, { "copy_logits_max": -2.3425514698028564, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.5625, "epoch": 1.385601225427623, "gen_logits_max": 4.700161457061768, "gen_logits_mean": -14.736520767211914, "gen_logits_min": -27.14297103881836, "gen_logits_std": 3.1254875659942627, "gen_loss": 0.2847599685192108, "grad_norm": 0.39517983024814274, "learning_rate": 2.236042105263158e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9961617588996887, "mean_gen_accuracy": 0.8765241056680679, "mean_token_accuracy": 0.9077341705560684, "num_tokens": 755483629.0, "sample_num_tokens": 8418.25, "step": 6784, "total_num_tokens": 755517302.0, "z_loss": 0.0005129070486873388 }, { "copy_logits_max": -2.3555214405059814, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.4375, "epoch": 1.38580546336482, "gen_logits_max": 4.667727470397949, "gen_logits_mean": -14.706352233886719, "gen_logits_min": -27.2286319732666, "gen_logits_std": 3.1234147548675537, "gen_loss": 0.31667834520339966, "grad_norm": 0.410309991712983, "learning_rate": 2.235915789473684e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9951569139957428, "mean_gen_accuracy": 0.8689097017049789, "mean_token_accuracy": 0.8991873562335968, "num_tokens": 755726391.0, "sample_num_tokens": 7309.75, "step": 6785, "total_num_tokens": 755755630.0, "z_loss": 0.0005925705772824585 }, { "copy_logits_max": -4.439814567565918, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.9375, "epoch": 1.386009701302017, "gen_logits_max": 3.960437297821045, "gen_logits_mean": -15.853815078735352, "gen_logits_min": -28.697551727294922, "gen_logits_std": 3.1777782440185547, "gen_loss": 0.23690611124038696, "grad_norm": 0.4024917140315369, "learning_rate": 2.2357894736842108e-05, "loss": 0.271, "mean_copy_accuracy": 0.9958038926124573, "mean_gen_accuracy": 0.877494141459465, "mean_token_accuracy": 0.9074880927801132, "num_tokens": 755991348.0, "sample_num_tokens": 8272.5, "step": 6786, "total_num_tokens": 756024438.0, "z_loss": 0.0005322634242475033 }, { "copy_logits_max": -3.1390280723571777, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5625, "epoch": 1.3862139392392137, "gen_logits_max": 2.9719338417053223, "gen_logits_mean": -18.22612953186035, "gen_logits_min": -30.215490341186523, "gen_logits_std": 3.2021713256835938, "gen_loss": 0.3017219305038452, "grad_norm": 0.5136207515228677, "learning_rate": 2.235663157894737e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9960493892431259, "mean_gen_accuracy": 0.876182347536087, "mean_token_accuracy": 0.9041914194822311, "num_tokens": 756252777.0, "sample_num_tokens": 9146.25, "step": 6787, "total_num_tokens": 756289362.0, "z_loss": 0.0005557001568377018 }, { "copy_logits_max": -2.0906548500061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 752.25, "epoch": 1.3864181771764106, "gen_logits_max": 3.927732467651367, "gen_logits_mean": -15.304136276245117, "gen_logits_min": -27.728919982910156, "gen_logits_std": 3.1393167972564697, "gen_loss": 0.27212122082710266, "grad_norm": 0.3813867985110724, "learning_rate": 2.2355368421052633e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9963751435279846, "mean_gen_accuracy": 0.8717772960662842, "mean_token_accuracy": 0.9052000343799591, "num_tokens": 756513947.0, "sample_num_tokens": 10029.75, "step": 6788, "total_num_tokens": 756554066.0, "z_loss": 0.0005366354016587138 }, { "copy_logits_max": -1.1092092990875244, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5, "epoch": 1.3866224151136073, "gen_logits_max": 2.8828225135803223, "gen_logits_mean": -17.441574096679688, "gen_logits_min": -29.611787796020508, "gen_logits_std": 3.18917179107666, "gen_loss": 0.2707616090774536, "grad_norm": 0.37762409436912164, "learning_rate": 2.2354105263157897e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9961452186107635, "mean_gen_accuracy": 0.8756056576967239, "mean_token_accuracy": 0.9087287783622742, "num_tokens": 756785227.0, "sample_num_tokens": 7998.25, "step": 6789, "total_num_tokens": 756817220.0, "z_loss": 0.00045988080091774464 }, { "copy_logits_max": -1.758013367652893, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.875, "epoch": 1.3868266530508042, "gen_logits_max": 3.818970203399658, "gen_logits_mean": -16.587661743164062, "gen_logits_min": -28.53382682800293, "gen_logits_std": 3.1597981452941895, "gen_loss": 0.26805514097213745, "grad_norm": 0.37058693905118145, "learning_rate": 2.2352842105263158e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9967102110385895, "mean_gen_accuracy": 0.8670935183763504, "mean_token_accuracy": 0.901658907532692, "num_tokens": 757065975.0, "sample_num_tokens": 8867.25, "step": 6790, "total_num_tokens": 757101444.0, "z_loss": 0.0004525850818026811 }, { "copy_logits_max": -1.8237104415893555, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.3125, "epoch": 1.387030890988001, "gen_logits_max": 4.414061546325684, "gen_logits_mean": -14.719574928283691, "gen_logits_min": -27.47960662841797, "gen_logits_std": 3.118288993835449, "gen_loss": 0.24442580342292786, "grad_norm": 0.4008616622489324, "learning_rate": 2.2351578947368422e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9962616711854935, "mean_gen_accuracy": 0.8802340626716614, "mean_token_accuracy": 0.9107005596160889, "num_tokens": 757318795.0, "sample_num_tokens": 8811.25, "step": 6791, "total_num_tokens": 757354040.0, "z_loss": 0.0004566878778859973 }, { "copy_logits_max": -1.2291934490203857, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.3125, "epoch": 1.3872351289251978, "gen_logits_max": 4.591701030731201, "gen_logits_mean": -15.291715621948242, "gen_logits_min": -27.504180908203125, "gen_logits_std": 3.085256814956665, "gen_loss": 0.31422871351242065, "grad_norm": 0.37210333548551083, "learning_rate": 2.2350315789473683e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9961001127958298, "mean_gen_accuracy": 0.8696360737085342, "mean_token_accuracy": 0.9004729241132736, "num_tokens": 757580951.0, "sample_num_tokens": 7442.25, "step": 6792, "total_num_tokens": 757610720.0, "z_loss": 0.0005594920367002487 }, { "copy_logits_max": -6.07547664642334, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.875, "epoch": 1.3874393668623948, "gen_logits_max": 4.472592830657959, "gen_logits_mean": -16.116474151611328, "gen_logits_min": -28.63950538635254, "gen_logits_std": 3.095188617706299, "gen_loss": 0.2970760762691498, "grad_norm": 0.38048423294841555, "learning_rate": 2.2349052631578948e-05, "loss": 0.2818, "mean_copy_accuracy": 0.995377779006958, "mean_gen_accuracy": 0.8707149177789688, "mean_token_accuracy": 0.9026635885238647, "num_tokens": 757850819.0, "sample_num_tokens": 8352.75, "step": 6793, "total_num_tokens": 757884230.0, "z_loss": 0.0005539205158129334 }, { "copy_logits_max": -3.434354066848755, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.3125, "epoch": 1.3876436047995915, "gen_logits_max": 4.271250247955322, "gen_logits_mean": -15.402069091796875, "gen_logits_min": -27.75116729736328, "gen_logits_std": 3.0941553115844727, "gen_loss": 0.2729063630104065, "grad_norm": 0.3558098080726863, "learning_rate": 2.2347789473684212e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9956658035516739, "mean_gen_accuracy": 0.8816581517457962, "mean_token_accuracy": 0.9065627604722977, "num_tokens": 758120048.0, "sample_num_tokens": 8065.0, "step": 6794, "total_num_tokens": 758152308.0, "z_loss": 0.0004839618341065943 }, { "copy_logits_max": -1.3723840713500977, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.6875, "epoch": 1.3878478427367884, "gen_logits_max": 4.330552577972412, "gen_logits_mean": -14.9271821975708, "gen_logits_min": -27.22500991821289, "gen_logits_std": 3.1067745685577393, "gen_loss": 0.2772541642189026, "grad_norm": 0.37118313436233646, "learning_rate": 2.2346526315789476e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9969767332077026, "mean_gen_accuracy": 0.8769605308771133, "mean_token_accuracy": 0.908716470003128, "num_tokens": 758405052.0, "sample_num_tokens": 8685.0, "step": 6795, "total_num_tokens": 758439792.0, "z_loss": 0.0005106574390083551 }, { "copy_logits_max": -3.700956344604492, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.3125, "epoch": 1.3880520806739851, "gen_logits_max": 3.5063905715942383, "gen_logits_mean": -17.07773208618164, "gen_logits_min": -29.073497772216797, "gen_logits_std": 3.1283187866210938, "gen_loss": 0.3096235990524292, "grad_norm": 0.35867811502180896, "learning_rate": 2.2345263157894737e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9959363490343094, "mean_gen_accuracy": 0.8774841129779816, "mean_token_accuracy": 0.9058521389961243, "num_tokens": 758674743.0, "sample_num_tokens": 7110.25, "step": 6796, "total_num_tokens": 758703184.0, "z_loss": 0.0005883117555640638 }, { "copy_logits_max": -2.2221477031707764, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.1875, "epoch": 1.388256318611182, "gen_logits_max": 4.179067611694336, "gen_logits_mean": -15.475157737731934, "gen_logits_min": -27.64496612548828, "gen_logits_std": 3.1288833618164062, "gen_loss": 0.2386031448841095, "grad_norm": 0.37971014107830703, "learning_rate": 2.2344e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9960802495479584, "mean_gen_accuracy": 0.8770332038402557, "mean_token_accuracy": 0.9057214111089706, "num_tokens": 758930990.0, "sample_num_tokens": 7665.5, "step": 6797, "total_num_tokens": 758961652.0, "z_loss": 0.0004980890080332756 }, { "copy_logits_max": -3.9299449920654297, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.125, "epoch": 1.3884605565483787, "gen_logits_max": 3.470475673675537, "gen_logits_mean": -17.336437225341797, "gen_logits_min": -29.427074432373047, "gen_logits_std": 3.147963523864746, "gen_loss": 0.260039746761322, "grad_norm": 0.36636049110755603, "learning_rate": 2.2342736842105262e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9967876821756363, "mean_gen_accuracy": 0.8800293207168579, "mean_token_accuracy": 0.9090252667665482, "num_tokens": 759202297.0, "sample_num_tokens": 7560.75, "step": 6798, "total_num_tokens": 759232540.0, "z_loss": 0.0005092535866424441 }, { "copy_logits_max": -2.8598225116729736, "copy_logits_min": -625000064.0, "copy_num_tokens": 407.9375, "epoch": 1.3886647944855757, "gen_logits_max": 3.919239044189453, "gen_logits_mean": -16.16604232788086, "gen_logits_min": -28.241870880126953, "gen_logits_std": 3.145921230316162, "gen_loss": 0.24753853678703308, "grad_norm": 0.37689467667054394, "learning_rate": 2.2341473684210527e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9968196004629135, "mean_gen_accuracy": 0.8780418485403061, "mean_token_accuracy": 0.9050832837820053, "num_tokens": 759449261.0, "sample_num_tokens": 7652.25, "step": 6799, "total_num_tokens": 759479870.0, "z_loss": 0.00046575383748859167 }, { "copy_logits_max": -0.9403066635131836, "copy_logits_min": -687500032.0, "copy_num_tokens": 743.75, "epoch": 1.3888690324227726, "gen_logits_max": 4.192174434661865, "gen_logits_mean": -14.361611366271973, "gen_logits_min": -28.051700592041016, "gen_logits_std": 3.1324710845947266, "gen_loss": 0.225664883852005, "grad_norm": 0.3490329334711672, "learning_rate": 2.2340210526315788e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9971908628940582, "mean_gen_accuracy": 0.8842927813529968, "mean_token_accuracy": 0.9110078066587448, "num_tokens": 759725090.0, "sample_num_tokens": 9771.0, "step": 6800, "total_num_tokens": 759764174.0, "z_loss": 0.0004482190706767142 }, { "copy_logits_max": 1.942491054534912, "copy_logits_min": -750000000.0, "copy_num_tokens": 625.25, "epoch": 1.3890732703599693, "gen_logits_max": 3.108180522918701, "gen_logits_mean": -15.69047737121582, "gen_logits_min": -27.97474479675293, "gen_logits_std": 3.1415538787841797, "gen_loss": 0.24495716392993927, "grad_norm": 3.763209037021203, "learning_rate": 2.2338947368421052e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9961837232112885, "mean_gen_accuracy": 0.8725724518299103, "mean_token_accuracy": 0.9048887342214584, "num_tokens": 760012203.0, "sample_num_tokens": 8927.75, "step": 6801, "total_num_tokens": 760047914.0, "z_loss": 0.00043069064849987626 }, { "copy_logits_max": -5.015698432922363, "copy_logits_min": -750000064.0, "copy_num_tokens": 501.125, "epoch": 1.3892775082971662, "gen_logits_max": 2.6188724040985107, "gen_logits_mean": -18.15401840209961, "gen_logits_min": -30.118074417114258, "gen_logits_std": 3.1958746910095215, "gen_loss": 0.27316340804100037, "grad_norm": 0.4224783350562932, "learning_rate": 2.2337684210526316e-05, "loss": 0.281, "mean_copy_accuracy": 0.9962425976991653, "mean_gen_accuracy": 0.8774467259645462, "mean_token_accuracy": 0.9042880982160568, "num_tokens": 760266165.0, "sample_num_tokens": 8782.25, "step": 6802, "total_num_tokens": 760301294.0, "z_loss": 0.0004958743811585009 }, { "copy_logits_max": -2.4822745323181152, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.5625, "epoch": 1.3894817462343632, "gen_logits_max": 3.9151992797851562, "gen_logits_mean": -15.6279296875, "gen_logits_min": -28.143381118774414, "gen_logits_std": 3.117514133453369, "gen_loss": 0.28269869089126587, "grad_norm": 0.40354021143033425, "learning_rate": 2.233642105263158e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9966418743133545, "mean_gen_accuracy": 0.880143478512764, "mean_token_accuracy": 0.9101635366678238, "num_tokens": 760533418.0, "sample_num_tokens": 8315.0, "step": 6803, "total_num_tokens": 760566678.0, "z_loss": 0.0004920073552057147 }, { "copy_logits_max": -6.111835956573486, "copy_logits_min": -750000000.0, "copy_num_tokens": 356.5, "epoch": 1.3896859841715599, "gen_logits_max": 4.334519863128662, "gen_logits_mean": -16.80005645751953, "gen_logits_min": -28.732858657836914, "gen_logits_std": 3.1149609088897705, "gen_loss": 0.27510419487953186, "grad_norm": 0.3748560187183299, "learning_rate": 2.2335157894736845e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9971113950014114, "mean_gen_accuracy": 0.8814582973718643, "mean_token_accuracy": 0.9086544662714005, "num_tokens": 760795854.0, "sample_num_tokens": 7822.0, "step": 6804, "total_num_tokens": 760827142.0, "z_loss": 0.0005069702165201306 }, { "copy_logits_max": -5.585212707519531, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.0, "epoch": 1.3898902221087566, "gen_logits_max": 4.9614033699035645, "gen_logits_mean": -14.18547248840332, "gen_logits_min": -26.625324249267578, "gen_logits_std": 3.0436038970947266, "gen_loss": 0.258586049079895, "grad_norm": 0.39424692961210644, "learning_rate": 2.2333894736842106e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9964928925037384, "mean_gen_accuracy": 0.8812980651855469, "mean_token_accuracy": 0.9108586460351944, "num_tokens": 761060058.0, "sample_num_tokens": 8131.0, "step": 6805, "total_num_tokens": 761092582.0, "z_loss": 0.0004768331127706915 }, { "copy_logits_max": -2.4690563678741455, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.25, "epoch": 1.3900944600459535, "gen_logits_max": 4.594038009643555, "gen_logits_mean": -15.603857040405273, "gen_logits_min": -27.741987228393555, "gen_logits_std": 3.118612289428711, "gen_loss": 0.2815965414047241, "grad_norm": 0.37318678288306445, "learning_rate": 2.233263157894737e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9965467154979706, "mean_gen_accuracy": 0.873113602399826, "mean_token_accuracy": 0.9040241539478302, "num_tokens": 761344529.0, "sample_num_tokens": 8791.25, "step": 6806, "total_num_tokens": 761379694.0, "z_loss": 0.0005109324119985104 }, { "copy_logits_max": -4.552778244018555, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.3125, "epoch": 1.3902986979831504, "gen_logits_max": 3.9555959701538086, "gen_logits_mean": -16.200809478759766, "gen_logits_min": -28.709625244140625, "gen_logits_std": 3.1286232471466064, "gen_loss": 0.2873261570930481, "grad_norm": 0.3867536494469092, "learning_rate": 2.233136842105263e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9969692826271057, "mean_gen_accuracy": 0.8778268396854401, "mean_token_accuracy": 0.9071785509586334, "num_tokens": 761611366.0, "sample_num_tokens": 8134.5, "step": 6807, "total_num_tokens": 761643904.0, "z_loss": 0.0005647381767630577 }, { "copy_logits_max": -2.969968795776367, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.8125, "epoch": 1.3905029359203471, "gen_logits_max": 4.734413146972656, "gen_logits_mean": -17.31114959716797, "gen_logits_min": -29.466991424560547, "gen_logits_std": 3.1592087745666504, "gen_loss": 0.2857191264629364, "grad_norm": 0.40062586868019107, "learning_rate": 2.2330105263157895e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9960163235664368, "mean_gen_accuracy": 0.879517987370491, "mean_token_accuracy": 0.9067505300045013, "num_tokens": 761884496.0, "sample_num_tokens": 7756.0, "step": 6808, "total_num_tokens": 761915520.0, "z_loss": 0.0005370269645936787 }, { "copy_logits_max": -0.45446234941482544, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.875, "epoch": 1.390707173857544, "gen_logits_max": 5.3340349197387695, "gen_logits_mean": -13.91786003112793, "gen_logits_min": -26.5454158782959, "gen_logits_std": 3.029905319213867, "gen_loss": 0.27097660303115845, "grad_norm": 0.3784283857560614, "learning_rate": 2.2328842105263156e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9965513050556183, "mean_gen_accuracy": 0.8763548582792282, "mean_token_accuracy": 0.9093745350837708, "num_tokens": 762147202.0, "sample_num_tokens": 7913.0, "step": 6809, "total_num_tokens": 762178854.0, "z_loss": 0.0004831029800698161 }, { "copy_logits_max": 0.5235397219657898, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.75, "epoch": 1.390911411794741, "gen_logits_max": 5.499919414520264, "gen_logits_mean": -14.748435974121094, "gen_logits_min": -27.056049346923828, "gen_logits_std": 3.120537281036377, "gen_loss": 0.2728767991065979, "grad_norm": 0.4034199092580159, "learning_rate": 2.232757894736842e-05, "loss": 0.293, "mean_copy_accuracy": 0.9953183084726334, "mean_gen_accuracy": 0.8718626797199249, "mean_token_accuracy": 0.9006330072879791, "num_tokens": 762423575.0, "sample_num_tokens": 9056.25, "step": 6810, "total_num_tokens": 762459800.0, "z_loss": 0.0004981641541235149 }, { "copy_logits_max": 4.200837135314941, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.1875, "epoch": 1.3911156497319377, "gen_logits_max": 4.302406311035156, "gen_logits_mean": -14.863807678222656, "gen_logits_min": -27.71837043762207, "gen_logits_std": 3.1161537170410156, "gen_loss": 0.27580177783966064, "grad_norm": 0.41415401976737926, "learning_rate": 2.2326315789473685e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9954245686531067, "mean_gen_accuracy": 0.870159924030304, "mean_token_accuracy": 0.9008220136165619, "num_tokens": 762684586.0, "sample_num_tokens": 7868.5, "step": 6811, "total_num_tokens": 762716060.0, "z_loss": 0.0005815659533254802 }, { "copy_logits_max": 1.9543462991714478, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.875, "epoch": 1.3913198876691346, "gen_logits_max": 5.267095565795898, "gen_logits_mean": -14.97258186340332, "gen_logits_min": -27.38831901550293, "gen_logits_std": 3.1150224208831787, "gen_loss": 0.2770693600177765, "grad_norm": 0.3668641000776769, "learning_rate": 2.232505263157895e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9955123960971832, "mean_gen_accuracy": 0.8776471316814423, "mean_token_accuracy": 0.9060544222593307, "num_tokens": 762944553.0, "sample_num_tokens": 9280.75, "step": 6812, "total_num_tokens": 762981676.0, "z_loss": 0.0006355881923809648 }, { "copy_logits_max": -0.5846467018127441, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.0, "epoch": 1.3915241256063313, "gen_logits_max": 5.25715446472168, "gen_logits_mean": -15.181166648864746, "gen_logits_min": -27.701339721679688, "gen_logits_std": 3.0987143516540527, "gen_loss": 0.28156304359436035, "grad_norm": 0.3794672703995202, "learning_rate": 2.232378947368421e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9961365461349487, "mean_gen_accuracy": 0.8777066171169281, "mean_token_accuracy": 0.904729887843132, "num_tokens": 763213767.0, "sample_num_tokens": 8181.25, "step": 6813, "total_num_tokens": 763246492.0, "z_loss": 0.0006010960205458105 }, { "copy_logits_max": -2.459360361099243, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.875, "epoch": 1.3917283635435282, "gen_logits_max": 5.804376125335693, "gen_logits_mean": -14.36993408203125, "gen_logits_min": -26.595867156982422, "gen_logits_std": 3.064763069152832, "gen_loss": 0.293443888425827, "grad_norm": 0.42778801414001977, "learning_rate": 2.2322526315789475e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9960268437862396, "mean_gen_accuracy": 0.8738390654325485, "mean_token_accuracy": 0.9027872383594513, "num_tokens": 763472520.0, "sample_num_tokens": 7627.5, "step": 6814, "total_num_tokens": 763503030.0, "z_loss": 0.0005017273360863328 }, { "copy_logits_max": -5.3426127433776855, "copy_logits_min": -750000000.0, "copy_num_tokens": 284.3125, "epoch": 1.391932601480725, "gen_logits_max": 5.570922374725342, "gen_logits_mean": -16.495861053466797, "gen_logits_min": -27.921010971069336, "gen_logits_std": 3.1076910495758057, "gen_loss": 0.31545618176460266, "grad_norm": 0.366308403348383, "learning_rate": 2.232126315789474e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9949426203966141, "mean_gen_accuracy": 0.8802068084478378, "mean_token_accuracy": 0.90667624771595, "num_tokens": 763750740.0, "sample_num_tokens": 8335.0, "step": 6815, "total_num_tokens": 763784080.0, "z_loss": 0.0004987536813132465 }, { "copy_logits_max": -0.07486414909362793, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.0, "epoch": 1.3921368394179219, "gen_logits_max": 4.367654800415039, "gen_logits_mean": -15.55232048034668, "gen_logits_min": -28.006999969482422, "gen_logits_std": 3.1318235397338867, "gen_loss": 0.27786189317703247, "grad_norm": 0.4300121662752568, "learning_rate": 2.232e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9950126111507416, "mean_gen_accuracy": 0.8758333772420883, "mean_token_accuracy": 0.9061367213726044, "num_tokens": 764015720.0, "sample_num_tokens": 7879.0, "step": 6816, "total_num_tokens": 764047236.0, "z_loss": 0.0004419783945195377 }, { "copy_logits_max": -1.4100834131240845, "copy_logits_min": -687500032.0, "copy_num_tokens": 552.5625, "epoch": 1.3923410773551188, "gen_logits_max": 3.408116340637207, "gen_logits_mean": -16.890064239501953, "gen_logits_min": -29.32352066040039, "gen_logits_std": 3.199843645095825, "gen_loss": 0.25235527753829956, "grad_norm": 0.369144899345335, "learning_rate": 2.2318736842105264e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9957570135593414, "mean_gen_accuracy": 0.8765410482883453, "mean_token_accuracy": 0.9098270386457443, "num_tokens": 764297268.0, "sample_num_tokens": 9146.0, "step": 6817, "total_num_tokens": 764333852.0, "z_loss": 0.000467872858280316 }, { "copy_logits_max": -6.262511730194092, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.5, "epoch": 1.3925453152923155, "gen_logits_max": 4.647691249847412, "gen_logits_mean": -17.46600914001465, "gen_logits_min": -29.32923126220703, "gen_logits_std": 3.1493074893951416, "gen_loss": 0.29518383741378784, "grad_norm": 0.38948688057883546, "learning_rate": 2.2317473684210525e-05, "loss": 0.2904, "mean_copy_accuracy": 0.996111735701561, "mean_gen_accuracy": 0.8736477345228195, "mean_token_accuracy": 0.9030789434909821, "num_tokens": 764569272.0, "sample_num_tokens": 7594.5, "step": 6818, "total_num_tokens": 764599650.0, "z_loss": 0.0005090354243293405 }, { "copy_logits_max": -5.334582328796387, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.875, "epoch": 1.3927495532295124, "gen_logits_max": 3.3109774589538574, "gen_logits_mean": -17.931621551513672, "gen_logits_min": -29.88568115234375, "gen_logits_std": 3.1973724365234375, "gen_loss": 0.2731539309024811, "grad_norm": 0.3999886692475713, "learning_rate": 2.2316210526315793e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9972408562898636, "mean_gen_accuracy": 0.8795436024665833, "mean_token_accuracy": 0.9089044332504272, "num_tokens": 764841819.0, "sample_num_tokens": 8251.25, "step": 6819, "total_num_tokens": 764874824.0, "z_loss": 0.0004690570058301091 }, { "copy_logits_max": -2.142948627471924, "copy_logits_min": -625000064.0, "copy_num_tokens": 397.4375, "epoch": 1.3929537911667091, "gen_logits_max": 5.215285301208496, "gen_logits_mean": -15.097661018371582, "gen_logits_min": -26.921611785888672, "gen_logits_std": 3.12018084526062, "gen_loss": 0.27645084261894226, "grad_norm": 0.3935304847659454, "learning_rate": 2.2314947368421054e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9967720210552216, "mean_gen_accuracy": 0.874551996588707, "mean_token_accuracy": 0.9031037390232086, "num_tokens": 765115268.0, "sample_num_tokens": 9029.5, "step": 6820, "total_num_tokens": 765151386.0, "z_loss": 0.00043633778113871813 }, { "copy_logits_max": -1.8581461906433105, "copy_logits_min": -625000064.0, "copy_num_tokens": 743.6875, "epoch": 1.393158029103906, "gen_logits_max": 5.068876266479492, "gen_logits_mean": -13.87663459777832, "gen_logits_min": -26.420806884765625, "gen_logits_std": 3.152604579925537, "gen_loss": 0.23326648771762848, "grad_norm": 0.41438155113991165, "learning_rate": 2.2313684210526318e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9964353293180466, "mean_gen_accuracy": 0.8770681023597717, "mean_token_accuracy": 0.9127448499202728, "num_tokens": 765397189.0, "sample_num_tokens": 10335.25, "step": 6821, "total_num_tokens": 765438530.0, "z_loss": 0.0004240622220095247 }, { "copy_logits_max": -2.37872052192688, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.5625, "epoch": 1.3933622670411028, "gen_logits_max": 6.8955535888671875, "gen_logits_mean": -13.132731437683105, "gen_logits_min": -25.055570602416992, "gen_logits_std": 3.080176830291748, "gen_loss": 0.29994750022888184, "grad_norm": 0.3761646721268883, "learning_rate": 2.231242105263158e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9962119609117508, "mean_gen_accuracy": 0.873199850320816, "mean_token_accuracy": 0.9025670289993286, "num_tokens": 765666030.0, "sample_num_tokens": 7237.0, "step": 6822, "total_num_tokens": 765694978.0, "z_loss": 0.0006459950236603618 }, { "copy_logits_max": -4.614955902099609, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.75, "epoch": 1.3935665049782997, "gen_logits_max": 6.161944389343262, "gen_logits_mean": -15.294023513793945, "gen_logits_min": -26.982833862304688, "gen_logits_std": 3.1097943782806396, "gen_loss": 0.29353445768356323, "grad_norm": 0.4115080764664498, "learning_rate": 2.2311157894736843e-05, "loss": 0.3029, "mean_copy_accuracy": 0.9945170134305954, "mean_gen_accuracy": 0.8729718178510666, "mean_token_accuracy": 0.8975010216236115, "num_tokens": 765917340.0, "sample_num_tokens": 8228.5, "step": 6823, "total_num_tokens": 765950254.0, "z_loss": 0.0007083833916112781 }, { "copy_logits_max": -5.141969680786133, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.4375, "epoch": 1.3937707429154966, "gen_logits_max": 4.786457061767578, "gen_logits_mean": -16.310041427612305, "gen_logits_min": -28.09354591369629, "gen_logits_std": 3.154232978820801, "gen_loss": 0.2979055941104889, "grad_norm": 0.3854399568030297, "learning_rate": 2.2309894736842104e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9964101612567902, "mean_gen_accuracy": 0.8721377551555634, "mean_token_accuracy": 0.9009661674499512, "num_tokens": 766191372.0, "sample_num_tokens": 7317.0, "step": 6824, "total_num_tokens": 766220640.0, "z_loss": 0.0006618694169446826 }, { "copy_logits_max": -2.691896438598633, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.125, "epoch": 1.3939749808526933, "gen_logits_max": 5.68170690536499, "gen_logits_mean": -13.112556457519531, "gen_logits_min": -25.637144088745117, "gen_logits_std": 3.1068785190582275, "gen_loss": 0.28318506479263306, "grad_norm": 0.3798359094931291, "learning_rate": 2.230863157894737e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9958251863718033, "mean_gen_accuracy": 0.8662488907575607, "mean_token_accuracy": 0.8947746753692627, "num_tokens": 766463663.0, "sample_num_tokens": 9818.25, "step": 6825, "total_num_tokens": 766502936.0, "z_loss": 0.0008136929245665669 }, { "copy_logits_max": -0.5121377110481262, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.75, "epoch": 1.3941792187898903, "gen_logits_max": 4.409627437591553, "gen_logits_mean": -15.499465942382812, "gen_logits_min": -27.344457626342773, "gen_logits_std": 3.1313600540161133, "gen_loss": 0.2896973490715027, "grad_norm": 0.38639015551861006, "learning_rate": 2.230736842105263e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9953843653202057, "mean_gen_accuracy": 0.8783710300922394, "mean_token_accuracy": 0.9051704704761505, "num_tokens": 766727705.0, "sample_num_tokens": 7176.25, "step": 6826, "total_num_tokens": 766756410.0, "z_loss": 0.000621334183961153 }, { "copy_logits_max": -0.4779769778251648, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.0, "epoch": 1.3943834567270872, "gen_logits_max": 4.815516948699951, "gen_logits_mean": -14.013479232788086, "gen_logits_min": -26.684410095214844, "gen_logits_std": 3.1246776580810547, "gen_loss": 0.26734453439712524, "grad_norm": 0.4214895893403826, "learning_rate": 2.2306105263157897e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9966240078210831, "mean_gen_accuracy": 0.8661996126174927, "mean_token_accuracy": 0.899864599108696, "num_tokens": 766999650.0, "sample_num_tokens": 9193.5, "step": 6827, "total_num_tokens": 767036424.0, "z_loss": 0.0005369777791202068 }, { "copy_logits_max": -7.606634140014648, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.0, "epoch": 1.394587694664284, "gen_logits_max": 3.99407958984375, "gen_logits_mean": -16.073644638061523, "gen_logits_min": -27.974105834960938, "gen_logits_std": 3.1429178714752197, "gen_loss": 0.263548880815506, "grad_norm": 0.36851242078376295, "learning_rate": 2.2304842105263158e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9962713420391083, "mean_gen_accuracy": 0.8757120668888092, "mean_token_accuracy": 0.9061764627695084, "num_tokens": 767279591.0, "sample_num_tokens": 10105.75, "step": 6828, "total_num_tokens": 767320014.0, "z_loss": 0.0004625613510143012 }, { "copy_logits_max": -4.735532760620117, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.8125, "epoch": 1.3947919326014806, "gen_logits_max": 4.881791114807129, "gen_logits_mean": -14.451651573181152, "gen_logits_min": -26.738521575927734, "gen_logits_std": 3.1240482330322266, "gen_loss": 0.2762880325317383, "grad_norm": 0.3883961058607178, "learning_rate": 2.2303578947368422e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9960668087005615, "mean_gen_accuracy": 0.8799529820680618, "mean_token_accuracy": 0.9083212316036224, "num_tokens": 767552401.0, "sample_num_tokens": 9818.25, "step": 6829, "total_num_tokens": 767591674.0, "z_loss": 0.0005017854273319244 }, { "copy_logits_max": -3.2963569164276123, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.8125, "epoch": 1.3949961705386775, "gen_logits_max": 3.3753013610839844, "gen_logits_mean": -16.011747360229492, "gen_logits_min": -28.44292449951172, "gen_logits_std": 3.1739501953125, "gen_loss": 0.25181639194488525, "grad_norm": 0.35767056417108484, "learning_rate": 2.2302315789473687e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9963482618331909, "mean_gen_accuracy": 0.8732561767101288, "mean_token_accuracy": 0.908934086561203, "num_tokens": 767858961.0, "sample_num_tokens": 9012.75, "step": 6830, "total_num_tokens": 767895012.0, "z_loss": 0.000540215871296823 }, { "copy_logits_max": -4.021301746368408, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.875, "epoch": 1.3952004084758745, "gen_logits_max": 3.799884796142578, "gen_logits_mean": -16.17609214782715, "gen_logits_min": -28.282184600830078, "gen_logits_std": 3.1709165573120117, "gen_loss": 0.32497644424438477, "grad_norm": 0.3486396851376309, "learning_rate": 2.2301052631578947e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9953587502241135, "mean_gen_accuracy": 0.8743244558572769, "mean_token_accuracy": 0.9027498662471771, "num_tokens": 768157959.0, "sample_num_tokens": 9874.75, "step": 6831, "total_num_tokens": 768197458.0, "z_loss": 0.0006801303243264556 }, { "copy_logits_max": -5.974173545837402, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.75, "epoch": 1.3954046464130712, "gen_logits_max": 3.3592441082000732, "gen_logits_mean": -17.77012825012207, "gen_logits_min": -29.432785034179688, "gen_logits_std": 3.164426803588867, "gen_loss": 0.30234014987945557, "grad_norm": 0.4017590172409874, "learning_rate": 2.2299789473684212e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9966630190610886, "mean_gen_accuracy": 0.8742717504501343, "mean_token_accuracy": 0.9054391384124756, "num_tokens": 768449499.0, "sample_num_tokens": 7709.25, "step": 6832, "total_num_tokens": 768480336.0, "z_loss": 0.000577608123421669 }, { "copy_logits_max": -4.16370153427124, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.8125, "epoch": 1.395608884350268, "gen_logits_max": 4.010183334350586, "gen_logits_mean": -15.353164672851562, "gen_logits_min": -27.292491912841797, "gen_logits_std": 3.1263718605041504, "gen_loss": 0.2703411877155304, "grad_norm": 0.3728742120543935, "learning_rate": 2.2298526315789473e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9965259283781052, "mean_gen_accuracy": 0.8760763108730316, "mean_token_accuracy": 0.9065665304660797, "num_tokens": 768710014.0, "sample_num_tokens": 8028.0, "step": 6833, "total_num_tokens": 768742126.0, "z_loss": 0.0005828276043757796 }, { "copy_logits_max": -5.096721649169922, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.875, "epoch": 1.395813122287465, "gen_logits_max": 3.5854296684265137, "gen_logits_mean": -16.207744598388672, "gen_logits_min": -28.062946319580078, "gen_logits_std": 3.149653434753418, "gen_loss": 0.2628962993621826, "grad_norm": 0.3943758390482833, "learning_rate": 2.2297263157894737e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9964066445827484, "mean_gen_accuracy": 0.8763362318277359, "mean_token_accuracy": 0.9051704406738281, "num_tokens": 768975456.0, "sample_num_tokens": 9558.5, "step": 6834, "total_num_tokens": 769013690.0, "z_loss": 0.00048069830518215895 }, { "copy_logits_max": -3.976839303970337, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.625, "epoch": 1.3960173602246617, "gen_logits_max": 4.115420818328857, "gen_logits_mean": -15.738815307617188, "gen_logits_min": -27.297931671142578, "gen_logits_std": 3.102555990219116, "gen_loss": 0.29685184359550476, "grad_norm": 0.39849353016347805, "learning_rate": 2.2296e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9958101958036423, "mean_gen_accuracy": 0.879060834646225, "mean_token_accuracy": 0.9076032191514969, "num_tokens": 769256898.0, "sample_num_tokens": 10007.5, "step": 6835, "total_num_tokens": 769296928.0, "z_loss": 0.0005383217940106988 }, { "copy_logits_max": -4.802028656005859, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.0, "epoch": 1.3962215981618586, "gen_logits_max": 3.4930615425109863, "gen_logits_mean": -16.038818359375, "gen_logits_min": -27.553909301757812, "gen_logits_std": 3.123230457305908, "gen_loss": 0.29859304428100586, "grad_norm": 0.3622761110964977, "learning_rate": 2.2294736842105266e-05, "loss": 0.2988, "mean_copy_accuracy": 0.995439812541008, "mean_gen_accuracy": 0.8689152598381042, "mean_token_accuracy": 0.8986831605434418, "num_tokens": 769545672.0, "sample_num_tokens": 9111.0, "step": 6836, "total_num_tokens": 769582116.0, "z_loss": 0.0005163488676771522 }, { "copy_logits_max": -4.185337543487549, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5, "epoch": 1.3964258360990554, "gen_logits_max": 3.6142935752868652, "gen_logits_mean": -16.49512481689453, "gen_logits_min": -28.423004150390625, "gen_logits_std": 3.1294398307800293, "gen_loss": 0.28335970640182495, "grad_norm": 0.38283854668439804, "learning_rate": 2.2293473684210527e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9961362928152084, "mean_gen_accuracy": 0.8786060959100723, "mean_token_accuracy": 0.9035548269748688, "num_tokens": 769791111.0, "sample_num_tokens": 8193.75, "step": 6837, "total_num_tokens": 769823886.0, "z_loss": 0.0005557756521739066 }, { "copy_logits_max": -5.698564529418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.375, "epoch": 1.3966300740362523, "gen_logits_max": 3.652845859527588, "gen_logits_mean": -16.4408016204834, "gen_logits_min": -27.902080535888672, "gen_logits_std": 3.1206612586975098, "gen_loss": 0.2743546664714813, "grad_norm": 0.3969100612888424, "learning_rate": 2.229221052631579e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9969336241483688, "mean_gen_accuracy": 0.87313611805439, "mean_token_accuracy": 0.9033733457326889, "num_tokens": 770048747.0, "sample_num_tokens": 8063.75, "step": 6838, "total_num_tokens": 770081002.0, "z_loss": 0.0004230201884638518 }, { "copy_logits_max": -5.639031887054443, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.125, "epoch": 1.396834311973449, "gen_logits_max": 4.114040851593018, "gen_logits_mean": -16.63010597229004, "gen_logits_min": -28.639995574951172, "gen_logits_std": 3.1106173992156982, "gen_loss": 0.31278374791145325, "grad_norm": 0.3725585948325721, "learning_rate": 2.2290947368421052e-05, "loss": 0.273, "mean_copy_accuracy": 0.9961753487586975, "mean_gen_accuracy": 0.8797323256731033, "mean_token_accuracy": 0.9068267792463303, "num_tokens": 770325936.0, "sample_num_tokens": 8796.0, "step": 6839, "total_num_tokens": 770361120.0, "z_loss": 0.0005665603675879538 }, { "copy_logits_max": -6.037695407867432, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.875, "epoch": 1.397038549910646, "gen_logits_max": 4.629452705383301, "gen_logits_mean": -15.709122657775879, "gen_logits_min": -27.178951263427734, "gen_logits_std": 3.0928378105163574, "gen_loss": 0.28710272908210754, "grad_norm": 0.47978568484890133, "learning_rate": 2.2289684210526316e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9953430891036987, "mean_gen_accuracy": 0.8842204660177231, "mean_token_accuracy": 0.9120471775531769, "num_tokens": 770594427.0, "sample_num_tokens": 8922.25, "step": 6840, "total_num_tokens": 770630116.0, "z_loss": 0.000470260827569291 }, { "copy_logits_max": -4.179418563842773, "copy_logits_min": -687500032.0, "copy_num_tokens": 625.0625, "epoch": 1.3972427878478428, "gen_logits_max": 3.05281400680542, "gen_logits_mean": -17.402082443237305, "gen_logits_min": -29.577938079833984, "gen_logits_std": 3.197282314300537, "gen_loss": 0.24633602797985077, "grad_norm": 0.4072918270694058, "learning_rate": 2.2288421052631577e-05, "loss": 0.2631, "mean_copy_accuracy": 0.9956710040569305, "mean_gen_accuracy": 0.8825435787439346, "mean_token_accuracy": 0.9101662933826447, "num_tokens": 770869855.0, "sample_num_tokens": 9178.75, "step": 6841, "total_num_tokens": 770906570.0, "z_loss": 0.000490133068524301 }, { "copy_logits_max": -5.468618392944336, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.9375, "epoch": 1.3974470257850395, "gen_logits_max": 3.3562707901000977, "gen_logits_mean": -16.01772689819336, "gen_logits_min": -27.66457176208496, "gen_logits_std": 3.090900421142578, "gen_loss": 0.2762722969055176, "grad_norm": 0.3584456950251939, "learning_rate": 2.228715789473684e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9967896640300751, "mean_gen_accuracy": 0.8742291331291199, "mean_token_accuracy": 0.9056868702173233, "num_tokens": 771152552.0, "sample_num_tokens": 8036.0, "step": 6842, "total_num_tokens": 771184696.0, "z_loss": 0.0004854879225604236 }, { "copy_logits_max": -5.471482276916504, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.1875, "epoch": 1.3976512637222365, "gen_logits_max": 4.931674957275391, "gen_logits_mean": -14.384934425354004, "gen_logits_min": -26.073354721069336, "gen_logits_std": 3.0503740310668945, "gen_loss": 0.3037831783294678, "grad_norm": 0.39038920435632823, "learning_rate": 2.2285894736842106e-05, "loss": 0.28, "mean_copy_accuracy": 0.9960246086120605, "mean_gen_accuracy": 0.8756959140300751, "mean_token_accuracy": 0.905765637755394, "num_tokens": 771431482.0, "sample_num_tokens": 8373.5, "step": 6843, "total_num_tokens": 771464976.0, "z_loss": 0.0005160572472959757 }, { "copy_logits_max": -5.829918384552002, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.1875, "epoch": 1.3978555016594332, "gen_logits_max": 4.316384315490723, "gen_logits_mean": -15.485427856445312, "gen_logits_min": -27.23388671875, "gen_logits_std": 3.113112449645996, "gen_loss": 0.2654880881309509, "grad_norm": 0.37393421518813486, "learning_rate": 2.228463157894737e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9962129443883896, "mean_gen_accuracy": 0.8780393153429031, "mean_token_accuracy": 0.9068750739097595, "num_tokens": 771687142.0, "sample_num_tokens": 8304.5, "step": 6844, "total_num_tokens": 771720360.0, "z_loss": 0.0004399239842314273 }, { "copy_logits_max": -6.6564106941223145, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.5625, "epoch": 1.39805973959663, "gen_logits_max": 3.8970487117767334, "gen_logits_mean": -14.900792121887207, "gen_logits_min": -27.067121505737305, "gen_logits_std": 3.0819225311279297, "gen_loss": 0.252464234828949, "grad_norm": 0.3695664461599929, "learning_rate": 2.2283368421052634e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9965234994888306, "mean_gen_accuracy": 0.8747316747903824, "mean_token_accuracy": 0.9094183593988419, "num_tokens": 771942907.0, "sample_num_tokens": 8941.75, "step": 6845, "total_num_tokens": 771978674.0, "z_loss": 0.00043261091923341155 }, { "copy_logits_max": -8.234853744506836, "copy_logits_min": -687500032.0, "copy_num_tokens": 531.8125, "epoch": 1.3982639775338268, "gen_logits_max": 3.8220131397247314, "gen_logits_mean": -16.06549835205078, "gen_logits_min": -27.99652862548828, "gen_logits_std": 3.105255126953125, "gen_loss": 0.24731096625328064, "grad_norm": 0.3622911186251653, "learning_rate": 2.2282105263157895e-05, "loss": 0.2644, "mean_copy_accuracy": 0.997172549366951, "mean_gen_accuracy": 0.8787106573581696, "mean_token_accuracy": 0.9101554900407791, "num_tokens": 772215236.0, "sample_num_tokens": 9027.0, "step": 6846, "total_num_tokens": 772251344.0, "z_loss": 0.0004064247477799654 }, { "copy_logits_max": -7.8371686935424805, "copy_logits_min": -687500032.0, "copy_num_tokens": 609.125, "epoch": 1.3984682154710237, "gen_logits_max": 3.5249383449554443, "gen_logits_mean": -16.03205680847168, "gen_logits_min": -27.501590728759766, "gen_logits_std": 3.056939125061035, "gen_loss": 0.29001036286354065, "grad_norm": 0.38358784985991884, "learning_rate": 2.228084210526316e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9966977834701538, "mean_gen_accuracy": 0.8710371553897858, "mean_token_accuracy": 0.9037686288356781, "num_tokens": 772489145.0, "sample_num_tokens": 10164.75, "step": 6847, "total_num_tokens": 772529804.0, "z_loss": 0.0004727250197902322 }, { "copy_logits_max": -6.535407066345215, "copy_logits_min": -625000000.0, "copy_num_tokens": 713.0625, "epoch": 1.3986724534082207, "gen_logits_max": 2.657148838043213, "gen_logits_mean": -16.069332122802734, "gen_logits_min": -27.976947784423828, "gen_logits_std": 3.0520284175872803, "gen_loss": 0.2351444661617279, "grad_norm": 0.3740700752355362, "learning_rate": 2.227957894736842e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9967218935489655, "mean_gen_accuracy": 0.8686251193284988, "mean_token_accuracy": 0.9026546031236649, "num_tokens": 772779843.0, "sample_num_tokens": 10028.25, "step": 6848, "total_num_tokens": 772819956.0, "z_loss": 0.0004246821044944227 }, { "copy_logits_max": -7.807662010192871, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.0, "epoch": 1.3988766913454174, "gen_logits_max": 4.1154584884643555, "gen_logits_mean": -15.99648380279541, "gen_logits_min": -27.997116088867188, "gen_logits_std": 3.086012840270996, "gen_loss": 0.22245344519615173, "grad_norm": 0.36119809094657385, "learning_rate": 2.2278315789473685e-05, "loss": 0.2596, "mean_copy_accuracy": 0.9972104877233505, "mean_gen_accuracy": 0.8797068297863007, "mean_token_accuracy": 0.911676362156868, "num_tokens": 773060823.0, "sample_num_tokens": 8356.75, "step": 6849, "total_num_tokens": 773094250.0, "z_loss": 0.00037401399458758533 }, { "copy_logits_max": -6.726388931274414, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.5625, "epoch": 1.3990809292826143, "gen_logits_max": 3.5948009490966797, "gen_logits_mean": -15.503623008728027, "gen_logits_min": -27.087215423583984, "gen_logits_std": 3.029639482498169, "gen_loss": 0.2928793430328369, "grad_norm": 0.3859841860293549, "learning_rate": 2.2277052631578946e-05, "loss": 0.274, "mean_copy_accuracy": 0.9967190474271774, "mean_gen_accuracy": 0.8775608986616135, "mean_token_accuracy": 0.9076943397521973, "num_tokens": 773322861.0, "sample_num_tokens": 7777.25, "step": 6850, "total_num_tokens": 773353970.0, "z_loss": 0.000478702480904758 }, { "copy_logits_max": -7.508523464202881, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.125, "epoch": 1.399285167219811, "gen_logits_max": 3.9985392093658447, "gen_logits_mean": -15.848466873168945, "gen_logits_min": -27.701818466186523, "gen_logits_std": 3.0378150939941406, "gen_loss": 0.2761915326118469, "grad_norm": 0.3548214444938973, "learning_rate": 2.227578947368421e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9966636449098587, "mean_gen_accuracy": 0.8800166696310043, "mean_token_accuracy": 0.9077930748462677, "num_tokens": 773590157.0, "sample_num_tokens": 8314.25, "step": 6851, "total_num_tokens": 773623414.0, "z_loss": 0.00047178679960779846 }, { "copy_logits_max": -5.7964630126953125, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.8125, "epoch": 1.399489405157008, "gen_logits_max": 4.229299545288086, "gen_logits_mean": -15.247686386108398, "gen_logits_min": -27.04834747314453, "gen_logits_std": 3.0126936435699463, "gen_loss": 0.2914767861366272, "grad_norm": 0.37930981458630353, "learning_rate": 2.2274526315789474e-05, "loss": 0.2829, "mean_copy_accuracy": 0.9959786385297775, "mean_gen_accuracy": 0.8773677349090576, "mean_token_accuracy": 0.904596745967865, "num_tokens": 773843130.0, "sample_num_tokens": 8473.0, "step": 6852, "total_num_tokens": 773877022.0, "z_loss": 0.0005597930285148323 }, { "copy_logits_max": -6.616660118103027, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.625, "epoch": 1.3996936430942046, "gen_logits_max": 3.639342784881592, "gen_logits_mean": -15.843870162963867, "gen_logits_min": -27.400665283203125, "gen_logits_std": 3.012005090713501, "gen_loss": 0.2752980589866638, "grad_norm": 0.37048253904638173, "learning_rate": 2.227326315789474e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9968262165784836, "mean_gen_accuracy": 0.8768189996480942, "mean_token_accuracy": 0.9061803072690964, "num_tokens": 774115540.0, "sample_num_tokens": 8301.0, "step": 6853, "total_num_tokens": 774148744.0, "z_loss": 0.0005764561938121915 }, { "copy_logits_max": -6.319607734680176, "copy_logits_min": -750000128.0, "copy_num_tokens": 686.8125, "epoch": 1.3998978810314016, "gen_logits_max": 2.9049196243286133, "gen_logits_mean": -16.317028045654297, "gen_logits_min": -28.350936889648438, "gen_logits_std": 3.095001697540283, "gen_loss": 0.23319515585899353, "grad_norm": 0.38430854347289756, "learning_rate": 2.2272e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9971368163824081, "mean_gen_accuracy": 0.8786722719669342, "mean_token_accuracy": 0.9096446335315704, "num_tokens": 774392903.0, "sample_num_tokens": 9639.25, "step": 6854, "total_num_tokens": 774431460.0, "z_loss": 0.00041620703996159136 }, { "copy_logits_max": -5.72639274597168, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.0, "epoch": 1.4001021189685985, "gen_logits_max": 4.238489151000977, "gen_logits_mean": -13.7399263381958, "gen_logits_min": -25.758466720581055, "gen_logits_std": 2.968789577484131, "gen_loss": 0.2375805824995041, "grad_norm": 0.3629428017540147, "learning_rate": 2.2270736842105264e-05, "loss": 0.2592, "mean_copy_accuracy": 0.9973527044057846, "mean_gen_accuracy": 0.8801998198032379, "mean_token_accuracy": 0.9115746915340424, "num_tokens": 774648542.0, "sample_num_tokens": 9234.0, "step": 6855, "total_num_tokens": 774685478.0, "z_loss": 0.00047773768892511725 }, { "copy_logits_max": -6.557771682739258, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.4375, "epoch": 1.4003063569057952, "gen_logits_max": 3.8437159061431885, "gen_logits_mean": -16.166183471679688, "gen_logits_min": -28.41517448425293, "gen_logits_std": 3.0949904918670654, "gen_loss": 0.2862648665904999, "grad_norm": 0.7771211811485034, "learning_rate": 2.2269473684210528e-05, "loss": 0.3027, "mean_copy_accuracy": 0.9958323389291763, "mean_gen_accuracy": 0.8657210916280746, "mean_token_accuracy": 0.8989316672086716, "num_tokens": 774921773.0, "sample_num_tokens": 8996.25, "step": 6856, "total_num_tokens": 774957758.0, "z_loss": 0.0005006755236536264 }, { "copy_logits_max": -6.498927593231201, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.8125, "epoch": 1.4005105948429921, "gen_logits_max": 3.9396677017211914, "gen_logits_mean": -16.94204330444336, "gen_logits_min": -29.054628372192383, "gen_logits_std": 3.134669780731201, "gen_loss": 0.3007388114929199, "grad_norm": 0.366299022520923, "learning_rate": 2.226821052631579e-05, "loss": 0.2958, "mean_copy_accuracy": 0.9963299185037613, "mean_gen_accuracy": 0.8685683012008667, "mean_token_accuracy": 0.8996508419513702, "num_tokens": 775192495.0, "sample_num_tokens": 9061.75, "step": 6857, "total_num_tokens": 775228742.0, "z_loss": 0.0005599907017312944 }, { "copy_logits_max": -5.556338310241699, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.0625, "epoch": 1.400714832780189, "gen_logits_max": 4.40244197845459, "gen_logits_mean": -16.52874755859375, "gen_logits_min": -28.200937271118164, "gen_logits_std": 3.141171455383301, "gen_loss": 0.2805827856063843, "grad_norm": 0.34767849231490033, "learning_rate": 2.2266947368421053e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9975545108318329, "mean_gen_accuracy": 0.8796762675046921, "mean_token_accuracy": 0.9082376807928085, "num_tokens": 775466484.0, "sample_num_tokens": 7859.5, "step": 6858, "total_num_tokens": 775497922.0, "z_loss": 0.0005364217213355005 }, { "copy_logits_max": -5.414212703704834, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.75, "epoch": 1.4009190707173858, "gen_logits_max": 3.327876567840576, "gen_logits_mean": -17.598403930664062, "gen_logits_min": -29.368507385253906, "gen_logits_std": 3.2026987075805664, "gen_loss": 0.23922517895698547, "grad_norm": 0.3411465641894072, "learning_rate": 2.2265684210526314e-05, "loss": 0.2529, "mean_copy_accuracy": 0.9973310679197311, "mean_gen_accuracy": 0.8827771097421646, "mean_token_accuracy": 0.9141711890697479, "num_tokens": 775760553.0, "sample_num_tokens": 7810.75, "step": 6859, "total_num_tokens": 775791796.0, "z_loss": 0.0004702972073573619 }, { "copy_logits_max": -6.05727481842041, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.1875, "epoch": 1.4011233086545825, "gen_logits_max": 3.1023786067962646, "gen_logits_mean": -16.73889923095703, "gen_logits_min": -28.627756118774414, "gen_logits_std": 3.1193065643310547, "gen_loss": 0.3004137873649597, "grad_norm": 0.39630318091067035, "learning_rate": 2.2264421052631582e-05, "loss": 0.3106, "mean_copy_accuracy": 0.9955220222473145, "mean_gen_accuracy": 0.8648319691419601, "mean_token_accuracy": 0.8944932073354721, "num_tokens": 775994131.0, "sample_num_tokens": 7247.75, "step": 6860, "total_num_tokens": 776023122.0, "z_loss": 0.0005197337595745921 }, { "copy_logits_max": -7.711522579193115, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.3125, "epoch": 1.4013275465917794, "gen_logits_max": 3.445830821990967, "gen_logits_mean": -17.32619857788086, "gen_logits_min": -29.2091007232666, "gen_logits_std": 3.1683125495910645, "gen_loss": 0.2630322277545929, "grad_norm": 0.3647091889800128, "learning_rate": 2.2263157894736843e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9970107227563858, "mean_gen_accuracy": 0.8844129294157028, "mean_token_accuracy": 0.9089274406433105, "num_tokens": 776243254.0, "sample_num_tokens": 8536.0, "step": 6861, "total_num_tokens": 776277398.0, "z_loss": 0.0004518043133430183 }, { "copy_logits_max": -8.637593269348145, "copy_logits_min": -687500032.0, "copy_num_tokens": 413.0, "epoch": 1.4015317845289763, "gen_logits_max": 4.143916130065918, "gen_logits_mean": -16.167659759521484, "gen_logits_min": -28.1684627532959, "gen_logits_std": 3.124981641769409, "gen_loss": 0.28327620029449463, "grad_norm": 0.37889077841464175, "learning_rate": 2.2261894736842107e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9967541694641113, "mean_gen_accuracy": 0.8697715699672699, "mean_token_accuracy": 0.9051367044448853, "num_tokens": 776523453.0, "sample_num_tokens": 8128.25, "step": 6862, "total_num_tokens": 776555966.0, "z_loss": 0.0004834520223084837 }, { "copy_logits_max": -6.005206108093262, "copy_logits_min": -687500032.0, "copy_num_tokens": 812.625, "epoch": 1.401736022466173, "gen_logits_max": 3.1475186347961426, "gen_logits_mean": -16.308618545532227, "gen_logits_min": -28.512344360351562, "gen_logits_std": 3.164882183074951, "gen_loss": 0.22244594991207123, "grad_norm": 0.38918860671331607, "learning_rate": 2.2260631578947368e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9961663633584976, "mean_gen_accuracy": 0.8702707141637802, "mean_token_accuracy": 0.9066388756036758, "num_tokens": 776782695.0, "sample_num_tokens": 9651.75, "step": 6863, "total_num_tokens": 776821302.0, "z_loss": 0.0004268519696779549 }, { "copy_logits_max": -7.423108100891113, "copy_logits_min": -687500032.0, "copy_num_tokens": 651.1875, "epoch": 1.40194026040337, "gen_logits_max": 4.178735733032227, "gen_logits_mean": -14.604898452758789, "gen_logits_min": -26.588178634643555, "gen_logits_std": 3.0836117267608643, "gen_loss": 0.24062135815620422, "grad_norm": 0.3359836406209606, "learning_rate": 2.2259368421052632e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9967052787542343, "mean_gen_accuracy": 0.8751390427350998, "mean_token_accuracy": 0.904869481921196, "num_tokens": 777061124.0, "sample_num_tokens": 9728.5, "step": 6864, "total_num_tokens": 777100038.0, "z_loss": 0.00045847438741475344 }, { "copy_logits_max": -7.489200115203857, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.5625, "epoch": 1.4021444983405669, "gen_logits_max": 4.004496097564697, "gen_logits_mean": -16.772525787353516, "gen_logits_min": -28.83206558227539, "gen_logits_std": 3.172593593597412, "gen_loss": 0.22691841423511505, "grad_norm": 0.5107327766788978, "learning_rate": 2.2258105263157893e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9962546974420547, "mean_gen_accuracy": 0.8760191351175308, "mean_token_accuracy": 0.9075851589441299, "num_tokens": 777342878.0, "sample_num_tokens": 8184.0, "step": 6865, "total_num_tokens": 777375614.0, "z_loss": 0.00039461831329390407 }, { "copy_logits_max": -6.690402984619141, "copy_logits_min": -750000064.0, "copy_num_tokens": 429.5, "epoch": 1.4023487362777636, "gen_logits_max": 4.472214698791504, "gen_logits_mean": -15.948734283447266, "gen_logits_min": -28.270977020263672, "gen_logits_std": 3.110184907913208, "gen_loss": 0.32606256008148193, "grad_norm": 0.36682444517944035, "learning_rate": 2.2256842105263158e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9970918446779251, "mean_gen_accuracy": 0.8731843084096909, "mean_token_accuracy": 0.9030033051967621, "num_tokens": 777611544.0, "sample_num_tokens": 8769.5, "step": 6866, "total_num_tokens": 777646622.0, "z_loss": 0.0004967320710420609 }, { "copy_logits_max": -3.5700347423553467, "copy_logits_min": -750000000.0, "copy_num_tokens": 698.6875, "epoch": 1.4025529742149605, "gen_logits_max": 3.161653995513916, "gen_logits_mean": -16.04913330078125, "gen_logits_min": -28.295392990112305, "gen_logits_std": 3.1414315700531006, "gen_loss": 0.28262126445770264, "grad_norm": 0.3777356164598465, "learning_rate": 2.225557894736842e-05, "loss": 0.2692, "mean_copy_accuracy": 0.9968397617340088, "mean_gen_accuracy": 0.875190943479538, "mean_token_accuracy": 0.9088967144489288, "num_tokens": 777907706.0, "sample_num_tokens": 9380.5, "step": 6867, "total_num_tokens": 777945228.0, "z_loss": 0.00047869328409433365 }, { "copy_logits_max": -5.635766506195068, "copy_logits_min": -687500032.0, "copy_num_tokens": 431.375, "epoch": 1.4027572121521572, "gen_logits_max": 3.4394359588623047, "gen_logits_mean": -16.744232177734375, "gen_logits_min": -28.780685424804688, "gen_logits_std": 3.1604933738708496, "gen_loss": 0.2621271014213562, "grad_norm": 0.3729736445164406, "learning_rate": 2.2254315789473686e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9963085949420929, "mean_gen_accuracy": 0.8729686737060547, "mean_token_accuracy": 0.9040957689285278, "num_tokens": 778178993.0, "sample_num_tokens": 7599.25, "step": 6868, "total_num_tokens": 778209390.0, "z_loss": 0.0004312474047765136 }, { "copy_logits_max": -4.848630428314209, "copy_logits_min": -625000064.0, "copy_num_tokens": 705.25, "epoch": 1.4029614500893541, "gen_logits_max": 3.723958969116211, "gen_logits_mean": -16.30583953857422, "gen_logits_min": -28.67214012145996, "gen_logits_std": 3.133437156677246, "gen_loss": 0.29530155658721924, "grad_norm": 0.34127511544307415, "learning_rate": 2.225305263157895e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9967108815908432, "mean_gen_accuracy": 0.8781526237726212, "mean_token_accuracy": 0.9107061177492142, "num_tokens": 778453700.0, "sample_num_tokens": 9698.0, "step": 6869, "total_num_tokens": 778492492.0, "z_loss": 0.00048736814642325044 }, { "copy_logits_max": -8.860241889953613, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.875, "epoch": 1.4031656880265508, "gen_logits_max": 4.854492664337158, "gen_logits_mean": -15.262605667114258, "gen_logits_min": -26.8520565032959, "gen_logits_std": 3.057006359100342, "gen_loss": 0.30222952365875244, "grad_norm": 0.35930459857612856, "learning_rate": 2.225178947368421e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9966979175806046, "mean_gen_accuracy": 0.8748804181814194, "mean_token_accuracy": 0.9027455449104309, "num_tokens": 778724124.0, "sample_num_tokens": 8640.0, "step": 6870, "total_num_tokens": 778758684.0, "z_loss": 0.0005032338085584342 }, { "copy_logits_max": -8.014272689819336, "copy_logits_min": -687500032.0, "copy_num_tokens": 286.25, "epoch": 1.4033699259637478, "gen_logits_max": 4.763925552368164, "gen_logits_mean": -15.132024765014648, "gen_logits_min": -27.084148406982422, "gen_logits_std": 3.0608062744140625, "gen_loss": 0.2901325821876526, "grad_norm": 0.3752707874735685, "learning_rate": 2.2250526315789476e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9956166297197342, "mean_gen_accuracy": 0.8731331527233124, "mean_token_accuracy": 0.9038367420434952, "num_tokens": 778989985.0, "sample_num_tokens": 6165.25, "step": 6871, "total_num_tokens": 779014646.0, "z_loss": 0.0004719134885817766 }, { "copy_logits_max": -9.502518653869629, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.9375, "epoch": 1.4035741639009447, "gen_logits_max": 5.2611403465271, "gen_logits_mean": -15.173452377319336, "gen_logits_min": -27.444726943969727, "gen_logits_std": 3.091501474380493, "gen_loss": 0.24861937761306763, "grad_norm": 0.39717022397577706, "learning_rate": 2.2249263157894737e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9962755590677261, "mean_gen_accuracy": 0.8774413913488388, "mean_token_accuracy": 0.9051167815923691, "num_tokens": 779244453.0, "sample_num_tokens": 7125.75, "step": 6872, "total_num_tokens": 779272956.0, "z_loss": 0.0004221614508423954 }, { "copy_logits_max": -8.258150100708008, "copy_logits_min": -562500032.0, "copy_num_tokens": 535.75, "epoch": 1.4037784018381414, "gen_logits_max": 4.872795104980469, "gen_logits_mean": -16.448169708251953, "gen_logits_min": -28.426956176757812, "gen_logits_std": 3.1047987937927246, "gen_loss": 0.24801005423069, "grad_norm": 0.4285134283675805, "learning_rate": 2.2248e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9946288615465164, "mean_gen_accuracy": 0.8840437680482864, "mean_token_accuracy": 0.9095311313867569, "num_tokens": 779493474.0, "sample_num_tokens": 8325.5, "step": 6873, "total_num_tokens": 779526776.0, "z_loss": 0.0003989448305219412 }, { "copy_logits_max": -6.382401943206787, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.375, "epoch": 1.4039826397753383, "gen_logits_max": 4.938329696655273, "gen_logits_mean": -14.927579879760742, "gen_logits_min": -27.012393951416016, "gen_logits_std": 3.063007116317749, "gen_loss": 0.27403461933135986, "grad_norm": 0.36052071166856586, "learning_rate": 2.2246736842105262e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9963077753782272, "mean_gen_accuracy": 0.8743649125099182, "mean_token_accuracy": 0.9051166623830795, "num_tokens": 779782859.0, "sample_num_tokens": 9661.25, "step": 6874, "total_num_tokens": 779821504.0, "z_loss": 0.0005274621653370559 }, { "copy_logits_max": -8.209920883178711, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.4375, "epoch": 1.404186877712535, "gen_logits_max": 5.182869911193848, "gen_logits_mean": -15.446012496948242, "gen_logits_min": -27.217967987060547, "gen_logits_std": 3.047306776046753, "gen_loss": 0.3099019527435303, "grad_norm": 0.3937019342180703, "learning_rate": 2.2245473684210526e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9966554939746857, "mean_gen_accuracy": 0.8732930272817612, "mean_token_accuracy": 0.9045725762844086, "num_tokens": 780045870.0, "sample_num_tokens": 8596.5, "step": 6875, "total_num_tokens": 780080256.0, "z_loss": 0.0005877481307834387 }, { "copy_logits_max": -5.770556449890137, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.25, "epoch": 1.404391115649732, "gen_logits_max": 4.364444732666016, "gen_logits_mean": -17.142004013061523, "gen_logits_min": -29.002262115478516, "gen_logits_std": 3.1556859016418457, "gen_loss": 0.2905293107032776, "grad_norm": 0.35671553185912747, "learning_rate": 2.224421052631579e-05, "loss": 0.2564, "mean_copy_accuracy": 0.9963236004114151, "mean_gen_accuracy": 0.8783707618713379, "mean_token_accuracy": 0.9120252281427383, "num_tokens": 780326407.0, "sample_num_tokens": 7399.75, "step": 6876, "total_num_tokens": 780356006.0, "z_loss": 0.000579227227717638 }, { "copy_logits_max": -6.626534461975098, "copy_logits_min": -687500032.0, "copy_num_tokens": 313.8125, "epoch": 1.4045953535869287, "gen_logits_max": 4.64754056930542, "gen_logits_mean": -16.658000946044922, "gen_logits_min": -28.530174255371094, "gen_logits_std": 3.1141953468322754, "gen_loss": 0.3329305946826935, "grad_norm": 0.3608295159272845, "learning_rate": 2.2242947368421055e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9962768107652664, "mean_gen_accuracy": 0.8790756314992905, "mean_token_accuracy": 0.9076238572597504, "num_tokens": 780592296.0, "sample_num_tokens": 7133.0, "step": 6877, "total_num_tokens": 780620828.0, "z_loss": 0.0005827081622555852 }, { "copy_logits_max": -5.5584330558776855, "copy_logits_min": -687500032.0, "copy_num_tokens": 568.9375, "epoch": 1.4047995915241256, "gen_logits_max": 4.455430030822754, "gen_logits_mean": -15.814483642578125, "gen_logits_min": -28.046791076660156, "gen_logits_std": 3.1395959854125977, "gen_loss": 0.2426488697528839, "grad_norm": 0.41752103279231734, "learning_rate": 2.2241684210526316e-05, "loss": 0.2728, "mean_copy_accuracy": 0.997250109910965, "mean_gen_accuracy": 0.87834432721138, "mean_token_accuracy": 0.9083057940006256, "num_tokens": 780843829.0, "sample_num_tokens": 8173.25, "step": 6878, "total_num_tokens": 780876522.0, "z_loss": 0.0004710997745860368 }, { "copy_logits_max": -7.141112327575684, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.0625, "epoch": 1.4050038294613225, "gen_logits_max": 4.5864057540893555, "gen_logits_mean": -16.666706085205078, "gen_logits_min": -28.632261276245117, "gen_logits_std": 3.164763927459717, "gen_loss": 0.23787908256053925, "grad_norm": 0.40365268618326416, "learning_rate": 2.224042105263158e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9963332265615463, "mean_gen_accuracy": 0.8780331164598465, "mean_token_accuracy": 0.9088293015956879, "num_tokens": 781107169.0, "sample_num_tokens": 7902.75, "step": 6879, "total_num_tokens": 781138780.0, "z_loss": 0.00046145846135914326 }, { "copy_logits_max": -8.63107681274414, "copy_logits_min": -750000064.0, "copy_num_tokens": 362.1875, "epoch": 1.4052080673985192, "gen_logits_max": 6.483420372009277, "gen_logits_mean": -14.466087341308594, "gen_logits_min": -26.513565063476562, "gen_logits_std": 3.070277452468872, "gen_loss": 0.3008818030357361, "grad_norm": 0.38254064932552617, "learning_rate": 2.223915789473684e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9949068427085876, "mean_gen_accuracy": 0.8794241100549698, "mean_token_accuracy": 0.9003239721059799, "num_tokens": 781352443.0, "sample_num_tokens": 7827.25, "step": 6880, "total_num_tokens": 781383752.0, "z_loss": 0.0005815428448840976 }, { "copy_logits_max": -7.1902008056640625, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.25, "epoch": 1.4054123053357161, "gen_logits_max": 4.504875183105469, "gen_logits_mean": -16.41376495361328, "gen_logits_min": -27.952756881713867, "gen_logits_std": 3.133594036102295, "gen_loss": 0.28679507970809937, "grad_norm": 0.3654622481712121, "learning_rate": 2.2237894736842105e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9963376075029373, "mean_gen_accuracy": 0.8806296288967133, "mean_token_accuracy": 0.9107457846403122, "num_tokens": 781630993.0, "sample_num_tokens": 6631.25, "step": 6881, "total_num_tokens": 781657518.0, "z_loss": 0.0004900689236819744 }, { "copy_logits_max": -7.544337749481201, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.9375, "epoch": 1.405616543272913, "gen_logits_max": 4.655662536621094, "gen_logits_mean": -16.644756317138672, "gen_logits_min": -28.0924072265625, "gen_logits_std": 3.1044790744781494, "gen_loss": 0.27963167428970337, "grad_norm": 0.36042742105956915, "learning_rate": 2.223663157894737e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9966589957475662, "mean_gen_accuracy": 0.8783726543188095, "mean_token_accuracy": 0.907935306429863, "num_tokens": 781891784.0, "sample_num_tokens": 8700.5, "step": 6882, "total_num_tokens": 781926586.0, "z_loss": 0.0004989571170881391 }, { "copy_logits_max": -7.850753307342529, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.0, "epoch": 1.4058207812101098, "gen_logits_max": 4.29110050201416, "gen_logits_mean": -15.5582275390625, "gen_logits_min": -27.33509063720703, "gen_logits_std": 3.0706396102905273, "gen_loss": 0.2593592405319214, "grad_norm": 0.3707355490032519, "learning_rate": 2.223536842105263e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9970389008522034, "mean_gen_accuracy": 0.8772110342979431, "mean_token_accuracy": 0.908046618103981, "num_tokens": 782145029.0, "sample_num_tokens": 8214.25, "step": 6883, "total_num_tokens": 782177886.0, "z_loss": 0.0004740118747577071 }, { "copy_logits_max": -6.958563327789307, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.375, "epoch": 1.4060250191473065, "gen_logits_max": 5.030094623565674, "gen_logits_mean": -14.356471061706543, "gen_logits_min": -26.2891845703125, "gen_logits_std": 3.114590883255005, "gen_loss": 0.22455698251724243, "grad_norm": 0.34887170450552923, "learning_rate": 2.22341052631579e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9963884502649307, "mean_gen_accuracy": 0.8845761120319366, "mean_token_accuracy": 0.9101001769304276, "num_tokens": 782444734.0, "sample_num_tokens": 10060.5, "step": 6884, "total_num_tokens": 782484976.0, "z_loss": 0.0004143443948123604 }, { "copy_logits_max": -8.113816261291504, "copy_logits_min": -750000000.0, "copy_num_tokens": 612.3125, "epoch": 1.4062292570845034, "gen_logits_max": 4.53824520111084, "gen_logits_mean": -16.57794189453125, "gen_logits_min": -28.581684112548828, "gen_logits_std": 3.104032516479492, "gen_loss": 0.2348524034023285, "grad_norm": 0.34481790820862396, "learning_rate": 2.223284210526316e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9969164580106735, "mean_gen_accuracy": 0.8747458904981613, "mean_token_accuracy": 0.9077101498842239, "num_tokens": 782740957.0, "sample_num_tokens": 10814.75, "step": 6885, "total_num_tokens": 782784216.0, "z_loss": 0.00041598593816161156 }, { "copy_logits_max": -5.907252311706543, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.375, "epoch": 1.4064334950217003, "gen_logits_max": 3.248760461807251, "gen_logits_mean": -16.291465759277344, "gen_logits_min": -28.401411056518555, "gen_logits_std": 3.175626754760742, "gen_loss": 0.22706976532936096, "grad_norm": 0.34963735756812786, "learning_rate": 2.2231578947368424e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9970659613609314, "mean_gen_accuracy": 0.8804731369018555, "mean_token_accuracy": 0.9103022515773773, "num_tokens": 783030564.0, "sample_num_tokens": 9610.5, "step": 6886, "total_num_tokens": 783069006.0, "z_loss": 0.00039410300087183714 }, { "copy_logits_max": -7.044124603271484, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.75, "epoch": 1.406637732958897, "gen_logits_max": 3.9966514110565186, "gen_logits_mean": -16.477554321289062, "gen_logits_min": -28.43189811706543, "gen_logits_std": 3.155909538269043, "gen_loss": 0.2808722257614136, "grad_norm": 0.33673333998405935, "learning_rate": 2.2230315789473685e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9963420033454895, "mean_gen_accuracy": 0.8753305673599243, "mean_token_accuracy": 0.9065076857805252, "num_tokens": 783302393.0, "sample_num_tokens": 7034.75, "step": 6887, "total_num_tokens": 783330532.0, "z_loss": 0.0004689083434641361 }, { "copy_logits_max": -7.134011745452881, "copy_logits_min": -750000064.0, "copy_num_tokens": 409.8125, "epoch": 1.406841970896094, "gen_logits_max": 3.849416732788086, "gen_logits_mean": -15.597084999084473, "gen_logits_min": -27.094436645507812, "gen_logits_std": 3.0795514583587646, "gen_loss": 0.2760240137577057, "grad_norm": 0.3562057670518067, "learning_rate": 2.222905263157895e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9971714019775391, "mean_gen_accuracy": 0.8725358247756958, "mean_token_accuracy": 0.9068047553300858, "num_tokens": 783584895.0, "sample_num_tokens": 7774.25, "step": 6888, "total_num_tokens": 783615992.0, "z_loss": 0.0004924163222312927 }, { "copy_logits_max": -7.234273433685303, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.25, "epoch": 1.407046208833291, "gen_logits_max": 4.194573402404785, "gen_logits_mean": -16.149856567382812, "gen_logits_min": -27.75206756591797, "gen_logits_std": 3.1425342559814453, "gen_loss": 0.3082156181335449, "grad_norm": 0.36913892688158856, "learning_rate": 2.222778947368421e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9962875992059708, "mean_gen_accuracy": 0.8786519467830658, "mean_token_accuracy": 0.9039113968610764, "num_tokens": 783844518.0, "sample_num_tokens": 7366.0, "step": 6889, "total_num_tokens": 783873982.0, "z_loss": 0.0005094901425763965 }, { "copy_logits_max": -6.44412899017334, "copy_logits_min": -625000064.0, "copy_num_tokens": 468.6875, "epoch": 1.4072504467704876, "gen_logits_max": 3.169187068939209, "gen_logits_mean": -17.42409896850586, "gen_logits_min": -29.752599716186523, "gen_logits_std": 3.205343246459961, "gen_loss": 0.27480536699295044, "grad_norm": 0.3947239409582058, "learning_rate": 2.2226526315789474e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9972334653139114, "mean_gen_accuracy": 0.8721577674150467, "mean_token_accuracy": 0.9028276354074478, "num_tokens": 784101479.0, "sample_num_tokens": 8221.75, "step": 6890, "total_num_tokens": 784134366.0, "z_loss": 0.00044960423838347197 }, { "copy_logits_max": -9.386541366577148, "copy_logits_min": -750000000.0, "copy_num_tokens": 273.9375, "epoch": 1.4074546847076845, "gen_logits_max": 4.275135517120361, "gen_logits_mean": -17.49775505065918, "gen_logits_min": -28.8176212310791, "gen_logits_std": 3.1490368843078613, "gen_loss": 0.31252604722976685, "grad_norm": 0.3850934711390159, "learning_rate": 2.2225263157894735e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9958197474479675, "mean_gen_accuracy": 0.8751521706581116, "mean_token_accuracy": 0.9022537171840668, "num_tokens": 784368266.0, "sample_num_tokens": 7622.5, "step": 6891, "total_num_tokens": 784398756.0, "z_loss": 0.0005232631810940802 }, { "copy_logits_max": -8.646892547607422, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.25, "epoch": 1.4076589226448812, "gen_logits_max": 3.967884063720703, "gen_logits_mean": -17.059642791748047, "gen_logits_min": -28.830276489257812, "gen_logits_std": 3.158294677734375, "gen_loss": 0.28402769565582275, "grad_norm": 0.3808861569415777, "learning_rate": 2.2224e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9968697279691696, "mean_gen_accuracy": 0.8796305656433105, "mean_token_accuracy": 0.904271811246872, "num_tokens": 784609905.0, "sample_num_tokens": 7550.25, "step": 6892, "total_num_tokens": 784640106.0, "z_loss": 0.0004990889574401081 }, { "copy_logits_max": -8.202337265014648, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.875, "epoch": 1.4078631605820782, "gen_logits_max": 4.341182231903076, "gen_logits_mean": -16.795076370239258, "gen_logits_min": -28.801620483398438, "gen_logits_std": 3.144562005996704, "gen_loss": 0.33021774888038635, "grad_norm": 0.3875882780777501, "learning_rate": 2.2222736842105264e-05, "loss": 0.302, "mean_copy_accuracy": 0.9962039589881897, "mean_gen_accuracy": 0.8717013746500015, "mean_token_accuracy": 0.8975434303283691, "num_tokens": 784876245.0, "sample_num_tokens": 7154.25, "step": 6893, "total_num_tokens": 784904862.0, "z_loss": 0.0005391676677390933 }, { "copy_logits_max": -7.412735462188721, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.5, "epoch": 1.4080673985192749, "gen_logits_max": 4.003123760223389, "gen_logits_mean": -17.21792221069336, "gen_logits_min": -28.951753616333008, "gen_logits_std": 3.1797661781311035, "gen_loss": 0.28171640634536743, "grad_norm": 0.3827776151306587, "learning_rate": 2.2221473684210528e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9964747577905655, "mean_gen_accuracy": 0.8747858107089996, "mean_token_accuracy": 0.9013292789459229, "num_tokens": 785129087.0, "sample_num_tokens": 7839.25, "step": 6894, "total_num_tokens": 785160444.0, "z_loss": 0.00048808762221597135 }, { "copy_logits_max": -7.411921977996826, "copy_logits_min": -750000064.0, "copy_num_tokens": 414.875, "epoch": 1.4082716364564718, "gen_logits_max": 3.1550137996673584, "gen_logits_mean": -17.970382690429688, "gen_logits_min": -30.111215591430664, "gen_logits_std": 3.1952757835388184, "gen_loss": 0.2351589798927307, "grad_norm": 0.3726966839168855, "learning_rate": 2.222021052631579e-05, "loss": 0.2611, "mean_copy_accuracy": 0.996510699391365, "mean_gen_accuracy": 0.8816912323236465, "mean_token_accuracy": 0.9114983528852463, "num_tokens": 785388209.0, "sample_num_tokens": 7698.75, "step": 6895, "total_num_tokens": 785419004.0, "z_loss": 0.0004515695327427238 }, { "copy_logits_max": -6.146065711975098, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.625, "epoch": 1.4084758743936687, "gen_logits_max": 4.503486633300781, "gen_logits_mean": -15.192808151245117, "gen_logits_min": -27.625080108642578, "gen_logits_std": 3.099017381668091, "gen_loss": 0.2908826470375061, "grad_norm": 0.357408794867612, "learning_rate": 2.2218947368421053e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9958926439285278, "mean_gen_accuracy": 0.871685341000557, "mean_token_accuracy": 0.9022900760173798, "num_tokens": 785653770.0, "sample_num_tokens": 8092.5, "step": 6896, "total_num_tokens": 785686140.0, "z_loss": 0.0005765149253420532 }, { "copy_logits_max": -8.138355255126953, "copy_logits_min": -687500032.0, "copy_num_tokens": 310.6875, "epoch": 1.4086801123308654, "gen_logits_max": 3.7624282836914062, "gen_logits_mean": -16.480310440063477, "gen_logits_min": -28.42927360534668, "gen_logits_std": 3.144404888153076, "gen_loss": 0.2529538869857788, "grad_norm": 0.3894087038926885, "learning_rate": 2.2217684210526317e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9962895810604095, "mean_gen_accuracy": 0.8751253485679626, "mean_token_accuracy": 0.9014184474945068, "num_tokens": 785916287.0, "sample_num_tokens": 7681.75, "step": 6897, "total_num_tokens": 785947014.0, "z_loss": 0.00045194063568487763 }, { "copy_logits_max": -8.07326889038086, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.9375, "epoch": 1.4088843502680624, "gen_logits_max": 3.639416217803955, "gen_logits_mean": -18.04808807373047, "gen_logits_min": -29.906055450439453, "gen_logits_std": 3.170161247253418, "gen_loss": 0.29370927810668945, "grad_norm": 0.36851104162549836, "learning_rate": 2.221642105263158e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9966197907924652, "mean_gen_accuracy": 0.8753665536642075, "mean_token_accuracy": 0.9080945253372192, "num_tokens": 786190131.0, "sample_num_tokens": 7151.25, "step": 6898, "total_num_tokens": 786218736.0, "z_loss": 0.00047836749581620097 }, { "copy_logits_max": -7.274052619934082, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.3125, "epoch": 1.409088588205259, "gen_logits_max": 4.608467102050781, "gen_logits_mean": -16.79517364501953, "gen_logits_min": -28.933971405029297, "gen_logits_std": 3.153158664703369, "gen_loss": 0.29401564598083496, "grad_norm": 0.4007117177015212, "learning_rate": 2.2215157894736843e-05, "loss": 0.2877, "mean_copy_accuracy": 0.996089443564415, "mean_gen_accuracy": 0.8757407069206238, "mean_token_accuracy": 0.902516558766365, "num_tokens": 786454201.0, "sample_num_tokens": 7184.25, "step": 6899, "total_num_tokens": 786482938.0, "z_loss": 0.0005053507629781961 }, { "copy_logits_max": -7.843462944030762, "copy_logits_min": -750000000.0, "copy_num_tokens": 271.1875, "epoch": 1.409292826142456, "gen_logits_max": 4.873326301574707, "gen_logits_mean": -17.520343780517578, "gen_logits_min": -29.56639862060547, "gen_logits_std": 3.177638053894043, "gen_loss": 0.3010367751121521, "grad_norm": 0.3810896046923116, "learning_rate": 2.2213894736842104e-05, "loss": 0.2984, "mean_copy_accuracy": 0.9957339018583298, "mean_gen_accuracy": 0.8701661378145218, "mean_token_accuracy": 0.8983978927135468, "num_tokens": 786725639.0, "sample_num_tokens": 7127.25, "step": 6900, "total_num_tokens": 786754148.0, "z_loss": 0.0005103705916553736 }, { "copy_logits_max": -6.343196868896484, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.625, "epoch": 1.4094970640796527, "gen_logits_max": 3.8893489837646484, "gen_logits_mean": -16.955970764160156, "gen_logits_min": -29.009178161621094, "gen_logits_std": 3.1439695358276367, "gen_loss": 0.31028079986572266, "grad_norm": 0.36055599730205784, "learning_rate": 2.221263157894737e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9969812780618668, "mean_gen_accuracy": 0.8773595839738846, "mean_token_accuracy": 0.9087421745061874, "num_tokens": 787007167.0, "sample_num_tokens": 9681.75, "step": 6901, "total_num_tokens": 787045894.0, "z_loss": 0.0005385079421103001 }, { "copy_logits_max": -7.722952365875244, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.6875, "epoch": 1.4097013020168496, "gen_logits_max": 4.074309349060059, "gen_logits_mean": -16.66647720336914, "gen_logits_min": -28.466861724853516, "gen_logits_std": 3.1593704223632812, "gen_loss": 0.24078507721424103, "grad_norm": 0.7321967089165886, "learning_rate": 2.2211368421052632e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9963858872652054, "mean_gen_accuracy": 0.8809045702219009, "mean_token_accuracy": 0.9115879684686661, "num_tokens": 787283675.0, "sample_num_tokens": 7981.75, "step": 6902, "total_num_tokens": 787315602.0, "z_loss": 0.0003885342157445848 }, { "copy_logits_max": -6.097926139831543, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.9375, "epoch": 1.4099055399540465, "gen_logits_max": 3.0645551681518555, "gen_logits_mean": -17.148578643798828, "gen_logits_min": -29.486305236816406, "gen_logits_std": 3.151949882507324, "gen_loss": 0.29664891958236694, "grad_norm": 0.3991658931126214, "learning_rate": 2.2210105263157897e-05, "loss": 0.2936, "mean_copy_accuracy": 0.996099516749382, "mean_gen_accuracy": 0.8679774105548859, "mean_token_accuracy": 0.901189535856247, "num_tokens": 787548623.0, "sample_num_tokens": 8025.25, "step": 6903, "total_num_tokens": 787580724.0, "z_loss": 0.0004978446522727609 }, { "copy_logits_max": -5.340170860290527, "copy_logits_min": -625000000.0, "copy_num_tokens": 669.875, "epoch": 1.4101097778912433, "gen_logits_max": 4.457478046417236, "gen_logits_mean": -14.826223373413086, "gen_logits_min": -27.55577850341797, "gen_logits_std": 3.105339765548706, "gen_loss": 0.27318358421325684, "grad_norm": 0.3613857509112081, "learning_rate": 2.2208842105263157e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9967275112867355, "mean_gen_accuracy": 0.874092772603035, "mean_token_accuracy": 0.9086682945489883, "num_tokens": 787847006.0, "sample_num_tokens": 10621.5, "step": 6904, "total_num_tokens": 787889492.0, "z_loss": 0.00047150158206932247 }, { "copy_logits_max": -7.679232597351074, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.375, "epoch": 1.4103140158284402, "gen_logits_max": 3.8259663581848145, "gen_logits_mean": -16.693954467773438, "gen_logits_min": -28.69231414794922, "gen_logits_std": 3.1650729179382324, "gen_loss": 0.280281662940979, "grad_norm": 0.48417949482191075, "learning_rate": 2.2207578947368422e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9977054744958878, "mean_gen_accuracy": 0.8724986612796783, "mean_token_accuracy": 0.9032684564590454, "num_tokens": 788138300.0, "sample_num_tokens": 9113.5, "step": 6905, "total_num_tokens": 788174754.0, "z_loss": 0.0004441662458702922 }, { "copy_logits_max": -7.783980846405029, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.9375, "epoch": 1.4105182537656369, "gen_logits_max": 4.298959732055664, "gen_logits_mean": -16.818683624267578, "gen_logits_min": -28.673480987548828, "gen_logits_std": 3.144335985183716, "gen_loss": 0.3165464401245117, "grad_norm": 0.3765302653897494, "learning_rate": 2.2206315789473683e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9960483759641647, "mean_gen_accuracy": 0.8799285739660263, "mean_token_accuracy": 0.9075653553009033, "num_tokens": 788423638.0, "sample_num_tokens": 8365.0, "step": 6906, "total_num_tokens": 788457098.0, "z_loss": 0.0005282373167574406 }, { "copy_logits_max": -7.3026018142700195, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.5, "epoch": 1.4107224917028338, "gen_logits_max": 3.9948625564575195, "gen_logits_mean": -16.05194091796875, "gen_logits_min": -28.906877517700195, "gen_logits_std": 3.181694984436035, "gen_loss": 0.25748586654663086, "grad_norm": 0.37367684633248766, "learning_rate": 2.2205052631578947e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9965966492891312, "mean_gen_accuracy": 0.8784657567739487, "mean_token_accuracy": 0.9113866984844208, "num_tokens": 788693868.0, "sample_num_tokens": 7687.0, "step": 6907, "total_num_tokens": 788724616.0, "z_loss": 0.00043416526750661433 }, { "copy_logits_max": -7.946924209594727, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.9375, "epoch": 1.4109267296400305, "gen_logits_max": 3.7963356971740723, "gen_logits_mean": -17.19243812561035, "gen_logits_min": -29.473072052001953, "gen_logits_std": 3.20164155960083, "gen_loss": 0.2460397183895111, "grad_norm": 0.3818421899809575, "learning_rate": 2.2203789473684208e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9963324517011642, "mean_gen_accuracy": 0.8808094412088394, "mean_token_accuracy": 0.9065815359354019, "num_tokens": 788938329.0, "sample_num_tokens": 8460.25, "step": 6908, "total_num_tokens": 788972170.0, "z_loss": 0.0004224239382892847 }, { "copy_logits_max": -7.3471293449401855, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.125, "epoch": 1.4111309675772274, "gen_logits_max": 3.520879030227661, "gen_logits_mean": -17.53333854675293, "gen_logits_min": -29.444150924682617, "gen_logits_std": 3.1981635093688965, "gen_loss": 0.2920134961605072, "grad_norm": 0.34056913883315054, "learning_rate": 2.2202526315789476e-05, "loss": 0.2863, "mean_copy_accuracy": 0.997501403093338, "mean_gen_accuracy": 0.8714636415243149, "mean_token_accuracy": 0.9045156985521317, "num_tokens": 789222176.0, "sample_num_tokens": 8228.5, "step": 6909, "total_num_tokens": 789255090.0, "z_loss": 0.0004969557048752904 }, { "copy_logits_max": -9.077714920043945, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.75, "epoch": 1.4113352055144244, "gen_logits_max": 4.862053871154785, "gen_logits_mean": -14.283795356750488, "gen_logits_min": -27.276683807373047, "gen_logits_std": 3.110840320587158, "gen_loss": 0.2595735788345337, "grad_norm": 0.38949810352353437, "learning_rate": 2.220126315789474e-05, "loss": 0.2989, "mean_copy_accuracy": 0.9963464885950089, "mean_gen_accuracy": 0.868326723575592, "mean_token_accuracy": 0.8969643265008926, "num_tokens": 789488177.0, "sample_num_tokens": 9224.75, "step": 6910, "total_num_tokens": 789525076.0, "z_loss": 0.0005059503600932658 }, { "copy_logits_max": -5.972300052642822, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.25, "epoch": 1.411539443451621, "gen_logits_max": 4.430222511291504, "gen_logits_mean": -16.82667350769043, "gen_logits_min": -28.84218406677246, "gen_logits_std": 3.161437511444092, "gen_loss": 0.3126724362373352, "grad_norm": 0.39448780894990587, "learning_rate": 2.22e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9949061274528503, "mean_gen_accuracy": 0.8738311380147934, "mean_token_accuracy": 0.9017324298620224, "num_tokens": 789737650.0, "sample_num_tokens": 8318.5, "step": 6911, "total_num_tokens": 789770924.0, "z_loss": 0.0006379593978635967 }, { "copy_logits_max": -6.153275012969971, "copy_logits_min": -750000064.0, "copy_num_tokens": 510.0, "epoch": 1.411743681388818, "gen_logits_max": 4.112743377685547, "gen_logits_mean": -15.330769538879395, "gen_logits_min": -28.76940155029297, "gen_logits_std": 3.155264377593994, "gen_loss": 0.2622090280056, "grad_norm": 0.3362439301399446, "learning_rate": 2.2198736842105265e-05, "loss": 0.2546, "mean_copy_accuracy": 0.997306615114212, "mean_gen_accuracy": 0.8805297613143921, "mean_token_accuracy": 0.914358988404274, "num_tokens": 790045582.0, "sample_num_tokens": 9700.0, "step": 6912, "total_num_tokens": 790084382.0, "z_loss": 0.0004827805678360164 }, { "copy_logits_max": -5.942608833312988, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.125, "epoch": 1.411947919326015, "gen_logits_max": 3.8972091674804688, "gen_logits_mean": -16.142993927001953, "gen_logits_min": -28.1890811920166, "gen_logits_std": 3.156818389892578, "gen_loss": 0.28492146730422974, "grad_norm": 0.3357637162943584, "learning_rate": 2.2197473684210526e-05, "loss": 0.261, "mean_copy_accuracy": 0.9960961192846298, "mean_gen_accuracy": 0.880913257598877, "mean_token_accuracy": 0.9100676625967026, "num_tokens": 790338178.0, "sample_num_tokens": 9331.0, "step": 6913, "total_num_tokens": 790375502.0, "z_loss": 0.0005428001750260592 }, { "copy_logits_max": -6.8118791580200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.125, "epoch": 1.4121521572632116, "gen_logits_max": 4.586793899536133, "gen_logits_mean": -16.108367919921875, "gen_logits_min": -28.098285675048828, "gen_logits_std": 3.148998498916626, "gen_loss": 0.2742304801940918, "grad_norm": 0.3818986351059872, "learning_rate": 2.219621052631579e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9967170357704163, "mean_gen_accuracy": 0.8736252635717392, "mean_token_accuracy": 0.9032344967126846, "num_tokens": 790609119.0, "sample_num_tokens": 8580.25, "step": 6914, "total_num_tokens": 790643440.0, "z_loss": 0.0005231447285041213 }, { "copy_logits_max": -8.337310791015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5625, "epoch": 1.4123563952004083, "gen_logits_max": 5.7134504318237305, "gen_logits_mean": -14.224806785583496, "gen_logits_min": -26.923126220703125, "gen_logits_std": 3.098067045211792, "gen_loss": 0.31637299060821533, "grad_norm": 0.40357022493513395, "learning_rate": 2.219494736842105e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9958995431661606, "mean_gen_accuracy": 0.8724388033151627, "mean_token_accuracy": 0.8993622958660126, "num_tokens": 790875078.0, "sample_num_tokens": 8940.5, "step": 6915, "total_num_tokens": 790910840.0, "z_loss": 0.0006181197240948677 }, { "copy_logits_max": -5.544447422027588, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.0625, "epoch": 1.4125606331376053, "gen_logits_max": 3.063746213912964, "gen_logits_mean": -17.941978454589844, "gen_logits_min": -29.930299758911133, "gen_logits_std": 3.2042722702026367, "gen_loss": 0.2938171923160553, "grad_norm": 0.3491675213141356, "learning_rate": 2.2193684210526316e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9961519092321396, "mean_gen_accuracy": 0.8763325959444046, "mean_token_accuracy": 0.9054671972990036, "num_tokens": 791138771.0, "sample_num_tokens": 7009.25, "step": 6916, "total_num_tokens": 791166808.0, "z_loss": 0.0005569519707933068 }, { "copy_logits_max": -7.528562545776367, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.6875, "epoch": 1.4127648710748022, "gen_logits_max": 3.640141487121582, "gen_logits_mean": -17.774826049804688, "gen_logits_min": -29.869033813476562, "gen_logits_std": 3.191420793533325, "gen_loss": 0.2922949194908142, "grad_norm": 0.3504897434096401, "learning_rate": 2.219242105263158e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9970493614673615, "mean_gen_accuracy": 0.882220983505249, "mean_token_accuracy": 0.9108409285545349, "num_tokens": 791421420.0, "sample_num_tokens": 8469.5, "step": 6917, "total_num_tokens": 791455298.0, "z_loss": 0.0005098172696307302 }, { "copy_logits_max": -6.496464729309082, "copy_logits_min": -687500032.0, "copy_num_tokens": 456.875, "epoch": 1.412969109011999, "gen_logits_max": 3.5873866081237793, "gen_logits_mean": -16.531539916992188, "gen_logits_min": -28.783313751220703, "gen_logits_std": 3.1740667819976807, "gen_loss": 0.24603377282619476, "grad_norm": 0.3751203545190237, "learning_rate": 2.2191157894736844e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9967056512832642, "mean_gen_accuracy": 0.8799020349979401, "mean_token_accuracy": 0.910144254565239, "num_tokens": 791695347.0, "sample_num_tokens": 8611.75, "step": 6918, "total_num_tokens": 791729794.0, "z_loss": 0.00047442771028727293 }, { "copy_logits_max": -6.622444152832031, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.625, "epoch": 1.4131733469491958, "gen_logits_max": 3.2651004791259766, "gen_logits_mean": -15.859183311462402, "gen_logits_min": -28.171825408935547, "gen_logits_std": 3.160248041152954, "gen_loss": 0.28760161995887756, "grad_norm": 0.3355845608762716, "learning_rate": 2.2189894736842105e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9967821538448334, "mean_gen_accuracy": 0.8770390152931213, "mean_token_accuracy": 0.9084737598896027, "num_tokens": 791960903.0, "sample_num_tokens": 8555.75, "step": 6919, "total_num_tokens": 791995126.0, "z_loss": 0.0005169374053366482 }, { "copy_logits_max": -5.798676490783691, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.4375, "epoch": 1.4133775848863928, "gen_logits_max": 3.9178357124328613, "gen_logits_mean": -16.657533645629883, "gen_logits_min": -28.76490020751953, "gen_logits_std": 3.160256862640381, "gen_loss": 0.2969132363796234, "grad_norm": 0.3722634625055606, "learning_rate": 2.218863157894737e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9952000826597214, "mean_gen_accuracy": 0.8713931143283844, "mean_token_accuracy": 0.8965461850166321, "num_tokens": 792219714.0, "sample_num_tokens": 8130.5, "step": 6920, "total_num_tokens": 792252236.0, "z_loss": 0.0005196016281843185 }, { "copy_logits_max": -6.4772491455078125, "copy_logits_min": -750000000.0, "copy_num_tokens": 788.375, "epoch": 1.4135818228235895, "gen_logits_max": 3.0564136505126953, "gen_logits_mean": -16.18787956237793, "gen_logits_min": -28.683372497558594, "gen_logits_std": 3.1918649673461914, "gen_loss": 0.2618540823459625, "grad_norm": 0.3665669053124005, "learning_rate": 2.218736842105263e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9961084872484207, "mean_gen_accuracy": 0.8741312623023987, "mean_token_accuracy": 0.9057711511850357, "num_tokens": 792477521.0, "sample_num_tokens": 10131.25, "step": 6921, "total_num_tokens": 792518046.0, "z_loss": 0.0004986879648640752 }, { "copy_logits_max": -7.517073631286621, "copy_logits_min": -625000064.0, "copy_num_tokens": 585.6875, "epoch": 1.4137860607607864, "gen_logits_max": 3.8526439666748047, "gen_logits_mean": -15.657029151916504, "gen_logits_min": -28.413312911987305, "gen_logits_std": 3.136160373687744, "gen_loss": 0.2988016903400421, "grad_norm": 0.3400412524100798, "learning_rate": 2.2186105263157895e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9961860477924347, "mean_gen_accuracy": 0.8701358288526535, "mean_token_accuracy": 0.9030839502811432, "num_tokens": 792752332.0, "sample_num_tokens": 9800.5, "step": 6922, "total_num_tokens": 792791534.0, "z_loss": 0.0005317639443092048 }, { "copy_logits_max": -7.187773704528809, "copy_logits_min": -750000064.0, "copy_num_tokens": 581.0, "epoch": 1.413990298697983, "gen_logits_max": 3.8107898235321045, "gen_logits_mean": -15.219532012939453, "gen_logits_min": -27.296167373657227, "gen_logits_std": 3.147240400314331, "gen_loss": 0.23810814321041107, "grad_norm": 0.3739855777139447, "learning_rate": 2.218484210526316e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9968938082456589, "mean_gen_accuracy": 0.8787615150213242, "mean_token_accuracy": 0.9084795713424683, "num_tokens": 793017911.0, "sample_num_tokens": 8878.75, "step": 6923, "total_num_tokens": 793053426.0, "z_loss": 0.0004498944617807865 }, { "copy_logits_max": -5.976446151733398, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.0, "epoch": 1.41419453663518, "gen_logits_max": 3.233855724334717, "gen_logits_mean": -16.9036808013916, "gen_logits_min": -29.042226791381836, "gen_logits_std": 3.1843695640563965, "gen_loss": 0.3107898533344269, "grad_norm": 0.33825581896435813, "learning_rate": 2.218357894736842e-05, "loss": 0.284, "mean_copy_accuracy": 0.9968890696763992, "mean_gen_accuracy": 0.8704697042703629, "mean_token_accuracy": 0.9039898961782455, "num_tokens": 793308337.0, "sample_num_tokens": 9101.25, "step": 6924, "total_num_tokens": 793344742.0, "z_loss": 0.0005465684225782752 }, { "copy_logits_max": -8.28589153289795, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.0625, "epoch": 1.4143987745723767, "gen_logits_max": 2.8669629096984863, "gen_logits_mean": -17.955615997314453, "gen_logits_min": -29.77893829345703, "gen_logits_std": 3.200385093688965, "gen_loss": 0.28017479181289673, "grad_norm": 0.3839753048499828, "learning_rate": 2.2182315789473688e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9963372647762299, "mean_gen_accuracy": 0.8707434087991714, "mean_token_accuracy": 0.9014659225940704, "num_tokens": 793570994.0, "sample_num_tokens": 8458.5, "step": 6925, "total_num_tokens": 793604828.0, "z_loss": 0.0004991724854335189 }, { "copy_logits_max": -8.310052871704102, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.8125, "epoch": 1.4146030125095737, "gen_logits_max": 4.56610631942749, "gen_logits_mean": -15.827835083007812, "gen_logits_min": -27.94469451904297, "gen_logits_std": 3.1455581188201904, "gen_loss": 0.33367839455604553, "grad_norm": 0.3537878276517685, "learning_rate": 2.218105263157895e-05, "loss": 0.2951, "mean_copy_accuracy": 0.996199443936348, "mean_gen_accuracy": 0.8732451051473618, "mean_token_accuracy": 0.9007783979177475, "num_tokens": 793831232.0, "sample_num_tokens": 6860.5, "step": 6926, "total_num_tokens": 793858674.0, "z_loss": 0.0005438079824671149 }, { "copy_logits_max": -8.537601470947266, "copy_logits_min": -687500032.0, "copy_num_tokens": 483.4375, "epoch": 1.4148072504467706, "gen_logits_max": 4.291431427001953, "gen_logits_mean": -15.370789527893066, "gen_logits_min": -28.749935150146484, "gen_logits_std": 3.160165548324585, "gen_loss": 0.2929631173610687, "grad_norm": 0.37147011740149194, "learning_rate": 2.2179789473684213e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9960599094629288, "mean_gen_accuracy": 0.8705360740423203, "mean_token_accuracy": 0.9070483446121216, "num_tokens": 794092718.0, "sample_num_tokens": 8312.0, "step": 6927, "total_num_tokens": 794125966.0, "z_loss": 0.0005048968596383929 }, { "copy_logits_max": -6.742794036865234, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.5, "epoch": 1.4150114883839673, "gen_logits_max": 4.158747673034668, "gen_logits_mean": -16.002960205078125, "gen_logits_min": -28.125839233398438, "gen_logits_std": 3.1724157333374023, "gen_loss": 0.26014262437820435, "grad_norm": 0.39520446088903843, "learning_rate": 2.2178526315789474e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9966542273759842, "mean_gen_accuracy": 0.8773293793201447, "mean_token_accuracy": 0.9075490981340408, "num_tokens": 794340789.0, "sample_num_tokens": 9062.25, "step": 6928, "total_num_tokens": 794377038.0, "z_loss": 0.0004957873024977744 }, { "copy_logits_max": -7.31730842590332, "copy_logits_min": -625000064.0, "copy_num_tokens": 405.3125, "epoch": 1.4152157263211642, "gen_logits_max": 4.712730884552002, "gen_logits_mean": -15.589155197143555, "gen_logits_min": -27.939403533935547, "gen_logits_std": 3.1750288009643555, "gen_loss": 0.29826831817626953, "grad_norm": 0.41041467733712444, "learning_rate": 2.2177263157894738e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9960561543703079, "mean_gen_accuracy": 0.8778524249792099, "mean_token_accuracy": 0.9053049981594086, "num_tokens": 794602770.0, "sample_num_tokens": 7516.0, "step": 6929, "total_num_tokens": 794632834.0, "z_loss": 0.0005303181824274361 }, { "copy_logits_max": -8.749940872192383, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.625, "epoch": 1.415419964258361, "gen_logits_max": 3.191668748855591, "gen_logits_mean": -16.349777221679688, "gen_logits_min": -28.71994972229004, "gen_logits_std": 3.2094228267669678, "gen_loss": 0.26006051898002625, "grad_norm": 0.3939838884222354, "learning_rate": 2.2176e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9958513081073761, "mean_gen_accuracy": 0.8698848485946655, "mean_token_accuracy": 0.8989378213882446, "num_tokens": 794870403.0, "sample_num_tokens": 10282.75, "step": 6930, "total_num_tokens": 794911534.0, "z_loss": 0.00043570372508838773 }, { "copy_logits_max": -7.153179168701172, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.9375, "epoch": 1.4156242021955578, "gen_logits_max": 3.0010085105895996, "gen_logits_mean": -17.062259674072266, "gen_logits_min": -29.377857208251953, "gen_logits_std": 3.2197470664978027, "gen_loss": 0.2617608606815338, "grad_norm": 0.36786339971987025, "learning_rate": 2.2174736842105263e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9963078200817108, "mean_gen_accuracy": 0.8794776797294617, "mean_token_accuracy": 0.9059624075889587, "num_tokens": 795132657.0, "sample_num_tokens": 8490.75, "step": 6931, "total_num_tokens": 795166620.0, "z_loss": 0.00043978699250146747 }, { "copy_logits_max": -6.87937068939209, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.8125, "epoch": 1.4158284401327546, "gen_logits_max": 4.0920820236206055, "gen_logits_mean": -16.001571655273438, "gen_logits_min": -28.30996322631836, "gen_logits_std": 3.176347255706787, "gen_loss": 0.3162893056869507, "grad_norm": 0.3387966212108965, "learning_rate": 2.2173473684210524e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9958862811326981, "mean_gen_accuracy": 0.8844931423664093, "mean_token_accuracy": 0.9099970906972885, "num_tokens": 795412465.0, "sample_num_tokens": 8807.25, "step": 6932, "total_num_tokens": 795447694.0, "z_loss": 0.0005928870523348451 }, { "copy_logits_max": -7.724792003631592, "copy_logits_min": -750000000.0, "copy_num_tokens": 739.6875, "epoch": 1.4160326780699515, "gen_logits_max": 4.729351043701172, "gen_logits_mean": -14.442707061767578, "gen_logits_min": -27.062835693359375, "gen_logits_std": 3.149404525756836, "gen_loss": 0.26178252696990967, "grad_norm": 0.3733416613090879, "learning_rate": 2.2172210526315792e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9959728568792343, "mean_gen_accuracy": 0.8833498954772949, "mean_token_accuracy": 0.9098720550537109, "num_tokens": 795668475.0, "sample_num_tokens": 10769.25, "step": 6933, "total_num_tokens": 795711552.0, "z_loss": 0.0004561155801638961 }, { "copy_logits_max": -5.395246505737305, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.4375, "epoch": 1.4162369160071484, "gen_logits_max": 4.847748756408691, "gen_logits_mean": -14.614357948303223, "gen_logits_min": -27.312606811523438, "gen_logits_std": 3.1105403900146484, "gen_loss": 0.27829572558403015, "grad_norm": 0.3349611143221414, "learning_rate": 2.2170947368421053e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9968284964561462, "mean_gen_accuracy": 0.875911682844162, "mean_token_accuracy": 0.9074925035238266, "num_tokens": 795946727.0, "sample_num_tokens": 8510.25, "step": 6934, "total_num_tokens": 795980768.0, "z_loss": 0.000496110413223505 }, { "copy_logits_max": -6.847621440887451, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.4375, "epoch": 1.416441153944345, "gen_logits_max": 3.77921462059021, "gen_logits_mean": -16.695528030395508, "gen_logits_min": -28.731304168701172, "gen_logits_std": 3.1793899536132812, "gen_loss": 0.282046377658844, "grad_norm": 0.35199220191579705, "learning_rate": 2.2169684210526317e-05, "loss": 0.268, "mean_copy_accuracy": 0.9966251701116562, "mean_gen_accuracy": 0.8779221326112747, "mean_token_accuracy": 0.9097948670387268, "num_tokens": 796229614.0, "sample_num_tokens": 8892.0, "step": 6935, "total_num_tokens": 796265182.0, "z_loss": 0.0005074040964245796 }, { "copy_logits_max": -7.687195301055908, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.1875, "epoch": 1.416645391881542, "gen_logits_max": 3.89357590675354, "gen_logits_mean": -16.725982666015625, "gen_logits_min": -29.393524169921875, "gen_logits_std": 3.2153000831604004, "gen_loss": 0.252402663230896, "grad_norm": 0.3345541767204419, "learning_rate": 2.216842105263158e-05, "loss": 0.257, "mean_copy_accuracy": 0.9962746798992157, "mean_gen_accuracy": 0.8856264650821686, "mean_token_accuracy": 0.912957951426506, "num_tokens": 796516814.0, "sample_num_tokens": 8391.5, "step": 6936, "total_num_tokens": 796550380.0, "z_loss": 0.0004774551198352128 }, { "copy_logits_max": -4.359164237976074, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.125, "epoch": 1.416849629818739, "gen_logits_max": 3.0526084899902344, "gen_logits_mean": -17.05624771118164, "gen_logits_min": -29.292537689208984, "gen_logits_std": 3.1951382160186768, "gen_loss": 0.28023386001586914, "grad_norm": 0.36815988454949145, "learning_rate": 2.2167157894736843e-05, "loss": 0.274, "mean_copy_accuracy": 0.9965888261795044, "mean_gen_accuracy": 0.8757307529449463, "mean_token_accuracy": 0.9058162569999695, "num_tokens": 796779387.0, "sample_num_tokens": 7707.25, "step": 6937, "total_num_tokens": 796810216.0, "z_loss": 0.0004997156793251634 }, { "copy_logits_max": -5.8045125007629395, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.4375, "epoch": 1.4170538677559357, "gen_logits_max": 3.2780771255493164, "gen_logits_mean": -16.384204864501953, "gen_logits_min": -28.960041046142578, "gen_logits_std": 3.2231483459472656, "gen_loss": 0.24808666110038757, "grad_norm": 0.3655595461988172, "learning_rate": 2.2165894736842107e-05, "loss": 0.252, "mean_copy_accuracy": 0.9963742047548294, "mean_gen_accuracy": 0.8826103508472443, "mean_token_accuracy": 0.9143490642309189, "num_tokens": 797043937.0, "sample_num_tokens": 9240.25, "step": 6938, "total_num_tokens": 797080898.0, "z_loss": 0.0004519140056800097 }, { "copy_logits_max": -6.521910667419434, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.4375, "epoch": 1.4172581056931324, "gen_logits_max": 3.893538236618042, "gen_logits_mean": -16.485380172729492, "gen_logits_min": -28.803016662597656, "gen_logits_std": 3.202234983444214, "gen_loss": 0.22558657824993134, "grad_norm": 0.3381909878071926, "learning_rate": 2.2164631578947368e-05, "loss": 0.2591, "mean_copy_accuracy": 0.9970526248216629, "mean_gen_accuracy": 0.8848316371440887, "mean_token_accuracy": 0.9126283228397369, "num_tokens": 797329161.0, "sample_num_tokens": 8538.75, "step": 6939, "total_num_tokens": 797363316.0, "z_loss": 0.0004390084359329194 }, { "copy_logits_max": -6.624185562133789, "copy_logits_min": -687500032.0, "copy_num_tokens": 341.5625, "epoch": 1.4174623436303293, "gen_logits_max": 4.34055233001709, "gen_logits_mean": -17.094562530517578, "gen_logits_min": -29.12076187133789, "gen_logits_std": 3.1853699684143066, "gen_loss": 0.3023673892021179, "grad_norm": 0.3363762215397446, "learning_rate": 2.2163368421052632e-05, "loss": 0.2807, "mean_copy_accuracy": 0.996170237660408, "mean_gen_accuracy": 0.8786543011665344, "mean_token_accuracy": 0.9043246060609818, "num_tokens": 797588482.0, "sample_num_tokens": 7513.0, "step": 6940, "total_num_tokens": 797618534.0, "z_loss": 0.0005496138473972678 }, { "copy_logits_max": -6.256951332092285, "copy_logits_min": -687500032.0, "copy_num_tokens": 364.6875, "epoch": 1.4176665815675262, "gen_logits_max": 4.385320663452148, "gen_logits_mean": -17.217174530029297, "gen_logits_min": -29.194534301757812, "gen_logits_std": 3.179107427597046, "gen_loss": 0.27511096000671387, "grad_norm": 0.3791315780489742, "learning_rate": 2.2162105263157893e-05, "loss": 0.2774, "mean_copy_accuracy": 0.996397852897644, "mean_gen_accuracy": 0.8754508942365646, "mean_token_accuracy": 0.9070566743612289, "num_tokens": 797888075.0, "sample_num_tokens": 7908.25, "step": 6941, "total_num_tokens": 797919708.0, "z_loss": 0.0005681090988218784 }, { "copy_logits_max": -6.26331901550293, "copy_logits_min": -750000064.0, "copy_num_tokens": 512.75, "epoch": 1.417870819504723, "gen_logits_max": 3.8988254070281982, "gen_logits_mean": -17.991018295288086, "gen_logits_min": -30.320913314819336, "gen_logits_std": 3.217738628387451, "gen_loss": 0.27481722831726074, "grad_norm": 0.39093046790400104, "learning_rate": 2.216084210526316e-05, "loss": 0.281, "mean_copy_accuracy": 0.9967647343873978, "mean_gen_accuracy": 0.8741149753332138, "mean_token_accuracy": 0.9030958563089371, "num_tokens": 798171202.0, "sample_num_tokens": 9011.0, "step": 6942, "total_num_tokens": 798207246.0, "z_loss": 0.0005315793678164482 }, { "copy_logits_max": -3.995889663696289, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.875, "epoch": 1.4180750574419199, "gen_logits_max": 4.696987628936768, "gen_logits_mean": -16.719512939453125, "gen_logits_min": -28.766206741333008, "gen_logits_std": 3.171931505203247, "gen_loss": 0.30384641885757446, "grad_norm": 0.37382843218250017, "learning_rate": 2.215957894736842e-05, "loss": 0.2953, "mean_copy_accuracy": 0.9963549226522446, "mean_gen_accuracy": 0.87557353079319, "mean_token_accuracy": 0.8999403864145279, "num_tokens": 798436184.0, "sample_num_tokens": 7331.0, "step": 6943, "total_num_tokens": 798465508.0, "z_loss": 0.0005554455565288663 }, { "copy_logits_max": -1.4858496189117432, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.25, "epoch": 1.4182792953791168, "gen_logits_max": 4.586114406585693, "gen_logits_mean": -16.117366790771484, "gen_logits_min": -28.905364990234375, "gen_logits_std": 3.1875197887420654, "gen_loss": 0.263816237449646, "grad_norm": 0.36745513728824863, "learning_rate": 2.2158315789473686e-05, "loss": 0.275, "mean_copy_accuracy": 0.9968250244855881, "mean_gen_accuracy": 0.8801973015069962, "mean_token_accuracy": 0.9073277413845062, "num_tokens": 798689087.0, "sample_num_tokens": 7649.25, "step": 6944, "total_num_tokens": 798719684.0, "z_loss": 0.00048657270963303745 }, { "copy_logits_max": -1.9735993146896362, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.625, "epoch": 1.4184835333163135, "gen_logits_max": 3.3564350605010986, "gen_logits_mean": -16.523174285888672, "gen_logits_min": -29.345447540283203, "gen_logits_std": 3.2082200050354004, "gen_loss": 0.24413274228572845, "grad_norm": 0.37362275352323515, "learning_rate": 2.2157052631578947e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9962664842605591, "mean_gen_accuracy": 0.8839668184518814, "mean_token_accuracy": 0.9111634939908981, "num_tokens": 798940531.0, "sample_num_tokens": 8662.75, "step": 6945, "total_num_tokens": 798975182.0, "z_loss": 0.0004437096358742565 }, { "copy_logits_max": -3.6186304092407227, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.6875, "epoch": 1.4186877712535104, "gen_logits_max": 4.300833225250244, "gen_logits_mean": -15.107261657714844, "gen_logits_min": -27.77907371520996, "gen_logits_std": 3.1326889991760254, "gen_loss": 0.3003826141357422, "grad_norm": 0.3473509786096994, "learning_rate": 2.215578947368421e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9965589493513107, "mean_gen_accuracy": 0.878905862569809, "mean_token_accuracy": 0.9104224890470505, "num_tokens": 799260741.0, "sample_num_tokens": 8921.75, "step": 6946, "total_num_tokens": 799296428.0, "z_loss": 0.0005195364356040955 }, { "copy_logits_max": -2.211012601852417, "copy_logits_min": -687499968.0, "copy_num_tokens": 655.9375, "epoch": 1.4188920091907071, "gen_logits_max": 2.5006332397460938, "gen_logits_mean": -17.85355567932129, "gen_logits_min": -30.351476669311523, "gen_logits_std": 3.240705966949463, "gen_loss": 0.2794519066810608, "grad_norm": 0.40029788711306225, "learning_rate": 2.2154526315789472e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9957238286733627, "mean_gen_accuracy": 0.8718733787536621, "mean_token_accuracy": 0.9038603454828262, "num_tokens": 799528623.0, "sample_num_tokens": 9288.75, "step": 6947, "total_num_tokens": 799565778.0, "z_loss": 0.0004847632662858814 }, { "copy_logits_max": -3.015413284301758, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.75, "epoch": 1.419096247127904, "gen_logits_max": 3.7559762001037598, "gen_logits_mean": -17.23493194580078, "gen_logits_min": -29.50633430480957, "gen_logits_std": 3.2072794437408447, "gen_loss": 0.2667253017425537, "grad_norm": 0.344428305872798, "learning_rate": 2.2153263157894736e-05, "loss": 0.269, "mean_copy_accuracy": 0.9963739067316055, "mean_gen_accuracy": 0.8826106190681458, "mean_token_accuracy": 0.9087418019771576, "num_tokens": 799798416.0, "sample_num_tokens": 8234.5, "step": 6948, "total_num_tokens": 799831354.0, "z_loss": 0.0004797600849997252 }, { "copy_logits_max": -0.10982036590576172, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.25, "epoch": 1.4193004850651008, "gen_logits_max": 4.79469633102417, "gen_logits_mean": -15.299856185913086, "gen_logits_min": -27.669504165649414, "gen_logits_std": 3.1376495361328125, "gen_loss": 0.2847464084625244, "grad_norm": 0.3764756242289159, "learning_rate": 2.2151999999999997e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9960660934448242, "mean_gen_accuracy": 0.8811586797237396, "mean_token_accuracy": 0.9079347848892212, "num_tokens": 800086331.0, "sample_num_tokens": 8643.75, "step": 6949, "total_num_tokens": 800120906.0, "z_loss": 0.0005047843442298472 }, { "copy_logits_max": -1.6984891891479492, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.25, "epoch": 1.4195047230022977, "gen_logits_max": 4.187359809875488, "gen_logits_mean": -16.077552795410156, "gen_logits_min": -28.305885314941406, "gen_logits_std": 3.1676254272460938, "gen_loss": 0.2652631402015686, "grad_norm": 0.36221153396881456, "learning_rate": 2.2150736842105265e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9969267547130585, "mean_gen_accuracy": 0.8797254413366318, "mean_token_accuracy": 0.9109849035739899, "num_tokens": 800379084.0, "sample_num_tokens": 7574.5, "step": 6950, "total_num_tokens": 800409382.0, "z_loss": 0.00046386331086978316 }, { "copy_logits_max": -3.4965267181396484, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.3125, "epoch": 1.4197089609394946, "gen_logits_max": 4.856873512268066, "gen_logits_mean": -14.648727416992188, "gen_logits_min": -26.878623962402344, "gen_logits_std": 3.152304172515869, "gen_loss": 0.30168187618255615, "grad_norm": 0.3560418635982081, "learning_rate": 2.214947368421053e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9975108802318573, "mean_gen_accuracy": 0.8742263615131378, "mean_token_accuracy": 0.9041004031896591, "num_tokens": 800638989.0, "sample_num_tokens": 7874.25, "step": 6951, "total_num_tokens": 800670486.0, "z_loss": 0.0005701108602806926 }, { "copy_logits_max": -2.4116368293762207, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.0625, "epoch": 1.4199131988766913, "gen_logits_max": 4.095416069030762, "gen_logits_mean": -15.716917037963867, "gen_logits_min": -28.271888732910156, "gen_logits_std": 3.183063268661499, "gen_loss": 0.2479628622531891, "grad_norm": 0.33810790691600284, "learning_rate": 2.214821052631579e-05, "loss": 0.27, "mean_copy_accuracy": 0.9964563250541687, "mean_gen_accuracy": 0.8762006312608719, "mean_token_accuracy": 0.9076443761587143, "num_tokens": 800929792.0, "sample_num_tokens": 9431.5, "step": 6952, "total_num_tokens": 800967518.0, "z_loss": 0.00044744418119080365 }, { "copy_logits_max": -4.867893695831299, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.25, "epoch": 1.4201174368138882, "gen_logits_max": 4.561400890350342, "gen_logits_mean": -15.0260591506958, "gen_logits_min": -27.774198532104492, "gen_logits_std": 3.133721113204956, "gen_loss": 0.26850613951683044, "grad_norm": 0.35372027242608406, "learning_rate": 2.2146947368421055e-05, "loss": 0.2794, "mean_copy_accuracy": 0.996321052312851, "mean_gen_accuracy": 0.8755829334259033, "mean_token_accuracy": 0.9062376022338867, "num_tokens": 801203421.0, "sample_num_tokens": 8198.25, "step": 6953, "total_num_tokens": 801236214.0, "z_loss": 0.0005354540189728141 }, { "copy_logits_max": -4.346141815185547, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.625, "epoch": 1.420321674751085, "gen_logits_max": 4.567193984985352, "gen_logits_mean": -15.265997886657715, "gen_logits_min": -27.495281219482422, "gen_logits_std": 3.1236491203308105, "gen_loss": 0.3210218846797943, "grad_norm": 0.3645225309673349, "learning_rate": 2.2145684210526315e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9968318343162537, "mean_gen_accuracy": 0.8752018213272095, "mean_token_accuracy": 0.9057756513357162, "num_tokens": 801477872.0, "sample_num_tokens": 8247.5, "step": 6954, "total_num_tokens": 801510862.0, "z_loss": 0.0005510281189344823 }, { "copy_logits_max": -5.980757713317871, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.0, "epoch": 1.4205259126882819, "gen_logits_max": 3.699392795562744, "gen_logits_mean": -17.772727966308594, "gen_logits_min": -29.71513557434082, "gen_logits_std": 3.202948570251465, "gen_loss": 0.2839123010635376, "grad_norm": 0.3715655017393429, "learning_rate": 2.214442105263158e-05, "loss": 0.261, "mean_copy_accuracy": 0.9978182166814804, "mean_gen_accuracy": 0.8829129487276077, "mean_token_accuracy": 0.911948561668396, "num_tokens": 801752648.0, "sample_num_tokens": 8962.5, "step": 6955, "total_num_tokens": 801788498.0, "z_loss": 0.000519062508828938 }, { "copy_logits_max": -5.999959945678711, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.375, "epoch": 1.4207301506254786, "gen_logits_max": 3.920473098754883, "gen_logits_mean": -15.758817672729492, "gen_logits_min": -27.800125122070312, "gen_logits_std": 3.1133627891540527, "gen_loss": 0.23663204908370972, "grad_norm": 0.3654014437965774, "learning_rate": 2.214315789473684e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9960014820098877, "mean_gen_accuracy": 0.8762428313493729, "mean_token_accuracy": 0.906055822968483, "num_tokens": 802019722.0, "sample_num_tokens": 8444.0, "step": 6956, "total_num_tokens": 802053498.0, "z_loss": 0.0004030583950225264 }, { "copy_logits_max": -3.781294345855713, "copy_logits_min": -687500032.0, "copy_num_tokens": 358.8125, "epoch": 1.4209343885626755, "gen_logits_max": 4.350768089294434, "gen_logits_mean": -14.240626335144043, "gen_logits_min": -26.101879119873047, "gen_logits_std": 3.0558629035949707, "gen_loss": 0.31860390305519104, "grad_norm": 0.36749333137949486, "learning_rate": 2.2141894736842105e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9961555600166321, "mean_gen_accuracy": 0.8671148717403412, "mean_token_accuracy": 0.8958519250154495, "num_tokens": 802274700.0, "sample_num_tokens": 6824.0, "step": 6957, "total_num_tokens": 802301996.0, "z_loss": 0.0005523572326637805 }, { "copy_logits_max": -6.559438705444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.3125, "epoch": 1.4211386264998724, "gen_logits_max": 3.854412078857422, "gen_logits_mean": -16.763383865356445, "gen_logits_min": -28.752260208129883, "gen_logits_std": 3.155874252319336, "gen_loss": 0.2842656373977661, "grad_norm": 0.3641567829960545, "learning_rate": 2.214063157894737e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9972810596227646, "mean_gen_accuracy": 0.8792450577020645, "mean_token_accuracy": 0.9086452275514603, "num_tokens": 802542705.0, "sample_num_tokens": 8022.75, "step": 6958, "total_num_tokens": 802574796.0, "z_loss": 0.000507208751514554 }, { "copy_logits_max": -5.145036220550537, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.25, "epoch": 1.4213428644370691, "gen_logits_max": 4.593569755554199, "gen_logits_mean": -14.796735763549805, "gen_logits_min": -26.50382423400879, "gen_logits_std": 3.0681662559509277, "gen_loss": 0.30756375193595886, "grad_norm": 0.38953632727023246, "learning_rate": 2.2139368421052634e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9968705028295517, "mean_gen_accuracy": 0.863942340016365, "mean_token_accuracy": 0.8980806171894073, "num_tokens": 802822799.0, "sample_num_tokens": 9103.75, "step": 6959, "total_num_tokens": 802859214.0, "z_loss": 0.000496438704431057 }, { "copy_logits_max": -2.2677862644195557, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.75, "epoch": 1.421547102374266, "gen_logits_max": 5.172360897064209, "gen_logits_mean": -13.351869583129883, "gen_logits_min": -25.56161880493164, "gen_logits_std": 3.0390427112579346, "gen_loss": 0.3027175962924957, "grad_norm": 0.3615359962616014, "learning_rate": 2.2138105263157895e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9966390430927277, "mean_gen_accuracy": 0.8724261671304703, "mean_token_accuracy": 0.901816338300705, "num_tokens": 803096750.0, "sample_num_tokens": 7471.0, "step": 6960, "total_num_tokens": 803126634.0, "z_loss": 0.0004904282977804542 }, { "copy_logits_max": -2.954728126525879, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.75, "epoch": 1.4217513403114628, "gen_logits_max": 4.270668983459473, "gen_logits_mean": -16.59429931640625, "gen_logits_min": -28.426158905029297, "gen_logits_std": 3.1633355617523193, "gen_loss": 0.2888503670692444, "grad_norm": 0.3916969247276215, "learning_rate": 2.213684210526316e-05, "loss": 0.2996, "mean_copy_accuracy": 0.996431440114975, "mean_gen_accuracy": 0.874627023935318, "mean_token_accuracy": 0.8994388431310654, "num_tokens": 803369029.0, "sample_num_tokens": 7259.25, "step": 6961, "total_num_tokens": 803398066.0, "z_loss": 0.000515566673129797 }, { "copy_logits_max": -4.365480422973633, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.625, "epoch": 1.4219555782486597, "gen_logits_max": 4.672420501708984, "gen_logits_mean": -15.44892692565918, "gen_logits_min": -27.974647521972656, "gen_logits_std": 3.092228412628174, "gen_loss": 0.31373128294944763, "grad_norm": 0.3843465496236481, "learning_rate": 2.213557894736842e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9949114769697189, "mean_gen_accuracy": 0.8723422586917877, "mean_token_accuracy": 0.8992411494255066, "num_tokens": 803618458.0, "sample_num_tokens": 8146.0, "step": 6962, "total_num_tokens": 803651042.0, "z_loss": 0.0005601479206234217 }, { "copy_logits_max": -1.3263133764266968, "copy_logits_min": -687500032.0, "copy_num_tokens": 466.9375, "epoch": 1.4221598161858564, "gen_logits_max": 3.7064778804779053, "gen_logits_mean": -16.067726135253906, "gen_logits_min": -28.395034790039062, "gen_logits_std": 3.1661529541015625, "gen_loss": 0.2569791078567505, "grad_norm": 0.3868659048838861, "learning_rate": 2.2134315789473684e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9961345046758652, "mean_gen_accuracy": 0.8850035220384598, "mean_token_accuracy": 0.9116048216819763, "num_tokens": 803884198.0, "sample_num_tokens": 8071.0, "step": 6963, "total_num_tokens": 803916482.0, "z_loss": 0.000522832153365016 }, { "copy_logits_max": -0.3850056827068329, "copy_logits_min": -625000064.0, "copy_num_tokens": 541.0, "epoch": 1.4223640541230533, "gen_logits_max": 3.355013132095337, "gen_logits_mean": -15.863874435424805, "gen_logits_min": -27.771394729614258, "gen_logits_std": 3.158313751220703, "gen_loss": 0.28443339467048645, "grad_norm": 0.4049780429774081, "learning_rate": 2.213305263157895e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9959878325462341, "mean_gen_accuracy": 0.8719600588083267, "mean_token_accuracy": 0.9035025388002396, "num_tokens": 804138930.0, "sample_num_tokens": 8847.0, "step": 6964, "total_num_tokens": 804174318.0, "z_loss": 0.0005514639196917415 }, { "copy_logits_max": -4.625443935394287, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.6875, "epoch": 1.4225682920602503, "gen_logits_max": 3.603816509246826, "gen_logits_mean": -16.662033081054688, "gen_logits_min": -29.163841247558594, "gen_logits_std": 3.1455154418945312, "gen_loss": 0.30320215225219727, "grad_norm": 0.38884843464695135, "learning_rate": 2.213178947368421e-05, "loss": 0.2902, "mean_copy_accuracy": 0.997376099228859, "mean_gen_accuracy": 0.8637649118900299, "mean_token_accuracy": 0.9034570157527924, "num_tokens": 804424250.0, "sample_num_tokens": 10146.5, "step": 6965, "total_num_tokens": 804464836.0, "z_loss": 0.0005379209178499877 }, { "copy_logits_max": -2.328502655029297, "copy_logits_min": -687500032.0, "copy_num_tokens": 629.625, "epoch": 1.422772529997447, "gen_logits_max": 3.18506121635437, "gen_logits_mean": -15.919537544250488, "gen_logits_min": -28.041566848754883, "gen_logits_std": 3.136807441711426, "gen_loss": 0.25739675760269165, "grad_norm": 0.3614606766197795, "learning_rate": 2.2130526315789477e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9968061298131943, "mean_gen_accuracy": 0.8766897469758987, "mean_token_accuracy": 0.9086574912071228, "num_tokens": 804705238.0, "sample_num_tokens": 9691.5, "step": 6966, "total_num_tokens": 804744004.0, "z_loss": 0.00045222294284030795 }, { "copy_logits_max": -3.458296775817871, "copy_logits_min": -749999936.0, "copy_num_tokens": 478.125, "epoch": 1.422976767934644, "gen_logits_max": 4.156460762023926, "gen_logits_mean": -15.675772666931152, "gen_logits_min": -27.738506317138672, "gen_logits_std": 3.11189603805542, "gen_loss": 0.28203853964805603, "grad_norm": 0.36380645230755465, "learning_rate": 2.2129263157894738e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9957642555236816, "mean_gen_accuracy": 0.8685311526060104, "mean_token_accuracy": 0.8996375501155853, "num_tokens": 804983292.0, "sample_num_tokens": 9144.0, "step": 6967, "total_num_tokens": 805019868.0, "z_loss": 0.0004646682646125555 }, { "copy_logits_max": -5.225155830383301, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.4375, "epoch": 1.4231810058718408, "gen_logits_max": 3.457026243209839, "gen_logits_mean": -16.19967269897461, "gen_logits_min": -28.182228088378906, "gen_logits_std": 3.1257481575012207, "gen_loss": 0.2919195294380188, "grad_norm": 0.3799447503332717, "learning_rate": 2.2128000000000002e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9963912069797516, "mean_gen_accuracy": 0.8745115548372269, "mean_token_accuracy": 0.9032837152481079, "num_tokens": 805227926.0, "sample_num_tokens": 7932.0, "step": 6968, "total_num_tokens": 805259654.0, "z_loss": 0.000486703182104975 }, { "copy_logits_max": -3.7597336769104004, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.5, "epoch": 1.4233852438090375, "gen_logits_max": 3.826521635055542, "gen_logits_mean": -15.682799339294434, "gen_logits_min": -27.64596939086914, "gen_logits_std": 3.134829521179199, "gen_loss": 0.2515436112880707, "grad_norm": 0.38320684653829495, "learning_rate": 2.2126736842105263e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9968273192644119, "mean_gen_accuracy": 0.8776451349258423, "mean_token_accuracy": 0.9077005684375763, "num_tokens": 805504466.0, "sample_num_tokens": 9165.5, "step": 6969, "total_num_tokens": 805541128.0, "z_loss": 0.0004631183110177517 }, { "copy_logits_max": -4.510011672973633, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.0625, "epoch": 1.4235894817462342, "gen_logits_max": 3.553407907485962, "gen_logits_mean": -16.4574031829834, "gen_logits_min": -28.03632164001465, "gen_logits_std": 3.1429524421691895, "gen_loss": 0.23512475192546844, "grad_norm": 0.33945623880269554, "learning_rate": 2.2125473684210528e-05, "loss": 0.2512, "mean_copy_accuracy": 0.9972285181283951, "mean_gen_accuracy": 0.8870601207017899, "mean_token_accuracy": 0.9156609326601028, "num_tokens": 805795985.0, "sample_num_tokens": 8106.75, "step": 6970, "total_num_tokens": 805828412.0, "z_loss": 0.0004576444043777883 }, { "copy_logits_max": -4.144376754760742, "copy_logits_min": -750000064.0, "copy_num_tokens": 424.1875, "epoch": 1.4237937196834312, "gen_logits_max": 3.649989128112793, "gen_logits_mean": -16.806182861328125, "gen_logits_min": -28.209228515625, "gen_logits_std": 3.1399505138397217, "gen_loss": 0.24319639801979065, "grad_norm": 0.3632927769154057, "learning_rate": 2.212421052631579e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9968182444572449, "mean_gen_accuracy": 0.8792458772659302, "mean_token_accuracy": 0.9056422263383865, "num_tokens": 806058596.0, "sample_num_tokens": 7790.5, "step": 6971, "total_num_tokens": 806089758.0, "z_loss": 0.0004417655582074076 }, { "copy_logits_max": -1.369552493095398, "copy_logits_min": -687500032.0, "copy_num_tokens": 654.375, "epoch": 1.423997957620628, "gen_logits_max": 4.273900032043457, "gen_logits_mean": -13.986603736877441, "gen_logits_min": -25.898025512695312, "gen_logits_std": 3.0682520866394043, "gen_loss": 0.26993298530578613, "grad_norm": 0.33645168062997555, "learning_rate": 2.2122947368421053e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9964931458234787, "mean_gen_accuracy": 0.8766455352306366, "mean_token_accuracy": 0.9073940217494965, "num_tokens": 806322611.0, "sample_num_tokens": 9885.75, "step": 6972, "total_num_tokens": 806362154.0, "z_loss": 0.0005271061090752482 }, { "copy_logits_max": -3.538422107696533, "copy_logits_min": -750000000.0, "copy_num_tokens": 574.875, "epoch": 1.4242021955578248, "gen_logits_max": 3.1206369400024414, "gen_logits_mean": -16.934673309326172, "gen_logits_min": -28.7981014251709, "gen_logits_std": 3.168038845062256, "gen_loss": 0.2771438956260681, "grad_norm": 0.3464673916542623, "learning_rate": 2.2121684210526314e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9975393563508987, "mean_gen_accuracy": 0.8797716945409775, "mean_token_accuracy": 0.9125326722860336, "num_tokens": 806611815.0, "sample_num_tokens": 9240.25, "step": 6973, "total_num_tokens": 806648776.0, "z_loss": 0.0005154098616912961 }, { "copy_logits_max": -5.012489318847656, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0, "epoch": 1.4244064334950217, "gen_logits_max": 4.495575428009033, "gen_logits_mean": -15.601308822631836, "gen_logits_min": -26.977941513061523, "gen_logits_std": 3.0412955284118652, "gen_loss": 0.2781403064727783, "grad_norm": 0.3732729137816847, "learning_rate": 2.212042105263158e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9967370331287384, "mean_gen_accuracy": 0.8764885663986206, "mean_token_accuracy": 0.9054238498210907, "num_tokens": 806882718.0, "sample_num_tokens": 7392.5, "step": 6974, "total_num_tokens": 806912288.0, "z_loss": 0.0004917450132779777 }, { "copy_logits_max": -5.246218681335449, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.0, "epoch": 1.4246106714322186, "gen_logits_max": 3.700615167617798, "gen_logits_mean": -16.11279296875, "gen_logits_min": -28.215442657470703, "gen_logits_std": 3.130923271179199, "gen_loss": 0.2801719903945923, "grad_norm": 0.35854384713498477, "learning_rate": 2.2119157894736842e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9962766319513321, "mean_gen_accuracy": 0.8739897012710571, "mean_token_accuracy": 0.9055019319057465, "num_tokens": 807143665.0, "sample_num_tokens": 7353.75, "step": 6975, "total_num_tokens": 807173080.0, "z_loss": 0.0005188965587876737 }, { "copy_logits_max": -5.318731784820557, "copy_logits_min": -750000000.0, "copy_num_tokens": 671.4375, "epoch": 1.4248149093694153, "gen_logits_max": 3.4451210498809814, "gen_logits_mean": -16.10881233215332, "gen_logits_min": -28.341571807861328, "gen_logits_std": 3.1540050506591797, "gen_loss": 0.2397008240222931, "grad_norm": 0.3401877217671106, "learning_rate": 2.2117894736842107e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9966381788253784, "mean_gen_accuracy": 0.879537433385849, "mean_token_accuracy": 0.9118300080299377, "num_tokens": 807440131.0, "sample_num_tokens": 9385.75, "step": 6976, "total_num_tokens": 807477674.0, "z_loss": 0.00044616154627874494 }, { "copy_logits_max": -3.0718436241149902, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.0625, "epoch": 1.4250191473066123, "gen_logits_max": 4.164594650268555, "gen_logits_mean": -16.04064178466797, "gen_logits_min": -28.92181396484375, "gen_logits_std": 3.1225948333740234, "gen_loss": 0.2724563479423523, "grad_norm": 0.3628367140851971, "learning_rate": 2.211663157894737e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9970766603946686, "mean_gen_accuracy": 0.8683934956789017, "mean_token_accuracy": 0.9001832902431488, "num_tokens": 807723945.0, "sample_num_tokens": 8218.75, "step": 6977, "total_num_tokens": 807756820.0, "z_loss": 0.0005368327838368714 }, { "copy_logits_max": -4.883593559265137, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.8125, "epoch": 1.425223385243809, "gen_logits_max": 4.104372024536133, "gen_logits_mean": -16.538124084472656, "gen_logits_min": -27.720779418945312, "gen_logits_std": 3.0970802307128906, "gen_loss": 0.30059507489204407, "grad_norm": 0.3775120158888454, "learning_rate": 2.2115368421052632e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9972372800111771, "mean_gen_accuracy": 0.8784998804330826, "mean_token_accuracy": 0.901737317442894, "num_tokens": 807975993.0, "sample_num_tokens": 8014.25, "step": 6978, "total_num_tokens": 808008050.0, "z_loss": 0.0005047612939961255 }, { "copy_logits_max": -3.2817916870117188, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 1.425427623181006, "gen_logits_max": 3.4417433738708496, "gen_logits_mean": -16.82100486755371, "gen_logits_min": -28.136219024658203, "gen_logits_std": 3.1278374195098877, "gen_loss": 0.26178261637687683, "grad_norm": 0.3677659278629218, "learning_rate": 2.2114105263157896e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9956469833850861, "mean_gen_accuracy": 0.8786373734474182, "mean_token_accuracy": 0.9054115265607834, "num_tokens": 808241078.0, "sample_num_tokens": 8850.5, "step": 6979, "total_num_tokens": 808276480.0, "z_loss": 0.00045829234295524657 }, { "copy_logits_max": -2.19655179977417, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.8125, "epoch": 1.4256318611182026, "gen_logits_max": 4.390615463256836, "gen_logits_mean": -14.420088768005371, "gen_logits_min": -25.95945167541504, "gen_logits_std": 3.0375332832336426, "gen_loss": 0.2816705107688904, "grad_norm": 0.35966449895532127, "learning_rate": 2.2112842105263157e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9963008612394333, "mean_gen_accuracy": 0.8722242712974548, "mean_token_accuracy": 0.8992988616228104, "num_tokens": 808516245.0, "sample_num_tokens": 8576.25, "step": 6980, "total_num_tokens": 808550550.0, "z_loss": 0.0004811745311599225 }, { "copy_logits_max": -3.6971638202667236, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.6875, "epoch": 1.4258360990553995, "gen_logits_max": 4.4505615234375, "gen_logits_mean": -16.567611694335938, "gen_logits_min": -28.396671295166016, "gen_logits_std": 3.131880283355713, "gen_loss": 0.2772284150123596, "grad_norm": 0.37674927307681233, "learning_rate": 2.211157894736842e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9965965896844864, "mean_gen_accuracy": 0.872564971446991, "mean_token_accuracy": 0.9020959436893463, "num_tokens": 808791979.0, "sample_num_tokens": 8173.25, "step": 6981, "total_num_tokens": 808824672.0, "z_loss": 0.0004904072266072035 }, { "copy_logits_max": -2.760303020477295, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.25, "epoch": 1.4260403369925965, "gen_logits_max": 4.703285217285156, "gen_logits_mean": -14.67284107208252, "gen_logits_min": -26.751867294311523, "gen_logits_std": 3.127483606338501, "gen_loss": 0.26305702328681946, "grad_norm": 0.3623847527208851, "learning_rate": 2.2110315789473686e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9962086379528046, "mean_gen_accuracy": 0.8784535378217697, "mean_token_accuracy": 0.9104242473840714, "num_tokens": 809063315.0, "sample_num_tokens": 9204.25, "step": 6982, "total_num_tokens": 809100132.0, "z_loss": 0.00042200915049761534 }, { "copy_logits_max": -3.760765790939331, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.5, "epoch": 1.4262445749297932, "gen_logits_max": 5.084556579589844, "gen_logits_mean": -14.956840515136719, "gen_logits_min": -26.552053451538086, "gen_logits_std": 3.101160764694214, "gen_loss": 0.28012600541114807, "grad_norm": 0.3287899700345877, "learning_rate": 2.210905263157895e-05, "loss": 0.2746, "mean_copy_accuracy": 0.997259721159935, "mean_gen_accuracy": 0.8778980821371078, "mean_token_accuracy": 0.9076859503984451, "num_tokens": 809347990.0, "sample_num_tokens": 7518.5, "step": 6983, "total_num_tokens": 809378064.0, "z_loss": 0.0004860628687310964 }, { "copy_logits_max": -1.3329522609710693, "copy_logits_min": -750000064.0, "copy_num_tokens": 561.125, "epoch": 1.42644881286699, "gen_logits_max": 3.8101253509521484, "gen_logits_mean": -14.85000991821289, "gen_logits_min": -27.034626007080078, "gen_logits_std": 3.127370834350586, "gen_loss": 0.2745510935783386, "grad_norm": 0.34039723335354977, "learning_rate": 2.210778947368421e-05, "loss": 0.2614, "mean_copy_accuracy": 0.9976866841316223, "mean_gen_accuracy": 0.8803579956293106, "mean_token_accuracy": 0.9100948572158813, "num_tokens": 809608412.0, "sample_num_tokens": 9054.0, "step": 6984, "total_num_tokens": 809644628.0, "z_loss": 0.0004931190051138401 }, { "copy_logits_max": -0.9769008159637451, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.875, "epoch": 1.4266530508041868, "gen_logits_max": 4.861537933349609, "gen_logits_mean": -14.590167999267578, "gen_logits_min": -26.55782699584961, "gen_logits_std": 3.1430184841156006, "gen_loss": 0.27956128120422363, "grad_norm": 0.3826278388968691, "learning_rate": 2.2106526315789475e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9962996244430542, "mean_gen_accuracy": 0.8723680675029755, "mean_token_accuracy": 0.9012536406517029, "num_tokens": 809872246.0, "sample_num_tokens": 6738.5, "step": 6985, "total_num_tokens": 809899200.0, "z_loss": 0.0004524458199739456 }, { "copy_logits_max": -4.603872776031494, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.0, "epoch": 1.4268572887413837, "gen_logits_max": 4.485853672027588, "gen_logits_mean": -15.472917556762695, "gen_logits_min": -27.965721130371094, "gen_logits_std": 3.1775009632110596, "gen_loss": 0.24883028864860535, "grad_norm": 0.37104023883558857, "learning_rate": 2.2105263157894736e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9974273145198822, "mean_gen_accuracy": 0.8786183148622513, "mean_token_accuracy": 0.9100372195243835, "num_tokens": 810146760.0, "sample_num_tokens": 8780.0, "step": 6986, "total_num_tokens": 810181880.0, "z_loss": 0.00044728704961016774 }, { "copy_logits_max": -6.906920433044434, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.6875, "epoch": 1.4270615266785804, "gen_logits_max": 5.087246417999268, "gen_logits_mean": -15.270125389099121, "gen_logits_min": -27.01416778564453, "gen_logits_std": 3.149855852127075, "gen_loss": 0.2433566153049469, "grad_norm": 0.3869459288531491, "learning_rate": 2.2104e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9963261634111404, "mean_gen_accuracy": 0.8801548779010773, "mean_token_accuracy": 0.9096629172563553, "num_tokens": 810418365.0, "sample_num_tokens": 8779.25, "step": 6987, "total_num_tokens": 810453482.0, "z_loss": 0.0004270389617886394 }, { "copy_logits_max": -5.986454010009766, "copy_logits_min": -750000064.0, "copy_num_tokens": 341.5625, "epoch": 1.4272657646157774, "gen_logits_max": 3.9362566471099854, "gen_logits_mean": -17.423912048339844, "gen_logits_min": -29.119049072265625, "gen_logits_std": 3.1947569847106934, "gen_loss": 0.2562602758407593, "grad_norm": 0.358625627994823, "learning_rate": 2.210273684210526e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9956737458705902, "mean_gen_accuracy": 0.880569651722908, "mean_token_accuracy": 0.9053675830364227, "num_tokens": 810676524.0, "sample_num_tokens": 7210.0, "step": 6988, "total_num_tokens": 810705364.0, "z_loss": 0.0004215693916194141 }, { "copy_logits_max": -4.271500587463379, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.0625, "epoch": 1.4274700025529743, "gen_logits_max": 3.721012592315674, "gen_logits_mean": -17.029056549072266, "gen_logits_min": -28.63994598388672, "gen_logits_std": 3.188037633895874, "gen_loss": 0.2699105739593506, "grad_norm": 0.347683770611515, "learning_rate": 2.2101473684210526e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9963020831346512, "mean_gen_accuracy": 0.8790244907140732, "mean_token_accuracy": 0.9079118967056274, "num_tokens": 810947590.0, "sample_num_tokens": 7829.0, "step": 6989, "total_num_tokens": 810978906.0, "z_loss": 0.00043709352030418813 }, { "copy_logits_max": -6.841973781585693, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.6875, "epoch": 1.427674240490171, "gen_logits_max": 3.6730923652648926, "gen_logits_mean": -17.458045959472656, "gen_logits_min": -29.23365592956543, "gen_logits_std": 3.2051758766174316, "gen_loss": 0.284600168466568, "grad_norm": 0.34717474003580123, "learning_rate": 2.210021052631579e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9958248287439346, "mean_gen_accuracy": 0.8836422264575958, "mean_token_accuracy": 0.9070227742195129, "num_tokens": 811220315.0, "sample_num_tokens": 8792.75, "step": 6990, "total_num_tokens": 811255486.0, "z_loss": 0.00048171597882173955 }, { "copy_logits_max": -4.511192798614502, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.75, "epoch": 1.427878478427368, "gen_logits_max": 4.384570598602295, "gen_logits_mean": -15.652305603027344, "gen_logits_min": -27.739715576171875, "gen_logits_std": 3.1499767303466797, "gen_loss": 0.2582041025161743, "grad_norm": 0.3561828004092443, "learning_rate": 2.2098947368421054e-05, "loss": 0.2691, "mean_copy_accuracy": 0.99759341776371, "mean_gen_accuracy": 0.8789306133985519, "mean_token_accuracy": 0.9082359820604324, "num_tokens": 811490384.0, "sample_num_tokens": 8172.0, "step": 6991, "total_num_tokens": 811523072.0, "z_loss": 0.0004425857332535088 }, { "copy_logits_max": -6.128575325012207, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.625, "epoch": 1.4280827163645649, "gen_logits_max": 4.227497100830078, "gen_logits_mean": -16.57501792907715, "gen_logits_min": -28.330245971679688, "gen_logits_std": 3.1671464443206787, "gen_loss": 0.2642585039138794, "grad_norm": 0.35804755584358006, "learning_rate": 2.209768421052632e-05, "loss": 0.2706, "mean_copy_accuracy": 0.997021272778511, "mean_gen_accuracy": 0.8821825832128525, "mean_token_accuracy": 0.9086082726716995, "num_tokens": 811748144.0, "sample_num_tokens": 7373.0, "step": 6992, "total_num_tokens": 811777636.0, "z_loss": 0.00047111717867664993 }, { "copy_logits_max": -3.6178054809570312, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.1875, "epoch": 1.4282869543017616, "gen_logits_max": 3.4074063301086426, "gen_logits_mean": -17.197906494140625, "gen_logits_min": -29.439334869384766, "gen_logits_std": 3.212430477142334, "gen_loss": 0.27374720573425293, "grad_norm": 0.31756698488186774, "learning_rate": 2.209642105263158e-05, "loss": 0.2448, "mean_copy_accuracy": 0.9977842718362808, "mean_gen_accuracy": 0.87796750664711, "mean_token_accuracy": 0.9177684336900711, "num_tokens": 812056461.0, "sample_num_tokens": 8195.25, "step": 6993, "total_num_tokens": 812089242.0, "z_loss": 0.0005396631313487887 }, { "copy_logits_max": -5.860919952392578, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.0, "epoch": 1.4284911922389583, "gen_logits_max": 4.7263641357421875, "gen_logits_mean": -15.289239883422852, "gen_logits_min": -26.980113983154297, "gen_logits_std": 3.1403276920318604, "gen_loss": 0.28550925850868225, "grad_norm": 0.38087424531746444, "learning_rate": 2.2095157894736844e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9972168356180191, "mean_gen_accuracy": 0.8783191293478012, "mean_token_accuracy": 0.9054103195667267, "num_tokens": 812321684.0, "sample_num_tokens": 7930.0, "step": 6994, "total_num_tokens": 812353404.0, "z_loss": 0.0004924562526866794 }, { "copy_logits_max": -5.274312973022461, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.8125, "epoch": 1.4286954301761552, "gen_logits_max": 4.207259654998779, "gen_logits_mean": -16.135210037231445, "gen_logits_min": -28.029727935791016, "gen_logits_std": 3.1588103771209717, "gen_loss": 0.2983829379081726, "grad_norm": 0.3474258609309928, "learning_rate": 2.2093894736842105e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9963049441576004, "mean_gen_accuracy": 0.8740911781787872, "mean_token_accuracy": 0.9072614312171936, "num_tokens": 812604375.0, "sample_num_tokens": 8474.25, "step": 6995, "total_num_tokens": 812638272.0, "z_loss": 0.0005081133567728102 }, { "copy_logits_max": -5.866767883300781, "copy_logits_min": -750000064.0, "copy_num_tokens": 400.8125, "epoch": 1.4288996681133521, "gen_logits_max": 3.8214714527130127, "gen_logits_mean": -16.51228904724121, "gen_logits_min": -28.46047592163086, "gen_logits_std": 3.1839723587036133, "gen_loss": 0.2646779417991638, "grad_norm": 0.36964376402966553, "learning_rate": 2.209263157894737e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9964879304170609, "mean_gen_accuracy": 0.8768272250890732, "mean_token_accuracy": 0.9039608389139175, "num_tokens": 812863520.0, "sample_num_tokens": 8163.5, "step": 6996, "total_num_tokens": 812896174.0, "z_loss": 0.00046261356328614056 }, { "copy_logits_max": -4.613755226135254, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.25, "epoch": 1.4291039060505488, "gen_logits_max": 4.002049446105957, "gen_logits_mean": -16.932655334472656, "gen_logits_min": -28.80332374572754, "gen_logits_std": 3.1824660301208496, "gen_loss": 0.3044655919075012, "grad_norm": 0.3184902694414641, "learning_rate": 2.209136842105263e-05, "loss": 0.255, "mean_copy_accuracy": 0.9973722547292709, "mean_gen_accuracy": 0.8853590190410614, "mean_token_accuracy": 0.9138314574956894, "num_tokens": 813157273.0, "sample_num_tokens": 8067.75, "step": 6997, "total_num_tokens": 813189544.0, "z_loss": 0.0005307758110575378 }, { "copy_logits_max": -2.4045798778533936, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.0, "epoch": 1.4293081439877457, "gen_logits_max": 3.5673534870147705, "gen_logits_mean": -16.53042221069336, "gen_logits_min": -28.62564468383789, "gen_logits_std": 3.215182304382324, "gen_loss": 0.26361826062202454, "grad_norm": 0.3424144107961201, "learning_rate": 2.2090105263157894e-05, "loss": 0.2513, "mean_copy_accuracy": 0.9972320944070816, "mean_gen_accuracy": 0.8818003237247467, "mean_token_accuracy": 0.9144920855760574, "num_tokens": 813418654.0, "sample_num_tokens": 7843.5, "step": 6998, "total_num_tokens": 813450028.0, "z_loss": 0.0004594501224346459 }, { "copy_logits_max": -4.0543928146362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.5625, "epoch": 1.4295123819249427, "gen_logits_max": 5.919321060180664, "gen_logits_mean": -12.730974197387695, "gen_logits_min": -25.029150009155273, "gen_logits_std": 3.110157012939453, "gen_loss": 0.24106302857398987, "grad_norm": 0.35010059701664825, "learning_rate": 2.208884210526316e-05, "loss": 0.2456, "mean_copy_accuracy": 0.9966237097978592, "mean_gen_accuracy": 0.8888362497091293, "mean_token_accuracy": 0.9162164330482483, "num_tokens": 813697519.0, "sample_num_tokens": 8778.25, "step": 6999, "total_num_tokens": 813732632.0, "z_loss": 0.0004710309731308371 }, { "epoch": 1.4297166198621394, "grad_norm": 0.3832955573261221, "learning_rate": 2.2087578947368423e-05, "loss": 0.2887, "step": 7000 }, { "epoch": 1.4297166198621394, "eval_copy_logits_max": -8.72757339477539, "eval_copy_logits_min": -85.40353393554688, "eval_gen_logits_max": 2.8730108737945557, "eval_gen_logits_mean": -21.27642250061035, "eval_gen_logits_min": -32.37735366821289, "eval_gen_logits_std": 3.2349050045013428, "eval_gen_loss": 0.3299168348312378, "eval_loss": 0.30500754714012146, "eval_mean_copy_accuracy": 0.9933502078056335, "eval_mean_gen_accuracy": 0.8774110972881317, "eval_mean_token_accuracy": 0.8924303352832794, "eval_num_tokens": 813971460.0, "eval_runtime": 0.6672, "eval_samples_per_second": 11.99, "eval_steps_per_second": 2.998, "eval_total_num_tokens": 813971460.0, "eval_z_loss": 0.0005085846059955657, "step": 7000 }, { "copy_logits_max": -2.8911023139953613, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.5, "epoch": 1.4299208577993363, "gen_logits_max": 4.357407093048096, "gen_logits_mean": -14.59414291381836, "gen_logits_min": -26.884525299072266, "gen_logits_std": 3.1332430839538574, "gen_loss": 0.258940726518631, "grad_norm": 0.38688734014586473, "learning_rate": 2.2086315789473684e-05, "loss": 0.2701, "mean_copy_accuracy": 0.995660088956356, "mean_gen_accuracy": 0.8806750178337097, "mean_token_accuracy": 0.9059333875775337, "num_tokens": 814209582.0, "sample_num_tokens": 9205.0, "step": 7001, "total_num_tokens": 814246402.0, "z_loss": 0.00044544253614731133 }, { "copy_logits_max": -3.246486186981201, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.4375, "epoch": 1.430125095736533, "gen_logits_max": 5.3700714111328125, "gen_logits_mean": -14.57022476196289, "gen_logits_min": -26.715778350830078, "gen_logits_std": 3.149172067642212, "gen_loss": 0.27917391061782837, "grad_norm": 0.3761019902231268, "learning_rate": 2.2085052631578948e-05, "loss": 0.2817, "mean_copy_accuracy": 0.995946004986763, "mean_gen_accuracy": 0.877098947763443, "mean_token_accuracy": 0.9046092182397842, "num_tokens": 814459799.0, "sample_num_tokens": 7860.75, "step": 7002, "total_num_tokens": 814491242.0, "z_loss": 0.0004572030738927424 }, { "copy_logits_max": 0.6335845589637756, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.5, "epoch": 1.43032933367373, "gen_logits_max": 3.9673972129821777, "gen_logits_mean": -15.084269523620605, "gen_logits_min": -27.64957046508789, "gen_logits_std": 3.1923303604125977, "gen_loss": 0.24798749387264252, "grad_norm": 0.3874894409255243, "learning_rate": 2.2083789473684213e-05, "loss": 0.2595, "mean_copy_accuracy": 0.9970507174730301, "mean_gen_accuracy": 0.8783969432115555, "mean_token_accuracy": 0.9131626486778259, "num_tokens": 814719558.0, "sample_num_tokens": 7887.0, "step": 7003, "total_num_tokens": 814751106.0, "z_loss": 0.0005184104084037244 }, { "copy_logits_max": 0.6452209949493408, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.6875, "epoch": 1.4305335716109266, "gen_logits_max": 4.57867431640625, "gen_logits_mean": -16.04833221435547, "gen_logits_min": -27.97393035888672, "gen_logits_std": 3.1784398555755615, "gen_loss": 0.2692054510116577, "grad_norm": 0.4143112791698152, "learning_rate": 2.2082526315789473e-05, "loss": 0.2746, "mean_copy_accuracy": 0.995539665222168, "mean_gen_accuracy": 0.8838782608509064, "mean_token_accuracy": 0.9064288884401321, "num_tokens": 814956527.0, "sample_num_tokens": 8131.25, "step": 7004, "total_num_tokens": 814989052.0, "z_loss": 0.000510280835442245 }, { "copy_logits_max": 1.1438568830490112, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.1875, "epoch": 1.4307378095481236, "gen_logits_max": 4.599251747131348, "gen_logits_mean": -14.774730682373047, "gen_logits_min": -27.123708724975586, "gen_logits_std": 3.1480119228363037, "gen_loss": 0.28257644176483154, "grad_norm": 0.3981101440035704, "learning_rate": 2.2081263157894738e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9963886588811874, "mean_gen_accuracy": 0.8740398585796356, "mean_token_accuracy": 0.9035402685403824, "num_tokens": 815208540.0, "sample_num_tokens": 8942.5, "step": 7005, "total_num_tokens": 815244310.0, "z_loss": 0.000635074800811708 }, { "copy_logits_max": 1.3851081132888794, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.9375, "epoch": 1.4309420474853205, "gen_logits_max": 4.359249114990234, "gen_logits_mean": -15.845908164978027, "gen_logits_min": -27.873722076416016, "gen_logits_std": 3.179922103881836, "gen_loss": 0.23993860185146332, "grad_norm": 0.3752417091880217, "learning_rate": 2.208e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9965296238660812, "mean_gen_accuracy": 0.8854700326919556, "mean_token_accuracy": 0.9114327132701874, "num_tokens": 815477791.0, "sample_num_tokens": 7655.25, "step": 7006, "total_num_tokens": 815508412.0, "z_loss": 0.0004882921930402517 }, { "copy_logits_max": -2.7252891063690186, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.75, "epoch": 1.4311462854225172, "gen_logits_max": 3.857358694076538, "gen_logits_mean": -17.518756866455078, "gen_logits_min": -29.587665557861328, "gen_logits_std": 3.2034971714019775, "gen_loss": 0.2860872149467468, "grad_norm": 0.34117216831255137, "learning_rate": 2.2078736842105266e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9951586723327637, "mean_gen_accuracy": 0.8826646655797958, "mean_token_accuracy": 0.9076839536428452, "num_tokens": 815747213.0, "sample_num_tokens": 7618.75, "step": 7007, "total_num_tokens": 815777688.0, "z_loss": 0.00048136638361029327 }, { "copy_logits_max": -0.18465864658355713, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.75, "epoch": 1.4313505233597141, "gen_logits_max": 3.4517595767974854, "gen_logits_mean": -16.04025650024414, "gen_logits_min": -28.103229522705078, "gen_logits_std": 3.1800403594970703, "gen_loss": 0.2957287132740021, "grad_norm": 0.4007185215980494, "learning_rate": 2.2077473684210527e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9956025332212448, "mean_gen_accuracy": 0.870423287153244, "mean_token_accuracy": 0.9008860439062119, "num_tokens": 816012545.0, "sample_num_tokens": 8929.25, "step": 7008, "total_num_tokens": 816048262.0, "z_loss": 0.0005002027028240263 }, { "copy_logits_max": -2.606677770614624, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.9375, "epoch": 1.4315547612969108, "gen_logits_max": 2.3638834953308105, "gen_logits_mean": -17.907657623291016, "gen_logits_min": -29.643667221069336, "gen_logits_std": 3.2062809467315674, "gen_loss": 0.26448771357536316, "grad_norm": 0.33971344377868784, "learning_rate": 2.207621052631579e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9973207116127014, "mean_gen_accuracy": 0.8811805695295334, "mean_token_accuracy": 0.9118150323629379, "num_tokens": 816291130.0, "sample_num_tokens": 10306.5, "step": 7009, "total_num_tokens": 816332356.0, "z_loss": 0.0004357628640718758 }, { "copy_logits_max": -0.29537394642829895, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.75, "epoch": 1.4317589992341078, "gen_logits_max": 3.17043137550354, "gen_logits_mean": -16.2406005859375, "gen_logits_min": -27.957416534423828, "gen_logits_std": 3.177706718444824, "gen_loss": 0.2507628798484802, "grad_norm": 0.34137322527251196, "learning_rate": 2.2074947368421053e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9973203241825104, "mean_gen_accuracy": 0.8851635903120041, "mean_token_accuracy": 0.9136723130941391, "num_tokens": 816571061.0, "sample_num_tokens": 8425.25, "step": 7010, "total_num_tokens": 816604762.0, "z_loss": 0.0004180381656624377 }, { "copy_logits_max": -3.3601646423339844, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.0625, "epoch": 1.4319632371713045, "gen_logits_max": 3.0268592834472656, "gen_logits_mean": -17.385051727294922, "gen_logits_min": -29.49962615966797, "gen_logits_std": 3.220745086669922, "gen_loss": 0.2366427779197693, "grad_norm": 0.346223734073796, "learning_rate": 2.2073684210526317e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9968475103378296, "mean_gen_accuracy": 0.8747610300779343, "mean_token_accuracy": 0.907778188586235, "num_tokens": 816846781.0, "sample_num_tokens": 8492.25, "step": 7011, "total_num_tokens": 816880750.0, "z_loss": 0.00036448711762204766 }, { "copy_logits_max": -2.542893171310425, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.8125, "epoch": 1.4321674751085014, "gen_logits_max": 2.659322738647461, "gen_logits_mean": -18.551273345947266, "gen_logits_min": -29.92249298095703, "gen_logits_std": 3.214456558227539, "gen_loss": 0.2491331696510315, "grad_norm": 0.3536206470689665, "learning_rate": 2.2072421052631578e-05, "loss": 0.2731, "mean_copy_accuracy": 0.996077299118042, "mean_gen_accuracy": 0.8806368112564087, "mean_token_accuracy": 0.9086111783981323, "num_tokens": 817129650.0, "sample_num_tokens": 9649.5, "step": 7012, "total_num_tokens": 817168248.0, "z_loss": 0.000395608542021364 }, { "copy_logits_max": 2.420527696609497, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.125, "epoch": 1.4323717130456983, "gen_logits_max": 4.1000471115112305, "gen_logits_mean": -14.14482593536377, "gen_logits_min": -25.832460403442383, "gen_logits_std": 3.0771307945251465, "gen_loss": 0.2943110167980194, "grad_norm": 0.3756665355877261, "learning_rate": 2.2071157894736842e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9958577156066895, "mean_gen_accuracy": 0.8789598345756531, "mean_token_accuracy": 0.908400297164917, "num_tokens": 817409341.0, "sample_num_tokens": 8828.75, "step": 7013, "total_num_tokens": 817444656.0, "z_loss": 0.0005020279204472899 }, { "copy_logits_max": -1.6812553405761719, "copy_logits_min": -687500032.0, "copy_num_tokens": 313.875, "epoch": 1.432575950982895, "gen_logits_max": 3.4691667556762695, "gen_logits_mean": -17.890586853027344, "gen_logits_min": -29.621280670166016, "gen_logits_std": 3.202064037322998, "gen_loss": 0.3070848286151886, "grad_norm": 0.34054361732395694, "learning_rate": 2.2069894736842103e-05, "loss": 0.267, "mean_copy_accuracy": 0.9967966973781586, "mean_gen_accuracy": 0.8840775489807129, "mean_token_accuracy": 0.9097859561443329, "num_tokens": 817670979.0, "sample_num_tokens": 7128.25, "step": 7014, "total_num_tokens": 817699492.0, "z_loss": 0.0005497531965374947 }, { "copy_logits_max": -4.026014804840088, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.25, "epoch": 1.432780188920092, "gen_logits_max": 3.519702196121216, "gen_logits_mean": -17.322343826293945, "gen_logits_min": -28.877347946166992, "gen_logits_std": 3.164705753326416, "gen_loss": 0.27549639344215393, "grad_norm": 0.3837770346000583, "learning_rate": 2.206863157894737e-05, "loss": 0.279, "mean_copy_accuracy": 0.9966525882482529, "mean_gen_accuracy": 0.8793924152851105, "mean_token_accuracy": 0.907551109790802, "num_tokens": 817936631.0, "sample_num_tokens": 8611.25, "step": 7015, "total_num_tokens": 817971076.0, "z_loss": 0.0004984131664969027 }, { "copy_logits_max": 1.5589847564697266, "copy_logits_min": -750000000.0, "copy_num_tokens": 958.8125, "epoch": 1.4329844268572887, "gen_logits_max": 3.0962233543395996, "gen_logits_mean": -15.22922420501709, "gen_logits_min": -27.447309494018555, "gen_logits_std": 3.202868938446045, "gen_loss": 0.20304374396800995, "grad_norm": 0.36490547240946025, "learning_rate": 2.206736842105263e-05, "loss": 0.238, "mean_copy_accuracy": 0.9971147179603577, "mean_gen_accuracy": 0.8823405206203461, "mean_token_accuracy": 0.9192927479743958, "num_tokens": 818208644.0, "sample_num_tokens": 11129.5, "step": 7016, "total_num_tokens": 818253162.0, "z_loss": 0.0004194416687823832 }, { "copy_logits_max": -0.10763221979141235, "copy_logits_min": -750000128.0, "copy_num_tokens": 380.25, "epoch": 1.4331886647944856, "gen_logits_max": 5.737261772155762, "gen_logits_mean": -14.087197303771973, "gen_logits_min": -25.618173599243164, "gen_logits_std": 3.035097122192383, "gen_loss": 0.3513489067554474, "grad_norm": 0.3603808881188675, "learning_rate": 2.2066105263157896e-05, "loss": 0.3202, "mean_copy_accuracy": 0.9959290325641632, "mean_gen_accuracy": 0.8642543107271194, "mean_token_accuracy": 0.894176185131073, "num_tokens": 818488787.0, "sample_num_tokens": 7392.75, "step": 7017, "total_num_tokens": 818518358.0, "z_loss": 0.0006022232118993998 }, { "copy_logits_max": -3.856924057006836, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0625, "epoch": 1.4333929027316823, "gen_logits_max": 2.698213815689087, "gen_logits_mean": -18.822418212890625, "gen_logits_min": -30.815399169921875, "gen_logits_std": 3.2539849281311035, "gen_loss": 0.23942142724990845, "grad_norm": 0.3654615853038633, "learning_rate": 2.206484210526316e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9961383044719696, "mean_gen_accuracy": 0.8793057352304459, "mean_token_accuracy": 0.90566585958004, "num_tokens": 818770808.0, "sample_num_tokens": 8505.0, "step": 7018, "total_num_tokens": 818804828.0, "z_loss": 0.00046100784675218165 }, { "copy_logits_max": -2.001356363296509, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.0625, "epoch": 1.4335971406688792, "gen_logits_max": 3.3999390602111816, "gen_logits_mean": -17.001174926757812, "gen_logits_min": -28.582571029663086, "gen_logits_std": 3.1307554244995117, "gen_loss": 0.281866192817688, "grad_norm": 0.3273115759804195, "learning_rate": 2.206357894736842e-05, "loss": 0.2481, "mean_copy_accuracy": 0.997361809015274, "mean_gen_accuracy": 0.8785793036222458, "mean_token_accuracy": 0.9158923625946045, "num_tokens": 819064844.0, "sample_num_tokens": 8064.0, "step": 7019, "total_num_tokens": 819097100.0, "z_loss": 0.0005021001561544836 }, { "copy_logits_max": -5.324834823608398, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.25, "epoch": 1.4338013786060761, "gen_logits_max": 4.973139762878418, "gen_logits_mean": -16.24280548095703, "gen_logits_min": -27.307706832885742, "gen_logits_std": 3.096328020095825, "gen_loss": 0.2859437167644501, "grad_norm": 0.3579307419925161, "learning_rate": 2.2062315789473685e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9955439269542694, "mean_gen_accuracy": 0.8774814605712891, "mean_token_accuracy": 0.8989902585744858, "num_tokens": 819324432.0, "sample_num_tokens": 8576.5, "step": 7020, "total_num_tokens": 819358738.0, "z_loss": 0.0005670831305906177 }, { "copy_logits_max": 0.29800963401794434, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.125, "epoch": 1.4340056165432729, "gen_logits_max": 4.399752140045166, "gen_logits_mean": -16.328105926513672, "gen_logits_min": -28.237655639648438, "gen_logits_std": 3.138707160949707, "gen_loss": 0.30198293924331665, "grad_norm": 0.34731129141693673, "learning_rate": 2.2061052631578946e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9970610290765762, "mean_gen_accuracy": 0.8762638121843338, "mean_token_accuracy": 0.9067274332046509, "num_tokens": 819605143.0, "sample_num_tokens": 8283.25, "step": 7021, "total_num_tokens": 819638276.0, "z_loss": 0.000594131532125175 }, { "copy_logits_max": -2.872877836227417, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.6875, "epoch": 1.4342098544804698, "gen_logits_max": 5.332145690917969, "gen_logits_mean": -15.468920707702637, "gen_logits_min": -26.726341247558594, "gen_logits_std": 3.0549285411834717, "gen_loss": 0.3398188352584839, "grad_norm": 0.4117106191285014, "learning_rate": 2.205978947368421e-05, "loss": 0.3114, "mean_copy_accuracy": 0.996004119515419, "mean_gen_accuracy": 0.8670477569103241, "mean_token_accuracy": 0.8947671800851822, "num_tokens": 819855046.0, "sample_num_tokens": 7419.0, "step": 7022, "total_num_tokens": 819884722.0, "z_loss": 0.0006377390236593783 }, { "copy_logits_max": 3.779754400253296, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.9375, "epoch": 1.4344140924176667, "gen_logits_max": 4.0853376388549805, "gen_logits_mean": -15.602836608886719, "gen_logits_min": -27.64752197265625, "gen_logits_std": 3.122696876525879, "gen_loss": 0.3122066855430603, "grad_norm": 0.37471728987807895, "learning_rate": 2.2058526315789475e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9964706003665924, "mean_gen_accuracy": 0.8718554824590683, "mean_token_accuracy": 0.906559556722641, "num_tokens": 820105944.0, "sample_num_tokens": 9030.0, "step": 7023, "total_num_tokens": 820142064.0, "z_loss": 0.0005834034527651966 }, { "copy_logits_max": -2.197985887527466, "copy_logits_min": -750000000.0, "copy_num_tokens": 687.25, "epoch": 1.4346183303548634, "gen_logits_max": 2.707773447036743, "gen_logits_mean": -17.35079002380371, "gen_logits_min": -29.101289749145508, "gen_logits_std": 3.1566011905670166, "gen_loss": 0.24617016315460205, "grad_norm": 0.35243878120846117, "learning_rate": 2.205726315789474e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9972657859325409, "mean_gen_accuracy": 0.8806901425123215, "mean_token_accuracy": 0.911119356751442, "num_tokens": 820382162.0, "sample_num_tokens": 9337.0, "step": 7024, "total_num_tokens": 820419510.0, "z_loss": 0.0004726803454104811 }, { "copy_logits_max": 0.7821856737136841, "copy_logits_min": -750000000.0, "copy_num_tokens": 801.0625, "epoch": 1.4348225682920601, "gen_logits_max": 4.38425874710083, "gen_logits_mean": -15.139395713806152, "gen_logits_min": -27.047218322753906, "gen_logits_std": 3.088831663131714, "gen_loss": 0.28136563301086426, "grad_norm": 0.37159956745892536, "learning_rate": 2.2056e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9972752779722214, "mean_gen_accuracy": 0.8709899038076401, "mean_token_accuracy": 0.9044334888458252, "num_tokens": 820669992.0, "sample_num_tokens": 10958.0, "step": 7025, "total_num_tokens": 820713824.0, "z_loss": 0.0005303026409819722 }, { "copy_logits_max": -0.25592607259750366, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.875, "epoch": 1.435026806229257, "gen_logits_max": 3.761030673980713, "gen_logits_mean": -16.27145767211914, "gen_logits_min": -28.257383346557617, "gen_logits_std": 3.1488451957702637, "gen_loss": 0.2580409646034241, "grad_norm": 0.37949566326071155, "learning_rate": 2.2054736842105265e-05, "loss": 0.29, "mean_copy_accuracy": 0.9947773367166519, "mean_gen_accuracy": 0.8748756051063538, "mean_token_accuracy": 0.9022984802722931, "num_tokens": 820944604.0, "sample_num_tokens": 8236.0, "step": 7026, "total_num_tokens": 820977548.0, "z_loss": 0.0004817476437892765 }, { "copy_logits_max": 1.1744191646575928, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.875, "epoch": 1.435231044166454, "gen_logits_max": 3.6559836864471436, "gen_logits_mean": -17.115272521972656, "gen_logits_min": -29.241226196289062, "gen_logits_std": 3.1871585845947266, "gen_loss": 0.2929608225822449, "grad_norm": 0.3720500371705863, "learning_rate": 2.2053473684210525e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9969691932201385, "mean_gen_accuracy": 0.8784746378660202, "mean_token_accuracy": 0.9091509431600571, "num_tokens": 821182821.0, "sample_num_tokens": 8409.25, "step": 7027, "total_num_tokens": 821216458.0, "z_loss": 0.0004816948203369975 }, { "copy_logits_max": -1.1155058145523071, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.4375, "epoch": 1.4354352821036507, "gen_logits_max": 4.4144062995910645, "gen_logits_mean": -16.40400505065918, "gen_logits_min": -28.33898162841797, "gen_logits_std": 3.169171094894409, "gen_loss": 0.274289071559906, "grad_norm": 0.3418614757303548, "learning_rate": 2.205221052631579e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9967156052589417, "mean_gen_accuracy": 0.8790159821510315, "mean_token_accuracy": 0.9100981503725052, "num_tokens": 821483069.0, "sample_num_tokens": 8933.75, "step": 7028, "total_num_tokens": 821518804.0, "z_loss": 0.0004920745850540698 }, { "copy_logits_max": -2.0560154914855957, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.5625, "epoch": 1.4356395200408476, "gen_logits_max": 4.50180721282959, "gen_logits_mean": -16.861852645874023, "gen_logits_min": -28.646093368530273, "gen_logits_std": 3.182008981704712, "gen_loss": 0.3271884322166443, "grad_norm": 0.3519953342914169, "learning_rate": 2.205094736842105e-05, "loss": 0.2731, "mean_copy_accuracy": 0.99646957218647, "mean_gen_accuracy": 0.8792973607778549, "mean_token_accuracy": 0.9085760414600372, "num_tokens": 821765010.0, "sample_num_tokens": 7808.5, "step": 7029, "total_num_tokens": 821796244.0, "z_loss": 0.0005209521623328328 }, { "copy_logits_max": -1.4056451320648193, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.75, "epoch": 1.4358437579780445, "gen_logits_max": 3.8158957958221436, "gen_logits_mean": -16.36261749267578, "gen_logits_min": -28.882762908935547, "gen_logits_std": 3.1974105834960938, "gen_loss": 0.27413493394851685, "grad_norm": 0.41507849885574194, "learning_rate": 2.2049684210526315e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9954908192157745, "mean_gen_accuracy": 0.8743176013231277, "mean_token_accuracy": 0.9038015156984329, "num_tokens": 822045059.0, "sample_num_tokens": 9502.25, "step": 7030, "total_num_tokens": 822083068.0, "z_loss": 0.00042714382288977504 }, { "copy_logits_max": -0.5531021952629089, "copy_logits_min": -625000064.0, "copy_num_tokens": 576.6875, "epoch": 1.4360479959152412, "gen_logits_max": 4.566338062286377, "gen_logits_mean": -14.6992826461792, "gen_logits_min": -26.99582862854004, "gen_logits_std": 3.1471290588378906, "gen_loss": 0.2800203859806061, "grad_norm": 0.3988177744567188, "learning_rate": 2.2048421052631583e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9968030750751495, "mean_gen_accuracy": 0.8695497810840607, "mean_token_accuracy": 0.9023625403642654, "num_tokens": 822311998.0, "sample_num_tokens": 9122.5, "step": 7031, "total_num_tokens": 822348488.0, "z_loss": 0.0005138427950441837 }, { "copy_logits_max": -3.84315824508667, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.0625, "epoch": 1.4362522338524382, "gen_logits_max": 4.726591110229492, "gen_logits_mean": -16.611204147338867, "gen_logits_min": -28.238834381103516, "gen_logits_std": 3.133988857269287, "gen_loss": 0.29139620065689087, "grad_norm": 0.37840241684516235, "learning_rate": 2.2047157894736844e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9971386343240738, "mean_gen_accuracy": 0.875239223241806, "mean_token_accuracy": 0.9026953279972076, "num_tokens": 822572423.0, "sample_num_tokens": 7733.25, "step": 7032, "total_num_tokens": 822603356.0, "z_loss": 0.0004897410981357098 }, { "copy_logits_max": -3.2101831436157227, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.5625, "epoch": 1.4364564717896349, "gen_logits_max": 4.3329596519470215, "gen_logits_mean": -15.310267448425293, "gen_logits_min": -27.229890823364258, "gen_logits_std": 3.088531255722046, "gen_loss": 0.27691754698753357, "grad_norm": 0.37691264748007314, "learning_rate": 2.2045894736842108e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9953765422105789, "mean_gen_accuracy": 0.8779505491256714, "mean_token_accuracy": 0.9039318561553955, "num_tokens": 822837043.0, "sample_num_tokens": 8710.75, "step": 7033, "total_num_tokens": 822871886.0, "z_loss": 0.0004508759011514485 }, { "copy_logits_max": -4.669166564941406, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.375, "epoch": 1.4366607097268318, "gen_logits_max": 3.205397605895996, "gen_logits_mean": -18.022464752197266, "gen_logits_min": -30.12628936767578, "gen_logits_std": 3.175128936767578, "gen_loss": 0.3053797483444214, "grad_norm": 0.3682413786417949, "learning_rate": 2.204463157894737e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9955335706472397, "mean_gen_accuracy": 0.8729103952646255, "mean_token_accuracy": 0.8973426967859268, "num_tokens": 823107187.0, "sample_num_tokens": 9444.25, "step": 7034, "total_num_tokens": 823144964.0, "z_loss": 0.00045419641537591815 }, { "copy_logits_max": -3.1236257553100586, "copy_logits_min": -750000064.0, "copy_num_tokens": 486.0, "epoch": 1.4368649476640285, "gen_logits_max": 3.61091947555542, "gen_logits_mean": -16.18649673461914, "gen_logits_min": -27.933082580566406, "gen_logits_std": 3.1056294441223145, "gen_loss": 0.2911367118358612, "grad_norm": 0.36939119271158233, "learning_rate": 2.2043368421052633e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9952458739280701, "mean_gen_accuracy": 0.8717856109142303, "mean_token_accuracy": 0.9037105590105057, "num_tokens": 823385898.0, "sample_num_tokens": 7599.5, "step": 7035, "total_num_tokens": 823416296.0, "z_loss": 0.0005325292586348951 }, { "copy_logits_max": -1.3951191902160645, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.9375, "epoch": 1.4370691856012254, "gen_logits_max": 4.105710029602051, "gen_logits_mean": -16.21232795715332, "gen_logits_min": -27.81046485900879, "gen_logits_std": 3.112241744995117, "gen_loss": 0.29389214515686035, "grad_norm": 0.3890332587240857, "learning_rate": 2.2042105263157894e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9963453263044357, "mean_gen_accuracy": 0.8712627440690994, "mean_token_accuracy": 0.9036151021718979, "num_tokens": 823658741.0, "sample_num_tokens": 8864.75, "step": 7036, "total_num_tokens": 823694200.0, "z_loss": 0.0005253429990261793 }, { "copy_logits_max": -2.5183305740356445, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.1875, "epoch": 1.4372734235384224, "gen_logits_max": 5.25971794128418, "gen_logits_mean": -16.602569580078125, "gen_logits_min": -28.203205108642578, "gen_logits_std": 3.1065621376037598, "gen_loss": 0.3077537715435028, "grad_norm": 0.36851029448261874, "learning_rate": 2.204084210526316e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9962326288223267, "mean_gen_accuracy": 0.8811245411634445, "mean_token_accuracy": 0.9067681431770325, "num_tokens": 823932914.0, "sample_num_tokens": 8922.5, "step": 7037, "total_num_tokens": 823968604.0, "z_loss": 0.0005320427007973194 }, { "copy_logits_max": -1.199366807937622, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.8125, "epoch": 1.437477661475619, "gen_logits_max": 3.929265022277832, "gen_logits_mean": -15.56153678894043, "gen_logits_min": -27.451473236083984, "gen_logits_std": 3.1194469928741455, "gen_loss": 0.2686130106449127, "grad_norm": 0.33855733509863484, "learning_rate": 2.203957894736842e-05, "loss": 0.2673, "mean_copy_accuracy": 0.996593788266182, "mean_gen_accuracy": 0.8773889541625977, "mean_token_accuracy": 0.9096509367227554, "num_tokens": 824205410.0, "sample_num_tokens": 8249.0, "step": 7038, "total_num_tokens": 824238406.0, "z_loss": 0.000438288931036368 }, { "copy_logits_max": -1.2594151496887207, "copy_logits_min": -625000064.0, "copy_num_tokens": 465.375, "epoch": 1.437681899412816, "gen_logits_max": 4.065497875213623, "gen_logits_mean": -16.440052032470703, "gen_logits_min": -28.483718872070312, "gen_logits_std": 3.1400153636932373, "gen_loss": 0.2898968756198883, "grad_norm": 0.3886465364950504, "learning_rate": 2.2038315789473684e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9943450391292572, "mean_gen_accuracy": 0.8813993334770203, "mean_token_accuracy": 0.90733702480793, "num_tokens": 824461081.0, "sample_num_tokens": 8832.25, "step": 7039, "total_num_tokens": 824496410.0, "z_loss": 0.0005423279944807291 }, { "copy_logits_max": -0.8760044574737549, "copy_logits_min": -687500032.0, "copy_num_tokens": 607.625, "epoch": 1.4378861373500127, "gen_logits_max": 4.311038017272949, "gen_logits_mean": -15.646949768066406, "gen_logits_min": -27.419729232788086, "gen_logits_std": 3.090730667114258, "gen_loss": 0.24636255204677582, "grad_norm": 0.3983373311756618, "learning_rate": 2.2037052631578948e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9970600306987762, "mean_gen_accuracy": 0.8812082558870316, "mean_token_accuracy": 0.9111863672733307, "num_tokens": 824731251.0, "sample_num_tokens": 9156.25, "step": 7040, "total_num_tokens": 824767876.0, "z_loss": 0.0004809959791600704 }, { "copy_logits_max": -4.582131385803223, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.0, "epoch": 1.4380903752872096, "gen_logits_max": 3.6698038578033447, "gen_logits_mean": -16.532028198242188, "gen_logits_min": -28.56472396850586, "gen_logits_std": 3.1401631832122803, "gen_loss": 0.24855300784111023, "grad_norm": 0.3830098487455567, "learning_rate": 2.2035789473684212e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9962241053581238, "mean_gen_accuracy": 0.8798654675483704, "mean_token_accuracy": 0.9055749177932739, "num_tokens": 825005150.0, "sample_num_tokens": 8904.0, "step": 7041, "total_num_tokens": 825040766.0, "z_loss": 0.0004427549720276147 }, { "copy_logits_max": -3.3500375747680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5625, "epoch": 1.4382946132244063, "gen_logits_max": 3.498530864715576, "gen_logits_mean": -16.162952423095703, "gen_logits_min": -28.02745819091797, "gen_logits_std": 3.1320600509643555, "gen_loss": 0.28446298837661743, "grad_norm": 0.34681553152902694, "learning_rate": 2.2034526315789473e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9969022125005722, "mean_gen_accuracy": 0.8742768615484238, "mean_token_accuracy": 0.9089339375495911, "num_tokens": 825264036.0, "sample_num_tokens": 7886.0, "step": 7042, "total_num_tokens": 825295580.0, "z_loss": 0.0005660753813572228 }, { "copy_logits_max": -0.06790971755981445, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.625, "epoch": 1.4384988511616033, "gen_logits_max": 3.8526649475097656, "gen_logits_mean": -15.47060489654541, "gen_logits_min": -27.354124069213867, "gen_logits_std": 3.119504928588867, "gen_loss": 0.26750892400741577, "grad_norm": 0.36429142767353473, "learning_rate": 2.2033263157894738e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9962868243455887, "mean_gen_accuracy": 0.8762021064758301, "mean_token_accuracy": 0.9078095257282257, "num_tokens": 825535807.0, "sample_num_tokens": 8756.25, "step": 7043, "total_num_tokens": 825570832.0, "z_loss": 0.0005218948936089873 }, { "copy_logits_max": -3.7800002098083496, "copy_logits_min": -750000064.0, "copy_num_tokens": 520.4375, "epoch": 1.4387030890988002, "gen_logits_max": 2.8582258224487305, "gen_logits_mean": -17.40085220336914, "gen_logits_min": -29.283733367919922, "gen_logits_std": 3.167257070541382, "gen_loss": 0.2544407248497009, "grad_norm": 0.3965157359936245, "learning_rate": 2.2032000000000002e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9968216866254807, "mean_gen_accuracy": 0.8772279769182205, "mean_token_accuracy": 0.9105053097009659, "num_tokens": 825839274.0, "sample_num_tokens": 8930.5, "step": 7044, "total_num_tokens": 825874996.0, "z_loss": 0.00045052339555695653 }, { "copy_logits_max": -4.359974384307861, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.1875, "epoch": 1.4389073270359969, "gen_logits_max": 4.021750450134277, "gen_logits_mean": -15.605052947998047, "gen_logits_min": -27.219623565673828, "gen_logits_std": 3.1101677417755127, "gen_loss": 0.2789788544178009, "grad_norm": 0.3747204562451789, "learning_rate": 2.2030736842105263e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9961680620908737, "mean_gen_accuracy": 0.8751410841941833, "mean_token_accuracy": 0.9079720079898834, "num_tokens": 826120974.0, "sample_num_tokens": 8733.5, "step": 7045, "total_num_tokens": 826155908.0, "z_loss": 0.0004368623485788703 }, { "copy_logits_max": -5.612115859985352, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.6875, "epoch": 1.4391115649731938, "gen_logits_max": 2.97023344039917, "gen_logits_mean": -17.746078491210938, "gen_logits_min": -29.508075714111328, "gen_logits_std": 3.19303035736084, "gen_loss": 0.2984059751033783, "grad_norm": 0.37529929340018464, "learning_rate": 2.2029473684210527e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9955601841211319, "mean_gen_accuracy": 0.8799936324357986, "mean_token_accuracy": 0.9073713421821594, "num_tokens": 826402782.0, "sample_num_tokens": 9861.5, "step": 7046, "total_num_tokens": 826442228.0, "z_loss": 0.0004781337338499725 }, { "copy_logits_max": -4.975769996643066, "copy_logits_min": -687500032.0, "copy_num_tokens": 569.4375, "epoch": 1.4393158029103907, "gen_logits_max": 4.102138519287109, "gen_logits_mean": -15.107653617858887, "gen_logits_min": -27.18627166748047, "gen_logits_std": 3.1134047508239746, "gen_loss": 0.2564653158187866, "grad_norm": 0.3400495583771859, "learning_rate": 2.2028210526315788e-05, "loss": 0.2538, "mean_copy_accuracy": 0.9971830546855927, "mean_gen_accuracy": 0.8813051581382751, "mean_token_accuracy": 0.914491817355156, "num_tokens": 826692736.0, "sample_num_tokens": 9436.0, "step": 7047, "total_num_tokens": 826730480.0, "z_loss": 0.00037972390418872237 }, { "copy_logits_max": -4.574443817138672, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.375, "epoch": 1.4395200408475874, "gen_logits_max": 3.3052151203155518, "gen_logits_mean": -17.215496063232422, "gen_logits_min": -28.99045753479004, "gen_logits_std": 3.171542167663574, "gen_loss": 0.28603360056877136, "grad_norm": 0.3443415898716339, "learning_rate": 2.2026947368421056e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9974599331617355, "mean_gen_accuracy": 0.8734500706195831, "mean_token_accuracy": 0.9076593369245529, "num_tokens": 826976195.0, "sample_num_tokens": 6873.75, "step": 7048, "total_num_tokens": 827003690.0, "z_loss": 0.00046379578998312354 }, { "copy_logits_max": -2.6989028453826904, "copy_logits_min": -750000000.0, "copy_num_tokens": 856.25, "epoch": 1.4397242787847842, "gen_logits_max": 3.3287811279296875, "gen_logits_mean": -15.076162338256836, "gen_logits_min": -27.159034729003906, "gen_logits_std": 3.129189968109131, "gen_loss": 0.24587886035442352, "grad_norm": 0.3644044050018145, "learning_rate": 2.2025684210526317e-05, "loss": 0.2577, "mean_copy_accuracy": 0.9982965141534805, "mean_gen_accuracy": 0.8737228214740753, "mean_token_accuracy": 0.9129238277673721, "num_tokens": 827276975.0, "sample_num_tokens": 10900.75, "step": 7049, "total_num_tokens": 827320578.0, "z_loss": 0.0003937451692763716 }, { "copy_logits_max": -3.763918876647949, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5625, "epoch": 1.439928516721981, "gen_logits_max": 3.5610177516937256, "gen_logits_mean": -16.74091339111328, "gen_logits_min": -28.67124366760254, "gen_logits_std": 3.1676037311553955, "gen_loss": 0.2902863621711731, "grad_norm": 0.3492539971104406, "learning_rate": 2.202442105263158e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9970103353261948, "mean_gen_accuracy": 0.867083266377449, "mean_token_accuracy": 0.9014508426189423, "num_tokens": 827563543.0, "sample_num_tokens": 9487.25, "step": 7050, "total_num_tokens": 827601492.0, "z_loss": 0.00046817021211609244 }, { "copy_logits_max": -3.3693768978118896, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.3125, "epoch": 1.440132754659178, "gen_logits_max": 4.691149711608887, "gen_logits_mean": -14.66016674041748, "gen_logits_min": -26.483139038085938, "gen_logits_std": 3.0640132427215576, "gen_loss": 0.2677420973777771, "grad_norm": 0.3707677071462224, "learning_rate": 2.2023157894736842e-05, "loss": 0.2703, "mean_copy_accuracy": 0.997464582324028, "mean_gen_accuracy": 0.8802850693464279, "mean_token_accuracy": 0.9073814749717712, "num_tokens": 827834954.0, "sample_num_tokens": 9627.0, "step": 7051, "total_num_tokens": 827873462.0, "z_loss": 0.00043508692760951817 }, { "copy_logits_max": -2.453648090362549, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.125, "epoch": 1.4403369925963747, "gen_logits_max": 3.426100730895996, "gen_logits_mean": -16.652494430541992, "gen_logits_min": -28.877954483032227, "gen_logits_std": 3.1724791526794434, "gen_loss": 0.27623802423477173, "grad_norm": 0.37025674852671087, "learning_rate": 2.2021894736842106e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9953241646289825, "mean_gen_accuracy": 0.8755847662687302, "mean_token_accuracy": 0.9063564091920853, "num_tokens": 828109552.0, "sample_num_tokens": 9423.0, "step": 7052, "total_num_tokens": 828147244.0, "z_loss": 0.0005087442696094513 }, { "copy_logits_max": -0.2575379014015198, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.0625, "epoch": 1.4405412305335716, "gen_logits_max": 3.8487133979797363, "gen_logits_mean": -14.936209678649902, "gen_logits_min": -26.507083892822266, "gen_logits_std": 3.103883743286133, "gen_loss": 0.27317342162132263, "grad_norm": 0.3468293881246926, "learning_rate": 2.2020631578947367e-05, "loss": 0.2597, "mean_copy_accuracy": 0.9969084858894348, "mean_gen_accuracy": 0.8786753118038177, "mean_token_accuracy": 0.9112181961536407, "num_tokens": 828395635.0, "sample_num_tokens": 8900.25, "step": 7053, "total_num_tokens": 828431236.0, "z_loss": 0.0005133262602612376 }, { "copy_logits_max": -4.53735876083374, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.8125, "epoch": 1.4407454684707686, "gen_logits_max": 3.529313325881958, "gen_logits_mean": -17.83040428161621, "gen_logits_min": -29.374290466308594, "gen_logits_std": 3.192023754119873, "gen_loss": 0.25530192255973816, "grad_norm": 0.35915861016171374, "learning_rate": 2.201936842105263e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9955776482820511, "mean_gen_accuracy": 0.8785441964864731, "mean_token_accuracy": 0.9058271497488022, "num_tokens": 828664502.0, "sample_num_tokens": 8891.0, "step": 7054, "total_num_tokens": 828700066.0, "z_loss": 0.0004556555359158665 }, { "copy_logits_max": -4.168329238891602, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.3125, "epoch": 1.4409497064079653, "gen_logits_max": 4.226522445678711, "gen_logits_mean": -16.22185707092285, "gen_logits_min": -28.21538734436035, "gen_logits_std": 3.120335578918457, "gen_loss": 0.27193278074264526, "grad_norm": 0.3885850650730338, "learning_rate": 2.2018105263157892e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9970158785581589, "mean_gen_accuracy": 0.8764442205429077, "mean_token_accuracy": 0.9043809324502945, "num_tokens": 828935336.0, "sample_num_tokens": 9802.0, "step": 7055, "total_num_tokens": 828974544.0, "z_loss": 0.0005038946401327848 }, { "copy_logits_max": -6.047577381134033, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.0625, "epoch": 1.4411539443451622, "gen_logits_max": 3.2857418060302734, "gen_logits_mean": -18.16954803466797, "gen_logits_min": -29.851943969726562, "gen_logits_std": 3.176257610321045, "gen_loss": 0.25362154841423035, "grad_norm": 0.4233284555765879, "learning_rate": 2.201684210526316e-05, "loss": 0.3072, "mean_copy_accuracy": 0.9960949867963791, "mean_gen_accuracy": 0.8703663498163223, "mean_token_accuracy": 0.8965460062026978, "num_tokens": 829180721.0, "sample_num_tokens": 8324.25, "step": 7056, "total_num_tokens": 829214018.0, "z_loss": 0.00045612669782713056 }, { "copy_logits_max": -3.3486812114715576, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.0, "epoch": 1.441358182282359, "gen_logits_max": 4.464733600616455, "gen_logits_mean": -16.078447341918945, "gen_logits_min": -27.750484466552734, "gen_logits_std": 3.1237130165100098, "gen_loss": 0.24654477834701538, "grad_norm": 0.33700970277121667, "learning_rate": 2.2015578947368424e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9971804618835449, "mean_gen_accuracy": 0.8835023790597916, "mean_token_accuracy": 0.9097265750169754, "num_tokens": 829439969.0, "sample_num_tokens": 7965.75, "step": 7057, "total_num_tokens": 829471832.0, "z_loss": 0.00042533897794783115 }, { "copy_logits_max": -3.022412061691284, "copy_logits_min": -687500032.0, "copy_num_tokens": 504.6875, "epoch": 1.4415624202195558, "gen_logits_max": 5.569945335388184, "gen_logits_mean": -14.335147857666016, "gen_logits_min": -26.06694793701172, "gen_logits_std": 3.052598476409912, "gen_loss": 0.2718620300292969, "grad_norm": 0.36883308114678437, "learning_rate": 2.2014315789473685e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9967238754034042, "mean_gen_accuracy": 0.876701146364212, "mean_token_accuracy": 0.9040528982877731, "num_tokens": 829695315.0, "sample_num_tokens": 9503.25, "step": 7058, "total_num_tokens": 829733328.0, "z_loss": 0.0005111722275614738 }, { "copy_logits_max": -5.346837997436523, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.75, "epoch": 1.4417666581567525, "gen_logits_max": 5.593878746032715, "gen_logits_mean": -13.978203773498535, "gen_logits_min": -25.9639892578125, "gen_logits_std": 3.062335968017578, "gen_loss": 0.3026924431324005, "grad_norm": 0.411046360709634, "learning_rate": 2.201305263157895e-05, "loss": 0.287, "mean_copy_accuracy": 0.9971628338098526, "mean_gen_accuracy": 0.8738854229450226, "mean_token_accuracy": 0.901791512966156, "num_tokens": 829973915.0, "sample_num_tokens": 10409.75, "step": 7059, "total_num_tokens": 830015554.0, "z_loss": 0.0005271551199257374 }, { "copy_logits_max": -5.853567600250244, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.875, "epoch": 1.4419708960939495, "gen_logits_max": 3.9854772090911865, "gen_logits_mean": -16.365886688232422, "gen_logits_min": -28.325937271118164, "gen_logits_std": 3.174375295639038, "gen_loss": 0.2848179340362549, "grad_norm": 0.3598609506080849, "learning_rate": 2.201178947368421e-05, "loss": 0.2639, "mean_copy_accuracy": 0.99739009141922, "mean_gen_accuracy": 0.8776293396949768, "mean_token_accuracy": 0.9106540679931641, "num_tokens": 830280184.0, "sample_num_tokens": 8976.5, "step": 7060, "total_num_tokens": 830316090.0, "z_loss": 0.0004806107026524842 }, { "copy_logits_max": -2.0456643104553223, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.9375, "epoch": 1.4421751340311464, "gen_logits_max": 3.2695984840393066, "gen_logits_mean": -15.81296443939209, "gen_logits_min": -27.598337173461914, "gen_logits_std": 3.084855318069458, "gen_loss": 0.27617913484573364, "grad_norm": 0.3850222822978584, "learning_rate": 2.2010526315789475e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9957644939422607, "mean_gen_accuracy": 0.873347282409668, "mean_token_accuracy": 0.9071588218212128, "num_tokens": 830555239.0, "sample_num_tokens": 8905.25, "step": 7061, "total_num_tokens": 830590860.0, "z_loss": 0.0004553128674160689 }, { "copy_logits_max": -5.901418685913086, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.4375, "epoch": 1.442379371968343, "gen_logits_max": 3.2273197174072266, "gen_logits_mean": -18.59754180908203, "gen_logits_min": -30.10671615600586, "gen_logits_std": 3.201157331466675, "gen_loss": 0.2660118043422699, "grad_norm": 0.37147867824832415, "learning_rate": 2.2009263157894736e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9971796870231628, "mean_gen_accuracy": 0.875678226351738, "mean_token_accuracy": 0.9017181247472763, "num_tokens": 830822663.0, "sample_num_tokens": 8239.75, "step": 7062, "total_num_tokens": 830855622.0, "z_loss": 0.0004797402070835233 }, { "copy_logits_max": -1.0144072771072388, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.8125, "epoch": 1.44258360990554, "gen_logits_max": 3.458590507507324, "gen_logits_mean": -16.15884017944336, "gen_logits_min": -28.34868049621582, "gen_logits_std": 3.1640801429748535, "gen_loss": 0.2411564290523529, "grad_norm": 0.3794061728628666, "learning_rate": 2.2008e-05, "loss": 0.265, "mean_copy_accuracy": 0.9969276487827301, "mean_gen_accuracy": 0.8781577944755554, "mean_token_accuracy": 0.9087937027215958, "num_tokens": 831096766.0, "sample_num_tokens": 8299.5, "step": 7063, "total_num_tokens": 831129964.0, "z_loss": 0.0005037118680775166 }, { "copy_logits_max": -3.199573516845703, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.875, "epoch": 1.4427878478427367, "gen_logits_max": 4.361140251159668, "gen_logits_mean": -16.548545837402344, "gen_logits_min": -28.59355354309082, "gen_logits_std": 3.178114891052246, "gen_loss": 0.3097898066043854, "grad_norm": 0.33787956041828654, "learning_rate": 2.2006736842105264e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9964890033006668, "mean_gen_accuracy": 0.8768345862627029, "mean_token_accuracy": 0.904401034116745, "num_tokens": 831367825.0, "sample_num_tokens": 8285.25, "step": 7064, "total_num_tokens": 831400966.0, "z_loss": 0.0005699624307453632 }, { "copy_logits_max": -3.1032562255859375, "copy_logits_min": -687500032.0, "copy_num_tokens": 519.5, "epoch": 1.4429920857799337, "gen_logits_max": 4.3491926193237305, "gen_logits_mean": -15.67906379699707, "gen_logits_min": -27.34688377380371, "gen_logits_std": 3.1157546043395996, "gen_loss": 0.2952580451965332, "grad_norm": 0.36396941057700277, "learning_rate": 2.200547368421053e-05, "loss": 0.283, "mean_copy_accuracy": 0.996410682797432, "mean_gen_accuracy": 0.8693326264619827, "mean_token_accuracy": 0.9026627540588379, "num_tokens": 831644105.0, "sample_num_tokens": 10370.75, "step": 7065, "total_num_tokens": 831685588.0, "z_loss": 0.0005161496810615063 }, { "copy_logits_max": -1.2258074283599854, "copy_logits_min": -750000000.0, "copy_num_tokens": 583.5, "epoch": 1.4431963237171304, "gen_logits_max": 3.717609405517578, "gen_logits_mean": -16.057891845703125, "gen_logits_min": -27.808883666992188, "gen_logits_std": 3.1558594703674316, "gen_loss": 0.2765478491783142, "grad_norm": 0.42706462448700555, "learning_rate": 2.200421052631579e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9951708018779755, "mean_gen_accuracy": 0.8718377500772476, "mean_token_accuracy": 0.8996364921331406, "num_tokens": 831907054.0, "sample_num_tokens": 9433.0, "step": 7066, "total_num_tokens": 831944786.0, "z_loss": 0.0004662068095058203 }, { "copy_logits_max": -3.079435110092163, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.25, "epoch": 1.4434005616543273, "gen_logits_max": 3.2321689128875732, "gen_logits_mean": -17.518287658691406, "gen_logits_min": -29.331310272216797, "gen_logits_std": 3.1810414791107178, "gen_loss": 0.26677438616752625, "grad_norm": 0.37007402486867325, "learning_rate": 2.2002947368421054e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9959303587675095, "mean_gen_accuracy": 0.8770864903926849, "mean_token_accuracy": 0.907580092549324, "num_tokens": 832168803.0, "sample_num_tokens": 7234.25, "step": 7067, "total_num_tokens": 832197740.0, "z_loss": 0.0004800864844582975 }, { "copy_logits_max": -4.004937171936035, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.25, "epoch": 1.4436047995915242, "gen_logits_max": 3.8885064125061035, "gen_logits_mean": -16.614431381225586, "gen_logits_min": -28.242319107055664, "gen_logits_std": 3.1354751586914062, "gen_loss": 0.2927860617637634, "grad_norm": 0.39425288732928027, "learning_rate": 2.2001684210526315e-05, "loss": 0.3088, "mean_copy_accuracy": 0.9958700686693192, "mean_gen_accuracy": 0.8682562708854675, "mean_token_accuracy": 0.8941068947315216, "num_tokens": 832421527.0, "sample_num_tokens": 8492.75, "step": 7068, "total_num_tokens": 832455498.0, "z_loss": 0.0005186788621358573 }, { "copy_logits_max": -1.571183681488037, "copy_logits_min": -750000064.0, "copy_num_tokens": 446.8125, "epoch": 1.443809037528721, "gen_logits_max": 6.108258247375488, "gen_logits_mean": -13.091344833374023, "gen_logits_min": -25.670433044433594, "gen_logits_std": 3.155874729156494, "gen_loss": 0.3071296215057373, "grad_norm": 1.1609013953643674, "learning_rate": 2.200042105263158e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9957165271043777, "mean_gen_accuracy": 0.8768871575593948, "mean_token_accuracy": 0.903341993689537, "num_tokens": 832669178.0, "sample_num_tokens": 7711.0, "step": 7069, "total_num_tokens": 832700022.0, "z_loss": 0.0005005199927836657 }, { "copy_logits_max": -2.68432879447937, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.5, "epoch": 1.4440132754659178, "gen_logits_max": 4.155806541442871, "gen_logits_mean": -15.879063606262207, "gen_logits_min": -27.723012924194336, "gen_logits_std": 3.1484973430633545, "gen_loss": 0.313179612159729, "grad_norm": 0.36596388402737995, "learning_rate": 2.199915789473684e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9962832778692245, "mean_gen_accuracy": 0.8756938874721527, "mean_token_accuracy": 0.9042748808860779, "num_tokens": 832918455.0, "sample_num_tokens": 7718.75, "step": 7070, "total_num_tokens": 832949330.0, "z_loss": 0.0004950393922626972 }, { "copy_logits_max": -2.9314041137695312, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.0625, "epoch": 1.4442175134031146, "gen_logits_max": 4.68361234664917, "gen_logits_mean": -15.514432907104492, "gen_logits_min": -27.34237289428711, "gen_logits_std": 3.125617504119873, "gen_loss": 0.3054708242416382, "grad_norm": 0.3894321376061095, "learning_rate": 2.1997894736842104e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9955429285764694, "mean_gen_accuracy": 0.8760459274053574, "mean_token_accuracy": 0.9044477194547653, "num_tokens": 833167109.0, "sample_num_tokens": 8186.25, "step": 7071, "total_num_tokens": 833199854.0, "z_loss": 0.0005024259444326162 }, { "copy_logits_max": -2.297208309173584, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.875, "epoch": 1.4444217513403115, "gen_logits_max": 3.773702621459961, "gen_logits_mean": -15.927841186523438, "gen_logits_min": -28.583513259887695, "gen_logits_std": 3.215003490447998, "gen_loss": 0.2643706202507019, "grad_norm": 0.39825718577570035, "learning_rate": 2.1996631578947372e-05, "loss": 0.2854, "mean_copy_accuracy": 0.997358962893486, "mean_gen_accuracy": 0.8758916407823563, "mean_token_accuracy": 0.905450701713562, "num_tokens": 833417577.0, "sample_num_tokens": 9684.25, "step": 7072, "total_num_tokens": 833456314.0, "z_loss": 0.0004561313835438341 }, { "copy_logits_max": -3.0899715423583984, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.8125, "epoch": 1.4446259892775082, "gen_logits_max": 5.589048385620117, "gen_logits_mean": -14.595660209655762, "gen_logits_min": -26.669078826904297, "gen_logits_std": 3.1767959594726562, "gen_loss": 0.24349921941757202, "grad_norm": 0.3761645039033358, "learning_rate": 2.1995368421052633e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9972889721393585, "mean_gen_accuracy": 0.8798733800649643, "mean_token_accuracy": 0.9115579277276993, "num_tokens": 833690032.0, "sample_num_tokens": 7713.0, "step": 7073, "total_num_tokens": 833720884.0, "z_loss": 0.0003824786690529436 }, { "copy_logits_max": -4.796358585357666, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.0, "epoch": 1.444830227214705, "gen_logits_max": 5.806293487548828, "gen_logits_mean": -15.17367172241211, "gen_logits_min": -26.93040657043457, "gen_logits_std": 3.1458888053894043, "gen_loss": 0.29031556844711304, "grad_norm": 0.3843921848317246, "learning_rate": 2.1994105263157897e-05, "loss": 0.2932, "mean_copy_accuracy": 0.9962974488735199, "mean_gen_accuracy": 0.8736734092235565, "mean_token_accuracy": 0.9015867412090302, "num_tokens": 833955579.0, "sample_num_tokens": 7961.25, "step": 7074, "total_num_tokens": 833987424.0, "z_loss": 0.0004205938312225044 }, { "copy_logits_max": -2.3570122718811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.5625, "epoch": 1.445034465151902, "gen_logits_max": 6.282100200653076, "gen_logits_mean": -14.139925003051758, "gen_logits_min": -26.8265323638916, "gen_logits_std": 3.168752670288086, "gen_loss": 0.25999540090560913, "grad_norm": 0.36415811316025104, "learning_rate": 2.1992842105263158e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9956106543540955, "mean_gen_accuracy": 0.878691166639328, "mean_token_accuracy": 0.9073240458965302, "num_tokens": 834223059.0, "sample_num_tokens": 8794.75, "step": 7075, "total_num_tokens": 834258238.0, "z_loss": 0.00042188074439764023 }, { "copy_logits_max": -5.650180816650391, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.8125, "epoch": 1.4452387030890987, "gen_logits_max": 6.6730265617370605, "gen_logits_mean": -13.343950271606445, "gen_logits_min": -25.571414947509766, "gen_logits_std": 3.1056015491485596, "gen_loss": 0.30149731040000916, "grad_norm": 0.34217793816624, "learning_rate": 2.1991578947368423e-05, "loss": 0.2739, "mean_copy_accuracy": 0.996802031993866, "mean_gen_accuracy": 0.8777823150157928, "mean_token_accuracy": 0.9079731553792953, "num_tokens": 834494101.0, "sample_num_tokens": 7606.75, "step": 7076, "total_num_tokens": 834524528.0, "z_loss": 0.0004812043043784797 }, { "copy_logits_max": -4.980679512023926, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.3125, "epoch": 1.4454429410262957, "gen_logits_max": 4.1149115562438965, "gen_logits_mean": -16.52315902709961, "gen_logits_min": -28.197200775146484, "gen_logits_std": 3.2007877826690674, "gen_loss": 0.2739781141281128, "grad_norm": 0.3421026225720116, "learning_rate": 2.1990315789473683e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9972467571496964, "mean_gen_accuracy": 0.8751741498708725, "mean_token_accuracy": 0.9060316830873489, "num_tokens": 834757162.0, "sample_num_tokens": 7493.0, "step": 7077, "total_num_tokens": 834787134.0, "z_loss": 0.0004183339769951999 }, { "copy_logits_max": -3.7579185962677, "copy_logits_min": -687500032.0, "copy_num_tokens": 626.6875, "epoch": 1.4456471789634926, "gen_logits_max": 5.207589626312256, "gen_logits_mean": -14.743234634399414, "gen_logits_min": -26.83709716796875, "gen_logits_std": 3.1897361278533936, "gen_loss": 0.2824835479259491, "grad_norm": 0.37220957948057243, "learning_rate": 2.1989052631578948e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9968327283859253, "mean_gen_accuracy": 0.8715190589427948, "mean_token_accuracy": 0.9017375558614731, "num_tokens": 835042761.0, "sample_num_tokens": 10359.75, "step": 7078, "total_num_tokens": 835084200.0, "z_loss": 0.00046602278598584235 }, { "copy_logits_max": -0.27283626794815063, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5, "epoch": 1.4458514169006893, "gen_logits_max": 3.5376219749450684, "gen_logits_mean": -15.914416313171387, "gen_logits_min": -28.122295379638672, "gen_logits_std": 3.181671380996704, "gen_loss": 0.2466532289981842, "grad_norm": 0.3732788966992971, "learning_rate": 2.198778947368421e-05, "loss": 0.284, "mean_copy_accuracy": 0.9971770793199539, "mean_gen_accuracy": 0.8733035027980804, "mean_token_accuracy": 0.9041749536991119, "num_tokens": 835322643.0, "sample_num_tokens": 7624.25, "step": 7079, "total_num_tokens": 835353140.0, "z_loss": 0.0003984238428529352 }, { "copy_logits_max": -5.26780891418457, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.375, "epoch": 1.446055654837886, "gen_logits_max": 5.5105767250061035, "gen_logits_mean": -14.808623313903809, "gen_logits_min": -26.89385223388672, "gen_logits_std": 3.1538355350494385, "gen_loss": 0.28942397236824036, "grad_norm": 0.36371584636588933, "learning_rate": 2.1986526315789476e-05, "loss": 0.297, "mean_copy_accuracy": 0.9955711662769318, "mean_gen_accuracy": 0.8709144443273544, "mean_token_accuracy": 0.8988704532384872, "num_tokens": 835580360.0, "sample_num_tokens": 8727.5, "step": 7080, "total_num_tokens": 835615270.0, "z_loss": 0.0004987275460734963 }, { "copy_logits_max": -0.9482614994049072, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.4375, "epoch": 1.446259892775083, "gen_logits_max": 5.313467979431152, "gen_logits_mean": -14.853926658630371, "gen_logits_min": -27.078006744384766, "gen_logits_std": 3.1712698936462402, "gen_loss": 0.22547364234924316, "grad_norm": 0.4030224174334349, "learning_rate": 2.1985263157894737e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9963791966438293, "mean_gen_accuracy": 0.8802653849124908, "mean_token_accuracy": 0.9072535037994385, "num_tokens": 835839676.0, "sample_num_tokens": 7877.5, "step": 7081, "total_num_tokens": 835871186.0, "z_loss": 0.00040172916487790644 }, { "copy_logits_max": -5.203461170196533, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.0, "epoch": 1.4464641307122799, "gen_logits_max": 4.732402801513672, "gen_logits_mean": -16.54096221923828, "gen_logits_min": -28.2902774810791, "gen_logits_std": 3.18723201751709, "gen_loss": 0.33231690526008606, "grad_norm": 0.3643188629370509, "learning_rate": 2.1984e-05, "loss": 0.318, "mean_copy_accuracy": 0.9967065900564194, "mean_gen_accuracy": 0.865656852722168, "mean_token_accuracy": 0.8912892639636993, "num_tokens": 836106840.0, "sample_num_tokens": 7956.5, "step": 7082, "total_num_tokens": 836138666.0, "z_loss": 0.0005770124844275415 }, { "copy_logits_max": -5.5341691970825195, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.375, "epoch": 1.4466683686494766, "gen_logits_max": 3.7254953384399414, "gen_logits_mean": -17.89309310913086, "gen_logits_min": -29.61130142211914, "gen_logits_std": 3.2348580360412598, "gen_loss": 0.3171541094779968, "grad_norm": 0.3783510703283785, "learning_rate": 2.1982736842105263e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9957660734653473, "mean_gen_accuracy": 0.8709384202957153, "mean_token_accuracy": 0.901239275932312, "num_tokens": 836359835.0, "sample_num_tokens": 7732.75, "step": 7083, "total_num_tokens": 836390766.0, "z_loss": 0.0005403778050094843 }, { "copy_logits_max": -4.189582824707031, "copy_logits_min": -750000064.0, "copy_num_tokens": 477.0625, "epoch": 1.4468726065866735, "gen_logits_max": 5.2988128662109375, "gen_logits_mean": -15.138389587402344, "gen_logits_min": -27.062156677246094, "gen_logits_std": 3.156137466430664, "gen_loss": 0.3004932999610901, "grad_norm": 0.4028826313474025, "learning_rate": 2.1981473684210527e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9969414919614792, "mean_gen_accuracy": 0.8714589625597, "mean_token_accuracy": 0.9052894711494446, "num_tokens": 836626775.0, "sample_num_tokens": 8988.25, "step": 7084, "total_num_tokens": 836662728.0, "z_loss": 0.0005500355619005859 }, { "copy_logits_max": -4.633389949798584, "copy_logits_min": -750000064.0, "copy_num_tokens": 370.75, "epoch": 1.4470768445238704, "gen_logits_max": 7.022422790527344, "gen_logits_mean": -13.514942169189453, "gen_logits_min": -25.920101165771484, "gen_logits_std": 3.12164306640625, "gen_loss": 0.3372073769569397, "grad_norm": 0.37752044050383915, "learning_rate": 2.198021052631579e-05, "loss": 0.3104, "mean_copy_accuracy": 0.9973989427089691, "mean_gen_accuracy": 0.8594418317079544, "mean_token_accuracy": 0.8954828828573227, "num_tokens": 836890769.0, "sample_num_tokens": 7343.75, "step": 7085, "total_num_tokens": 836920144.0, "z_loss": 0.0005972346989437938 }, { "copy_logits_max": -1.599626064300537, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.6875, "epoch": 1.4472810824610671, "gen_logits_max": 5.386517524719238, "gen_logits_mean": -15.128896713256836, "gen_logits_min": -26.786317825317383, "gen_logits_std": 3.108372688293457, "gen_loss": 0.3030952215194702, "grad_norm": 0.3775734299191488, "learning_rate": 2.1978947368421052e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9960527420043945, "mean_gen_accuracy": 0.875145435333252, "mean_token_accuracy": 0.9037057608366013, "num_tokens": 837154628.0, "sample_num_tokens": 7613.5, "step": 7086, "total_num_tokens": 837185082.0, "z_loss": 0.0005340996431186795 }, { "copy_logits_max": -3.6610121726989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.5625, "epoch": 1.447485320398264, "gen_logits_max": 4.476108551025391, "gen_logits_mean": -15.532354354858398, "gen_logits_min": -27.290464401245117, "gen_logits_std": 3.1565728187561035, "gen_loss": 0.284990131855011, "grad_norm": 0.4024671805906311, "learning_rate": 2.1977684210526316e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9966412037611008, "mean_gen_accuracy": 0.8711679577827454, "mean_token_accuracy": 0.9013779163360596, "num_tokens": 837414700.0, "sample_num_tokens": 7912.0, "step": 7087, "total_num_tokens": 837446348.0, "z_loss": 0.0004888265393674374 }, { "copy_logits_max": -4.92376708984375, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.8125, "epoch": 1.4476895583354608, "gen_logits_max": 4.978240966796875, "gen_logits_mean": -15.980119705200195, "gen_logits_min": -27.27289390563965, "gen_logits_std": 3.079564094543457, "gen_loss": 0.3137926459312439, "grad_norm": 0.3855604681381655, "learning_rate": 2.1976421052631577e-05, "loss": 0.3102, "mean_copy_accuracy": 0.9948378801345825, "mean_gen_accuracy": 0.8679544478654861, "mean_token_accuracy": 0.8944890946149826, "num_tokens": 837676148.0, "sample_num_tokens": 7855.0, "step": 7088, "total_num_tokens": 837707568.0, "z_loss": 0.0005209792871028185 }, { "copy_logits_max": -0.76617431640625, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.875, "epoch": 1.4478937962726577, "gen_logits_max": 5.3380937576293945, "gen_logits_mean": -13.775202751159668, "gen_logits_min": -25.236881256103516, "gen_logits_std": 3.0308644771575928, "gen_loss": 0.3014736771583557, "grad_norm": 0.3875325969691768, "learning_rate": 2.1975157894736845e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9971965253353119, "mean_gen_accuracy": 0.8700668662786484, "mean_token_accuracy": 0.9020205587148666, "num_tokens": 837955999.0, "sample_num_tokens": 8888.25, "step": 7089, "total_num_tokens": 837991552.0, "z_loss": 0.000492881634272635 }, { "copy_logits_max": -1.2434232234954834, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5625, "epoch": 1.4480980342098544, "gen_logits_max": 4.741713523864746, "gen_logits_mean": -13.511521339416504, "gen_logits_min": -24.847932815551758, "gen_logits_std": 2.959453582763672, "gen_loss": 0.26040977239608765, "grad_norm": 0.38238420009870666, "learning_rate": 2.1973894736842106e-05, "loss": 0.279, "mean_copy_accuracy": 0.9966471940279007, "mean_gen_accuracy": 0.8784545511007309, "mean_token_accuracy": 0.9048471450805664, "num_tokens": 838215202.0, "sample_num_tokens": 7623.0, "step": 7090, "total_num_tokens": 838245694.0, "z_loss": 0.0004764948389492929 }, { "copy_logits_max": -3.233339548110962, "copy_logits_min": -750000000.0, "copy_num_tokens": 770.875, "epoch": 1.4483022721470513, "gen_logits_max": 2.5754313468933105, "gen_logits_mean": -17.752477645874023, "gen_logits_min": -29.599998474121094, "gen_logits_std": 3.194225311279297, "gen_loss": 0.25316572189331055, "grad_norm": 0.3642504939013074, "learning_rate": 2.197263157894737e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9974613040685654, "mean_gen_accuracy": 0.8738966435194016, "mean_token_accuracy": 0.9058537185192108, "num_tokens": 838495363.0, "sample_num_tokens": 11989.75, "step": 7091, "total_num_tokens": 838543322.0, "z_loss": 0.00045929706539027393 }, { "copy_logits_max": 0.4942657947540283, "copy_logits_min": -750000000.0, "copy_num_tokens": 810.1875, "epoch": 1.4485065100842482, "gen_logits_max": 3.4140803813934326, "gen_logits_mean": -15.503058433532715, "gen_logits_min": -27.6886043548584, "gen_logits_std": 3.1124446392059326, "gen_loss": 0.26201331615448, "grad_norm": 0.3317859941928066, "learning_rate": 2.197136842105263e-05, "loss": 0.273, "mean_copy_accuracy": 0.9971188306808472, "mean_gen_accuracy": 0.8731264173984528, "mean_token_accuracy": 0.907151997089386, "num_tokens": 838794424.0, "sample_num_tokens": 10397.0, "step": 7092, "total_num_tokens": 838836012.0, "z_loss": 0.0005213910480961204 }, { "copy_logits_max": -1.5974338054656982, "copy_logits_min": -750000064.0, "copy_num_tokens": 451.125, "epoch": 1.448710748021445, "gen_logits_max": 4.202644348144531, "gen_logits_mean": -15.569313049316406, "gen_logits_min": -27.26607322692871, "gen_logits_std": 3.0947163105010986, "gen_loss": 0.2910597026348114, "grad_norm": 0.3769635332875724, "learning_rate": 2.1970105263157895e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9970708638429642, "mean_gen_accuracy": 0.8740974515676498, "mean_token_accuracy": 0.9042463153600693, "num_tokens": 839067694.0, "sample_num_tokens": 8314.5, "step": 7093, "total_num_tokens": 839100952.0, "z_loss": 0.0005171720404177904 }, { "copy_logits_max": -1.2716885805130005, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5625, "epoch": 1.4489149859586419, "gen_logits_max": 3.8151512145996094, "gen_logits_mean": -15.413030624389648, "gen_logits_min": -27.378568649291992, "gen_logits_std": 3.0947563648223877, "gen_loss": 0.25117990374565125, "grad_norm": 0.35747159340255363, "learning_rate": 2.1968842105263156e-05, "loss": 0.261, "mean_copy_accuracy": 0.996550127863884, "mean_gen_accuracy": 0.8852577656507492, "mean_token_accuracy": 0.9113838225603104, "num_tokens": 839332900.0, "sample_num_tokens": 7764.5, "step": 7094, "total_num_tokens": 839363958.0, "z_loss": 0.00047485344111919403 }, { "copy_logits_max": -2.5609560012817383, "copy_logits_min": -687500032.0, "copy_num_tokens": 331.0625, "epoch": 1.4491192238958386, "gen_logits_max": 5.485495567321777, "gen_logits_mean": -14.867890357971191, "gen_logits_min": -26.772968292236328, "gen_logits_std": 3.098933696746826, "gen_loss": 0.2862269878387451, "grad_norm": 0.3595236092532639, "learning_rate": 2.196757894736842e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9973730742931366, "mean_gen_accuracy": 0.8746453374624252, "mean_token_accuracy": 0.9049599468708038, "num_tokens": 839612718.0, "sample_num_tokens": 7678.5, "step": 7095, "total_num_tokens": 839643432.0, "z_loss": 0.0005338104674592614 }, { "copy_logits_max": -2.896125555038452, "copy_logits_min": -687500096.0, "copy_num_tokens": 533.6875, "epoch": 1.4493234618330355, "gen_logits_max": 3.654938220977783, "gen_logits_mean": -16.556896209716797, "gen_logits_min": -28.443077087402344, "gen_logits_std": 3.110358715057373, "gen_loss": 0.2572534680366516, "grad_norm": 0.3517655988868348, "learning_rate": 2.196631578947368e-05, "loss": 0.266, "mean_copy_accuracy": 0.996892511844635, "mean_gen_accuracy": 0.875733807682991, "mean_token_accuracy": 0.9102549999952316, "num_tokens": 839899304.0, "sample_num_tokens": 8952.0, "step": 7096, "total_num_tokens": 839935112.0, "z_loss": 0.0004688529297709465 }, { "copy_logits_max": -1.8891632556915283, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.75, "epoch": 1.4495276997702322, "gen_logits_max": 4.493006706237793, "gen_logits_mean": -15.798637390136719, "gen_logits_min": -27.55565071105957, "gen_logits_std": 3.0714244842529297, "gen_loss": 0.2571685314178467, "grad_norm": 0.34950801403545106, "learning_rate": 2.196505263157895e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9961602091789246, "mean_gen_accuracy": 0.8829137235879898, "mean_token_accuracy": 0.9074511229991913, "num_tokens": 840157580.0, "sample_num_tokens": 7929.0, "step": 7097, "total_num_tokens": 840189296.0, "z_loss": 0.000478610658319667 }, { "copy_logits_max": -1.091089129447937, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.8125, "epoch": 1.4497319377074291, "gen_logits_max": 3.848179340362549, "gen_logits_mean": -15.655658721923828, "gen_logits_min": -27.514835357666016, "gen_logits_std": 3.081974983215332, "gen_loss": 0.25710737705230713, "grad_norm": 0.35558476439675885, "learning_rate": 2.1963789473684214e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9971054047346115, "mean_gen_accuracy": 0.8790372461080551, "mean_token_accuracy": 0.9097408950328827, "num_tokens": 840436836.0, "sample_num_tokens": 10172.5, "step": 7098, "total_num_tokens": 840477526.0, "z_loss": 0.00048285030061379075 }, { "copy_logits_max": -0.413466215133667, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.125, "epoch": 1.449936175644626, "gen_logits_max": 3.607171058654785, "gen_logits_mean": -16.031105041503906, "gen_logits_min": -28.282947540283203, "gen_logits_std": 3.119227409362793, "gen_loss": 0.2744159400463104, "grad_norm": 0.3445638957445179, "learning_rate": 2.1962526315789475e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9970099329948425, "mean_gen_accuracy": 0.8737398386001587, "mean_token_accuracy": 0.9068117737770081, "num_tokens": 840733964.0, "sample_num_tokens": 9021.5, "step": 7099, "total_num_tokens": 840770050.0, "z_loss": 0.00047647079918533564 }, { "copy_logits_max": -1.1655443906784058, "copy_logits_min": -625000064.0, "copy_num_tokens": 398.4375, "epoch": 1.4501404135818228, "gen_logits_max": 5.0942888259887695, "gen_logits_mean": -16.275434494018555, "gen_logits_min": -28.299720764160156, "gen_logits_std": 3.129844903945923, "gen_loss": 0.2942124009132385, "grad_norm": 0.3907347711135099, "learning_rate": 2.196126315789474e-05, "loss": 0.3045, "mean_copy_accuracy": 0.9963723868131638, "mean_gen_accuracy": 0.8689365535974503, "mean_token_accuracy": 0.8958916217088699, "num_tokens": 840993616.0, "sample_num_tokens": 8815.5, "step": 7100, "total_num_tokens": 841028878.0, "z_loss": 0.0005498587270267308 }, { "copy_logits_max": 0.45145028829574585, "copy_logits_min": -750000064.0, "copy_num_tokens": 749.9375, "epoch": 1.4503446515190197, "gen_logits_max": 4.377712726593018, "gen_logits_mean": -14.232664108276367, "gen_logits_min": -25.898136138916016, "gen_logits_std": 3.02632999420166, "gen_loss": 0.24267607927322388, "grad_norm": 0.3592928963549536, "learning_rate": 2.196e-05, "loss": 0.251, "mean_copy_accuracy": 0.9968219995498657, "mean_gen_accuracy": 0.8830492794513702, "mean_token_accuracy": 0.9149049669504166, "num_tokens": 841264899.0, "sample_num_tokens": 9957.25, "step": 7101, "total_num_tokens": 841304728.0, "z_loss": 0.00044073781464248896 }, { "copy_logits_max": -0.4156873822212219, "copy_logits_min": -750000064.0, "copy_num_tokens": 608.5625, "epoch": 1.4505488894562166, "gen_logits_max": 2.9402356147766113, "gen_logits_mean": -16.869651794433594, "gen_logits_min": -28.721038818359375, "gen_logits_std": 3.1264801025390625, "gen_loss": 0.27306464314460754, "grad_norm": 0.39218857066367313, "learning_rate": 2.1958736842105264e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9965852349996567, "mean_gen_accuracy": 0.8744165450334549, "mean_token_accuracy": 0.9019755423069, "num_tokens": 841538499.0, "sample_num_tokens": 9647.75, "step": 7102, "total_num_tokens": 841577090.0, "z_loss": 0.00045562133891507983 }, { "copy_logits_max": 0.9938012361526489, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.4375, "epoch": 1.4507531273934133, "gen_logits_max": 4.014463901519775, "gen_logits_mean": -15.226655960083008, "gen_logits_min": -26.851680755615234, "gen_logits_std": 3.046088218688965, "gen_loss": 0.24472522735595703, "grad_norm": 0.3542000625013837, "learning_rate": 2.1957473684210525e-05, "loss": 0.2678, "mean_copy_accuracy": 0.9967711567878723, "mean_gen_accuracy": 0.8799636363983154, "mean_token_accuracy": 0.9094510227441788, "num_tokens": 841825853.0, "sample_num_tokens": 8069.25, "step": 7103, "total_num_tokens": 841858130.0, "z_loss": 0.0004328897048253566 }, { "copy_logits_max": -1.2321345806121826, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.9375, "epoch": 1.45095736533061, "gen_logits_max": 3.694253921508789, "gen_logits_mean": -16.31736183166504, "gen_logits_min": -27.926008224487305, "gen_logits_std": 3.10990571975708, "gen_loss": 0.26688089966773987, "grad_norm": 0.37594107112766434, "learning_rate": 2.195621052631579e-05, "loss": 0.2966, "mean_copy_accuracy": 0.997325524687767, "mean_gen_accuracy": 0.8686004728078842, "mean_token_accuracy": 0.9039603769779205, "num_tokens": 842116087.0, "sample_num_tokens": 9084.25, "step": 7104, "total_num_tokens": 842152424.0, "z_loss": 0.0004251970676705241 }, { "copy_logits_max": 1.9742302894592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.3125, "epoch": 1.451161603267807, "gen_logits_max": 4.201796531677246, "gen_logits_mean": -14.780288696289062, "gen_logits_min": -26.764432907104492, "gen_logits_std": 3.062452793121338, "gen_loss": 0.28845715522766113, "grad_norm": 0.33970817480768767, "learning_rate": 2.1954947368421054e-05, "loss": 0.274, "mean_copy_accuracy": 0.9973036199808121, "mean_gen_accuracy": 0.8742440044879913, "mean_token_accuracy": 0.9075641930103302, "num_tokens": 842400994.0, "sample_num_tokens": 9206.5, "step": 7105, "total_num_tokens": 842437820.0, "z_loss": 0.00044903199886903167 }, { "copy_logits_max": -0.09641456604003906, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.6875, "epoch": 1.451365841205004, "gen_logits_max": 2.7930638790130615, "gen_logits_mean": -16.856136322021484, "gen_logits_min": -28.504032135009766, "gen_logits_std": 3.123854160308838, "gen_loss": 0.2844851613044739, "grad_norm": 0.41168967372164156, "learning_rate": 2.1953684210526318e-05, "loss": 0.2964, "mean_copy_accuracy": 0.9960132241249084, "mean_gen_accuracy": 0.8718030154705048, "mean_token_accuracy": 0.8982189148664474, "num_tokens": 842661122.0, "sample_num_tokens": 8835.5, "step": 7106, "total_num_tokens": 842696464.0, "z_loss": 0.0004070323193445802 }, { "copy_logits_max": 1.6853916645050049, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.8125, "epoch": 1.4515700791422006, "gen_logits_max": 4.1068949699401855, "gen_logits_mean": -15.482860565185547, "gen_logits_min": -27.19894790649414, "gen_logits_std": 3.1150412559509277, "gen_loss": 0.24651303887367249, "grad_norm": 0.38551456665732253, "learning_rate": 2.195242105263158e-05, "loss": 0.288, "mean_copy_accuracy": 0.9970977008342743, "mean_gen_accuracy": 0.8738829046487808, "mean_token_accuracy": 0.9026817679405212, "num_tokens": 842926474.0, "sample_num_tokens": 8190.0, "step": 7107, "total_num_tokens": 842959234.0, "z_loss": 0.00036485542659647763 }, { "copy_logits_max": 2.9884591102600098, "copy_logits_min": -625000064.0, "copy_num_tokens": 677.6875, "epoch": 1.4517743170793975, "gen_logits_max": 2.9446659088134766, "gen_logits_mean": -16.30910301208496, "gen_logits_min": -28.222930908203125, "gen_logits_std": 3.1109259128570557, "gen_loss": 0.2857506573200226, "grad_norm": 0.36896611715693944, "learning_rate": 2.1951157894736843e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9975582361221313, "mean_gen_accuracy": 0.8644085079431534, "mean_token_accuracy": 0.9046277552843094, "num_tokens": 843217188.0, "sample_num_tokens": 9503.0, "step": 7108, "total_num_tokens": 843255200.0, "z_loss": 0.0005108597688376904 }, { "copy_logits_max": -0.617599368095398, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.25, "epoch": 1.4519785550165945, "gen_logits_max": 2.745260238647461, "gen_logits_mean": -17.832494735717773, "gen_logits_min": -29.566362380981445, "gen_logits_std": 3.168062925338745, "gen_loss": 0.2733863592147827, "grad_norm": 0.40351653287701533, "learning_rate": 2.1949894736842104e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9968007504940033, "mean_gen_accuracy": 0.8742249310016632, "mean_token_accuracy": 0.9046721309423447, "num_tokens": 843488458.0, "sample_num_tokens": 8402.0, "step": 7109, "total_num_tokens": 843522066.0, "z_loss": 0.0004223016439937055 }, { "copy_logits_max": 3.534994602203369, "copy_logits_min": -750000000.0, "copy_num_tokens": 710.6875, "epoch": 1.4521827929537912, "gen_logits_max": 3.133537769317627, "gen_logits_mean": -15.986787796020508, "gen_logits_min": -28.18572425842285, "gen_logits_std": 3.1894912719726562, "gen_loss": 0.26459014415740967, "grad_norm": 0.39222812718979533, "learning_rate": 2.194863157894737e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9971190690994263, "mean_gen_accuracy": 0.8653139173984528, "mean_token_accuracy": 0.8980516344308853, "num_tokens": 843754562.0, "sample_num_tokens": 9280.5, "step": 7110, "total_num_tokens": 843791684.0, "z_loss": 0.0004575008642859757 }, { "copy_logits_max": 0.8178176283836365, "copy_logits_min": -625000064.0, "copy_num_tokens": 439.6875, "epoch": 1.4523870308909879, "gen_logits_max": 4.248553276062012, "gen_logits_mean": -14.752679824829102, "gen_logits_min": -26.40869140625, "gen_logits_std": 3.0159435272216797, "gen_loss": 0.29165399074554443, "grad_norm": 0.347045533454903, "learning_rate": 2.1947368421052633e-05, "loss": 0.282, "mean_copy_accuracy": 0.99654221534729, "mean_gen_accuracy": 0.875274270772934, "mean_token_accuracy": 0.9053610563278198, "num_tokens": 844030102.0, "sample_num_tokens": 7976.0, "step": 7111, "total_num_tokens": 844062006.0, "z_loss": 0.0005264083156362176 }, { "copy_logits_max": 2.136834144592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.375, "epoch": 1.4525912688281848, "gen_logits_max": 4.76804256439209, "gen_logits_mean": -15.2606840133667, "gen_logits_min": -28.00238800048828, "gen_logits_std": 3.122682571411133, "gen_loss": 0.2933119833469391, "grad_norm": 0.37022246601194636, "learning_rate": 2.1946105263157894e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9961780160665512, "mean_gen_accuracy": 0.8723461478948593, "mean_token_accuracy": 0.9018024504184723, "num_tokens": 844286042.0, "sample_num_tokens": 7659.0, "step": 7112, "total_num_tokens": 844316678.0, "z_loss": 0.0005354488966986537 }, { "copy_logits_max": -1.9134479761123657, "copy_logits_min": -687500032.0, "copy_num_tokens": 545.875, "epoch": 1.4527955067653817, "gen_logits_max": 3.3497135639190674, "gen_logits_mean": -17.424741744995117, "gen_logits_min": -29.169118881225586, "gen_logits_std": 3.147488594055176, "gen_loss": 0.2444121241569519, "grad_norm": 0.37295335166141297, "learning_rate": 2.194484210526316e-05, "loss": 0.255, "mean_copy_accuracy": 0.9964828491210938, "mean_gen_accuracy": 0.8856457471847534, "mean_token_accuracy": 0.9129723757505417, "num_tokens": 844549715.0, "sample_num_tokens": 9088.75, "step": 7113, "total_num_tokens": 844586070.0, "z_loss": 0.0004864717775490135 }, { "copy_logits_max": -2.1629953384399414, "copy_logits_min": -750000064.0, "copy_num_tokens": 406.0, "epoch": 1.4529997447025784, "gen_logits_max": 4.047411918640137, "gen_logits_mean": -17.719623565673828, "gen_logits_min": -29.14487075805664, "gen_logits_std": 3.1532087326049805, "gen_loss": 0.2541094124317169, "grad_norm": 0.3166541851248292, "learning_rate": 2.1943578947368422e-05, "loss": 0.2502, "mean_copy_accuracy": 0.9971597343683243, "mean_gen_accuracy": 0.8863544166088104, "mean_token_accuracy": 0.9142087996006012, "num_tokens": 844857804.0, "sample_num_tokens": 9198.5, "step": 7114, "total_num_tokens": 844894598.0, "z_loss": 0.000492763880174607 }, { "copy_logits_max": -0.6748903393745422, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.9375, "epoch": 1.4532039826397753, "gen_logits_max": 5.219203948974609, "gen_logits_mean": -15.309900283813477, "gen_logits_min": -27.164499282836914, "gen_logits_std": 3.0499367713928223, "gen_loss": 0.3127811551094055, "grad_norm": 0.34615239388367547, "learning_rate": 2.1942315789473687e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9960970133543015, "mean_gen_accuracy": 0.8757955431938171, "mean_token_accuracy": 0.903157114982605, "num_tokens": 845126337.0, "sample_num_tokens": 8162.25, "step": 7115, "total_num_tokens": 845158986.0, "z_loss": 0.0005426947609521449 }, { "copy_logits_max": 0.7924366593360901, "copy_logits_min": -750000000.0, "copy_num_tokens": 595.375, "epoch": 1.4534082205769723, "gen_logits_max": 3.162479877471924, "gen_logits_mean": -16.606380462646484, "gen_logits_min": -28.48523712158203, "gen_logits_std": 3.1430909633636475, "gen_loss": 0.2705203890800476, "grad_norm": 0.35545422456145565, "learning_rate": 2.1941052631578948e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9972969442605972, "mean_gen_accuracy": 0.8692176043987274, "mean_token_accuracy": 0.9015298932790756, "num_tokens": 845403061.0, "sample_num_tokens": 8840.75, "step": 7116, "total_num_tokens": 845438424.0, "z_loss": 0.0005306293023750186 }, { "copy_logits_max": -0.8675774931907654, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0, "epoch": 1.453612458514169, "gen_logits_max": 4.720977783203125, "gen_logits_mean": -15.144222259521484, "gen_logits_min": -26.96873664855957, "gen_logits_std": 3.077868700027466, "gen_loss": 0.2969980239868164, "grad_norm": 0.35710797867093164, "learning_rate": 2.1939789473684212e-05, "loss": 0.274, "mean_copy_accuracy": 0.9958247691392899, "mean_gen_accuracy": 0.876246988773346, "mean_token_accuracy": 0.906306192278862, "num_tokens": 845665686.0, "sample_num_tokens": 8599.0, "step": 7117, "total_num_tokens": 845700082.0, "z_loss": 0.0005740197957493365 }, { "copy_logits_max": -2.678684949874878, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.8125, "epoch": 1.453816696451366, "gen_logits_max": 4.248698711395264, "gen_logits_mean": -16.979398727416992, "gen_logits_min": -28.87570571899414, "gen_logits_std": 3.1587917804718018, "gen_loss": 0.28106486797332764, "grad_norm": 0.3447386672513696, "learning_rate": 2.1938526315789473e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9963214546442032, "mean_gen_accuracy": 0.8811298608779907, "mean_token_accuracy": 0.911963939666748, "num_tokens": 845949425.0, "sample_num_tokens": 7977.75, "step": 7118, "total_num_tokens": 845981336.0, "z_loss": 0.0005164634203538299 }, { "copy_logits_max": 3.0448288917541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 652.375, "epoch": 1.4540209343885626, "gen_logits_max": 2.5304975509643555, "gen_logits_mean": -16.601451873779297, "gen_logits_min": -28.927406311035156, "gen_logits_std": 3.1882176399230957, "gen_loss": 0.27531421184539795, "grad_norm": 0.3484652837438268, "learning_rate": 2.1937263157894737e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9970609843730927, "mean_gen_accuracy": 0.8768329918384552, "mean_token_accuracy": 0.909681648015976, "num_tokens": 846218629.0, "sample_num_tokens": 8735.75, "step": 7119, "total_num_tokens": 846253572.0, "z_loss": 0.0005069889011792839 }, { "copy_logits_max": 0.6308629512786865, "copy_logits_min": -687500032.0, "copy_num_tokens": 552.6875, "epoch": 1.4542251723257595, "gen_logits_max": 3.853569984436035, "gen_logits_mean": -16.200927734375, "gen_logits_min": -28.41283416748047, "gen_logits_std": 3.1792140007019043, "gen_loss": 0.28219395875930786, "grad_norm": 0.3624830118763393, "learning_rate": 2.1935999999999998e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9962621033191681, "mean_gen_accuracy": 0.8776305317878723, "mean_token_accuracy": 0.9032468646764755, "num_tokens": 846467356.0, "sample_num_tokens": 9096.5, "step": 7120, "total_num_tokens": 846503742.0, "z_loss": 0.0005143095622770488 }, { "copy_logits_max": 1.356209635734558, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.6875, "epoch": 1.4544294102629562, "gen_logits_max": 4.9530930519104, "gen_logits_mean": -14.622000694274902, "gen_logits_min": -26.730552673339844, "gen_logits_std": 3.084705352783203, "gen_loss": 0.32024896144866943, "grad_norm": 0.34764829916960377, "learning_rate": 2.1934736842105266e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9966957718133926, "mean_gen_accuracy": 0.871869370341301, "mean_token_accuracy": 0.9007960557937622, "num_tokens": 846734137.0, "sample_num_tokens": 7726.25, "step": 7121, "total_num_tokens": 846765042.0, "z_loss": 0.0005770243005827069 }, { "copy_logits_max": -1.0127605199813843, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.625, "epoch": 1.4546336482001532, "gen_logits_max": 4.642570972442627, "gen_logits_mean": -15.302738189697266, "gen_logits_min": -27.462366104125977, "gen_logits_std": 3.1249165534973145, "gen_loss": 0.3043544888496399, "grad_norm": 0.37455442877724365, "learning_rate": 2.1933473684210527e-05, "loss": 0.289, "mean_copy_accuracy": 0.9969171732664108, "mean_gen_accuracy": 0.8723078221082687, "mean_token_accuracy": 0.9019911289215088, "num_tokens": 846991743.0, "sample_num_tokens": 7439.25, "step": 7122, "total_num_tokens": 847021500.0, "z_loss": 0.0005231336108408868 }, { "copy_logits_max": -0.40383782982826233, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.5625, "epoch": 1.45483788613735, "gen_logits_max": 3.029970645904541, "gen_logits_mean": -16.110660552978516, "gen_logits_min": -28.095748901367188, "gen_logits_std": 3.1786556243896484, "gen_loss": 0.23458944261074066, "grad_norm": 0.3720662084812471, "learning_rate": 2.193221052631579e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9973815828561783, "mean_gen_accuracy": 0.8732002973556519, "mean_token_accuracy": 0.904968798160553, "num_tokens": 847248795.0, "sample_num_tokens": 9074.25, "step": 7123, "total_num_tokens": 847285092.0, "z_loss": 0.0004042315704282373 }, { "copy_logits_max": 0.3251610994338989, "copy_logits_min": -687500032.0, "copy_num_tokens": 503.25, "epoch": 1.4550421240745468, "gen_logits_max": 4.670177459716797, "gen_logits_mean": -14.848223686218262, "gen_logits_min": -26.661636352539062, "gen_logits_std": 3.0748274326324463, "gen_loss": 0.2603234648704529, "grad_norm": 0.39365173995636726, "learning_rate": 2.1930947368421052e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9957853704690933, "mean_gen_accuracy": 0.8771432191133499, "mean_token_accuracy": 0.9060443490743637, "num_tokens": 847530334.0, "sample_num_tokens": 9565.5, "step": 7124, "total_num_tokens": 847568596.0, "z_loss": 0.0004385033971630037 }, { "copy_logits_max": -3.9981296062469482, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.0, "epoch": 1.4552463620117437, "gen_logits_max": 3.5487704277038574, "gen_logits_mean": -17.037002563476562, "gen_logits_min": -28.645008087158203, "gen_logits_std": 3.1866493225097656, "gen_loss": 0.2509077191352844, "grad_norm": 0.3621137181744738, "learning_rate": 2.1929684210526316e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9970046877861023, "mean_gen_accuracy": 0.8839870393276215, "mean_token_accuracy": 0.9116059094667435, "num_tokens": 847779846.0, "sample_num_tokens": 9639.5, "step": 7125, "total_num_tokens": 847818404.0, "z_loss": 0.0004799002781510353 }, { "copy_logits_max": -3.164712905883789, "copy_logits_min": -687500032.0, "copy_num_tokens": 552.625, "epoch": 1.4554505999489404, "gen_logits_max": 3.744962692260742, "gen_logits_mean": -16.572877883911133, "gen_logits_min": -28.43480682373047, "gen_logits_std": 3.1719181537628174, "gen_loss": 0.2681695818901062, "grad_norm": 0.3456510567916284, "learning_rate": 2.192842105263158e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9971112161874771, "mean_gen_accuracy": 0.8812494277954102, "mean_token_accuracy": 0.9120226800441742, "num_tokens": 848080546.0, "sample_num_tokens": 10252.5, "step": 7126, "total_num_tokens": 848121556.0, "z_loss": 0.0006308567244559526 }, { "copy_logits_max": -0.895400881767273, "copy_logits_min": -750000000.0, "copy_num_tokens": 642.0625, "epoch": 1.4556548378861374, "gen_logits_max": 4.168277740478516, "gen_logits_mean": -14.26740837097168, "gen_logits_min": -26.25714874267578, "gen_logits_std": 3.106678009033203, "gen_loss": 0.2159728705883026, "grad_norm": 0.36550169053683584, "learning_rate": 2.192715789473684e-05, "loss": 0.2534, "mean_copy_accuracy": 0.9977655708789825, "mean_gen_accuracy": 0.8823288530111313, "mean_token_accuracy": 0.9132494181394577, "num_tokens": 848358445.0, "sample_num_tokens": 9103.75, "step": 7127, "total_num_tokens": 848394860.0, "z_loss": 0.0004857457533944398 }, { "copy_logits_max": -4.145981788635254, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.75, "epoch": 1.455859075823334, "gen_logits_max": 5.230588912963867, "gen_logits_mean": -14.631512641906738, "gen_logits_min": -26.3175106048584, "gen_logits_std": 3.094404697418213, "gen_loss": 0.31680452823638916, "grad_norm": 0.3751024561204364, "learning_rate": 2.1925894736842106e-05, "loss": 0.29, "mean_copy_accuracy": 0.9956336170434952, "mean_gen_accuracy": 0.8737767189741135, "mean_token_accuracy": 0.9020587503910065, "num_tokens": 848617548.0, "sample_num_tokens": 7567.0, "step": 7128, "total_num_tokens": 848647816.0, "z_loss": 0.0005582158919423819 }, { "copy_logits_max": -1.3822568655014038, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.9375, "epoch": 1.456063313760531, "gen_logits_max": 3.7254409790039062, "gen_logits_mean": -16.562185287475586, "gen_logits_min": -28.429397583007812, "gen_logits_std": 3.15958833694458, "gen_loss": 0.30793389678001404, "grad_norm": 0.3470005134129127, "learning_rate": 2.192463157894737e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9962989240884781, "mean_gen_accuracy": 0.867752805352211, "mean_token_accuracy": 0.9028447419404984, "num_tokens": 848891933.0, "sample_num_tokens": 8217.25, "step": 7129, "total_num_tokens": 848924802.0, "z_loss": 0.0005575432442128658 }, { "copy_logits_max": -0.5560693740844727, "copy_logits_min": -750000064.0, "copy_num_tokens": 555.0, "epoch": 1.456267551697728, "gen_logits_max": 3.853848695755005, "gen_logits_mean": -15.434752464294434, "gen_logits_min": -27.381832122802734, "gen_logits_std": 3.163734197616577, "gen_loss": 0.253844678401947, "grad_norm": 0.370210343460267, "learning_rate": 2.1923368421052634e-05, "loss": 0.2519, "mean_copy_accuracy": 0.9950993359088898, "mean_gen_accuracy": 0.8836924731731415, "mean_token_accuracy": 0.9144146144390106, "num_tokens": 849175528.0, "sample_num_tokens": 8751.0, "step": 7130, "total_num_tokens": 849210532.0, "z_loss": 0.0004846459487453103 }, { "copy_logits_max": -5.423893451690674, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.875, "epoch": 1.4564717896349246, "gen_logits_max": 3.7345337867736816, "gen_logits_mean": -18.143280029296875, "gen_logits_min": -29.66087532043457, "gen_logits_std": 3.200343608856201, "gen_loss": 0.27354270219802856, "grad_norm": 0.3346755334344836, "learning_rate": 2.1922105263157895e-05, "loss": 0.2556, "mean_copy_accuracy": 0.9968069642782211, "mean_gen_accuracy": 0.8842241019010544, "mean_token_accuracy": 0.9130074232816696, "num_tokens": 849452722.0, "sample_num_tokens": 8867.5, "step": 7131, "total_num_tokens": 849488192.0, "z_loss": 0.0005061699775978923 }, { "copy_logits_max": -1.2995450496673584, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.5, "epoch": 1.4566760275721216, "gen_logits_max": 4.047048568725586, "gen_logits_mean": -16.574731826782227, "gen_logits_min": -28.27691650390625, "gen_logits_std": 3.1824402809143066, "gen_loss": 0.2829189896583557, "grad_norm": 0.3664469595597972, "learning_rate": 2.192084210526316e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9974082112312317, "mean_gen_accuracy": 0.8786070942878723, "mean_token_accuracy": 0.9104248881340027, "num_tokens": 849736118.0, "sample_num_tokens": 7074.0, "step": 7132, "total_num_tokens": 849764414.0, "z_loss": 0.0005312110297381878 }, { "copy_logits_max": -0.4657495617866516, "copy_logits_min": -687500032.0, "copy_num_tokens": 635.25, "epoch": 1.4568802655093185, "gen_logits_max": 3.969918966293335, "gen_logits_mean": -15.319999694824219, "gen_logits_min": -27.40039825439453, "gen_logits_std": 3.142836570739746, "gen_loss": 0.2648337483406067, "grad_norm": 0.36866571542447085, "learning_rate": 2.191957894736842e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9964683353900909, "mean_gen_accuracy": 0.8792262077331543, "mean_token_accuracy": 0.9098152071237564, "num_tokens": 849999558.0, "sample_num_tokens": 9281.0, "step": 7133, "total_num_tokens": 850036682.0, "z_loss": 0.0005253264680504799 }, { "copy_logits_max": -5.827280044555664, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.8125, "epoch": 1.4570845034465152, "gen_logits_max": 3.426145315170288, "gen_logits_mean": -16.693506240844727, "gen_logits_min": -28.569156646728516, "gen_logits_std": 3.207319974899292, "gen_loss": 0.24939733743667603, "grad_norm": 0.36303647400111244, "learning_rate": 2.1918315789473685e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9964819550514221, "mean_gen_accuracy": 0.873854324221611, "mean_token_accuracy": 0.9058901965618134, "num_tokens": 850274331.0, "sample_num_tokens": 7710.25, "step": 7134, "total_num_tokens": 850305172.0, "z_loss": 0.00040771791827864945 }, { "copy_logits_max": -4.179566383361816, "copy_logits_min": -687500032.0, "copy_num_tokens": 320.4375, "epoch": 1.457288741383712, "gen_logits_max": 4.813884258270264, "gen_logits_mean": -15.479447364807129, "gen_logits_min": -27.329832077026367, "gen_logits_std": 3.1365904808044434, "gen_loss": 0.3026607930660248, "grad_norm": 0.3471008652296804, "learning_rate": 2.1917052631578946e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9956378489732742, "mean_gen_accuracy": 0.871613085269928, "mean_token_accuracy": 0.9034290164709091, "num_tokens": 850542992.0, "sample_num_tokens": 6909.5, "step": 7135, "total_num_tokens": 850570630.0, "z_loss": 0.0004976079799234867 }, { "copy_logits_max": 0.13917982578277588, "copy_logits_min": -687500032.0, "copy_num_tokens": 523.5, "epoch": 1.4574929793209088, "gen_logits_max": 3.2230868339538574, "gen_logits_mean": -16.355545043945312, "gen_logits_min": -28.865638732910156, "gen_logits_std": 3.2011990547180176, "gen_loss": 0.24132275581359863, "grad_norm": 0.3868591874603992, "learning_rate": 2.191578947368421e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9943865537643433, "mean_gen_accuracy": 0.8785114288330078, "mean_token_accuracy": 0.9053614735603333, "num_tokens": 850811744.0, "sample_num_tokens": 8175.0, "step": 7136, "total_num_tokens": 850844444.0, "z_loss": 0.00046683987602591515 }, { "copy_logits_max": -2.8305766582489014, "copy_logits_min": -750000000.0, "copy_num_tokens": 821.5, "epoch": 1.4576972172581057, "gen_logits_max": 2.424239158630371, "gen_logits_mean": -16.76449203491211, "gen_logits_min": -28.927614212036133, "gen_logits_std": 3.1982240676879883, "gen_loss": 0.2491452395915985, "grad_norm": 0.372817068442675, "learning_rate": 2.191452631578947e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9962710291147232, "mean_gen_accuracy": 0.8747139275074005, "mean_token_accuracy": 0.9060812741518021, "num_tokens": 851089676.0, "sample_num_tokens": 10832.0, "step": 7137, "total_num_tokens": 851133004.0, "z_loss": 0.00043742862180806696 }, { "copy_logits_max": -5.1718316078186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.625, "epoch": 1.4579014551953025, "gen_logits_max": 4.213569641113281, "gen_logits_mean": -16.471155166625977, "gen_logits_min": -28.45532989501953, "gen_logits_std": 3.168588638305664, "gen_loss": 0.29621726274490356, "grad_norm": 0.3549375893321675, "learning_rate": 2.191326315789474e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9972081333398819, "mean_gen_accuracy": 0.8749447911977768, "mean_token_accuracy": 0.9010615646839142, "num_tokens": 851354661.0, "sample_num_tokens": 7173.25, "step": 7138, "total_num_tokens": 851383354.0, "z_loss": 0.0004957925411872566 }, { "copy_logits_max": -5.102085590362549, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.25, "epoch": 1.4581056931324994, "gen_logits_max": 3.6364879608154297, "gen_logits_mean": -16.368572235107422, "gen_logits_min": -28.382888793945312, "gen_logits_std": 3.2019567489624023, "gen_loss": 0.2785683274269104, "grad_norm": 0.37075247812954504, "learning_rate": 2.1912000000000003e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9976211786270142, "mean_gen_accuracy": 0.8784996122121811, "mean_token_accuracy": 0.9072298556566238, "num_tokens": 851628945.0, "sample_num_tokens": 8290.75, "step": 7139, "total_num_tokens": 851662108.0, "z_loss": 0.00047919852659106255 }, { "copy_logits_max": -6.984111309051514, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.1875, "epoch": 1.4583099310696963, "gen_logits_max": 4.405139923095703, "gen_logits_mean": -16.49359130859375, "gen_logits_min": -28.613941192626953, "gen_logits_std": 3.1774885654449463, "gen_loss": 0.3089798092842102, "grad_norm": 0.39687526313028526, "learning_rate": 2.1910736842105264e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9966313391923904, "mean_gen_accuracy": 0.8701785355806351, "mean_token_accuracy": 0.9009933620691299, "num_tokens": 851905722.0, "sample_num_tokens": 7739.5, "step": 7140, "total_num_tokens": 851936680.0, "z_loss": 0.00047599594108760357 }, { "copy_logits_max": -4.9986982345581055, "copy_logits_min": -750000000.0, "copy_num_tokens": 800.875, "epoch": 1.458514169006893, "gen_logits_max": 3.6970739364624023, "gen_logits_mean": -15.312434196472168, "gen_logits_min": -27.661603927612305, "gen_logits_std": 3.1547465324401855, "gen_loss": 0.24111321568489075, "grad_norm": 0.3840242231308131, "learning_rate": 2.1909473684210528e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9957357496023178, "mean_gen_accuracy": 0.8688730746507645, "mean_token_accuracy": 0.8997039943933487, "num_tokens": 852180828.0, "sample_num_tokens": 11289.5, "step": 7141, "total_num_tokens": 852225986.0, "z_loss": 0.0003782019775826484 }, { "copy_logits_max": -3.8206310272216797, "copy_logits_min": -750000000.0, "copy_num_tokens": 665.5, "epoch": 1.45871840694409, "gen_logits_max": 2.7695937156677246, "gen_logits_mean": -16.945411682128906, "gen_logits_min": -28.94217872619629, "gen_logits_std": 3.1822097301483154, "gen_loss": 0.26590609550476074, "grad_norm": 0.36958622430805316, "learning_rate": 2.190821052631579e-05, "loss": 0.2714, "mean_copy_accuracy": 0.996415302157402, "mean_gen_accuracy": 0.8731729686260223, "mean_token_accuracy": 0.9064857065677643, "num_tokens": 852485026.0, "sample_num_tokens": 10208.5, "step": 7142, "total_num_tokens": 852525860.0, "z_loss": 0.0004331777454353869 }, { "copy_logits_max": -6.456174850463867, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.0625, "epoch": 1.4589226448812866, "gen_logits_max": 3.6431686878204346, "gen_logits_mean": -16.792728424072266, "gen_logits_min": -28.523548126220703, "gen_logits_std": 3.159813165664673, "gen_loss": 0.27485448122024536, "grad_norm": 0.34634360662892716, "learning_rate": 2.1906947368421053e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9964485317468643, "mean_gen_accuracy": 0.8785600960254669, "mean_token_accuracy": 0.9055878669023514, "num_tokens": 852757877.0, "sample_num_tokens": 7388.75, "step": 7143, "total_num_tokens": 852787432.0, "z_loss": 0.00047325988998636603 }, { "copy_logits_max": -4.699202537536621, "copy_logits_min": -687500032.0, "copy_num_tokens": 422.75, "epoch": 1.4591268828184836, "gen_logits_max": 2.8114960193634033, "gen_logits_mean": -18.570819854736328, "gen_logits_min": -30.460899353027344, "gen_logits_std": 3.249879837036133, "gen_loss": 0.2841183841228485, "grad_norm": 0.3431220371994898, "learning_rate": 2.1905684210526314e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9972791373729706, "mean_gen_accuracy": 0.8739347606897354, "mean_token_accuracy": 0.906515970826149, "num_tokens": 853022254.0, "sample_num_tokens": 7520.5, "step": 7144, "total_num_tokens": 853052336.0, "z_loss": 0.0005473042256198823 }, { "copy_logits_max": -3.593954086303711, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.0625, "epoch": 1.4593311207556803, "gen_logits_max": 3.871285915374756, "gen_logits_mean": -16.19406509399414, "gen_logits_min": -28.30221939086914, "gen_logits_std": 3.1490063667297363, "gen_loss": 0.2983447313308716, "grad_norm": 0.3605833689936807, "learning_rate": 2.190442105263158e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9972244203090668, "mean_gen_accuracy": 0.8720743954181671, "mean_token_accuracy": 0.9011993706226349, "num_tokens": 853319601.0, "sample_num_tokens": 9194.75, "step": 7145, "total_num_tokens": 853356380.0, "z_loss": 0.000543144647963345 }, { "copy_logits_max": -6.952727317810059, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.25, "epoch": 1.4595353586928772, "gen_logits_max": 4.8434600830078125, "gen_logits_mean": -15.760238647460938, "gen_logits_min": -27.371471405029297, "gen_logits_std": 3.137721061706543, "gen_loss": 0.3682324290275574, "grad_norm": 0.3774372264581118, "learning_rate": 2.1903157894736843e-05, "loss": 0.3091, "mean_copy_accuracy": 0.9966727793216705, "mean_gen_accuracy": 0.8662378638982773, "mean_token_accuracy": 0.8942891955375671, "num_tokens": 853577682.0, "sample_num_tokens": 8222.0, "step": 7146, "total_num_tokens": 853610570.0, "z_loss": 0.0006089579546824098 }, { "copy_logits_max": -5.201974868774414, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.75, "epoch": 1.4597395966300741, "gen_logits_max": 3.0041189193725586, "gen_logits_mean": -16.416208267211914, "gen_logits_min": -28.84143829345703, "gen_logits_std": 3.190077543258667, "gen_loss": 0.2712346315383911, "grad_norm": 0.35519805860112424, "learning_rate": 2.1901894736842107e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9956320524215698, "mean_gen_accuracy": 0.874737411737442, "mean_token_accuracy": 0.9066799879074097, "num_tokens": 853849175.0, "sample_num_tokens": 8741.25, "step": 7147, "total_num_tokens": 853884140.0, "z_loss": 0.0004924478125758469 }, { "copy_logits_max": -6.552600860595703, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5, "epoch": 1.4599438345672708, "gen_logits_max": 3.9692869186401367, "gen_logits_mean": -15.404646873474121, "gen_logits_min": -27.254091262817383, "gen_logits_std": 3.171704053878784, "gen_loss": 0.27237454056739807, "grad_norm": 0.38277726247714006, "learning_rate": 2.1900631578947368e-05, "loss": 0.2881, "mean_copy_accuracy": 0.995370477437973, "mean_gen_accuracy": 0.8774422109127045, "mean_token_accuracy": 0.9027123749256134, "num_tokens": 854128019.0, "sample_num_tokens": 9023.25, "step": 7148, "total_num_tokens": 854164112.0, "z_loss": 0.00047448804252780974 }, { "copy_logits_max": -7.264726161956787, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.5625, "epoch": 1.4601480725044678, "gen_logits_max": 3.6823363304138184, "gen_logits_mean": -17.031667709350586, "gen_logits_min": -28.934017181396484, "gen_logits_std": 3.1968865394592285, "gen_loss": 0.2940503656864166, "grad_norm": 0.3942165939712666, "learning_rate": 2.1899368421052633e-05, "loss": 0.2613, "mean_copy_accuracy": 0.9961590617895126, "mean_gen_accuracy": 0.8820713013410568, "mean_token_accuracy": 0.9091409146785736, "num_tokens": 854401867.0, "sample_num_tokens": 8654.25, "step": 7149, "total_num_tokens": 854436484.0, "z_loss": 0.00048290425911545753 }, { "copy_logits_max": -6.3347392082214355, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.5625, "epoch": 1.4603523104416645, "gen_logits_max": 2.9391915798187256, "gen_logits_mean": -17.769214630126953, "gen_logits_min": -29.226863861083984, "gen_logits_std": 3.2080249786376953, "gen_loss": 0.2873286008834839, "grad_norm": 0.36492572240273446, "learning_rate": 2.1898105263157893e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9964325129985809, "mean_gen_accuracy": 0.8744871616363525, "mean_token_accuracy": 0.9040791541337967, "num_tokens": 854669832.0, "sample_num_tokens": 9301.5, "step": 7150, "total_num_tokens": 854707038.0, "z_loss": 0.0004780609451700002 }, { "copy_logits_max": -7.91750431060791, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.1875, "epoch": 1.4605565483788614, "gen_logits_max": 3.990391254425049, "gen_logits_mean": -16.307796478271484, "gen_logits_min": -28.064952850341797, "gen_logits_std": 3.1662962436676025, "gen_loss": 0.269598126411438, "grad_norm": 0.32980783156969806, "learning_rate": 2.1896842105263158e-05, "loss": 0.2845, "mean_copy_accuracy": 0.996818944811821, "mean_gen_accuracy": 0.8750187009572983, "mean_token_accuracy": 0.9038197994232178, "num_tokens": 854949313.0, "sample_num_tokens": 8601.25, "step": 7151, "total_num_tokens": 854983718.0, "z_loss": 0.0004656695236917585 }, { "copy_logits_max": -6.739861488342285, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.3125, "epoch": 1.460760786316058, "gen_logits_max": 2.8158199787139893, "gen_logits_mean": -17.583873748779297, "gen_logits_min": -29.64093017578125, "gen_logits_std": 3.235187530517578, "gen_loss": 0.2417142242193222, "grad_norm": 0.3822054229663913, "learning_rate": 2.1895578947368422e-05, "loss": 0.266, "mean_copy_accuracy": 0.9962266385555267, "mean_gen_accuracy": 0.8760248273611069, "mean_token_accuracy": 0.9109153747558594, "num_tokens": 855209063.0, "sample_num_tokens": 8802.25, "step": 7152, "total_num_tokens": 855244272.0, "z_loss": 0.00040448427898809314 }, { "copy_logits_max": -7.046379089355469, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.5625, "epoch": 1.460965024253255, "gen_logits_max": 3.438870429992676, "gen_logits_mean": -17.740562438964844, "gen_logits_min": -29.663021087646484, "gen_logits_std": 3.220174789428711, "gen_loss": 0.2847849130630493, "grad_norm": 0.3460052824889351, "learning_rate": 2.1894315789473683e-05, "loss": 0.269, "mean_copy_accuracy": 0.9967571496963501, "mean_gen_accuracy": 0.8774015456438065, "mean_token_accuracy": 0.9094044715166092, "num_tokens": 855492845.0, "sample_num_tokens": 8876.75, "step": 7153, "total_num_tokens": 855528352.0, "z_loss": 0.0004525157855823636 }, { "copy_logits_max": -7.2333149909973145, "copy_logits_min": -687500096.0, "copy_num_tokens": 413.25, "epoch": 1.461169262190452, "gen_logits_max": 3.1572842597961426, "gen_logits_mean": -18.631168365478516, "gen_logits_min": -30.690073013305664, "gen_logits_std": 3.2657017707824707, "gen_loss": 0.27967190742492676, "grad_norm": 0.374611481975116, "learning_rate": 2.189305263157895e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9970085173845291, "mean_gen_accuracy": 0.8761798590421677, "mean_token_accuracy": 0.907664105296135, "num_tokens": 855776533.0, "sample_num_tokens": 8510.75, "step": 7154, "total_num_tokens": 855810576.0, "z_loss": 0.0004710737557616085 }, { "copy_logits_max": -8.148213386535645, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.875, "epoch": 1.4613735001276487, "gen_logits_max": 5.37677001953125, "gen_logits_mean": -13.936273574829102, "gen_logits_min": -25.868309020996094, "gen_logits_std": 3.1172385215759277, "gen_loss": 0.2721085250377655, "grad_norm": 0.3630591239006519, "learning_rate": 2.189178947368421e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9964570105075836, "mean_gen_accuracy": 0.8771742433309555, "mean_token_accuracy": 0.9050203263759613, "num_tokens": 856046814.0, "sample_num_tokens": 7531.5, "step": 7155, "total_num_tokens": 856076940.0, "z_loss": 0.00045885040890425444 }, { "copy_logits_max": -6.772821426391602, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.375, "epoch": 1.4615777380648456, "gen_logits_max": 5.141963958740234, "gen_logits_mean": -14.77684211730957, "gen_logits_min": -27.129907608032227, "gen_logits_std": 3.1474485397338867, "gen_loss": 0.3034093379974365, "grad_norm": 0.39744193143401285, "learning_rate": 2.1890526315789476e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9969637244939804, "mean_gen_accuracy": 0.8742382675409317, "mean_token_accuracy": 0.9048337489366531, "num_tokens": 856306618.0, "sample_num_tokens": 8254.5, "step": 7156, "total_num_tokens": 856339636.0, "z_loss": 0.0004924056120216846 }, { "copy_logits_max": -6.433223724365234, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.0, "epoch": 1.4617819760020425, "gen_logits_max": 2.692302942276001, "gen_logits_mean": -17.879201889038086, "gen_logits_min": -30.156435012817383, "gen_logits_std": 3.2574174404144287, "gen_loss": 0.25502699613571167, "grad_norm": 0.37294677591131536, "learning_rate": 2.1889263157894737e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9962030053138733, "mean_gen_accuracy": 0.8747177422046661, "mean_token_accuracy": 0.9072845131158829, "num_tokens": 856590063.0, "sample_num_tokens": 8029.75, "step": 7157, "total_num_tokens": 856622182.0, "z_loss": 0.00043017667485401034 }, { "copy_logits_max": -5.8419952392578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 730.4375, "epoch": 1.4619862139392392, "gen_logits_max": 2.0991153717041016, "gen_logits_mean": -18.139915466308594, "gen_logits_min": -30.823705673217773, "gen_logits_std": 3.274454355239868, "gen_loss": 0.24302887916564941, "grad_norm": 0.3745310669860469, "learning_rate": 2.1888e-05, "loss": 0.2575, "mean_copy_accuracy": 0.996014341711998, "mean_gen_accuracy": 0.8840410113334656, "mean_token_accuracy": 0.9118975698947906, "num_tokens": 856835367.0, "sample_num_tokens": 9831.25, "step": 7158, "total_num_tokens": 856874692.0, "z_loss": 0.00040043145418167114 }, { "copy_logits_max": -5.787653923034668, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.4375, "epoch": 1.462190451876436, "gen_logits_max": 3.3646695613861084, "gen_logits_mean": -16.037643432617188, "gen_logits_min": -28.263870239257812, "gen_logits_std": 3.2172763347625732, "gen_loss": 0.2688966691493988, "grad_norm": 0.342126047976232, "learning_rate": 2.1886736842105262e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9973983913660049, "mean_gen_accuracy": 0.8731317818164825, "mean_token_accuracy": 0.9069121032953262, "num_tokens": 857120095.0, "sample_num_tokens": 9477.25, "step": 7159, "total_num_tokens": 857158004.0, "z_loss": 0.00048397958744317293 }, { "copy_logits_max": -7.190093040466309, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.0, "epoch": 1.4623946898136329, "gen_logits_max": 4.264158725738525, "gen_logits_mean": -17.143512725830078, "gen_logits_min": -29.072158813476562, "gen_logits_std": 3.210425615310669, "gen_loss": 0.28440266847610474, "grad_norm": 0.3768320477755831, "learning_rate": 2.1885473684210526e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9959776848554611, "mean_gen_accuracy": 0.8820785731077194, "mean_token_accuracy": 0.9058988243341446, "num_tokens": 857369987.0, "sample_num_tokens": 8051.75, "step": 7160, "total_num_tokens": 857402194.0, "z_loss": 0.00047134043416008353 }, { "copy_logits_max": -5.428380966186523, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.25, "epoch": 1.4625989277508298, "gen_logits_max": 4.260241985321045, "gen_logits_mean": -15.581428527832031, "gen_logits_min": -27.506704330444336, "gen_logits_std": 3.1720945835113525, "gen_loss": 0.3135020136833191, "grad_norm": 0.36632793867497915, "learning_rate": 2.1884210526315787e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9972935020923615, "mean_gen_accuracy": 0.8688914775848389, "mean_token_accuracy": 0.9004826247692108, "num_tokens": 857649477.0, "sample_num_tokens": 8779.25, "step": 7161, "total_num_tokens": 857684594.0, "z_loss": 0.0005173973040655255 }, { "copy_logits_max": -6.299764633178711, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.0625, "epoch": 1.4628031656880265, "gen_logits_max": 4.151086807250977, "gen_logits_mean": -16.84359359741211, "gen_logits_min": -28.921058654785156, "gen_logits_std": 3.2085652351379395, "gen_loss": 0.24631045758724213, "grad_norm": 0.3874765318517105, "learning_rate": 2.1882947368421055e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9963644444942474, "mean_gen_accuracy": 0.8744401186704636, "mean_token_accuracy": 0.9067338407039642, "num_tokens": 857935220.0, "sample_num_tokens": 7375.5, "step": 7162, "total_num_tokens": 857964722.0, "z_loss": 0.0004363320185802877 }, { "copy_logits_max": -1.7862242460250854, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.75, "epoch": 1.4630074036252234, "gen_logits_max": 4.086644649505615, "gen_logits_mean": -14.405176162719727, "gen_logits_min": -26.23463249206543, "gen_logits_std": 3.103344440460205, "gen_loss": 0.28979867696762085, "grad_norm": 0.42346608159700916, "learning_rate": 2.1881684210526316e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9965050369501114, "mean_gen_accuracy": 0.8737713843584061, "mean_token_accuracy": 0.9030310958623886, "num_tokens": 858212865.0, "sample_num_tokens": 8049.25, "step": 7163, "total_num_tokens": 858245062.0, "z_loss": 0.0005003171390853822 }, { "copy_logits_max": -3.699789047241211, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.125, "epoch": 1.4632116415624203, "gen_logits_max": 3.1778793334960938, "gen_logits_mean": -17.01944351196289, "gen_logits_min": -28.905563354492188, "gen_logits_std": 3.1883533000946045, "gen_loss": 0.30136895179748535, "grad_norm": 0.34943973587939187, "learning_rate": 2.188042105263158e-05, "loss": 0.262, "mean_copy_accuracy": 0.997007742524147, "mean_gen_accuracy": 0.8770825862884521, "mean_token_accuracy": 0.9100129902362823, "num_tokens": 858491095.0, "sample_num_tokens": 9098.75, "step": 7164, "total_num_tokens": 858527490.0, "z_loss": 0.0005503326538018882 }, { "copy_logits_max": -2.189124584197998, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.0, "epoch": 1.463415879499617, "gen_logits_max": 4.19787073135376, "gen_logits_mean": -15.799800872802734, "gen_logits_min": -27.827373504638672, "gen_logits_std": 3.1066060066223145, "gen_loss": 0.33656442165374756, "grad_norm": 0.3520776042544846, "learning_rate": 2.1879157894736845e-05, "loss": 0.285, "mean_copy_accuracy": 0.9966670721769333, "mean_gen_accuracy": 0.8723539412021637, "mean_token_accuracy": 0.9030163437128067, "num_tokens": 858770968.0, "sample_num_tokens": 8770.5, "step": 7165, "total_num_tokens": 858806050.0, "z_loss": 0.0006028881762176752 }, { "copy_logits_max": -1.981132984161377, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.25, "epoch": 1.4636201174368138, "gen_logits_max": 3.596658706665039, "gen_logits_mean": -15.82205867767334, "gen_logits_min": -27.98575210571289, "gen_logits_std": 3.133646011352539, "gen_loss": 0.26423701643943787, "grad_norm": 0.38797613032338213, "learning_rate": 2.1877894736842106e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9974408745765686, "mean_gen_accuracy": 0.8778492212295532, "mean_token_accuracy": 0.9079077690839767, "num_tokens": 859038644.0, "sample_num_tokens": 9596.5, "step": 7166, "total_num_tokens": 859077030.0, "z_loss": 0.00045643263729289174 }, { "copy_logits_max": -4.060105800628662, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.75, "epoch": 1.4638243553740107, "gen_logits_max": 3.438537120819092, "gen_logits_mean": -17.13848876953125, "gen_logits_min": -29.170269012451172, "gen_logits_std": 3.2027881145477295, "gen_loss": 0.26083189249038696, "grad_norm": 0.5364593334290441, "learning_rate": 2.187663157894737e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9969986528158188, "mean_gen_accuracy": 0.8781758695840836, "mean_token_accuracy": 0.9067949801683426, "num_tokens": 859321220.0, "sample_num_tokens": 9377.5, "step": 7167, "total_num_tokens": 859358730.0, "z_loss": 0.000520469038747251 }, { "copy_logits_max": -2.932464361190796, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.9375, "epoch": 1.4640285933112076, "gen_logits_max": 5.346880912780762, "gen_logits_mean": -15.221199035644531, "gen_logits_min": -27.288578033447266, "gen_logits_std": 3.1619062423706055, "gen_loss": 0.25498872995376587, "grad_norm": 0.346394680452082, "learning_rate": 2.187536842105263e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9965876638889313, "mean_gen_accuracy": 0.8750816434621811, "mean_token_accuracy": 0.9049761295318604, "num_tokens": 859597834.0, "sample_num_tokens": 8060.5, "step": 7168, "total_num_tokens": 859630076.0, "z_loss": 0.00047995400382205844 }, { "copy_logits_max": -3.7104196548461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.5, "epoch": 1.4642328312484043, "gen_logits_max": 3.928816318511963, "gen_logits_mean": -16.48971176147461, "gen_logits_min": -28.157224655151367, "gen_logits_std": 3.1717333793640137, "gen_loss": 0.21823996305465698, "grad_norm": 0.3489367458792276, "learning_rate": 2.1874105263157895e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9954144656658173, "mean_gen_accuracy": 0.8832006305456161, "mean_token_accuracy": 0.9125725030899048, "num_tokens": 859871551.0, "sample_num_tokens": 7818.75, "step": 7169, "total_num_tokens": 859902826.0, "z_loss": 0.0004430054686963558 }, { "copy_logits_max": 1.1570851802825928, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.375, "epoch": 1.4644370691856012, "gen_logits_max": 4.527349472045898, "gen_logits_mean": -14.077295303344727, "gen_logits_min": -26.74211311340332, "gen_logits_std": 3.085726737976074, "gen_loss": 0.32390305399894714, "grad_norm": 0.3397825657423455, "learning_rate": 2.187284210526316e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9964367896318436, "mean_gen_accuracy": 0.8723773807287216, "mean_token_accuracy": 0.9035672396421432, "num_tokens": 860149746.0, "sample_num_tokens": 9540.5, "step": 7170, "total_num_tokens": 860187908.0, "z_loss": 0.0005214014672674239 }, { "copy_logits_max": -5.2291178703308105, "copy_logits_min": -625000064.0, "copy_num_tokens": 405.4375, "epoch": 1.4646413071227982, "gen_logits_max": 4.17728328704834, "gen_logits_mean": -16.696224212646484, "gen_logits_min": -28.820707321166992, "gen_logits_std": 3.1912147998809814, "gen_loss": 0.2981976568698883, "grad_norm": 0.3543748972210729, "learning_rate": 2.1871578947368424e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9968426674604416, "mean_gen_accuracy": 0.8724125474691391, "mean_token_accuracy": 0.9046857357025146, "num_tokens": 860438141.0, "sample_num_tokens": 7865.25, "step": 7171, "total_num_tokens": 860469602.0, "z_loss": 0.000559592735953629 }, { "copy_logits_max": -3.999404191970825, "copy_logits_min": -750000064.0, "copy_num_tokens": 484.875, "epoch": 1.4648455450599949, "gen_logits_max": 3.773749828338623, "gen_logits_mean": -16.380577087402344, "gen_logits_min": -28.432525634765625, "gen_logits_std": 3.187356472015381, "gen_loss": 0.2394108772277832, "grad_norm": 0.3427054753875192, "learning_rate": 2.1870315789473685e-05, "loss": 0.2351, "mean_copy_accuracy": 0.9978370517492294, "mean_gen_accuracy": 0.8902106732130051, "mean_token_accuracy": 0.9205836057662964, "num_tokens": 860733043.0, "sample_num_tokens": 8366.75, "step": 7172, "total_num_tokens": 860766510.0, "z_loss": 0.0004030624113511294 }, { "copy_logits_max": -4.181427478790283, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.3125, "epoch": 1.4650497829971918, "gen_logits_max": 4.235932350158691, "gen_logits_mean": -16.227739334106445, "gen_logits_min": -28.16904640197754, "gen_logits_std": 3.164721965789795, "gen_loss": 0.29757022857666016, "grad_norm": 0.3902983044266896, "learning_rate": 2.186905263157895e-05, "loss": 0.305, "mean_copy_accuracy": 0.9964619129896164, "mean_gen_accuracy": 0.8683361113071442, "mean_token_accuracy": 0.898468017578125, "num_tokens": 860991921.0, "sample_num_tokens": 7902.75, "step": 7173, "total_num_tokens": 861023532.0, "z_loss": 0.00045131606748327613 }, { "copy_logits_max": -5.472525596618652, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.75, "epoch": 1.4652540209343885, "gen_logits_max": 2.6210575103759766, "gen_logits_mean": -18.149816513061523, "gen_logits_min": -30.23354721069336, "gen_logits_std": 3.248569965362549, "gen_loss": 0.24733930826187134, "grad_norm": 0.34011807151763507, "learning_rate": 2.186778947368421e-05, "loss": 0.2573, "mean_copy_accuracy": 0.997110903263092, "mean_gen_accuracy": 0.8794198781251907, "mean_token_accuracy": 0.9125218987464905, "num_tokens": 861293653.0, "sample_num_tokens": 8507.25, "step": 7174, "total_num_tokens": 861327682.0, "z_loss": 0.0004181574040558189 }, { "copy_logits_max": -4.465576171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 572.6875, "epoch": 1.4654582588715854, "gen_logits_max": 3.7256593704223633, "gen_logits_mean": -16.22136116027832, "gen_logits_min": -28.645462036132812, "gen_logits_std": 3.2150466442108154, "gen_loss": 0.2591250538825989, "grad_norm": 0.3768252296249802, "learning_rate": 2.1866526315789474e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9953547120094299, "mean_gen_accuracy": 0.8742277771234512, "mean_token_accuracy": 0.9043940901756287, "num_tokens": 861546196.0, "sample_num_tokens": 8760.5, "step": 7175, "total_num_tokens": 861581238.0, "z_loss": 0.000468762475065887 }, { "copy_logits_max": -3.039233446121216, "copy_logits_min": -750000000.0, "copy_num_tokens": 583.1875, "epoch": 1.4656624968087821, "gen_logits_max": 3.2784385681152344, "gen_logits_mean": -16.271595001220703, "gen_logits_min": -28.326915740966797, "gen_logits_std": 3.1654624938964844, "gen_loss": 0.26492804288864136, "grad_norm": 0.3515533984889569, "learning_rate": 2.1865263157894735e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9974314570426941, "mean_gen_accuracy": 0.871872678399086, "mean_token_accuracy": 0.9051319807767868, "num_tokens": 861834402.0, "sample_num_tokens": 9518.5, "step": 7176, "total_num_tokens": 861872476.0, "z_loss": 0.00042079182458110154 }, { "copy_logits_max": -7.088111877441406, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.9375, "epoch": 1.465866734745979, "gen_logits_max": 4.184618949890137, "gen_logits_mean": -17.36888313293457, "gen_logits_min": -29.15873146057129, "gen_logits_std": 3.2056422233581543, "gen_loss": 0.28352904319763184, "grad_norm": 0.37258263099638933, "learning_rate": 2.1864e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9972005188465118, "mean_gen_accuracy": 0.8778282850980759, "mean_token_accuracy": 0.9093352258205414, "num_tokens": 862112134.0, "sample_num_tokens": 7346.5, "step": 7177, "total_num_tokens": 862141520.0, "z_loss": 0.0004170140018686652 }, { "copy_logits_max": -4.899871826171875, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.0625, "epoch": 1.466070972683176, "gen_logits_max": 2.7343051433563232, "gen_logits_mean": -18.195281982421875, "gen_logits_min": -30.339984893798828, "gen_logits_std": 3.234555959701538, "gen_loss": 0.2567678987979889, "grad_norm": 0.3783243639208511, "learning_rate": 2.1862736842105264e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9962103962898254, "mean_gen_accuracy": 0.8792722970247269, "mean_token_accuracy": 0.906325951218605, "num_tokens": 862364589.0, "sample_num_tokens": 7329.75, "step": 7178, "total_num_tokens": 862393908.0, "z_loss": 0.00040824117604643106 }, { "copy_logits_max": -1.8058652877807617, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.3125, "epoch": 1.4662752106203727, "gen_logits_max": 5.288154125213623, "gen_logits_mean": -14.467796325683594, "gen_logits_min": -26.371612548828125, "gen_logits_std": 3.0957109928131104, "gen_loss": 0.3019205331802368, "grad_norm": 0.3683846747021354, "learning_rate": 2.1861473684210528e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9970275312662125, "mean_gen_accuracy": 0.8816054165363312, "mean_token_accuracy": 0.9049167633056641, "num_tokens": 862609185.0, "sample_num_tokens": 8232.25, "step": 7179, "total_num_tokens": 862642114.0, "z_loss": 0.0004830642428714782 }, { "copy_logits_max": -3.459815263748169, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.75, "epoch": 1.4664794485575696, "gen_logits_max": 4.09952449798584, "gen_logits_mean": -15.741445541381836, "gen_logits_min": -27.695232391357422, "gen_logits_std": 3.15946364402771, "gen_loss": 0.29010242223739624, "grad_norm": 0.3818853214013798, "learning_rate": 2.1860210526315792e-05, "loss": 0.3042, "mean_copy_accuracy": 0.995602935552597, "mean_gen_accuracy": 0.8651556968688965, "mean_token_accuracy": 0.8974641412496567, "num_tokens": 862885900.0, "sample_num_tokens": 7394.5, "step": 7180, "total_num_tokens": 862915478.0, "z_loss": 0.0005198552389629185 }, { "copy_logits_max": -6.769122123718262, "copy_logits_min": -750000000.0, "copy_num_tokens": 228.0, "epoch": 1.4666836864947663, "gen_logits_max": 3.9638311862945557, "gen_logits_mean": -17.08920669555664, "gen_logits_min": -29.105953216552734, "gen_logits_std": 3.209120750427246, "gen_loss": 0.2752065658569336, "grad_norm": 0.4014484532667904, "learning_rate": 2.1858947368421053e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9939411282539368, "mean_gen_accuracy": 0.8783567547798157, "mean_token_accuracy": 0.9025434255599976, "num_tokens": 863127225.0, "sample_num_tokens": 7026.75, "step": 7181, "total_num_tokens": 863155332.0, "z_loss": 0.0004411425907164812 }, { "copy_logits_max": -6.599098205566406, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.5, "epoch": 1.4668879244319633, "gen_logits_max": 3.342376232147217, "gen_logits_mean": -17.325984954833984, "gen_logits_min": -28.590362548828125, "gen_logits_std": 3.1769819259643555, "gen_loss": 0.3005865514278412, "grad_norm": 0.3863219904742764, "learning_rate": 2.1857684210526318e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9969416558742523, "mean_gen_accuracy": 0.8819448053836823, "mean_token_accuracy": 0.9076409488916397, "num_tokens": 863390504.0, "sample_num_tokens": 8147.0, "step": 7182, "total_num_tokens": 863423092.0, "z_loss": 0.0004276731051504612 }, { "copy_logits_max": -6.319344520568848, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.9375, "epoch": 1.46709216236916, "gen_logits_max": 5.3870849609375, "gen_logits_mean": -14.222475051879883, "gen_logits_min": -25.97671127319336, "gen_logits_std": 3.102815628051758, "gen_loss": 0.2797178030014038, "grad_norm": 0.3394584421454008, "learning_rate": 2.185642105263158e-05, "loss": 0.2601, "mean_copy_accuracy": 0.997064396739006, "mean_gen_accuracy": 0.8826327621936798, "mean_token_accuracy": 0.9119459986686707, "num_tokens": 863664737.0, "sample_num_tokens": 8245.25, "step": 7183, "total_num_tokens": 863697718.0, "z_loss": 0.00044979419908486307 }, { "copy_logits_max": -6.360968112945557, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.75, "epoch": 1.4672964003063569, "gen_logits_max": 4.155326843261719, "gen_logits_mean": -16.22032928466797, "gen_logits_min": -27.925704956054688, "gen_logits_std": 3.185471296310425, "gen_loss": 0.30175501108169556, "grad_norm": 0.39475946572416293, "learning_rate": 2.1855157894736843e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9958408772945404, "mean_gen_accuracy": 0.8728371262550354, "mean_token_accuracy": 0.903130441904068, "num_tokens": 863922652.0, "sample_num_tokens": 7353.0, "step": 7184, "total_num_tokens": 863952064.0, "z_loss": 0.0004937004414387047 }, { "copy_logits_max": -6.598445892333984, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.5625, "epoch": 1.4675006382435538, "gen_logits_max": 4.13016939163208, "gen_logits_mean": -15.043078422546387, "gen_logits_min": -27.28874397277832, "gen_logits_std": 3.1243605613708496, "gen_loss": 0.3024377226829529, "grad_norm": 0.376315362822574, "learning_rate": 2.1853894736842104e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9977589696645737, "mean_gen_accuracy": 0.8758621215820312, "mean_token_accuracy": 0.9079982489347458, "num_tokens": 864191780.0, "sample_num_tokens": 9519.5, "step": 7185, "total_num_tokens": 864229858.0, "z_loss": 0.0004532148886937648 }, { "copy_logits_max": -7.522264003753662, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.5625, "epoch": 1.4677048761807505, "gen_logits_max": 3.758899211883545, "gen_logits_mean": -17.35404396057129, "gen_logits_min": -29.2669734954834, "gen_logits_std": 3.1900506019592285, "gen_loss": 0.3116447925567627, "grad_norm": 0.39250380059219303, "learning_rate": 2.1852631578947368e-05, "loss": 0.304, "mean_copy_accuracy": 0.9953569620847702, "mean_gen_accuracy": 0.871379941701889, "mean_token_accuracy": 0.8961944580078125, "num_tokens": 864448437.0, "sample_num_tokens": 7402.25, "step": 7186, "total_num_tokens": 864478046.0, "z_loss": 0.0005289408727549016 }, { "copy_logits_max": -6.196316719055176, "copy_logits_min": -750000064.0, "copy_num_tokens": 425.4375, "epoch": 1.4679091141179474, "gen_logits_max": 3.292318344116211, "gen_logits_mean": -16.38652801513672, "gen_logits_min": -28.30084991455078, "gen_logits_std": 3.177562713623047, "gen_loss": 0.29044651985168457, "grad_norm": 0.3755789775587414, "learning_rate": 2.1851368421052632e-05, "loss": 0.2974, "mean_copy_accuracy": 0.9963957667350769, "mean_gen_accuracy": 0.8689423352479935, "mean_token_accuracy": 0.8977773636579514, "num_tokens": 864707996.0, "sample_num_tokens": 7633.0, "step": 7187, "total_num_tokens": 864738528.0, "z_loss": 0.0004547724383883178 }, { "copy_logits_max": -5.391858100891113, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.9375, "epoch": 1.4681133520551444, "gen_logits_max": 3.584343910217285, "gen_logits_mean": -17.026123046875, "gen_logits_min": -28.961929321289062, "gen_logits_std": 3.2055838108062744, "gen_loss": 0.26236385107040405, "grad_norm": 0.3456021147113024, "learning_rate": 2.1850105263157897e-05, "loss": 0.252, "mean_copy_accuracy": 0.9977882504463196, "mean_gen_accuracy": 0.8866184949874878, "mean_token_accuracy": 0.9162296503782272, "num_tokens": 865013173.0, "sample_num_tokens": 9208.75, "step": 7188, "total_num_tokens": 865050008.0, "z_loss": 0.0004846008960157633 }, { "copy_logits_max": -3.5226197242736816, "copy_logits_min": -687500032.0, "copy_num_tokens": 380.4375, "epoch": 1.468317589992341, "gen_logits_max": 4.612913608551025, "gen_logits_mean": -14.910733222961426, "gen_logits_min": -26.903167724609375, "gen_logits_std": 3.07930326461792, "gen_loss": 0.28715330362319946, "grad_norm": 0.37746250359612116, "learning_rate": 2.1848842105263158e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9962926208972931, "mean_gen_accuracy": 0.8701087981462479, "mean_token_accuracy": 0.9002507627010345, "num_tokens": 865262367.0, "sample_num_tokens": 7821.25, "step": 7189, "total_num_tokens": 865293652.0, "z_loss": 0.0005026025464758277 }, { "copy_logits_max": -6.349832534790039, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.3125, "epoch": 1.4685218279295378, "gen_logits_max": 2.722560405731201, "gen_logits_mean": -17.800617218017578, "gen_logits_min": -29.606632232666016, "gen_logits_std": 3.2155933380126953, "gen_loss": 0.25325068831443787, "grad_norm": 0.3819812213059587, "learning_rate": 2.1847578947368422e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9958344548940659, "mean_gen_accuracy": 0.882125973701477, "mean_token_accuracy": 0.9108992964029312, "num_tokens": 865536631.0, "sample_num_tokens": 7971.25, "step": 7190, "total_num_tokens": 865568516.0, "z_loss": 0.00046343618305400014 }, { "copy_logits_max": -5.729108810424805, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.75, "epoch": 1.4687260658667347, "gen_logits_max": 4.540396690368652, "gen_logits_mean": -15.162772178649902, "gen_logits_min": -27.350160598754883, "gen_logits_std": 3.145113468170166, "gen_loss": 0.26330503821372986, "grad_norm": 0.37227554963179543, "learning_rate": 2.1846315789473683e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9960255324840546, "mean_gen_accuracy": 0.8830273151397705, "mean_token_accuracy": 0.9069838523864746, "num_tokens": 865790188.0, "sample_num_tokens": 7699.5, "step": 7191, "total_num_tokens": 865820986.0, "z_loss": 0.0004665376618504524 }, { "copy_logits_max": -6.075483798980713, "copy_logits_min": -687500096.0, "copy_num_tokens": 464.125, "epoch": 1.4689303038039316, "gen_logits_max": 3.915268898010254, "gen_logits_mean": -16.277084350585938, "gen_logits_min": -27.769390106201172, "gen_logits_std": 3.156543731689453, "gen_loss": 0.27290698885917664, "grad_norm": 0.35262294915545467, "learning_rate": 2.1845052631578947e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9968817234039307, "mean_gen_accuracy": 0.8821068108081818, "mean_token_accuracy": 0.9109651744365692, "num_tokens": 866049390.0, "sample_num_tokens": 8657.5, "step": 7192, "total_num_tokens": 866084020.0, "z_loss": 0.0004879938205704093 }, { "copy_logits_max": -2.8093154430389404, "copy_logits_min": -750000000.0, "copy_num_tokens": 613.8125, "epoch": 1.4691345417411283, "gen_logits_max": 4.402774333953857, "gen_logits_mean": -14.732621192932129, "gen_logits_min": -28.144634246826172, "gen_logits_std": 3.160473346710205, "gen_loss": 0.27089834213256836, "grad_norm": 0.3560146142074569, "learning_rate": 2.184378947368421e-05, "loss": 0.274, "mean_copy_accuracy": 0.9968259781599045, "mean_gen_accuracy": 0.8788680583238602, "mean_token_accuracy": 0.9066311866044998, "num_tokens": 866332913.0, "sample_num_tokens": 9530.25, "step": 7193, "total_num_tokens": 866371034.0, "z_loss": 0.0005004468257538974 }, { "copy_logits_max": -6.948124408721924, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.0625, "epoch": 1.4693387796783253, "gen_logits_max": 3.860246181488037, "gen_logits_mean": -15.338565826416016, "gen_logits_min": -28.283679962158203, "gen_logits_std": 3.179079294204712, "gen_loss": 0.2876640558242798, "grad_norm": 0.3844814781349611, "learning_rate": 2.1842526315789472e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9964543730020523, "mean_gen_accuracy": 0.872727707028389, "mean_token_accuracy": 0.9037027209997177, "num_tokens": 866611578.0, "sample_num_tokens": 7474.0, "step": 7194, "total_num_tokens": 866641474.0, "z_loss": 0.0005290201515890658 }, { "copy_logits_max": -6.645277500152588, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.625, "epoch": 1.4695430176155222, "gen_logits_max": 4.547173500061035, "gen_logits_mean": -16.16029167175293, "gen_logits_min": -28.204105377197266, "gen_logits_std": 3.184807538986206, "gen_loss": 0.28330281376838684, "grad_norm": 0.3474026744635883, "learning_rate": 2.184126315789474e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9966192841529846, "mean_gen_accuracy": 0.8724812865257263, "mean_token_accuracy": 0.9040452092885971, "num_tokens": 866890289.0, "sample_num_tokens": 8855.25, "step": 7195, "total_num_tokens": 866925710.0, "z_loss": 0.0005447842995636165 }, { "copy_logits_max": -6.450552463531494, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.75, "epoch": 1.469747255552719, "gen_logits_max": 3.5162973403930664, "gen_logits_mean": -15.453312873840332, "gen_logits_min": -27.983104705810547, "gen_logits_std": 3.1955759525299072, "gen_loss": 0.24204228818416595, "grad_norm": 0.33713673187757864, "learning_rate": 2.184e-05, "loss": 0.2584, "mean_copy_accuracy": 0.998042032122612, "mean_gen_accuracy": 0.8794050514698029, "mean_token_accuracy": 0.9160457402467728, "num_tokens": 867191171.0, "sample_num_tokens": 8508.75, "step": 7196, "total_num_tokens": 867225206.0, "z_loss": 0.0004383088671602309 }, { "copy_logits_max": -7.462227821350098, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.75, "epoch": 1.4699514934899158, "gen_logits_max": 5.091691970825195, "gen_logits_mean": -15.692795753479004, "gen_logits_min": -28.364805221557617, "gen_logits_std": 3.1728384494781494, "gen_loss": 0.28403347730636597, "grad_norm": 0.3517248036690116, "learning_rate": 2.1838736842105265e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9973742514848709, "mean_gen_accuracy": 0.8735473901033401, "mean_token_accuracy": 0.9063757210969925, "num_tokens": 867483226.0, "sample_num_tokens": 9059.5, "step": 7197, "total_num_tokens": 867519464.0, "z_loss": 0.0005153286037966609 }, { "copy_logits_max": -5.804375648498535, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5625, "epoch": 1.4701557314271125, "gen_logits_max": 5.163506984710693, "gen_logits_mean": -13.90753173828125, "gen_logits_min": -26.19064712524414, "gen_logits_std": 3.1249053478240967, "gen_loss": 0.29152417182922363, "grad_norm": 0.3363337260819885, "learning_rate": 2.1837473684210526e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9971441477537155, "mean_gen_accuracy": 0.8755875080823898, "mean_token_accuracy": 0.9072342813014984, "num_tokens": 867772077.0, "sample_num_tokens": 7902.25, "step": 7198, "total_num_tokens": 867803686.0, "z_loss": 0.0005056045483797789 }, { "copy_logits_max": -4.803093433380127, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.875, "epoch": 1.4703599693643095, "gen_logits_max": 5.376550674438477, "gen_logits_mean": -15.445663452148438, "gen_logits_min": -27.807205200195312, "gen_logits_std": 3.1675305366516113, "gen_loss": 0.2843976616859436, "grad_norm": 0.37338427751565934, "learning_rate": 2.183621052631579e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9966672956943512, "mean_gen_accuracy": 0.8711848556995392, "mean_token_accuracy": 0.901651918888092, "num_tokens": 868018521.0, "sample_num_tokens": 8179.25, "step": 7199, "total_num_tokens": 868051238.0, "z_loss": 0.0005390665028244257 }, { "copy_logits_max": -6.516317844390869, "copy_logits_min": -500000064.0, "copy_num_tokens": 716.5, "epoch": 1.4705642073015062, "gen_logits_max": 1.8398945331573486, "gen_logits_mean": -18.748779296875, "gen_logits_min": -31.099245071411133, "gen_logits_std": 3.2801594734191895, "gen_loss": 0.2532760500907898, "grad_norm": 0.3637556669944528, "learning_rate": 2.183494736842105e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9967099726200104, "mean_gen_accuracy": 0.8772419691085815, "mean_token_accuracy": 0.9078431129455566, "num_tokens": 868270797.0, "sample_num_tokens": 9866.75, "step": 7200, "total_num_tokens": 868310264.0, "z_loss": 0.00043000149889849126 }, { "copy_logits_max": -5.736899375915527, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.75, "epoch": 1.470768445238703, "gen_logits_max": 3.5543625354766846, "gen_logits_mean": -15.951547622680664, "gen_logits_min": -27.96969223022461, "gen_logits_std": 3.178309917449951, "gen_loss": 0.29587727785110474, "grad_norm": 0.3271212522624672, "learning_rate": 2.1833684210526316e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9961892515420914, "mean_gen_accuracy": 0.8836992084980011, "mean_token_accuracy": 0.9125460088253021, "num_tokens": 868549918.0, "sample_num_tokens": 7703.5, "step": 7201, "total_num_tokens": 868580732.0, "z_loss": 0.0005336303147487342 }, { "copy_logits_max": -5.1138787269592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.375, "epoch": 1.4709726831759, "gen_logits_max": 4.393679141998291, "gen_logits_mean": -15.598543167114258, "gen_logits_min": -27.47570037841797, "gen_logits_std": 3.1482701301574707, "gen_loss": 0.2866745591163635, "grad_norm": 0.39260177068587754, "learning_rate": 2.1832421052631577e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9967259615659714, "mean_gen_accuracy": 0.8714839816093445, "mean_token_accuracy": 0.9029159694910049, "num_tokens": 868820502.0, "sample_num_tokens": 8185.5, "step": 7202, "total_num_tokens": 868853244.0, "z_loss": 0.0005168215720914304 }, { "copy_logits_max": -2.004903793334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.875, "epoch": 1.4711769211130967, "gen_logits_max": 4.369595527648926, "gen_logits_mean": -15.254093170166016, "gen_logits_min": -27.30913734436035, "gen_logits_std": 3.1501169204711914, "gen_loss": 0.2781226933002472, "grad_norm": 0.34501870982731886, "learning_rate": 2.1831157894736844e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9965634346008301, "mean_gen_accuracy": 0.8772998601198196, "mean_token_accuracy": 0.9056639969348907, "num_tokens": 869101978.0, "sample_num_tokens": 9649.0, "step": 7203, "total_num_tokens": 869140574.0, "z_loss": 0.0005162713932804763 }, { "copy_logits_max": -3.30256986618042, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 1.4713811590502937, "gen_logits_max": 4.630046844482422, "gen_logits_mean": -15.587284088134766, "gen_logits_min": -27.600452423095703, "gen_logits_std": 3.1497745513916016, "gen_loss": 0.30651697516441345, "grad_norm": 0.35724150183303066, "learning_rate": 2.1829894736842105e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9966556578874588, "mean_gen_accuracy": 0.8762487173080444, "mean_token_accuracy": 0.903403177857399, "num_tokens": 869376815.0, "sample_num_tokens": 8056.25, "step": 7204, "total_num_tokens": 869409040.0, "z_loss": 0.0005448347656056285 }, { "copy_logits_max": -0.6332699060440063, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.6875, "epoch": 1.4715853969874904, "gen_logits_max": 4.49298095703125, "gen_logits_mean": -14.940773010253906, "gen_logits_min": -27.00385284423828, "gen_logits_std": 3.160407543182373, "gen_loss": 0.2804371118545532, "grad_norm": 0.35367036361466764, "learning_rate": 2.182863157894737e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9972107410430908, "mean_gen_accuracy": 0.875075951218605, "mean_token_accuracy": 0.9061400890350342, "num_tokens": 869652107.0, "sample_num_tokens": 7997.25, "step": 7205, "total_num_tokens": 869684096.0, "z_loss": 0.00045179491280578077 }, { "copy_logits_max": -4.088373184204102, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.1875, "epoch": 1.4717896349246873, "gen_logits_max": 4.16385555267334, "gen_logits_mean": -17.299888610839844, "gen_logits_min": -29.07915687561035, "gen_logits_std": 3.191180467605591, "gen_loss": 0.31196093559265137, "grad_norm": 0.36840371455896737, "learning_rate": 2.1827368421052634e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9958567768335342, "mean_gen_accuracy": 0.8787883818149567, "mean_token_accuracy": 0.903766393661499, "num_tokens": 869912713.0, "sample_num_tokens": 7720.75, "step": 7206, "total_num_tokens": 869943596.0, "z_loss": 0.0005388202844187617 }, { "copy_logits_max": -3.2505953311920166, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.1875, "epoch": 1.471993872861884, "gen_logits_max": 3.4594674110412598, "gen_logits_mean": -17.164440155029297, "gen_logits_min": -28.77685546875, "gen_logits_std": 3.2251811027526855, "gen_loss": 0.2549516558647156, "grad_norm": 0.3968377055877588, "learning_rate": 2.1826105263157895e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9951088279485703, "mean_gen_accuracy": 0.873611330986023, "mean_token_accuracy": 0.8993369042873383, "num_tokens": 870161426.0, "sample_num_tokens": 9668.5, "step": 7207, "total_num_tokens": 870200100.0, "z_loss": 0.00044252717634662986 }, { "copy_logits_max": -1.0746502876281738, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.8125, "epoch": 1.472198110799081, "gen_logits_max": 2.4210476875305176, "gen_logits_mean": -17.915008544921875, "gen_logits_min": -29.95765495300293, "gen_logits_std": 3.266303062438965, "gen_loss": 0.285896897315979, "grad_norm": 0.340574198711284, "learning_rate": 2.182484210526316e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9971986413002014, "mean_gen_accuracy": 0.87300606071949, "mean_token_accuracy": 0.9040348082780838, "num_tokens": 870435827.0, "sample_num_tokens": 8943.25, "step": 7208, "total_num_tokens": 870471600.0, "z_loss": 0.00045749821583740413 }, { "copy_logits_max": -0.014774203300476074, "copy_logits_min": -750000064.0, "copy_num_tokens": 616.875, "epoch": 1.4724023487362778, "gen_logits_max": 3.488476276397705, "gen_logits_mean": -16.320213317871094, "gen_logits_min": -28.57892417907715, "gen_logits_std": 3.2439112663269043, "gen_loss": 0.2377922385931015, "grad_norm": 0.400386299019223, "learning_rate": 2.182357894736842e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9963169544935226, "mean_gen_accuracy": 0.8749140501022339, "mean_token_accuracy": 0.9007952064275742, "num_tokens": 870683904.0, "sample_num_tokens": 9390.0, "step": 7209, "total_num_tokens": 870721464.0, "z_loss": 0.0004113449831493199 }, { "copy_logits_max": 0.6492645740509033, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.8125, "epoch": 1.4726065866734745, "gen_logits_max": 4.293596267700195, "gen_logits_mean": -15.811307907104492, "gen_logits_min": -28.604076385498047, "gen_logits_std": 3.200305700302124, "gen_loss": 0.2808037996292114, "grad_norm": 0.3383296014536648, "learning_rate": 2.1822315789473684e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9965766072273254, "mean_gen_accuracy": 0.880205512046814, "mean_token_accuracy": 0.906919464468956, "num_tokens": 870963403.0, "sample_num_tokens": 8617.25, "step": 7210, "total_num_tokens": 870997872.0, "z_loss": 0.0004551057645585388 }, { "copy_logits_max": 1.126369833946228, "copy_logits_min": -750000000.0, "copy_num_tokens": 632.375, "epoch": 1.4728108246106715, "gen_logits_max": 4.50651741027832, "gen_logits_mean": -15.269933700561523, "gen_logits_min": -27.620698928833008, "gen_logits_std": 3.176323890686035, "gen_loss": 0.29199525713920593, "grad_norm": 0.3608907014327165, "learning_rate": 2.182105263157895e-05, "loss": 0.265, "mean_copy_accuracy": 0.9967457056045532, "mean_gen_accuracy": 0.8769358396530151, "mean_token_accuracy": 0.9105057716369629, "num_tokens": 871244000.0, "sample_num_tokens": 9978.0, "step": 7211, "total_num_tokens": 871283912.0, "z_loss": 0.0005133392405696213 }, { "copy_logits_max": 0.4047091603279114, "copy_logits_min": -625000000.0, "copy_num_tokens": 414.5625, "epoch": 1.4730150625478684, "gen_logits_max": 5.214175701141357, "gen_logits_mean": -15.005136489868164, "gen_logits_min": -26.944917678833008, "gen_logits_std": 3.1624181270599365, "gen_loss": 0.2762417197227478, "grad_norm": 0.3453128493448515, "learning_rate": 2.1819789473684213e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9961913973093033, "mean_gen_accuracy": 0.8772879838943481, "mean_token_accuracy": 0.9028539508581161, "num_tokens": 871515460.0, "sample_num_tokens": 8258.5, "step": 7212, "total_num_tokens": 871548494.0, "z_loss": 0.00047213619109243155 }, { "copy_logits_max": -2.322542667388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.9375, "epoch": 1.473219300485065, "gen_logits_max": 5.090869426727295, "gen_logits_mean": -16.21984100341797, "gen_logits_min": -28.139568328857422, "gen_logits_std": 3.1970605850219727, "gen_loss": 0.2781803011894226, "grad_norm": 0.3363832722291688, "learning_rate": 2.1818526315789474e-05, "loss": 0.2497, "mean_copy_accuracy": 0.9973269253969193, "mean_gen_accuracy": 0.886576384305954, "mean_token_accuracy": 0.9157175570726395, "num_tokens": 871791657.0, "sample_num_tokens": 7989.25, "step": 7213, "total_num_tokens": 871823614.0, "z_loss": 0.0004899096675217152 }, { "copy_logits_max": 3.252976417541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.875, "epoch": 1.4734235384222618, "gen_logits_max": 5.237612724304199, "gen_logits_mean": -13.601781845092773, "gen_logits_min": -26.523405075073242, "gen_logits_std": 3.1569690704345703, "gen_loss": 0.28080472350120544, "grad_norm": 0.35908720152014895, "learning_rate": 2.1817263157894738e-05, "loss": 0.256, "mean_copy_accuracy": 0.9977621138095856, "mean_gen_accuracy": 0.8786395043134689, "mean_token_accuracy": 0.9117996096611023, "num_tokens": 872039163.0, "sample_num_tokens": 7738.25, "step": 7214, "total_num_tokens": 872070116.0, "z_loss": 0.0004735948168672621 }, { "copy_logits_max": -0.3483656048774719, "copy_logits_min": -750000064.0, "copy_num_tokens": 418.5, "epoch": 1.4736277763594587, "gen_logits_max": 4.128934860229492, "gen_logits_mean": -16.254268646240234, "gen_logits_min": -28.360057830810547, "gen_logits_std": 3.200286388397217, "gen_loss": 0.2994975447654724, "grad_norm": 0.3850068417377335, "learning_rate": 2.1816e-05, "loss": 0.2754, "mean_copy_accuracy": 0.996064692735672, "mean_gen_accuracy": 0.8760653287172318, "mean_token_accuracy": 0.9046999961137772, "num_tokens": 872287656.0, "sample_num_tokens": 7824.5, "step": 7215, "total_num_tokens": 872318954.0, "z_loss": 0.0005339200724847615 }, { "copy_logits_max": 0.0019559860229492188, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.8125, "epoch": 1.4738320142966557, "gen_logits_max": 3.585108757019043, "gen_logits_mean": -15.95352554321289, "gen_logits_min": -28.646631240844727, "gen_logits_std": 3.22243595123291, "gen_loss": 0.2526180148124695, "grad_norm": 0.3745143821173598, "learning_rate": 2.1814736842105263e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9967413395643234, "mean_gen_accuracy": 0.8711735159158707, "mean_token_accuracy": 0.9010578095912933, "num_tokens": 872559419.0, "sample_num_tokens": 9332.25, "step": 7216, "total_num_tokens": 872596748.0, "z_loss": 0.000537021376658231 }, { "copy_logits_max": -1.4314494132995605, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.6875, "epoch": 1.4740362522338524, "gen_logits_max": 3.7487802505493164, "gen_logits_mean": -17.14137077331543, "gen_logits_min": -28.85904884338379, "gen_logits_std": 3.241300344467163, "gen_loss": 0.2316097915172577, "grad_norm": 0.35495720785948137, "learning_rate": 2.1813473684210524e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9967412501573563, "mean_gen_accuracy": 0.881421372294426, "mean_token_accuracy": 0.9051326215267181, "num_tokens": 872835945.0, "sample_num_tokens": 7568.75, "step": 7217, "total_num_tokens": 872866220.0, "z_loss": 0.000444532633991912 }, { "copy_logits_max": -2.1941685676574707, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.8125, "epoch": 1.4742404901710493, "gen_logits_max": 4.832151889801025, "gen_logits_mean": -14.922088623046875, "gen_logits_min": -27.258947372436523, "gen_logits_std": 3.170461654663086, "gen_loss": 0.2804938554763794, "grad_norm": 0.40917586136695044, "learning_rate": 2.181221052631579e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9972106963396072, "mean_gen_accuracy": 0.875706136226654, "mean_token_accuracy": 0.9052341282367706, "num_tokens": 873101490.0, "sample_num_tokens": 8868.5, "step": 7218, "total_num_tokens": 873136964.0, "z_loss": 0.0004740661825053394 }, { "copy_logits_max": -4.861711025238037, "copy_logits_min": -687500032.0, "copy_num_tokens": 577.75, "epoch": 1.4744447281082462, "gen_logits_max": 2.8926210403442383, "gen_logits_mean": -17.33983612060547, "gen_logits_min": -29.504547119140625, "gen_logits_std": 3.2421445846557617, "gen_loss": 0.2676127552986145, "grad_norm": 0.358429098700817, "learning_rate": 2.1810947368421056e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9975456595420837, "mean_gen_accuracy": 0.8789075762033463, "mean_token_accuracy": 0.9127029627561569, "num_tokens": 873379937.0, "sample_num_tokens": 8737.75, "step": 7219, "total_num_tokens": 873414888.0, "z_loss": 0.0004554516635835171 }, { "copy_logits_max": -3.536949634552002, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.25, "epoch": 1.474648966045443, "gen_logits_max": 3.522280216217041, "gen_logits_mean": -17.733604431152344, "gen_logits_min": -30.27775764465332, "gen_logits_std": 3.2347025871276855, "gen_loss": 0.32261666655540466, "grad_norm": 0.6126981084510104, "learning_rate": 2.1809684210526317e-05, "loss": 0.274, "mean_copy_accuracy": 0.9959583580493927, "mean_gen_accuracy": 0.8750630915164948, "mean_token_accuracy": 0.9089475870132446, "num_tokens": 873670231.0, "sample_num_tokens": 7972.25, "step": 7220, "total_num_tokens": 873702120.0, "z_loss": 0.0005212057731114328 }, { "copy_logits_max": -0.5577439069747925, "copy_logits_min": -750000064.0, "copy_num_tokens": 552.25, "epoch": 1.4748532039826396, "gen_logits_max": 3.762707471847534, "gen_logits_mean": -15.316866874694824, "gen_logits_min": -27.97502899169922, "gen_logits_std": 3.225918769836426, "gen_loss": 0.26483118534088135, "grad_norm": 0.39039745411735266, "learning_rate": 2.180842105263158e-05, "loss": 0.269, "mean_copy_accuracy": 0.9960552453994751, "mean_gen_accuracy": 0.8766647428274155, "mean_token_accuracy": 0.9072137176990509, "num_tokens": 873939226.0, "sample_num_tokens": 8244.5, "step": 7221, "total_num_tokens": 873972204.0, "z_loss": 0.00047544256085529923 }, { "copy_logits_max": 0.8301503658294678, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.875, "epoch": 1.4750574419198366, "gen_logits_max": 4.451170444488525, "gen_logits_mean": -15.352999687194824, "gen_logits_min": -27.660995483398438, "gen_logits_std": 3.19480562210083, "gen_loss": 0.2866916060447693, "grad_norm": 0.3981366999247471, "learning_rate": 2.1807157894736843e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9954996556043625, "mean_gen_accuracy": 0.8702109605073929, "mean_token_accuracy": 0.8984034359455109, "num_tokens": 874206643.0, "sample_num_tokens": 8642.25, "step": 7222, "total_num_tokens": 874241212.0, "z_loss": 0.0004978031502105296 }, { "copy_logits_max": -0.32249268889427185, "copy_logits_min": -687500032.0, "copy_num_tokens": 391.75, "epoch": 1.4752616798570335, "gen_logits_max": 3.694986343383789, "gen_logits_mean": -16.67098617553711, "gen_logits_min": -29.151824951171875, "gen_logits_std": 3.22540545463562, "gen_loss": 0.2782701849937439, "grad_norm": 0.3863479929069605, "learning_rate": 2.1805894736842107e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9969931989908218, "mean_gen_accuracy": 0.8754903972148895, "mean_token_accuracy": 0.9057058542966843, "num_tokens": 874461675.0, "sample_num_tokens": 7248.25, "step": 7223, "total_num_tokens": 874490668.0, "z_loss": 0.0005320272175595164 }, { "copy_logits_max": -0.850085973739624, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.4375, "epoch": 1.4754659177942302, "gen_logits_max": 4.517568588256836, "gen_logits_mean": -15.631420135498047, "gen_logits_min": -27.75525665283203, "gen_logits_std": 3.208667039871216, "gen_loss": 0.25207072496414185, "grad_norm": 0.3830683684997228, "learning_rate": 2.1804631578947368e-05, "loss": 0.2649, "mean_copy_accuracy": 0.996292382478714, "mean_gen_accuracy": 0.8816669881343842, "mean_token_accuracy": 0.9095395356416702, "num_tokens": 874715238.0, "sample_num_tokens": 8545.5, "step": 7224, "total_num_tokens": 874749420.0, "z_loss": 0.0005216104909777641 }, { "copy_logits_max": -2.1748876571655273, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.5, "epoch": 1.4756701557314271, "gen_logits_max": 2.6068906784057617, "gen_logits_mean": -17.75345230102539, "gen_logits_min": -29.876083374023438, "gen_logits_std": 3.275413751602173, "gen_loss": 0.2781882882118225, "grad_norm": 0.3590964804177304, "learning_rate": 2.1803368421052632e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9974326342344284, "mean_gen_accuracy": 0.8703577220439911, "mean_token_accuracy": 0.9060038030147552, "num_tokens": 874989132.0, "sample_num_tokens": 9132.0, "step": 7225, "total_num_tokens": 875025660.0, "z_loss": 0.000505334697663784 }, { "copy_logits_max": -4.109414100646973, "copy_logits_min": -687500032.0, "copy_num_tokens": 457.9375, "epoch": 1.475874393668624, "gen_logits_max": 3.8779492378234863, "gen_logits_mean": -16.61924934387207, "gen_logits_min": -28.63128662109375, "gen_logits_std": 3.2089896202087402, "gen_loss": 0.3127385675907135, "grad_norm": 0.3512271724229563, "learning_rate": 2.1802105263157893e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9959911704063416, "mean_gen_accuracy": 0.876896470785141, "mean_token_accuracy": 0.9034138917922974, "num_tokens": 875261407.0, "sample_num_tokens": 8193.75, "step": 7226, "total_num_tokens": 875294182.0, "z_loss": 0.0005618016002699733 }, { "copy_logits_max": -2.9166181087493896, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.3125, "epoch": 1.4760786316058208, "gen_logits_max": 4.6386871337890625, "gen_logits_mean": -14.370881080627441, "gen_logits_min": -26.937591552734375, "gen_logits_std": 3.1889753341674805, "gen_loss": 0.2680049240589142, "grad_norm": 0.3533435494071652, "learning_rate": 2.1800842105263157e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9962488263845444, "mean_gen_accuracy": 0.883815661072731, "mean_token_accuracy": 0.9128897041082382, "num_tokens": 875551611.0, "sample_num_tokens": 9140.25, "step": 7227, "total_num_tokens": 875588172.0, "z_loss": 0.0004399201716296375 }, { "copy_logits_max": -4.824450492858887, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.5625, "epoch": 1.4762828695430177, "gen_logits_max": 3.7482025623321533, "gen_logits_mean": -16.888315200805664, "gen_logits_min": -29.265098571777344, "gen_logits_std": 3.2436461448669434, "gen_loss": 0.27135464549064636, "grad_norm": 0.3575854155376894, "learning_rate": 2.179957894736842e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9965206980705261, "mean_gen_accuracy": 0.8784939050674438, "mean_token_accuracy": 0.9097878038883209, "num_tokens": 875820696.0, "sample_num_tokens": 8108.0, "step": 7228, "total_num_tokens": 875853128.0, "z_loss": 0.00047112570609897375 }, { "copy_logits_max": -5.9596686363220215, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.4375, "epoch": 1.4764871074802144, "gen_logits_max": 4.877565383911133, "gen_logits_mean": -16.05976104736328, "gen_logits_min": -28.254444122314453, "gen_logits_std": 3.1981918811798096, "gen_loss": 0.30906227231025696, "grad_norm": 0.3636786635199081, "learning_rate": 2.1798315789473686e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9960956424474716, "mean_gen_accuracy": 0.8716557323932648, "mean_token_accuracy": 0.9004757255315781, "num_tokens": 876088778.0, "sample_num_tokens": 7267.5, "step": 7229, "total_num_tokens": 876117848.0, "z_loss": 0.0005437983199954033 }, { "copy_logits_max": -4.160395622253418, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.8125, "epoch": 1.4766913454174113, "gen_logits_max": 3.553406238555908, "gen_logits_mean": -17.155506134033203, "gen_logits_min": -29.33525848388672, "gen_logits_std": 3.260504961013794, "gen_loss": 0.28014010190963745, "grad_norm": 0.4803353671501256, "learning_rate": 2.1797052631578947e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9955664128065109, "mean_gen_accuracy": 0.8730631768703461, "mean_token_accuracy": 0.9039866626262665, "num_tokens": 876337590.0, "sample_num_tokens": 7662.5, "step": 7230, "total_num_tokens": 876368240.0, "z_loss": 0.00048314439482055604 }, { "copy_logits_max": -1.65330970287323, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.1875, "epoch": 1.476895583354608, "gen_logits_max": 4.907042026519775, "gen_logits_mean": -14.504650115966797, "gen_logits_min": -26.787761688232422, "gen_logits_std": 3.184992790222168, "gen_loss": 0.2527114152908325, "grad_norm": 0.41407502643680055, "learning_rate": 2.179578947368421e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9970842152833939, "mean_gen_accuracy": 0.8749734461307526, "mean_token_accuracy": 0.9037225246429443, "num_tokens": 876613340.0, "sample_num_tokens": 9217.5, "step": 7231, "total_num_tokens": 876650210.0, "z_loss": 0.00040640070801600814 }, { "copy_logits_max": -3.9287924766540527, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.125, "epoch": 1.477099821291805, "gen_logits_max": 5.221764087677002, "gen_logits_mean": -14.947059631347656, "gen_logits_min": -27.171504974365234, "gen_logits_std": 3.1995222568511963, "gen_loss": 0.25517094135284424, "grad_norm": 0.3336312091598085, "learning_rate": 2.1794526315789476e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9959182739257812, "mean_gen_accuracy": 0.8858962953090668, "mean_token_accuracy": 0.9104178845882416, "num_tokens": 876904526.0, "sample_num_tokens": 8775.0, "step": 7232, "total_num_tokens": 876939626.0, "z_loss": 0.00044668876216746867 }, { "copy_logits_max": -3.4126951694488525, "copy_logits_min": -750000064.0, "copy_num_tokens": 457.625, "epoch": 1.4773040592290019, "gen_logits_max": 4.69919490814209, "gen_logits_mean": -15.181453704833984, "gen_logits_min": -27.732643127441406, "gen_logits_std": 3.191739082336426, "gen_loss": 0.27810755372047424, "grad_norm": 0.3700251367918757, "learning_rate": 2.1793263157894736e-05, "loss": 0.2831, "mean_copy_accuracy": 0.994979977607727, "mean_gen_accuracy": 0.8775411546230316, "mean_token_accuracy": 0.9046323746442795, "num_tokens": 877186420.0, "sample_num_tokens": 8899.0, "step": 7233, "total_num_tokens": 877222016.0, "z_loss": 0.0004126703133806586 }, { "copy_logits_max": -3.98502779006958, "copy_logits_min": -687500032.0, "copy_num_tokens": 815.5625, "epoch": 1.4775082971661986, "gen_logits_max": 3.749767780303955, "gen_logits_mean": -16.174392700195312, "gen_logits_min": -28.999290466308594, "gen_logits_std": 3.2435898780822754, "gen_loss": 0.2236262410879135, "grad_norm": 0.35840101826597726, "learning_rate": 2.1792e-05, "loss": 0.2631, "mean_copy_accuracy": 0.996815636754036, "mean_gen_accuracy": 0.8794043213129044, "mean_token_accuracy": 0.9111423641443253, "num_tokens": 877474956.0, "sample_num_tokens": 11714.5, "step": 7234, "total_num_tokens": 877521814.0, "z_loss": 0.0003788547473959625 }, { "copy_logits_max": -3.577594757080078, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.625, "epoch": 1.4777125351033955, "gen_logits_max": 2.941749095916748, "gen_logits_mean": -17.82703399658203, "gen_logits_min": -29.784664154052734, "gen_logits_std": 3.2629857063293457, "gen_loss": 0.2835530638694763, "grad_norm": 0.31414236456726274, "learning_rate": 2.179073684210526e-05, "loss": 0.2639, "mean_copy_accuracy": 0.996707871556282, "mean_gen_accuracy": 0.878010168671608, "mean_token_accuracy": 0.9091218113899231, "num_tokens": 877783353.0, "sample_num_tokens": 8412.75, "step": 7235, "total_num_tokens": 877817004.0, "z_loss": 0.00042059493716806173 }, { "copy_logits_max": -1.2035152912139893, "copy_logits_min": -750000000.0, "copy_num_tokens": 565.1875, "epoch": 1.4779167730405922, "gen_logits_max": 3.3651928901672363, "gen_logits_mean": -16.837221145629883, "gen_logits_min": -29.045867919921875, "gen_logits_std": 3.2342753410339355, "gen_loss": 0.25649306178092957, "grad_norm": 0.3777539083427022, "learning_rate": 2.178947368421053e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9964523315429688, "mean_gen_accuracy": 0.8791686296463013, "mean_token_accuracy": 0.9083088040351868, "num_tokens": 878056042.0, "sample_num_tokens": 8703.5, "step": 7236, "total_num_tokens": 878090856.0, "z_loss": 0.0003956345026381314 }, { "copy_logits_max": -4.15631628036499, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.625, "epoch": 1.4781210109777891, "gen_logits_max": 4.301549434661865, "gen_logits_mean": -16.802732467651367, "gen_logits_min": -28.730518341064453, "gen_logits_std": 3.211397171020508, "gen_loss": 0.28614944219589233, "grad_norm": 0.378813683892119, "learning_rate": 2.178821052631579e-05, "loss": 0.2637, "mean_copy_accuracy": 0.996327131986618, "mean_gen_accuracy": 0.880514919757843, "mean_token_accuracy": 0.9120189696550369, "num_tokens": 878339212.0, "sample_num_tokens": 8232.5, "step": 7237, "total_num_tokens": 878372142.0, "z_loss": 0.0005048522725701332 }, { "copy_logits_max": -5.931349277496338, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.375, "epoch": 1.4783252489149858, "gen_logits_max": 3.3932528495788574, "gen_logits_mean": -17.65360450744629, "gen_logits_min": -29.770099639892578, "gen_logits_std": 3.25565767288208, "gen_loss": 0.2764873504638672, "grad_norm": 0.38540681324172316, "learning_rate": 2.1786947368421055e-05, "loss": 0.2755, "mean_copy_accuracy": 0.997919961810112, "mean_gen_accuracy": 0.8706713169813156, "mean_token_accuracy": 0.907038077712059, "num_tokens": 878615073.0, "sample_num_tokens": 9015.75, "step": 7238, "total_num_tokens": 878651136.0, "z_loss": 0.00043596597970463336 }, { "copy_logits_max": -4.1663103103637695, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.875, "epoch": 1.4785294868521828, "gen_logits_max": 4.743114471435547, "gen_logits_mean": -15.393593788146973, "gen_logits_min": -27.242904663085938, "gen_logits_std": 3.193002223968506, "gen_loss": 0.25967931747436523, "grad_norm": 0.35693598275103133, "learning_rate": 2.1785684210526316e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9969597905874252, "mean_gen_accuracy": 0.8763080835342407, "mean_token_accuracy": 0.9095250517129898, "num_tokens": 878891809.0, "sample_num_tokens": 8040.75, "step": 7239, "total_num_tokens": 878923972.0, "z_loss": 0.0004464529629331082 }, { "copy_logits_max": -5.264421463012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.125, "epoch": 1.4787337247893797, "gen_logits_max": 4.61427116394043, "gen_logits_mean": -16.47804069519043, "gen_logits_min": -28.555477142333984, "gen_logits_std": 3.220003843307495, "gen_loss": 0.2825452983379364, "grad_norm": 0.38339414836607993, "learning_rate": 2.178442105263158e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9956061542034149, "mean_gen_accuracy": 0.8776144534349442, "mean_token_accuracy": 0.9056661278009415, "num_tokens": 879155921.0, "sample_num_tokens": 8255.75, "step": 7240, "total_num_tokens": 879188944.0, "z_loss": 0.0004690346249844879 }, { "copy_logits_max": -4.95028829574585, "copy_logits_min": -750000064.0, "copy_num_tokens": 608.0, "epoch": 1.4789379627265764, "gen_logits_max": 4.321852684020996, "gen_logits_mean": -14.7205810546875, "gen_logits_min": -26.891315460205078, "gen_logits_std": 3.185189723968506, "gen_loss": 0.2519915997982025, "grad_norm": 0.3652742325717786, "learning_rate": 2.178315789473684e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9970351904630661, "mean_gen_accuracy": 0.8799229562282562, "mean_token_accuracy": 0.9096101224422455, "num_tokens": 879418437.0, "sample_num_tokens": 9544.25, "step": 7241, "total_num_tokens": 879456614.0, "z_loss": 0.00038855799357406795 }, { "copy_logits_max": -6.546319484710693, "copy_logits_min": -750000128.0, "copy_num_tokens": 304.5, "epoch": 1.4791422006637733, "gen_logits_max": 4.377557754516602, "gen_logits_mean": -16.842979431152344, "gen_logits_min": -28.52041244506836, "gen_logits_std": 3.2173616886138916, "gen_loss": 0.28596022725105286, "grad_norm": 0.36083166294208985, "learning_rate": 2.1781894736842105e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9971135854721069, "mean_gen_accuracy": 0.8772224485874176, "mean_token_accuracy": 0.9042396396398544, "num_tokens": 879678570.0, "sample_num_tokens": 7560.0, "step": 7242, "total_num_tokens": 879708810.0, "z_loss": 0.00046430155634880066 }, { "copy_logits_max": -4.291561126708984, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.0, "epoch": 1.4793464386009703, "gen_logits_max": 3.707782506942749, "gen_logits_mean": -16.378515243530273, "gen_logits_min": -28.482370376586914, "gen_logits_std": 3.210444211959839, "gen_loss": 0.302219420671463, "grad_norm": 0.4006874131052309, "learning_rate": 2.1780631578947366e-05, "loss": 0.2843, "mean_copy_accuracy": 0.996375173330307, "mean_gen_accuracy": 0.87380750477314, "mean_token_accuracy": 0.9026339948177338, "num_tokens": 879935329.0, "sample_num_tokens": 9495.25, "step": 7243, "total_num_tokens": 879973310.0, "z_loss": 0.0005183426546864212 }, { "copy_logits_max": -4.853652000427246, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.75, "epoch": 1.479550676538167, "gen_logits_max": 4.306324005126953, "gen_logits_mean": -15.619166374206543, "gen_logits_min": -27.83277130126953, "gen_logits_std": 3.18127179145813, "gen_loss": 0.2872588038444519, "grad_norm": 0.3502406109961204, "learning_rate": 2.1779368421052634e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9958698451519012, "mean_gen_accuracy": 0.883293628692627, "mean_token_accuracy": 0.9069057106971741, "num_tokens": 880200809.0, "sample_num_tokens": 7931.75, "step": 7244, "total_num_tokens": 880232536.0, "z_loss": 0.0005051359767094254 }, { "copy_logits_max": -4.917801856994629, "copy_logits_min": -687500032.0, "copy_num_tokens": 437.0, "epoch": 1.4797549144753637, "gen_logits_max": 3.4525914192199707, "gen_logits_mean": -18.236108779907227, "gen_logits_min": -30.202110290527344, "gen_logits_std": 3.257584571838379, "gen_loss": 0.27084630727767944, "grad_norm": 0.3354982250260815, "learning_rate": 2.1778105263157895e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9965576082468033, "mean_gen_accuracy": 0.880234494805336, "mean_token_accuracy": 0.9105845838785172, "num_tokens": 880465611.0, "sample_num_tokens": 7661.75, "step": 7245, "total_num_tokens": 880496258.0, "z_loss": 0.0005470876349136233 }, { "copy_logits_max": -4.635181427001953, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.875, "epoch": 1.4799591524125606, "gen_logits_max": 3.721552610397339, "gen_logits_mean": -16.232969284057617, "gen_logits_min": -28.575233459472656, "gen_logits_std": 3.220946788787842, "gen_loss": 0.24303776025772095, "grad_norm": 0.37950363425072764, "learning_rate": 2.177684210526316e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9963211268186569, "mean_gen_accuracy": 0.876174584031105, "mean_token_accuracy": 0.9066583812236786, "num_tokens": 880730553.0, "sample_num_tokens": 10594.25, "step": 7246, "total_num_tokens": 880772930.0, "z_loss": 0.00048141434672288597 }, { "copy_logits_max": -4.927617073059082, "copy_logits_min": -750000000.0, "copy_num_tokens": 760.3125, "epoch": 1.4801633903497575, "gen_logits_max": 3.996511459350586, "gen_logits_mean": -15.801984786987305, "gen_logits_min": -28.400808334350586, "gen_logits_std": 3.1902778148651123, "gen_loss": 0.27591672539711, "grad_norm": 0.38370819200253864, "learning_rate": 2.1775578947368423e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9967794269323349, "mean_gen_accuracy": 0.8700881600379944, "mean_token_accuracy": 0.9056014567613602, "num_tokens": 881018352.0, "sample_num_tokens": 10531.5, "step": 7247, "total_num_tokens": 881060478.0, "z_loss": 0.0005551106296479702 }, { "copy_logits_max": -3.996534585952759, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.5, "epoch": 1.4803676282869542, "gen_logits_max": 4.910439491271973, "gen_logits_mean": -14.502240180969238, "gen_logits_min": -26.513710021972656, "gen_logits_std": 3.1538498401641846, "gen_loss": 0.27545595169067383, "grad_norm": 0.3886628014413616, "learning_rate": 2.1774315789473684e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9969737380743027, "mean_gen_accuracy": 0.8789591640233994, "mean_token_accuracy": 0.9065997004508972, "num_tokens": 881286246.0, "sample_num_tokens": 8529.5, "step": 7248, "total_num_tokens": 881320364.0, "z_loss": 0.000527895288541913 }, { "copy_logits_max": -4.480522155761719, "copy_logits_min": -687500032.0, "copy_num_tokens": 650.3125, "epoch": 1.4805718662241512, "gen_logits_max": 4.189797401428223, "gen_logits_mean": -15.16553783416748, "gen_logits_min": -27.63440704345703, "gen_logits_std": 3.170045852661133, "gen_loss": 0.28913313150405884, "grad_norm": 0.38033186905287314, "learning_rate": 2.177305263157895e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9964570850133896, "mean_gen_accuracy": 0.869164451956749, "mean_token_accuracy": 0.9036700874567032, "num_tokens": 881551631.0, "sample_num_tokens": 9861.75, "step": 7249, "total_num_tokens": 881591078.0, "z_loss": 0.0005550882779061794 }, { "copy_logits_max": -3.9341299533843994, "copy_logits_min": -750000000.0, "copy_num_tokens": 733.6875, "epoch": 1.480776104161348, "gen_logits_max": 3.6616673469543457, "gen_logits_mean": -15.84174633026123, "gen_logits_min": -28.063373565673828, "gen_logits_std": 3.211390256881714, "gen_loss": 0.2385532259941101, "grad_norm": 0.3478619028467696, "learning_rate": 2.177178947368421e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9975137114524841, "mean_gen_accuracy": 0.8765041381120682, "mean_token_accuracy": 0.9087716788053513, "num_tokens": 881830777.0, "sample_num_tokens": 9276.25, "step": 7250, "total_num_tokens": 881867882.0, "z_loss": 0.0004784779157489538 }, { "copy_logits_max": -5.566897392272949, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.25, "epoch": 1.4809803420985448, "gen_logits_max": 3.8815276622772217, "gen_logits_mean": -17.171262741088867, "gen_logits_min": -28.862178802490234, "gen_logits_std": 3.211731433868408, "gen_loss": 0.3066081404685974, "grad_norm": 0.3530832539466026, "learning_rate": 2.1770526315789474e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9974118769168854, "mean_gen_accuracy": 0.8732983767986298, "mean_token_accuracy": 0.9033592194318771, "num_tokens": 882105922.0, "sample_num_tokens": 7560.5, "step": 7251, "total_num_tokens": 882136164.0, "z_loss": 0.0005304605583660305 }, { "copy_logits_max": -6.019176959991455, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.0625, "epoch": 1.4811845800357417, "gen_logits_max": 4.842155933380127, "gen_logits_mean": -16.203929901123047, "gen_logits_min": -28.220046997070312, "gen_logits_std": 3.160360813140869, "gen_loss": 0.30131781101226807, "grad_norm": 0.38602607276679923, "learning_rate": 2.1769263157894738e-05, "loss": 0.3019, "mean_copy_accuracy": 0.9948321580886841, "mean_gen_accuracy": 0.8702028095722198, "mean_token_accuracy": 0.8973095417022705, "num_tokens": 882358386.0, "sample_num_tokens": 8217.0, "step": 7252, "total_num_tokens": 882391254.0, "z_loss": 0.0005271293339319527 }, { "copy_logits_max": -5.367745399475098, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.0625, "epoch": 1.4813888179729384, "gen_logits_max": 3.538444995880127, "gen_logits_mean": -16.32276153564453, "gen_logits_min": -28.63579559326172, "gen_logits_std": 3.181309223175049, "gen_loss": 0.2864951491355896, "grad_norm": 0.3912065206263507, "learning_rate": 2.1768000000000002e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9955654144287109, "mean_gen_accuracy": 0.8743607252836227, "mean_token_accuracy": 0.9050930589437485, "num_tokens": 882632734.0, "sample_num_tokens": 8208.0, "step": 7253, "total_num_tokens": 882665566.0, "z_loss": 0.0005060033872723579 }, { "copy_logits_max": -3.9223556518554688, "copy_logits_min": -687500032.0, "copy_num_tokens": 319.5625, "epoch": 1.4815930559101353, "gen_logits_max": 3.958967447280884, "gen_logits_mean": -17.305099487304688, "gen_logits_min": -29.46320343017578, "gen_logits_std": 3.228469133377075, "gen_loss": 0.29960617423057556, "grad_norm": 0.33071621499601594, "learning_rate": 2.1766736842105263e-05, "loss": 0.261, "mean_copy_accuracy": 0.9977156817913055, "mean_gen_accuracy": 0.8818492591381073, "mean_token_accuracy": 0.9118586629629135, "num_tokens": 882902723.0, "sample_num_tokens": 6924.25, "step": 7254, "total_num_tokens": 882930420.0, "z_loss": 0.000535706989467144 }, { "copy_logits_max": -4.424793720245361, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.4375, "epoch": 1.481797293847332, "gen_logits_max": 4.4359636306762695, "gen_logits_mean": -14.634181022644043, "gen_logits_min": -26.411964416503906, "gen_logits_std": 3.1572117805480957, "gen_loss": 0.3054100275039673, "grad_norm": 0.35530541172135366, "learning_rate": 2.1765473684210528e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9967207163572311, "mean_gen_accuracy": 0.873311385512352, "mean_token_accuracy": 0.9013042598962784, "num_tokens": 883164112.0, "sample_num_tokens": 8328.5, "step": 7255, "total_num_tokens": 883197426.0, "z_loss": 0.000488497840706259 }, { "copy_logits_max": -5.453153610229492, "copy_logits_min": -562500096.0, "copy_num_tokens": 561.625, "epoch": 1.482001531784529, "gen_logits_max": 3.026123285293579, "gen_logits_mean": -17.909927368164062, "gen_logits_min": -30.073699951171875, "gen_logits_std": 3.2595551013946533, "gen_loss": 0.2868061661720276, "grad_norm": 0.35773182457982344, "learning_rate": 2.176421052631579e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9971744120121002, "mean_gen_accuracy": 0.8766394406557083, "mean_token_accuracy": 0.9107172936201096, "num_tokens": 883458251.0, "sample_num_tokens": 9094.25, "step": 7256, "total_num_tokens": 883494628.0, "z_loss": 0.0005029984167777002 }, { "copy_logits_max": -3.325363874435425, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.1875, "epoch": 1.482205769721726, "gen_logits_max": 4.1125688552856445, "gen_logits_mean": -15.808780670166016, "gen_logits_min": -27.721107482910156, "gen_logits_std": 3.1846656799316406, "gen_loss": 0.3196093440055847, "grad_norm": 0.3706682722100064, "learning_rate": 2.1762947368421053e-05, "loss": 0.2897, "mean_copy_accuracy": 0.9959559738636017, "mean_gen_accuracy": 0.8773814737796783, "mean_token_accuracy": 0.9019730240106583, "num_tokens": 883715402.0, "sample_num_tokens": 7981.0, "step": 7257, "total_num_tokens": 883747326.0, "z_loss": 0.0004569706507027149 }, { "copy_logits_max": -6.38192892074585, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.0625, "epoch": 1.4824100076589226, "gen_logits_max": 3.643202066421509, "gen_logits_mean": -16.320106506347656, "gen_logits_min": -28.696155548095703, "gen_logits_std": 3.214646339416504, "gen_loss": 0.2714809775352478, "grad_norm": 0.3487326176553276, "learning_rate": 2.1761684210526314e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9973549693822861, "mean_gen_accuracy": 0.87090764939785, "mean_token_accuracy": 0.9092510342597961, "num_tokens": 884005943.0, "sample_num_tokens": 9244.75, "step": 7258, "total_num_tokens": 884042922.0, "z_loss": 0.00043145063682459295 }, { "copy_logits_max": -4.239688873291016, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.375, "epoch": 1.4826142455961195, "gen_logits_max": 4.522947788238525, "gen_logits_mean": -15.634747505187988, "gen_logits_min": -27.622394561767578, "gen_logits_std": 3.166236400604248, "gen_loss": 0.329131156206131, "grad_norm": 0.3653245390778811, "learning_rate": 2.1760421052631578e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9963841140270233, "mean_gen_accuracy": 0.8745806813240051, "mean_token_accuracy": 0.9034178406000137, "num_tokens": 884272521.0, "sample_num_tokens": 8800.75, "step": 7259, "total_num_tokens": 884307724.0, "z_loss": 0.0004955778131261468 }, { "copy_logits_max": -1.5408443212509155, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.8125, "epoch": 1.4828184835333162, "gen_logits_max": 3.7492687702178955, "gen_logits_mean": -16.654071807861328, "gen_logits_min": -28.638872146606445, "gen_logits_std": 3.196314811706543, "gen_loss": 0.3143439292907715, "grad_norm": 0.4480763167068773, "learning_rate": 2.1759157894736846e-05, "loss": 0.2956, "mean_copy_accuracy": 0.9954252243041992, "mean_gen_accuracy": 0.872718408703804, "mean_token_accuracy": 0.9004566669464111, "num_tokens": 884535013.0, "sample_num_tokens": 7715.75, "step": 7260, "total_num_tokens": 884565876.0, "z_loss": 0.0004820963949896395 }, { "copy_logits_max": 0.2382584512233734, "copy_logits_min": -687500032.0, "copy_num_tokens": 653.625, "epoch": 1.4830227214705132, "gen_logits_max": 3.2925233840942383, "gen_logits_mean": -16.344087600708008, "gen_logits_min": -28.463817596435547, "gen_logits_std": 3.2290518283843994, "gen_loss": 0.2361297309398651, "grad_norm": 0.35171117277807445, "learning_rate": 2.1757894736842107e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9965743720531464, "mean_gen_accuracy": 0.8771422058343887, "mean_token_accuracy": 0.9098226428031921, "num_tokens": 884816882.0, "sample_num_tokens": 9809.0, "step": 7261, "total_num_tokens": 884856118.0, "z_loss": 0.0004407393862493336 }, { "copy_logits_max": 0.8593357801437378, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.5, "epoch": 1.4832269594077099, "gen_logits_max": 4.723269462585449, "gen_logits_mean": -14.511920928955078, "gen_logits_min": -26.733217239379883, "gen_logits_std": 3.144770383834839, "gen_loss": 0.31746092438697815, "grad_norm": 0.3336629814898068, "learning_rate": 2.175663157894737e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9963522553443909, "mean_gen_accuracy": 0.8765801042318344, "mean_token_accuracy": 0.9035169780254364, "num_tokens": 885101633.0, "sample_num_tokens": 8061.25, "step": 7262, "total_num_tokens": 885133878.0, "z_loss": 0.0005849256413057446 }, { "copy_logits_max": -0.5624300241470337, "copy_logits_min": -687500032.0, "copy_num_tokens": 397.6875, "epoch": 1.4834311973449068, "gen_logits_max": 4.208467483520508, "gen_logits_mean": -15.914312362670898, "gen_logits_min": -28.417434692382812, "gen_logits_std": 3.2126269340515137, "gen_loss": 0.29085007309913635, "grad_norm": 0.3885206118231055, "learning_rate": 2.1755368421052632e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9944641590118408, "mean_gen_accuracy": 0.8748881667852402, "mean_token_accuracy": 0.8994922190904617, "num_tokens": 885359824.0, "sample_num_tokens": 7999.5, "step": 7263, "total_num_tokens": 885391822.0, "z_loss": 0.0004891241551376879 }, { "copy_logits_max": -1.4890246391296387, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.0625, "epoch": 1.4836354352821037, "gen_logits_max": 4.264528751373291, "gen_logits_mean": -16.24118423461914, "gen_logits_min": -28.311737060546875, "gen_logits_std": 3.2272605895996094, "gen_loss": 0.2705855071544647, "grad_norm": 0.33992664693860714, "learning_rate": 2.1754105263157896e-05, "loss": 0.2691, "mean_copy_accuracy": 0.9969944059848785, "mean_gen_accuracy": 0.8800534158945084, "mean_token_accuracy": 0.9078313112258911, "num_tokens": 885626294.0, "sample_num_tokens": 6949.5, "step": 7264, "total_num_tokens": 885654092.0, "z_loss": 0.0004970108275301754 }, { "copy_logits_max": -1.0687167644500732, "copy_logits_min": -687500032.0, "copy_num_tokens": 562.4375, "epoch": 1.4838396732193004, "gen_logits_max": 3.1822423934936523, "gen_logits_mean": -17.7628116607666, "gen_logits_min": -29.945558547973633, "gen_logits_std": 3.277705192565918, "gen_loss": 0.27308356761932373, "grad_norm": 0.3490694630962223, "learning_rate": 2.1752842105263157e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9964312613010406, "mean_gen_accuracy": 0.8723635971546173, "mean_token_accuracy": 0.9042373895645142, "num_tokens": 885881035.0, "sample_num_tokens": 8798.75, "step": 7265, "total_num_tokens": 885916230.0, "z_loss": 0.0004919924540445209 }, { "copy_logits_max": -1.0876586437225342, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.9375, "epoch": 1.4840439111564974, "gen_logits_max": 5.465122699737549, "gen_logits_mean": -14.213823318481445, "gen_logits_min": -26.778392791748047, "gen_logits_std": 3.159834384918213, "gen_loss": 0.2806020677089691, "grad_norm": 0.3503045280432404, "learning_rate": 2.175157894736842e-05, "loss": 0.276, "mean_copy_accuracy": 0.9962329119443893, "mean_gen_accuracy": 0.8796272575855255, "mean_token_accuracy": 0.9069637954235077, "num_tokens": 886159572.0, "sample_num_tokens": 8180.5, "step": 7266, "total_num_tokens": 886192294.0, "z_loss": 0.00047572070616297424 }, { "copy_logits_max": -3.248528480529785, "copy_logits_min": -625000064.0, "copy_num_tokens": 424.6875, "epoch": 1.4842481490936943, "gen_logits_max": 4.017440319061279, "gen_logits_mean": -17.529224395751953, "gen_logits_min": -29.342205047607422, "gen_logits_std": 3.243452787399292, "gen_loss": 0.3072466254234314, "grad_norm": 0.3336461552176963, "learning_rate": 2.1750315789473682e-05, "loss": 0.276, "mean_copy_accuracy": 0.9969003051519394, "mean_gen_accuracy": 0.8754719644784927, "mean_token_accuracy": 0.9074586927890778, "num_tokens": 886447097.0, "sample_num_tokens": 8621.75, "step": 7267, "total_num_tokens": 886481584.0, "z_loss": 0.0005313419387675822 }, { "copy_logits_max": -2.062927484512329, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.8125, "epoch": 1.484452387030891, "gen_logits_max": 5.237595558166504, "gen_logits_mean": -16.05234718322754, "gen_logits_min": -27.96898078918457, "gen_logits_std": 3.2024409770965576, "gen_loss": 0.3016502559185028, "grad_norm": 0.38450313548434323, "learning_rate": 2.174905263157895e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9961978197097778, "mean_gen_accuracy": 0.8806949108839035, "mean_token_accuracy": 0.9072518199682236, "num_tokens": 886715238.0, "sample_num_tokens": 6951.5, "step": 7268, "total_num_tokens": 886743044.0, "z_loss": 0.0005025411956012249 }, { "copy_logits_max": -1.837043285369873, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.6875, "epoch": 1.4846566249680877, "gen_logits_max": 2.6591243743896484, "gen_logits_mean": -17.77979278564453, "gen_logits_min": -29.672637939453125, "gen_logits_std": 3.2687339782714844, "gen_loss": 0.268619179725647, "grad_norm": 0.3473958507496245, "learning_rate": 2.174778947368421e-05, "loss": 0.2507, "mean_copy_accuracy": 0.9963619858026505, "mean_gen_accuracy": 0.8855619430541992, "mean_token_accuracy": 0.9147680848836899, "num_tokens": 886985579.0, "sample_num_tokens": 9196.75, "step": 7269, "total_num_tokens": 887022366.0, "z_loss": 0.0004868623218499124 }, { "copy_logits_max": -0.29954615235328674, "copy_logits_min": -687500032.0, "copy_num_tokens": 349.5625, "epoch": 1.4848608629052846, "gen_logits_max": 4.356489181518555, "gen_logits_mean": -16.5897274017334, "gen_logits_min": -28.878890991210938, "gen_logits_std": 3.2481322288513184, "gen_loss": 0.26850593090057373, "grad_norm": 0.36778890867197384, "learning_rate": 2.1746526315789475e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9954335987567902, "mean_gen_accuracy": 0.8779400289058685, "mean_token_accuracy": 0.9024218320846558, "num_tokens": 887249278.0, "sample_num_tokens": 7572.0, "step": 7270, "total_num_tokens": 887279566.0, "z_loss": 0.0004713443049695343 }, { "copy_logits_max": -3.603214979171753, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.25, "epoch": 1.4850651008424816, "gen_logits_max": 3.6752336025238037, "gen_logits_mean": -16.716400146484375, "gen_logits_min": -28.37761878967285, "gen_logits_std": 3.1997742652893066, "gen_loss": 0.30565348267555237, "grad_norm": 0.3495401111297238, "learning_rate": 2.1745263157894736e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9966490566730499, "mean_gen_accuracy": 0.8777537941932678, "mean_token_accuracy": 0.9082933664321899, "num_tokens": 887512567.0, "sample_num_tokens": 7852.75, "step": 7271, "total_num_tokens": 887543978.0, "z_loss": 0.0004795725690200925 }, { "copy_logits_max": -3.002570867538452, "copy_logits_min": -625000000.0, "copy_num_tokens": 413.25, "epoch": 1.4852693387796783, "gen_logits_max": 4.022061347961426, "gen_logits_mean": -16.392183303833008, "gen_logits_min": -28.746814727783203, "gen_logits_std": 3.237729072570801, "gen_loss": 0.2535945177078247, "grad_norm": 0.3672804141949069, "learning_rate": 2.1744e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9957590997219086, "mean_gen_accuracy": 0.8739976733922958, "mean_token_accuracy": 0.9066808074712753, "num_tokens": 887784986.0, "sample_num_tokens": 7140.0, "step": 7272, "total_num_tokens": 887813546.0, "z_loss": 0.0004356333229225129 }, { "copy_logits_max": -6.06784725189209, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.3125, "epoch": 1.4854735767168752, "gen_logits_max": 3.7062976360321045, "gen_logits_mean": -17.618621826171875, "gen_logits_min": -29.615314483642578, "gen_logits_std": 3.258690357208252, "gen_loss": 0.28587806224823, "grad_norm": 0.43685190929010176, "learning_rate": 2.1742736842105265e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9965519607067108, "mean_gen_accuracy": 0.877122700214386, "mean_token_accuracy": 0.9066615104675293, "num_tokens": 888053664.0, "sample_num_tokens": 8874.0, "step": 7273, "total_num_tokens": 888089160.0, "z_loss": 0.0004621310508809984 }, { "copy_logits_max": -1.5366652011871338, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.4375, "epoch": 1.4856778146540721, "gen_logits_max": 4.172955513000488, "gen_logits_mean": -15.438011169433594, "gen_logits_min": -27.366994857788086, "gen_logits_std": 3.1965107917785645, "gen_loss": 0.29815444350242615, "grad_norm": 0.39335618541014405, "learning_rate": 2.1741473684210526e-05, "loss": 0.2878, "mean_copy_accuracy": 0.996878370642662, "mean_gen_accuracy": 0.8711079508066177, "mean_token_accuracy": 0.9022766053676605, "num_tokens": 888311617.0, "sample_num_tokens": 9529.75, "step": 7274, "total_num_tokens": 888349736.0, "z_loss": 0.0004515875189099461 }, { "copy_logits_max": -2.5707221031188965, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.0625, "epoch": 1.4858820525912688, "gen_logits_max": 5.160594940185547, "gen_logits_mean": -14.955052375793457, "gen_logits_min": -27.1376953125, "gen_logits_std": 3.1526546478271484, "gen_loss": 0.3304012417793274, "grad_norm": 0.39855222954939495, "learning_rate": 2.174021052631579e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9964303225278854, "mean_gen_accuracy": 0.8712684512138367, "mean_token_accuracy": 0.8994805812835693, "num_tokens": 888585918.0, "sample_num_tokens": 8340.0, "step": 7275, "total_num_tokens": 888619278.0, "z_loss": 0.0005363199743442237 }, { "copy_logits_max": -6.86693811416626, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.5625, "epoch": 1.4860862905284655, "gen_logits_max": 3.955537796020508, "gen_logits_mean": -16.494659423828125, "gen_logits_min": -28.708446502685547, "gen_logits_std": 3.1846225261688232, "gen_loss": 0.2792169451713562, "grad_norm": 0.41147090073333176, "learning_rate": 2.173894736842105e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9972197115421295, "mean_gen_accuracy": 0.8816753625869751, "mean_token_accuracy": 0.9105382710695267, "num_tokens": 888848456.0, "sample_num_tokens": 7591.5, "step": 7276, "total_num_tokens": 888878822.0, "z_loss": 0.0004683380830101669 }, { "copy_logits_max": -6.413415908813477, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.375, "epoch": 1.4862905284656625, "gen_logits_max": 3.7431225776672363, "gen_logits_mean": -17.655742645263672, "gen_logits_min": -29.668968200683594, "gen_logits_std": 3.2410857677459717, "gen_loss": 0.30099111795425415, "grad_norm": 0.37860437884615966, "learning_rate": 2.173768421052632e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9966254979372025, "mean_gen_accuracy": 0.874130129814148, "mean_token_accuracy": 0.9007779508829117, "num_tokens": 889095797.0, "sample_num_tokens": 8734.25, "step": 7277, "total_num_tokens": 889130734.0, "z_loss": 0.0004778769798576832 }, { "copy_logits_max": -5.956423282623291, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.875, "epoch": 1.4864947664028594, "gen_logits_max": 4.225613117218018, "gen_logits_mean": -16.63418960571289, "gen_logits_min": -29.001487731933594, "gen_logits_std": 3.221676826477051, "gen_loss": 0.29575300216674805, "grad_norm": 0.39063969791776576, "learning_rate": 2.173642105263158e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9964450746774673, "mean_gen_accuracy": 0.8737845867872238, "mean_token_accuracy": 0.901108905673027, "num_tokens": 889357140.0, "sample_num_tokens": 7955.0, "step": 7278, "total_num_tokens": 889388960.0, "z_loss": 0.0005257027805782855 }, { "copy_logits_max": -5.719193458557129, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.8125, "epoch": 1.486699004340056, "gen_logits_max": 3.516519069671631, "gen_logits_mean": -17.127201080322266, "gen_logits_min": -28.991668701171875, "gen_logits_std": 3.227034568786621, "gen_loss": 0.3030756115913391, "grad_norm": 0.3508513455220532, "learning_rate": 2.1735157894736844e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9967412054538727, "mean_gen_accuracy": 0.8769946098327637, "mean_token_accuracy": 0.9062611162662506, "num_tokens": 889646834.0, "sample_num_tokens": 7853.5, "step": 7279, "total_num_tokens": 889678248.0, "z_loss": 0.0004936653422191739 }, { "copy_logits_max": -6.162422180175781, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.3125, "epoch": 1.486903242277253, "gen_logits_max": 4.217214584350586, "gen_logits_mean": -16.351051330566406, "gen_logits_min": -28.59334945678711, "gen_logits_std": 3.176180124282837, "gen_loss": 0.31836941838264465, "grad_norm": 0.38379207092714396, "learning_rate": 2.1733894736842105e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9955519437789917, "mean_gen_accuracy": 0.8774160593748093, "mean_token_accuracy": 0.9032090455293655, "num_tokens": 889902618.0, "sample_num_tokens": 7836.0, "step": 7280, "total_num_tokens": 889933962.0, "z_loss": 0.0005130671197548509 }, { "copy_logits_max": -4.627201080322266, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.125, "epoch": 1.48710748021445, "gen_logits_max": 4.294934272766113, "gen_logits_mean": -14.88084888458252, "gen_logits_min": -27.43549156188965, "gen_logits_std": 3.1582815647125244, "gen_loss": 0.26076316833496094, "grad_norm": 0.36759003940146573, "learning_rate": 2.173263157894737e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9970732182264328, "mean_gen_accuracy": 0.8789498209953308, "mean_token_accuracy": 0.9085872769355774, "num_tokens": 890172364.0, "sample_num_tokens": 7827.5, "step": 7281, "total_num_tokens": 890203674.0, "z_loss": 0.0004369674134068191 }, { "copy_logits_max": -3.604743003845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.9375, "epoch": 1.4873117181516466, "gen_logits_max": 4.107929229736328, "gen_logits_mean": -15.974660873413086, "gen_logits_min": -28.258697509765625, "gen_logits_std": 3.223221778869629, "gen_loss": 0.30484381318092346, "grad_norm": 0.39173680033523395, "learning_rate": 2.173136842105263e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9965102672576904, "mean_gen_accuracy": 0.870794340968132, "mean_token_accuracy": 0.9046545326709747, "num_tokens": 890429265.0, "sample_num_tokens": 8825.25, "step": 7282, "total_num_tokens": 890464566.0, "z_loss": 0.000487815763335675 }, { "copy_logits_max": -6.569458961486816, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.75, "epoch": 1.4875159560888436, "gen_logits_max": 3.560415744781494, "gen_logits_mean": -17.643587112426758, "gen_logits_min": -29.362838745117188, "gen_logits_std": 3.2108840942382812, "gen_loss": 0.26844316720962524, "grad_norm": 0.3741738199578222, "learning_rate": 2.1730105263157894e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9969882071018219, "mean_gen_accuracy": 0.8713764399290085, "mean_token_accuracy": 0.9031423330307007, "num_tokens": 890702695.0, "sample_num_tokens": 7539.25, "step": 7283, "total_num_tokens": 890732852.0, "z_loss": 0.00043768715113401413 }, { "copy_logits_max": -5.497818470001221, "copy_logits_min": -687500032.0, "copy_num_tokens": 463.8125, "epoch": 1.4877201940260403, "gen_logits_max": 2.7708849906921387, "gen_logits_mean": -18.222797393798828, "gen_logits_min": -29.810619354248047, "gen_logits_std": 3.1923255920410156, "gen_loss": 0.2594259977340698, "grad_norm": 0.3880977717330874, "learning_rate": 2.1728842105263155e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9969584792852402, "mean_gen_accuracy": 0.8757561594247818, "mean_token_accuracy": 0.9053639620542526, "num_tokens": 890972197.0, "sample_num_tokens": 8121.25, "step": 7284, "total_num_tokens": 891004682.0, "z_loss": 0.0004382358747534454 }, { "copy_logits_max": -5.523903846740723, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.0625, "epoch": 1.4879244319632372, "gen_logits_max": 3.3332672119140625, "gen_logits_mean": -17.49362564086914, "gen_logits_min": -29.501354217529297, "gen_logits_std": 3.2220120429992676, "gen_loss": 0.2764208912849426, "grad_norm": 0.3419970537043441, "learning_rate": 2.1727578947368423e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9969467520713806, "mean_gen_accuracy": 0.8789222985506058, "mean_token_accuracy": 0.9068419486284256, "num_tokens": 891244056.0, "sample_num_tokens": 7896.0, "step": 7285, "total_num_tokens": 891275640.0, "z_loss": 0.00047620132681913674 }, { "copy_logits_max": -4.968625068664551, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.25, "epoch": 1.488128669900434, "gen_logits_max": 4.0988688468933105, "gen_logits_mean": -16.834941864013672, "gen_logits_min": -28.614673614501953, "gen_logits_std": 3.1877636909484863, "gen_loss": 0.2739035189151764, "grad_norm": 0.35399620349214916, "learning_rate": 2.1726315789473687e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9960364252328873, "mean_gen_accuracy": 0.8798555880784988, "mean_token_accuracy": 0.9061280339956284, "num_tokens": 891518152.0, "sample_num_tokens": 8211.0, "step": 7286, "total_num_tokens": 891550996.0, "z_loss": 0.0004445167724043131 }, { "copy_logits_max": -3.981053352355957, "copy_logits_min": -625000064.0, "copy_num_tokens": 453.0, "epoch": 1.4883329078376308, "gen_logits_max": 3.078458547592163, "gen_logits_mean": -17.551563262939453, "gen_logits_min": -29.697982788085938, "gen_logits_std": 3.2186663150787354, "gen_loss": 0.2914734482765198, "grad_norm": 0.36209897692313847, "learning_rate": 2.1725052631578948e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9972487986087799, "mean_gen_accuracy": 0.8673881143331528, "mean_token_accuracy": 0.8987761586904526, "num_tokens": 891785059.0, "sample_num_tokens": 7552.25, "step": 7287, "total_num_tokens": 891815268.0, "z_loss": 0.00051122932927683 }, { "copy_logits_max": -6.289567947387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.0625, "epoch": 1.4885371457748278, "gen_logits_max": 4.670267105102539, "gen_logits_mean": -16.31414794921875, "gen_logits_min": -28.13890838623047, "gen_logits_std": 3.1597814559936523, "gen_loss": 0.305164098739624, "grad_norm": 0.7770040460944034, "learning_rate": 2.1723789473684213e-05, "loss": 0.285, "mean_copy_accuracy": 0.9968024492263794, "mean_gen_accuracy": 0.8771934658288956, "mean_token_accuracy": 0.9048108160495758, "num_tokens": 892036917.0, "sample_num_tokens": 8252.25, "step": 7288, "total_num_tokens": 892069926.0, "z_loss": 0.0005211700336076319 }, { "copy_logits_max": -2.9445953369140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.125, "epoch": 1.4887413837120245, "gen_logits_max": 4.78108024597168, "gen_logits_mean": -14.862253189086914, "gen_logits_min": -27.70104217529297, "gen_logits_std": 3.1850690841674805, "gen_loss": 0.2664814889431, "grad_norm": 0.3766250545478502, "learning_rate": 2.1722526315789473e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9964383691549301, "mean_gen_accuracy": 0.8754900395870209, "mean_token_accuracy": 0.9043401479721069, "num_tokens": 892295264.0, "sample_num_tokens": 10066.0, "step": 7289, "total_num_tokens": 892335528.0, "z_loss": 0.00047260071733035147 }, { "copy_logits_max": -5.901330947875977, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.0, "epoch": 1.4889456216492214, "gen_logits_max": 3.25075101852417, "gen_logits_mean": -17.76906967163086, "gen_logits_min": -29.812135696411133, "gen_logits_std": 3.224184513092041, "gen_loss": 0.3215346336364746, "grad_norm": 0.3821651361604987, "learning_rate": 2.1721263157894738e-05, "loss": 0.299, "mean_copy_accuracy": 0.9962394684553146, "mean_gen_accuracy": 0.8705575466156006, "mean_token_accuracy": 0.8982605636119843, "num_tokens": 892550114.0, "sample_num_tokens": 7828.5, "step": 7290, "total_num_tokens": 892581428.0, "z_loss": 0.0005621118471026421 }, { "copy_logits_max": -3.551280975341797, "copy_logits_min": -750000064.0, "copy_num_tokens": 587.8125, "epoch": 1.489149859586418, "gen_logits_max": 3.0701904296875, "gen_logits_mean": -16.51248550415039, "gen_logits_min": -28.943584442138672, "gen_logits_std": 3.2230489253997803, "gen_loss": 0.24113139510154724, "grad_norm": 0.3554345337332427, "learning_rate": 2.172e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9976243227720261, "mean_gen_accuracy": 0.8748007267713547, "mean_token_accuracy": 0.909272089600563, "num_tokens": 892827423.0, "sample_num_tokens": 9196.75, "step": 7291, "total_num_tokens": 892864210.0, "z_loss": 0.00042440753895789385 }, { "copy_logits_max": -3.0708250999450684, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.3125, "epoch": 1.489354097523615, "gen_logits_max": 3.8638641834259033, "gen_logits_mean": -16.170236587524414, "gen_logits_min": -28.411109924316406, "gen_logits_std": 3.207118034362793, "gen_loss": 0.2776355445384979, "grad_norm": 0.35732285481749715, "learning_rate": 2.1718736842105263e-05, "loss": 0.287, "mean_copy_accuracy": 0.996826633810997, "mean_gen_accuracy": 0.8732112646102905, "mean_token_accuracy": 0.9039464741945267, "num_tokens": 893105748.0, "sample_num_tokens": 7608.0, "step": 7292, "total_num_tokens": 893136180.0, "z_loss": 0.0004877089522778988 }, { "copy_logits_max": -0.5230336785316467, "copy_logits_min": -750000064.0, "copy_num_tokens": 708.0, "epoch": 1.4895583354608117, "gen_logits_max": 2.666360378265381, "gen_logits_mean": -16.703826904296875, "gen_logits_min": -29.482210159301758, "gen_logits_std": 3.2363762855529785, "gen_loss": 0.25451600551605225, "grad_norm": 0.35876882869321886, "learning_rate": 2.1717473684210527e-05, "loss": 0.2514, "mean_copy_accuracy": 0.9967781454324722, "mean_gen_accuracy": 0.8858042508363724, "mean_token_accuracy": 0.9152984917163849, "num_tokens": 893393691.0, "sample_num_tokens": 9883.25, "step": 7293, "total_num_tokens": 893433224.0, "z_loss": 0.00042542506707832217 }, { "copy_logits_max": -2.3409175872802734, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.75, "epoch": 1.4897625733980087, "gen_logits_max": 4.167868614196777, "gen_logits_mean": -16.152984619140625, "gen_logits_min": -28.11752700805664, "gen_logits_std": 3.1852121353149414, "gen_loss": 0.255390465259552, "grad_norm": 0.43068448271972143, "learning_rate": 2.171621052631579e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9955334067344666, "mean_gen_accuracy": 0.8774742782115936, "mean_token_accuracy": 0.9030809998512268, "num_tokens": 893658168.0, "sample_num_tokens": 9372.0, "step": 7294, "total_num_tokens": 893695656.0, "z_loss": 0.0004244855954311788 }, { "copy_logits_max": -2.764561653137207, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.3125, "epoch": 1.4899668113352056, "gen_logits_max": 2.2076873779296875, "gen_logits_mean": -17.47044563293457, "gen_logits_min": -29.58946418762207, "gen_logits_std": 3.2593255043029785, "gen_loss": 0.2719145119190216, "grad_norm": 0.3548813492347598, "learning_rate": 2.1714947368421053e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9974757879972458, "mean_gen_accuracy": 0.8726256489753723, "mean_token_accuracy": 0.9097696095705032, "num_tokens": 893943980.0, "sample_num_tokens": 9690.5, "step": 7295, "total_num_tokens": 893982742.0, "z_loss": 0.0004554800980258733 }, { "copy_logits_max": -4.0158257484436035, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.8125, "epoch": 1.4901710492724023, "gen_logits_max": 3.2121713161468506, "gen_logits_mean": -17.664047241210938, "gen_logits_min": -29.30966567993164, "gen_logits_std": 3.2180464267730713, "gen_loss": 0.28602880239486694, "grad_norm": 0.36173863480996366, "learning_rate": 2.1713684210526317e-05, "loss": 0.2979, "mean_copy_accuracy": 0.9968713372945786, "mean_gen_accuracy": 0.869770810008049, "mean_token_accuracy": 0.8998987525701523, "num_tokens": 894208749.0, "sample_num_tokens": 7799.25, "step": 7296, "total_num_tokens": 894239946.0, "z_loss": 0.0004565808631014079 }, { "copy_logits_max": -2.078092098236084, "copy_logits_min": -687500096.0, "copy_num_tokens": 425.4375, "epoch": 1.4903752872095992, "gen_logits_max": 3.907731771469116, "gen_logits_mean": -16.120996475219727, "gen_logits_min": -28.70183563232422, "gen_logits_std": 3.2062056064605713, "gen_loss": 0.3133328855037689, "grad_norm": 0.36450157652936976, "learning_rate": 2.1712421052631578e-05, "loss": 0.2812, "mean_copy_accuracy": 0.996228352189064, "mean_gen_accuracy": 0.8784995675086975, "mean_token_accuracy": 0.9050064980983734, "num_tokens": 894475921.0, "sample_num_tokens": 7937.25, "step": 7297, "total_num_tokens": 894507670.0, "z_loss": 0.0005759048508480191 }, { "copy_logits_max": -4.261220932006836, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.9375, "epoch": 1.4905795251467961, "gen_logits_max": 4.248756408691406, "gen_logits_mean": -15.530723571777344, "gen_logits_min": -27.960901260375977, "gen_logits_std": 3.1766319274902344, "gen_loss": 0.28323376178741455, "grad_norm": 0.3825866455512113, "learning_rate": 2.1711157894736842e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9957230985164642, "mean_gen_accuracy": 0.8742019981145859, "mean_token_accuracy": 0.9050270617008209, "num_tokens": 894750197.0, "sample_num_tokens": 8661.25, "step": 7298, "total_num_tokens": 894784842.0, "z_loss": 0.000489452329929918 }, { "copy_logits_max": -5.307016849517822, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 1.4907837630839929, "gen_logits_max": 3.32877779006958, "gen_logits_mean": -18.13507080078125, "gen_logits_min": -30.193546295166016, "gen_logits_std": 3.248340606689453, "gen_loss": 0.2542082667350769, "grad_norm": 0.40560879752288037, "learning_rate": 2.1709894736842106e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9959349036216736, "mean_gen_accuracy": 0.8739934712648392, "mean_token_accuracy": 0.9029337912797928, "num_tokens": 895011273.0, "sample_num_tokens": 8550.75, "step": 7299, "total_num_tokens": 895045476.0, "z_loss": 0.0004368692170828581 }, { "copy_logits_max": -5.217028617858887, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0625, "epoch": 1.4909880010211896, "gen_logits_max": 3.151911973953247, "gen_logits_mean": -17.88008689880371, "gen_logits_min": -29.966114044189453, "gen_logits_std": 3.2362618446350098, "gen_loss": 0.2877776622772217, "grad_norm": 0.3661703649025026, "learning_rate": 2.1708631578947367e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9968902766704559, "mean_gen_accuracy": 0.8758330345153809, "mean_token_accuracy": 0.9046281278133392, "num_tokens": 895276024.0, "sample_num_tokens": 7803.5, "step": 7300, "total_num_tokens": 895307238.0, "z_loss": 0.0005256535951048136 }, { "copy_logits_max": -4.973444938659668, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.625, "epoch": 1.4911922389583865, "gen_logits_max": 3.4623148441314697, "gen_logits_mean": -17.55001449584961, "gen_logits_min": -29.682783126831055, "gen_logits_std": 3.210925817489624, "gen_loss": 0.32377707958221436, "grad_norm": 0.43609140379915023, "learning_rate": 2.1707368421052635e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9973029494285583, "mean_gen_accuracy": 0.8698878139257431, "mean_token_accuracy": 0.8990659117698669, "num_tokens": 895524282.0, "sample_num_tokens": 6492.5, "step": 7301, "total_num_tokens": 895550252.0, "z_loss": 0.0005434896447695792 }, { "copy_logits_max": -4.566749572753906, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.0625, "epoch": 1.4913964768955834, "gen_logits_max": 2.4335989952087402, "gen_logits_mean": -18.586624145507812, "gen_logits_min": -30.75565528869629, "gen_logits_std": 3.271026134490967, "gen_loss": 0.3063446879386902, "grad_norm": 0.38302826677680873, "learning_rate": 2.1706105263157896e-05, "loss": 0.3077, "mean_copy_accuracy": 0.995597630739212, "mean_gen_accuracy": 0.8654463887214661, "mean_token_accuracy": 0.8974052220582962, "num_tokens": 895777697.0, "sample_num_tokens": 8614.25, "step": 7302, "total_num_tokens": 895812154.0, "z_loss": 0.0005360653158277273 }, { "copy_logits_max": -6.452320098876953, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.125, "epoch": 1.4916007148327801, "gen_logits_max": 4.483980655670166, "gen_logits_mean": -16.578941345214844, "gen_logits_min": -28.693851470947266, "gen_logits_std": 3.197265148162842, "gen_loss": 0.2712964415550232, "grad_norm": 0.38065808556099373, "learning_rate": 2.170484210526316e-05, "loss": 0.2803, "mean_copy_accuracy": 0.996658593416214, "mean_gen_accuracy": 0.8787517994642258, "mean_token_accuracy": 0.9049075245857239, "num_tokens": 896059366.0, "sample_num_tokens": 9396.5, "step": 7303, "total_num_tokens": 896096952.0, "z_loss": 0.00046447303611785173 }, { "copy_logits_max": -4.730443954467773, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.4375, "epoch": 1.491804952769977, "gen_logits_max": 4.138088703155518, "gen_logits_mean": -15.543356895446777, "gen_logits_min": -28.718013763427734, "gen_logits_std": 3.183722972869873, "gen_loss": 0.275016188621521, "grad_norm": 0.35433810562807216, "learning_rate": 2.170357894736842e-05, "loss": 0.272, "mean_copy_accuracy": 0.9965184777975082, "mean_gen_accuracy": 0.8778052628040314, "mean_token_accuracy": 0.9062074422836304, "num_tokens": 896332350.0, "sample_num_tokens": 8418.5, "step": 7304, "total_num_tokens": 896366024.0, "z_loss": 0.0005035426584072411 }, { "copy_logits_max": -7.210197448730469, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.75, "epoch": 1.492009190707174, "gen_logits_max": 3.7609176635742188, "gen_logits_mean": -17.969820022583008, "gen_logits_min": -29.72024154663086, "gen_logits_std": 3.2283222675323486, "gen_loss": 0.3146117031574249, "grad_norm": 0.34284626270924223, "learning_rate": 2.1702315789473686e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9961851835250854, "mean_gen_accuracy": 0.8741553574800491, "mean_token_accuracy": 0.9035538583993912, "num_tokens": 896599149.0, "sample_num_tokens": 7785.25, "step": 7305, "total_num_tokens": 896630290.0, "z_loss": 0.000511379272211343 }, { "copy_logits_max": -5.140013217926025, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.75, "epoch": 1.4922134286443707, "gen_logits_max": 5.042201519012451, "gen_logits_mean": -14.261305809020996, "gen_logits_min": -26.59185028076172, "gen_logits_std": 3.1738553047180176, "gen_loss": 0.2786043882369995, "grad_norm": 0.3578271478121417, "learning_rate": 2.1701052631578946e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9967804551124573, "mean_gen_accuracy": 0.8728695660829544, "mean_token_accuracy": 0.905759334564209, "num_tokens": 896855936.0, "sample_num_tokens": 8403.0, "step": 7306, "total_num_tokens": 896889548.0, "z_loss": 0.0005364902317523956 }, { "copy_logits_max": -6.613693714141846, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.6875, "epoch": 1.4924176665815676, "gen_logits_max": 4.970489501953125, "gen_logits_mean": -15.281716346740723, "gen_logits_min": -27.204689025878906, "gen_logits_std": 3.1840615272521973, "gen_loss": 0.2957199215888977, "grad_norm": 0.3995029319924846, "learning_rate": 2.169978947368421e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9958802461624146, "mean_gen_accuracy": 0.8721572458744049, "mean_token_accuracy": 0.8986139446496964, "num_tokens": 897113176.0, "sample_num_tokens": 8409.0, "step": 7307, "total_num_tokens": 897146812.0, "z_loss": 0.0005585579783655703 }, { "copy_logits_max": -3.0352180004119873, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.9375, "epoch": 1.4926219045187643, "gen_logits_max": 4.426772117614746, "gen_logits_mean": -14.933226585388184, "gen_logits_min": -27.3189754486084, "gen_logits_std": 3.1730687618255615, "gen_loss": 0.2589420974254608, "grad_norm": 0.3810221954344092, "learning_rate": 2.169852631578947e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9957684129476547, "mean_gen_accuracy": 0.877692848443985, "mean_token_accuracy": 0.9052494019269943, "num_tokens": 897379984.0, "sample_num_tokens": 9288.0, "step": 7308, "total_num_tokens": 897417136.0, "z_loss": 0.0004507637058850378 }, { "copy_logits_max": -7.784337043762207, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.75, "epoch": 1.4928261424559612, "gen_logits_max": 3.81990385055542, "gen_logits_mean": -16.988975524902344, "gen_logits_min": -28.853771209716797, "gen_logits_std": 3.2162625789642334, "gen_loss": 0.30616486072540283, "grad_norm": 0.3384228273422973, "learning_rate": 2.169726315789474e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9967696070671082, "mean_gen_accuracy": 0.8739845007658005, "mean_token_accuracy": 0.9058811217546463, "num_tokens": 897660747.0, "sample_num_tokens": 9591.25, "step": 7309, "total_num_tokens": 897699112.0, "z_loss": 0.0005173047538846731 }, { "copy_logits_max": -6.374273300170898, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.625, "epoch": 1.493030380393158, "gen_logits_max": 3.1396052837371826, "gen_logits_mean": -17.115720748901367, "gen_logits_min": -29.696910858154297, "gen_logits_std": 3.2544307708740234, "gen_loss": 0.25188952684402466, "grad_norm": 0.33027272114757517, "learning_rate": 2.1696e-05, "loss": 0.2608, "mean_copy_accuracy": 0.9975264966487885, "mean_gen_accuracy": 0.8754262328147888, "mean_token_accuracy": 0.9108163118362427, "num_tokens": 897958363.0, "sample_num_tokens": 8283.25, "step": 7310, "total_num_tokens": 897991496.0, "z_loss": 0.0004392068658489734 }, { "copy_logits_max": -5.539827346801758, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.0, "epoch": 1.4932346183303549, "gen_logits_max": 3.820666551589966, "gen_logits_mean": -16.00647735595703, "gen_logits_min": -28.147661209106445, "gen_logits_std": 3.1991186141967773, "gen_loss": 0.30761563777923584, "grad_norm": 0.3573462176208873, "learning_rate": 2.1694736842105265e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9976336061954498, "mean_gen_accuracy": 0.8725287020206451, "mean_token_accuracy": 0.9059710949659348, "num_tokens": 898237842.0, "sample_num_tokens": 9556.5, "step": 7311, "total_num_tokens": 898276068.0, "z_loss": 0.0005209692753851414 }, { "copy_logits_max": -4.735654354095459, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.4375, "epoch": 1.4934388562675518, "gen_logits_max": 4.261625289916992, "gen_logits_mean": -15.516090393066406, "gen_logits_min": -28.16527557373047, "gen_logits_std": 3.1981804370880127, "gen_loss": 0.2724676728248596, "grad_norm": 0.36062407200379826, "learning_rate": 2.1693473684210526e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9962015599012375, "mean_gen_accuracy": 0.8732576370239258, "mean_token_accuracy": 0.9022400677204132, "num_tokens": 898505593.0, "sample_num_tokens": 9083.75, "step": 7312, "total_num_tokens": 898541928.0, "z_loss": 0.0004536662017926574 }, { "copy_logits_max": -4.763132095336914, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.0, "epoch": 1.4936430942047485, "gen_logits_max": 3.946444511413574, "gen_logits_mean": -16.694351196289062, "gen_logits_min": -28.94649887084961, "gen_logits_std": 3.233546018600464, "gen_loss": 0.2756779193878174, "grad_norm": 0.3529914359718807, "learning_rate": 2.169221052631579e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9961098879575729, "mean_gen_accuracy": 0.8799489140510559, "mean_token_accuracy": 0.9035355746746063, "num_tokens": 898771415.0, "sample_num_tokens": 7528.25, "step": 7313, "total_num_tokens": 898801528.0, "z_loss": 0.0004816972650587559 }, { "copy_logits_max": -4.568061351776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 707.5, "epoch": 1.4938473321419454, "gen_logits_max": 3.3920979499816895, "gen_logits_mean": -16.764087677001953, "gen_logits_min": -29.527233123779297, "gen_logits_std": 3.251467227935791, "gen_loss": 0.23944583535194397, "grad_norm": 0.3692048517449963, "learning_rate": 2.1690947368421054e-05, "loss": 0.2452, "mean_copy_accuracy": 0.997107669711113, "mean_gen_accuracy": 0.8838633000850677, "mean_token_accuracy": 0.916343629360199, "num_tokens": 899047808.0, "sample_num_tokens": 10226.0, "step": 7314, "total_num_tokens": 899088712.0, "z_loss": 0.00045901426346972585 }, { "copy_logits_max": -4.206165313720703, "copy_logits_min": -687500032.0, "copy_num_tokens": 474.5, "epoch": 1.4940515700791421, "gen_logits_max": 3.8343400955200195, "gen_logits_mean": -17.301902770996094, "gen_logits_min": -29.715675354003906, "gen_logits_std": 3.258185863494873, "gen_loss": 0.2924707531929016, "grad_norm": 0.35156919904567774, "learning_rate": 2.1689684210526315e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9967768043279648, "mean_gen_accuracy": 0.8738423883914948, "mean_token_accuracy": 0.9039162397384644, "num_tokens": 899332694.0, "sample_num_tokens": 8816.5, "step": 7315, "total_num_tokens": 899367960.0, "z_loss": 0.00048342658556066453 }, { "copy_logits_max": -4.322351455688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.3125, "epoch": 1.494255808016339, "gen_logits_max": 4.84869384765625, "gen_logits_mean": -16.061918258666992, "gen_logits_min": -28.46091079711914, "gen_logits_std": 3.215824604034424, "gen_loss": 0.31580811738967896, "grad_norm": 0.3503092871702581, "learning_rate": 2.168842105263158e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9963721483945847, "mean_gen_accuracy": 0.8731188178062439, "mean_token_accuracy": 0.9006896764039993, "num_tokens": 899608904.0, "sample_num_tokens": 7382.5, "step": 7316, "total_num_tokens": 899638434.0, "z_loss": 0.0005635374109260738 }, { "copy_logits_max": -4.254561424255371, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.4375, "epoch": 1.4944600459535358, "gen_logits_max": 4.791965007781982, "gen_logits_mean": -14.98876953125, "gen_logits_min": -28.036338806152344, "gen_logits_std": 3.19504451751709, "gen_loss": 0.29994454979896545, "grad_norm": 0.3406183564692111, "learning_rate": 2.1687157894736844e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9960274696350098, "mean_gen_accuracy": 0.8783114105463028, "mean_token_accuracy": 0.9052609205245972, "num_tokens": 899873455.0, "sample_num_tokens": 7142.25, "step": 7317, "total_num_tokens": 899902024.0, "z_loss": 0.0005131997168064117 }, { "copy_logits_max": -5.642425537109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 218.625, "epoch": 1.4946642838907327, "gen_logits_max": 6.150400161743164, "gen_logits_mean": -15.521099090576172, "gen_logits_min": -27.432395935058594, "gen_logits_std": 3.187440872192383, "gen_loss": 0.30598166584968567, "grad_norm": 0.3540432908012719, "learning_rate": 2.1685894736842108e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9967904090881348, "mean_gen_accuracy": 0.8770571649074554, "mean_token_accuracy": 0.9046635031700134, "num_tokens": 900146974.0, "sample_num_tokens": 7208.5, "step": 7318, "total_num_tokens": 900175808.0, "z_loss": 0.0005123070441186428 }, { "copy_logits_max": -4.11866569519043, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.4375, "epoch": 1.4948685218279296, "gen_logits_max": 5.587249755859375, "gen_logits_mean": -15.10279655456543, "gen_logits_min": -27.481088638305664, "gen_logits_std": 3.1988656520843506, "gen_loss": 0.29993584752082825, "grad_norm": 0.37652602050838957, "learning_rate": 2.168463157894737e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9972119033336639, "mean_gen_accuracy": 0.8699356615543365, "mean_token_accuracy": 0.8992383778095245, "num_tokens": 900395547.0, "sample_num_tokens": 7560.75, "step": 7319, "total_num_tokens": 900425790.0, "z_loss": 0.0005116746760904789 }, { "copy_logits_max": 0.539116382598877, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.8125, "epoch": 1.4950727597651263, "gen_logits_max": 4.785161018371582, "gen_logits_mean": -14.943621635437012, "gen_logits_min": -27.203886032104492, "gen_logits_std": 3.2107081413269043, "gen_loss": 0.2523595690727234, "grad_norm": 0.3604149427572411, "learning_rate": 2.1683368421052633e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9970615804195404, "mean_gen_accuracy": 0.8791299760341644, "mean_token_accuracy": 0.9098249524831772, "num_tokens": 900674491.0, "sample_num_tokens": 8923.75, "step": 7320, "total_num_tokens": 900710186.0, "z_loss": 0.0004546145210042596 }, { "copy_logits_max": -3.608926773071289, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.3125, "epoch": 1.4952769977023233, "gen_logits_max": 4.979185104370117, "gen_logits_mean": -15.593469619750977, "gen_logits_min": -28.095443725585938, "gen_logits_std": 3.23183536529541, "gen_loss": 0.2662579417228699, "grad_norm": 0.36065954871675737, "learning_rate": 2.1682105263157894e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9966375380754471, "mean_gen_accuracy": 0.8757102936506271, "mean_token_accuracy": 0.9094781577587128, "num_tokens": 900950617.0, "sample_num_tokens": 8110.75, "step": 7321, "total_num_tokens": 900983060.0, "z_loss": 0.0004599857784342021 }, { "copy_logits_max": -0.7766467332839966, "copy_logits_min": -750000064.0, "copy_num_tokens": 576.25, "epoch": 1.4954812356395202, "gen_logits_max": 4.482851505279541, "gen_logits_mean": -14.569893836975098, "gen_logits_min": -27.281423568725586, "gen_logits_std": 3.2098920345306396, "gen_loss": 0.2629960775375366, "grad_norm": 0.3923923250987761, "learning_rate": 2.168084210526316e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9959606528282166, "mean_gen_accuracy": 0.8732338100671768, "mean_token_accuracy": 0.9019996076822281, "num_tokens": 901209011.0, "sample_num_tokens": 8637.25, "step": 7322, "total_num_tokens": 901243560.0, "z_loss": 0.00047317863209173083 }, { "copy_logits_max": -1.643501877784729, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.6875, "epoch": 1.4956854735767169, "gen_logits_max": 5.194669723510742, "gen_logits_mean": -15.514301300048828, "gen_logits_min": -27.774261474609375, "gen_logits_std": 3.2155075073242188, "gen_loss": 0.2712255120277405, "grad_norm": 0.3462242913370308, "learning_rate": 2.167957894736842e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9962000846862793, "mean_gen_accuracy": 0.8744486570358276, "mean_token_accuracy": 0.9023071229457855, "num_tokens": 901484451.0, "sample_num_tokens": 8005.25, "step": 7323, "total_num_tokens": 901516472.0, "z_loss": 0.000475610897410661 }, { "copy_logits_max": -0.9228124618530273, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.125, "epoch": 1.4958897115139136, "gen_logits_max": 5.442441463470459, "gen_logits_mean": -13.894977569580078, "gen_logits_min": -26.110151290893555, "gen_logits_std": 3.219407081604004, "gen_loss": 0.2499212920665741, "grad_norm": 0.37553559943890574, "learning_rate": 2.1678315789473684e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9969693720340729, "mean_gen_accuracy": 0.8756683021783829, "mean_token_accuracy": 0.9083777070045471, "num_tokens": 901762745.0, "sample_num_tokens": 8955.25, "step": 7324, "total_num_tokens": 901798566.0, "z_loss": 0.00040455773705616593 }, { "copy_logits_max": -1.1079448461532593, "copy_logits_min": -625000064.0, "copy_num_tokens": 593.875, "epoch": 1.4960939494511105, "gen_logits_max": 5.081249237060547, "gen_logits_mean": -15.148298263549805, "gen_logits_min": -27.83013916015625, "gen_logits_std": 3.245900869369507, "gen_loss": 0.24794867634773254, "grad_norm": 0.34379334992247834, "learning_rate": 2.1677052631578948e-05, "loss": 0.2738, "mean_copy_accuracy": 0.996682807803154, "mean_gen_accuracy": 0.8787666708230972, "mean_token_accuracy": 0.908955529332161, "num_tokens": 902043752.0, "sample_num_tokens": 9265.5, "step": 7325, "total_num_tokens": 902080814.0, "z_loss": 0.0004262130823917687 }, { "copy_logits_max": -0.2067345380783081, "copy_logits_min": -750000064.0, "copy_num_tokens": 440.0625, "epoch": 1.4962981873883074, "gen_logits_max": 3.996131181716919, "gen_logits_mean": -15.978129386901855, "gen_logits_min": -28.320491790771484, "gen_logits_std": 3.2495906352996826, "gen_loss": 0.2871621251106262, "grad_norm": 0.3571117724963931, "learning_rate": 2.1675789473684212e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9966624081134796, "mean_gen_accuracy": 0.8790435642004013, "mean_token_accuracy": 0.9068325906991959, "num_tokens": 902308973.0, "sample_num_tokens": 8643.75, "step": 7326, "total_num_tokens": 902343548.0, "z_loss": 0.0004511259321589023 }, { "copy_logits_max": -1.7328944206237793, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.5625, "epoch": 1.4965024253255041, "gen_logits_max": 4.950199127197266, "gen_logits_mean": -15.671796798706055, "gen_logits_min": -27.44980812072754, "gen_logits_std": 3.2279374599456787, "gen_loss": 0.2717330753803253, "grad_norm": 0.3552798448552343, "learning_rate": 2.1674526315789477e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9972657114267349, "mean_gen_accuracy": 0.8816863298416138, "mean_token_accuracy": 0.9089511930942535, "num_tokens": 902605389.0, "sample_num_tokens": 8367.25, "step": 7327, "total_num_tokens": 902638858.0, "z_loss": 0.00038351345574483275 }, { "copy_logits_max": -3.8793210983276367, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.9375, "epoch": 1.496706663262701, "gen_logits_max": 3.5082597732543945, "gen_logits_mean": -17.788179397583008, "gen_logits_min": -29.615093231201172, "gen_logits_std": 3.261545181274414, "gen_loss": 0.3184281587600708, "grad_norm": 0.4144735563018397, "learning_rate": 2.1673263157894738e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9954262375831604, "mean_gen_accuracy": 0.8685432374477386, "mean_token_accuracy": 0.8968456536531448, "num_tokens": 902859170.0, "sample_num_tokens": 7186.0, "step": 7328, "total_num_tokens": 902887914.0, "z_loss": 0.0004678161349147558 }, { "copy_logits_max": -0.2654506266117096, "copy_logits_min": -750000064.0, "copy_num_tokens": 624.4375, "epoch": 1.496910901199898, "gen_logits_max": 4.652695655822754, "gen_logits_mean": -14.464052200317383, "gen_logits_min": -27.156206130981445, "gen_logits_std": 3.233116626739502, "gen_loss": 0.25336983799934387, "grad_norm": 0.3611554038284677, "learning_rate": 2.1672000000000002e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9964330494403839, "mean_gen_accuracy": 0.876218855381012, "mean_token_accuracy": 0.9103426188230515, "num_tokens": 903133171.0, "sample_num_tokens": 9239.75, "step": 7329, "total_num_tokens": 903170130.0, "z_loss": 0.0003975558211095631 }, { "copy_logits_max": -0.3075811266899109, "copy_logits_min": -750000064.0, "copy_num_tokens": 490.0, "epoch": 1.4971151391370947, "gen_logits_max": 3.509121894836426, "gen_logits_mean": -16.405305862426758, "gen_logits_min": -28.59282875061035, "gen_logits_std": 3.25305438041687, "gen_loss": 0.27367711067199707, "grad_norm": 0.334958992597339, "learning_rate": 2.1670736842105263e-05, "loss": 0.281, "mean_copy_accuracy": 0.9967080056667328, "mean_gen_accuracy": 0.8770134299993515, "mean_token_accuracy": 0.9050658792257309, "num_tokens": 903405201.0, "sample_num_tokens": 8579.25, "step": 7330, "total_num_tokens": 903439518.0, "z_loss": 0.0003998338943347335 }, { "copy_logits_max": -2.4339325428009033, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.625, "epoch": 1.4973193770742914, "gen_logits_max": 3.1698575019836426, "gen_logits_mean": -18.10427474975586, "gen_logits_min": -30.24806785583496, "gen_logits_std": 3.3009166717529297, "gen_loss": 0.2847784161567688, "grad_norm": 0.3599401526654077, "learning_rate": 2.1669473684210527e-05, "loss": 0.281, "mean_copy_accuracy": 0.9970604479312897, "mean_gen_accuracy": 0.8750410377979279, "mean_token_accuracy": 0.9067564457654953, "num_tokens": 903682283.0, "sample_num_tokens": 8399.75, "step": 7331, "total_num_tokens": 903715882.0, "z_loss": 0.00047005340456962585 }, { "copy_logits_max": -1.4768072366714478, "copy_logits_min": -750000000.0, "copy_num_tokens": 772.0, "epoch": 1.4975236150114883, "gen_logits_max": 3.710521697998047, "gen_logits_mean": -15.495412826538086, "gen_logits_min": -27.876564025878906, "gen_logits_std": 3.2344236373901367, "gen_loss": 0.2826222777366638, "grad_norm": 0.3601220543273877, "learning_rate": 2.1668210526315788e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9961969703435898, "mean_gen_accuracy": 0.8813019245862961, "mean_token_accuracy": 0.9138815551996231, "num_tokens": 903949309.0, "sample_num_tokens": 10113.75, "step": 7332, "total_num_tokens": 903989764.0, "z_loss": 0.00046974181896075606 }, { "copy_logits_max": -4.4131317138671875, "copy_logits_min": -687500032.0, "copy_num_tokens": 661.125, "epoch": 1.4977278529486853, "gen_logits_max": 4.570268630981445, "gen_logits_mean": -15.388691902160645, "gen_logits_min": -28.08380126953125, "gen_logits_std": 3.2430896759033203, "gen_loss": 0.23442019522190094, "grad_norm": 0.3634102575925329, "learning_rate": 2.1666947368421052e-05, "loss": 0.2539, "mean_copy_accuracy": 0.9969507604837418, "mean_gen_accuracy": 0.88172547519207, "mean_token_accuracy": 0.9129035174846649, "num_tokens": 904226825.0, "sample_num_tokens": 9545.25, "step": 7333, "total_num_tokens": 904265006.0, "z_loss": 0.00038554746424779296 }, { "copy_logits_max": -4.02412748336792, "copy_logits_min": -750000064.0, "copy_num_tokens": 358.875, "epoch": 1.497932090885882, "gen_logits_max": 5.236032485961914, "gen_logits_mean": -15.143722534179688, "gen_logits_min": -27.296646118164062, "gen_logits_std": 3.21006441116333, "gen_loss": 0.3110145330429077, "grad_norm": 0.3157964563958992, "learning_rate": 2.1665684210526317e-05, "loss": 0.2764, "mean_copy_accuracy": 0.996326133608818, "mean_gen_accuracy": 0.8731033056974411, "mean_token_accuracy": 0.9059309810400009, "num_tokens": 904515342.0, "sample_num_tokens": 7566.5, "step": 7334, "total_num_tokens": 904545608.0, "z_loss": 0.0004987533902749419 }, { "copy_logits_max": -7.1786651611328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.8125, "epoch": 1.498136328823079, "gen_logits_max": 5.308379650115967, "gen_logits_mean": -15.89278793334961, "gen_logits_min": -27.88339614868164, "gen_logits_std": 3.227179527282715, "gen_loss": 0.27613264322280884, "grad_norm": 0.3696658971032855, "learning_rate": 2.166442105263158e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9960107505321503, "mean_gen_accuracy": 0.8839643448591232, "mean_token_accuracy": 0.9101540744304657, "num_tokens": 904781950.0, "sample_num_tokens": 7656.5, "step": 7335, "total_num_tokens": 904812576.0, "z_loss": 0.00047045055544003844 }, { "copy_logits_max": -3.304163932800293, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 1.4983405667602758, "gen_logits_max": 5.6535420417785645, "gen_logits_mean": -14.384647369384766, "gen_logits_min": -26.40814208984375, "gen_logits_std": 3.213074207305908, "gen_loss": 0.2434850037097931, "grad_norm": 0.3997028996426657, "learning_rate": 2.1663157894736842e-05, "loss": 0.29, "mean_copy_accuracy": 0.9964780062437057, "mean_gen_accuracy": 0.8756383806467056, "mean_token_accuracy": 0.9030041247606277, "num_tokens": 905019406.0, "sample_num_tokens": 7424.5, "step": 7336, "total_num_tokens": 905049104.0, "z_loss": 0.0003995092411059886 }, { "copy_logits_max": -4.603590488433838, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.9375, "epoch": 1.4985448046974725, "gen_logits_max": 4.975149631500244, "gen_logits_mean": -15.09825611114502, "gen_logits_min": -27.308879852294922, "gen_logits_std": 3.2073116302490234, "gen_loss": 0.28481775522232056, "grad_norm": 0.3497098385150417, "learning_rate": 2.1661894736842106e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9966827481985092, "mean_gen_accuracy": 0.8723303973674774, "mean_token_accuracy": 0.9045037031173706, "num_tokens": 905310542.0, "sample_num_tokens": 9028.0, "step": 7337, "total_num_tokens": 905346654.0, "z_loss": 0.0004322104505263269 }, { "copy_logits_max": -4.585046768188477, "copy_logits_min": -750000000.0, "copy_num_tokens": 690.75, "epoch": 1.4987490426346695, "gen_logits_max": 4.075788497924805, "gen_logits_mean": -15.037976264953613, "gen_logits_min": -27.50783920288086, "gen_logits_std": 3.2219104766845703, "gen_loss": 0.25965648889541626, "grad_norm": 0.37758785619445423, "learning_rate": 2.1660631578947367e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9959312975406647, "mean_gen_accuracy": 0.8660908937454224, "mean_token_accuracy": 0.9016036540269852, "num_tokens": 905597444.0, "sample_num_tokens": 9054.0, "step": 7338, "total_num_tokens": 905633660.0, "z_loss": 0.00047164232819341123 }, { "copy_logits_max": -6.54515266418457, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.0, "epoch": 1.4989532805718662, "gen_logits_max": 4.216711521148682, "gen_logits_mean": -16.305233001708984, "gen_logits_min": -28.339996337890625, "gen_logits_std": 3.259799003601074, "gen_loss": 0.24559232592582703, "grad_norm": 0.37715240661497706, "learning_rate": 2.165936842105263e-05, "loss": 0.2673, "mean_copy_accuracy": 0.997598260641098, "mean_gen_accuracy": 0.8823050260543823, "mean_token_accuracy": 0.9108785688877106, "num_tokens": 905881060.0, "sample_num_tokens": 8063.0, "step": 7339, "total_num_tokens": 905913312.0, "z_loss": 0.00043241397361271083 }, { "copy_logits_max": -7.820087432861328, "copy_logits_min": -750000064.0, "copy_num_tokens": 362.1875, "epoch": 1.499157518509063, "gen_logits_max": 5.590153217315674, "gen_logits_mean": -15.05744743347168, "gen_logits_min": -26.828327178955078, "gen_logits_std": 3.195539712905884, "gen_loss": 0.2830246090888977, "grad_norm": 0.3627280597861058, "learning_rate": 2.1658105263157896e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9966160356998444, "mean_gen_accuracy": 0.8784277141094208, "mean_token_accuracy": 0.9067357629537582, "num_tokens": 906144640.0, "sample_num_tokens": 7878.5, "step": 7340, "total_num_tokens": 906176154.0, "z_loss": 0.00045568379573524 }, { "copy_logits_max": -4.423842430114746, "copy_logits_min": -750000000.0, "copy_num_tokens": 728.3125, "epoch": 1.4993617564462598, "gen_logits_max": 5.3087968826293945, "gen_logits_mean": -14.361621856689453, "gen_logits_min": -26.508399963378906, "gen_logits_std": 3.188181161880493, "gen_loss": 0.23410087823867798, "grad_norm": 0.367925787450819, "learning_rate": 2.1656842105263157e-05, "loss": 0.284, "mean_copy_accuracy": 0.99643574655056, "mean_gen_accuracy": 0.8718340396881104, "mean_token_accuracy": 0.9041487127542496, "num_tokens": 906424940.0, "sample_num_tokens": 10425.5, "step": 7341, "total_num_tokens": 906466642.0, "z_loss": 0.0004401682526804507 }, { "copy_logits_max": -5.828509330749512, "copy_logits_min": -687500032.0, "copy_num_tokens": 505.125, "epoch": 1.4995659943834567, "gen_logits_max": 4.266707420349121, "gen_logits_mean": -16.39505386352539, "gen_logits_min": -28.44489288330078, "gen_logits_std": 3.238521099090576, "gen_loss": 0.2808816432952881, "grad_norm": 0.3543934423602128, "learning_rate": 2.1655578947368424e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9962090849876404, "mean_gen_accuracy": 0.8787635713815689, "mean_token_accuracy": 0.9058268517255783, "num_tokens": 906694736.0, "sample_num_tokens": 8718.5, "step": 7342, "total_num_tokens": 906729610.0, "z_loss": 0.0004920322098769248 }, { "copy_logits_max": -3.8336105346679688, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.1875, "epoch": 1.4997702323206537, "gen_logits_max": 5.935812473297119, "gen_logits_mean": -13.40754508972168, "gen_logits_min": -25.55044937133789, "gen_logits_std": 3.174934148788452, "gen_loss": 0.28074827790260315, "grad_norm": 0.3925815812828638, "learning_rate": 2.1654315789473685e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9966748952865601, "mean_gen_accuracy": 0.8711021095514297, "mean_token_accuracy": 0.9052885919809341, "num_tokens": 906966269.0, "sample_num_tokens": 8196.25, "step": 7343, "total_num_tokens": 906999054.0, "z_loss": 0.0004993776092305779 }, { "copy_logits_max": -4.28613805770874, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.75, "epoch": 1.4999744702578504, "gen_logits_max": 3.6222026348114014, "gen_logits_mean": -16.021081924438477, "gen_logits_min": -28.355806350708008, "gen_logits_std": 3.247997283935547, "gen_loss": 0.2669379413127899, "grad_norm": 0.34618172927386603, "learning_rate": 2.165305263157895e-05, "loss": 0.2819, "mean_copy_accuracy": 0.996550664305687, "mean_gen_accuracy": 0.8739578574895859, "mean_token_accuracy": 0.9036465585231781, "num_tokens": 907242878.0, "sample_num_tokens": 9345.0, "step": 7344, "total_num_tokens": 907280258.0, "z_loss": 0.0005935837980359793 }, { "copy_logits_max": -6.708405494689941, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.3125, "epoch": 1.5001787081950473, "gen_logits_max": 4.924871444702148, "gen_logits_mean": -14.443685531616211, "gen_logits_min": -26.86530113220215, "gen_logits_std": 3.2098939418792725, "gen_loss": 0.26148688793182373, "grad_norm": 0.370548229658298, "learning_rate": 2.165178947368421e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9969881922006607, "mean_gen_accuracy": 0.8733169585466385, "mean_token_accuracy": 0.9040088057518005, "num_tokens": 907539274.0, "sample_num_tokens": 8270.5, "step": 7345, "total_num_tokens": 907572356.0, "z_loss": 0.000495067797601223 }, { "copy_logits_max": -4.473248481750488, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.125, "epoch": 1.5003829461322442, "gen_logits_max": 5.053733825683594, "gen_logits_mean": -15.183682441711426, "gen_logits_min": -27.65562629699707, "gen_logits_std": 3.2223315238952637, "gen_loss": 0.28314948081970215, "grad_norm": 0.34361867716125866, "learning_rate": 2.1650526315789475e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9957514852285385, "mean_gen_accuracy": 0.8765898197889328, "mean_token_accuracy": 0.902035728096962, "num_tokens": 907800289.0, "sample_num_tokens": 7060.25, "step": 7346, "total_num_tokens": 907828530.0, "z_loss": 0.0005002787220291793 }, { "copy_logits_max": -4.689229965209961, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.5625, "epoch": 1.500587184069441, "gen_logits_max": 4.318137168884277, "gen_logits_mean": -15.366372108459473, "gen_logits_min": -27.70692253112793, "gen_logits_std": 3.2268216609954834, "gen_loss": 0.2840363383293152, "grad_norm": 0.3391300554062904, "learning_rate": 2.1649263157894736e-05, "loss": 0.2571, "mean_copy_accuracy": 0.9973718374967575, "mean_gen_accuracy": 0.8836687058210373, "mean_token_accuracy": 0.9149437844753265, "num_tokens": 908081362.0, "sample_num_tokens": 7937.0, "step": 7347, "total_num_tokens": 908113110.0, "z_loss": 0.0005192967946641147 }, { "copy_logits_max": -5.414380073547363, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.25, "epoch": 1.5007914220066376, "gen_logits_max": 3.2825875282287598, "gen_logits_mean": -16.904354095458984, "gen_logits_min": -28.81539535522461, "gen_logits_std": 3.2298672199249268, "gen_loss": 0.29553431272506714, "grad_norm": 0.3610614609334169, "learning_rate": 2.1648e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9967874139547348, "mean_gen_accuracy": 0.8727266937494278, "mean_token_accuracy": 0.9015575349330902, "num_tokens": 908344688.0, "sample_num_tokens": 9019.0, "step": 7348, "total_num_tokens": 908380764.0, "z_loss": 0.000495823216624558 }, { "copy_logits_max": -7.654739856719971, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.625, "epoch": 1.5009956599438345, "gen_logits_max": 4.587080478668213, "gen_logits_mean": -15.887297630310059, "gen_logits_min": -27.761837005615234, "gen_logits_std": 3.206972599029541, "gen_loss": 0.24684789776802063, "grad_norm": 0.34746929710522517, "learning_rate": 2.164673684210526e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9970481693744659, "mean_gen_accuracy": 0.8755426853895187, "mean_token_accuracy": 0.907886728644371, "num_tokens": 908632484.0, "sample_num_tokens": 8960.5, "step": 7349, "total_num_tokens": 908668326.0, "z_loss": 0.00039737968472763896 }, { "copy_logits_max": -4.854183673858643, "copy_logits_min": -625000064.0, "copy_num_tokens": 532.875, "epoch": 1.5011998978810315, "gen_logits_max": 3.1662850379943848, "gen_logits_mean": -17.170137405395508, "gen_logits_min": -29.41926383972168, "gen_logits_std": 3.2622008323669434, "gen_loss": 0.2790529131889343, "grad_norm": 0.3584338914675779, "learning_rate": 2.164547368421053e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9962925165891647, "mean_gen_accuracy": 0.8745176941156387, "mean_token_accuracy": 0.906283050775528, "num_tokens": 908921617.0, "sample_num_tokens": 8693.75, "step": 7350, "total_num_tokens": 908956392.0, "z_loss": 0.000452784588560462 }, { "copy_logits_max": -4.788518905639648, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.625, "epoch": 1.5014041358182282, "gen_logits_max": 4.778888702392578, "gen_logits_mean": -15.589532852172852, "gen_logits_min": -27.68169403076172, "gen_logits_std": 3.205822229385376, "gen_loss": 0.24426162242889404, "grad_norm": 0.38070262854620457, "learning_rate": 2.164421052631579e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9963563829660416, "mean_gen_accuracy": 0.8785082101821899, "mean_token_accuracy": 0.9074163287878036, "num_tokens": 909208274.0, "sample_num_tokens": 7976.5, "step": 7351, "total_num_tokens": 909240180.0, "z_loss": 0.00043587712571024895 }, { "copy_logits_max": -5.7260236740112305, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.4375, "epoch": 1.501608373755425, "gen_logits_max": 3.7863855361938477, "gen_logits_mean": -17.688058853149414, "gen_logits_min": -29.2840633392334, "gen_logits_std": 3.2259278297424316, "gen_loss": 0.2816377282142639, "grad_norm": 0.3811036820796839, "learning_rate": 2.1642947368421054e-05, "loss": 0.2933, "mean_copy_accuracy": 0.996127724647522, "mean_gen_accuracy": 0.8736579567193985, "mean_token_accuracy": 0.902393102645874, "num_tokens": 909453503.0, "sample_num_tokens": 7258.75, "step": 7352, "total_num_tokens": 909482538.0, "z_loss": 0.0005060537368990481 }, { "copy_logits_max": -6.875588417053223, "copy_logits_min": -687500032.0, "copy_num_tokens": 356.625, "epoch": 1.501812611692622, "gen_logits_max": 4.432015419006348, "gen_logits_mean": -15.897358894348145, "gen_logits_min": -27.924640655517578, "gen_logits_std": 3.1926939487457275, "gen_loss": 0.29821252822875977, "grad_norm": 0.3761479582323934, "learning_rate": 2.1641684210526318e-05, "loss": 0.272, "mean_copy_accuracy": 0.9966095089912415, "mean_gen_accuracy": 0.879096120595932, "mean_token_accuracy": 0.9055885225534439, "num_tokens": 909725209.0, "sample_num_tokens": 7610.25, "step": 7353, "total_num_tokens": 909755650.0, "z_loss": 0.0005005107377655804 }, { "copy_logits_max": -5.665345191955566, "copy_logits_min": -750000128.0, "copy_num_tokens": 527.75, "epoch": 1.5020168496298187, "gen_logits_max": 3.75785493850708, "gen_logits_mean": -15.527433395385742, "gen_logits_min": -27.874126434326172, "gen_logits_std": 3.207597494125366, "gen_loss": 0.24689581990242004, "grad_norm": 0.3806357887254184, "learning_rate": 2.164042105263158e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9971649795770645, "mean_gen_accuracy": 0.8749368488788605, "mean_token_accuracy": 0.9079049825668335, "num_tokens": 909976993.0, "sample_num_tokens": 8337.75, "step": 7354, "total_num_tokens": 910010344.0, "z_loss": 0.0004398948804009706 }, { "copy_logits_max": -6.432941436767578, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.875, "epoch": 1.5022210875670154, "gen_logits_max": 3.969177722930908, "gen_logits_mean": -15.635857582092285, "gen_logits_min": -27.895652770996094, "gen_logits_std": 3.1660966873168945, "gen_loss": 0.271124005317688, "grad_norm": 0.362362759269118, "learning_rate": 2.1639157894736843e-05, "loss": 0.274, "mean_copy_accuracy": 0.996036633849144, "mean_gen_accuracy": 0.8803270161151886, "mean_token_accuracy": 0.9061528295278549, "num_tokens": 910249173.0, "sample_num_tokens": 9496.75, "step": 7355, "total_num_tokens": 910287160.0, "z_loss": 0.00046909943921491504 }, { "copy_logits_max": -6.730855941772461, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.5, "epoch": 1.5024253255042124, "gen_logits_max": 4.677670001983643, "gen_logits_mean": -15.647618293762207, "gen_logits_min": -27.36796760559082, "gen_logits_std": 3.1717658042907715, "gen_loss": 0.2720600366592407, "grad_norm": 0.3219005241832047, "learning_rate": 2.1637894736842104e-05, "loss": 0.2584, "mean_copy_accuracy": 0.9968470335006714, "mean_gen_accuracy": 0.8835416287183762, "mean_token_accuracy": 0.9129849672317505, "num_tokens": 910534303.0, "sample_num_tokens": 8487.25, "step": 7356, "total_num_tokens": 910568252.0, "z_loss": 0.00046998003381304443 }, { "copy_logits_max": -5.509859085083008, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.6875, "epoch": 1.5026295634414093, "gen_logits_max": 4.405613899230957, "gen_logits_mean": -15.827567100524902, "gen_logits_min": -27.549179077148438, "gen_logits_std": 3.1561501026153564, "gen_loss": 0.30891674757003784, "grad_norm": 0.355533426811711, "learning_rate": 2.163663157894737e-05, "loss": 0.274, "mean_copy_accuracy": 0.9966689497232437, "mean_gen_accuracy": 0.8739665299654007, "mean_token_accuracy": 0.9056882858276367, "num_tokens": 910797193.0, "sample_num_tokens": 8513.25, "step": 7357, "total_num_tokens": 910831246.0, "z_loss": 0.0005569283966906369 }, { "copy_logits_max": -6.736174583435059, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.125, "epoch": 1.502833801378606, "gen_logits_max": 4.555966377258301, "gen_logits_mean": -15.445167541503906, "gen_logits_min": -27.360095977783203, "gen_logits_std": 3.1443371772766113, "gen_loss": 0.29605990648269653, "grad_norm": 0.3737397242600007, "learning_rate": 2.1635368421052633e-05, "loss": 0.289, "mean_copy_accuracy": 0.9966401904821396, "mean_gen_accuracy": 0.8704640418291092, "mean_token_accuracy": 0.9011535793542862, "num_tokens": 911063394.0, "sample_num_tokens": 8943.0, "step": 7358, "total_num_tokens": 911099166.0, "z_loss": 0.0005091903731226921 }, { "copy_logits_max": -6.975624084472656, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.0, "epoch": 1.503038039315803, "gen_logits_max": 4.531829833984375, "gen_logits_mean": -15.263540267944336, "gen_logits_min": -27.360191345214844, "gen_logits_std": 3.1476447582244873, "gen_loss": 0.28526270389556885, "grad_norm": 0.3513537663189782, "learning_rate": 2.1634105263157897e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9968020468950272, "mean_gen_accuracy": 0.8737054765224457, "mean_token_accuracy": 0.9060321152210236, "num_tokens": 911338772.0, "sample_num_tokens": 7869.0, "step": 7359, "total_num_tokens": 911370248.0, "z_loss": 0.0004782142932526767 }, { "copy_logits_max": -5.405902862548828, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.875, "epoch": 1.5032422772529999, "gen_logits_max": 3.865123987197876, "gen_logits_mean": -16.5793399810791, "gen_logits_min": -28.8817195892334, "gen_logits_std": 3.218076467514038, "gen_loss": 0.2735646963119507, "grad_norm": 0.36215165759460016, "learning_rate": 2.1632842105263158e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9961468875408173, "mean_gen_accuracy": 0.8775433450937271, "mean_token_accuracy": 0.9035815298557281, "num_tokens": 911582304.0, "sample_num_tokens": 7955.0, "step": 7360, "total_num_tokens": 911614124.0, "z_loss": 0.0004746380145661533 }, { "copy_logits_max": -6.7693634033203125, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.9375, "epoch": 1.5034465151901966, "gen_logits_max": 5.251796722412109, "gen_logits_mean": -15.050792694091797, "gen_logits_min": -27.835655212402344, "gen_logits_std": 3.1475133895874023, "gen_loss": 0.3056509494781494, "grad_norm": 0.3940007626967973, "learning_rate": 2.1631578947368423e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9973282366991043, "mean_gen_accuracy": 0.8740137368440628, "mean_token_accuracy": 0.9053602665662766, "num_tokens": 911860933.0, "sample_num_tokens": 7163.25, "step": 7361, "total_num_tokens": 911889586.0, "z_loss": 0.0005674633430317044 }, { "copy_logits_max": -3.9892032146453857, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.4375, "epoch": 1.5036507531273933, "gen_logits_max": 3.4521870613098145, "gen_logits_mean": -16.41629409790039, "gen_logits_min": -28.527301788330078, "gen_logits_std": 3.213595151901245, "gen_loss": 0.24941155314445496, "grad_norm": 0.3631759211252098, "learning_rate": 2.1630315789473684e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9977514445781708, "mean_gen_accuracy": 0.8747518807649612, "mean_token_accuracy": 0.9073542207479477, "num_tokens": 912125106.0, "sample_num_tokens": 8467.0, "step": 7362, "total_num_tokens": 912158974.0, "z_loss": 0.00044601113768294454 }, { "copy_logits_max": -5.875690460205078, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.5, "epoch": 1.5038549910645902, "gen_logits_max": 4.053903579711914, "gen_logits_mean": -15.62162971496582, "gen_logits_min": -28.24417495727539, "gen_logits_std": 3.191934585571289, "gen_loss": 0.3068912923336029, "grad_norm": 0.3778838934892687, "learning_rate": 2.1629052631578948e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9969221651554108, "mean_gen_accuracy": 0.872560903429985, "mean_token_accuracy": 0.9030100852251053, "num_tokens": 912380849.0, "sample_num_tokens": 8155.75, "step": 7363, "total_num_tokens": 912413472.0, "z_loss": 0.0005810413858853281 }, { "copy_logits_max": -6.469264984130859, "copy_logits_min": -750000128.0, "copy_num_tokens": 511.8125, "epoch": 1.5040592290017871, "gen_logits_max": 4.534084796905518, "gen_logits_mean": -14.741353988647461, "gen_logits_min": -27.220867156982422, "gen_logits_std": 3.1683573722839355, "gen_loss": 0.27573126554489136, "grad_norm": 0.3709471855372519, "learning_rate": 2.162778947368421e-05, "loss": 0.268, "mean_copy_accuracy": 0.9967481642961502, "mean_gen_accuracy": 0.877569392323494, "mean_token_accuracy": 0.9098215997219086, "num_tokens": 912654538.0, "sample_num_tokens": 8290.5, "step": 7364, "total_num_tokens": 912687700.0, "z_loss": 0.0005033074412494898 }, { "copy_logits_max": -7.879085540771484, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.375, "epoch": 1.5042634669389838, "gen_logits_max": 3.713135242462158, "gen_logits_mean": -17.062570571899414, "gen_logits_min": -29.069923400878906, "gen_logits_std": 3.204277992248535, "gen_loss": 0.26565659046173096, "grad_norm": 0.35353805222662693, "learning_rate": 2.1626526315789473e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9966963976621628, "mean_gen_accuracy": 0.881435826420784, "mean_token_accuracy": 0.9076569527387619, "num_tokens": 912917972.0, "sample_num_tokens": 7734.0, "step": 7365, "total_num_tokens": 912948908.0, "z_loss": 0.000531477271579206 }, { "copy_logits_max": -7.855899810791016, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.375, "epoch": 1.5044677048761808, "gen_logits_max": 5.409920692443848, "gen_logits_mean": -14.79105281829834, "gen_logits_min": -27.043073654174805, "gen_logits_std": 3.1140639781951904, "gen_loss": 0.29633498191833496, "grad_norm": 0.37213797426001516, "learning_rate": 2.1625263157894737e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9959691613912582, "mean_gen_accuracy": 0.875599667429924, "mean_token_accuracy": 0.9033065736293793, "num_tokens": 913155588.0, "sample_num_tokens": 7165.5, "step": 7366, "total_num_tokens": 913184250.0, "z_loss": 0.0005389029392972589 }, { "copy_logits_max": -6.574361801147461, "copy_logits_min": -750000000.0, "copy_num_tokens": 616.9375, "epoch": 1.5046719428133777, "gen_logits_max": 3.067085027694702, "gen_logits_mean": -16.80474090576172, "gen_logits_min": -29.625904083251953, "gen_logits_std": 3.2035293579101562, "gen_loss": 0.2797296643257141, "grad_norm": 0.34957797697400644, "learning_rate": 2.1624e-05, "loss": 0.2779, "mean_copy_accuracy": 0.996630996465683, "mean_gen_accuracy": 0.8743528872728348, "mean_token_accuracy": 0.9056823998689651, "num_tokens": 913420832.0, "sample_num_tokens": 9245.5, "step": 7367, "total_num_tokens": 913457814.0, "z_loss": 0.0005300113116391003 }, { "copy_logits_max": -6.2806572914123535, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.8125, "epoch": 1.5048761807505744, "gen_logits_max": 3.266643762588501, "gen_logits_mean": -17.845046997070312, "gen_logits_min": -29.97516632080078, "gen_logits_std": 3.2456445693969727, "gen_loss": 0.2639056444168091, "grad_norm": 0.3851497423709032, "learning_rate": 2.1622736842105266e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9968144595623016, "mean_gen_accuracy": 0.8758846372365952, "mean_token_accuracy": 0.9048074185848236, "num_tokens": 913678766.0, "sample_num_tokens": 7743.5, "step": 7368, "total_num_tokens": 913709740.0, "z_loss": 0.0004543656832538545 }, { "copy_logits_max": -7.99128532409668, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.3125, "epoch": 1.505080418687771, "gen_logits_max": 4.11573600769043, "gen_logits_mean": -15.400996208190918, "gen_logits_min": -27.590587615966797, "gen_logits_std": 3.1721181869506836, "gen_loss": 0.2989351153373718, "grad_norm": 0.3588757727878138, "learning_rate": 2.1621473684210527e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9965261667966843, "mean_gen_accuracy": 0.8818110823631287, "mean_token_accuracy": 0.9112021028995514, "num_tokens": 913942876.0, "sample_num_tokens": 9917.0, "step": 7369, "total_num_tokens": 913982544.0, "z_loss": 0.0005012759938836098 }, { "copy_logits_max": -7.897758960723877, "copy_logits_min": -687500096.0, "copy_num_tokens": 529.0, "epoch": 1.5052846566249682, "gen_logits_max": 2.9212241172790527, "gen_logits_mean": -16.958829879760742, "gen_logits_min": -29.627193450927734, "gen_logits_std": 3.2612357139587402, "gen_loss": 0.2264050543308258, "grad_norm": 0.40067099645435983, "learning_rate": 2.162021052631579e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9957749247550964, "mean_gen_accuracy": 0.877995029091835, "mean_token_accuracy": 0.9057315289974213, "num_tokens": 914214450.0, "sample_num_tokens": 8442.5, "step": 7370, "total_num_tokens": 914248220.0, "z_loss": 0.0003847131447400898 }, { "copy_logits_max": -5.448532581329346, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.75, "epoch": 1.505488894562165, "gen_logits_max": 3.877106189727783, "gen_logits_mean": -16.290691375732422, "gen_logits_min": -28.76333999633789, "gen_logits_std": 3.2135956287384033, "gen_loss": 0.2592114210128784, "grad_norm": 0.3489090402815838, "learning_rate": 2.1618947368421052e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9979587644338608, "mean_gen_accuracy": 0.8808863610029221, "mean_token_accuracy": 0.9094289690256119, "num_tokens": 914490652.0, "sample_num_tokens": 8974.5, "step": 7371, "total_num_tokens": 914526550.0, "z_loss": 0.00046977883903309703 }, { "copy_logits_max": -8.092727661132812, "copy_logits_min": -750000064.0, "copy_num_tokens": 426.5, "epoch": 1.5056931324993617, "gen_logits_max": 4.789492607116699, "gen_logits_mean": -15.195024490356445, "gen_logits_min": -27.926246643066406, "gen_logits_std": 3.188023090362549, "gen_loss": 0.2608639597892761, "grad_norm": 0.3574039483186271, "learning_rate": 2.1617684210526316e-05, "loss": 0.261, "mean_copy_accuracy": 0.9969169050455093, "mean_gen_accuracy": 0.8853797018527985, "mean_token_accuracy": 0.9109066873788834, "num_tokens": 914766570.0, "sample_num_tokens": 8794.5, "step": 7372, "total_num_tokens": 914801748.0, "z_loss": 0.00048125023022294044 }, { "copy_logits_max": -5.319273948669434, "copy_logits_min": -750000000.0, "copy_num_tokens": 642.0, "epoch": 1.5058973704365586, "gen_logits_max": 3.615112543106079, "gen_logits_mean": -15.234916687011719, "gen_logits_min": -27.92804718017578, "gen_logits_std": 3.2157692909240723, "gen_loss": 0.2816593647003174, "grad_norm": 0.3606449860351672, "learning_rate": 2.1616421052631577e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9969788044691086, "mean_gen_accuracy": 0.8769713789224625, "mean_token_accuracy": 0.9063268303871155, "num_tokens": 915031352.0, "sample_num_tokens": 9146.0, "step": 7373, "total_num_tokens": 915067936.0, "z_loss": 0.0005071893101558089 }, { "copy_logits_max": -7.502870082855225, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.0625, "epoch": 1.5061016083737555, "gen_logits_max": 3.517091751098633, "gen_logits_mean": -17.439922332763672, "gen_logits_min": -29.852588653564453, "gen_logits_std": 3.239288330078125, "gen_loss": 0.30533796548843384, "grad_norm": 0.3910345171108221, "learning_rate": 2.161515789473684e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9966978281736374, "mean_gen_accuracy": 0.8734606057405472, "mean_token_accuracy": 0.9026933759450912, "num_tokens": 915291434.0, "sample_num_tokens": 7244.0, "step": 7374, "total_num_tokens": 915320410.0, "z_loss": 0.0005609653890132904 }, { "copy_logits_max": -5.775121688842773, "copy_logits_min": -750000064.0, "copy_num_tokens": 402.5625, "epoch": 1.5063058463109522, "gen_logits_max": 4.866694450378418, "gen_logits_mean": -14.488423347473145, "gen_logits_min": -26.659709930419922, "gen_logits_std": 3.168163537979126, "gen_loss": 0.31403958797454834, "grad_norm": 0.3707117394598334, "learning_rate": 2.1613894736842106e-05, "loss": 0.2875, "mean_copy_accuracy": 0.996645599603653, "mean_gen_accuracy": 0.8735007792711258, "mean_token_accuracy": 0.9028545767068863, "num_tokens": 915558430.0, "sample_num_tokens": 8100.0, "step": 7375, "total_num_tokens": 915590830.0, "z_loss": 0.0005720111075788736 }, { "copy_logits_max": -6.229412078857422, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.5625, "epoch": 1.5065100842481491, "gen_logits_max": 4.1397857666015625, "gen_logits_mean": -15.908611297607422, "gen_logits_min": -28.38779067993164, "gen_logits_std": 3.2519311904907227, "gen_loss": 0.23992568254470825, "grad_norm": 0.4001006190843601, "learning_rate": 2.161263157894737e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9956563115119934, "mean_gen_accuracy": 0.8765628337860107, "mean_token_accuracy": 0.9034232199192047, "num_tokens": 915805197.0, "sample_num_tokens": 8664.25, "step": 7376, "total_num_tokens": 915839854.0, "z_loss": 0.000470551080070436 }, { "copy_logits_max": -6.815548419952393, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.875, "epoch": 1.506714322185346, "gen_logits_max": 4.987281799316406, "gen_logits_mean": -15.196239471435547, "gen_logits_min": -27.216773986816406, "gen_logits_std": 3.1818926334381104, "gen_loss": 0.29725804924964905, "grad_norm": 0.3624098054584525, "learning_rate": 2.161136842105263e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9971212446689606, "mean_gen_accuracy": 0.8738376945257187, "mean_token_accuracy": 0.898571789264679, "num_tokens": 916059005.0, "sample_num_tokens": 8297.25, "step": 7377, "total_num_tokens": 916092194.0, "z_loss": 0.00047904130769893527 }, { "copy_logits_max": -7.91335391998291, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.625, "epoch": 1.5069185601225428, "gen_logits_max": 3.902094841003418, "gen_logits_mean": -15.708627700805664, "gen_logits_min": -27.673627853393555, "gen_logits_std": 3.1942379474639893, "gen_loss": 0.3121626377105713, "grad_norm": 0.37051399900961596, "learning_rate": 2.1610105263157896e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9958099275827408, "mean_gen_accuracy": 0.8730116933584213, "mean_token_accuracy": 0.9032951146364212, "num_tokens": 916319635.0, "sample_num_tokens": 8622.75, "step": 7378, "total_num_tokens": 916354126.0, "z_loss": 0.0005114099476486444 }, { "copy_logits_max": -6.98780632019043, "copy_logits_min": -750000000.0, "copy_num_tokens": 794.625, "epoch": 1.5071227980597395, "gen_logits_max": 3.1476821899414062, "gen_logits_mean": -16.204391479492188, "gen_logits_min": -28.339149475097656, "gen_logits_std": 3.2280144691467285, "gen_loss": 0.25985878705978394, "grad_norm": 0.37491352095569797, "learning_rate": 2.1608842105263156e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9971009939908981, "mean_gen_accuracy": 0.8703332096338272, "mean_token_accuracy": 0.9001353085041046, "num_tokens": 916612802.0, "sample_num_tokens": 11268.0, "step": 7379, "total_num_tokens": 916657874.0, "z_loss": 0.0004128075670450926 }, { "copy_logits_max": -8.27670955657959, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.5625, "epoch": 1.5073270359969364, "gen_logits_max": 3.878868579864502, "gen_logits_mean": -15.786346435546875, "gen_logits_min": -28.362117767333984, "gen_logits_std": 3.207836627960205, "gen_loss": 0.25040480494499207, "grad_norm": 0.36696535547687953, "learning_rate": 2.160757894736842e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9961488395929337, "mean_gen_accuracy": 0.878545418381691, "mean_token_accuracy": 0.9099984169006348, "num_tokens": 916899684.0, "sample_num_tokens": 8298.5, "step": 7380, "total_num_tokens": 916932878.0, "z_loss": 0.00037698191590607166 }, { "copy_logits_max": -5.711489677429199, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0625, "epoch": 1.5075312739341333, "gen_logits_max": 4.358813285827637, "gen_logits_mean": -15.187849998474121, "gen_logits_min": -28.163368225097656, "gen_logits_std": 3.1937460899353027, "gen_loss": 0.2949657440185547, "grad_norm": 0.3355772912412187, "learning_rate": 2.1606315789473685e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9977602809667587, "mean_gen_accuracy": 0.8724697083234787, "mean_token_accuracy": 0.9096650928258896, "num_tokens": 917182645.0, "sample_num_tokens": 8059.75, "step": 7381, "total_num_tokens": 917214884.0, "z_loss": 0.00047949145664460957 }, { "copy_logits_max": -6.664172649383545, "copy_logits_min": -750000064.0, "copy_num_tokens": 581.8125, "epoch": 1.50773551187133, "gen_logits_max": 2.9164557456970215, "gen_logits_mean": -17.263736724853516, "gen_logits_min": -29.85814666748047, "gen_logits_std": 3.2585363388061523, "gen_loss": 0.2909526228904724, "grad_norm": 0.3910480321717454, "learning_rate": 2.1605052631578946e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9971469342708588, "mean_gen_accuracy": 0.8755490034818649, "mean_token_accuracy": 0.9096375107765198, "num_tokens": 917469479.0, "sample_num_tokens": 9331.75, "step": 7382, "total_num_tokens": 917506806.0, "z_loss": 0.00047990470193326473 }, { "copy_logits_max": -5.162744045257568, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.5625, "epoch": 1.507939749808527, "gen_logits_max": 3.0720410346984863, "gen_logits_mean": -16.777976989746094, "gen_logits_min": -30.12285614013672, "gen_logits_std": 3.262840509414673, "gen_loss": 0.27231961488723755, "grad_norm": 0.37808588539440474, "learning_rate": 2.1603789473684214e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9965673387050629, "mean_gen_accuracy": 0.8752307891845703, "mean_token_accuracy": 0.9056892096996307, "num_tokens": 917732076.0, "sample_num_tokens": 8727.5, "step": 7383, "total_num_tokens": 917766986.0, "z_loss": 0.0004739565192721784 }, { "copy_logits_max": -5.808564186096191, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.75, "epoch": 1.508143987745724, "gen_logits_max": 3.0216565132141113, "gen_logits_mean": -17.412723541259766, "gen_logits_min": -30.117237091064453, "gen_logits_std": 3.2742514610290527, "gen_loss": 0.2555919289588928, "grad_norm": 0.35519525278517966, "learning_rate": 2.1602526315789475e-05, "loss": 0.2718, "mean_copy_accuracy": 0.99693363904953, "mean_gen_accuracy": 0.878081887960434, "mean_token_accuracy": 0.9077571630477905, "num_tokens": 918001640.0, "sample_num_tokens": 9863.5, "step": 7384, "total_num_tokens": 918041094.0, "z_loss": 0.00042404839769005775 }, { "copy_logits_max": -6.613147258758545, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.4375, "epoch": 1.5083482256829206, "gen_logits_max": 3.337131977081299, "gen_logits_mean": -17.1884822845459, "gen_logits_min": -29.46686553955078, "gen_logits_std": 3.2567436695098877, "gen_loss": 0.26713481545448303, "grad_norm": 0.3840266376332984, "learning_rate": 2.160126315789474e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9962204098701477, "mean_gen_accuracy": 0.8725631386041641, "mean_token_accuracy": 0.9036044925451279, "num_tokens": 918267973.0, "sample_num_tokens": 8310.25, "step": 7385, "total_num_tokens": 918301214.0, "z_loss": 0.00047004641965031624 }, { "copy_logits_max": -7.35826301574707, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.875, "epoch": 1.5085524636201173, "gen_logits_max": 5.118045330047607, "gen_logits_mean": -13.839290618896484, "gen_logits_min": -26.8199462890625, "gen_logits_std": 3.1645026206970215, "gen_loss": 0.29873740673065186, "grad_norm": 0.3723585501563445, "learning_rate": 2.16e-05, "loss": 0.2758, "mean_copy_accuracy": 0.997068390250206, "mean_gen_accuracy": 0.8716485053300858, "mean_token_accuracy": 0.9051435440778732, "num_tokens": 918535049.0, "sample_num_tokens": 8679.75, "step": 7386, "total_num_tokens": 918569768.0, "z_loss": 0.0004919510101899505 }, { "copy_logits_max": -7.952496528625488, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.375, "epoch": 1.5087567015573142, "gen_logits_max": 3.942338466644287, "gen_logits_mean": -16.841670989990234, "gen_logits_min": -29.07913589477539, "gen_logits_std": 3.2535276412963867, "gen_loss": 0.24619527161121368, "grad_norm": 0.35094663697483963, "learning_rate": 2.1598736842105264e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9966990202665329, "mean_gen_accuracy": 0.8756910115480423, "mean_token_accuracy": 0.9098104387521744, "num_tokens": 918828524.0, "sample_num_tokens": 9005.5, "step": 7387, "total_num_tokens": 918864546.0, "z_loss": 0.00042565944022499025 }, { "copy_logits_max": -7.779704570770264, "copy_logits_min": -687500032.0, "copy_num_tokens": 701.75, "epoch": 1.5089609394945112, "gen_logits_max": 3.9280266761779785, "gen_logits_mean": -15.477300643920898, "gen_logits_min": -27.839641571044922, "gen_logits_std": 3.2066941261291504, "gen_loss": 0.2553512454032898, "grad_norm": 0.38141448109526416, "learning_rate": 2.1597473684210525e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9969512373209, "mean_gen_accuracy": 0.8759069442749023, "mean_token_accuracy": 0.9065288305282593, "num_tokens": 919109496.0, "sample_num_tokens": 10287.5, "step": 7388, "total_num_tokens": 919150646.0, "z_loss": 0.00046469972585327923 }, { "copy_logits_max": -5.14559268951416, "copy_logits_min": -750000064.0, "copy_num_tokens": 738.75, "epoch": 1.5091651774317079, "gen_logits_max": 3.2839016914367676, "gen_logits_mean": -16.177318572998047, "gen_logits_min": -28.759727478027344, "gen_logits_std": 3.2607293128967285, "gen_loss": 0.26815903186798096, "grad_norm": 0.3744551297963323, "learning_rate": 2.159621052631579e-05, "loss": 0.274, "mean_copy_accuracy": 0.9971161335706711, "mean_gen_accuracy": 0.8748932331800461, "mean_token_accuracy": 0.9083651751279831, "num_tokens": 919408175.0, "sample_num_tokens": 9818.75, "step": 7389, "total_num_tokens": 919447450.0, "z_loss": 0.0005153404781594872 }, { "copy_logits_max": -6.282578468322754, "copy_logits_min": -750000000.0, "copy_num_tokens": 750.5625, "epoch": 1.5093694153689048, "gen_logits_max": 2.7117323875427246, "gen_logits_mean": -16.921733856201172, "gen_logits_min": -29.24151611328125, "gen_logits_std": 3.2625350952148438, "gen_loss": 0.22965224087238312, "grad_norm": 0.35404310190267463, "learning_rate": 2.159494736842105e-05, "loss": 0.2552, "mean_copy_accuracy": 0.9976324439048767, "mean_gen_accuracy": 0.8811748623847961, "mean_token_accuracy": 0.915830209851265, "num_tokens": 919704289.0, "sample_num_tokens": 10653.75, "step": 7390, "total_num_tokens": 919746904.0, "z_loss": 0.00044357668957673013 }, { "copy_logits_max": -6.160649299621582, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.375, "epoch": 1.5095736533061017, "gen_logits_max": 3.6634159088134766, "gen_logits_mean": -16.346851348876953, "gen_logits_min": -28.52448272705078, "gen_logits_std": 3.189038038253784, "gen_loss": 0.29111289978027344, "grad_norm": 0.34070053002850337, "learning_rate": 2.1593684210526318e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9973525702953339, "mean_gen_accuracy": 0.8778845816850662, "mean_token_accuracy": 0.9074828624725342, "num_tokens": 919993853.0, "sample_num_tokens": 8622.75, "step": 7391, "total_num_tokens": 920028344.0, "z_loss": 0.0005054381326772273 }, { "copy_logits_max": -5.592217445373535, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.25, "epoch": 1.5097778912432984, "gen_logits_max": 5.461770057678223, "gen_logits_mean": -15.246126174926758, "gen_logits_min": -27.765079498291016, "gen_logits_std": 3.195523262023926, "gen_loss": 0.3106021583080292, "grad_norm": 0.3598322094608386, "learning_rate": 2.159242105263158e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9968788474798203, "mean_gen_accuracy": 0.8779821693897247, "mean_token_accuracy": 0.9084039330482483, "num_tokens": 920273610.0, "sample_num_tokens": 7069.0, "step": 7392, "total_num_tokens": 920301886.0, "z_loss": 0.0005257870070636272 }, { "copy_logits_max": -7.576814651489258, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.5, "epoch": 1.5099821291804951, "gen_logits_max": 4.534516334533691, "gen_logits_mean": -16.4158992767334, "gen_logits_min": -28.39020538330078, "gen_logits_std": 3.223822593688965, "gen_loss": 0.31069856882095337, "grad_norm": 0.37602183071753054, "learning_rate": 2.1591157894736843e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9968660473823547, "mean_gen_accuracy": 0.8714621216058731, "mean_token_accuracy": 0.9042539149522781, "num_tokens": 920537843.0, "sample_num_tokens": 7874.75, "step": 7393, "total_num_tokens": 920569342.0, "z_loss": 0.0004834046121686697 }, { "copy_logits_max": -4.916875839233398, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.6875, "epoch": 1.5101863671176923, "gen_logits_max": 4.38171911239624, "gen_logits_mean": -16.175146102905273, "gen_logits_min": -27.81991195678711, "gen_logits_std": 3.1951208114624023, "gen_loss": 0.27327680587768555, "grad_norm": 0.3775181463604845, "learning_rate": 2.1589894736842108e-05, "loss": 0.2592, "mean_copy_accuracy": 0.9968917667865753, "mean_gen_accuracy": 0.8786950409412384, "mean_token_accuracy": 0.9122239947319031, "num_tokens": 920832922.0, "sample_num_tokens": 8165.0, "step": 7394, "total_num_tokens": 920865582.0, "z_loss": 0.00043444824405014515 }, { "copy_logits_max": -5.594912528991699, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.75, "epoch": 1.510390605054889, "gen_logits_max": 5.097272872924805, "gen_logits_mean": -14.37452507019043, "gen_logits_min": -26.908184051513672, "gen_logits_std": 3.1848490238189697, "gen_loss": 0.25419217348098755, "grad_norm": 0.4139734177130572, "learning_rate": 2.158863157894737e-05, "loss": 0.285, "mean_copy_accuracy": 0.9963572025299072, "mean_gen_accuracy": 0.8728027641773224, "mean_token_accuracy": 0.9032817184925079, "num_tokens": 921092255.0, "sample_num_tokens": 7831.25, "step": 7395, "total_num_tokens": 921123580.0, "z_loss": 0.0004579091619234532 }, { "copy_logits_max": -7.374707221984863, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.4375, "epoch": 1.5105948429920857, "gen_logits_max": 3.8891589641571045, "gen_logits_mean": -16.385860443115234, "gen_logits_min": -28.77478790283203, "gen_logits_std": 3.223428726196289, "gen_loss": 0.2279037982225418, "grad_norm": 0.34202772520501223, "learning_rate": 2.1587368421052633e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9974616467952728, "mean_gen_accuracy": 0.8760341256856918, "mean_token_accuracy": 0.9053586721420288, "num_tokens": 921376263.0, "sample_num_tokens": 7459.75, "step": 7396, "total_num_tokens": 921406102.0, "z_loss": 0.00038237450644373894 }, { "copy_logits_max": -5.53069543838501, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.0625, "epoch": 1.5107990809292826, "gen_logits_max": 4.657373905181885, "gen_logits_mean": -15.120088577270508, "gen_logits_min": -27.352455139160156, "gen_logits_std": 3.190276861190796, "gen_loss": 0.3297812342643738, "grad_norm": 0.3753581281778283, "learning_rate": 2.1586105263157894e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9973421394824982, "mean_gen_accuracy": 0.8789838403463364, "mean_token_accuracy": 0.9110292196273804, "num_tokens": 921646101.0, "sample_num_tokens": 7937.75, "step": 7397, "total_num_tokens": 921677852.0, "z_loss": 0.0005683574127033353 }, { "copy_logits_max": -3.6131858825683594, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.4375, "epoch": 1.5110033188664795, "gen_logits_max": 3.7100729942321777, "gen_logits_mean": -16.489561080932617, "gen_logits_min": -28.897850036621094, "gen_logits_std": 3.254176616668701, "gen_loss": 0.30629047751426697, "grad_norm": 0.3624684569349879, "learning_rate": 2.1584842105263158e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9976396411657333, "mean_gen_accuracy": 0.8696193248033524, "mean_token_accuracy": 0.9058239459991455, "num_tokens": 921918069.0, "sample_num_tokens": 8624.75, "step": 7398, "total_num_tokens": 921952568.0, "z_loss": 0.0005114491796121001 }, { "copy_logits_max": -3.1286683082580566, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.1875, "epoch": 1.5112075568036762, "gen_logits_max": 4.642333030700684, "gen_logits_mean": -15.3704252243042, "gen_logits_min": -27.62205696105957, "gen_logits_std": 3.2167930603027344, "gen_loss": 0.2963603138923645, "grad_norm": 0.3764457934307742, "learning_rate": 2.1583578947368422e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9965689480304718, "mean_gen_accuracy": 0.8733353018760681, "mean_token_accuracy": 0.9028442651033401, "num_tokens": 922194376.0, "sample_num_tokens": 7747.5, "step": 7399, "total_num_tokens": 922225366.0, "z_loss": 0.0004652936477214098 }, { "copy_logits_max": -5.413053035736084, "copy_logits_min": -750000128.0, "copy_num_tokens": 439.6875, "epoch": 1.5114117947408732, "gen_logits_max": 4.627806186676025, "gen_logits_mean": -16.016523361206055, "gen_logits_min": -28.15618896484375, "gen_logits_std": 3.249054193496704, "gen_loss": 0.26110926270484924, "grad_norm": 0.38941031995844944, "learning_rate": 2.1582315789473687e-05, "loss": 0.2904, "mean_copy_accuracy": 0.9968863427639008, "mean_gen_accuracy": 0.8749202191829681, "mean_token_accuracy": 0.9024054706096649, "num_tokens": 922458185.0, "sample_num_tokens": 8314.25, "step": 7400, "total_num_tokens": 922491442.0, "z_loss": 0.0005150104407221079 }, { "copy_logits_max": -4.699440002441406, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.625, "epoch": 1.51161603267807, "gen_logits_max": 4.7708539962768555, "gen_logits_mean": -15.540233612060547, "gen_logits_min": -27.66183853149414, "gen_logits_std": 3.212000608444214, "gen_loss": 0.28008395433425903, "grad_norm": 0.3561397067077992, "learning_rate": 2.1581052631578948e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9972030073404312, "mean_gen_accuracy": 0.8814066499471664, "mean_token_accuracy": 0.9097685217857361, "num_tokens": 922733019.0, "sample_num_tokens": 9245.25, "step": 7401, "total_num_tokens": 922770000.0, "z_loss": 0.0005200746236369014 }, { "copy_logits_max": -4.873576641082764, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.4375, "epoch": 1.5118202706152668, "gen_logits_max": 4.572436332702637, "gen_logits_mean": -15.14518928527832, "gen_logits_min": -27.522459030151367, "gen_logits_std": 3.2093138694763184, "gen_loss": 0.29084163904190063, "grad_norm": 0.38004616166134186, "learning_rate": 2.1579789473684212e-05, "loss": 0.2804, "mean_copy_accuracy": 0.996316522359848, "mean_gen_accuracy": 0.8739672750234604, "mean_token_accuracy": 0.9059416800737381, "num_tokens": 923000758.0, "sample_num_tokens": 7853.0, "step": 7402, "total_num_tokens": 923032170.0, "z_loss": 0.0005651118699461222 }, { "copy_logits_max": -5.416796684265137, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.0625, "epoch": 1.5120245085524635, "gen_logits_max": 4.3616461753845215, "gen_logits_mean": -16.794099807739258, "gen_logits_min": -28.700233459472656, "gen_logits_std": 3.2155990600585938, "gen_loss": 0.29248741269111633, "grad_norm": 0.3466059659147462, "learning_rate": 2.1578526315789473e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9970537722110748, "mean_gen_accuracy": 0.8789899051189423, "mean_token_accuracy": 0.9074116945266724, "num_tokens": 923300995.0, "sample_num_tokens": 9673.75, "step": 7403, "total_num_tokens": 923339690.0, "z_loss": 0.0005485062720254064 }, { "copy_logits_max": -2.418818473815918, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.125, "epoch": 1.5122287464896604, "gen_logits_max": 3.6039340496063232, "gen_logits_mean": -16.347015380859375, "gen_logits_min": -28.68185043334961, "gen_logits_std": 3.2666540145874023, "gen_loss": 0.2721980810165405, "grad_norm": 0.35365113734636733, "learning_rate": 2.1577263157894737e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9958650320768356, "mean_gen_accuracy": 0.8787039518356323, "mean_token_accuracy": 0.9063645005226135, "num_tokens": 923589673.0, "sample_num_tokens": 8366.75, "step": 7404, "total_num_tokens": 923623140.0, "z_loss": 0.000593440025113523 }, { "copy_logits_max": -3.0576345920562744, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.5, "epoch": 1.5124329844268574, "gen_logits_max": 4.066415309906006, "gen_logits_mean": -16.05945587158203, "gen_logits_min": -28.171159744262695, "gen_logits_std": 3.2269763946533203, "gen_loss": 0.2583751380443573, "grad_norm": 0.3493231550522984, "learning_rate": 2.1575999999999998e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9964828044176102, "mean_gen_accuracy": 0.8808091282844543, "mean_token_accuracy": 0.9103474020957947, "num_tokens": 923881802.0, "sample_num_tokens": 8000.0, "step": 7405, "total_num_tokens": 923913802.0, "z_loss": 0.00047250190982595086 }, { "copy_logits_max": -3.857419729232788, "copy_logits_min": -750000064.0, "copy_num_tokens": 660.375, "epoch": 1.512637222364054, "gen_logits_max": 2.367976665496826, "gen_logits_mean": -18.883573532104492, "gen_logits_min": -31.06509017944336, "gen_logits_std": 3.3187880516052246, "gen_loss": 0.280830979347229, "grad_norm": 0.3597175757774696, "learning_rate": 2.1574736842105262e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9967536330223083, "mean_gen_accuracy": 0.8759428560733795, "mean_token_accuracy": 0.904106855392456, "num_tokens": 924150064.0, "sample_num_tokens": 10230.0, "step": 7406, "total_num_tokens": 924190984.0, "z_loss": 0.000499033136293292 }, { "copy_logits_max": -7.649705410003662, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.3125, "epoch": 1.512841460301251, "gen_logits_max": 3.7532806396484375, "gen_logits_mean": -17.731924057006836, "gen_logits_min": -29.8717098236084, "gen_logits_std": 3.2569925785064697, "gen_loss": 0.27954527735710144, "grad_norm": 0.3321369085217234, "learning_rate": 2.157347368421053e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9959770888090134, "mean_gen_accuracy": 0.8812148571014404, "mean_token_accuracy": 0.910097748041153, "num_tokens": 924447934.0, "sample_num_tokens": 7729.5, "step": 7407, "total_num_tokens": 924478852.0, "z_loss": 0.0004589296877384186 }, { "copy_logits_max": -4.378553867340088, "copy_logits_min": -750000064.0, "copy_num_tokens": 532.375, "epoch": 1.513045698238448, "gen_logits_max": 2.851205348968506, "gen_logits_mean": -17.74960708618164, "gen_logits_min": -29.859386444091797, "gen_logits_std": 3.2816548347473145, "gen_loss": 0.2691850960254669, "grad_norm": 0.36943579676091454, "learning_rate": 2.157221052631579e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9968073517084122, "mean_gen_accuracy": 0.8739617317914963, "mean_token_accuracy": 0.9070322215557098, "num_tokens": 924733470.0, "sample_num_tokens": 9464.5, "step": 7408, "total_num_tokens": 924771328.0, "z_loss": 0.00044652214273810387 }, { "copy_logits_max": -6.104771614074707, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.125, "epoch": 1.5132499361756446, "gen_logits_max": 4.667370796203613, "gen_logits_mean": -16.167699813842773, "gen_logits_min": -27.867450714111328, "gen_logits_std": 3.1952426433563232, "gen_loss": 0.3093912601470947, "grad_norm": 0.37818133877424354, "learning_rate": 2.1570947368421055e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9963397085666656, "mean_gen_accuracy": 0.8733662366867065, "mean_token_accuracy": 0.9005580842494965, "num_tokens": 924993825.0, "sample_num_tokens": 7453.75, "step": 7409, "total_num_tokens": 925023640.0, "z_loss": 0.000495469372253865 }, { "copy_logits_max": -5.944672584533691, "copy_logits_min": -687500096.0, "copy_num_tokens": 491.4375, "epoch": 1.5134541741128413, "gen_logits_max": 3.7467620372772217, "gen_logits_mean": -15.54498291015625, "gen_logits_min": -28.11981964111328, "gen_logits_std": 3.232642650604248, "gen_loss": 0.25951582193374634, "grad_norm": 0.3481094509346601, "learning_rate": 2.1569684210526316e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9974664896726608, "mean_gen_accuracy": 0.8764225840568542, "mean_token_accuracy": 0.9086264222860336, "num_tokens": 925240274.0, "sample_num_tokens": 8244.5, "step": 7410, "total_num_tokens": 925273252.0, "z_loss": 0.00047026871470734477 }, { "copy_logits_max": -2.8251256942749023, "copy_logits_min": -750000000.0, "copy_num_tokens": 821.5, "epoch": 1.5136584120500383, "gen_logits_max": 3.3666787147521973, "gen_logits_mean": -16.304105758666992, "gen_logits_min": -28.710107803344727, "gen_logits_std": 3.262937307357788, "gen_loss": 0.2347526103258133, "grad_norm": 0.33130624089684785, "learning_rate": 2.156842105263158e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9966063946485519, "mean_gen_accuracy": 0.8766952753067017, "mean_token_accuracy": 0.9098428636789322, "num_tokens": 925516437.0, "sample_num_tokens": 10858.75, "step": 7411, "total_num_tokens": 925559872.0, "z_loss": 0.00040733860805630684 }, { "copy_logits_max": -5.494061470031738, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.875, "epoch": 1.5138626499872352, "gen_logits_max": 3.5675790309906006, "gen_logits_mean": -17.248733520507812, "gen_logits_min": -29.364139556884766, "gen_logits_std": 3.251288414001465, "gen_loss": 0.2660445272922516, "grad_norm": 0.31849207973698845, "learning_rate": 2.156715789473684e-05, "loss": 0.2675, "mean_copy_accuracy": 0.996632993221283, "mean_gen_accuracy": 0.88352270424366, "mean_token_accuracy": 0.9089638143777847, "num_tokens": 925797685.0, "sample_num_tokens": 7122.25, "step": 7412, "total_num_tokens": 925826174.0, "z_loss": 0.00045661127660423517 }, { "copy_logits_max": -6.483902931213379, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.4375, "epoch": 1.514066887924432, "gen_logits_max": 4.601597785949707, "gen_logits_mean": -15.741029739379883, "gen_logits_min": -28.00759506225586, "gen_logits_std": 3.17982816696167, "gen_loss": 0.3146522045135498, "grad_norm": 0.36877400001688393, "learning_rate": 2.1565894736842106e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9962052255868912, "mean_gen_accuracy": 0.8792935460805893, "mean_token_accuracy": 0.9023807644844055, "num_tokens": 926056608.0, "sample_num_tokens": 6941.5, "step": 7413, "total_num_tokens": 926084374.0, "z_loss": 0.0005896262591704726 }, { "copy_logits_max": -6.938203811645508, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.75, "epoch": 1.5142711258616288, "gen_logits_max": 4.226601600646973, "gen_logits_mean": -16.817157745361328, "gen_logits_min": -28.580102920532227, "gen_logits_std": 3.2178239822387695, "gen_loss": 0.2485605627298355, "grad_norm": 0.3695875017462915, "learning_rate": 2.1564631578947367e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9963230937719345, "mean_gen_accuracy": 0.8710546046495438, "mean_token_accuracy": 0.9022192060947418, "num_tokens": 926336071.0, "sample_num_tokens": 7834.75, "step": 7414, "total_num_tokens": 926367410.0, "z_loss": 0.00045364792458713055 }, { "copy_logits_max": -7.202559471130371, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.5625, "epoch": 1.5144753637988257, "gen_logits_max": 3.7358670234680176, "gen_logits_mean": -16.704444885253906, "gen_logits_min": -29.037858963012695, "gen_logits_std": 3.2401814460754395, "gen_loss": 0.2659556269645691, "grad_norm": 0.35865551041092797, "learning_rate": 2.1563368421052634e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9966229498386383, "mean_gen_accuracy": 0.8796529173851013, "mean_token_accuracy": 0.9073244780302048, "num_tokens": 926578932.0, "sample_num_tokens": 8206.5, "step": 7415, "total_num_tokens": 926611758.0, "z_loss": 0.00046717363875359297 }, { "copy_logits_max": -7.005356311798096, "copy_logits_min": -750000064.0, "copy_num_tokens": 427.5625, "epoch": 1.5146796017360225, "gen_logits_max": 3.487330675125122, "gen_logits_mean": -17.40224838256836, "gen_logits_min": -29.8839111328125, "gen_logits_std": 3.268841028213501, "gen_loss": 0.2715909779071808, "grad_norm": 0.3364706124893278, "learning_rate": 2.1562105263157895e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9964366108179092, "mean_gen_accuracy": 0.8793160766363144, "mean_token_accuracy": 0.9055797308683395, "num_tokens": 926851396.0, "sample_num_tokens": 7877.5, "step": 7416, "total_num_tokens": 926882906.0, "z_loss": 0.0004914157325401902 }, { "copy_logits_max": -6.283900260925293, "copy_logits_min": -750000064.0, "copy_num_tokens": 486.0, "epoch": 1.5148838396732192, "gen_logits_max": 4.2611260414123535, "gen_logits_mean": -15.833881378173828, "gen_logits_min": -28.85748291015625, "gen_logits_std": 3.225010871887207, "gen_loss": 0.2536289691925049, "grad_norm": 0.34972840947543365, "learning_rate": 2.156084210526316e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9968632012605667, "mean_gen_accuracy": 0.8799994736909866, "mean_token_accuracy": 0.9116149693727493, "num_tokens": 927111760.0, "sample_num_tokens": 8534.5, "step": 7417, "total_num_tokens": 927145898.0, "z_loss": 0.00044284953037276864 }, { "copy_logits_max": -7.147663116455078, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.875, "epoch": 1.515088077610416, "gen_logits_max": 3.5248374938964844, "gen_logits_mean": -17.43020248413086, "gen_logits_min": -29.884212493896484, "gen_logits_std": 3.2704105377197266, "gen_loss": 0.26638033986091614, "grad_norm": 0.3623361482211846, "learning_rate": 2.155957894736842e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9965140968561172, "mean_gen_accuracy": 0.878458708524704, "mean_token_accuracy": 0.9082069545984268, "num_tokens": 927373543.0, "sample_num_tokens": 8276.75, "step": 7418, "total_num_tokens": 927406650.0, "z_loss": 0.0004897629260085523 }, { "copy_logits_max": -3.655459403991699, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.75, "epoch": 1.515292315547613, "gen_logits_max": 4.852914810180664, "gen_logits_mean": -15.231407165527344, "gen_logits_min": -27.312789916992188, "gen_logits_std": 3.1688079833984375, "gen_loss": 0.30205821990966797, "grad_norm": 0.3375336807496563, "learning_rate": 2.1558315789473685e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9971567541360855, "mean_gen_accuracy": 0.8773812055587769, "mean_token_accuracy": 0.9082989245653152, "num_tokens": 927650160.0, "sample_num_tokens": 7791.0, "step": 7419, "total_num_tokens": 927681324.0, "z_loss": 0.0005411329329945147 }, { "copy_logits_max": -5.967223644256592, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.9375, "epoch": 1.5154965534848097, "gen_logits_max": 4.4462995529174805, "gen_logits_mean": -16.672815322875977, "gen_logits_min": -28.898242950439453, "gen_logits_std": 3.180349111557007, "gen_loss": 0.3328682780265808, "grad_norm": 0.39728630523877545, "learning_rate": 2.155705263157895e-05, "loss": 0.29, "mean_copy_accuracy": 0.996281087398529, "mean_gen_accuracy": 0.8744518905878067, "mean_token_accuracy": 0.9033090621232986, "num_tokens": 927894229.0, "sample_num_tokens": 7540.25, "step": 7420, "total_num_tokens": 927924390.0, "z_loss": 0.0005898196250200272 }, { "copy_logits_max": -4.046454429626465, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5, "epoch": 1.5157007914220066, "gen_logits_max": 2.901937484741211, "gen_logits_mean": -17.869850158691406, "gen_logits_min": -29.61043930053711, "gen_logits_std": 3.234382152557373, "gen_loss": 0.26006174087524414, "grad_norm": 0.37563001355501946, "learning_rate": 2.155578947368421e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9977326095104218, "mean_gen_accuracy": 0.876554325222969, "mean_token_accuracy": 0.9033775478601456, "num_tokens": 928174439.0, "sample_num_tokens": 8553.25, "step": 7421, "total_num_tokens": 928208652.0, "z_loss": 0.00042669946560636163 }, { "copy_logits_max": -7.686418533325195, "copy_logits_min": -750000064.0, "copy_num_tokens": 275.9375, "epoch": 1.5159050293592036, "gen_logits_max": 4.466916561126709, "gen_logits_mean": -16.801301956176758, "gen_logits_min": -29.24451446533203, "gen_logits_std": 3.21014404296875, "gen_loss": 0.28244900703430176, "grad_norm": 0.3846689971482382, "learning_rate": 2.1554526315789474e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9962612688541412, "mean_gen_accuracy": 0.8761157989501953, "mean_token_accuracy": 0.9043036103248596, "num_tokens": 928429452.0, "sample_num_tokens": 7370.0, "step": 7422, "total_num_tokens": 928458932.0, "z_loss": 0.000496238877531141 }, { "copy_logits_max": -6.15842342376709, "copy_logits_min": -687500032.0, "copy_num_tokens": 504.875, "epoch": 1.5161092672964003, "gen_logits_max": 2.686452865600586, "gen_logits_mean": -17.443603515625, "gen_logits_min": -29.654541015625, "gen_logits_std": 3.2365481853485107, "gen_loss": 0.22797918319702148, "grad_norm": 0.34171857757073815, "learning_rate": 2.1553263157894735e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9969236105680466, "mean_gen_accuracy": 0.8735009431838989, "mean_token_accuracy": 0.9051232635974884, "num_tokens": 928687261.0, "sample_num_tokens": 8071.75, "step": 7423, "total_num_tokens": 928719548.0, "z_loss": 0.000374104012735188 }, { "copy_logits_max": -6.842195510864258, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.6875, "epoch": 1.516313505233597, "gen_logits_max": 3.806330680847168, "gen_logits_mean": -16.586997985839844, "gen_logits_min": -29.049915313720703, "gen_logits_std": 3.228487014770508, "gen_loss": 0.28617751598358154, "grad_norm": 0.3384927967843589, "learning_rate": 2.1552000000000003e-05, "loss": 0.261, "mean_copy_accuracy": 0.9974403977394104, "mean_gen_accuracy": 0.8812734335660934, "mean_token_accuracy": 0.9121065735816956, "num_tokens": 928962928.0, "sample_num_tokens": 8136.0, "step": 7424, "total_num_tokens": 928995472.0, "z_loss": 0.0004547480493783951 }, { "copy_logits_max": -5.848880767822266, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.5, "epoch": 1.5165177431707941, "gen_logits_max": 3.828279495239258, "gen_logits_mean": -15.416824340820312, "gen_logits_min": -28.06574249267578, "gen_logits_std": 3.206199884414673, "gen_loss": 0.28711992502212524, "grad_norm": 0.37094545810416885, "learning_rate": 2.1550736842105264e-05, "loss": 0.2864, "mean_copy_accuracy": 0.99711012840271, "mean_gen_accuracy": 0.8714907467365265, "mean_token_accuracy": 0.9053244590759277, "num_tokens": 929255629.0, "sample_num_tokens": 8800.75, "step": 7425, "total_num_tokens": 929290832.0, "z_loss": 0.0004896309692412615 }, { "copy_logits_max": -5.9612531661987305, "copy_logits_min": -750000064.0, "copy_num_tokens": 909.75, "epoch": 1.5167219811079908, "gen_logits_max": 3.2986879348754883, "gen_logits_mean": -16.118316650390625, "gen_logits_min": -28.514190673828125, "gen_logits_std": 3.260012626647949, "gen_loss": 0.2484728991985321, "grad_norm": 0.3389957779148338, "learning_rate": 2.1549473684210528e-05, "loss": 0.267, "mean_copy_accuracy": 0.9965433478355408, "mean_gen_accuracy": 0.8809797167778015, "mean_token_accuracy": 0.9096676111221313, "num_tokens": 929540782.0, "sample_num_tokens": 11248.5, "step": 7426, "total_num_tokens": 929585776.0, "z_loss": 0.0004707368207164109 }, { "copy_logits_max": -6.240853309631348, "copy_logits_min": -750000000.0, "copy_num_tokens": 736.4375, "epoch": 1.5169262190451875, "gen_logits_max": 3.0829896926879883, "gen_logits_mean": -17.01576042175293, "gen_logits_min": -29.5648193359375, "gen_logits_std": 3.2792539596557617, "gen_loss": 0.2672901749610901, "grad_norm": 0.34870407081131344, "learning_rate": 2.154821052631579e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9975002408027649, "mean_gen_accuracy": 0.8698835372924805, "mean_token_accuracy": 0.9035662412643433, "num_tokens": 929828581.0, "sample_num_tokens": 10541.75, "step": 7427, "total_num_tokens": 929870748.0, "z_loss": 0.00047401024494320154 }, { "copy_logits_max": -5.888104438781738, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.0, "epoch": 1.5171304569823845, "gen_logits_max": 4.334277629852295, "gen_logits_mean": -15.00103759765625, "gen_logits_min": -27.345600128173828, "gen_logits_std": 3.2301535606384277, "gen_loss": 0.25928282737731934, "grad_norm": 0.33150023311683485, "learning_rate": 2.1546947368421054e-05, "loss": 0.2493, "mean_copy_accuracy": 0.9968224316835403, "mean_gen_accuracy": 0.8843042701482773, "mean_token_accuracy": 0.9142761379480362, "num_tokens": 930109860.0, "sample_num_tokens": 7477.0, "step": 7428, "total_num_tokens": 930139768.0, "z_loss": 0.00043529339018277824 }, { "copy_logits_max": -7.395833492279053, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.5625, "epoch": 1.5173346949195814, "gen_logits_max": 4.131977081298828, "gen_logits_mean": -16.580322265625, "gen_logits_min": -28.989112854003906, "gen_logits_std": 3.26615309715271, "gen_loss": 0.26399171352386475, "grad_norm": 0.3606761023487511, "learning_rate": 2.1545684210526314e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9968913644552231, "mean_gen_accuracy": 0.8730456978082657, "mean_token_accuracy": 0.9038238525390625, "num_tokens": 930383992.0, "sample_num_tokens": 7546.5, "step": 7429, "total_num_tokens": 930414178.0, "z_loss": 0.000431546795880422 }, { "copy_logits_max": -8.804559707641602, "copy_logits_min": -750000000.0, "copy_num_tokens": 202.75, "epoch": 1.517538932856778, "gen_logits_max": 4.013179302215576, "gen_logits_mean": -18.145103454589844, "gen_logits_min": -29.971309661865234, "gen_logits_std": 3.285853385925293, "gen_loss": 0.2711101174354553, "grad_norm": 0.3862918495924795, "learning_rate": 2.154442105263158e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9951619654893875, "mean_gen_accuracy": 0.8838271945714951, "mean_token_accuracy": 0.9075860679149628, "num_tokens": 930642161.0, "sample_num_tokens": 6670.25, "step": 7430, "total_num_tokens": 930668842.0, "z_loss": 0.0004367489309515804 }, { "copy_logits_max": -4.429365634918213, "copy_logits_min": -750000064.0, "copy_num_tokens": 594.625, "epoch": 1.517743170793975, "gen_logits_max": 4.053511619567871, "gen_logits_mean": -15.22696304321289, "gen_logits_min": -28.080631256103516, "gen_logits_std": 3.2385003566741943, "gen_loss": 0.2610486149787903, "grad_norm": 0.3761499791520115, "learning_rate": 2.154315789473684e-05, "loss": 0.2638, "mean_copy_accuracy": 0.99745973944664, "mean_gen_accuracy": 0.8806158155202866, "mean_token_accuracy": 0.9105709940195084, "num_tokens": 930916965.0, "sample_num_tokens": 9372.25, "step": 7431, "total_num_tokens": 930954454.0, "z_loss": 0.0003966428921557963 }, { "copy_logits_max": -8.339553833007812, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.5625, "epoch": 1.517947408731172, "gen_logits_max": 4.3872222900390625, "gen_logits_mean": -16.455190658569336, "gen_logits_min": -28.38825225830078, "gen_logits_std": 3.24880313873291, "gen_loss": 0.26534968614578247, "grad_norm": 0.3565583530079882, "learning_rate": 2.1541894736842107e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9965703338384628, "mean_gen_accuracy": 0.87687087059021, "mean_token_accuracy": 0.9044484794139862, "num_tokens": 931182385.0, "sample_num_tokens": 8118.75, "step": 7432, "total_num_tokens": 931214860.0, "z_loss": 0.0003973588172812015 }, { "copy_logits_max": -4.70440673828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.4375, "epoch": 1.5181516466683687, "gen_logits_max": 4.210290908813477, "gen_logits_mean": -15.461114883422852, "gen_logits_min": -28.14270782470703, "gen_logits_std": 3.2252352237701416, "gen_loss": 0.2609255015850067, "grad_norm": 0.35920924017197203, "learning_rate": 2.1540631578947368e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9972212463617325, "mean_gen_accuracy": 0.8821526765823364, "mean_token_accuracy": 0.9104027152061462, "num_tokens": 931451378.0, "sample_num_tokens": 8020.0, "step": 7433, "total_num_tokens": 931483458.0, "z_loss": 0.00043332180939614773 }, { "copy_logits_max": -5.428814888000488, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.25, "epoch": 1.5183558846055654, "gen_logits_max": 4.363190650939941, "gen_logits_mean": -15.883097648620605, "gen_logits_min": -28.559307098388672, "gen_logits_std": 3.2538440227508545, "gen_loss": 0.24355030059814453, "grad_norm": 0.36542888061014367, "learning_rate": 2.1539368421052633e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9971559941768646, "mean_gen_accuracy": 0.881730318069458, "mean_token_accuracy": 0.9104130119085312, "num_tokens": 931720061.0, "sample_num_tokens": 7835.25, "step": 7434, "total_num_tokens": 931751402.0, "z_loss": 0.0004238280816935003 }, { "copy_logits_max": -7.359698295593262, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.125, "epoch": 1.5185601225427623, "gen_logits_max": 3.449185371398926, "gen_logits_mean": -17.337650299072266, "gen_logits_min": -30.0887451171875, "gen_logits_std": 3.258742570877075, "gen_loss": 0.2748245894908905, "grad_norm": 0.6289274441609675, "learning_rate": 2.1538105263157897e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9969932585954666, "mean_gen_accuracy": 0.8796613067388535, "mean_token_accuracy": 0.9080423414707184, "num_tokens": 931982086.0, "sample_num_tokens": 8797.0, "step": 7435, "total_num_tokens": 932017274.0, "z_loss": 0.0004394283751025796 }, { "copy_logits_max": -5.545470714569092, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.125, "epoch": 1.5187643604799592, "gen_logits_max": 3.9857420921325684, "gen_logits_mean": -15.868766784667969, "gen_logits_min": -28.28504753112793, "gen_logits_std": 3.224764823913574, "gen_loss": 0.29546794295310974, "grad_norm": 0.36828679648041623, "learning_rate": 2.1536842105263158e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9971276670694351, "mean_gen_accuracy": 0.8696986436843872, "mean_token_accuracy": 0.9042083919048309, "num_tokens": 932271710.0, "sample_num_tokens": 9198.5, "step": 7436, "total_num_tokens": 932308504.0, "z_loss": 0.00046317357919178903 }, { "copy_logits_max": -7.312844276428223, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.5, "epoch": 1.518968598417156, "gen_logits_max": 2.563469886779785, "gen_logits_mean": -18.313039779663086, "gen_logits_min": -30.238271713256836, "gen_logits_std": 3.2772421836853027, "gen_loss": 0.2561311721801758, "grad_norm": 0.3621270085529636, "learning_rate": 2.1535578947368422e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9965391159057617, "mean_gen_accuracy": 0.8795144259929657, "mean_token_accuracy": 0.9081456363201141, "num_tokens": 932549316.0, "sample_num_tokens": 8405.5, "step": 7437, "total_num_tokens": 932582938.0, "z_loss": 0.00043999491026625037 }, { "copy_logits_max": -4.494899749755859, "copy_logits_min": -687500032.0, "copy_num_tokens": 653.75, "epoch": 1.5191728363543529, "gen_logits_max": 2.9705004692077637, "gen_logits_mean": -16.278169631958008, "gen_logits_min": -28.775360107421875, "gen_logits_std": 3.245542049407959, "gen_loss": 0.2670329213142395, "grad_norm": 0.3358818250288879, "learning_rate": 2.1534315789473683e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9973978251218796, "mean_gen_accuracy": 0.874384194612503, "mean_token_accuracy": 0.9075047075748444, "num_tokens": 932865140.0, "sample_num_tokens": 10019.0, "step": 7438, "total_num_tokens": 932905216.0, "z_loss": 0.0004682315338868648 }, { "copy_logits_max": -6.486583709716797, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.9375, "epoch": 1.5193770742915498, "gen_logits_max": 3.003727436065674, "gen_logits_mean": -18.179840087890625, "gen_logits_min": -30.216318130493164, "gen_logits_std": 3.291904926300049, "gen_loss": 0.26465439796447754, "grad_norm": 0.3661536481166213, "learning_rate": 2.1533052631578947e-05, "loss": 0.2541, "mean_copy_accuracy": 0.9963520020246506, "mean_gen_accuracy": 0.8864034414291382, "mean_token_accuracy": 0.9139319509267807, "num_tokens": 933150672.0, "sample_num_tokens": 8768.0, "step": 7439, "total_num_tokens": 933185744.0, "z_loss": 0.0004630361800082028 }, { "copy_logits_max": -4.558104515075684, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.5, "epoch": 1.5195813122287465, "gen_logits_max": 5.387921333312988, "gen_logits_mean": -13.68413257598877, "gen_logits_min": -25.953323364257812, "gen_logits_std": 3.1490237712860107, "gen_loss": 0.29892081022262573, "grad_norm": 0.3644595690591726, "learning_rate": 2.153178947368421e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9963683038949966, "mean_gen_accuracy": 0.8750518560409546, "mean_token_accuracy": 0.9034483283758163, "num_tokens": 933425176.0, "sample_num_tokens": 7777.5, "step": 7440, "total_num_tokens": 933456286.0, "z_loss": 0.0005390750011429191 }, { "copy_logits_max": -6.785531997680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.375, "epoch": 1.5197855501659432, "gen_logits_max": 3.6399807929992676, "gen_logits_mean": -16.902965545654297, "gen_logits_min": -29.00315284729004, "gen_logits_std": 3.2403404712677, "gen_loss": 0.25916215777397156, "grad_norm": 0.37261928751404827, "learning_rate": 2.1530526315789476e-05, "loss": 0.283, "mean_copy_accuracy": 0.9963357150554657, "mean_gen_accuracy": 0.8721556663513184, "mean_token_accuracy": 0.9026240408420563, "num_tokens": 933693208.0, "sample_num_tokens": 7570.0, "step": 7441, "total_num_tokens": 933723488.0, "z_loss": 0.0004535603220574558 }, { "copy_logits_max": -8.775218963623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 223.8125, "epoch": 1.5199897881031401, "gen_logits_max": 4.552424430847168, "gen_logits_mean": -16.894880294799805, "gen_logits_min": -28.436187744140625, "gen_logits_std": 3.1837167739868164, "gen_loss": 0.3326902985572815, "grad_norm": 0.373220314801404, "learning_rate": 2.1529263157894737e-05, "loss": 0.3183, "mean_copy_accuracy": 0.9956364035606384, "mean_gen_accuracy": 0.8655454069375992, "mean_token_accuracy": 0.8931885659694672, "num_tokens": 933952220.0, "sample_num_tokens": 6979.0, "step": 7442, "total_num_tokens": 933980136.0, "z_loss": 0.0005590051878243685 }, { "copy_logits_max": -5.71673059463501, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.6875, "epoch": 1.520194026040337, "gen_logits_max": 4.442758083343506, "gen_logits_mean": -16.586483001708984, "gen_logits_min": -28.878555297851562, "gen_logits_std": 3.2228569984436035, "gen_loss": 0.3026520013809204, "grad_norm": 0.4122425466347979, "learning_rate": 2.1528e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9964764565229416, "mean_gen_accuracy": 0.8752185553312302, "mean_token_accuracy": 0.9013795703649521, "num_tokens": 934193528.0, "sample_num_tokens": 7536.5, "step": 7443, "total_num_tokens": 934223674.0, "z_loss": 0.0005833314498886466 }, { "copy_logits_max": -6.133415222167969, "copy_logits_min": -625000064.0, "copy_num_tokens": 465.9375, "epoch": 1.5203982639775337, "gen_logits_max": 4.035761833190918, "gen_logits_mean": -15.560101509094238, "gen_logits_min": -27.74979019165039, "gen_logits_std": 3.150737762451172, "gen_loss": 0.3035670518875122, "grad_norm": 0.41103806591817754, "learning_rate": 2.1526736842105262e-05, "loss": 0.3296, "mean_copy_accuracy": 0.9958605170249939, "mean_gen_accuracy": 0.862172856926918, "mean_token_accuracy": 0.8908908367156982, "num_tokens": 934444753.0, "sample_num_tokens": 8719.25, "step": 7444, "total_num_tokens": 934479630.0, "z_loss": 0.0004918102640658617 }, { "copy_logits_max": -8.206367492675781, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.6875, "epoch": 1.5206025019147307, "gen_logits_max": 3.346625804901123, "gen_logits_mean": -17.732688903808594, "gen_logits_min": -29.722637176513672, "gen_logits_std": 3.256166458129883, "gen_loss": 0.24881234765052795, "grad_norm": 0.38071373416121224, "learning_rate": 2.1525473684210526e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9958319067955017, "mean_gen_accuracy": 0.8726324141025543, "mean_token_accuracy": 0.9011644423007965, "num_tokens": 934720022.0, "sample_num_tokens": 8585.0, "step": 7445, "total_num_tokens": 934754362.0, "z_loss": 0.00042466612649150193 }, { "copy_logits_max": -6.159913063049316, "copy_logits_min": -750000128.0, "copy_num_tokens": 484.6875, "epoch": 1.5208067398519276, "gen_logits_max": 3.2352089881896973, "gen_logits_mean": -16.734148025512695, "gen_logits_min": -28.974258422851562, "gen_logits_std": 3.1801512241363525, "gen_loss": 0.28068867325782776, "grad_norm": 0.4015852414346688, "learning_rate": 2.1524210526315787e-05, "loss": 0.294, "mean_copy_accuracy": 0.9959824830293655, "mean_gen_accuracy": 0.8686114102602005, "mean_token_accuracy": 0.9001772403717041, "num_tokens": 934990570.0, "sample_num_tokens": 8382.0, "step": 7446, "total_num_tokens": 935024098.0, "z_loss": 0.00048818805953487754 }, { "copy_logits_max": -3.992363929748535, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.6875, "epoch": 1.5210109777891243, "gen_logits_max": 3.365664482116699, "gen_logits_mean": -16.88119125366211, "gen_logits_min": -28.952808380126953, "gen_logits_std": 3.2231948375701904, "gen_loss": 0.27093052864074707, "grad_norm": 0.3533486385738825, "learning_rate": 2.1522947368421052e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9966796338558197, "mean_gen_accuracy": 0.8759389370679855, "mean_token_accuracy": 0.9060491323471069, "num_tokens": 935286747.0, "sample_num_tokens": 9039.75, "step": 7447, "total_num_tokens": 935322906.0, "z_loss": 0.0005203799810260534 }, { "copy_logits_max": -4.106235504150391, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.25, "epoch": 1.521215215726321, "gen_logits_max": 5.0493011474609375, "gen_logits_mean": -15.164422988891602, "gen_logits_min": -27.34729766845703, "gen_logits_std": 3.1707444190979004, "gen_loss": 0.24901890754699707, "grad_norm": 0.3391213300662224, "learning_rate": 2.152168421052632e-05, "loss": 0.2548, "mean_copy_accuracy": 0.9972618520259857, "mean_gen_accuracy": 0.8881449550390244, "mean_token_accuracy": 0.9130150079727173, "num_tokens": 935577444.0, "sample_num_tokens": 7187.0, "step": 7448, "total_num_tokens": 935606192.0, "z_loss": 0.0004555927007459104 }, { "copy_logits_max": -4.427424430847168, "copy_logits_min": -750000064.0, "copy_num_tokens": 383.75, "epoch": 1.5214194536635182, "gen_logits_max": 4.068562984466553, "gen_logits_mean": -16.238290786743164, "gen_logits_min": -28.24273681640625, "gen_logits_std": 3.182924747467041, "gen_loss": 0.2954034209251404, "grad_norm": 0.3497024112386039, "learning_rate": 2.152042105263158e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9980680197477341, "mean_gen_accuracy": 0.8727751970291138, "mean_token_accuracy": 0.9046225845813751, "num_tokens": 935852192.0, "sample_num_tokens": 8041.0, "step": 7449, "total_num_tokens": 935884356.0, "z_loss": 0.0005673638079315424 }, { "copy_logits_max": -5.85750150680542, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.6875, "epoch": 1.5216236916007149, "gen_logits_max": 3.9310226440429688, "gen_logits_mean": -16.59283447265625, "gen_logits_min": -28.73241424560547, "gen_logits_std": 3.190160036087036, "gen_loss": 0.2799484133720398, "grad_norm": 0.36477690948800406, "learning_rate": 2.1519157894736845e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9971403032541275, "mean_gen_accuracy": 0.8772750049829483, "mean_token_accuracy": 0.9067178070545197, "num_tokens": 936123422.0, "sample_num_tokens": 8927.5, "step": 7450, "total_num_tokens": 936159132.0, "z_loss": 0.0004963838728144765 }, { "copy_logits_max": -4.31829833984375, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.9375, "epoch": 1.5218279295379116, "gen_logits_max": 2.9941487312316895, "gen_logits_mean": -16.859512329101562, "gen_logits_min": -28.760578155517578, "gen_logits_std": 3.2148032188415527, "gen_loss": 0.2751705050468445, "grad_norm": 0.3516440802369806, "learning_rate": 2.1517894736842106e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9969979971647263, "mean_gen_accuracy": 0.8769538402557373, "mean_token_accuracy": 0.9054198861122131, "num_tokens": 936399561.0, "sample_num_tokens": 8848.25, "step": 7451, "total_num_tokens": 936434954.0, "z_loss": 0.0004552719183266163 }, { "copy_logits_max": -4.836612701416016, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.5, "epoch": 1.5220321674751085, "gen_logits_max": 2.8706750869750977, "gen_logits_mean": -17.06352996826172, "gen_logits_min": -29.309083938598633, "gen_logits_std": 3.2233693599700928, "gen_loss": 0.2654995620250702, "grad_norm": 0.3696530586207608, "learning_rate": 2.151663157894737e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9970763772726059, "mean_gen_accuracy": 0.875713050365448, "mean_token_accuracy": 0.9062497317790985, "num_tokens": 936680021.0, "sample_num_tokens": 9620.25, "step": 7452, "total_num_tokens": 936718502.0, "z_loss": 0.00043443520553410053 }, { "copy_logits_max": -4.166426658630371, "copy_logits_min": -687500032.0, "copy_num_tokens": 370.0, "epoch": 1.5222364054123054, "gen_logits_max": 3.4328672885894775, "gen_logits_mean": -17.360107421875, "gen_logits_min": -29.07508087158203, "gen_logits_std": 3.210574150085449, "gen_loss": 0.33230528235435486, "grad_norm": 0.36191931967881186, "learning_rate": 2.151536842105263e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9957622438669205, "mean_gen_accuracy": 0.8739601373672485, "mean_token_accuracy": 0.9013849496841431, "num_tokens": 936946410.0, "sample_num_tokens": 8038.0, "step": 7453, "total_num_tokens": 936978562.0, "z_loss": 0.00048691645497456193 }, { "copy_logits_max": -4.291706085205078, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.625, "epoch": 1.5224406433495021, "gen_logits_max": 3.4593844413757324, "gen_logits_mean": -16.853378295898438, "gen_logits_min": -29.11130142211914, "gen_logits_std": 3.222985029220581, "gen_loss": 0.2750745415687561, "grad_norm": 0.3777962054851653, "learning_rate": 2.1514105263157895e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9971014112234116, "mean_gen_accuracy": 0.8840014189481735, "mean_token_accuracy": 0.9108686000108719, "num_tokens": 937213370.0, "sample_num_tokens": 7087.0, "step": 7454, "total_num_tokens": 937241718.0, "z_loss": 0.0004531529266387224 }, { "copy_logits_max": -3.661100149154663, "copy_logits_min": -625000064.0, "copy_num_tokens": 541.3125, "epoch": 1.522644881286699, "gen_logits_max": 3.3570995330810547, "gen_logits_mean": -15.779777526855469, "gen_logits_min": -28.22280502319336, "gen_logits_std": 3.1705753803253174, "gen_loss": 0.22803303599357605, "grad_norm": 0.3480426664621489, "learning_rate": 2.1512842105263156e-05, "loss": 0.246, "mean_copy_accuracy": 0.9979635924100876, "mean_gen_accuracy": 0.8882190436124802, "mean_token_accuracy": 0.9153787195682526, "num_tokens": 937483802.0, "sample_num_tokens": 8307.5, "step": 7455, "total_num_tokens": 937517032.0, "z_loss": 0.0003595438611228019 }, { "copy_logits_max": -5.910164833068848, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.1875, "epoch": 1.522849119223896, "gen_logits_max": 2.6500229835510254, "gen_logits_mean": -17.960233688354492, "gen_logits_min": -30.052597045898438, "gen_logits_std": 3.232487201690674, "gen_loss": 0.23043572902679443, "grad_norm": 0.34509518668270894, "learning_rate": 2.1511578947368424e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9970593005418777, "mean_gen_accuracy": 0.8837011009454727, "mean_token_accuracy": 0.909243032336235, "num_tokens": 937750241.0, "sample_num_tokens": 7971.25, "step": 7456, "total_num_tokens": 937782126.0, "z_loss": 0.000376612733816728 }, { "copy_logits_max": -4.866745948791504, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.5625, "epoch": 1.5230533571610927, "gen_logits_max": 3.3532040119171143, "gen_logits_mean": -16.304384231567383, "gen_logits_min": -28.551437377929688, "gen_logits_std": 3.205770492553711, "gen_loss": 0.2603089213371277, "grad_norm": 0.354188262773666, "learning_rate": 2.1510315789473685e-05, "loss": 0.278, "mean_copy_accuracy": 0.9975864887237549, "mean_gen_accuracy": 0.8814035505056381, "mean_token_accuracy": 0.9075743854045868, "num_tokens": 938020544.0, "sample_num_tokens": 9633.0, "step": 7457, "total_num_tokens": 938059076.0, "z_loss": 0.0003992859274148941 }, { "copy_logits_max": -6.313819885253906, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.8125, "epoch": 1.5232575950982894, "gen_logits_max": 4.29886531829834, "gen_logits_mean": -17.439346313476562, "gen_logits_min": -29.912338256835938, "gen_logits_std": 3.261723756790161, "gen_loss": 0.2673267722129822, "grad_norm": 0.35152357802177353, "learning_rate": 2.150905263157895e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9969282150268555, "mean_gen_accuracy": 0.878326341509819, "mean_token_accuracy": 0.9073806256055832, "num_tokens": 938293376.0, "sample_num_tokens": 7159.0, "step": 7458, "total_num_tokens": 938322012.0, "z_loss": 0.00042743279482237995 }, { "copy_logits_max": -5.557223320007324, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.6875, "epoch": 1.5234618330354863, "gen_logits_max": 4.251881122589111, "gen_logits_mean": -15.38879108428955, "gen_logits_min": -27.532489776611328, "gen_logits_std": 3.1649699211120605, "gen_loss": 0.2954947352409363, "grad_norm": 0.3712294843742611, "learning_rate": 2.150778947368421e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9961495250463486, "mean_gen_accuracy": 0.8781402558088303, "mean_token_accuracy": 0.9037387818098068, "num_tokens": 938546022.0, "sample_num_tokens": 8880.5, "step": 7459, "total_num_tokens": 938581544.0, "z_loss": 0.0004618667298927903 }, { "copy_logits_max": -2.0411128997802734, "copy_logits_min": -750000000.0, "copy_num_tokens": 720.5625, "epoch": 1.5236660709726833, "gen_logits_max": 2.8741650581359863, "gen_logits_mean": -17.48415756225586, "gen_logits_min": -30.025489807128906, "gen_logits_std": 3.292898654937744, "gen_loss": 0.23845136165618896, "grad_norm": 0.34810877067729995, "learning_rate": 2.1506526315789474e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9972453117370605, "mean_gen_accuracy": 0.877476304769516, "mean_token_accuracy": 0.9112506359815598, "num_tokens": 938834426.0, "sample_num_tokens": 10117.5, "step": 7460, "total_num_tokens": 938874896.0, "z_loss": 0.000440979958511889 }, { "copy_logits_max": -4.172904968261719, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.5, "epoch": 1.52387030890988, "gen_logits_max": 4.293557643890381, "gen_logits_mean": -15.358659744262695, "gen_logits_min": -28.0463924407959, "gen_logits_std": 3.1927852630615234, "gen_loss": 0.2625008225440979, "grad_norm": 0.3282357805360721, "learning_rate": 2.150526315789474e-05, "loss": 0.2528, "mean_copy_accuracy": 0.9968756884336472, "mean_gen_accuracy": 0.8829546719789505, "mean_token_accuracy": 0.9138689041137695, "num_tokens": 939135118.0, "sample_num_tokens": 9362.0, "step": 7461, "total_num_tokens": 939172566.0, "z_loss": 0.0004955567419528961 }, { "copy_logits_max": -5.180624485015869, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.25, "epoch": 1.5240745468470769, "gen_logits_max": 4.133275985717773, "gen_logits_mean": -16.84368133544922, "gen_logits_min": -29.104936599731445, "gen_logits_std": 3.229585886001587, "gen_loss": 0.2785959839820862, "grad_norm": 0.33876219236475796, "learning_rate": 2.1504e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9974605441093445, "mean_gen_accuracy": 0.8756280243396759, "mean_token_accuracy": 0.9043559581041336, "num_tokens": 939408187.0, "sample_num_tokens": 8614.25, "step": 7462, "total_num_tokens": 939442644.0, "z_loss": 0.0004982292884960771 }, { "copy_logits_max": -4.978503227233887, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.5, "epoch": 1.5242787847842738, "gen_logits_max": 3.387038230895996, "gen_logits_mean": -17.19243049621582, "gen_logits_min": -29.689071655273438, "gen_logits_std": 3.259061336517334, "gen_loss": 0.32028037309646606, "grad_norm": 0.3542990350841456, "learning_rate": 2.1502736842105264e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9973737746477127, "mean_gen_accuracy": 0.8710309863090515, "mean_token_accuracy": 0.9025859087705612, "num_tokens": 939674284.0, "sample_num_tokens": 8628.5, "step": 7463, "total_num_tokens": 939708798.0, "z_loss": 0.0005562129663303494 }, { "copy_logits_max": -7.594776630401611, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.1875, "epoch": 1.5244830227214705, "gen_logits_max": 5.010410308837891, "gen_logits_mean": -14.99826431274414, "gen_logits_min": -27.106197357177734, "gen_logits_std": 3.1344106197357178, "gen_loss": 0.2631292939186096, "grad_norm": 0.3724645078841283, "learning_rate": 2.1501473684210528e-05, "loss": 0.273, "mean_copy_accuracy": 0.9962356090545654, "mean_gen_accuracy": 0.8806481212377548, "mean_token_accuracy": 0.906715452671051, "num_tokens": 939940596.0, "sample_num_tokens": 8995.5, "step": 7464, "total_num_tokens": 939976578.0, "z_loss": 0.00043597814510576427 }, { "copy_logits_max": -5.407135009765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.625, "epoch": 1.5246872606586672, "gen_logits_max": 3.6393465995788574, "gen_logits_mean": -15.840904235839844, "gen_logits_min": -27.865558624267578, "gen_logits_std": 3.207594394683838, "gen_loss": 0.2578810453414917, "grad_norm": 0.3360994305590685, "learning_rate": 2.1500210526315792e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9968635141849518, "mean_gen_accuracy": 0.8809318393468857, "mean_token_accuracy": 0.9097524434328079, "num_tokens": 940208034.0, "sample_num_tokens": 9866.5, "step": 7465, "total_num_tokens": 940247500.0, "z_loss": 0.0004284300666768104 }, { "copy_logits_max": -7.287375450134277, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.3125, "epoch": 1.5248914985958641, "gen_logits_max": 2.936659812927246, "gen_logits_mean": -17.479589462280273, "gen_logits_min": -29.629261016845703, "gen_logits_std": 3.222524642944336, "gen_loss": 0.27660906314849854, "grad_norm": 0.32014146843964614, "learning_rate": 2.1498947368421053e-05, "loss": 0.2627, "mean_copy_accuracy": 0.997253343462944, "mean_gen_accuracy": 0.882019892334938, "mean_token_accuracy": 0.9119641780853271, "num_tokens": 940505941.0, "sample_num_tokens": 8645.25, "step": 7466, "total_num_tokens": 940540522.0, "z_loss": 0.0004640259430743754 }, { "copy_logits_max": -8.61752700805664, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.875, "epoch": 1.525095736533061, "gen_logits_max": 3.3101134300231934, "gen_logits_mean": -18.247657775878906, "gen_logits_min": -29.868619918823242, "gen_logits_std": 3.2508010864257812, "gen_loss": 0.2663993537425995, "grad_norm": 0.36171775997758676, "learning_rate": 2.1497684210526318e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9971608072519302, "mean_gen_accuracy": 0.8836100101470947, "mean_token_accuracy": 0.9087065011262894, "num_tokens": 940771261.0, "sample_num_tokens": 8761.25, "step": 7467, "total_num_tokens": 940806306.0, "z_loss": 0.0004390481626614928 }, { "copy_logits_max": -5.870267868041992, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.75, "epoch": 1.5252999744702578, "gen_logits_max": 5.237403869628906, "gen_logits_mean": -15.626922607421875, "gen_logits_min": -27.478469848632812, "gen_logits_std": 3.194581985473633, "gen_loss": 0.3146165609359741, "grad_norm": 0.3509398472224047, "learning_rate": 2.149642105263158e-05, "loss": 0.2981, "mean_copy_accuracy": 0.997094601392746, "mean_gen_accuracy": 0.8662165105342865, "mean_token_accuracy": 0.9000730216503143, "num_tokens": 941053468.0, "sample_num_tokens": 8929.5, "step": 7468, "total_num_tokens": 941089186.0, "z_loss": 0.000495143118314445 }, { "copy_logits_max": -7.1749796867370605, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.1875, "epoch": 1.5255042124074547, "gen_logits_max": 4.072066307067871, "gen_logits_mean": -17.681093215942383, "gen_logits_min": -29.39132308959961, "gen_logits_std": 3.2468767166137695, "gen_loss": 0.26883935928344727, "grad_norm": 0.3575682929150568, "learning_rate": 2.1495157894736843e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9964622110128403, "mean_gen_accuracy": 0.8796080350875854, "mean_token_accuracy": 0.9074605107307434, "num_tokens": 941319130.0, "sample_num_tokens": 7738.5, "step": 7469, "total_num_tokens": 941350084.0, "z_loss": 0.00046523642959073186 }, { "copy_logits_max": -6.1753644943237305, "copy_logits_min": -687500032.0, "copy_num_tokens": 294.375, "epoch": 1.5257084503446516, "gen_logits_max": 5.13131046295166, "gen_logits_mean": -16.728614807128906, "gen_logits_min": -28.437118530273438, "gen_logits_std": 3.2111573219299316, "gen_loss": 0.3105238080024719, "grad_norm": 0.38136153328357375, "learning_rate": 2.1493894736842104e-05, "loss": 0.2875, "mean_copy_accuracy": 0.996548518538475, "mean_gen_accuracy": 0.8754599690437317, "mean_token_accuracy": 0.9023871421813965, "num_tokens": 941561088.0, "sample_num_tokens": 6961.5, "step": 7470, "total_num_tokens": 941588934.0, "z_loss": 0.0005540002603083849 }, { "copy_logits_max": -4.881198883056641, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.6875, "epoch": 1.5259126882818483, "gen_logits_max": 5.381199836730957, "gen_logits_mean": -14.760549545288086, "gen_logits_min": -26.915626525878906, "gen_logits_std": 3.1464133262634277, "gen_loss": 0.32170259952545166, "grad_norm": 0.3748593568786672, "learning_rate": 2.1492631578947368e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9965128153562546, "mean_gen_accuracy": 0.8701322674751282, "mean_token_accuracy": 0.8978983014822006, "num_tokens": 941795827.0, "sample_num_tokens": 6691.25, "step": 7471, "total_num_tokens": 941822592.0, "z_loss": 0.0005389318102970719 }, { "copy_logits_max": -6.935208320617676, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.3125, "epoch": 1.526116926219045, "gen_logits_max": 5.464898109436035, "gen_logits_mean": -15.214861869812012, "gen_logits_min": -27.399768829345703, "gen_logits_std": 3.190976619720459, "gen_loss": 0.26560914516448975, "grad_norm": 0.36754333220095137, "learning_rate": 2.149136842105263e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9958226084709167, "mean_gen_accuracy": 0.8751703798770905, "mean_token_accuracy": 0.9028065651655197, "num_tokens": 942050227.0, "sample_num_tokens": 7585.25, "step": 7472, "total_num_tokens": 942080568.0, "z_loss": 0.00046185223618522286 }, { "copy_logits_max": -5.051201820373535, "copy_logits_min": -750000000.0, "copy_num_tokens": 710.8125, "epoch": 1.526321164156242, "gen_logits_max": 4.155655384063721, "gen_logits_mean": -14.722848892211914, "gen_logits_min": -26.860553741455078, "gen_logits_std": 3.186483383178711, "gen_loss": 0.23311179876327515, "grad_norm": 0.35634513378735255, "learning_rate": 2.1490105263157897e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9974106550216675, "mean_gen_accuracy": 0.8711583167314529, "mean_token_accuracy": 0.9064769148826599, "num_tokens": 942325680.0, "sample_num_tokens": 9071.0, "step": 7473, "total_num_tokens": 942361964.0, "z_loss": 0.0004003983340226114 }, { "copy_logits_max": -7.652592182159424, "copy_logits_min": -687500032.0, "copy_num_tokens": 341.375, "epoch": 1.526525402093439, "gen_logits_max": 4.364299774169922, "gen_logits_mean": -17.114904403686523, "gen_logits_min": -28.775171279907227, "gen_logits_std": 3.225253105163574, "gen_loss": 0.30360686779022217, "grad_norm": 0.35152725926590184, "learning_rate": 2.148884210526316e-05, "loss": 0.2999, "mean_copy_accuracy": 0.996682345867157, "mean_gen_accuracy": 0.8715933561325073, "mean_token_accuracy": 0.8992732465267181, "num_tokens": 942596221.0, "sample_num_tokens": 7712.75, "step": 7474, "total_num_tokens": 942627072.0, "z_loss": 0.0005325191305018961 }, { "copy_logits_max": -8.061991691589355, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.625, "epoch": 1.5267296400306356, "gen_logits_max": 3.8722875118255615, "gen_logits_mean": -16.948816299438477, "gen_logits_min": -29.01725196838379, "gen_logits_std": 3.238070011138916, "gen_loss": 0.29335230588912964, "grad_norm": 0.3731755917918809, "learning_rate": 2.1487578947368422e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9970678985118866, "mean_gen_accuracy": 0.8729293644428253, "mean_token_accuracy": 0.8988919258117676, "num_tokens": 942853267.0, "sample_num_tokens": 9021.75, "step": 7475, "total_num_tokens": 942889354.0, "z_loss": 0.0004729964421130717 }, { "copy_logits_max": -4.973738670349121, "copy_logits_min": -750000064.0, "copy_num_tokens": 608.4375, "epoch": 1.5269338779678325, "gen_logits_max": 3.929237127304077, "gen_logits_mean": -15.613018035888672, "gen_logits_min": -27.6418399810791, "gen_logits_std": 3.226242780685425, "gen_loss": 0.26157718896865845, "grad_norm": 0.3946618623949167, "learning_rate": 2.1486315789473686e-05, "loss": 0.2937, "mean_copy_accuracy": 0.9960602223873138, "mean_gen_accuracy": 0.8719994872808456, "mean_token_accuracy": 0.9014332443475723, "num_tokens": 943133216.0, "sample_num_tokens": 9695.0, "step": 7476, "total_num_tokens": 943171996.0, "z_loss": 0.0004486591788008809 }, { "copy_logits_max": -5.778525352478027, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.4375, "epoch": 1.5271381159050295, "gen_logits_max": 3.7808337211608887, "gen_logits_mean": -15.583290100097656, "gen_logits_min": -27.86077308654785, "gen_logits_std": 3.203352451324463, "gen_loss": 0.27347925305366516, "grad_norm": 0.33934690707821397, "learning_rate": 2.1485052631578947e-05, "loss": 0.265, "mean_copy_accuracy": 0.9972214549779892, "mean_gen_accuracy": 0.8783082365989685, "mean_token_accuracy": 0.907904252409935, "num_tokens": 943406349.0, "sample_num_tokens": 7612.75, "step": 7477, "total_num_tokens": 943436800.0, "z_loss": 0.00046701717656105757 }, { "copy_logits_max": -7.3011064529418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.3125, "epoch": 1.5273423538422262, "gen_logits_max": 3.7946105003356934, "gen_logits_mean": -15.583797454833984, "gen_logits_min": -27.950986862182617, "gen_logits_std": 3.217864751815796, "gen_loss": 0.2155511975288391, "grad_norm": 0.3799738899530361, "learning_rate": 2.148378947368421e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9965660572052002, "mean_gen_accuracy": 0.8781246095895767, "mean_token_accuracy": 0.9067205488681793, "num_tokens": 943675184.0, "sample_num_tokens": 10339.0, "step": 7478, "total_num_tokens": 943716540.0, "z_loss": 0.00038821581983938813 }, { "copy_logits_max": -6.872852802276611, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.4375, "epoch": 1.5275465917794229, "gen_logits_max": 4.749947547912598, "gen_logits_mean": -14.901115417480469, "gen_logits_min": -27.5986328125, "gen_logits_std": 3.1744518280029297, "gen_loss": 0.28389984369277954, "grad_norm": 0.35085681008490077, "learning_rate": 2.1482526315789472e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9965926557779312, "mean_gen_accuracy": 0.8735227584838867, "mean_token_accuracy": 0.9050286263227463, "num_tokens": 943944420.0, "sample_num_tokens": 7454.0, "step": 7479, "total_num_tokens": 943974236.0, "z_loss": 0.00046710093738511205 }, { "copy_logits_max": -7.6514387130737305, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.0625, "epoch": 1.52775082971662, "gen_logits_max": 3.480905055999756, "gen_logits_mean": -17.456588745117188, "gen_logits_min": -29.657150268554688, "gen_logits_std": 3.271768093109131, "gen_loss": 0.2772625684738159, "grad_norm": 0.36830327373875604, "learning_rate": 2.1481263157894737e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9973154664039612, "mean_gen_accuracy": 0.8702878355979919, "mean_token_accuracy": 0.900416299700737, "num_tokens": 944196928.0, "sample_num_tokens": 8073.5, "step": 7480, "total_num_tokens": 944229222.0, "z_loss": 0.00044648320181295276 }, { "copy_logits_max": -5.696295738220215, "copy_logits_min": -687500032.0, "copy_num_tokens": 432.5625, "epoch": 1.5279550676538167, "gen_logits_max": 4.433770179748535, "gen_logits_mean": -15.460275650024414, "gen_logits_min": -27.504512786865234, "gen_logits_std": 3.2000081539154053, "gen_loss": 0.27784740924835205, "grad_norm": 0.3739415255979956, "learning_rate": 2.148e-05, "loss": 0.273, "mean_copy_accuracy": 0.9955727458000183, "mean_gen_accuracy": 0.8790952414274216, "mean_token_accuracy": 0.9074531942605972, "num_tokens": 944451328.0, "sample_num_tokens": 7881.5, "step": 7481, "total_num_tokens": 944482854.0, "z_loss": 0.0005047848680987954 }, { "copy_logits_max": -8.209897994995117, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0, "epoch": 1.5281593055910134, "gen_logits_max": 3.2907683849334717, "gen_logits_mean": -17.625673294067383, "gen_logits_min": -29.43248748779297, "gen_logits_std": 3.2532355785369873, "gen_loss": 0.27579134702682495, "grad_norm": 0.3637931802147394, "learning_rate": 2.1478736842105265e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9968445748090744, "mean_gen_accuracy": 0.8704520463943481, "mean_token_accuracy": 0.9008587896823883, "num_tokens": 944727815.0, "sample_num_tokens": 8047.75, "step": 7482, "total_num_tokens": 944760006.0, "z_loss": 0.0004585537826642394 }, { "copy_logits_max": -7.423651218414307, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.4375, "epoch": 1.5283635435282104, "gen_logits_max": 5.128436088562012, "gen_logits_mean": -17.07681655883789, "gen_logits_min": -28.857776641845703, "gen_logits_std": 3.2332448959350586, "gen_loss": 0.22388646006584167, "grad_norm": 0.3599833619451854, "learning_rate": 2.1477473684210526e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9956919550895691, "mean_gen_accuracy": 0.8878287672996521, "mean_token_accuracy": 0.9090878814458847, "num_tokens": 944978810.0, "sample_num_tokens": 7564.0, "step": 7483, "total_num_tokens": 945009066.0, "z_loss": 0.00038556335493922234 }, { "copy_logits_max": -5.085582733154297, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.8125, "epoch": 1.5285677814654073, "gen_logits_max": 5.675775527954102, "gen_logits_mean": -13.358186721801758, "gen_logits_min": -26.352989196777344, "gen_logits_std": 3.088292121887207, "gen_loss": 0.283562034368515, "grad_norm": 0.3601716315889244, "learning_rate": 2.147621052631579e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9966454207897186, "mean_gen_accuracy": 0.8763739168643951, "mean_token_accuracy": 0.9049885123968124, "num_tokens": 945246460.0, "sample_num_tokens": 8059.0, "step": 7484, "total_num_tokens": 945278696.0, "z_loss": 0.0005211306270211935 }, { "copy_logits_max": -6.429990291595459, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.8125, "epoch": 1.528772019402604, "gen_logits_max": 3.998621702194214, "gen_logits_mean": -17.07626724243164, "gen_logits_min": -29.350910186767578, "gen_logits_std": 3.25197696685791, "gen_loss": 0.257935106754303, "grad_norm": 0.35766904822881473, "learning_rate": 2.147494736842105e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9961799681186676, "mean_gen_accuracy": 0.8803291618824005, "mean_token_accuracy": 0.905087798833847, "num_tokens": 945485047.0, "sample_num_tokens": 7625.25, "step": 7485, "total_num_tokens": 945515548.0, "z_loss": 0.00047477398766204715 }, { "copy_logits_max": -7.577883720397949, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.6875, "epoch": 1.528976257339801, "gen_logits_max": 3.65511417388916, "gen_logits_mean": -15.6953706741333, "gen_logits_min": -27.725906372070312, "gen_logits_std": 3.189969062805176, "gen_loss": 0.26476654410362244, "grad_norm": 0.3913644369003477, "learning_rate": 2.1473684210526316e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9967080801725388, "mean_gen_accuracy": 0.8745564669370651, "mean_token_accuracy": 0.9060704410076141, "num_tokens": 945751081.0, "sample_num_tokens": 9294.25, "step": 7486, "total_num_tokens": 945788258.0, "z_loss": 0.000477488967590034 }, { "copy_logits_max": -5.924154281616211, "copy_logits_min": -750000000.0, "copy_num_tokens": 542.4375, "epoch": 1.5291804952769978, "gen_logits_max": 3.9754385948181152, "gen_logits_mean": -15.644804000854492, "gen_logits_min": -27.81509017944336, "gen_logits_std": 3.201972484588623, "gen_loss": 0.3209511637687683, "grad_norm": 0.34314484284014574, "learning_rate": 2.1472421052631577e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9968363046646118, "mean_gen_accuracy": 0.8750020563602448, "mean_token_accuracy": 0.9078782200813293, "num_tokens": 946034908.0, "sample_num_tokens": 9571.5, "step": 7487, "total_num_tokens": 946073194.0, "z_loss": 0.0005566258914768696 }, { "copy_logits_max": -6.010402679443359, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.875, "epoch": 1.5293847332141945, "gen_logits_max": 4.507119178771973, "gen_logits_mean": -14.561515808105469, "gen_logits_min": -27.11471939086914, "gen_logits_std": 3.191932439804077, "gen_loss": 0.24487826228141785, "grad_norm": 0.3832388473779899, "learning_rate": 2.147115789473684e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9972329288721085, "mean_gen_accuracy": 0.8812070935964584, "mean_token_accuracy": 0.9080658555030823, "num_tokens": 946309283.0, "sample_num_tokens": 9597.75, "step": 7488, "total_num_tokens": 946347674.0, "z_loss": 0.00046416674740612507 }, { "copy_logits_max": -3.6798832416534424, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.5, "epoch": 1.5295889711513913, "gen_logits_max": 4.511335372924805, "gen_logits_mean": -14.883574485778809, "gen_logits_min": -27.434194564819336, "gen_logits_std": 3.2087695598602295, "gen_loss": 0.2680269181728363, "grad_norm": 0.3626202074757308, "learning_rate": 2.146989473684211e-05, "loss": 0.2678, "mean_copy_accuracy": 0.9976536482572556, "mean_gen_accuracy": 0.8812536299228668, "mean_token_accuracy": 0.9096876680850983, "num_tokens": 946578293.0, "sample_num_tokens": 7106.25, "step": 7489, "total_num_tokens": 946606718.0, "z_loss": 0.00043503294000402093 }, { "copy_logits_max": -7.5753655433654785, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.625, "epoch": 1.5297932090885882, "gen_logits_max": 4.759289264678955, "gen_logits_mean": -15.80394458770752, "gen_logits_min": -27.52952766418457, "gen_logits_std": 3.2095794677734375, "gen_loss": 0.28918886184692383, "grad_norm": 0.352131276176168, "learning_rate": 2.146863157894737e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9973262697458267, "mean_gen_accuracy": 0.8745190799236298, "mean_token_accuracy": 0.90538290143013, "num_tokens": 946871031.0, "sample_num_tokens": 9016.75, "step": 7490, "total_num_tokens": 946907098.0, "z_loss": 0.0004931349540129304 }, { "copy_logits_max": -8.675275802612305, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.1875, "epoch": 1.529997447025785, "gen_logits_max": 4.453660011291504, "gen_logits_mean": -16.735401153564453, "gen_logits_min": -28.5987606048584, "gen_logits_std": 3.1934854984283447, "gen_loss": 0.3154560625553131, "grad_norm": 0.3713031196816732, "learning_rate": 2.1467368421052634e-05, "loss": 0.2956, "mean_copy_accuracy": 0.9958790689706802, "mean_gen_accuracy": 0.87297722697258, "mean_token_accuracy": 0.8999107033014297, "num_tokens": 947136707.0, "sample_num_tokens": 8714.25, "step": 7491, "total_num_tokens": 947171564.0, "z_loss": 0.0005411293241195381 }, { "copy_logits_max": -2.5507683753967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.0, "epoch": 1.5302016849629818, "gen_logits_max": 3.634397029876709, "gen_logits_mean": -15.240944862365723, "gen_logits_min": -27.635658264160156, "gen_logits_std": 3.2206132411956787, "gen_loss": 0.24017281830310822, "grad_norm": 0.41907246271685966, "learning_rate": 2.1466105263157895e-05, "loss": 0.2722, "mean_copy_accuracy": 0.996428832411766, "mean_gen_accuracy": 0.8765060752630234, "mean_token_accuracy": 0.9108505845069885, "num_tokens": 947416959.0, "sample_num_tokens": 9563.25, "step": 7492, "total_num_tokens": 947455212.0, "z_loss": 0.0004017663304693997 }, { "copy_logits_max": -5.710393905639648, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.6875, "epoch": 1.5304059229001787, "gen_logits_max": 3.716764211654663, "gen_logits_mean": -17.180068969726562, "gen_logits_min": -29.249736785888672, "gen_logits_std": 3.234550714492798, "gen_loss": 0.3085888624191284, "grad_norm": 0.35035326489096774, "learning_rate": 2.146484210526316e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9972183108329773, "mean_gen_accuracy": 0.8776030391454697, "mean_token_accuracy": 0.9080667793750763, "num_tokens": 947696194.0, "sample_num_tokens": 8051.0, "step": 7493, "total_num_tokens": 947728398.0, "z_loss": 0.0005345044191926718 }, { "copy_logits_max": -4.933371543884277, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.75, "epoch": 1.5306101608373757, "gen_logits_max": 4.800249099731445, "gen_logits_mean": -15.354747772216797, "gen_logits_min": -27.884201049804688, "gen_logits_std": 3.179304599761963, "gen_loss": 0.28512537479400635, "grad_norm": 0.4111822231925196, "learning_rate": 2.146357894736842e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9969616532325745, "mean_gen_accuracy": 0.8723650127649307, "mean_token_accuracy": 0.8996558785438538, "num_tokens": 947942836.0, "sample_num_tokens": 8452.0, "step": 7494, "total_num_tokens": 947976644.0, "z_loss": 0.000573827070184052 }, { "copy_logits_max": -5.347638130187988, "copy_logits_min": -625000064.0, "copy_num_tokens": 393.25, "epoch": 1.5308143987745724, "gen_logits_max": 3.612287759780884, "gen_logits_mean": -16.772897720336914, "gen_logits_min": -28.893239974975586, "gen_logits_std": 3.241218090057373, "gen_loss": 0.260197252035141, "grad_norm": 0.37421023955890725, "learning_rate": 2.1462315789473684e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9973615556955338, "mean_gen_accuracy": 0.8776594996452332, "mean_token_accuracy": 0.9105961322784424, "num_tokens": 948212038.0, "sample_num_tokens": 7364.5, "step": 7495, "total_num_tokens": 948241496.0, "z_loss": 0.0004494466120377183 }, { "copy_logits_max": -4.0980329513549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 698.25, "epoch": 1.531018636711769, "gen_logits_max": 3.647974967956543, "gen_logits_mean": -15.733909606933594, "gen_logits_min": -28.028547286987305, "gen_logits_std": 3.1763086318969727, "gen_loss": 0.2555888295173645, "grad_norm": 0.387675240898366, "learning_rate": 2.1461052631578945e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9968504309654236, "mean_gen_accuracy": 0.8745824545621872, "mean_token_accuracy": 0.9039237052202225, "num_tokens": 948469225.0, "sample_num_tokens": 10418.25, "step": 7496, "total_num_tokens": 948510898.0, "z_loss": 0.00041734360274858773 }, { "copy_logits_max": -4.735623359680176, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.4375, "epoch": 1.531222874648966, "gen_logits_max": 4.523684501647949, "gen_logits_mean": -15.627716064453125, "gen_logits_min": -28.008758544921875, "gen_logits_std": 3.201983690261841, "gen_loss": 0.28375720977783203, "grad_norm": 0.35650260844418874, "learning_rate": 2.1459789473684213e-05, "loss": 0.283, "mean_copy_accuracy": 0.9952630400657654, "mean_gen_accuracy": 0.8764046728610992, "mean_token_accuracy": 0.9047518670558929, "num_tokens": 948742538.0, "sample_num_tokens": 7188.0, "step": 7497, "total_num_tokens": 948771290.0, "z_loss": 0.0004822742776013911 }, { "copy_logits_max": -3.31534481048584, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.75, "epoch": 1.531427112586163, "gen_logits_max": 3.8198726177215576, "gen_logits_mean": -15.157349586486816, "gen_logits_min": -28.1225528717041, "gen_logits_std": 3.165520668029785, "gen_loss": 0.23484477400779724, "grad_norm": 0.36631275202988567, "learning_rate": 2.1458526315789474e-05, "loss": 0.2596, "mean_copy_accuracy": 0.9963597804307938, "mean_gen_accuracy": 0.882594496011734, "mean_token_accuracy": 0.9112494140863419, "num_tokens": 949008587.0, "sample_num_tokens": 8052.25, "step": 7498, "total_num_tokens": 949040796.0, "z_loss": 0.00036329994327388704 }, { "copy_logits_max": -6.679398059844971, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.3125, "epoch": 1.5316313505233596, "gen_logits_max": 3.116973400115967, "gen_logits_mean": -18.703310012817383, "gen_logits_min": -30.23828887939453, "gen_logits_std": 3.2688560485839844, "gen_loss": 0.27871084213256836, "grad_norm": 0.3733589997894155, "learning_rate": 2.1457263157894738e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9962972402572632, "mean_gen_accuracy": 0.8787548094987869, "mean_token_accuracy": 0.9076341986656189, "num_tokens": 949291360.0, "sample_num_tokens": 9455.0, "step": 7499, "total_num_tokens": 949329180.0, "z_loss": 0.0004463795048650354 }, { "epoch": 1.5318355884605566, "grad_norm": 0.3627224905654711, "learning_rate": 2.1456e-05, "loss": 0.2687, "step": 7500 }, { "epoch": 1.5318355884605566, "eval_copy_logits_max": -7.539266109466553, "eval_copy_logits_min": -82.89180755615234, "eval_gen_logits_max": 2.502999782562256, "eval_gen_logits_mean": -21.633649826049805, "eval_gen_logits_min": -32.665504455566406, "eval_gen_logits_std": 3.313689947128296, "eval_gen_loss": 0.31058549880981445, "eval_loss": 0.28636300563812256, "eval_mean_copy_accuracy": 0.9933502078056335, "eval_mean_gen_accuracy": 0.8851127326488495, "eval_mean_token_accuracy": 0.8991273641586304, "eval_num_tokens": 949602196.0, "eval_runtime": 0.692, "eval_samples_per_second": 11.561, "eval_steps_per_second": 2.89, "eval_total_num_tokens": 949602196.0, "eval_z_loss": 0.0004787435755133629, "step": 7500 }, { "copy_logits_max": -5.463945388793945, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.1875, "epoch": 1.5320398263977535, "gen_logits_max": 4.032837867736816, "gen_logits_mean": -15.63339614868164, "gen_logits_min": -27.88294792175293, "gen_logits_std": 3.16545033454895, "gen_loss": 0.33231794834136963, "grad_norm": 0.3243330402162556, "learning_rate": 2.1454736842105264e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9970076829195023, "mean_gen_accuracy": 0.8775305151939392, "mean_token_accuracy": 0.9088972508907318, "num_tokens": 949840002.0, "sample_num_tokens": 8963.0, "step": 7501, "total_num_tokens": 949875854.0, "z_loss": 0.0005659280577674508 }, { "copy_logits_max": -5.041101455688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.125, "epoch": 1.5322440643349502, "gen_logits_max": 3.0389957427978516, "gen_logits_mean": -17.223979949951172, "gen_logits_min": -29.235979080200195, "gen_logits_std": 3.252884864807129, "gen_loss": 0.26899194717407227, "grad_norm": 0.3721783932904554, "learning_rate": 2.1453473684210528e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9971072375774384, "mean_gen_accuracy": 0.87384332716465, "mean_token_accuracy": 0.904755562543869, "num_tokens": 950113729.0, "sample_num_tokens": 8400.75, "step": 7502, "total_num_tokens": 950147332.0, "z_loss": 0.0004689900379162282 }, { "copy_logits_max": -5.835583209991455, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.75, "epoch": 1.532448302272147, "gen_logits_max": 3.3749115467071533, "gen_logits_mean": -17.799468994140625, "gen_logits_min": -30.093421936035156, "gen_logits_std": 3.2633585929870605, "gen_loss": 0.2701566815376282, "grad_norm": 0.33694616120192783, "learning_rate": 2.145221052631579e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9968199729919434, "mean_gen_accuracy": 0.8803516775369644, "mean_token_accuracy": 0.9097348898649216, "num_tokens": 950403153.0, "sample_num_tokens": 8066.75, "step": 7503, "total_num_tokens": 950435420.0, "z_loss": 0.000454218708910048 }, { "copy_logits_max": -1.1814337968826294, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.3125, "epoch": 1.532652540209344, "gen_logits_max": 4.245237350463867, "gen_logits_mean": -15.536417007446289, "gen_logits_min": -28.090045928955078, "gen_logits_std": 3.197110176086426, "gen_loss": 0.324363112449646, "grad_norm": 0.37656281990105805, "learning_rate": 2.1450947368421053e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9969705641269684, "mean_gen_accuracy": 0.8708003610372543, "mean_token_accuracy": 0.9032422751188278, "num_tokens": 950673170.0, "sample_num_tokens": 9475.0, "step": 7504, "total_num_tokens": 950711070.0, "z_loss": 0.0005679347086697817 }, { "copy_logits_max": -5.340519905090332, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.25, "epoch": 1.5328567781465408, "gen_logits_max": 3.576641082763672, "gen_logits_mean": -17.266239166259766, "gen_logits_min": -29.57257080078125, "gen_logits_std": 3.250948429107666, "gen_loss": 0.26619839668273926, "grad_norm": 0.4185076872975806, "learning_rate": 2.1449684210526317e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9967830628156662, "mean_gen_accuracy": 0.8719114810228348, "mean_token_accuracy": 0.9027621001005173, "num_tokens": 950952464.0, "sample_num_tokens": 8347.5, "step": 7505, "total_num_tokens": 950985854.0, "z_loss": 0.0005105729214847088 }, { "copy_logits_max": -2.650603771209717, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.9375, "epoch": 1.5330610160837375, "gen_logits_max": 4.810790061950684, "gen_logits_mean": -14.75920581817627, "gen_logits_min": -27.40422821044922, "gen_logits_std": 3.1226108074188232, "gen_loss": 0.299220472574234, "grad_norm": 0.358057541915367, "learning_rate": 2.1448421052631582e-05, "loss": 0.291, "mean_copy_accuracy": 0.9962978065013885, "mean_gen_accuracy": 0.8751869946718216, "mean_token_accuracy": 0.9023445695638657, "num_tokens": 951224255.0, "sample_num_tokens": 7692.25, "step": 7506, "total_num_tokens": 951255024.0, "z_loss": 0.000554050668142736 }, { "copy_logits_max": -2.9604361057281494, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.1875, "epoch": 1.5332652540209344, "gen_logits_max": 4.510250091552734, "gen_logits_mean": -15.24116325378418, "gen_logits_min": -27.90298080444336, "gen_logits_std": 3.124056339263916, "gen_loss": 0.275176465511322, "grad_norm": 0.3840995411710883, "learning_rate": 2.1447157894736843e-05, "loss": 0.2969, "mean_copy_accuracy": 0.9964389055967331, "mean_gen_accuracy": 0.8715484440326691, "mean_token_accuracy": 0.9006918966770172, "num_tokens": 951488004.0, "sample_num_tokens": 8844.5, "step": 7507, "total_num_tokens": 951523382.0, "z_loss": 0.0004665147280320525 }, { "copy_logits_max": -5.694370269775391, "copy_logits_min": -750000064.0, "copy_num_tokens": 579.5, "epoch": 1.5334694919581313, "gen_logits_max": 4.649248123168945, "gen_logits_mean": -14.457903861999512, "gen_logits_min": -26.632495880126953, "gen_logits_std": 3.16566801071167, "gen_loss": 0.2715464234352112, "grad_norm": 0.3459221523696633, "learning_rate": 2.1445894736842107e-05, "loss": 0.2807, "mean_copy_accuracy": 0.996345192193985, "mean_gen_accuracy": 0.8704893589019775, "mean_token_accuracy": 0.9040872603654861, "num_tokens": 951769541.0, "sample_num_tokens": 8757.25, "step": 7508, "total_num_tokens": 951804570.0, "z_loss": 0.0004482483782339841 }, { "copy_logits_max": -4.728536605834961, "copy_logits_min": -625000064.0, "copy_num_tokens": 530.0, "epoch": 1.533673729895328, "gen_logits_max": 3.5864815711975098, "gen_logits_mean": -17.15863037109375, "gen_logits_min": -29.56022071838379, "gen_logits_std": 3.255359649658203, "gen_loss": 0.2842201590538025, "grad_norm": 0.3466250797661109, "learning_rate": 2.1444631578947368e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9960635751485825, "mean_gen_accuracy": 0.8711427450180054, "mean_token_accuracy": 0.9053965061903, "num_tokens": 952049285.0, "sample_num_tokens": 8962.75, "step": 7509, "total_num_tokens": 952085136.0, "z_loss": 0.0005308127147145569 }, { "copy_logits_max": -4.813220977783203, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.5625, "epoch": 1.533877967832525, "gen_logits_max": 4.524064064025879, "gen_logits_mean": -15.923624038696289, "gen_logits_min": -27.951810836791992, "gen_logits_std": 3.1957573890686035, "gen_loss": 0.26375827193260193, "grad_norm": 0.39260049331860936, "learning_rate": 2.1443368421052632e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9970409125089645, "mean_gen_accuracy": 0.8786194026470184, "mean_token_accuracy": 0.9068475067615509, "num_tokens": 952338557.0, "sample_num_tokens": 8661.75, "step": 7510, "total_num_tokens": 952373204.0, "z_loss": 0.00040562814683653414 }, { "copy_logits_max": -5.284875869750977, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.125, "epoch": 1.5340822057697219, "gen_logits_max": 3.8606984615325928, "gen_logits_mean": -16.14774513244629, "gen_logits_min": -28.282917022705078, "gen_logits_std": 3.2293450832366943, "gen_loss": 0.26171159744262695, "grad_norm": 0.3485245917131046, "learning_rate": 2.1442105263157893e-05, "loss": 0.279, "mean_copy_accuracy": 0.9966566860675812, "mean_gen_accuracy": 0.875185951590538, "mean_token_accuracy": 0.9061244875192642, "num_tokens": 952608832.0, "sample_num_tokens": 8717.5, "step": 7511, "total_num_tokens": 952643702.0, "z_loss": 0.00041364150820299983 }, { "copy_logits_max": -3.2379159927368164, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.125, "epoch": 1.5342864437069186, "gen_logits_max": 4.247289180755615, "gen_logits_mean": -16.17767333984375, "gen_logits_min": -28.405168533325195, "gen_logits_std": 3.198258638381958, "gen_loss": 0.2965584397315979, "grad_norm": 0.3260107738913913, "learning_rate": 2.1440842105263157e-05, "loss": 0.2794, "mean_copy_accuracy": 0.997327595949173, "mean_gen_accuracy": 0.8741034865379333, "mean_token_accuracy": 0.9043476283550262, "num_tokens": 952886527.0, "sample_num_tokens": 7172.25, "step": 7512, "total_num_tokens": 952915216.0, "z_loss": 0.0005039593670517206 }, { "copy_logits_max": -5.856813430786133, "copy_logits_min": -562500032.0, "copy_num_tokens": 597.6875, "epoch": 1.5344906816441153, "gen_logits_max": 4.546668529510498, "gen_logits_mean": -15.543061256408691, "gen_logits_min": -27.561859130859375, "gen_logits_std": 3.2187161445617676, "gen_loss": 0.22789421677589417, "grad_norm": 0.38029666723336186, "learning_rate": 2.1439578947368422e-05, "loss": 0.2773, "mean_copy_accuracy": 0.99701027572155, "mean_gen_accuracy": 0.8799460977315903, "mean_token_accuracy": 0.9066729694604874, "num_tokens": 953149349.0, "sample_num_tokens": 9194.25, "step": 7513, "total_num_tokens": 953186126.0, "z_loss": 0.0003589359112083912 }, { "copy_logits_max": -5.011263370513916, "copy_logits_min": -750000064.0, "copy_num_tokens": 559.875, "epoch": 1.5346949195813122, "gen_logits_max": 4.021633148193359, "gen_logits_mean": -15.383323669433594, "gen_logits_min": -27.838150024414062, "gen_logits_std": 3.215521812438965, "gen_loss": 0.29519927501678467, "grad_norm": 0.362316752476812, "learning_rate": 2.1438315789473686e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9954416900873184, "mean_gen_accuracy": 0.878203272819519, "mean_token_accuracy": 0.9050475060939789, "num_tokens": 953398593.0, "sample_num_tokens": 8623.25, "step": 7514, "total_num_tokens": 953433086.0, "z_loss": 0.00045135110849514604 }, { "copy_logits_max": -2.5880823135375977, "copy_logits_min": -750000064.0, "copy_num_tokens": 402.0625, "epoch": 1.5348991575185091, "gen_logits_max": 5.002577781677246, "gen_logits_mean": -15.00736141204834, "gen_logits_min": -27.077306747436523, "gen_logits_std": 3.2033605575561523, "gen_loss": 0.3068671226501465, "grad_norm": 0.3626238389706787, "learning_rate": 2.143705263157895e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9953461140394211, "mean_gen_accuracy": 0.8737006485462189, "mean_token_accuracy": 0.9002986997365952, "num_tokens": 953670466.0, "sample_num_tokens": 8199.5, "step": 7515, "total_num_tokens": 953703264.0, "z_loss": 0.00048340356443077326 }, { "copy_logits_max": -5.806137561798096, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.0, "epoch": 1.5351033954557058, "gen_logits_max": 4.812297344207764, "gen_logits_mean": -15.327407836914062, "gen_logits_min": -27.675678253173828, "gen_logits_std": 3.2305264472961426, "gen_loss": 0.2528657615184784, "grad_norm": 0.37801708053074307, "learning_rate": 2.143578947368421e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9971032589673996, "mean_gen_accuracy": 0.875982791185379, "mean_token_accuracy": 0.9086306244134903, "num_tokens": 953947538.0, "sample_num_tokens": 9083.5, "step": 7516, "total_num_tokens": 953983872.0, "z_loss": 0.000374992931028828 }, { "copy_logits_max": -4.469775199890137, "copy_logits_min": -625000064.0, "copy_num_tokens": 414.1875, "epoch": 1.5353076333929028, "gen_logits_max": 4.779231071472168, "gen_logits_mean": -15.75042724609375, "gen_logits_min": -28.176982879638672, "gen_logits_std": 3.233165740966797, "gen_loss": 0.2529347538948059, "grad_norm": 0.34475220521177113, "learning_rate": 2.1434526315789476e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9969123750925064, "mean_gen_accuracy": 0.8719766139984131, "mean_token_accuracy": 0.9085772633552551, "num_tokens": 954248234.0, "sample_num_tokens": 7691.0, "step": 7517, "total_num_tokens": 954278998.0, "z_loss": 0.0004213005304336548 }, { "copy_logits_max": -4.570947647094727, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.1875, "epoch": 1.5355118713300997, "gen_logits_max": 5.1937255859375, "gen_logits_mean": -14.621917724609375, "gen_logits_min": -26.897266387939453, "gen_logits_std": 3.169567108154297, "gen_loss": 0.3440718650817871, "grad_norm": 0.3420773562667208, "learning_rate": 2.1433263157894736e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9953178316354752, "mean_gen_accuracy": 0.883523091673851, "mean_token_accuracy": 0.9068122506141663, "num_tokens": 954526158.0, "sample_num_tokens": 8312.0, "step": 7518, "total_num_tokens": 954559406.0, "z_loss": 0.0005361426156014204 }, { "copy_logits_max": -5.961320877075195, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.8125, "epoch": 1.5357161092672964, "gen_logits_max": 3.160973310470581, "gen_logits_mean": -17.923696517944336, "gen_logits_min": -29.993877410888672, "gen_logits_std": 3.2796244621276855, "gen_loss": 0.2874777019023895, "grad_norm": 0.36990351955794093, "learning_rate": 2.1432e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9969180226325989, "mean_gen_accuracy": 0.8803208321332932, "mean_token_accuracy": 0.9097943902015686, "num_tokens": 954806816.0, "sample_num_tokens": 7640.5, "step": 7519, "total_num_tokens": 954837378.0, "z_loss": 0.0004839281318709254 }, { "copy_logits_max": -4.192677974700928, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.0625, "epoch": 1.535920347204493, "gen_logits_max": 3.9985291957855225, "gen_logits_mean": -16.70901107788086, "gen_logits_min": -28.476882934570312, "gen_logits_std": 3.2074480056762695, "gen_loss": 0.27912697196006775, "grad_norm": 0.35462229850126453, "learning_rate": 2.1430736842105262e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9963971376419067, "mean_gen_accuracy": 0.8835112899541855, "mean_token_accuracy": 0.9115294963121414, "num_tokens": 955085074.0, "sample_num_tokens": 7662.5, "step": 7520, "total_num_tokens": 955115724.0, "z_loss": 0.0004512216546572745 }, { "copy_logits_max": -5.24947452545166, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.5625, "epoch": 1.53612458514169, "gen_logits_max": 4.354185104370117, "gen_logits_mean": -17.59356689453125, "gen_logits_min": -29.82254409790039, "gen_logits_std": 3.269589900970459, "gen_loss": 0.2746778726577759, "grad_norm": 0.3600044429023749, "learning_rate": 2.1429473684210526e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9965238720178604, "mean_gen_accuracy": 0.8750407546758652, "mean_token_accuracy": 0.9028404206037521, "num_tokens": 955346813.0, "sample_num_tokens": 8896.75, "step": 7521, "total_num_tokens": 955382400.0, "z_loss": 0.0004504653625190258 }, { "copy_logits_max": -4.873305320739746, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.0, "epoch": 1.536328823078887, "gen_logits_max": 5.640275001525879, "gen_logits_mean": -14.091952323913574, "gen_logits_min": -26.24628448486328, "gen_logits_std": 3.1572017669677734, "gen_loss": 0.2878763675689697, "grad_norm": 0.38188412019921336, "learning_rate": 2.142821052631579e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9970129430294037, "mean_gen_accuracy": 0.8751864582300186, "mean_token_accuracy": 0.9017772525548935, "num_tokens": 955608037.0, "sample_num_tokens": 9581.25, "step": 7522, "total_num_tokens": 955646362.0, "z_loss": 0.0005105740274302661 }, { "copy_logits_max": -3.825941562652588, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.5, "epoch": 1.5365330610160837, "gen_logits_max": 3.2040810585021973, "gen_logits_mean": -17.56305694580078, "gen_logits_min": -29.75838851928711, "gen_logits_std": 3.270565986633301, "gen_loss": 0.2819758951663971, "grad_norm": 0.36899984138535796, "learning_rate": 2.1426947368421055e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9960983395576477, "mean_gen_accuracy": 0.8800262808799744, "mean_token_accuracy": 0.9060885608196259, "num_tokens": 955868850.0, "sample_num_tokens": 8555.5, "step": 7523, "total_num_tokens": 955903072.0, "z_loss": 0.00044170499313622713 }, { "copy_logits_max": -2.724978446960449, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.75, "epoch": 1.5367372989532806, "gen_logits_max": 4.831368446350098, "gen_logits_mean": -15.150917053222656, "gen_logits_min": -27.459150314331055, "gen_logits_std": 3.22969913482666, "gen_loss": 0.26464587450027466, "grad_norm": 0.3663286381942547, "learning_rate": 2.1425684210526316e-05, "loss": 0.2913, "mean_copy_accuracy": 0.996507853269577, "mean_gen_accuracy": 0.8715979903936386, "mean_token_accuracy": 0.901001900434494, "num_tokens": 956124943.0, "sample_num_tokens": 9135.25, "step": 7524, "total_num_tokens": 956161484.0, "z_loss": 0.0004915889585390687 }, { "copy_logits_max": -2.473468780517578, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.0625, "epoch": 1.5369415368904775, "gen_logits_max": 5.461182594299316, "gen_logits_mean": -15.010116577148438, "gen_logits_min": -27.19137191772461, "gen_logits_std": 3.1987760066986084, "gen_loss": 0.27672672271728516, "grad_norm": 0.35939505834693325, "learning_rate": 2.142442105263158e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9961370527744293, "mean_gen_accuracy": 0.8759312480688095, "mean_token_accuracy": 0.9053103625774384, "num_tokens": 956376032.0, "sample_num_tokens": 7250.5, "step": 7525, "total_num_tokens": 956405034.0, "z_loss": 0.0005588908679783344 }, { "copy_logits_max": -3.7865028381347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.6875, "epoch": 1.5371457748276742, "gen_logits_max": 5.120065689086914, "gen_logits_mean": -15.358905792236328, "gen_logits_min": -26.890384674072266, "gen_logits_std": 3.1735000610351562, "gen_loss": 0.24384739995002747, "grad_norm": 0.3359458433409849, "learning_rate": 2.142315789473684e-05, "loss": 0.2552, "mean_copy_accuracy": 0.9975083917379379, "mean_gen_accuracy": 0.8848870992660522, "mean_token_accuracy": 0.9123025983572006, "num_tokens": 956650070.0, "sample_num_tokens": 10973.0, "step": 7526, "total_num_tokens": 956693962.0, "z_loss": 0.0004888465628027916 }, { "copy_logits_max": -2.16359281539917, "copy_logits_min": -687500032.0, "copy_num_tokens": 434.8125, "epoch": 1.537350012764871, "gen_logits_max": 5.6918625831604, "gen_logits_mean": -14.46026611328125, "gen_logits_min": -26.002750396728516, "gen_logits_std": 3.161994457244873, "gen_loss": 0.2712501883506775, "grad_norm": 0.3248097453488132, "learning_rate": 2.1421894736842105e-05, "loss": 0.2517, "mean_copy_accuracy": 0.9976073950529099, "mean_gen_accuracy": 0.8871550410985947, "mean_token_accuracy": 0.915454775094986, "num_tokens": 956944494.0, "sample_num_tokens": 8962.0, "step": 7527, "total_num_tokens": 956980342.0, "z_loss": 0.00044807896483689547 }, { "copy_logits_max": -3.078965663909912, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.375, "epoch": 1.5375542507020679, "gen_logits_max": 5.544658660888672, "gen_logits_mean": -15.04899787902832, "gen_logits_min": -26.743864059448242, "gen_logits_std": 3.1893081665039062, "gen_loss": 0.26265740394592285, "grad_norm": 0.3611322121478189, "learning_rate": 2.142063157894737e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9967340528964996, "mean_gen_accuracy": 0.8728101849555969, "mean_token_accuracy": 0.9009084850549698, "num_tokens": 957213529.0, "sample_num_tokens": 8471.25, "step": 7528, "total_num_tokens": 957247414.0, "z_loss": 0.0004591770702973008 }, { "copy_logits_max": -2.419981002807617, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.3125, "epoch": 1.5377584886392648, "gen_logits_max": 3.945984125137329, "gen_logits_mean": -17.005979537963867, "gen_logits_min": -29.125524520874023, "gen_logits_std": 3.251352310180664, "gen_loss": 0.3073159456253052, "grad_norm": 0.3360210046776896, "learning_rate": 2.141936842105263e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9965953528881073, "mean_gen_accuracy": 0.8693564087152481, "mean_token_accuracy": 0.9022180736064911, "num_tokens": 957505773.0, "sample_num_tokens": 8420.75, "step": 7529, "total_num_tokens": 957539456.0, "z_loss": 0.0005240211030468345 }, { "copy_logits_max": 0.0026190578937530518, "copy_logits_min": -750000128.0, "copy_num_tokens": 565.625, "epoch": 1.5379627265764615, "gen_logits_max": 6.504699230194092, "gen_logits_mean": -12.53765869140625, "gen_logits_min": -24.766990661621094, "gen_logits_std": 3.154010772705078, "gen_loss": 0.23339444398880005, "grad_norm": 0.3559109350572903, "learning_rate": 2.1418105263157898e-05, "loss": 0.271, "mean_copy_accuracy": 0.9969660341739655, "mean_gen_accuracy": 0.8786766678094864, "mean_token_accuracy": 0.9064512699842453, "num_tokens": 957767215.0, "sample_num_tokens": 8857.25, "step": 7530, "total_num_tokens": 957802644.0, "z_loss": 0.000394498638343066 }, { "copy_logits_max": -4.8126749992370605, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.625, "epoch": 1.5381669645136584, "gen_logits_max": 4.620850563049316, "gen_logits_mean": -15.720189094543457, "gen_logits_min": -27.49243927001953, "gen_logits_std": 3.234098434448242, "gen_loss": 0.28611046075820923, "grad_norm": 0.36539610305238246, "learning_rate": 2.141684210526316e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9958285540342331, "mean_gen_accuracy": 0.8786545693874359, "mean_token_accuracy": 0.902954950928688, "num_tokens": 958030217.0, "sample_num_tokens": 9207.25, "step": 7531, "total_num_tokens": 958067046.0, "z_loss": 0.00048669942771084607 }, { "copy_logits_max": -3.9530320167541504, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.1875, "epoch": 1.5383712024508553, "gen_logits_max": 4.887997627258301, "gen_logits_mean": -15.82875919342041, "gen_logits_min": -27.952531814575195, "gen_logits_std": 3.227358818054199, "gen_loss": 0.3061496615409851, "grad_norm": 0.33966585797079585, "learning_rate": 2.1415578947368423e-05, "loss": 0.292, "mean_copy_accuracy": 0.9968550354242325, "mean_gen_accuracy": 0.8720115125179291, "mean_token_accuracy": 0.901824876666069, "num_tokens": 958317854.0, "sample_num_tokens": 9509.5, "step": 7532, "total_num_tokens": 958355892.0, "z_loss": 0.0005048024468123913 }, { "copy_logits_max": -5.835150718688965, "copy_logits_min": -750000064.0, "copy_num_tokens": 519.75, "epoch": 1.538575440388052, "gen_logits_max": 4.72382116317749, "gen_logits_mean": -15.389713287353516, "gen_logits_min": -27.33835220336914, "gen_logits_std": 3.247079849243164, "gen_loss": 0.24769248068332672, "grad_norm": 0.32967815321859545, "learning_rate": 2.1414315789473684e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9981252998113632, "mean_gen_accuracy": 0.8794550746679306, "mean_token_accuracy": 0.9096356928348541, "num_tokens": 958607130.0, "sample_num_tokens": 8824.0, "step": 7533, "total_num_tokens": 958642426.0, "z_loss": 0.0003883344470523298 }, { "copy_logits_max": -5.9073381423950195, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.625, "epoch": 1.5387796783252488, "gen_logits_max": 3.6806039810180664, "gen_logits_mean": -17.855234146118164, "gen_logits_min": -29.289813995361328, "gen_logits_std": 3.2639780044555664, "gen_loss": 0.29820960760116577, "grad_norm": 0.3853789804527745, "learning_rate": 2.141305263157895e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9957811683416367, "mean_gen_accuracy": 0.8776395618915558, "mean_token_accuracy": 0.9054450243711472, "num_tokens": 958865886.0, "sample_num_tokens": 7866.0, "step": 7534, "total_num_tokens": 958897350.0, "z_loss": 0.0004741945886053145 }, { "copy_logits_max": -4.497923851013184, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.875, "epoch": 1.538983916262446, "gen_logits_max": 5.531924247741699, "gen_logits_mean": -15.214620590209961, "gen_logits_min": -26.905807495117188, "gen_logits_std": 3.1415343284606934, "gen_loss": 0.32135558128356934, "grad_norm": 0.375583953687763, "learning_rate": 2.141178947368421e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9963660836219788, "mean_gen_accuracy": 0.8757578134536743, "mean_token_accuracy": 0.9033454209566116, "num_tokens": 959134785.0, "sample_num_tokens": 7462.75, "step": 7535, "total_num_tokens": 959164636.0, "z_loss": 0.0005409441655501723 }, { "copy_logits_max": -3.1840269565582275, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.6875, "epoch": 1.5391881541996426, "gen_logits_max": 4.618682861328125, "gen_logits_mean": -15.000900268554688, "gen_logits_min": -27.05094337463379, "gen_logits_std": 3.229623556137085, "gen_loss": 0.2908400893211365, "grad_norm": 0.36029925661786866, "learning_rate": 2.1410526315789474e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9961681813001633, "mean_gen_accuracy": 0.8737568706274033, "mean_token_accuracy": 0.9051456600427628, "num_tokens": 959402628.0, "sample_num_tokens": 8308.0, "step": 7536, "total_num_tokens": 959435860.0, "z_loss": 0.00047820681356824934 }, { "copy_logits_max": -5.20039176940918, "copy_logits_min": -687500032.0, "copy_num_tokens": 604.3125, "epoch": 1.5393923921368393, "gen_logits_max": 4.091794490814209, "gen_logits_mean": -15.863530158996582, "gen_logits_min": -28.070528030395508, "gen_logits_std": 3.266578435897827, "gen_loss": 0.23689445853233337, "grad_norm": 0.35191599580191724, "learning_rate": 2.1409263157894735e-05, "loss": 0.2536, "mean_copy_accuracy": 0.9965801239013672, "mean_gen_accuracy": 0.8870319575071335, "mean_token_accuracy": 0.9157262295484543, "num_tokens": 959675769.0, "sample_num_tokens": 8935.25, "step": 7537, "total_num_tokens": 959711510.0, "z_loss": 0.0003882166347466409 }, { "copy_logits_max": -4.25218391418457, "copy_logits_min": -750000064.0, "copy_num_tokens": 447.0625, "epoch": 1.5395966300740362, "gen_logits_max": 4.909949779510498, "gen_logits_mean": -14.15588665008545, "gen_logits_min": -26.436195373535156, "gen_logits_std": 3.179042339324951, "gen_loss": 0.278562068939209, "grad_norm": 0.35422483032137037, "learning_rate": 2.1408000000000002e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9967432171106339, "mean_gen_accuracy": 0.8786035925149918, "mean_token_accuracy": 0.9079626351594925, "num_tokens": 959942238.0, "sample_num_tokens": 7682.0, "step": 7538, "total_num_tokens": 959972966.0, "z_loss": 0.0004578889056574553 }, { "copy_logits_max": -6.300201416015625, "copy_logits_min": -687500032.0, "copy_num_tokens": 330.6875, "epoch": 1.5398008680112332, "gen_logits_max": 5.447634696960449, "gen_logits_mean": -15.745721817016602, "gen_logits_min": -27.452030181884766, "gen_logits_std": 3.207519769668579, "gen_loss": 0.28843843936920166, "grad_norm": 0.3682550813846457, "learning_rate": 2.1406736842105263e-05, "loss": 0.2829, "mean_copy_accuracy": 0.995758593082428, "mean_gen_accuracy": 0.8769767731428146, "mean_token_accuracy": 0.9034446179866791, "num_tokens": 960198679.0, "sample_num_tokens": 7350.25, "step": 7539, "total_num_tokens": 960228080.0, "z_loss": 0.0004943089443258941 }, { "copy_logits_max": -4.947833061218262, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.9375, "epoch": 1.5400051059484299, "gen_logits_max": 4.902204990386963, "gen_logits_mean": -14.396686553955078, "gen_logits_min": -27.349872589111328, "gen_logits_std": 3.203641891479492, "gen_loss": 0.2826935648918152, "grad_norm": 0.37347858874222356, "learning_rate": 2.1405473684210528e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9979946911334991, "mean_gen_accuracy": 0.8727011829614639, "mean_token_accuracy": 0.9076343774795532, "num_tokens": 960467813.0, "sample_num_tokens": 8104.25, "step": 7540, "total_num_tokens": 960500230.0, "z_loss": 0.0004611504264175892 }, { "copy_logits_max": -7.32461404800415, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.4375, "epoch": 1.5402093438856268, "gen_logits_max": 4.110339164733887, "gen_logits_mean": -16.199687957763672, "gen_logits_min": -28.161136627197266, "gen_logits_std": 3.231877326965332, "gen_loss": 0.28113746643066406, "grad_norm": 0.37828811567258, "learning_rate": 2.1404210526315792e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9961059838533401, "mean_gen_accuracy": 0.8825678825378418, "mean_token_accuracy": 0.9060345888137817, "num_tokens": 960720290.0, "sample_num_tokens": 8498.0, "step": 7541, "total_num_tokens": 960754282.0, "z_loss": 0.0004769897786900401 }, { "copy_logits_max": -5.80008602142334, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.75, "epoch": 1.5404135818228237, "gen_logits_max": 4.813502311706543, "gen_logits_mean": -14.828409194946289, "gen_logits_min": -27.53019905090332, "gen_logits_std": 3.2071685791015625, "gen_loss": 0.2636915147304535, "grad_norm": 0.36319640649091384, "learning_rate": 2.1402947368421053e-05, "loss": 0.2924, "mean_copy_accuracy": 0.9965684562921524, "mean_gen_accuracy": 0.8706291913986206, "mean_token_accuracy": 0.9005773961544037, "num_tokens": 960981514.0, "sample_num_tokens": 8816.5, "step": 7542, "total_num_tokens": 961016780.0, "z_loss": 0.0005226818029768765 }, { "copy_logits_max": -5.911153793334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.5, "epoch": 1.5406178197600204, "gen_logits_max": 3.930905818939209, "gen_logits_mean": -16.62689971923828, "gen_logits_min": -28.93526268005371, "gen_logits_std": 3.2458462715148926, "gen_loss": 0.287508100271225, "grad_norm": 0.3641221847095997, "learning_rate": 2.1401684210526317e-05, "loss": 0.275, "mean_copy_accuracy": 0.9963383078575134, "mean_gen_accuracy": 0.8753224611282349, "mean_token_accuracy": 0.9061145484447479, "num_tokens": 961245195.0, "sample_num_tokens": 7502.75, "step": 7543, "total_num_tokens": 961275206.0, "z_loss": 0.0005272143171168864 }, { "copy_logits_max": -5.569523811340332, "copy_logits_min": -625000064.0, "copy_num_tokens": 576.1875, "epoch": 1.5408220576972171, "gen_logits_max": 4.3107099533081055, "gen_logits_mean": -14.985299110412598, "gen_logits_min": -27.41733169555664, "gen_logits_std": 3.2172913551330566, "gen_loss": 0.2419857233762741, "grad_norm": 0.38532741458961856, "learning_rate": 2.1400421052631578e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9967123121023178, "mean_gen_accuracy": 0.8746939599514008, "mean_token_accuracy": 0.9044670760631561, "num_tokens": 961499462.0, "sample_num_tokens": 9151.0, "step": 7544, "total_num_tokens": 961536066.0, "z_loss": 0.00047976989299058914 }, { "copy_logits_max": -6.5888285636901855, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.375, "epoch": 1.541026295634414, "gen_logits_max": 4.761795997619629, "gen_logits_mean": -15.151876449584961, "gen_logits_min": -27.005023956298828, "gen_logits_std": 3.141275644302368, "gen_loss": 0.3109322190284729, "grad_norm": 0.32872504074557657, "learning_rate": 2.1399157894736842e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9971352219581604, "mean_gen_accuracy": 0.8807355165481567, "mean_token_accuracy": 0.9069349616765976, "num_tokens": 961783977.0, "sample_num_tokens": 9557.75, "step": 7545, "total_num_tokens": 961822208.0, "z_loss": 0.0005409564473666251 }, { "copy_logits_max": -7.0440216064453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.3125, "epoch": 1.541230533571611, "gen_logits_max": 4.199670791625977, "gen_logits_mean": -14.137535095214844, "gen_logits_min": -26.116336822509766, "gen_logits_std": 3.1057934761047363, "gen_loss": 0.27167677879333496, "grad_norm": 0.37183441680493834, "learning_rate": 2.1397894736842107e-05, "loss": 0.2692, "mean_copy_accuracy": 0.996948316693306, "mean_gen_accuracy": 0.885252445936203, "mean_token_accuracy": 0.9089978933334351, "num_tokens": 962058937.0, "sample_num_tokens": 8608.25, "step": 7546, "total_num_tokens": 962093370.0, "z_loss": 0.00046127886162139475 }, { "copy_logits_max": -7.230377197265625, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.6875, "epoch": 1.5414347715088077, "gen_logits_max": 3.4990181922912598, "gen_logits_mean": -15.941452026367188, "gen_logits_min": -28.267688751220703, "gen_logits_std": 3.2444303035736084, "gen_loss": 0.25308847427368164, "grad_norm": 0.3425057927010931, "learning_rate": 2.139663157894737e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9965313225984573, "mean_gen_accuracy": 0.8779535591602325, "mean_token_accuracy": 0.9088690727949142, "num_tokens": 962338785.0, "sample_num_tokens": 9423.25, "step": 7547, "total_num_tokens": 962376478.0, "z_loss": 0.00042790448060259223 }, { "copy_logits_max": -6.558778762817383, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.875, "epoch": 1.5416390094460046, "gen_logits_max": 3.654399871826172, "gen_logits_mean": -15.285146713256836, "gen_logits_min": -27.71980094909668, "gen_logits_std": 3.2257239818573, "gen_loss": 0.23790714144706726, "grad_norm": 0.380665832916709, "learning_rate": 2.1395368421052632e-05, "loss": 0.275, "mean_copy_accuracy": 0.9964464455842972, "mean_gen_accuracy": 0.8791908621788025, "mean_token_accuracy": 0.9081549942493439, "num_tokens": 962609341.0, "sample_num_tokens": 8763.75, "step": 7548, "total_num_tokens": 962644396.0, "z_loss": 0.000446322956122458 }, { "copy_logits_max": -6.141171932220459, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.9375, "epoch": 1.5418432473832016, "gen_logits_max": 2.995781421661377, "gen_logits_mean": -17.513404846191406, "gen_logits_min": -29.725040435791016, "gen_logits_std": 3.266064405441284, "gen_loss": 0.2845991551876068, "grad_norm": 0.3419597938524349, "learning_rate": 2.1394105263157896e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9966966807842255, "mean_gen_accuracy": 0.872461199760437, "mean_token_accuracy": 0.905627891421318, "num_tokens": 962885919.0, "sample_num_tokens": 10035.25, "step": 7549, "total_num_tokens": 962926060.0, "z_loss": 0.0005224501946941018 }, { "copy_logits_max": -4.631643295288086, "copy_logits_min": -750000000.0, "copy_num_tokens": 621.6875, "epoch": 1.5420474853203983, "gen_logits_max": 2.428112030029297, "gen_logits_mean": -17.59170913696289, "gen_logits_min": -29.493083953857422, "gen_logits_std": 3.2677488327026367, "gen_loss": 0.2778676748275757, "grad_norm": 0.3606505115177149, "learning_rate": 2.1392842105263157e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9975534528493881, "mean_gen_accuracy": 0.8736686706542969, "mean_token_accuracy": 0.9092669636011124, "num_tokens": 963177393.0, "sample_num_tokens": 9126.25, "step": 7550, "total_num_tokens": 963213898.0, "z_loss": 0.0004518026835285127 }, { "copy_logits_max": -3.706836700439453, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.25, "epoch": 1.542251723257595, "gen_logits_max": 3.836362600326538, "gen_logits_mean": -16.086673736572266, "gen_logits_min": -27.866201400756836, "gen_logits_std": 3.1889331340789795, "gen_loss": 0.2857785224914551, "grad_norm": 0.35173462086698704, "learning_rate": 2.139157894736842e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9970031827688217, "mean_gen_accuracy": 0.8747769296169281, "mean_token_accuracy": 0.908701702952385, "num_tokens": 963458555.0, "sample_num_tokens": 8039.75, "step": 7551, "total_num_tokens": 963490714.0, "z_loss": 0.0005303332582116127 }, { "copy_logits_max": -4.503681182861328, "copy_logits_min": -687500032.0, "copy_num_tokens": 501.6875, "epoch": 1.542455961194792, "gen_logits_max": 2.940106153488159, "gen_logits_mean": -17.29458999633789, "gen_logits_min": -29.374135971069336, "gen_logits_std": 3.2337565422058105, "gen_loss": 0.28281667828559875, "grad_norm": 0.3676864532562331, "learning_rate": 2.1390315789473682e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9971181005239487, "mean_gen_accuracy": 0.8792794793844223, "mean_token_accuracy": 0.9096492677927017, "num_tokens": 963716352.0, "sample_num_tokens": 7963.0, "step": 7552, "total_num_tokens": 963748204.0, "z_loss": 0.0004845418152399361 }, { "copy_logits_max": -5.211752891540527, "copy_logits_min": -687500032.0, "copy_num_tokens": 555.5625, "epoch": 1.5426601991319888, "gen_logits_max": 2.8912672996520996, "gen_logits_mean": -17.79825210571289, "gen_logits_min": -29.802841186523438, "gen_logits_std": 3.2107620239257812, "gen_loss": 0.26716771721839905, "grad_norm": 0.36979848861480763, "learning_rate": 2.1389052631578947e-05, "loss": 0.2631, "mean_copy_accuracy": 0.996154174208641, "mean_gen_accuracy": 0.8808824568986893, "mean_token_accuracy": 0.910531148314476, "num_tokens": 963984726.0, "sample_num_tokens": 9032.5, "step": 7553, "total_num_tokens": 964020856.0, "z_loss": 0.00047494220780208707 }, { "copy_logits_max": -4.307735443115234, "copy_logits_min": -687500032.0, "copy_num_tokens": 771.1875, "epoch": 1.5428644370691855, "gen_logits_max": 3.520315408706665, "gen_logits_mean": -16.800825119018555, "gen_logits_min": -28.836261749267578, "gen_logits_std": 3.265019416809082, "gen_loss": 0.2644153833389282, "grad_norm": 0.3379815107426534, "learning_rate": 2.138778947368421e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9979260712862015, "mean_gen_accuracy": 0.8685740530490875, "mean_token_accuracy": 0.9055635631084442, "num_tokens": 964286955.0, "sample_num_tokens": 9897.75, "step": 7554, "total_num_tokens": 964326546.0, "z_loss": 0.0004634691867977381 }, { "copy_logits_max": -4.872534275054932, "copy_logits_min": -687500032.0, "copy_num_tokens": 767.6875, "epoch": 1.5430686750063825, "gen_logits_max": 3.063460111618042, "gen_logits_mean": -17.310930252075195, "gen_logits_min": -28.896310806274414, "gen_logits_std": 3.208024501800537, "gen_loss": 0.259507417678833, "grad_norm": 0.36400415684006443, "learning_rate": 2.1386526315789475e-05, "loss": 0.2654, "mean_copy_accuracy": 0.996700718998909, "mean_gen_accuracy": 0.8795306980609894, "mean_token_accuracy": 0.9110382199287415, "num_tokens": 964572946.0, "sample_num_tokens": 10116.0, "step": 7555, "total_num_tokens": 964613410.0, "z_loss": 0.0004468891420401633 }, { "copy_logits_max": -5.324223518371582, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.875, "epoch": 1.5432729129435794, "gen_logits_max": 2.6192097663879395, "gen_logits_mean": -18.721349716186523, "gen_logits_min": -30.347841262817383, "gen_logits_std": 3.2674412727355957, "gen_loss": 0.24989482760429382, "grad_norm": 0.37285947369374584, "learning_rate": 2.138526315789474e-05, "loss": 0.2751, "mean_copy_accuracy": 0.996467262506485, "mean_gen_accuracy": 0.8808767795562744, "mean_token_accuracy": 0.9060249179601669, "num_tokens": 964829571.0, "sample_num_tokens": 7665.25, "step": 7556, "total_num_tokens": 964860232.0, "z_loss": 0.0004136952920816839 }, { "copy_logits_max": -2.5585334300994873, "copy_logits_min": -750000064.0, "copy_num_tokens": 517.0, "epoch": 1.543477150880776, "gen_logits_max": 4.610889434814453, "gen_logits_mean": -15.477266311645508, "gen_logits_min": -27.185314178466797, "gen_logits_std": 3.164275884628296, "gen_loss": 0.29129406809806824, "grad_norm": 0.3889383544937148, "learning_rate": 2.1384e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9960202276706696, "mean_gen_accuracy": 0.8763077259063721, "mean_token_accuracy": 0.9060746133327484, "num_tokens": 965091961.0, "sample_num_tokens": 8599.75, "step": 7557, "total_num_tokens": 965126360.0, "z_loss": 0.000473237712867558 }, { "copy_logits_max": -5.980666160583496, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.9375, "epoch": 1.5436813888179728, "gen_logits_max": 3.7511744499206543, "gen_logits_mean": -17.589935302734375, "gen_logits_min": -29.139549255371094, "gen_logits_std": 3.196631669998169, "gen_loss": 0.29185137152671814, "grad_norm": 0.3494132505361292, "learning_rate": 2.1382736842105265e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9956841766834259, "mean_gen_accuracy": 0.8809670954942703, "mean_token_accuracy": 0.9034752994775772, "num_tokens": 965342676.0, "sample_num_tokens": 6297.5, "step": 7558, "total_num_tokens": 965367866.0, "z_loss": 0.00046054969425313175 }, { "copy_logits_max": -6.331010341644287, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.0, "epoch": 1.54388562675517, "gen_logits_max": 4.385142803192139, "gen_logits_mean": -14.555561065673828, "gen_logits_min": -25.896808624267578, "gen_logits_std": 2.998303174972534, "gen_loss": 0.3055420517921448, "grad_norm": 0.34392756184021867, "learning_rate": 2.1381473684210526e-05, "loss": 0.2757, "mean_copy_accuracy": 0.99777951836586, "mean_gen_accuracy": 0.875774547457695, "mean_token_accuracy": 0.9082721322774887, "num_tokens": 965629844.0, "sample_num_tokens": 9270.0, "step": 7559, "total_num_tokens": 965666924.0, "z_loss": 0.00047905914834700525 }, { "copy_logits_max": -6.8721208572387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.875, "epoch": 1.5440898646923666, "gen_logits_max": 4.167784214019775, "gen_logits_mean": -17.119335174560547, "gen_logits_min": -29.22513198852539, "gen_logits_std": 3.2483952045440674, "gen_loss": 0.29514819383621216, "grad_norm": 0.34029588859511545, "learning_rate": 2.138021052631579e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9962432235479355, "mean_gen_accuracy": 0.8830834776163101, "mean_token_accuracy": 0.9086614102125168, "num_tokens": 965920203.0, "sample_num_tokens": 7658.25, "step": 7560, "total_num_tokens": 965950836.0, "z_loss": 0.0004550334415398538 }, { "copy_logits_max": -4.5826520919799805, "copy_logits_min": -687500032.0, "copy_num_tokens": 431.5625, "epoch": 1.5442941026295633, "gen_logits_max": 4.550146102905273, "gen_logits_mean": -16.817018508911133, "gen_logits_min": -28.704160690307617, "gen_logits_std": 3.216484546661377, "gen_loss": 0.25458580255508423, "grad_norm": 0.434277591948529, "learning_rate": 2.137894736842105e-05, "loss": 0.298, "mean_copy_accuracy": 0.9965380430221558, "mean_gen_accuracy": 0.873441755771637, "mean_token_accuracy": 0.8977244943380356, "num_tokens": 966168533.0, "sample_num_tokens": 7859.75, "step": 7561, "total_num_tokens": 966199972.0, "z_loss": 0.0004270653589628637 }, { "copy_logits_max": -0.4734848737716675, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.1875, "epoch": 1.5444983405667603, "gen_logits_max": 5.296942710876465, "gen_logits_mean": -13.593683242797852, "gen_logits_min": -25.946666717529297, "gen_logits_std": 3.102480411529541, "gen_loss": 0.2862725257873535, "grad_norm": 0.35046256078813043, "learning_rate": 2.1377684210526315e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9969204962253571, "mean_gen_accuracy": 0.8814781308174133, "mean_token_accuracy": 0.9108870923519135, "num_tokens": 966438672.0, "sample_num_tokens": 7906.0, "step": 7562, "total_num_tokens": 966470296.0, "z_loss": 0.00047809001989662647 }, { "copy_logits_max": -2.369098663330078, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.0, "epoch": 1.5447025785039572, "gen_logits_max": 4.0081095695495605, "gen_logits_mean": -17.01927375793457, "gen_logits_min": -29.365509033203125, "gen_logits_std": 3.253204822540283, "gen_loss": 0.2596151828765869, "grad_norm": 0.3396628364215683, "learning_rate": 2.137642105263158e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9969922751188278, "mean_gen_accuracy": 0.8838490545749664, "mean_token_accuracy": 0.9136923402547836, "num_tokens": 966714710.0, "sample_num_tokens": 8532.0, "step": 7563, "total_num_tokens": 966748838.0, "z_loss": 0.0005035159410908818 }, { "copy_logits_max": -4.412206649780273, "copy_logits_min": -750000128.0, "copy_num_tokens": 357.5625, "epoch": 1.544906816441154, "gen_logits_max": 5.148599624633789, "gen_logits_mean": -15.900139808654785, "gen_logits_min": -27.735820770263672, "gen_logits_std": 3.2004964351654053, "gen_loss": 0.2847260534763336, "grad_norm": 0.36031908600059925, "learning_rate": 2.1375157894736844e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9956511408090591, "mean_gen_accuracy": 0.8803058713674545, "mean_token_accuracy": 0.9066714346408844, "num_tokens": 966974106.0, "sample_num_tokens": 7990.0, "step": 7564, "total_num_tokens": 967006066.0, "z_loss": 0.0005394504405558109 }, { "copy_logits_max": -3.4431943893432617, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.8125, "epoch": 1.5451110543783506, "gen_logits_max": 4.3329596519470215, "gen_logits_mean": -17.187828063964844, "gen_logits_min": -29.06266212463379, "gen_logits_std": 3.2639760971069336, "gen_loss": 0.29191046953201294, "grad_norm": 0.37156137382736176, "learning_rate": 2.1373894736842105e-05, "loss": 0.295, "mean_copy_accuracy": 0.9965142160654068, "mean_gen_accuracy": 0.8686336427927017, "mean_token_accuracy": 0.8982216864824295, "num_tokens": 967243447.0, "sample_num_tokens": 9303.75, "step": 7565, "total_num_tokens": 967280662.0, "z_loss": 0.0005603793542832136 }, { "copy_logits_max": -3.2051620483398438, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.0625, "epoch": 1.5453152923155478, "gen_logits_max": 4.608137130737305, "gen_logits_mean": -16.295150756835938, "gen_logits_min": -28.339969635009766, "gen_logits_std": 3.273946762084961, "gen_loss": 0.2678239941596985, "grad_norm": 0.38640963076514145, "learning_rate": 2.137263157894737e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9968953132629395, "mean_gen_accuracy": 0.8709263950586319, "mean_token_accuracy": 0.9053729027509689, "num_tokens": 967525559.0, "sample_num_tokens": 9459.75, "step": 7566, "total_num_tokens": 967563398.0, "z_loss": 0.0005038634408265352 }, { "copy_logits_max": -5.669712066650391, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.875, "epoch": 1.5455195302527445, "gen_logits_max": 5.921608924865723, "gen_logits_mean": -14.197439193725586, "gen_logits_min": -26.24305534362793, "gen_logits_std": 3.195613145828247, "gen_loss": 0.2933375835418701, "grad_norm": 0.39981929708947556, "learning_rate": 2.137136842105263e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9970693737268448, "mean_gen_accuracy": 0.877702921628952, "mean_token_accuracy": 0.9044578671455383, "num_tokens": 967779923.0, "sample_num_tokens": 7668.75, "step": 7567, "total_num_tokens": 967810598.0, "z_loss": 0.0005117414984852076 }, { "copy_logits_max": -5.857968330383301, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.5, "epoch": 1.5457237681899412, "gen_logits_max": 4.234138488769531, "gen_logits_mean": -17.034343719482422, "gen_logits_min": -29.097286224365234, "gen_logits_std": 3.271815299987793, "gen_loss": 0.2873077392578125, "grad_norm": 0.37712061711247713, "learning_rate": 2.1370105263157894e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9970489889383316, "mean_gen_accuracy": 0.8730046153068542, "mean_token_accuracy": 0.9010702669620514, "num_tokens": 968040745.0, "sample_num_tokens": 7207.75, "step": 7568, "total_num_tokens": 968069576.0, "z_loss": 0.0004671583301387727 }, { "copy_logits_max": -6.343470573425293, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.875, "epoch": 1.545928006127138, "gen_logits_max": 5.463173866271973, "gen_logits_mean": -15.135758399963379, "gen_logits_min": -27.269393920898438, "gen_logits_std": 3.2191596031188965, "gen_loss": 0.283622682094574, "grad_norm": 0.38043227882134045, "learning_rate": 2.136884210526316e-05, "loss": 0.285, "mean_copy_accuracy": 0.9961689114570618, "mean_gen_accuracy": 0.8765614330768585, "mean_token_accuracy": 0.9034752696752548, "num_tokens": 968329793.0, "sample_num_tokens": 7743.75, "step": 7569, "total_num_tokens": 968360768.0, "z_loss": 0.0004399027384351939 }, { "copy_logits_max": -7.332092761993408, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.1875, "epoch": 1.546132244064335, "gen_logits_max": 4.411228179931641, "gen_logits_mean": -16.07378387451172, "gen_logits_min": -28.171348571777344, "gen_logits_std": 3.2371397018432617, "gen_loss": 0.31031253933906555, "grad_norm": 0.356189401438971, "learning_rate": 2.136757894736842e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9970659017562866, "mean_gen_accuracy": 0.8796246200799942, "mean_token_accuracy": 0.9045058041810989, "num_tokens": 968596647.0, "sample_num_tokens": 6706.75, "step": 7570, "total_num_tokens": 968623474.0, "z_loss": 0.00044366600923240185 }, { "copy_logits_max": -7.264150619506836, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.6875, "epoch": 1.5463364820015317, "gen_logits_max": 3.6080873012542725, "gen_logits_mean": -17.75354766845703, "gen_logits_min": -29.483112335205078, "gen_logits_std": 3.295161724090576, "gen_loss": 0.26442259550094604, "grad_norm": 0.3746054369563133, "learning_rate": 2.1366315789473687e-05, "loss": 0.2644, "mean_copy_accuracy": 0.997901514172554, "mean_gen_accuracy": 0.8806540966033936, "mean_token_accuracy": 0.9088229835033417, "num_tokens": 968858008.0, "sample_num_tokens": 8066.0, "step": 7571, "total_num_tokens": 968890272.0, "z_loss": 0.0004105233238078654 }, { "copy_logits_max": -5.577722549438477, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.25, "epoch": 1.5465407199387287, "gen_logits_max": 2.9664225578308105, "gen_logits_mean": -16.79909896850586, "gen_logits_min": -29.149444580078125, "gen_logits_std": 3.2927706241607666, "gen_loss": 0.26334428787231445, "grad_norm": 0.35900215807381514, "learning_rate": 2.136505263157895e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9973093122243881, "mean_gen_accuracy": 0.8724205940961838, "mean_token_accuracy": 0.9075264483690262, "num_tokens": 969148078.0, "sample_num_tokens": 8735.0, "step": 7572, "total_num_tokens": 969183018.0, "z_loss": 0.00041339168092235923 }, { "copy_logits_max": -6.50040340423584, "copy_logits_min": -750000064.0, "copy_num_tokens": 587.4375, "epoch": 1.5467449578759256, "gen_logits_max": 5.047589302062988, "gen_logits_mean": -14.421570777893066, "gen_logits_min": -26.696514129638672, "gen_logits_std": 3.225715160369873, "gen_loss": 0.23209959268569946, "grad_norm": 0.40252701585775547, "learning_rate": 2.1363789473684213e-05, "loss": 0.2742, "mean_copy_accuracy": 0.997100368142128, "mean_gen_accuracy": 0.8736657053232193, "mean_token_accuracy": 0.9053017050027847, "num_tokens": 969412993.0, "sample_num_tokens": 9142.75, "step": 7573, "total_num_tokens": 969449564.0, "z_loss": 0.0003659412614069879 }, { "copy_logits_max": -4.524564743041992, "copy_logits_min": -750000064.0, "copy_num_tokens": 409.125, "epoch": 1.5469491958131223, "gen_logits_max": 4.290340423583984, "gen_logits_mean": -16.236934661865234, "gen_logits_min": -28.36461639404297, "gen_logits_std": 3.2462759017944336, "gen_loss": 0.29952433705329895, "grad_norm": 0.35066284264896674, "learning_rate": 2.1362526315789474e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9962522387504578, "mean_gen_accuracy": 0.8741852343082428, "mean_token_accuracy": 0.9032252579927444, "num_tokens": 969682986.0, "sample_num_tokens": 7223.5, "step": 7574, "total_num_tokens": 969711880.0, "z_loss": 0.0004946257104165852 }, { "copy_logits_max": -6.629241466522217, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.6875, "epoch": 1.547153433750319, "gen_logits_max": 4.450104713439941, "gen_logits_mean": -16.2540283203125, "gen_logits_min": -28.299280166625977, "gen_logits_std": 3.250558376312256, "gen_loss": 0.27092379331588745, "grad_norm": 0.37590289906951063, "learning_rate": 2.1361263157894738e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9969255328178406, "mean_gen_accuracy": 0.8756019324064255, "mean_token_accuracy": 0.9046141356229782, "num_tokens": 969954837.0, "sample_num_tokens": 8844.25, "step": 7575, "total_num_tokens": 969990214.0, "z_loss": 0.0004968831781297922 }, { "copy_logits_max": -5.939687252044678, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.1875, "epoch": 1.547357671687516, "gen_logits_max": 5.109044075012207, "gen_logits_mean": -14.352470397949219, "gen_logits_min": -26.566356658935547, "gen_logits_std": 3.1789472103118896, "gen_loss": 0.2973339557647705, "grad_norm": 0.35176565415683625, "learning_rate": 2.136e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9974982142448425, "mean_gen_accuracy": 0.8777026981115341, "mean_token_accuracy": 0.9048255831003189, "num_tokens": 970227539.0, "sample_num_tokens": 9477.75, "step": 7576, "total_num_tokens": 970265450.0, "z_loss": 0.0005185824120417237 }, { "copy_logits_max": -6.022110462188721, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.75, "epoch": 1.5475619096247129, "gen_logits_max": 4.77599573135376, "gen_logits_mean": -14.875530242919922, "gen_logits_min": -27.04582977294922, "gen_logits_std": 3.1688132286071777, "gen_loss": 0.2697300314903259, "grad_norm": 0.34260128506520343, "learning_rate": 2.1358736842105263e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9973973631858826, "mean_gen_accuracy": 0.8746501356363297, "mean_token_accuracy": 0.9046431332826614, "num_tokens": 970496950.0, "sample_num_tokens": 10790.0, "step": 7577, "total_num_tokens": 970540110.0, "z_loss": 0.0004598513769451529 }, { "copy_logits_max": -5.547497749328613, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.875, "epoch": 1.5477661475619096, "gen_logits_max": 4.804636001586914, "gen_logits_mean": -15.458581924438477, "gen_logits_min": -27.61681365966797, "gen_logits_std": 3.2300000190734863, "gen_loss": 0.28883224725723267, "grad_norm": 0.37426148564959794, "learning_rate": 2.1357473684210524e-05, "loss": 0.294, "mean_copy_accuracy": 0.9971256256103516, "mean_gen_accuracy": 0.868810385465622, "mean_token_accuracy": 0.9028944820165634, "num_tokens": 970777035.0, "sample_num_tokens": 7110.25, "step": 7578, "total_num_tokens": 970805476.0, "z_loss": 0.000519806519150734 }, { "copy_logits_max": -7.772953987121582, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.9375, "epoch": 1.5479703854991065, "gen_logits_max": 4.6977691650390625, "gen_logits_mean": -15.99183464050293, "gen_logits_min": -27.923168182373047, "gen_logits_std": 3.178480625152588, "gen_loss": 0.3018166422843933, "grad_norm": 0.35659624090850295, "learning_rate": 2.1356210526315792e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9968619048595428, "mean_gen_accuracy": 0.8784385025501251, "mean_token_accuracy": 0.9078683704137802, "num_tokens": 971029938.0, "sample_num_tokens": 8156.0, "step": 7579, "total_num_tokens": 971062562.0, "z_loss": 0.0004862446803599596 }, { "copy_logits_max": -6.121096611022949, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.1875, "epoch": 1.5481746234363034, "gen_logits_max": 3.0264639854431152, "gen_logits_mean": -17.524311065673828, "gen_logits_min": -29.593677520751953, "gen_logits_std": 3.2701056003570557, "gen_loss": 0.2903291583061218, "grad_norm": 0.3266597978565175, "learning_rate": 2.1354947368421053e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9974056929349899, "mean_gen_accuracy": 0.8791752904653549, "mean_token_accuracy": 0.9096016585826874, "num_tokens": 971301090.0, "sample_num_tokens": 8596.0, "step": 7580, "total_num_tokens": 971335474.0, "z_loss": 0.0004816895234398544 }, { "copy_logits_max": -6.4438018798828125, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.625, "epoch": 1.5483788613735001, "gen_logits_max": 2.9654929637908936, "gen_logits_mean": -17.771286010742188, "gen_logits_min": -29.792552947998047, "gen_logits_std": 3.293245792388916, "gen_loss": 0.28395986557006836, "grad_norm": 0.3671810489876067, "learning_rate": 2.1353684210526317e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9969320297241211, "mean_gen_accuracy": 0.8733225166797638, "mean_token_accuracy": 0.9028357118368149, "num_tokens": 971589293.0, "sample_num_tokens": 9974.25, "step": 7581, "total_num_tokens": 971629190.0, "z_loss": 0.0004728935891762376 }, { "copy_logits_max": -4.934143543243408, "copy_logits_min": -687500032.0, "copy_num_tokens": 596.0625, "epoch": 1.5485830993106968, "gen_logits_max": 3.280411720275879, "gen_logits_mean": -17.350379943847656, "gen_logits_min": -29.704116821289062, "gen_logits_std": 3.2840332984924316, "gen_loss": 0.2820695638656616, "grad_norm": 0.38752273656014785, "learning_rate": 2.135242105263158e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9965804666280746, "mean_gen_accuracy": 0.8767587840557098, "mean_token_accuracy": 0.9040263295173645, "num_tokens": 971849367.0, "sample_num_tokens": 9476.75, "step": 7582, "total_num_tokens": 971887274.0, "z_loss": 0.0004715390386991203 }, { "copy_logits_max": -4.700616359710693, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.6875, "epoch": 1.5487873372478937, "gen_logits_max": 4.776259422302246, "gen_logits_mean": -15.489797592163086, "gen_logits_min": -27.621742248535156, "gen_logits_std": 3.2262001037597656, "gen_loss": 0.30960536003112793, "grad_norm": 0.36128372693345695, "learning_rate": 2.1351157894736842e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9971158355474472, "mean_gen_accuracy": 0.8765774816274643, "mean_token_accuracy": 0.9070822149515152, "num_tokens": 972125705.0, "sample_num_tokens": 7785.75, "step": 7583, "total_num_tokens": 972156848.0, "z_loss": 0.0005529705667868257 }, { "copy_logits_max": -6.003305912017822, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.3125, "epoch": 1.5489915751850907, "gen_logits_max": 5.19252872467041, "gen_logits_mean": -15.902185440063477, "gen_logits_min": -28.594818115234375, "gen_logits_std": 3.2581491470336914, "gen_loss": 0.2580138146877289, "grad_norm": 0.3395145453128162, "learning_rate": 2.1349894736842106e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9961981177330017, "mean_gen_accuracy": 0.8806590288877487, "mean_token_accuracy": 0.9106072634458542, "num_tokens": 972399489.0, "sample_num_tokens": 7773.25, "step": 7584, "total_num_tokens": 972430582.0, "z_loss": 0.000528708565980196 }, { "copy_logits_max": -4.316747665405273, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.375, "epoch": 1.5491958131222874, "gen_logits_max": 3.976130247116089, "gen_logits_mean": -16.74826431274414, "gen_logits_min": -29.439340591430664, "gen_logits_std": 3.2697339057922363, "gen_loss": 0.2640552520751953, "grad_norm": 0.36530748027489246, "learning_rate": 2.1348631578947367e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9962203651666641, "mean_gen_accuracy": 0.8724058866500854, "mean_token_accuracy": 0.9024662971496582, "num_tokens": 972666120.0, "sample_num_tokens": 8466.0, "step": 7585, "total_num_tokens": 972699984.0, "z_loss": 0.0005188712384551764 }, { "copy_logits_max": -7.8158674240112305, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.6875, "epoch": 1.5494000510594843, "gen_logits_max": 3.5625152587890625, "gen_logits_mean": -17.57870864868164, "gen_logits_min": -29.48883056640625, "gen_logits_std": 3.2369110584259033, "gen_loss": 0.29246026277542114, "grad_norm": 0.3787673699468876, "learning_rate": 2.1347368421052632e-05, "loss": 0.2728, "mean_copy_accuracy": 0.9966227561235428, "mean_gen_accuracy": 0.8776975572109222, "mean_token_accuracy": 0.907756432890892, "num_tokens": 972921907.0, "sample_num_tokens": 7273.75, "step": 7586, "total_num_tokens": 972951002.0, "z_loss": 0.00048074068035930395 }, { "copy_logits_max": -6.685626983642578, "copy_logits_min": -687500032.0, "copy_num_tokens": 294.125, "epoch": 1.5496042889966812, "gen_logits_max": 5.563885688781738, "gen_logits_mean": -15.729185104370117, "gen_logits_min": -27.99778175354004, "gen_logits_std": 3.204906463623047, "gen_loss": 0.3514496386051178, "grad_norm": 0.38949278781365826, "learning_rate": 2.1346105263157896e-05, "loss": 0.3022, "mean_copy_accuracy": 0.9957613497972488, "mean_gen_accuracy": 0.8712966591119766, "mean_token_accuracy": 0.8984916657209396, "num_tokens": 973162110.0, "sample_num_tokens": 7778.5, "step": 7587, "total_num_tokens": 973193224.0, "z_loss": 0.0006276080384850502 }, { "copy_logits_max": -6.695744514465332, "copy_logits_min": -750000000.0, "copy_num_tokens": 204.1875, "epoch": 1.549808526933878, "gen_logits_max": 5.253637313842773, "gen_logits_mean": -16.42237091064453, "gen_logits_min": -28.335552215576172, "gen_logits_std": 3.2260842323303223, "gen_loss": 0.2890722453594208, "grad_norm": 0.3983420703645138, "learning_rate": 2.134484210526316e-05, "loss": 0.2966, "mean_copy_accuracy": 0.9955049008131027, "mean_gen_accuracy": 0.8729973286390305, "mean_token_accuracy": 0.8990582078695297, "num_tokens": 973417008.0, "sample_num_tokens": 6084.0, "step": 7588, "total_num_tokens": 973441344.0, "z_loss": 0.0005016422364860773 }, { "copy_logits_max": -5.466126918792725, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.375, "epoch": 1.5500127648710746, "gen_logits_max": 5.162075042724609, "gen_logits_mean": -14.744571685791016, "gen_logits_min": -27.100690841674805, "gen_logits_std": 3.153489589691162, "gen_loss": 0.31528377532958984, "grad_norm": 0.375897543642786, "learning_rate": 2.134357894736842e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9962153285741806, "mean_gen_accuracy": 0.8766499757766724, "mean_token_accuracy": 0.9026793390512466, "num_tokens": 973660603.0, "sample_num_tokens": 7998.25, "step": 7589, "total_num_tokens": 973692596.0, "z_loss": 0.000631962378975004 }, { "copy_logits_max": -3.670912265777588, "copy_logits_min": -750000064.0, "copy_num_tokens": 683.9375, "epoch": 1.5502170028082718, "gen_logits_max": 4.441562652587891, "gen_logits_mean": -14.84825611114502, "gen_logits_min": -27.376436233520508, "gen_logits_std": 3.241164445877075, "gen_loss": 0.2358170747756958, "grad_norm": 0.35122680771224457, "learning_rate": 2.1342315789473686e-05, "loss": 0.2545, "mean_copy_accuracy": 0.9976025074720383, "mean_gen_accuracy": 0.8822146207094193, "mean_token_accuracy": 0.9123163819313049, "num_tokens": 973941373.0, "sample_num_tokens": 9699.25, "step": 7590, "total_num_tokens": 973980170.0, "z_loss": 0.000405320490244776 }, { "copy_logits_max": -5.916039943695068, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.75, "epoch": 1.5504212407454685, "gen_logits_max": 3.74741530418396, "gen_logits_mean": -17.44944953918457, "gen_logits_min": -29.502286911010742, "gen_logits_std": 3.2477457523345947, "gen_loss": 0.25925618410110474, "grad_norm": 0.34893065765413944, "learning_rate": 2.1341052631578947e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9965144544839859, "mean_gen_accuracy": 0.8761976808309555, "mean_token_accuracy": 0.9046052396297455, "num_tokens": 974216597.0, "sample_num_tokens": 7473.75, "step": 7591, "total_num_tokens": 974246492.0, "z_loss": 0.00043172723962925375 }, { "copy_logits_max": -6.905041694641113, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.6875, "epoch": 1.5506254786826652, "gen_logits_max": 4.2753496170043945, "gen_logits_mean": -15.704353332519531, "gen_logits_min": -27.493209838867188, "gen_logits_std": 3.1371889114379883, "gen_loss": 0.28104162216186523, "grad_norm": 0.3601592009624702, "learning_rate": 2.133978947368421e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9961476176977158, "mean_gen_accuracy": 0.8767914324998856, "mean_token_accuracy": 0.9025528132915497, "num_tokens": 974474480.0, "sample_num_tokens": 7605.0, "step": 7592, "total_num_tokens": 974504900.0, "z_loss": 0.0004625894653145224 }, { "copy_logits_max": -3.9986443519592285, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.375, "epoch": 1.5508297166198621, "gen_logits_max": 3.872468948364258, "gen_logits_mean": -15.582892417907715, "gen_logits_min": -27.878856658935547, "gen_logits_std": 3.1447079181671143, "gen_loss": 0.307251513004303, "grad_norm": 0.35898553772018543, "learning_rate": 2.1338526315789472e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9963133186101913, "mean_gen_accuracy": 0.8707746714353561, "mean_token_accuracy": 0.9020044803619385, "num_tokens": 974734002.0, "sample_num_tokens": 8570.5, "step": 7593, "total_num_tokens": 974768284.0, "z_loss": 0.0005293801077641547 }, { "copy_logits_max": -4.836642265319824, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.0, "epoch": 1.551033954557059, "gen_logits_max": 2.880870819091797, "gen_logits_mean": -17.11656379699707, "gen_logits_min": -29.464534759521484, "gen_logits_std": 3.192986488342285, "gen_loss": 0.2599145472049713, "grad_norm": 0.34963290276887005, "learning_rate": 2.1337263157894736e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9965762495994568, "mean_gen_accuracy": 0.8859039694070816, "mean_token_accuracy": 0.9114228338003159, "num_tokens": 974998769.0, "sample_num_tokens": 9485.25, "step": 7594, "total_num_tokens": 975036710.0, "z_loss": 0.000426850572694093 }, { "copy_logits_max": -6.206566333770752, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.4375, "epoch": 1.5512381924942558, "gen_logits_max": 2.9460158348083496, "gen_logits_mean": -17.37081527709961, "gen_logits_min": -29.845483779907227, "gen_logits_std": 3.2545900344848633, "gen_loss": 0.26812198758125305, "grad_norm": 0.3834930978808976, "learning_rate": 2.1336000000000004e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9967291802167892, "mean_gen_accuracy": 0.8754958063364029, "mean_token_accuracy": 0.9064264446496964, "num_tokens": 975278171.0, "sample_num_tokens": 8880.75, "step": 7595, "total_num_tokens": 975313694.0, "z_loss": 0.0004384356434457004 }, { "copy_logits_max": -5.527090072631836, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.25, "epoch": 1.5514424304314527, "gen_logits_max": 3.0321011543273926, "gen_logits_mean": -16.86187744140625, "gen_logits_min": -29.21600914001465, "gen_logits_std": 3.2379045486450195, "gen_loss": 0.29594868421554565, "grad_norm": 0.3698591245977386, "learning_rate": 2.1334736842105265e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9973079860210419, "mean_gen_accuracy": 0.8772912323474884, "mean_token_accuracy": 0.9072757512331009, "num_tokens": 975555814.0, "sample_num_tokens": 8073.0, "step": 7596, "total_num_tokens": 975588106.0, "z_loss": 0.0004478679911699146 }, { "copy_logits_max": -5.549639701843262, "copy_logits_min": -750000064.0, "copy_num_tokens": 541.3125, "epoch": 1.5516466683686496, "gen_logits_max": 3.9697012901306152, "gen_logits_mean": -15.847578048706055, "gen_logits_min": -28.129308700561523, "gen_logits_std": 3.2331273555755615, "gen_loss": 0.27083784341812134, "grad_norm": 0.35416387208520606, "learning_rate": 2.133347368421053e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9969208240509033, "mean_gen_accuracy": 0.8770391196012497, "mean_token_accuracy": 0.9064145535230637, "num_tokens": 975842162.0, "sample_num_tokens": 8944.0, "step": 7597, "total_num_tokens": 975877938.0, "z_loss": 0.0004283810849301517 }, { "copy_logits_max": -6.5361456871032715, "copy_logits_min": -750000064.0, "copy_num_tokens": 334.0, "epoch": 1.5518509063058463, "gen_logits_max": 4.169162750244141, "gen_logits_mean": -16.13494873046875, "gen_logits_min": -28.56803321838379, "gen_logits_std": 3.2032599449157715, "gen_loss": 0.30449771881103516, "grad_norm": 0.4148457510104677, "learning_rate": 2.133221052631579e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9961946904659271, "mean_gen_accuracy": 0.8782548010349274, "mean_token_accuracy": 0.9068493247032166, "num_tokens": 976123383.0, "sample_num_tokens": 7398.25, "step": 7598, "total_num_tokens": 976152976.0, "z_loss": 0.00048703968059271574 }, { "copy_logits_max": -7.133486747741699, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.875, "epoch": 1.552055144243043, "gen_logits_max": 3.5854082107543945, "gen_logits_mean": -17.596168518066406, "gen_logits_min": -29.73365592956543, "gen_logits_std": 3.2788071632385254, "gen_loss": 0.28221434354782104, "grad_norm": 0.3567286417934367, "learning_rate": 2.1330947368421054e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9968707710504532, "mean_gen_accuracy": 0.8745050132274628, "mean_token_accuracy": 0.9045048654079437, "num_tokens": 976400628.0, "sample_num_tokens": 9309.0, "step": 7599, "total_num_tokens": 976437864.0, "z_loss": 0.0004632752970792353 }, { "copy_logits_max": -4.60806941986084, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.625, "epoch": 1.55225938218024, "gen_logits_max": 4.479496955871582, "gen_logits_mean": -15.980989456176758, "gen_logits_min": -28.671283721923828, "gen_logits_std": 3.2644524574279785, "gen_loss": 0.29011350870132446, "grad_norm": 0.35658569793179745, "learning_rate": 2.1329684210526315e-05, "loss": 0.279, "mean_copy_accuracy": 0.9977315813302994, "mean_gen_accuracy": 0.8719374239444733, "mean_token_accuracy": 0.9040925204753876, "num_tokens": 976675347.0, "sample_num_tokens": 8555.25, "step": 7600, "total_num_tokens": 976709568.0, "z_loss": 0.0005361040821298957 }, { "copy_logits_max": -7.494734764099121, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.1875, "epoch": 1.5524636201174369, "gen_logits_max": 4.288393020629883, "gen_logits_mean": -16.429515838623047, "gen_logits_min": -28.111352920532227, "gen_logits_std": 3.2191238403320312, "gen_loss": 0.29003944993019104, "grad_norm": 0.3918699526544444, "learning_rate": 2.132842105263158e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9969918876886368, "mean_gen_accuracy": 0.8759761899709702, "mean_token_accuracy": 0.9048728942871094, "num_tokens": 976959538.0, "sample_num_tokens": 9398.5, "step": 7601, "total_num_tokens": 976997132.0, "z_loss": 0.00046488794032484293 }, { "copy_logits_max": -5.889216423034668, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.1875, "epoch": 1.5526678580546336, "gen_logits_max": 5.13206672668457, "gen_logits_mean": -13.532914161682129, "gen_logits_min": -26.063705444335938, "gen_logits_std": 3.189889907836914, "gen_loss": 0.24552875757217407, "grad_norm": 0.3623404928465906, "learning_rate": 2.132715789473684e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9954864382743835, "mean_gen_accuracy": 0.8710407763719559, "mean_token_accuracy": 0.9014129787683487, "num_tokens": 977231031.0, "sample_num_tokens": 8601.25, "step": 7602, "total_num_tokens": 977265436.0, "z_loss": 0.0004078864003531635 }, { "copy_logits_max": -5.929537773132324, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.3125, "epoch": 1.5528720959918305, "gen_logits_max": 2.9027371406555176, "gen_logits_mean": -17.281795501708984, "gen_logits_min": -29.70417022705078, "gen_logits_std": 3.2984020709991455, "gen_loss": 0.2539907693862915, "grad_norm": 0.37665924807082074, "learning_rate": 2.1325894736842108e-05, "loss": 0.2555, "mean_copy_accuracy": 0.9970339834690094, "mean_gen_accuracy": 0.8821306079626083, "mean_token_accuracy": 0.9149978905916214, "num_tokens": 977526635.0, "sample_num_tokens": 9520.75, "step": 7603, "total_num_tokens": 977564718.0, "z_loss": 0.00042481982382014394 }, { "copy_logits_max": -4.978457927703857, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.875, "epoch": 1.5530763339290274, "gen_logits_max": 3.893113851547241, "gen_logits_mean": -17.60427474975586, "gen_logits_min": -29.487857818603516, "gen_logits_std": 3.2695000171661377, "gen_loss": 0.3210946321487427, "grad_norm": 0.3517897035634996, "learning_rate": 2.132463157894737e-05, "loss": 0.275, "mean_copy_accuracy": 0.9968288391828537, "mean_gen_accuracy": 0.8757859766483307, "mean_token_accuracy": 0.9093500971794128, "num_tokens": 977829331.0, "sample_num_tokens": 9311.75, "step": 7604, "total_num_tokens": 977866578.0, "z_loss": 0.0005949011538177729 }, { "copy_logits_max": -4.083677291870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.125, "epoch": 1.5532805718662241, "gen_logits_max": 4.193950653076172, "gen_logits_mean": -16.15318489074707, "gen_logits_min": -28.064697265625, "gen_logits_std": 3.2446253299713135, "gen_loss": 0.28668731451034546, "grad_norm": 0.3358207704138758, "learning_rate": 2.1323368421052633e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9971267729997635, "mean_gen_accuracy": 0.8684911280870438, "mean_token_accuracy": 0.9057255238294601, "num_tokens": 978138798.0, "sample_num_tokens": 6907.0, "step": 7605, "total_num_tokens": 978166426.0, "z_loss": 0.0005111639038659632 }, { "copy_logits_max": -4.511641979217529, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.75, "epoch": 1.5534848098034209, "gen_logits_max": 5.032247543334961, "gen_logits_mean": -15.125709533691406, "gen_logits_min": -27.493743896484375, "gen_logits_std": 3.2316741943359375, "gen_loss": 0.30058780312538147, "grad_norm": 0.36687709433132615, "learning_rate": 2.1322105263157894e-05, "loss": 0.271, "mean_copy_accuracy": 0.9967855513095856, "mean_gen_accuracy": 0.8775110393762589, "mean_token_accuracy": 0.9067167490720749, "num_tokens": 978382889.0, "sample_num_tokens": 7508.75, "step": 7606, "total_num_tokens": 978412924.0, "z_loss": 0.0006008332129567862 }, { "copy_logits_max": -4.830169677734375, "copy_logits_min": -687500032.0, "copy_num_tokens": 690.625, "epoch": 1.5536890477406178, "gen_logits_max": 3.994481086730957, "gen_logits_mean": -16.244529724121094, "gen_logits_min": -28.768108367919922, "gen_logits_std": 3.251662015914917, "gen_loss": 0.2728818655014038, "grad_norm": 0.3662688540431317, "learning_rate": 2.132084210526316e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9976700246334076, "mean_gen_accuracy": 0.8720511049032211, "mean_token_accuracy": 0.9064967036247253, "num_tokens": 978665545.0, "sample_num_tokens": 10535.25, "step": 7607, "total_num_tokens": 978707686.0, "z_loss": 0.0005045722355134785 }, { "copy_logits_max": -3.9280495643615723, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.0625, "epoch": 1.5538932856778147, "gen_logits_max": 4.396587371826172, "gen_logits_mean": -15.870342254638672, "gen_logits_min": -28.07240104675293, "gen_logits_std": 3.251192569732666, "gen_loss": 0.3227055072784424, "grad_norm": 0.35507682302229787, "learning_rate": 2.131957894736842e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9959052205085754, "mean_gen_accuracy": 0.8710481226444244, "mean_token_accuracy": 0.9040102958679199, "num_tokens": 978937377.0, "sample_num_tokens": 8425.75, "step": 7608, "total_num_tokens": 978971080.0, "z_loss": 0.0005924094002693892 }, { "copy_logits_max": -5.398138046264648, "copy_logits_min": -750000000.0, "copy_num_tokens": 624.875, "epoch": 1.5540975236150114, "gen_logits_max": 3.7115936279296875, "gen_logits_mean": -16.027690887451172, "gen_logits_min": -28.484657287597656, "gen_logits_std": 3.2633025646209717, "gen_loss": 0.25895681977272034, "grad_norm": 0.38531365090402653, "learning_rate": 2.1318315789473684e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9961149394512177, "mean_gen_accuracy": 0.8761603534221649, "mean_token_accuracy": 0.9063092917203903, "num_tokens": 979206745.0, "sample_num_tokens": 8952.25, "step": 7609, "total_num_tokens": 979242554.0, "z_loss": 0.00046265427954494953 }, { "copy_logits_max": -3.189549446105957, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.1875, "epoch": 1.5543017615522083, "gen_logits_max": 4.331254005432129, "gen_logits_mean": -15.878362655639648, "gen_logits_min": -28.237993240356445, "gen_logits_std": 3.2691242694854736, "gen_loss": 0.25573429465293884, "grad_norm": 0.3257908648871594, "learning_rate": 2.1317052631578948e-05, "loss": 0.2459, "mean_copy_accuracy": 0.9973194748163223, "mean_gen_accuracy": 0.8896895945072174, "mean_token_accuracy": 0.916249081492424, "num_tokens": 979494643.0, "sample_num_tokens": 7645.25, "step": 7610, "total_num_tokens": 979525224.0, "z_loss": 0.0005112806102260947 }, { "copy_logits_max": -4.7335405349731445, "copy_logits_min": -687500096.0, "copy_num_tokens": 413.125, "epoch": 1.5545059994894053, "gen_logits_max": 5.3559722900390625, "gen_logits_mean": -14.988101959228516, "gen_logits_min": -27.638839721679688, "gen_logits_std": 3.2209348678588867, "gen_loss": 0.28393083810806274, "grad_norm": 0.37636533413781803, "learning_rate": 2.1315789473684212e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9964818805456161, "mean_gen_accuracy": 0.8786527216434479, "mean_token_accuracy": 0.9044282287359238, "num_tokens": 979750354.0, "sample_num_tokens": 8565.0, "step": 7611, "total_num_tokens": 979784614.0, "z_loss": 0.0004859698237851262 }, { "copy_logits_max": -5.818299293518066, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.9375, "epoch": 1.554710237426602, "gen_logits_max": 4.205018520355225, "gen_logits_mean": -16.736392974853516, "gen_logits_min": -29.018993377685547, "gen_logits_std": 3.268444538116455, "gen_loss": 0.2933707535266876, "grad_norm": 0.3394108906841778, "learning_rate": 2.1314526315789477e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9969068318605423, "mean_gen_accuracy": 0.8802094012498856, "mean_token_accuracy": 0.9081057608127594, "num_tokens": 980010411.0, "sample_num_tokens": 8701.75, "step": 7612, "total_num_tokens": 980045218.0, "z_loss": 0.00048213277477771044 }, { "copy_logits_max": -6.812932014465332, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.25, "epoch": 1.5549144753637987, "gen_logits_max": 4.944987773895264, "gen_logits_mean": -14.510798454284668, "gen_logits_min": -26.91895866394043, "gen_logits_std": 3.2280805110931396, "gen_loss": 0.2630222737789154, "grad_norm": 0.3581112155995661, "learning_rate": 2.1313263157894738e-05, "loss": 0.2597, "mean_copy_accuracy": 0.9974992573261261, "mean_gen_accuracy": 0.8799233287572861, "mean_token_accuracy": 0.9124673157930374, "num_tokens": 980302258.0, "sample_num_tokens": 8985.5, "step": 7613, "total_num_tokens": 980338200.0, "z_loss": 0.000444627134129405 }, { "copy_logits_max": -3.242798328399658, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.3125, "epoch": 1.5551187133009958, "gen_logits_max": 4.179712772369385, "gen_logits_mean": -15.340251922607422, "gen_logits_min": -27.99066925048828, "gen_logits_std": 3.2389564514160156, "gen_loss": 0.29097092151641846, "grad_norm": 0.32569837778480687, "learning_rate": 2.1312000000000002e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9970188140869141, "mean_gen_accuracy": 0.8771075755357742, "mean_token_accuracy": 0.9101391136646271, "num_tokens": 980604226.0, "sample_num_tokens": 8515.0, "step": 7614, "total_num_tokens": 980638286.0, "z_loss": 0.0004316478152759373 }, { "copy_logits_max": -7.412954330444336, "copy_logits_min": -750000000.0, "copy_num_tokens": 175.0625, "epoch": 1.5553229512381925, "gen_logits_max": 5.319344520568848, "gen_logits_mean": -16.02008056640625, "gen_logits_min": -27.78988265991211, "gen_logits_std": 3.223665237426758, "gen_loss": 0.3499266803264618, "grad_norm": 0.32373717605621505, "learning_rate": 2.1310736842105263e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9949508756399155, "mean_gen_accuracy": 0.8740153461694717, "mean_token_accuracy": 0.899521678686142, "num_tokens": 980869231.0, "sample_num_tokens": 6089.25, "step": 7615, "total_num_tokens": 980893588.0, "z_loss": 0.000545675924513489 }, { "copy_logits_max": -6.015881538391113, "copy_logits_min": -750000064.0, "copy_num_tokens": 720.75, "epoch": 1.5555271891753892, "gen_logits_max": 3.5287039279937744, "gen_logits_mean": -16.52480697631836, "gen_logits_min": -28.865659713745117, "gen_logits_std": 3.2796387672424316, "gen_loss": 0.2826569676399231, "grad_norm": 0.3611715711095239, "learning_rate": 2.1309473684210527e-05, "loss": 0.2524, "mean_copy_accuracy": 0.9974645227193832, "mean_gen_accuracy": 0.8844467401504517, "mean_token_accuracy": 0.9140107482671738, "num_tokens": 981140834.0, "sample_num_tokens": 10625.5, "step": 7616, "total_num_tokens": 981183336.0, "z_loss": 0.0004519151116255671 }, { "copy_logits_max": -6.732358932495117, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.375, "epoch": 1.5557314271125862, "gen_logits_max": 4.221535682678223, "gen_logits_mean": -16.962242126464844, "gen_logits_min": -29.001602172851562, "gen_logits_std": 3.286252498626709, "gen_loss": 0.2547011971473694, "grad_norm": 0.34956258623971526, "learning_rate": 2.1308210526315788e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9965654462575912, "mean_gen_accuracy": 0.8782928287982941, "mean_token_accuracy": 0.9075440615415573, "num_tokens": 981419434.0, "sample_num_tokens": 8041.5, "step": 7617, "total_num_tokens": 981451600.0, "z_loss": 0.00043511990224942565 }, { "copy_logits_max": -7.075020790100098, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.5, "epoch": 1.555935665049783, "gen_logits_max": 2.8635687828063965, "gen_logits_mean": -17.497940063476562, "gen_logits_min": -29.573551177978516, "gen_logits_std": 3.31632137298584, "gen_loss": 0.2826054096221924, "grad_norm": 0.36358439299442036, "learning_rate": 2.1306947368421052e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9970053136348724, "mean_gen_accuracy": 0.8742114752531052, "mean_token_accuracy": 0.9077916890382767, "num_tokens": 981710168.0, "sample_num_tokens": 8855.5, "step": 7618, "total_num_tokens": 981745590.0, "z_loss": 0.0004480992502067238 }, { "copy_logits_max": -8.358509063720703, "copy_logits_min": -750000000.0, "copy_num_tokens": 237.6875, "epoch": 1.5561399029869798, "gen_logits_max": 4.410597801208496, "gen_logits_mean": -17.030902862548828, "gen_logits_min": -28.565448760986328, "gen_logits_std": 3.250366687774658, "gen_loss": 0.2794576585292816, "grad_norm": 0.3404634555449993, "learning_rate": 2.1305684210526313e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9968791157007217, "mean_gen_accuracy": 0.883770614862442, "mean_token_accuracy": 0.9085604697465897, "num_tokens": 981969799.0, "sample_num_tokens": 6643.75, "step": 7619, "total_num_tokens": 981996374.0, "z_loss": 0.00040279654785990715 }, { "copy_logits_max": -8.342046737670898, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.9375, "epoch": 1.5563441409241765, "gen_logits_max": 3.4648780822753906, "gen_logits_mean": -16.705421447753906, "gen_logits_min": -29.10390853881836, "gen_logits_std": 3.3039419651031494, "gen_loss": 0.2680947184562683, "grad_norm": 0.3624857965604069, "learning_rate": 2.130442105263158e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9970106035470963, "mean_gen_accuracy": 0.8793482482433319, "mean_token_accuracy": 0.9086440801620483, "num_tokens": 982229205.0, "sample_num_tokens": 8991.75, "step": 7620, "total_num_tokens": 982265172.0, "z_loss": 0.0004238656547386199 }, { "copy_logits_max": -7.760888576507568, "copy_logits_min": -687500032.0, "copy_num_tokens": 717.375, "epoch": 1.5565483788613736, "gen_logits_max": 2.859100818634033, "gen_logits_mean": -17.172256469726562, "gen_logits_min": -29.252716064453125, "gen_logits_std": 3.3038418292999268, "gen_loss": 0.23032289743423462, "grad_norm": 0.3849631186428519, "learning_rate": 2.1303157894736842e-05, "loss": 0.26, "mean_copy_accuracy": 0.9967318326234818, "mean_gen_accuracy": 0.8810623288154602, "mean_token_accuracy": 0.911224752664566, "num_tokens": 982490313.0, "sample_num_tokens": 9932.75, "step": 7621, "total_num_tokens": 982530044.0, "z_loss": 0.0003857760748360306 }, { "copy_logits_max": -7.424684524536133, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.625, "epoch": 1.5567526167985704, "gen_logits_max": 3.3063602447509766, "gen_logits_mean": -16.751996994018555, "gen_logits_min": -28.95720863342285, "gen_logits_std": 3.2956042289733887, "gen_loss": 0.26926976442337036, "grad_norm": 0.32830722671034174, "learning_rate": 2.1301894736842106e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9970674961805344, "mean_gen_accuracy": 0.8837121427059174, "mean_token_accuracy": 0.9132543951272964, "num_tokens": 982776852.0, "sample_num_tokens": 8422.0, "step": 7622, "total_num_tokens": 982810540.0, "z_loss": 0.00043573856237344444 }, { "copy_logits_max": -8.001747131347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.5, "epoch": 1.556956854735767, "gen_logits_max": 3.657045841217041, "gen_logits_mean": -16.602909088134766, "gen_logits_min": -28.958602905273438, "gen_logits_std": 3.275578498840332, "gen_loss": 0.29157185554504395, "grad_norm": 0.33539200966667565, "learning_rate": 2.130063157894737e-05, "loss": 0.2595, "mean_copy_accuracy": 0.996976375579834, "mean_gen_accuracy": 0.8801788836717606, "mean_token_accuracy": 0.910755068063736, "num_tokens": 983064840.0, "sample_num_tokens": 7572.5, "step": 7623, "total_num_tokens": 983095130.0, "z_loss": 0.0004450748674571514 }, { "copy_logits_max": -8.05968952178955, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.875, "epoch": 1.557161092672964, "gen_logits_max": 3.3405516147613525, "gen_logits_mean": -17.405611038208008, "gen_logits_min": -29.540929794311523, "gen_logits_std": 3.299907684326172, "gen_loss": 0.2535378336906433, "grad_norm": 0.33522346269438313, "learning_rate": 2.129936842105263e-05, "loss": 0.27, "mean_copy_accuracy": 0.9971352517604828, "mean_gen_accuracy": 0.8786014914512634, "mean_token_accuracy": 0.9063833206892014, "num_tokens": 983339385.0, "sample_num_tokens": 8088.75, "step": 7624, "total_num_tokens": 983371740.0, "z_loss": 0.00041025984683074057 }, { "copy_logits_max": -4.977344512939453, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.1875, "epoch": 1.557365330610161, "gen_logits_max": 2.9620728492736816, "gen_logits_mean": -16.595016479492188, "gen_logits_min": -29.761470794677734, "gen_logits_std": 3.3178467750549316, "gen_loss": 0.25556379556655884, "grad_norm": 0.35812433253776577, "learning_rate": 2.1298105263157896e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9969502836465836, "mean_gen_accuracy": 0.8799146264791489, "mean_token_accuracy": 0.9125896841287613, "num_tokens": 983604880.0, "sample_num_tokens": 9032.5, "step": 7625, "total_num_tokens": 983641010.0, "z_loss": 0.0004628780880011618 }, { "copy_logits_max": -4.238962173461914, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.25, "epoch": 1.5575695685473576, "gen_logits_max": 3.561753749847412, "gen_logits_mean": -16.392845153808594, "gen_logits_min": -28.795974731445312, "gen_logits_std": 3.3027729988098145, "gen_loss": 0.24880558252334595, "grad_norm": 0.33996969748030587, "learning_rate": 2.1296842105263157e-05, "loss": 0.2457, "mean_copy_accuracy": 0.9968792349100113, "mean_gen_accuracy": 0.8851243555545807, "mean_token_accuracy": 0.9169685542583466, "num_tokens": 983887463.0, "sample_num_tokens": 7748.75, "step": 7626, "total_num_tokens": 983918458.0, "z_loss": 0.000382395985070616 }, { "copy_logits_max": -5.615927219390869, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.625, "epoch": 1.5577738064845545, "gen_logits_max": 3.278886318206787, "gen_logits_mean": -17.364667892456055, "gen_logits_min": -29.480850219726562, "gen_logits_std": 3.3139302730560303, "gen_loss": 0.28114548325538635, "grad_norm": 0.3399548471490065, "learning_rate": 2.129557894736842e-05, "loss": 0.2578, "mean_copy_accuracy": 0.9972585737705231, "mean_gen_accuracy": 0.8802904784679413, "mean_token_accuracy": 0.911301389336586, "num_tokens": 984167973.0, "sample_num_tokens": 7610.75, "step": 7627, "total_num_tokens": 984198416.0, "z_loss": 0.00043908855877816677 }, { "copy_logits_max": -5.803390026092529, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.1875, "epoch": 1.5579780444217515, "gen_logits_max": 4.402507305145264, "gen_logits_mean": -16.32166862487793, "gen_logits_min": -28.564294815063477, "gen_logits_std": 3.2762231826782227, "gen_loss": 0.27553796768188477, "grad_norm": 0.39838033427852326, "learning_rate": 2.1294315789473685e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9967578202486038, "mean_gen_accuracy": 0.8808585107326508, "mean_token_accuracy": 0.9050044566392899, "num_tokens": 984429275.0, "sample_num_tokens": 8020.75, "step": 7628, "total_num_tokens": 984461358.0, "z_loss": 0.00043134030420333147 }, { "copy_logits_max": -6.767826080322266, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.6875, "epoch": 1.5581822823589482, "gen_logits_max": 4.185974597930908, "gen_logits_mean": -16.129703521728516, "gen_logits_min": -28.04322624206543, "gen_logits_std": 3.2560229301452637, "gen_loss": 0.29335638880729675, "grad_norm": 0.3648259298079662, "learning_rate": 2.129305263157895e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9975522458553314, "mean_gen_accuracy": 0.8748671859502792, "mean_token_accuracy": 0.9046867042779922, "num_tokens": 984707022.0, "sample_num_tokens": 7815.0, "step": 7629, "total_num_tokens": 984738282.0, "z_loss": 0.0004162515979260206 }, { "copy_logits_max": -6.362004280090332, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.6875, "epoch": 1.5583865202961449, "gen_logits_max": 3.628159999847412, "gen_logits_mean": -16.58413314819336, "gen_logits_min": -28.546424865722656, "gen_logits_std": 3.294830799102783, "gen_loss": 0.2514776587486267, "grad_norm": 0.352435907441253, "learning_rate": 2.129178947368421e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9960944652557373, "mean_gen_accuracy": 0.8828819692134857, "mean_token_accuracy": 0.910507082939148, "num_tokens": 984972083.0, "sample_num_tokens": 9551.75, "step": 7630, "total_num_tokens": 985010290.0, "z_loss": 0.0003654363099485636 }, { "copy_logits_max": -7.301231861114502, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.375, "epoch": 1.5585907582333418, "gen_logits_max": 3.4247007369995117, "gen_logits_mean": -17.727670669555664, "gen_logits_min": -29.718862533569336, "gen_logits_std": 3.3120710849761963, "gen_loss": 0.285868763923645, "grad_norm": 0.39456974693058644, "learning_rate": 2.1290526315789475e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9963949918746948, "mean_gen_accuracy": 0.8729608058929443, "mean_token_accuracy": 0.9024301171302795, "num_tokens": 985235501.0, "sample_num_tokens": 9440.25, "step": 7631, "total_num_tokens": 985273262.0, "z_loss": 0.0004743231402244419 }, { "copy_logits_max": -8.011547088623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.3125, "epoch": 1.5587949961705387, "gen_logits_max": 4.343976974487305, "gen_logits_mean": -17.099472045898438, "gen_logits_min": -28.74198341369629, "gen_logits_std": 3.2561683654785156, "gen_loss": 0.3017899990081787, "grad_norm": 0.5530316110168841, "learning_rate": 2.1289263157894736e-05, "loss": 0.287, "mean_copy_accuracy": 0.996540293097496, "mean_gen_accuracy": 0.8761387765407562, "mean_token_accuracy": 0.901360884308815, "num_tokens": 985492550.0, "sample_num_tokens": 8166.5, "step": 7632, "total_num_tokens": 985525216.0, "z_loss": 0.0004988734144717455 }, { "copy_logits_max": -7.170228958129883, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.75, "epoch": 1.5589992341077354, "gen_logits_max": 2.982722520828247, "gen_logits_mean": -17.82990264892578, "gen_logits_min": -29.892526626586914, "gen_logits_std": 3.3598709106445312, "gen_loss": 0.2233506292104721, "grad_norm": 0.37370003550549175, "learning_rate": 2.1288e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9973814934492111, "mean_gen_accuracy": 0.8753749281167984, "mean_token_accuracy": 0.908770814538002, "num_tokens": 985762775.0, "sample_num_tokens": 8300.75, "step": 7633, "total_num_tokens": 985795978.0, "z_loss": 0.000382129626814276 }, { "copy_logits_max": -6.241347789764404, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.9375, "epoch": 1.5592034720449324, "gen_logits_max": 4.47498893737793, "gen_logits_mean": -16.00445556640625, "gen_logits_min": -27.919448852539062, "gen_logits_std": 3.2664074897766113, "gen_loss": 0.31248384714126587, "grad_norm": 0.6435613839586599, "learning_rate": 2.128673684210526e-05, "loss": 0.2946, "mean_copy_accuracy": 0.9963958263397217, "mean_gen_accuracy": 0.8707052320241928, "mean_token_accuracy": 0.9006534814834595, "num_tokens": 986017880.0, "sample_num_tokens": 7647.5, "step": 7634, "total_num_tokens": 986048470.0, "z_loss": 0.0005338095361366868 }, { "copy_logits_max": -4.333131790161133, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.75, "epoch": 1.5594077099821293, "gen_logits_max": 4.314964771270752, "gen_logits_mean": -16.189598083496094, "gen_logits_min": -28.21891212463379, "gen_logits_std": 3.261298656463623, "gen_loss": 0.281697154045105, "grad_norm": 0.3657763761538184, "learning_rate": 2.1285473684210525e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9956910014152527, "mean_gen_accuracy": 0.876876175403595, "mean_token_accuracy": 0.9041419625282288, "num_tokens": 986290415.0, "sample_num_tokens": 7700.25, "step": 7635, "total_num_tokens": 986321216.0, "z_loss": 0.0004525313270278275 }, { "copy_logits_max": -4.110805511474609, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.625, "epoch": 1.559611947919326, "gen_logits_max": 3.2496495246887207, "gen_logits_mean": -17.81231689453125, "gen_logits_min": -29.97461700439453, "gen_logits_std": 3.2911486625671387, "gen_loss": 0.318156898021698, "grad_norm": 0.37971262202889494, "learning_rate": 2.1284210526315793e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9960838109254837, "mean_gen_accuracy": 0.8834813386201859, "mean_token_accuracy": 0.906763955950737, "num_tokens": 986547749.0, "sample_num_tokens": 8576.75, "step": 7636, "total_num_tokens": 986582056.0, "z_loss": 0.0005563571467064321 }, { "copy_logits_max": -3.8745691776275635, "copy_logits_min": -750000064.0, "copy_num_tokens": 492.1875, "epoch": 1.5598161858565227, "gen_logits_max": 4.136199951171875, "gen_logits_mean": -16.325347900390625, "gen_logits_min": -28.799602508544922, "gen_logits_std": 3.2759501934051514, "gen_loss": 0.2940704822540283, "grad_norm": 0.3541307569693827, "learning_rate": 2.1282947368421054e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9971622973680496, "mean_gen_accuracy": 0.8774037957191467, "mean_token_accuracy": 0.9091913551092148, "num_tokens": 986818462.0, "sample_num_tokens": 9129.0, "step": 7637, "total_num_tokens": 986854978.0, "z_loss": 0.0005469258176162839 }, { "copy_logits_max": -3.1778457164764404, "copy_logits_min": -750000064.0, "copy_num_tokens": 539.25, "epoch": 1.5600204237937196, "gen_logits_max": 3.2356438636779785, "gen_logits_mean": -16.8416690826416, "gen_logits_min": -28.77947998046875, "gen_logits_std": 3.2291665077209473, "gen_loss": 0.28079456090927124, "grad_norm": 0.3726552003813714, "learning_rate": 2.128168421052632e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9972717612981796, "mean_gen_accuracy": 0.8673067092895508, "mean_token_accuracy": 0.9039629697799683, "num_tokens": 987114449.0, "sample_num_tokens": 8719.25, "step": 7638, "total_num_tokens": 987149326.0, "z_loss": 0.0004739940632134676 }, { "copy_logits_max": -5.9938130378723145, "copy_logits_min": -750000064.0, "copy_num_tokens": 474.0625, "epoch": 1.5602246617309166, "gen_logits_max": 4.725067138671875, "gen_logits_mean": -15.612997055053711, "gen_logits_min": -27.976016998291016, "gen_logits_std": 3.2482428550720215, "gen_loss": 0.2856432795524597, "grad_norm": 0.35091241674556156, "learning_rate": 2.128042105263158e-05, "loss": 0.2727, "mean_copy_accuracy": 0.997451588511467, "mean_gen_accuracy": 0.8778342753648758, "mean_token_accuracy": 0.9080526828765869, "num_tokens": 987390491.0, "sample_num_tokens": 8837.25, "step": 7639, "total_num_tokens": 987425840.0, "z_loss": 0.00048769399290904403 }, { "copy_logits_max": -4.668630599975586, "copy_logits_min": -687500096.0, "copy_num_tokens": 629.25, "epoch": 1.5604288996681133, "gen_logits_max": 3.8075480461120605, "gen_logits_mean": -15.800573348999023, "gen_logits_min": -28.230703353881836, "gen_logits_std": 3.2917613983154297, "gen_loss": 0.23545242846012115, "grad_norm": 0.3384035739409001, "learning_rate": 2.1279157894736844e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9965094327926636, "mean_gen_accuracy": 0.8799599409103394, "mean_token_accuracy": 0.9095610082149506, "num_tokens": 987667179.0, "sample_num_tokens": 8642.75, "step": 7640, "total_num_tokens": 987701750.0, "z_loss": 0.0003884746693074703 }, { "copy_logits_max": -7.452192783355713, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.25, "epoch": 1.5606331376053102, "gen_logits_max": 4.669416904449463, "gen_logits_mean": -16.32632064819336, "gen_logits_min": -28.233478546142578, "gen_logits_std": 3.257842779159546, "gen_loss": 0.2938501238822937, "grad_norm": 0.3465471251943784, "learning_rate": 2.1277894736842104e-05, "loss": 0.284, "mean_copy_accuracy": 0.9958964139223099, "mean_gen_accuracy": 0.8755853027105331, "mean_token_accuracy": 0.9040690809488297, "num_tokens": 987930430.0, "sample_num_tokens": 7495.0, "step": 7641, "total_num_tokens": 987960410.0, "z_loss": 0.000544241163879633 }, { "copy_logits_max": -3.956815481185913, "copy_logits_min": -687500032.0, "copy_num_tokens": 308.0, "epoch": 1.5608373755425071, "gen_logits_max": 4.131283760070801, "gen_logits_mean": -16.788066864013672, "gen_logits_min": -29.102155685424805, "gen_logits_std": 3.304490089416504, "gen_loss": 0.25891438126564026, "grad_norm": 0.3601807032236772, "learning_rate": 2.127663157894737e-05, "loss": 0.2758, "mean_copy_accuracy": 0.996967151761055, "mean_gen_accuracy": 0.8793751299381256, "mean_token_accuracy": 0.9046345502138138, "num_tokens": 988175258.0, "sample_num_tokens": 6757.5, "step": 7642, "total_num_tokens": 988202288.0, "z_loss": 0.00045186313218437135 }, { "copy_logits_max": -5.002593040466309, "copy_logits_min": -687500032.0, "copy_num_tokens": 520.9375, "epoch": 1.5610416134797038, "gen_logits_max": 3.6932907104492188, "gen_logits_mean": -16.309450149536133, "gen_logits_min": -28.46305274963379, "gen_logits_std": 3.25671648979187, "gen_loss": 0.28246572613716125, "grad_norm": 0.3447516158439573, "learning_rate": 2.127536842105263e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9970793277025223, "mean_gen_accuracy": 0.8816688656806946, "mean_token_accuracy": 0.9104204177856445, "num_tokens": 988458135.0, "sample_num_tokens": 8661.75, "step": 7643, "total_num_tokens": 988492782.0, "z_loss": 0.0005108413752168417 }, { "copy_logits_max": -5.20623779296875, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.0, "epoch": 1.5612458514169005, "gen_logits_max": 3.8814029693603516, "gen_logits_mean": -15.96872329711914, "gen_logits_min": -28.081907272338867, "gen_logits_std": 3.26059627532959, "gen_loss": 0.2850896716117859, "grad_norm": 0.3666653619273284, "learning_rate": 2.1274105263157897e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9973372370004654, "mean_gen_accuracy": 0.8689681589603424, "mean_token_accuracy": 0.9008225798606873, "num_tokens": 988720464.0, "sample_num_tokens": 8039.5, "step": 7644, "total_num_tokens": 988752622.0, "z_loss": 0.0005118706030771136 }, { "copy_logits_max": -6.048843860626221, "copy_logits_min": -687500032.0, "copy_num_tokens": 468.9375, "epoch": 1.5614500893540977, "gen_logits_max": 4.945520401000977, "gen_logits_mean": -14.983932495117188, "gen_logits_min": -27.322826385498047, "gen_logits_std": 3.2002322673797607, "gen_loss": 0.29230886697769165, "grad_norm": 0.34742614963298896, "learning_rate": 2.127284210526316e-05, "loss": 0.2787, "mean_copy_accuracy": 0.997260108590126, "mean_gen_accuracy": 0.8757896423339844, "mean_token_accuracy": 0.9066115021705627, "num_tokens": 988990458.0, "sample_num_tokens": 8763.5, "step": 7645, "total_num_tokens": 989025512.0, "z_loss": 0.0005099278641864657 }, { "copy_logits_max": -5.352231979370117, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.375, "epoch": 1.5616543272912944, "gen_logits_max": 3.4407434463500977, "gen_logits_mean": -16.533432006835938, "gen_logits_min": -28.97903823852539, "gen_logits_std": 3.263237476348877, "gen_loss": 0.2713848352432251, "grad_norm": 0.35488343025315067, "learning_rate": 2.1271578947368423e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9964002370834351, "mean_gen_accuracy": 0.874124139547348, "mean_token_accuracy": 0.9041211605072021, "num_tokens": 989264829.0, "sample_num_tokens": 8670.25, "step": 7646, "total_num_tokens": 989299510.0, "z_loss": 0.00043406232725828886 }, { "copy_logits_max": -5.438328742980957, "copy_logits_min": -687500032.0, "copy_num_tokens": 338.5625, "epoch": 1.561858565228491, "gen_logits_max": 4.395981788635254, "gen_logits_mean": -15.862081527709961, "gen_logits_min": -28.243404388427734, "gen_logits_std": 3.2239227294921875, "gen_loss": 0.32753193378448486, "grad_norm": 0.3712071979431413, "learning_rate": 2.1270315789473684e-05, "loss": 0.309, "mean_copy_accuracy": 0.9950520694255829, "mean_gen_accuracy": 0.8693165481090546, "mean_token_accuracy": 0.895709440112114, "num_tokens": 989524951.0, "sample_num_tokens": 7569.25, "step": 7647, "total_num_tokens": 989555228.0, "z_loss": 0.00053677789401263 }, { "copy_logits_max": -7.536138534545898, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.125, "epoch": 1.562062803165688, "gen_logits_max": 3.873908281326294, "gen_logits_mean": -17.45064353942871, "gen_logits_min": -29.253328323364258, "gen_logits_std": 3.2916147708892822, "gen_loss": 0.2503185570240021, "grad_norm": 0.365050625290047, "learning_rate": 2.1269052631578948e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9960788488388062, "mean_gen_accuracy": 0.885924220085144, "mean_token_accuracy": 0.9134188443422318, "num_tokens": 989800674.0, "sample_num_tokens": 7323.5, "step": 7648, "total_num_tokens": 989829968.0, "z_loss": 0.00041816907469183207 }, { "copy_logits_max": -6.083192825317383, "copy_logits_min": -750000000.0, "copy_num_tokens": 694.4375, "epoch": 1.562267041102885, "gen_logits_max": 3.22208309173584, "gen_logits_mean": -15.92823600769043, "gen_logits_min": -28.232585906982422, "gen_logits_std": 3.2660489082336426, "gen_loss": 0.227895125746727, "grad_norm": 0.3564906354162104, "learning_rate": 2.1267789473684212e-05, "loss": 0.2489, "mean_copy_accuracy": 0.9977609664201736, "mean_gen_accuracy": 0.8804657757282257, "mean_token_accuracy": 0.9156645834445953, "num_tokens": 990097559.0, "sample_num_tokens": 9938.25, "step": 7649, "total_num_tokens": 990137312.0, "z_loss": 0.00040587244438938797 }, { "copy_logits_max": -7.18321418762207, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5625, "epoch": 1.5624712790400817, "gen_logits_max": 3.7414803504943848, "gen_logits_mean": -16.884912490844727, "gen_logits_min": -28.620121002197266, "gen_logits_std": 3.252819061279297, "gen_loss": 0.2985984683036804, "grad_norm": 0.35022784777752036, "learning_rate": 2.1266526315789473e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9973947405815125, "mean_gen_accuracy": 0.8705874979496002, "mean_token_accuracy": 0.9020427912473679, "num_tokens": 990386466.0, "sample_num_tokens": 8392.5, "step": 7650, "total_num_tokens": 990420036.0, "z_loss": 0.0004979665973223746 }, { "copy_logits_max": -6.4727325439453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 654.25, "epoch": 1.5626755169772786, "gen_logits_max": 2.215134620666504, "gen_logits_mean": -18.094802856445312, "gen_logits_min": -29.972572326660156, "gen_logits_std": 3.346097469329834, "gen_loss": 0.2615160048007965, "grad_norm": 0.34803313442930234, "learning_rate": 2.1265263157894737e-05, "loss": 0.2545, "mean_copy_accuracy": 0.9975119382143021, "mean_gen_accuracy": 0.880131259560585, "mean_token_accuracy": 0.9118138998746872, "num_tokens": 990670964.0, "sample_num_tokens": 9746.5, "step": 7651, "total_num_tokens": 990709950.0, "z_loss": 0.00042535184184089303 }, { "copy_logits_max": -7.292962074279785, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.8125, "epoch": 1.5628797549144755, "gen_logits_max": 3.4838242530822754, "gen_logits_mean": -16.56514549255371, "gen_logits_min": -28.674821853637695, "gen_logits_std": 3.2928619384765625, "gen_loss": 0.25263047218322754, "grad_norm": 0.32933012358061675, "learning_rate": 2.1264000000000002e-05, "loss": 0.2833, "mean_copy_accuracy": 0.996524840593338, "mean_gen_accuracy": 0.880901575088501, "mean_token_accuracy": 0.9042041003704071, "num_tokens": 990930989.0, "sample_num_tokens": 7483.25, "step": 7652, "total_num_tokens": 990960922.0, "z_loss": 0.0004339340957812965 }, { "copy_logits_max": -7.227932929992676, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.375, "epoch": 1.5630839928516722, "gen_logits_max": 3.1439757347106934, "gen_logits_mean": -17.318744659423828, "gen_logits_min": -29.1616153717041, "gen_logits_std": 3.312135696411133, "gen_loss": 0.23785127699375153, "grad_norm": 0.3654644503046611, "learning_rate": 2.1262736842105266e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9950502961874008, "mean_gen_accuracy": 0.8783248513936996, "mean_token_accuracy": 0.9047341495752335, "num_tokens": 991195042.0, "sample_num_tokens": 8610.0, "step": 7653, "total_num_tokens": 991229482.0, "z_loss": 0.00042518641566857696 }, { "copy_logits_max": -7.721189498901367, "copy_logits_min": -750000000.0, "copy_num_tokens": 233.75, "epoch": 1.563288230788869, "gen_logits_max": 5.449304103851318, "gen_logits_mean": -16.011751174926758, "gen_logits_min": -27.619205474853516, "gen_logits_std": 3.214439868927002, "gen_loss": 0.35055726766586304, "grad_norm": 0.34546138029331175, "learning_rate": 2.1261473684210527e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9961118847131729, "mean_gen_accuracy": 0.8745536804199219, "mean_token_accuracy": 0.9001877158880234, "num_tokens": 991463770.0, "sample_num_tokens": 6902.0, "step": 7654, "total_num_tokens": 991491378.0, "z_loss": 0.0005448127631098032 }, { "copy_logits_max": -6.042311668395996, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.3125, "epoch": 1.5634924687260658, "gen_logits_max": 3.8784894943237305, "gen_logits_mean": -15.813143730163574, "gen_logits_min": -27.749666213989258, "gen_logits_std": 3.2368967533111572, "gen_loss": 0.2789772152900696, "grad_norm": 0.3569184214026099, "learning_rate": 2.126021052631579e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9977907240390778, "mean_gen_accuracy": 0.8744569271802902, "mean_token_accuracy": 0.9086134731769562, "num_tokens": 991731984.0, "sample_num_tokens": 8234.0, "step": 7655, "total_num_tokens": 991764920.0, "z_loss": 0.000515801424626261 }, { "copy_logits_max": -7.8586602210998535, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.8125, "epoch": 1.5636967066632628, "gen_logits_max": 5.624570369720459, "gen_logits_mean": -14.732501983642578, "gen_logits_min": -27.756271362304688, "gen_logits_std": 3.2302637100219727, "gen_loss": 0.26358985900878906, "grad_norm": 0.38254636572268014, "learning_rate": 2.1258947368421052e-05, "loss": 0.276, "mean_copy_accuracy": 0.9969854205846786, "mean_gen_accuracy": 0.8778428435325623, "mean_token_accuracy": 0.9057963937520981, "num_tokens": 991999352.0, "sample_num_tokens": 7943.5, "step": 7656, "total_num_tokens": 992031126.0, "z_loss": 0.0004513211315497756 }, { "copy_logits_max": -7.023556232452393, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.125, "epoch": 1.5639009446004595, "gen_logits_max": 4.544137001037598, "gen_logits_mean": -16.332275390625, "gen_logits_min": -29.139814376831055, "gen_logits_std": 3.268033981323242, "gen_loss": 0.3036766052246094, "grad_norm": 0.3129561530694671, "learning_rate": 2.1257684210526317e-05, "loss": 0.2613, "mean_copy_accuracy": 0.9975534379482269, "mean_gen_accuracy": 0.8774933218955994, "mean_token_accuracy": 0.9103129506111145, "num_tokens": 992289413.0, "sample_num_tokens": 9154.25, "step": 7657, "total_num_tokens": 992326030.0, "z_loss": 0.0005007246509194374 }, { "copy_logits_max": -9.316579818725586, "copy_logits_min": -687500032.0, "copy_num_tokens": 366.0, "epoch": 1.5641051825376564, "gen_logits_max": 3.752206563949585, "gen_logits_mean": -18.298734664916992, "gen_logits_min": -30.20751953125, "gen_logits_std": 3.3246970176696777, "gen_loss": 0.28579044342041016, "grad_norm": 0.3453976390268003, "learning_rate": 2.1256421052631577e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9972685277462006, "mean_gen_accuracy": 0.8699295967817307, "mean_token_accuracy": 0.9047040790319443, "num_tokens": 992569122.0, "sample_num_tokens": 7617.5, "step": 7658, "total_num_tokens": 992599592.0, "z_loss": 0.0004395263676997274 }, { "copy_logits_max": -7.75350284576416, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.875, "epoch": 1.5643094204748533, "gen_logits_max": 3.4239301681518555, "gen_logits_mean": -18.44865608215332, "gen_logits_min": -30.31983184814453, "gen_logits_std": 3.3035223484039307, "gen_loss": 0.30638387799263, "grad_norm": 0.39794545328140385, "learning_rate": 2.1255157894736842e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9957488030195236, "mean_gen_accuracy": 0.8731688559055328, "mean_token_accuracy": 0.9007185846567154, "num_tokens": 992812541.0, "sample_num_tokens": 7231.75, "step": 7659, "total_num_tokens": 992841468.0, "z_loss": 0.00045863352715969086 }, { "copy_logits_max": -7.224825859069824, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.25, "epoch": 1.56451365841205, "gen_logits_max": 4.628373146057129, "gen_logits_mean": -15.370879173278809, "gen_logits_min": -27.752445220947266, "gen_logits_std": 3.236255645751953, "gen_loss": 0.2726631760597229, "grad_norm": 0.37318061266727515, "learning_rate": 2.1253894736842106e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9964931011199951, "mean_gen_accuracy": 0.8756550848484039, "mean_token_accuracy": 0.9077660143375397, "num_tokens": 993076122.0, "sample_num_tokens": 8507.5, "step": 7660, "total_num_tokens": 993110152.0, "z_loss": 0.0004894859157502651 }, { "copy_logits_max": -7.99396276473999, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.75, "epoch": 1.5647178963492467, "gen_logits_max": 5.080874443054199, "gen_logits_mean": -16.176912307739258, "gen_logits_min": -28.204097747802734, "gen_logits_std": 3.247378349304199, "gen_loss": 0.27700939774513245, "grad_norm": 0.3469567469850811, "learning_rate": 2.125263157894737e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9965335726737976, "mean_gen_accuracy": 0.8836960047483444, "mean_token_accuracy": 0.9094046950340271, "num_tokens": 993355735.0, "sample_num_tokens": 8291.75, "step": 7661, "total_num_tokens": 993388902.0, "z_loss": 0.00047769572120159864 }, { "copy_logits_max": -7.018135070800781, "copy_logits_min": -750000000.0, "copy_num_tokens": 230.5625, "epoch": 1.5649221342864437, "gen_logits_max": 4.608957290649414, "gen_logits_mean": -16.683395385742188, "gen_logits_min": -28.81989288330078, "gen_logits_std": 3.2367606163024902, "gen_loss": 0.2729153633117676, "grad_norm": 0.39152604808916286, "learning_rate": 2.1251368421052635e-05, "loss": 0.2857, "mean_copy_accuracy": 0.994800254702568, "mean_gen_accuracy": 0.8791042417287827, "mean_token_accuracy": 0.9012600928544998, "num_tokens": 993596117.0, "sample_num_tokens": 6191.25, "step": 7662, "total_num_tokens": 993620882.0, "z_loss": 0.0005061248084530234 }, { "copy_logits_max": -5.878523349761963, "copy_logits_min": -625000064.0, "copy_num_tokens": 460.625, "epoch": 1.5651263722236406, "gen_logits_max": 3.1957736015319824, "gen_logits_mean": -18.151025772094727, "gen_logits_min": -30.195049285888672, "gen_logits_std": 3.2994019985198975, "gen_loss": 0.2965608835220337, "grad_norm": 0.3687405020446281, "learning_rate": 2.1250105263157896e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9978677779436111, "mean_gen_accuracy": 0.8774361908435822, "mean_token_accuracy": 0.9056877791881561, "num_tokens": 993861990.0, "sample_num_tokens": 8678.0, "step": 7663, "total_num_tokens": 993896702.0, "z_loss": 0.0005561144789680839 }, { "copy_logits_max": -6.504110813140869, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.5, "epoch": 1.5653306101608373, "gen_logits_max": 4.563724040985107, "gen_logits_mean": -16.19295310974121, "gen_logits_min": -28.505088806152344, "gen_logits_std": 3.265087604522705, "gen_loss": 0.2522188127040863, "grad_norm": 0.3528243181989009, "learning_rate": 2.124884210526316e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9969383478164673, "mean_gen_accuracy": 0.8796936571598053, "mean_token_accuracy": 0.9080318659543991, "num_tokens": 994137106.0, "sample_num_tokens": 9298.5, "step": 7664, "total_num_tokens": 994174300.0, "z_loss": 0.0004723169840872288 }, { "copy_logits_max": -7.6303300857543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.6875, "epoch": 1.5655348480980342, "gen_logits_max": 3.902310848236084, "gen_logits_mean": -18.175901412963867, "gen_logits_min": -29.6606388092041, "gen_logits_std": 3.2862303256988525, "gen_loss": 0.2758244276046753, "grad_norm": 0.35787799801547576, "learning_rate": 2.124757894736842e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9976636469364166, "mean_gen_accuracy": 0.8812347799539566, "mean_token_accuracy": 0.9109413772821426, "num_tokens": 994405319.0, "sample_num_tokens": 7721.75, "step": 7665, "total_num_tokens": 994436206.0, "z_loss": 0.00043628091225400567 }, { "copy_logits_max": -6.397061347961426, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.1875, "epoch": 1.5657390860352312, "gen_logits_max": 5.278304100036621, "gen_logits_mean": -15.531021118164062, "gen_logits_min": -27.246986389160156, "gen_logits_std": 3.2113964557647705, "gen_loss": 0.2745848000049591, "grad_norm": 0.35030754448242635, "learning_rate": 2.1246315789473685e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9972573071718216, "mean_gen_accuracy": 0.8774997144937515, "mean_token_accuracy": 0.9095748215913773, "num_tokens": 994706988.0, "sample_num_tokens": 9022.0, "step": 7666, "total_num_tokens": 994743076.0, "z_loss": 0.0004143306869082153 }, { "copy_logits_max": -4.428982734680176, "copy_logits_min": -687499968.0, "copy_num_tokens": 408.5, "epoch": 1.5659433239724279, "gen_logits_max": 4.195507049560547, "gen_logits_mean": -15.687566757202148, "gen_logits_min": -28.030109405517578, "gen_logits_std": 3.242030382156372, "gen_loss": 0.2694113850593567, "grad_norm": 0.33819055861556463, "learning_rate": 2.1245052631578946e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9974491447210312, "mean_gen_accuracy": 0.8822219669818878, "mean_token_accuracy": 0.909294530749321, "num_tokens": 994981394.0, "sample_num_tokens": 7493.0, "step": 7667, "total_num_tokens": 995011366.0, "z_loss": 0.0004661066341213882 }, { "copy_logits_max": -5.342381477355957, "copy_logits_min": -687500032.0, "copy_num_tokens": 491.875, "epoch": 1.5661475619096246, "gen_logits_max": 4.440253257751465, "gen_logits_mean": -16.12366485595703, "gen_logits_min": -28.466306686401367, "gen_logits_std": 3.2610814571380615, "gen_loss": 0.27910661697387695, "grad_norm": 0.3633116719428669, "learning_rate": 2.124378947368421e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9962848573923111, "mean_gen_accuracy": 0.8819382190704346, "mean_token_accuracy": 0.9106682389974594, "num_tokens": 995258189.0, "sample_num_tokens": 8665.25, "step": 7668, "total_num_tokens": 995292850.0, "z_loss": 0.0004797732690349221 }, { "copy_logits_max": -6.420877456665039, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.5625, "epoch": 1.5663517998468217, "gen_logits_max": 4.199718952178955, "gen_logits_mean": -16.07859230041504, "gen_logits_min": -28.56391143798828, "gen_logits_std": 3.2480649948120117, "gen_loss": 0.3004970848560333, "grad_norm": 0.37652852051212254, "learning_rate": 2.1242526315789475e-05, "loss": 0.291, "mean_copy_accuracy": 0.9953935891389847, "mean_gen_accuracy": 0.8710098564624786, "mean_token_accuracy": 0.9009978473186493, "num_tokens": 995526729.0, "sample_num_tokens": 8060.75, "step": 7669, "total_num_tokens": 995558972.0, "z_loss": 0.000487359007820487 }, { "copy_logits_max": -6.148970603942871, "copy_logits_min": -750000064.0, "copy_num_tokens": 481.5, "epoch": 1.5665560377840184, "gen_logits_max": 3.481234073638916, "gen_logits_mean": -17.560688018798828, "gen_logits_min": -29.30372428894043, "gen_logits_std": 3.2989015579223633, "gen_loss": 0.2704291343688965, "grad_norm": 0.3917756555083734, "learning_rate": 2.124126315789474e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9952748566865921, "mean_gen_accuracy": 0.8772557377815247, "mean_token_accuracy": 0.9050001353025436, "num_tokens": 995773428.0, "sample_num_tokens": 8513.5, "step": 7670, "total_num_tokens": 995807482.0, "z_loss": 0.00039674865547567606 }, { "copy_logits_max": -4.510903358459473, "copy_logits_min": -750000000.0, "copy_num_tokens": 757.625, "epoch": 1.5667602757212151, "gen_logits_max": 1.0111151933670044, "gen_logits_mean": -19.531675338745117, "gen_logits_min": -31.96666717529297, "gen_logits_std": 3.422337532043457, "gen_loss": 0.23032428324222565, "grad_norm": 0.37406817745694504, "learning_rate": 2.124e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9963007569313049, "mean_gen_accuracy": 0.8738362491130829, "mean_token_accuracy": 0.9033256024122238, "num_tokens": 996028296.0, "sample_num_tokens": 9946.0, "step": 7671, "total_num_tokens": 996068080.0, "z_loss": 0.0004146432038396597 }, { "copy_logits_max": -8.173340797424316, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.625, "epoch": 1.566964513658412, "gen_logits_max": 4.245974540710449, "gen_logits_mean": -17.48264503479004, "gen_logits_min": -29.117351531982422, "gen_logits_std": 3.28842830657959, "gen_loss": 0.3147137761116028, "grad_norm": 0.36172851410101, "learning_rate": 2.1238736842105264e-05, "loss": 0.3014, "mean_copy_accuracy": 0.9959909170866013, "mean_gen_accuracy": 0.8742559999227524, "mean_token_accuracy": 0.8973284959793091, "num_tokens": 996287001.0, "sample_num_tokens": 8041.75, "step": 7672, "total_num_tokens": 996319168.0, "z_loss": 0.00043290690518915653 }, { "copy_logits_max": -5.092373847961426, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.4375, "epoch": 1.567168751595609, "gen_logits_max": 5.142375946044922, "gen_logits_mean": -15.03455924987793, "gen_logits_min": -27.418678283691406, "gen_logits_std": 3.2386252880096436, "gen_loss": 0.2944847643375397, "grad_norm": 0.37292805734314827, "learning_rate": 2.1237473684210525e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9966955482959747, "mean_gen_accuracy": 0.8755421042442322, "mean_token_accuracy": 0.9064928740262985, "num_tokens": 996561554.0, "sample_num_tokens": 7626.5, "step": 7673, "total_num_tokens": 996592060.0, "z_loss": 0.00045383255928754807 }, { "copy_logits_max": -6.818933486938477, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.3125, "epoch": 1.5673729895328057, "gen_logits_max": 4.652703285217285, "gen_logits_mean": -15.61811351776123, "gen_logits_min": -28.110225677490234, "gen_logits_std": 3.2374181747436523, "gen_loss": 0.31307220458984375, "grad_norm": 0.3870877279197442, "learning_rate": 2.123621052631579e-05, "loss": 0.3063, "mean_copy_accuracy": 0.9961349070072174, "mean_gen_accuracy": 0.8692019879817963, "mean_token_accuracy": 0.8979099094867706, "num_tokens": 996813099.0, "sample_num_tokens": 9037.75, "step": 7674, "total_num_tokens": 996849250.0, "z_loss": 0.0004259524866938591 }, { "copy_logits_max": -6.5641584396362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.0625, "epoch": 1.5675772274700024, "gen_logits_max": 3.916931629180908, "gen_logits_mean": -16.648351669311523, "gen_logits_min": -28.8975772857666, "gen_logits_std": 3.2942073345184326, "gen_loss": 0.26247596740722656, "grad_norm": 0.36865629226763563, "learning_rate": 2.123494736842105e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9963641911745071, "mean_gen_accuracy": 0.875473216176033, "mean_token_accuracy": 0.8999351859092712, "num_tokens": 997085949.0, "sample_num_tokens": 8448.75, "step": 7675, "total_num_tokens": 997119744.0, "z_loss": 0.00037167774280533195 }, { "copy_logits_max": -7.029972076416016, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.8125, "epoch": 1.5677814654071995, "gen_logits_max": 3.9150376319885254, "gen_logits_mean": -17.807437896728516, "gen_logits_min": -29.587787628173828, "gen_logits_std": 3.3280646800994873, "gen_loss": 0.25935012102127075, "grad_norm": 0.3683051902394924, "learning_rate": 2.1233684210526315e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9970219284296036, "mean_gen_accuracy": 0.8782259821891785, "mean_token_accuracy": 0.9108799546957016, "num_tokens": 997346509.0, "sample_num_tokens": 8697.75, "step": 7676, "total_num_tokens": 997381300.0, "z_loss": 0.00037513001007027924 }, { "copy_logits_max": -6.90776252746582, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.9375, "epoch": 1.5679857033443962, "gen_logits_max": 2.8882875442504883, "gen_logits_mean": -18.063674926757812, "gen_logits_min": -30.11482810974121, "gen_logits_std": 3.341176748275757, "gen_loss": 0.2611771523952484, "grad_norm": 0.35373961621697214, "learning_rate": 2.1232421052631582e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9959344118833542, "mean_gen_accuracy": 0.8750184774398804, "mean_token_accuracy": 0.9038732200860977, "num_tokens": 997623572.0, "sample_num_tokens": 9759.5, "step": 7677, "total_num_tokens": 997662610.0, "z_loss": 0.00038581897388212383 }, { "copy_logits_max": -8.587831497192383, "copy_logits_min": -750000000.0, "copy_num_tokens": 187.625, "epoch": 1.568189941281593, "gen_logits_max": 5.804924964904785, "gen_logits_mean": -15.44810676574707, "gen_logits_min": -27.459983825683594, "gen_logits_std": 3.2287135124206543, "gen_loss": 0.30728381872177124, "grad_norm": 0.33619508531003767, "learning_rate": 2.1231157894736843e-05, "loss": 0.3009, "mean_copy_accuracy": 0.9972074627876282, "mean_gen_accuracy": 0.8705991059541702, "mean_token_accuracy": 0.8978846818208694, "num_tokens": 997882700.0, "sample_num_tokens": 6370.0, "step": 7678, "total_num_tokens": 997908180.0, "z_loss": 0.00045706203673034906 }, { "copy_logits_max": -8.256933212280273, "copy_logits_min": -750000128.0, "copy_num_tokens": 440.9375, "epoch": 1.5683941792187899, "gen_logits_max": 5.63125467300415, "gen_logits_mean": -14.841907501220703, "gen_logits_min": -26.951780319213867, "gen_logits_std": 3.2367632389068604, "gen_loss": 0.29695796966552734, "grad_norm": 0.3937397374697553, "learning_rate": 2.1229894736842108e-05, "loss": 0.2992, "mean_copy_accuracy": 0.9959001839160919, "mean_gen_accuracy": 0.874398946762085, "mean_token_accuracy": 0.8976226150989532, "num_tokens": 998120777.0, "sample_num_tokens": 8511.75, "step": 7679, "total_num_tokens": 998154824.0, "z_loss": 0.00045594057883135974 }, { "copy_logits_max": -5.553555011749268, "copy_logits_min": -750000000.0, "copy_num_tokens": 701.125, "epoch": 1.5685984171559868, "gen_logits_max": 3.654893398284912, "gen_logits_mean": -16.068634033203125, "gen_logits_min": -28.542640686035156, "gen_logits_std": 3.3237881660461426, "gen_loss": 0.21989966928958893, "grad_norm": 0.34739997876633316, "learning_rate": 2.122863157894737e-05, "loss": 0.269, "mean_copy_accuracy": 0.997049555182457, "mean_gen_accuracy": 0.8806474208831787, "mean_token_accuracy": 0.9091260582208633, "num_tokens": 998381262.0, "sample_num_tokens": 9422.0, "step": 7680, "total_num_tokens": 998418950.0, "z_loss": 0.0003684314724523574 }, { "copy_logits_max": -7.110894680023193, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.875, "epoch": 1.5688026550931835, "gen_logits_max": 4.922213554382324, "gen_logits_mean": -16.486085891723633, "gen_logits_min": -28.85855484008789, "gen_logits_std": 3.2677788734436035, "gen_loss": 0.32058238983154297, "grad_norm": 0.3743859887525477, "learning_rate": 2.1227368421052633e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9965041130781174, "mean_gen_accuracy": 0.8712571859359741, "mean_token_accuracy": 0.8989610224962234, "num_tokens": 998657898.0, "sample_num_tokens": 8499.5, "step": 7681, "total_num_tokens": 998691896.0, "z_loss": 0.0005220556631684303 }, { "copy_logits_max": -7.883457660675049, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.625, "epoch": 1.5690068930303804, "gen_logits_max": 3.680210590362549, "gen_logits_mean": -17.646217346191406, "gen_logits_min": -29.766321182250977, "gen_logits_std": 3.352370262145996, "gen_loss": 0.22844114899635315, "grad_norm": 0.38909446007589193, "learning_rate": 2.1226105263157894e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9970730692148209, "mean_gen_accuracy": 0.8772156685590744, "mean_token_accuracy": 0.9046497493982315, "num_tokens": 998939161.0, "sample_num_tokens": 8903.25, "step": 7682, "total_num_tokens": 998974774.0, "z_loss": 0.00038202106952667236 }, { "copy_logits_max": -6.05438756942749, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.125, "epoch": 1.5692111309675774, "gen_logits_max": 5.515067100524902, "gen_logits_mean": -14.586307525634766, "gen_logits_min": -27.20413589477539, "gen_logits_std": 3.2396018505096436, "gen_loss": 0.2654797434806824, "grad_norm": 0.32264916499187807, "learning_rate": 2.1224842105263158e-05, "loss": 0.259, "mean_copy_accuracy": 0.9967685490846634, "mean_gen_accuracy": 0.8854183256626129, "mean_token_accuracy": 0.9132959991693497, "num_tokens": 999223100.0, "sample_num_tokens": 8918.0, "step": 7683, "total_num_tokens": 999258772.0, "z_loss": 0.0005337187321856618 }, { "copy_logits_max": -5.2904863357543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.625, "epoch": 1.569415368904774, "gen_logits_max": 4.059229850769043, "gen_logits_mean": -16.199769973754883, "gen_logits_min": -28.308969497680664, "gen_logits_std": 3.261352062225342, "gen_loss": 0.3115742802619934, "grad_norm": 0.3462521125644712, "learning_rate": 2.122357894736842e-05, "loss": 0.293, "mean_copy_accuracy": 0.9968618303537369, "mean_gen_accuracy": 0.8677959144115448, "mean_token_accuracy": 0.9013097286224365, "num_tokens": 999518515.0, "sample_num_tokens": 8149.75, "step": 7684, "total_num_tokens": 999551114.0, "z_loss": 0.0005474767531268299 }, { "copy_logits_max": -5.652245044708252, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.9375, "epoch": 1.5696196068419708, "gen_logits_max": 5.092254638671875, "gen_logits_mean": -15.325785636901855, "gen_logits_min": -27.301395416259766, "gen_logits_std": 3.2386608123779297, "gen_loss": 0.31433939933776855, "grad_norm": 0.35464642728380885, "learning_rate": 2.1222315789473687e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9972760528326035, "mean_gen_accuracy": 0.8664837181568146, "mean_token_accuracy": 0.8987444788217545, "num_tokens": 999810726.0, "sample_num_tokens": 9169.0, "step": 7685, "total_num_tokens": 999847402.0, "z_loss": 0.0006012589437887073 }, { "copy_logits_max": -5.573149681091309, "copy_logits_min": -625000064.0, "copy_num_tokens": 550.1875, "epoch": 1.5698238447791677, "gen_logits_max": 4.008366107940674, "gen_logits_mean": -17.12512969970703, "gen_logits_min": -29.35152816772461, "gen_logits_std": 3.306021213531494, "gen_loss": 0.2846341133117676, "grad_norm": 0.37086948255602514, "learning_rate": 2.1221052631578948e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9965298175811768, "mean_gen_accuracy": 0.8689559102058411, "mean_token_accuracy": 0.9011947214603424, "num_tokens": 1000081361.0, "sample_num_tokens": 8507.75, "step": 7686, "total_num_tokens": 1000115392.0, "z_loss": 0.0005197032587602735 }, { "copy_logits_max": -4.693183898925781, "copy_logits_min": -687500032.0, "copy_num_tokens": 509.5625, "epoch": 1.5700280827163646, "gen_logits_max": 6.420598030090332, "gen_logits_mean": -14.13167667388916, "gen_logits_min": -26.351518630981445, "gen_logits_std": 3.2168025970458984, "gen_loss": 0.2557767331600189, "grad_norm": 0.38968648157005564, "learning_rate": 2.1219789473684212e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9960413575172424, "mean_gen_accuracy": 0.8824733048677444, "mean_token_accuracy": 0.9085534811019897, "num_tokens": 1000354121.0, "sample_num_tokens": 8794.75, "step": 7687, "total_num_tokens": 1000389300.0, "z_loss": 0.00048697105376049876 }, { "copy_logits_max": -6.487493515014648, "copy_logits_min": -750000000.0, "copy_num_tokens": 234.0, "epoch": 1.5702323206535613, "gen_logits_max": 5.771519184112549, "gen_logits_mean": -16.00605010986328, "gen_logits_min": -28.000957489013672, "gen_logits_std": 3.237593650817871, "gen_loss": 0.3169599175453186, "grad_norm": 0.35199058915056153, "learning_rate": 2.1218526315789473e-05, "loss": 0.279, "mean_copy_accuracy": 0.9962689280509949, "mean_gen_accuracy": 0.8787482529878616, "mean_token_accuracy": 0.9040104150772095, "num_tokens": 1000613584.0, "sample_num_tokens": 6553.5, "step": 7688, "total_num_tokens": 1000639798.0, "z_loss": 0.0006054500117897987 }, { "copy_logits_max": -4.885036945343018, "copy_logits_min": -750000000.0, "copy_num_tokens": 642.9375, "epoch": 1.5704365585907583, "gen_logits_max": 5.641047477722168, "gen_logits_mean": -14.596272468566895, "gen_logits_min": -27.190277099609375, "gen_logits_std": 3.2502644062042236, "gen_loss": 0.2379954606294632, "grad_norm": 0.3763882567880648, "learning_rate": 2.1217263157894737e-05, "loss": 0.266, "mean_copy_accuracy": 0.9969860762357712, "mean_gen_accuracy": 0.8756407052278519, "mean_token_accuracy": 0.9084359109401703, "num_tokens": 1000873780.0, "sample_num_tokens": 8882.0, "step": 7689, "total_num_tokens": 1000909308.0, "z_loss": 0.0004224532749503851 }, { "copy_logits_max": -8.065620422363281, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.4375, "epoch": 1.5706407965279552, "gen_logits_max": 6.3829874992370605, "gen_logits_mean": -15.782342910766602, "gen_logits_min": -27.556079864501953, "gen_logits_std": 3.242283344268799, "gen_loss": 0.2920109033584595, "grad_norm": 0.335193115873731, "learning_rate": 2.1216e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9972299635410309, "mean_gen_accuracy": 0.8801181316375732, "mean_token_accuracy": 0.9064438045024872, "num_tokens": 1001157904.0, "sample_num_tokens": 8756.0, "step": 7690, "total_num_tokens": 1001192928.0, "z_loss": 0.0005182402674108744 }, { "copy_logits_max": -7.000488758087158, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.5, "epoch": 1.570845034465152, "gen_logits_max": 5.013121128082275, "gen_logits_mean": -16.6406307220459, "gen_logits_min": -28.88349151611328, "gen_logits_std": 3.2788329124450684, "gen_loss": 0.275801420211792, "grad_norm": 0.3151828061557631, "learning_rate": 2.1214736842105262e-05, "loss": 0.266, "mean_copy_accuracy": 0.9973127245903015, "mean_gen_accuracy": 0.8818868696689606, "mean_token_accuracy": 0.9080377817153931, "num_tokens": 1001438634.0, "sample_num_tokens": 7285.0, "step": 7691, "total_num_tokens": 1001467774.0, "z_loss": 0.0004967852728441358 }, { "copy_logits_max": -6.255274295806885, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.875, "epoch": 1.5710492724023486, "gen_logits_max": 4.521336078643799, "gen_logits_mean": -16.889705657958984, "gen_logits_min": -28.928443908691406, "gen_logits_std": 3.2979178428649902, "gen_loss": 0.24849621951580048, "grad_norm": 0.36811669550978726, "learning_rate": 2.1213473684210527e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9960125684738159, "mean_gen_accuracy": 0.882968932390213, "mean_token_accuracy": 0.9116909950971603, "num_tokens": 1001692648.0, "sample_num_tokens": 8824.0, "step": 7692, "total_num_tokens": 1001727944.0, "z_loss": 0.0004201136762276292 }, { "copy_logits_max": -3.309915065765381, "copy_logits_min": -750000064.0, "copy_num_tokens": 634.5, "epoch": 1.5712535103395455, "gen_logits_max": 4.805036544799805, "gen_logits_mean": -14.708677291870117, "gen_logits_min": -27.102901458740234, "gen_logits_std": 3.26007342338562, "gen_loss": 0.23661142587661743, "grad_norm": 0.3536986082932924, "learning_rate": 2.121221052631579e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9977645576000214, "mean_gen_accuracy": 0.8726881891489029, "mean_token_accuracy": 0.908433124423027, "num_tokens": 1001965278.0, "sample_num_tokens": 9449.5, "step": 7693, "total_num_tokens": 1002003076.0, "z_loss": 0.00045208720257505774 }, { "copy_logits_max": -2.2954752445220947, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.6875, "epoch": 1.5714577482767424, "gen_logits_max": 4.236222267150879, "gen_logits_mean": -16.468278884887695, "gen_logits_min": -29.026493072509766, "gen_logits_std": 3.29848575592041, "gen_loss": 0.2616705894470215, "grad_norm": 0.31717090910409707, "learning_rate": 2.1210947368421055e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9977601617574692, "mean_gen_accuracy": 0.8730739802122116, "mean_token_accuracy": 0.9103443175554276, "num_tokens": 1002257584.0, "sample_num_tokens": 8651.0, "step": 7694, "total_num_tokens": 1002292188.0, "z_loss": 0.00046883264440111816 }, { "copy_logits_max": -3.323160171508789, "copy_logits_min": -750000064.0, "copy_num_tokens": 496.8125, "epoch": 1.5716619862139392, "gen_logits_max": 5.8301920890808105, "gen_logits_mean": -13.645116806030273, "gen_logits_min": -25.63507843017578, "gen_logits_std": 3.1942191123962402, "gen_loss": 0.30042707920074463, "grad_norm": 0.3415740595387063, "learning_rate": 2.1209684210526316e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9973961263895035, "mean_gen_accuracy": 0.8725948184728622, "mean_token_accuracy": 0.9082670211791992, "num_tokens": 1002556178.0, "sample_num_tokens": 8829.0, "step": 7695, "total_num_tokens": 1002591494.0, "z_loss": 0.0005104179144836962 }, { "copy_logits_max": -5.2212419509887695, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.4375, "epoch": 1.571866224151136, "gen_logits_max": 3.27986478805542, "gen_logits_mean": -18.85627555847168, "gen_logits_min": -30.63956642150879, "gen_logits_std": 3.3382792472839355, "gen_loss": 0.31357407569885254, "grad_norm": 0.35273338026352136, "learning_rate": 2.120842105263158e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9967515915632248, "mean_gen_accuracy": 0.8695711940526962, "mean_token_accuracy": 0.8998053073883057, "num_tokens": 1002827803.0, "sample_num_tokens": 7106.75, "step": 7696, "total_num_tokens": 1002856230.0, "z_loss": 0.0005022775731049478 }, { "copy_logits_max": -4.37645149230957, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.625, "epoch": 1.572070462088333, "gen_logits_max": 4.468681812286377, "gen_logits_mean": -16.632471084594727, "gen_logits_min": -29.041091918945312, "gen_logits_std": 3.295912742614746, "gen_loss": 0.2853953242301941, "grad_norm": 0.36932507988448765, "learning_rate": 2.120715789473684e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9964213371276855, "mean_gen_accuracy": 0.8765835016965866, "mean_token_accuracy": 0.9101576656103134, "num_tokens": 1003111629.0, "sample_num_tokens": 7488.25, "step": 7697, "total_num_tokens": 1003141582.0, "z_loss": 0.000515424064360559 }, { "copy_logits_max": -4.501764297485352, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.375, "epoch": 1.5722747000255297, "gen_logits_max": 3.7716400623321533, "gen_logits_mean": -16.994054794311523, "gen_logits_min": -29.23625946044922, "gen_logits_std": 3.3348071575164795, "gen_loss": 0.25129348039627075, "grad_norm": 0.37691685794531765, "learning_rate": 2.1205894736842106e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9974070638418198, "mean_gen_accuracy": 0.8799728453159332, "mean_token_accuracy": 0.9088211357593536, "num_tokens": 1003371503.0, "sample_num_tokens": 8768.25, "step": 7698, "total_num_tokens": 1003406576.0, "z_loss": 0.0003692562459036708 }, { "copy_logits_max": -4.216575622558594, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.5, "epoch": 1.5724789379627264, "gen_logits_max": 5.035830020904541, "gen_logits_mean": -15.7044038772583, "gen_logits_min": -27.554702758789062, "gen_logits_std": 3.259182929992676, "gen_loss": 0.303097128868103, "grad_norm": 0.3435533548667615, "learning_rate": 2.1204631578947367e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9962342828512192, "mean_gen_accuracy": 0.8764345794916153, "mean_token_accuracy": 0.9045126885175705, "num_tokens": 1003627469.0, "sample_num_tokens": 6662.75, "step": 7699, "total_num_tokens": 1003654120.0, "z_loss": 0.00042827363358810544 }, { "copy_logits_max": -4.478619575500488, "copy_logits_min": -750000000.0, "copy_num_tokens": 681.1875, "epoch": 1.5726831758999236, "gen_logits_max": 4.54378604888916, "gen_logits_mean": -14.91379165649414, "gen_logits_min": -26.827301025390625, "gen_logits_std": 3.275235652923584, "gen_loss": 0.24903519451618195, "grad_norm": 0.39752531976371197, "learning_rate": 2.120336842105263e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9967354387044907, "mean_gen_accuracy": 0.874224916100502, "mean_token_accuracy": 0.90461166203022, "num_tokens": 1003897426.0, "sample_num_tokens": 10648.5, "step": 7700, "total_num_tokens": 1003940020.0, "z_loss": 0.00034803675953298807 }, { "copy_logits_max": -5.3629560470581055, "copy_logits_min": -687500096.0, "copy_num_tokens": 604.25, "epoch": 1.5728874138371203, "gen_logits_max": 3.0693492889404297, "gen_logits_mean": -17.342918395996094, "gen_logits_min": -29.247940063476562, "gen_logits_std": 3.3155479431152344, "gen_loss": 0.2706696391105652, "grad_norm": 0.3451134746592841, "learning_rate": 2.1202105263157895e-05, "loss": 0.2547, "mean_copy_accuracy": 0.9963074922561646, "mean_gen_accuracy": 0.8831884413957596, "mean_token_accuracy": 0.914766862988472, "num_tokens": 1004190825.0, "sample_num_tokens": 9500.25, "step": 7701, "total_num_tokens": 1004228826.0, "z_loss": 0.00044980604434385896 }, { "copy_logits_max": -4.492290496826172, "copy_logits_min": -625000064.0, "copy_num_tokens": 356.5, "epoch": 1.573091651774317, "gen_logits_max": 4.887072563171387, "gen_logits_mean": -15.895059585571289, "gen_logits_min": -27.717313766479492, "gen_logits_std": 3.257477283477783, "gen_loss": 0.33147042989730835, "grad_norm": 0.390027216054093, "learning_rate": 2.120084210526316e-05, "loss": 0.3118, "mean_copy_accuracy": 0.9965415894985199, "mean_gen_accuracy": 0.8646707236766815, "mean_token_accuracy": 0.8957983255386353, "num_tokens": 1004477663.0, "sample_num_tokens": 8239.25, "step": 7702, "total_num_tokens": 1004510620.0, "z_loss": 0.0005185336340218782 }, { "copy_logits_max": -1.5213394165039062, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.3125, "epoch": 1.573295889711514, "gen_logits_max": 4.226722717285156, "gen_logits_mean": -15.461738586425781, "gen_logits_min": -27.824899673461914, "gen_logits_std": 3.270974636077881, "gen_loss": 0.30644142627716064, "grad_norm": 0.36282480211372387, "learning_rate": 2.1199578947368424e-05, "loss": 0.2909, "mean_copy_accuracy": 0.99665267765522, "mean_gen_accuracy": 0.8710797280073166, "mean_token_accuracy": 0.9017757624387741, "num_tokens": 1004770199.0, "sample_num_tokens": 9212.25, "step": 7703, "total_num_tokens": 1004807048.0, "z_loss": 0.0005190163501538336 }, { "copy_logits_max": -4.387042045593262, "copy_logits_min": -750000064.0, "copy_num_tokens": 325.5625, "epoch": 1.5735001276487108, "gen_logits_max": 4.5862555503845215, "gen_logits_mean": -16.6748104095459, "gen_logits_min": -28.457773208618164, "gen_logits_std": 3.293130397796631, "gen_loss": 0.2832627296447754, "grad_norm": 0.3554068438932821, "learning_rate": 2.1198315789473685e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9963902831077576, "mean_gen_accuracy": 0.8781838268041611, "mean_token_accuracy": 0.9040844738483429, "num_tokens": 1005039055.0, "sample_num_tokens": 7663.75, "step": 7704, "total_num_tokens": 1005069710.0, "z_loss": 0.00043332469067536294 }, { "copy_logits_max": -3.2490074634552, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.0, "epoch": 1.5737043655859075, "gen_logits_max": 4.904985427856445, "gen_logits_mean": -15.098541259765625, "gen_logits_min": -27.502464294433594, "gen_logits_std": 3.261563301086426, "gen_loss": 0.25506848096847534, "grad_norm": 0.3653973880442141, "learning_rate": 2.119705263157895e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9968206733465195, "mean_gen_accuracy": 0.881933867931366, "mean_token_accuracy": 0.9108893573284149, "num_tokens": 1005297691.0, "sample_num_tokens": 7949.25, "step": 7705, "total_num_tokens": 1005329488.0, "z_loss": 0.0003961398033425212 }, { "copy_logits_max": -3.5854899883270264, "copy_logits_min": -687500032.0, "copy_num_tokens": 699.1875, "epoch": 1.5739086035231045, "gen_logits_max": 3.3674702644348145, "gen_logits_mean": -17.844825744628906, "gen_logits_min": -29.89898681640625, "gen_logits_std": 3.3651082515716553, "gen_loss": 0.237690269947052, "grad_norm": 0.3450106091361731, "learning_rate": 2.119578947368421e-05, "loss": 0.2555, "mean_copy_accuracy": 0.9975082725286484, "mean_gen_accuracy": 0.8825041502714157, "mean_token_accuracy": 0.9124103784561157, "num_tokens": 1005596813.0, "sample_num_tokens": 10161.25, "step": 7706, "total_num_tokens": 1005637458.0, "z_loss": 0.00038886634865775704 }, { "copy_logits_max": -4.460363388061523, "copy_logits_min": -625000064.0, "copy_num_tokens": 373.375, "epoch": 1.5741128414603014, "gen_logits_max": 4.531851768493652, "gen_logits_mean": -17.022945404052734, "gen_logits_min": -28.94892120361328, "gen_logits_std": 3.2727065086364746, "gen_loss": 0.31525853276252747, "grad_norm": 0.3449859515086249, "learning_rate": 2.1194526315789474e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9965435564517975, "mean_gen_accuracy": 0.873046025633812, "mean_token_accuracy": 0.9051206260919571, "num_tokens": 1005882794.0, "sample_num_tokens": 8182.0, "step": 7707, "total_num_tokens": 1005915522.0, "z_loss": 0.0004865980881731957 }, { "copy_logits_max": -0.8779409527778625, "copy_logits_min": -750000064.0, "copy_num_tokens": 514.8125, "epoch": 1.574317079397498, "gen_logits_max": 4.436036586761475, "gen_logits_mean": -15.119771003723145, "gen_logits_min": -27.810150146484375, "gen_logits_std": 3.226426601409912, "gen_loss": 0.2793319821357727, "grad_norm": 0.3682318976580481, "learning_rate": 2.1193263157894735e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9966231733560562, "mean_gen_accuracy": 0.8751871138811111, "mean_token_accuracy": 0.9084331095218658, "num_tokens": 1006156215.0, "sample_num_tokens": 8716.25, "step": 7708, "total_num_tokens": 1006191080.0, "z_loss": 0.00048331613652408123 }, { "copy_logits_max": -3.3430941104888916, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.5, "epoch": 1.5745213173346948, "gen_logits_max": 5.048469066619873, "gen_logits_mean": -15.406693458557129, "gen_logits_min": -28.406475067138672, "gen_logits_std": 3.277613639831543, "gen_loss": 0.24251598119735718, "grad_norm": 0.36474787828143257, "learning_rate": 2.1192e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9963248074054718, "mean_gen_accuracy": 0.8732042759656906, "mean_token_accuracy": 0.905218780040741, "num_tokens": 1006429190.0, "sample_num_tokens": 7801.5, "step": 7709, "total_num_tokens": 1006460396.0, "z_loss": 0.00046285311691462994 }, { "copy_logits_max": -5.927605152130127, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.875, "epoch": 1.5747255552718917, "gen_logits_max": 4.288140296936035, "gen_logits_mean": -17.92754554748535, "gen_logits_min": -29.858760833740234, "gen_logits_std": 3.310084581375122, "gen_loss": 0.26819777488708496, "grad_norm": 0.33979784756029324, "learning_rate": 2.1190736842105264e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9958890378475189, "mean_gen_accuracy": 0.8807374686002731, "mean_token_accuracy": 0.9088667631149292, "num_tokens": 1006711556.0, "sample_num_tokens": 8226.5, "step": 7710, "total_num_tokens": 1006744462.0, "z_loss": 0.0004813265986740589 }, { "copy_logits_max": -2.9709153175354004, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.75, "epoch": 1.5749297932090887, "gen_logits_max": 4.2984442710876465, "gen_logits_mean": -15.78152847290039, "gen_logits_min": -28.1741943359375, "gen_logits_std": 3.2666056156158447, "gen_loss": 0.2767210006713867, "grad_norm": 0.3611780388248846, "learning_rate": 2.118947368421053e-05, "loss": 0.2771, "mean_copy_accuracy": 0.997090145945549, "mean_gen_accuracy": 0.8771037608385086, "mean_token_accuracy": 0.9096081405878067, "num_tokens": 1007008168.0, "sample_num_tokens": 8300.0, "step": 7711, "total_num_tokens": 1007041368.0, "z_loss": 0.0005389737198129296 }, { "copy_logits_max": -3.295137882232666, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.875, "epoch": 1.5751340311462854, "gen_logits_max": 4.297231674194336, "gen_logits_mean": -16.6148738861084, "gen_logits_min": -29.022754669189453, "gen_logits_std": 3.2757511138916016, "gen_loss": 0.2912379801273346, "grad_norm": 0.3441365612988206, "learning_rate": 2.118821052631579e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9967789351940155, "mean_gen_accuracy": 0.8766562789678574, "mean_token_accuracy": 0.9078247845172882, "num_tokens": 1007269676.0, "sample_num_tokens": 7451.0, "step": 7712, "total_num_tokens": 1007299480.0, "z_loss": 0.0005379861686378717 }, { "copy_logits_max": -2.1292178630828857, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.5625, "epoch": 1.5753382690834823, "gen_logits_max": 6.175067901611328, "gen_logits_mean": -12.626354217529297, "gen_logits_min": -25.433801651000977, "gen_logits_std": 3.166205406188965, "gen_loss": 0.3476269543170929, "grad_norm": 0.3739263413859981, "learning_rate": 2.1186947368421054e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9955367892980576, "mean_gen_accuracy": 0.8723352402448654, "mean_token_accuracy": 0.9009215831756592, "num_tokens": 1007534140.0, "sample_num_tokens": 8370.5, "step": 7713, "total_num_tokens": 1007567622.0, "z_loss": 0.0006640668725594878 }, { "copy_logits_max": -3.9926483631134033, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.8125, "epoch": 1.5755425070206792, "gen_logits_max": 3.86264705657959, "gen_logits_mean": -16.354881286621094, "gen_logits_min": -28.9370174407959, "gen_logits_std": 3.279015064239502, "gen_loss": 0.28767311573028564, "grad_norm": 0.40092561172888097, "learning_rate": 2.1185684210526314e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9971362948417664, "mean_gen_accuracy": 0.8732628524303436, "mean_token_accuracy": 0.9007241725921631, "num_tokens": 1007818170.0, "sample_num_tokens": 9186.5, "step": 7714, "total_num_tokens": 1007854916.0, "z_loss": 0.00047665482270531356 }, { "copy_logits_max": -5.016082286834717, "copy_logits_min": -687500032.0, "copy_num_tokens": 606.3125, "epoch": 1.575746744957876, "gen_logits_max": 4.457813739776611, "gen_logits_mean": -14.858158111572266, "gen_logits_min": -27.187637329101562, "gen_logits_std": 3.2557520866394043, "gen_loss": 0.2193540632724762, "grad_norm": 0.4077450352633046, "learning_rate": 2.118442105263158e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9970778375864029, "mean_gen_accuracy": 0.877538800239563, "mean_token_accuracy": 0.9104124307632446, "num_tokens": 1008096438.0, "sample_num_tokens": 8937.5, "step": 7715, "total_num_tokens": 1008132188.0, "z_loss": 0.00037160597275942564 }, { "copy_logits_max": -6.602910995483398, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.6875, "epoch": 1.5759509828950726, "gen_logits_max": 3.569654941558838, "gen_logits_mean": -17.430133819580078, "gen_logits_min": -29.712310791015625, "gen_logits_std": 3.3069262504577637, "gen_loss": 0.2769539952278137, "grad_norm": 0.35324289193586567, "learning_rate": 2.1183157894736843e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9967301934957504, "mean_gen_accuracy": 0.879430279135704, "mean_token_accuracy": 0.9078531712293625, "num_tokens": 1008379356.0, "sample_num_tokens": 7979.0, "step": 7716, "total_num_tokens": 1008411272.0, "z_loss": 0.000479760579764843 }, { "copy_logits_max": -4.31583309173584, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.75, "epoch": 1.5761552208322696, "gen_logits_max": 5.397678852081299, "gen_logits_mean": -14.275924682617188, "gen_logits_min": -26.022274017333984, "gen_logits_std": 3.2112696170806885, "gen_loss": 0.28442102670669556, "grad_norm": 0.369236696750778, "learning_rate": 2.1181894736842104e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9956895411014557, "mean_gen_accuracy": 0.8784151673316956, "mean_token_accuracy": 0.906857892870903, "num_tokens": 1008670307.0, "sample_num_tokens": 8239.25, "step": 7717, "total_num_tokens": 1008703264.0, "z_loss": 0.00045227783266454935 }, { "copy_logits_max": -3.540783405303955, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.6875, "epoch": 1.5763594587694665, "gen_logits_max": 4.586134910583496, "gen_logits_mean": -15.426732063293457, "gen_logits_min": -27.556760787963867, "gen_logits_std": 3.2345142364501953, "gen_loss": 0.2810305953025818, "grad_norm": 0.4007188831307121, "learning_rate": 2.1180631578947372e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9959459751844406, "mean_gen_accuracy": 0.8668120503425598, "mean_token_accuracy": 0.8967367261648178, "num_tokens": 1008940534.0, "sample_num_tokens": 10158.0, "step": 7718, "total_num_tokens": 1008981166.0, "z_loss": 0.0004728016210719943 }, { "copy_logits_max": -3.3313939571380615, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.875, "epoch": 1.5765636967066632, "gen_logits_max": 3.3510990142822266, "gen_logits_mean": -17.023765563964844, "gen_logits_min": -29.225019454956055, "gen_logits_std": 3.2949166297912598, "gen_loss": 0.24971343576908112, "grad_norm": 0.3753445214765902, "learning_rate": 2.1179368421052633e-05, "loss": 0.2692, "mean_copy_accuracy": 0.9967670291662216, "mean_gen_accuracy": 0.8748825192451477, "mean_token_accuracy": 0.9080469757318497, "num_tokens": 1009218014.0, "sample_num_tokens": 9676.0, "step": 7719, "total_num_tokens": 1009256718.0, "z_loss": 0.00046352052595466375 }, { "copy_logits_max": -1.7929065227508545, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.0625, "epoch": 1.5767679346438601, "gen_logits_max": 5.120942115783691, "gen_logits_mean": -14.270418167114258, "gen_logits_min": -26.45860481262207, "gen_logits_std": 3.220139503479004, "gen_loss": 0.2733505070209503, "grad_norm": 0.34653341089798995, "learning_rate": 2.1178105263157897e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9968068599700928, "mean_gen_accuracy": 0.8776751607656479, "mean_token_accuracy": 0.906623587012291, "num_tokens": 1009490954.0, "sample_num_tokens": 9321.5, "step": 7720, "total_num_tokens": 1009528240.0, "z_loss": 0.0005135873798280954 }, { "copy_logits_max": -4.923717498779297, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.75, "epoch": 1.576972172581057, "gen_logits_max": 2.32346510887146, "gen_logits_mean": -19.19822883605957, "gen_logits_min": -31.033161163330078, "gen_logits_std": 3.3608157634735107, "gen_loss": 0.2983451187610626, "grad_norm": 0.35477016194526123, "learning_rate": 2.1176842105263158e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9969457238912582, "mean_gen_accuracy": 0.8704542070627213, "mean_token_accuracy": 0.8999113142490387, "num_tokens": 1009765131.0, "sample_num_tokens": 7821.25, "step": 7721, "total_num_tokens": 1009796416.0, "z_loss": 0.0004625895235221833 }, { "copy_logits_max": -2.892378568649292, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.875, "epoch": 1.5771764105182537, "gen_logits_max": 4.037191390991211, "gen_logits_mean": -16.249120712280273, "gen_logits_min": -28.571287155151367, "gen_logits_std": 3.2805604934692383, "gen_loss": 0.268014132976532, "grad_norm": 0.35315271117717006, "learning_rate": 2.1175578947368422e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9969930201768875, "mean_gen_accuracy": 0.8774928748607635, "mean_token_accuracy": 0.9081856310367584, "num_tokens": 1010055116.0, "sample_num_tokens": 7958.5, "step": 7722, "total_num_tokens": 1010086950.0, "z_loss": 0.00042705563828349113 }, { "copy_logits_max": -1.6836118698120117, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.4375, "epoch": 1.5773806484554505, "gen_logits_max": 3.9525930881500244, "gen_logits_mean": -16.682361602783203, "gen_logits_min": -28.93197250366211, "gen_logits_std": 3.3074145317077637, "gen_loss": 0.26207903027534485, "grad_norm": 0.34990971602181137, "learning_rate": 2.1174315789473683e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9967013150453568, "mean_gen_accuracy": 0.8766095191240311, "mean_token_accuracy": 0.9079056233167648, "num_tokens": 1010326891.0, "sample_num_tokens": 8839.75, "step": 7723, "total_num_tokens": 1010362250.0, "z_loss": 0.0004342195170465857 }, { "copy_logits_max": -2.3009753227233887, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.75, "epoch": 1.5775848863926476, "gen_logits_max": 4.855257034301758, "gen_logits_mean": -14.846620559692383, "gen_logits_min": -27.59796714782715, "gen_logits_std": 3.213109254837036, "gen_loss": 0.30355799198150635, "grad_norm": 0.38286572786478523, "learning_rate": 2.1173052631578947e-05, "loss": 0.291, "mean_copy_accuracy": 0.9962291270494461, "mean_gen_accuracy": 0.8737108707427979, "mean_token_accuracy": 0.9014305770397186, "num_tokens": 1010588829.0, "sample_num_tokens": 6657.75, "step": 7724, "total_num_tokens": 1010615460.0, "z_loss": 0.0004764373879879713 }, { "copy_logits_max": -1.8013468980789185, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.8125, "epoch": 1.5777891243298443, "gen_logits_max": 3.604616641998291, "gen_logits_mean": -15.860729217529297, "gen_logits_min": -28.093154907226562, "gen_logits_std": 3.294473171234131, "gen_loss": 0.2529543340206146, "grad_norm": 0.39905472461897085, "learning_rate": 2.117178947368421e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9967232793569565, "mean_gen_accuracy": 0.8790459334850311, "mean_token_accuracy": 0.9120412915945053, "num_tokens": 1010858751.0, "sample_num_tokens": 8187.75, "step": 7725, "total_num_tokens": 1010891502.0, "z_loss": 0.00040146696846932173 }, { "copy_logits_max": -3.903501510620117, "copy_logits_min": -750000064.0, "copy_num_tokens": 323.0625, "epoch": 1.577993362267041, "gen_logits_max": 3.3610262870788574, "gen_logits_mean": -18.216323852539062, "gen_logits_min": -30.198793411254883, "gen_logits_std": 3.309015989303589, "gen_loss": 0.3022022545337677, "grad_norm": 0.34163118251450425, "learning_rate": 2.1170526315789476e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9977896958589554, "mean_gen_accuracy": 0.8719200044870377, "mean_token_accuracy": 0.9042966812849045, "num_tokens": 1011159291.0, "sample_num_tokens": 7732.25, "step": 7726, "total_num_tokens": 1011190220.0, "z_loss": 0.000451964617241174 }, { "copy_logits_max": -0.5489746332168579, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.75, "epoch": 1.578197600204238, "gen_logits_max": 3.1463851928710938, "gen_logits_mean": -16.821794509887695, "gen_logits_min": -29.395423889160156, "gen_logits_std": 3.3122031688690186, "gen_loss": 0.2785906195640564, "grad_norm": 0.41351564139048896, "learning_rate": 2.1169263157894737e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9969772547483444, "mean_gen_accuracy": 0.8724643886089325, "mean_token_accuracy": 0.9003738462924957, "num_tokens": 1011416992.0, "sample_num_tokens": 8180.0, "step": 7727, "total_num_tokens": 1011449712.0, "z_loss": 0.00048498273827135563 }, { "copy_logits_max": -2.3333144187927246, "copy_logits_min": -625000064.0, "copy_num_tokens": 430.625, "epoch": 1.5784018381414349, "gen_logits_max": 2.988150119781494, "gen_logits_mean": -17.661802291870117, "gen_logits_min": -29.801820755004883, "gen_logits_std": 3.312905788421631, "gen_loss": 0.29745230078697205, "grad_norm": 0.39579610018692823, "learning_rate": 2.1168e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9970435202121735, "mean_gen_accuracy": 0.8658504635095596, "mean_token_accuracy": 0.9001745581626892, "num_tokens": 1011672145.0, "sample_num_tokens": 8483.25, "step": 7728, "total_num_tokens": 1011706078.0, "z_loss": 0.0004753965185955167 }, { "copy_logits_max": 2.135037899017334, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.25, "epoch": 1.5786060760786316, "gen_logits_max": 3.839726209640503, "gen_logits_mean": -14.973878860473633, "gen_logits_min": -27.58312225341797, "gen_logits_std": 3.2412595748901367, "gen_loss": 0.2561669945716858, "grad_norm": 0.3646709208461234, "learning_rate": 2.1166736842105262e-05, "loss": 0.2731, "mean_copy_accuracy": 0.996047243475914, "mean_gen_accuracy": 0.8750844895839691, "mean_token_accuracy": 0.9063218981027603, "num_tokens": 1011942949.0, "sample_num_tokens": 8373.25, "step": 7729, "total_num_tokens": 1011976442.0, "z_loss": 0.00042513792868703604 }, { "copy_logits_max": -1.332282304763794, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.5625, "epoch": 1.5788103140158283, "gen_logits_max": 2.965129852294922, "gen_logits_mean": -16.941307067871094, "gen_logits_min": -29.57451629638672, "gen_logits_std": 3.2871756553649902, "gen_loss": 0.2875373065471649, "grad_norm": 0.35179973661789765, "learning_rate": 2.1165473684210527e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9965071976184845, "mean_gen_accuracy": 0.8712881058454514, "mean_token_accuracy": 0.9059966504573822, "num_tokens": 1012235542.0, "sample_num_tokens": 7986.0, "step": 7730, "total_num_tokens": 1012267486.0, "z_loss": 0.0005102276336401701 }, { "copy_logits_max": -1.386272668838501, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.0625, "epoch": 1.5790145519530254, "gen_logits_max": 2.642611026763916, "gen_logits_mean": -17.581464767456055, "gen_logits_min": -29.712867736816406, "gen_logits_std": 3.3120408058166504, "gen_loss": 0.2380886822938919, "grad_norm": 0.3567273772842217, "learning_rate": 2.116421052631579e-05, "loss": 0.2572, "mean_copy_accuracy": 0.997496172785759, "mean_gen_accuracy": 0.8787715435028076, "mean_token_accuracy": 0.9115164279937744, "num_tokens": 1012503490.0, "sample_num_tokens": 8492.0, "step": 7731, "total_num_tokens": 1012537458.0, "z_loss": 0.00039753445889800787 }, { "copy_logits_max": -4.142780303955078, "copy_logits_min": -687500032.0, "copy_num_tokens": 288.875, "epoch": 1.5792187898902221, "gen_logits_max": 4.267512321472168, "gen_logits_mean": -17.355348587036133, "gen_logits_min": -29.1966552734375, "gen_logits_std": 3.265613555908203, "gen_loss": 0.30462658405303955, "grad_norm": 0.32490083687942006, "learning_rate": 2.1162947368421052e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9967000335454941, "mean_gen_accuracy": 0.8820592910051346, "mean_token_accuracy": 0.9070094376802444, "num_tokens": 1012773052.0, "sample_num_tokens": 7731.0, "step": 7732, "total_num_tokens": 1012803976.0, "z_loss": 0.0005086923483759165 }, { "copy_logits_max": 1.2141238451004028, "copy_logits_min": -625000000.0, "copy_num_tokens": 641.6875, "epoch": 1.5794230278274188, "gen_logits_max": 4.30092716217041, "gen_logits_mean": -15.297286987304688, "gen_logits_min": -27.49755859375, "gen_logits_std": 3.238619804382324, "gen_loss": 0.27387285232543945, "grad_norm": 0.3529136860828398, "learning_rate": 2.1161684210526316e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9981808513402939, "mean_gen_accuracy": 0.8701912611722946, "mean_token_accuracy": 0.910143107175827, "num_tokens": 1013058566.0, "sample_num_tokens": 8798.0, "step": 7733, "total_num_tokens": 1013093758.0, "z_loss": 0.0004335944540798664 }, { "copy_logits_max": -1.646995186805725, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.125, "epoch": 1.5796272657646158, "gen_logits_max": 4.739309310913086, "gen_logits_mean": -16.64457893371582, "gen_logits_min": -28.70901107788086, "gen_logits_std": 3.2348713874816895, "gen_loss": 0.3102189302444458, "grad_norm": 0.35599959219075766, "learning_rate": 2.116042105263158e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9968644380569458, "mean_gen_accuracy": 0.8705431967973709, "mean_token_accuracy": 0.8981761485338211, "num_tokens": 1013324722.0, "sample_num_tokens": 7042.0, "step": 7734, "total_num_tokens": 1013352890.0, "z_loss": 0.00048231729306280613 }, { "copy_logits_max": -2.4054341316223145, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.9375, "epoch": 1.5798315037018127, "gen_logits_max": 3.6388187408447266, "gen_logits_mean": -17.15049171447754, "gen_logits_min": -30.560279846191406, "gen_logits_std": 3.313953399658203, "gen_loss": 0.26436519622802734, "grad_norm": 0.37515526165626056, "learning_rate": 2.1159157894736845e-05, "loss": 0.269, "mean_copy_accuracy": 0.9962028115987778, "mean_gen_accuracy": 0.8808142989873886, "mean_token_accuracy": 0.9081908017396927, "num_tokens": 1013588981.0, "sample_num_tokens": 9033.75, "step": 7735, "total_num_tokens": 1013625116.0, "z_loss": 0.0004413780989125371 }, { "copy_logits_max": -1.3913350105285645, "copy_logits_min": -750000000.0, "copy_num_tokens": 248.5625, "epoch": 1.5800357416390094, "gen_logits_max": 5.272177219390869, "gen_logits_mean": -15.55598258972168, "gen_logits_min": -28.14330291748047, "gen_logits_std": 3.1997880935668945, "gen_loss": 0.3077128529548645, "grad_norm": 0.3482218085642363, "learning_rate": 2.1157894736842106e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9959362894296646, "mean_gen_accuracy": 0.8820488452911377, "mean_token_accuracy": 0.9065618366003036, "num_tokens": 1013871901.0, "sample_num_tokens": 7124.75, "step": 7736, "total_num_tokens": 1013900400.0, "z_loss": 0.0005074756336398423 }, { "copy_logits_max": -2.5581064224243164, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.5625, "epoch": 1.5802399795762063, "gen_logits_max": 3.999831438064575, "gen_logits_mean": -16.439483642578125, "gen_logits_min": -28.836524963378906, "gen_logits_std": 3.2780039310455322, "gen_loss": 0.25272274017333984, "grad_norm": 0.3604080088556755, "learning_rate": 2.115663157894737e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9959451109170914, "mean_gen_accuracy": 0.8845312148332596, "mean_token_accuracy": 0.9115496128797531, "num_tokens": 1014126903.0, "sample_num_tokens": 9216.75, "step": 7737, "total_num_tokens": 1014163770.0, "z_loss": 0.0004083531384821981 }, { "copy_logits_max": -2.7137866020202637, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.5625, "epoch": 1.5804442175134032, "gen_logits_max": 4.708772659301758, "gen_logits_mean": -15.953564643859863, "gen_logits_min": -28.498737335205078, "gen_logits_std": 3.248298406600952, "gen_loss": 0.26592373847961426, "grad_norm": 0.3251594163739188, "learning_rate": 2.115536842105263e-05, "loss": 0.2602, "mean_copy_accuracy": 0.9969080090522766, "mean_gen_accuracy": 0.8815846592187881, "mean_token_accuracy": 0.9144637584686279, "num_tokens": 1014434425.0, "sample_num_tokens": 9385.75, "step": 7738, "total_num_tokens": 1014471968.0, "z_loss": 0.00045043445425108075 }, { "copy_logits_max": -2.4948248863220215, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.6875, "epoch": 1.5806484554506, "gen_logits_max": 4.839691162109375, "gen_logits_mean": -15.548684120178223, "gen_logits_min": -27.620529174804688, "gen_logits_std": 3.2632391452789307, "gen_loss": 0.24378634989261627, "grad_norm": 0.33504821086792186, "learning_rate": 2.1154105263157895e-05, "loss": 0.2516, "mean_copy_accuracy": 0.9968009889125824, "mean_gen_accuracy": 0.8836163133382797, "mean_token_accuracy": 0.9144227206707001, "num_tokens": 1014720771.0, "sample_num_tokens": 9249.75, "step": 7739, "total_num_tokens": 1014757770.0, "z_loss": 0.00039745529647916555 }, { "copy_logits_max": -0.8156642317771912, "copy_logits_min": -687500032.0, "copy_num_tokens": 511.0625, "epoch": 1.5808526933877967, "gen_logits_max": 4.256730556488037, "gen_logits_mean": -15.945182800292969, "gen_logits_min": -28.323118209838867, "gen_logits_std": 3.261322498321533, "gen_loss": 0.29316556453704834, "grad_norm": 0.3531528368120045, "learning_rate": 2.1152842105263156e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9963901937007904, "mean_gen_accuracy": 0.8719204217195511, "mean_token_accuracy": 0.9016954004764557, "num_tokens": 1014979302.0, "sample_num_tokens": 8122.0, "step": 7740, "total_num_tokens": 1015011790.0, "z_loss": 0.0004620902764145285 }, { "copy_logits_max": -2.0589399337768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 616.25, "epoch": 1.5810569313249936, "gen_logits_max": 4.2435150146484375, "gen_logits_mean": -16.374412536621094, "gen_logits_min": -28.603557586669922, "gen_logits_std": 3.296766757965088, "gen_loss": 0.2618476152420044, "grad_norm": 0.384958860937921, "learning_rate": 2.115157894736842e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9966280311346054, "mean_gen_accuracy": 0.8793171644210815, "mean_token_accuracy": 0.9098531901836395, "num_tokens": 1015246512.0, "sample_num_tokens": 9210.0, "step": 7741, "total_num_tokens": 1015283352.0, "z_loss": 0.0004114834882784635 }, { "copy_logits_max": -3.506523609161377, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.3125, "epoch": 1.5812611692621905, "gen_logits_max": 4.1654953956604, "gen_logits_mean": -16.79742431640625, "gen_logits_min": -28.46785545349121, "gen_logits_std": 3.281966209411621, "gen_loss": 0.2840203642845154, "grad_norm": 0.31110427274226865, "learning_rate": 2.1150315789473685e-05, "loss": 0.2494, "mean_copy_accuracy": 0.9976935684680939, "mean_gen_accuracy": 0.8849514424800873, "mean_token_accuracy": 0.9160634726285934, "num_tokens": 1015563174.0, "sample_num_tokens": 8639.0, "step": 7742, "total_num_tokens": 1015597730.0, "z_loss": 0.0004510791040956974 }, { "copy_logits_max": -1.7284820079803467, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.1875, "epoch": 1.5814654071993872, "gen_logits_max": 4.821359157562256, "gen_logits_mean": -16.38182830810547, "gen_logits_min": -28.23141098022461, "gen_logits_std": 3.286317825317383, "gen_loss": 0.27754443883895874, "grad_norm": 0.34063513669096535, "learning_rate": 2.114905263157895e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9972136318683624, "mean_gen_accuracy": 0.8732602894306183, "mean_token_accuracy": 0.9065007418394089, "num_tokens": 1015855586.0, "sample_num_tokens": 8284.0, "step": 7743, "total_num_tokens": 1015888722.0, "z_loss": 0.0004387627705000341 }, { "copy_logits_max": -1.9203393459320068, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.25, "epoch": 1.5816696451365841, "gen_logits_max": 4.13718318939209, "gen_logits_mean": -16.936019897460938, "gen_logits_min": -28.883228302001953, "gen_logits_std": 3.2814688682556152, "gen_loss": 0.27752920985221863, "grad_norm": 0.3675746976894503, "learning_rate": 2.1147789473684213e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9967758655548096, "mean_gen_accuracy": 0.8794855326414108, "mean_token_accuracy": 0.9085719585418701, "num_tokens": 1016127523.0, "sample_num_tokens": 8421.75, "step": 7744, "total_num_tokens": 1016161210.0, "z_loss": 0.00043303772690705955 }, { "copy_logits_max": -5.457285404205322, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.9375, "epoch": 1.581873883073781, "gen_logits_max": 4.545855522155762, "gen_logits_mean": -17.19937515258789, "gen_logits_min": -29.227455139160156, "gen_logits_std": 3.3069067001342773, "gen_loss": 0.28712767362594604, "grad_norm": 0.3481641396368155, "learning_rate": 2.1146526315789474e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9970060139894485, "mean_gen_accuracy": 0.8808423429727554, "mean_token_accuracy": 0.9087963998317719, "num_tokens": 1016405255.0, "sample_num_tokens": 9859.75, "step": 7745, "total_num_tokens": 1016444694.0, "z_loss": 0.0004648217000067234 }, { "copy_logits_max": -0.78743577003479, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.5, "epoch": 1.5820781210109778, "gen_logits_max": 5.717873573303223, "gen_logits_mean": -14.226325988769531, "gen_logits_min": -26.563262939453125, "gen_logits_std": 3.251784086227417, "gen_loss": 0.293270468711853, "grad_norm": 0.47662953877593095, "learning_rate": 2.114526315789474e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9965765178203583, "mean_gen_accuracy": 0.8702462464570999, "mean_token_accuracy": 0.8999584764242172, "num_tokens": 1016655165.0, "sample_num_tokens": 7244.25, "step": 7746, "total_num_tokens": 1016684142.0, "z_loss": 0.00047991826431825757 }, { "copy_logits_max": 1.0938420295715332, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.75, "epoch": 1.5822823589481745, "gen_logits_max": 4.367047309875488, "gen_logits_mean": -14.598682403564453, "gen_logits_min": -27.978940963745117, "gen_logits_std": 3.2997002601623535, "gen_loss": 0.21213245391845703, "grad_norm": 0.3726038635452554, "learning_rate": 2.1144e-05, "loss": 0.267, "mean_copy_accuracy": 0.9972973763942719, "mean_gen_accuracy": 0.8819993585348129, "mean_token_accuracy": 0.911872461438179, "num_tokens": 1016926949.0, "sample_num_tokens": 8997.25, "step": 7747, "total_num_tokens": 1016962938.0, "z_loss": 0.0003803693689405918 }, { "copy_logits_max": -3.640089750289917, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.9375, "epoch": 1.5824865968853714, "gen_logits_max": 4.674374580383301, "gen_logits_mean": -16.192808151245117, "gen_logits_min": -28.417861938476562, "gen_logits_std": 3.270857095718384, "gen_loss": 0.2797779440879822, "grad_norm": 0.36700636857523017, "learning_rate": 2.1142736842105264e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9963530600070953, "mean_gen_accuracy": 0.8739316463470459, "mean_token_accuracy": 0.9023184329271317, "num_tokens": 1017180398.0, "sample_num_tokens": 6482.0, "step": 7748, "total_num_tokens": 1017206326.0, "z_loss": 0.0004773699911311269 }, { "copy_logits_max": -3.1394433975219727, "copy_logits_min": -750000064.0, "copy_num_tokens": 564.0, "epoch": 1.5826908348225683, "gen_logits_max": 4.58030891418457, "gen_logits_mean": -16.344072341918945, "gen_logits_min": -28.543128967285156, "gen_logits_std": 3.2938623428344727, "gen_loss": 0.2549036741256714, "grad_norm": 0.36653269059424515, "learning_rate": 2.1141473684210525e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9974993765354156, "mean_gen_accuracy": 0.8717703074216843, "mean_token_accuracy": 0.9076938927173615, "num_tokens": 1017461906.0, "sample_num_tokens": 9561.5, "step": 7749, "total_num_tokens": 1017500152.0, "z_loss": 0.0004296275437809527 }, { "copy_logits_max": -2.379856824874878, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.4375, "epoch": 1.582895072759765, "gen_logits_max": 3.4790120124816895, "gen_logits_mean": -17.451784133911133, "gen_logits_min": -29.982154846191406, "gen_logits_std": 3.318328857421875, "gen_loss": 0.3134725093841553, "grad_norm": 0.33370181359585976, "learning_rate": 2.1140210526315792e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9973937571048737, "mean_gen_accuracy": 0.8777111768722534, "mean_token_accuracy": 0.9090858846902847, "num_tokens": 1017752080.0, "sample_num_tokens": 7537.0, "step": 7750, "total_num_tokens": 1017782228.0, "z_loss": 0.0004731641965918243 }, { "copy_logits_max": -2.566032648086548, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.0625, "epoch": 1.583099310696962, "gen_logits_max": 1.9305307865142822, "gen_logits_mean": -18.85953140258789, "gen_logits_min": -31.30320930480957, "gen_logits_std": 3.380258083343506, "gen_loss": 0.2847641706466675, "grad_norm": 0.3474843000100203, "learning_rate": 2.1138947368421053e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9965213388204575, "mean_gen_accuracy": 0.8818527013063431, "mean_token_accuracy": 0.9124553948640823, "num_tokens": 1018007813.0, "sample_num_tokens": 9207.25, "step": 7751, "total_num_tokens": 1018044642.0, "z_loss": 0.0004969386500306427 }, { "copy_logits_max": -2.466214418411255, "copy_logits_min": -750000064.0, "copy_num_tokens": 371.5, "epoch": 1.583303548634159, "gen_logits_max": 4.493636608123779, "gen_logits_mean": -16.521343231201172, "gen_logits_min": -28.311119079589844, "gen_logits_std": 3.2788376808166504, "gen_loss": 0.2854016423225403, "grad_norm": 0.3463234997473329, "learning_rate": 2.1137684210526318e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9975353628396988, "mean_gen_accuracy": 0.8725629150867462, "mean_token_accuracy": 0.907359704375267, "num_tokens": 1018302253.0, "sample_num_tokens": 7540.75, "step": 7752, "total_num_tokens": 1018332416.0, "z_loss": 0.00045539639540947974 }, { "copy_logits_max": -2.8091378211975098, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.6875, "epoch": 1.5835077865713556, "gen_logits_max": 5.065173149108887, "gen_logits_mean": -16.138774871826172, "gen_logits_min": -28.042936325073242, "gen_logits_std": 3.243201971054077, "gen_loss": 0.30209076404571533, "grad_norm": 0.37212108109439435, "learning_rate": 2.113642105263158e-05, "loss": 0.3079, "mean_copy_accuracy": 0.996550053358078, "mean_gen_accuracy": 0.8670691698789597, "mean_token_accuracy": 0.895216777920723, "num_tokens": 1018567367.0, "sample_num_tokens": 8993.25, "step": 7753, "total_num_tokens": 1018603340.0, "z_loss": 0.000525360053870827 }, { "copy_logits_max": -3.896287441253662, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.875, "epoch": 1.5837120245085523, "gen_logits_max": 4.520636558532715, "gen_logits_mean": -16.31201171875, "gen_logits_min": -28.2584171295166, "gen_logits_std": 3.2532575130462646, "gen_loss": 0.289508581161499, "grad_norm": 0.36799837129224394, "learning_rate": 2.1135157894736843e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9973612278699875, "mean_gen_accuracy": 0.8755378127098083, "mean_token_accuracy": 0.9070944041013718, "num_tokens": 1018854423.0, "sample_num_tokens": 9951.75, "step": 7754, "total_num_tokens": 1018894230.0, "z_loss": 0.0005135799874551594 }, { "copy_logits_max": 0.19884701073169708, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.125, "epoch": 1.5839162624457495, "gen_logits_max": 5.31231689453125, "gen_logits_mean": -13.815153121948242, "gen_logits_min": -26.332138061523438, "gen_logits_std": 3.21746563911438, "gen_loss": 0.2583003044128418, "grad_norm": 0.3562879403618982, "learning_rate": 2.1133894736842104e-05, "loss": 0.2483, "mean_copy_accuracy": 0.9976578801870346, "mean_gen_accuracy": 0.8860528469085693, "mean_token_accuracy": 0.9174413979053497, "num_tokens": 1019134481.0, "sample_num_tokens": 8418.75, "step": 7755, "total_num_tokens": 1019168156.0, "z_loss": 0.0004532322636805475 }, { "copy_logits_max": -4.23671293258667, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.125, "epoch": 1.5841205003829462, "gen_logits_max": 3.4040064811706543, "gen_logits_mean": -18.305147171020508, "gen_logits_min": -30.27898406982422, "gen_logits_std": 3.3278589248657227, "gen_loss": 0.28184449672698975, "grad_norm": 0.4313182270596899, "learning_rate": 2.1132631578947368e-05, "loss": 0.287, "mean_copy_accuracy": 0.9968579709529877, "mean_gen_accuracy": 0.8775347769260406, "mean_token_accuracy": 0.9044363051652908, "num_tokens": 1019391621.0, "sample_num_tokens": 7360.25, "step": 7756, "total_num_tokens": 1019421062.0, "z_loss": 0.00048157377750612795 }, { "copy_logits_max": -1.7045886516571045, "copy_logits_min": -625000064.0, "copy_num_tokens": 470.6875, "epoch": 1.5843247383201429, "gen_logits_max": 3.9792284965515137, "gen_logits_mean": -16.815475463867188, "gen_logits_min": -29.000715255737305, "gen_logits_std": 3.2702927589416504, "gen_loss": 0.30362361669540405, "grad_norm": 0.36786454994991424, "learning_rate": 2.1131368421052632e-05, "loss": 0.2931, "mean_copy_accuracy": 0.9970917701721191, "mean_gen_accuracy": 0.8720249384641647, "mean_token_accuracy": 0.9024278074502945, "num_tokens": 1019655544.0, "sample_num_tokens": 8719.5, "step": 7757, "total_num_tokens": 1019690422.0, "z_loss": 0.0005236900178715587 }, { "copy_logits_max": -4.947368621826172, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.0625, "epoch": 1.5845289762573398, "gen_logits_max": 3.6097073554992676, "gen_logits_mean": -17.763681411743164, "gen_logits_min": -29.536584854125977, "gen_logits_std": 3.296441078186035, "gen_loss": 0.282935231924057, "grad_norm": 0.35551367163577274, "learning_rate": 2.1130105263157893e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9961842745542526, "mean_gen_accuracy": 0.8779332488775253, "mean_token_accuracy": 0.907668948173523, "num_tokens": 1019932576.0, "sample_num_tokens": 9169.0, "step": 7758, "total_num_tokens": 1019969252.0, "z_loss": 0.00047130760503932834 }, { "copy_logits_max": -0.8346812725067139, "copy_logits_min": -750000000.0, "copy_num_tokens": 851.4375, "epoch": 1.5847332141945367, "gen_logits_max": 4.974465370178223, "gen_logits_mean": -13.778478622436523, "gen_logits_min": -26.03746223449707, "gen_logits_std": 3.2192859649658203, "gen_loss": 0.23819667100906372, "grad_norm": 0.3391432684057643, "learning_rate": 2.112884210526316e-05, "loss": 0.2623, "mean_copy_accuracy": 0.9972376972436905, "mean_gen_accuracy": 0.8808828890323639, "mean_token_accuracy": 0.9121781438589096, "num_tokens": 1020219495.0, "sample_num_tokens": 10501.75, "step": 7759, "total_num_tokens": 1020261502.0, "z_loss": 0.0004012797144241631 }, { "copy_logits_max": -3.414996385574341, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.8125, "epoch": 1.5849374521317334, "gen_logits_max": 4.620998859405518, "gen_logits_mean": -15.714077949523926, "gen_logits_min": -28.239675521850586, "gen_logits_std": 3.256957530975342, "gen_loss": 0.2753991484642029, "grad_norm": 0.34603800956399683, "learning_rate": 2.1127578947368422e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9968995302915573, "mean_gen_accuracy": 0.8762913942337036, "mean_token_accuracy": 0.9042837172746658, "num_tokens": 1020487976.0, "sample_num_tokens": 7634.5, "step": 7760, "total_num_tokens": 1020518514.0, "z_loss": 0.00045421707909554243 }, { "copy_logits_max": -2.695457935333252, "copy_logits_min": -687500032.0, "copy_num_tokens": 325.0, "epoch": 1.5851416900689304, "gen_logits_max": 4.631041526794434, "gen_logits_mean": -16.11258316040039, "gen_logits_min": -28.406330108642578, "gen_logits_std": 3.289938449859619, "gen_loss": 0.2619706094264984, "grad_norm": 0.33351731910462157, "learning_rate": 2.1126315789473686e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9965929090976715, "mean_gen_accuracy": 0.8805305063724518, "mean_token_accuracy": 0.906040608882904, "num_tokens": 1020755848.0, "sample_num_tokens": 7643.5, "step": 7761, "total_num_tokens": 1020786422.0, "z_loss": 0.000379164848709479 }, { "copy_logits_max": -7.29874849319458, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.0625, "epoch": 1.5853459280061273, "gen_logits_max": 2.492654800415039, "gen_logits_mean": -19.82350730895996, "gen_logits_min": -31.70041847229004, "gen_logits_std": 3.3744561672210693, "gen_loss": 0.25603049993515015, "grad_norm": 0.35370375794976766, "learning_rate": 2.1125052631578947e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9963819086551666, "mean_gen_accuracy": 0.8786502033472061, "mean_token_accuracy": 0.9066754430532455, "num_tokens": 1021009057.0, "sample_num_tokens": 6841.25, "step": 7762, "total_num_tokens": 1021036422.0, "z_loss": 0.000404143997002393 }, { "copy_logits_max": -4.292122840881348, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.0625, "epoch": 1.585550165943324, "gen_logits_max": 4.383996963500977, "gen_logits_mean": -14.933292388916016, "gen_logits_min": -27.368167877197266, "gen_logits_std": 3.2178914546966553, "gen_loss": 0.28260624408721924, "grad_norm": 0.34270749330288436, "learning_rate": 2.112378947368421e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9976379424333572, "mean_gen_accuracy": 0.8779258877038956, "mean_token_accuracy": 0.9079436659812927, "num_tokens": 1021283199.0, "sample_num_tokens": 9489.25, "step": 7763, "total_num_tokens": 1021321156.0, "z_loss": 0.0004756545531563461 }, { "copy_logits_max": -5.0750412940979, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.5, "epoch": 1.5857544038805207, "gen_logits_max": 4.528800964355469, "gen_logits_mean": -14.902570724487305, "gen_logits_min": -26.963886260986328, "gen_logits_std": 3.1910147666931152, "gen_loss": 0.3076339662075043, "grad_norm": 0.34526884776283717, "learning_rate": 2.1122526315789472e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9961510747671127, "mean_gen_accuracy": 0.8725265562534332, "mean_token_accuracy": 0.9031635373830795, "num_tokens": 1021560415.0, "sample_num_tokens": 8969.25, "step": 7764, "total_num_tokens": 1021596292.0, "z_loss": 0.0004808488884009421 }, { "copy_logits_max": -3.5652122497558594, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.9375, "epoch": 1.5859586418177176, "gen_logits_max": 3.089644432067871, "gen_logits_mean": -17.79188346862793, "gen_logits_min": -30.37508773803711, "gen_logits_std": 3.3234565258026123, "gen_loss": 0.2547953128814697, "grad_norm": 0.3606908551972163, "learning_rate": 2.1121263157894737e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9967516213655472, "mean_gen_accuracy": 0.8824851363897324, "mean_token_accuracy": 0.9098507165908813, "num_tokens": 1021821010.0, "sample_num_tokens": 9106.0, "step": 7765, "total_num_tokens": 1021857434.0, "z_loss": 0.0004944245447404683 }, { "copy_logits_max": -3.6156654357910156, "copy_logits_min": -687500032.0, "copy_num_tokens": 559.5625, "epoch": 1.5861628797549145, "gen_logits_max": 2.9739181995391846, "gen_logits_mean": -17.148357391357422, "gen_logits_min": -29.48754119873047, "gen_logits_std": 3.291032552719116, "gen_loss": 0.2870904803276062, "grad_norm": 0.33321033330765465, "learning_rate": 2.1119999999999998e-05, "loss": 0.2753, "mean_copy_accuracy": 0.997130274772644, "mean_gen_accuracy": 0.8755032569169998, "mean_token_accuracy": 0.9078526794910431, "num_tokens": 1022094884.0, "sample_num_tokens": 8503.5, "step": 7766, "total_num_tokens": 1022128898.0, "z_loss": 0.0005665638018399477 }, { "copy_logits_max": -1.9933559894561768, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.4375, "epoch": 1.5863671176921113, "gen_logits_max": 5.383650779724121, "gen_logits_mean": -13.859559059143066, "gen_logits_min": -26.171327590942383, "gen_logits_std": 3.180386781692505, "gen_loss": 0.28310924768447876, "grad_norm": 0.34853236402388205, "learning_rate": 2.1118736842105265e-05, "loss": 0.272, "mean_copy_accuracy": 0.9969686567783356, "mean_gen_accuracy": 0.8771973848342896, "mean_token_accuracy": 0.9089198708534241, "num_tokens": 1022377993.0, "sample_num_tokens": 9355.25, "step": 7767, "total_num_tokens": 1022415414.0, "z_loss": 0.0005532689392566681 }, { "copy_logits_max": -3.5948286056518555, "copy_logits_min": -687500096.0, "copy_num_tokens": 429.5, "epoch": 1.5865713556293082, "gen_logits_max": 4.459782600402832, "gen_logits_mean": -15.359943389892578, "gen_logits_min": -27.307252883911133, "gen_logits_std": 3.2263360023498535, "gen_loss": 0.26094117760658264, "grad_norm": 0.3683764747627808, "learning_rate": 2.1117473684210526e-05, "loss": 0.2886, "mean_copy_accuracy": 0.995934784412384, "mean_gen_accuracy": 0.8791264742612839, "mean_token_accuracy": 0.9020243585109711, "num_tokens": 1022636114.0, "sample_num_tokens": 8697.0, "step": 7768, "total_num_tokens": 1022670902.0, "z_loss": 0.00043634066241793334 }, { "copy_logits_max": -4.268331527709961, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.8125, "epoch": 1.586775593566505, "gen_logits_max": 3.419898748397827, "gen_logits_mean": -16.704280853271484, "gen_logits_min": -28.561132431030273, "gen_logits_std": 3.247706174850464, "gen_loss": 0.3051975667476654, "grad_norm": 0.35102944210467407, "learning_rate": 2.111621052631579e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9967238456010818, "mean_gen_accuracy": 0.8759491145610809, "mean_token_accuracy": 0.9036400318145752, "num_tokens": 1022897549.0, "sample_num_tokens": 7932.75, "step": 7769, "total_num_tokens": 1022929280.0, "z_loss": 0.0005660838214680552 }, { "copy_logits_max": -1.4987826347351074, "copy_logits_min": -750000000.0, "copy_num_tokens": 721.5625, "epoch": 1.5869798315037018, "gen_logits_max": 3.427131414413452, "gen_logits_mean": -15.934540748596191, "gen_logits_min": -28.185945510864258, "gen_logits_std": 3.24251651763916, "gen_loss": 0.29967203736305237, "grad_norm": 0.36543845574965367, "learning_rate": 2.1114947368421055e-05, "loss": 0.2982, "mean_copy_accuracy": 0.9951480329036713, "mean_gen_accuracy": 0.8652157783508301, "mean_token_accuracy": 0.899139404296875, "num_tokens": 1023168655.0, "sample_num_tokens": 10449.75, "step": 7770, "total_num_tokens": 1023210454.0, "z_loss": 0.0005317428149282932 }, { "copy_logits_max": -2.9982714653015137, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.8125, "epoch": 1.5871840694408985, "gen_logits_max": 2.949579954147339, "gen_logits_mean": -17.353649139404297, "gen_logits_min": -29.69852638244629, "gen_logits_std": 3.308638095855713, "gen_loss": 0.23187434673309326, "grad_norm": 0.36858149254361144, "learning_rate": 2.1113684210526316e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9967943578958511, "mean_gen_accuracy": 0.8831358551979065, "mean_token_accuracy": 0.9098038673400879, "num_tokens": 1023446583.0, "sample_num_tokens": 8710.25, "step": 7771, "total_num_tokens": 1023481424.0, "z_loss": 0.0003876533592119813 }, { "copy_logits_max": -2.994699716567993, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.9375, "epoch": 1.5873883073780954, "gen_logits_max": 3.9254214763641357, "gen_logits_mean": -15.814332962036133, "gen_logits_min": -28.416990280151367, "gen_logits_std": 3.2503576278686523, "gen_loss": 0.23602798581123352, "grad_norm": 0.37452823694869525, "learning_rate": 2.111242105263158e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9976218044757843, "mean_gen_accuracy": 0.8775437027215958, "mean_token_accuracy": 0.9056007862091064, "num_tokens": 1023691135.0, "sample_num_tokens": 7415.25, "step": 7772, "total_num_tokens": 1023720796.0, "z_loss": 0.0003440030850470066 }, { "copy_logits_max": -4.773911476135254, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.0, "epoch": 1.5875925453152924, "gen_logits_max": 3.3578481674194336, "gen_logits_mean": -16.617136001586914, "gen_logits_min": -28.901321411132812, "gen_logits_std": 3.292773723602295, "gen_loss": 0.2821933329105377, "grad_norm": 0.3389531992979016, "learning_rate": 2.111115789473684e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9969639033079147, "mean_gen_accuracy": 0.8802580684423447, "mean_token_accuracy": 0.9123117625713348, "num_tokens": 1023967856.0, "sample_num_tokens": 10139.0, "step": 7773, "total_num_tokens": 1024008412.0, "z_loss": 0.00047157122753560543 }, { "copy_logits_max": -3.579094648361206, "copy_logits_min": -687500032.0, "copy_num_tokens": 634.1875, "epoch": 1.587796783252489, "gen_logits_max": 2.1889615058898926, "gen_logits_mean": -17.429134368896484, "gen_logits_min": -29.804595947265625, "gen_logits_std": 3.298365354537964, "gen_loss": 0.25758013129234314, "grad_norm": 0.3488653768591884, "learning_rate": 2.1109894736842105e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9964019954204559, "mean_gen_accuracy": 0.8756247013807297, "mean_token_accuracy": 0.9084367454051971, "num_tokens": 1024246797.0, "sample_num_tokens": 8866.25, "step": 7774, "total_num_tokens": 1024282262.0, "z_loss": 0.00037881472962908447 }, { "copy_logits_max": -3.739239454269409, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.6875, "epoch": 1.588001021189686, "gen_logits_max": 2.805689811706543, "gen_logits_mean": -18.224332809448242, "gen_logits_min": -30.306425094604492, "gen_logits_std": 3.3378114700317383, "gen_loss": 0.27753734588623047, "grad_norm": 0.34004081046726736, "learning_rate": 2.110863157894737e-05, "loss": 0.284, "mean_copy_accuracy": 0.9970946609973907, "mean_gen_accuracy": 0.8706978410482407, "mean_token_accuracy": 0.9041034132242203, "num_tokens": 1024525020.0, "sample_num_tokens": 8985.0, "step": 7775, "total_num_tokens": 1024560960.0, "z_loss": 0.00044212641660124063 }, { "copy_logits_max": -3.7034218311309814, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.25, "epoch": 1.588205259126883, "gen_logits_max": 3.439711093902588, "gen_logits_mean": -16.455224990844727, "gen_logits_min": -28.574703216552734, "gen_logits_std": 3.2523505687713623, "gen_loss": 0.27096545696258545, "grad_norm": 0.31622127486620444, "learning_rate": 2.1107368421052634e-05, "loss": 0.2622, "mean_copy_accuracy": 0.9971170723438263, "mean_gen_accuracy": 0.881145253777504, "mean_token_accuracy": 0.9115867763757706, "num_tokens": 1024816558.0, "sample_num_tokens": 8245.5, "step": 7776, "total_num_tokens": 1024849540.0, "z_loss": 0.0004061130457557738 }, { "copy_logits_max": -2.701658248901367, "copy_logits_min": -687500032.0, "copy_num_tokens": 623.4375, "epoch": 1.5884094970640796, "gen_logits_max": 2.7406415939331055, "gen_logits_mean": -17.26587677001953, "gen_logits_min": -29.686037063598633, "gen_logits_std": 3.3237922191619873, "gen_loss": 0.23711904883384705, "grad_norm": 0.3618692790773395, "learning_rate": 2.1106105263157895e-05, "loss": 0.2491, "mean_copy_accuracy": 0.9972833395004272, "mean_gen_accuracy": 0.8849906623363495, "mean_token_accuracy": 0.9140627533197403, "num_tokens": 1025062475.0, "sample_num_tokens": 9763.75, "step": 7777, "total_num_tokens": 1025101530.0, "z_loss": 0.00045431876787915826 }, { "copy_logits_max": -3.4776859283447266, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5, "epoch": 1.5886137350012763, "gen_logits_max": 3.8349058628082275, "gen_logits_mean": -16.936233520507812, "gen_logits_min": -29.242691040039062, "gen_logits_std": 3.2800540924072266, "gen_loss": 0.31598055362701416, "grad_norm": 0.38424351672513296, "learning_rate": 2.110484210526316e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9970943480730057, "mean_gen_accuracy": 0.8733604848384857, "mean_token_accuracy": 0.9015583992004395, "num_tokens": 1025327836.0, "sample_num_tokens": 8645.0, "step": 7778, "total_num_tokens": 1025362416.0, "z_loss": 0.0004848914104513824 }, { "copy_logits_max": -3.9183878898620605, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.0, "epoch": 1.5888179729384733, "gen_logits_max": 4.144277572631836, "gen_logits_mean": -15.653474807739258, "gen_logits_min": -28.102245330810547, "gen_logits_std": 3.21876859664917, "gen_loss": 0.2661190629005432, "grad_norm": 0.3354935921243657, "learning_rate": 2.110357894736842e-05, "loss": 0.2561, "mean_copy_accuracy": 0.997251033782959, "mean_gen_accuracy": 0.881340354681015, "mean_token_accuracy": 0.9127338081598282, "num_tokens": 1025607656.0, "sample_num_tokens": 7568.0, "step": 7779, "total_num_tokens": 1025637928.0, "z_loss": 0.0004342385218478739 }, { "copy_logits_max": -4.166231632232666, "copy_logits_min": -750000064.0, "copy_num_tokens": 686.1875, "epoch": 1.5890222108756702, "gen_logits_max": 2.394911766052246, "gen_logits_mean": -17.618797302246094, "gen_logits_min": -30.058963775634766, "gen_logits_std": 3.3214199542999268, "gen_loss": 0.2478516548871994, "grad_norm": 0.37783337577342424, "learning_rate": 2.1102315789473684e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9958191365003586, "mean_gen_accuracy": 0.8773976713418961, "mean_token_accuracy": 0.9071027785539627, "num_tokens": 1025879036.0, "sample_num_tokens": 9917.0, "step": 7780, "total_num_tokens": 1025918704.0, "z_loss": 0.00043780740816146135 }, { "copy_logits_max": -2.965712308883667, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.0625, "epoch": 1.589226448812867, "gen_logits_max": 3.7915737628936768, "gen_logits_mean": -16.783010482788086, "gen_logits_min": -28.81938934326172, "gen_logits_std": 3.274392604827881, "gen_loss": 0.2914744019508362, "grad_norm": 0.3601631101824341, "learning_rate": 2.1101052631578945e-05, "loss": 0.2752, "mean_copy_accuracy": 0.995644599199295, "mean_gen_accuracy": 0.8795891106128693, "mean_token_accuracy": 0.9059388786554337, "num_tokens": 1026140937.0, "sample_num_tokens": 7646.75, "step": 7781, "total_num_tokens": 1026171524.0, "z_loss": 0.00045260594924911857 }, { "copy_logits_max": -3.8557305335998535, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.0625, "epoch": 1.5894306867500638, "gen_logits_max": 4.481997489929199, "gen_logits_mean": -16.674755096435547, "gen_logits_min": -28.662273406982422, "gen_logits_std": 3.292459011077881, "gen_loss": 0.2677646279335022, "grad_norm": 0.3477091151638683, "learning_rate": 2.109978947368421e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9963170737028122, "mean_gen_accuracy": 0.8839493244886398, "mean_token_accuracy": 0.9107868671417236, "num_tokens": 1026388992.0, "sample_num_tokens": 6812.0, "step": 7782, "total_num_tokens": 1026416240.0, "z_loss": 0.0004238500550854951 }, { "copy_logits_max": -1.2546303272247314, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.0, "epoch": 1.5896349246872608, "gen_logits_max": 4.826982498168945, "gen_logits_mean": -15.076562881469727, "gen_logits_min": -27.011930465698242, "gen_logits_std": 3.1836206912994385, "gen_loss": 0.34127572178840637, "grad_norm": 0.35850496093667683, "learning_rate": 2.1098526315789477e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9969430565834045, "mean_gen_accuracy": 0.8705691546201706, "mean_token_accuracy": 0.904892310500145, "num_tokens": 1026649250.0, "sample_num_tokens": 8128.0, "step": 7783, "total_num_tokens": 1026681762.0, "z_loss": 0.0006061510066501796 }, { "copy_logits_max": -2.4459359645843506, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.0, "epoch": 1.5898391626244575, "gen_logits_max": 2.1840977668762207, "gen_logits_mean": -18.37915802001953, "gen_logits_min": -30.33774185180664, "gen_logits_std": 3.344250202178955, "gen_loss": 0.2533678412437439, "grad_norm": 0.3579585874616132, "learning_rate": 2.109726315789474e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9975186735391617, "mean_gen_accuracy": 0.8715836852788925, "mean_token_accuracy": 0.9065759629011154, "num_tokens": 1026933224.0, "sample_num_tokens": 9114.5, "step": 7784, "total_num_tokens": 1026969682.0, "z_loss": 0.00042397226206958294 }, { "copy_logits_max": -4.057472229003906, "copy_logits_min": -750000064.0, "copy_num_tokens": 540.375, "epoch": 1.5900434005616542, "gen_logits_max": 3.1127359867095947, "gen_logits_mean": -17.190719604492188, "gen_logits_min": -29.545242309570312, "gen_logits_std": 3.284618377685547, "gen_loss": 0.26673150062561035, "grad_norm": 0.35799824071469405, "learning_rate": 2.1096000000000003e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9967814385890961, "mean_gen_accuracy": 0.8791379928588867, "mean_token_accuracy": 0.9063723385334015, "num_tokens": 1027193555.0, "sample_num_tokens": 8647.25, "step": 7785, "total_num_tokens": 1027228144.0, "z_loss": 0.0004957690834999084 }, { "copy_logits_max": -5.783337116241455, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.8125, "epoch": 1.5902476384988513, "gen_logits_max": 3.532212257385254, "gen_logits_mean": -17.339679718017578, "gen_logits_min": -28.993736267089844, "gen_logits_std": 3.2650644779205322, "gen_loss": 0.2551333010196686, "grad_norm": 0.38570524528845246, "learning_rate": 2.1094736842105264e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9956093281507492, "mean_gen_accuracy": 0.8808066248893738, "mean_token_accuracy": 0.9084551930427551, "num_tokens": 1027478891.0, "sample_num_tokens": 9764.25, "step": 7786, "total_num_tokens": 1027517948.0, "z_loss": 0.00043491466203704476 }, { "copy_logits_max": -5.61815071105957, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.375, "epoch": 1.590451876436048, "gen_logits_max": 3.980466604232788, "gen_logits_mean": -16.72736930847168, "gen_logits_min": -28.227821350097656, "gen_logits_std": 3.216108798980713, "gen_loss": 0.27309945225715637, "grad_norm": 0.3435436723704799, "learning_rate": 2.1093473684210528e-05, "loss": 0.26, "mean_copy_accuracy": 0.9969752728939056, "mean_gen_accuracy": 0.8818286508321762, "mean_token_accuracy": 0.9103893488645554, "num_tokens": 1027755884.0, "sample_num_tokens": 7556.5, "step": 7787, "total_num_tokens": 1027786110.0, "z_loss": 0.0004543733666650951 }, { "copy_logits_max": -3.8673956394195557, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.9375, "epoch": 1.5906561143732447, "gen_logits_max": 3.6836023330688477, "gen_logits_mean": -17.013307571411133, "gen_logits_min": -28.84522819519043, "gen_logits_std": 3.2526259422302246, "gen_loss": 0.2683951258659363, "grad_norm": 0.34017609997693965, "learning_rate": 2.109221052631579e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9973346292972565, "mean_gen_accuracy": 0.8783753961324692, "mean_token_accuracy": 0.9080525934696198, "num_tokens": 1028032743.0, "sample_num_tokens": 8764.75, "step": 7788, "total_num_tokens": 1028067802.0, "z_loss": 0.0004525228578131646 }, { "copy_logits_max": -1.2785077095031738, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.75, "epoch": 1.5908603523104416, "gen_logits_max": 5.003818988800049, "gen_logits_mean": -14.4667387008667, "gen_logits_min": -26.596080780029297, "gen_logits_std": 3.187892436981201, "gen_loss": 0.29586321115493774, "grad_norm": 0.37007373803044796, "learning_rate": 2.1090947368421053e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9967765063047409, "mean_gen_accuracy": 0.8805691599845886, "mean_token_accuracy": 0.9106840640306473, "num_tokens": 1028302480.0, "sample_num_tokens": 8970.5, "step": 7789, "total_num_tokens": 1028338362.0, "z_loss": 0.0004476098110899329 }, { "copy_logits_max": -4.590240478515625, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.875, "epoch": 1.5910645902476386, "gen_logits_max": 3.5967283248901367, "gen_logits_mean": -17.064655303955078, "gen_logits_min": -29.112642288208008, "gen_logits_std": 3.276650905609131, "gen_loss": 0.2795073390007019, "grad_norm": 0.3843118159632052, "learning_rate": 2.1089684210526314e-05, "loss": 0.278, "mean_copy_accuracy": 0.9968219697475433, "mean_gen_accuracy": 0.8818199932575226, "mean_token_accuracy": 0.9065620750188828, "num_tokens": 1028556752.0, "sample_num_tokens": 7530.0, "step": 7790, "total_num_tokens": 1028586872.0, "z_loss": 0.00046975459554232657 }, { "copy_logits_max": -4.187828063964844, "copy_logits_min": -687500032.0, "copy_num_tokens": 668.375, "epoch": 1.5912688281848353, "gen_logits_max": 2.237670421600342, "gen_logits_mean": -18.55254364013672, "gen_logits_min": -30.799776077270508, "gen_logits_std": 3.3417632579803467, "gen_loss": 0.24751496315002441, "grad_norm": 0.3491974497371479, "learning_rate": 2.1088421052631582e-05, "loss": 0.2657, "mean_copy_accuracy": 0.997179701924324, "mean_gen_accuracy": 0.8797608911991119, "mean_token_accuracy": 0.9111542701721191, "num_tokens": 1028838425.0, "sample_num_tokens": 9677.25, "step": 7791, "total_num_tokens": 1028877134.0, "z_loss": 0.00045930471969768405 }, { "copy_logits_max": -3.1745591163635254, "copy_logits_min": -625000064.0, "copy_num_tokens": 480.0625, "epoch": 1.5914730661220322, "gen_logits_max": 4.220638751983643, "gen_logits_mean": -16.177169799804688, "gen_logits_min": -28.43068504333496, "gen_logits_std": 3.2452101707458496, "gen_loss": 0.30401816964149475, "grad_norm": 0.35648864814769543, "learning_rate": 2.1087157894736843e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9956371933221817, "mean_gen_accuracy": 0.8760265856981277, "mean_token_accuracy": 0.9020759463310242, "num_tokens": 1029095356.0, "sample_num_tokens": 8661.5, "step": 7792, "total_num_tokens": 1029130002.0, "z_loss": 0.0004937015473842621 }, { "copy_logits_max": -5.891857147216797, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.0625, "epoch": 1.5916773040592291, "gen_logits_max": 2.839447259902954, "gen_logits_mean": -17.844202041625977, "gen_logits_min": -29.89373016357422, "gen_logits_std": 3.2968664169311523, "gen_loss": 0.2892131209373474, "grad_norm": 0.37506745474386166, "learning_rate": 2.1085894736842107e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9957347810268402, "mean_gen_accuracy": 0.8771114647388458, "mean_token_accuracy": 0.9050223529338837, "num_tokens": 1029371367.0, "sample_num_tokens": 9236.75, "step": 7793, "total_num_tokens": 1029408314.0, "z_loss": 0.000469360122224316 }, { "copy_logits_max": -4.4317545890808105, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0, "epoch": 1.5918815419964258, "gen_logits_max": 4.052824020385742, "gen_logits_mean": -15.936309814453125, "gen_logits_min": -28.33462905883789, "gen_logits_std": 3.2531094551086426, "gen_loss": 0.26290807127952576, "grad_norm": 0.340323052275445, "learning_rate": 2.1084631578947368e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9966937750577927, "mean_gen_accuracy": 0.8803256899118423, "mean_token_accuracy": 0.9118999540805817, "num_tokens": 1029665041.0, "sample_num_tokens": 9120.25, "step": 7794, "total_num_tokens": 1029701522.0, "z_loss": 0.0003993129066657275 }, { "copy_logits_max": -6.522416591644287, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.0625, "epoch": 1.5920857799336225, "gen_logits_max": 4.067682266235352, "gen_logits_mean": -16.022127151489258, "gen_logits_min": -28.106746673583984, "gen_logits_std": 3.2250921726226807, "gen_loss": 0.26073578000068665, "grad_norm": 0.33002942285717646, "learning_rate": 2.1083368421052632e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9964601397514343, "mean_gen_accuracy": 0.8808977454900742, "mean_token_accuracy": 0.90914186835289, "num_tokens": 1029941561.0, "sample_num_tokens": 10334.75, "step": 7795, "total_num_tokens": 1029982900.0, "z_loss": 0.00038748944643884897 }, { "copy_logits_max": -5.301052093505859, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.8125, "epoch": 1.5922900178708195, "gen_logits_max": 4.381155014038086, "gen_logits_mean": -15.82927131652832, "gen_logits_min": -27.782304763793945, "gen_logits_std": 3.229905366897583, "gen_loss": 0.24596835672855377, "grad_norm": 0.3682249991870915, "learning_rate": 2.1082105263157893e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9966733604669571, "mean_gen_accuracy": 0.8816003650426865, "mean_token_accuracy": 0.9080523997545242, "num_tokens": 1030214686.0, "sample_num_tokens": 7338.0, "step": 7796, "total_num_tokens": 1030244038.0, "z_loss": 0.0003511264221742749 }, { "copy_logits_max": -4.7784905433654785, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.5625, "epoch": 1.5924942558080164, "gen_logits_max": 3.8787038326263428, "gen_logits_mean": -16.704200744628906, "gen_logits_min": -29.047075271606445, "gen_logits_std": 3.2583272457122803, "gen_loss": 0.2904927134513855, "grad_norm": 0.38792194457202667, "learning_rate": 2.1080842105263157e-05, "loss": 0.3077, "mean_copy_accuracy": 0.996461883187294, "mean_gen_accuracy": 0.8656759858131409, "mean_token_accuracy": 0.8966280668973923, "num_tokens": 1030481600.0, "sample_num_tokens": 8451.0, "step": 7797, "total_num_tokens": 1030515404.0, "z_loss": 0.00043641854426823556 }, { "copy_logits_max": -3.436084508895874, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.6875, "epoch": 1.592698493745213, "gen_logits_max": 2.8206989765167236, "gen_logits_mean": -17.65484046936035, "gen_logits_min": -30.083030700683594, "gen_logits_std": 3.30204439163208, "gen_loss": 0.28233712911605835, "grad_norm": 0.3542397359010769, "learning_rate": 2.1079578947368422e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9972048401832581, "mean_gen_accuracy": 0.8771688789129257, "mean_token_accuracy": 0.9100469499826431, "num_tokens": 1030762484.0, "sample_num_tokens": 8510.0, "step": 7798, "total_num_tokens": 1030796524.0, "z_loss": 0.0004665941232815385 }, { "copy_logits_max": -5.964041233062744, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.0625, "epoch": 1.59290273168241, "gen_logits_max": 2.8207931518554688, "gen_logits_mean": -17.936763763427734, "gen_logits_min": -30.120220184326172, "gen_logits_std": 3.2898569107055664, "gen_loss": 0.30426710844039917, "grad_norm": 0.3536184901287906, "learning_rate": 2.1078315789473686e-05, "loss": 0.2949, "mean_copy_accuracy": 0.9974270313978195, "mean_gen_accuracy": 0.8719909489154816, "mean_token_accuracy": 0.9013172090053558, "num_tokens": 1031032443.0, "sample_num_tokens": 8066.75, "step": 7799, "total_num_tokens": 1031064710.0, "z_loss": 0.0004857290768995881 }, { "copy_logits_max": -4.569239616394043, "copy_logits_min": -687500032.0, "copy_num_tokens": 483.875, "epoch": 1.593106969619607, "gen_logits_max": 4.356794357299805, "gen_logits_mean": -15.451421737670898, "gen_logits_min": -27.701658248901367, "gen_logits_std": 3.204470157623291, "gen_loss": 0.2862936556339264, "grad_norm": 0.3616289503311305, "learning_rate": 2.107705263157895e-05, "loss": 0.2735, "mean_copy_accuracy": 0.997067391872406, "mean_gen_accuracy": 0.8804414570331573, "mean_token_accuracy": 0.9087841957807541, "num_tokens": 1031315567.0, "sample_num_tokens": 9121.25, "step": 7800, "total_num_tokens": 1031352052.0, "z_loss": 0.0004707244806922972 }, { "copy_logits_max": -4.543999671936035, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.8125, "epoch": 1.5933112075568037, "gen_logits_max": 3.4092414379119873, "gen_logits_mean": -16.014068603515625, "gen_logits_min": -28.70018768310547, "gen_logits_std": 3.242482900619507, "gen_loss": 0.27961069345474243, "grad_norm": 0.39231928709652086, "learning_rate": 2.107578947368421e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9962643980979919, "mean_gen_accuracy": 0.8705505281686783, "mean_token_accuracy": 0.9020723253488541, "num_tokens": 1031566472.0, "sample_num_tokens": 7531.0, "step": 7801, "total_num_tokens": 1031596596.0, "z_loss": 0.000436487200204283 }, { "copy_logits_max": -5.154875755310059, "copy_logits_min": -750000000.0, "copy_num_tokens": 601.8125, "epoch": 1.5935154454940004, "gen_logits_max": 3.335848808288574, "gen_logits_mean": -16.685741424560547, "gen_logits_min": -28.92725372314453, "gen_logits_std": 3.2466983795166016, "gen_loss": 0.27679669857025146, "grad_norm": 0.4085029261174223, "learning_rate": 2.1074526315789476e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9969997704029083, "mean_gen_accuracy": 0.8786751627922058, "mean_token_accuracy": 0.9074781239032745, "num_tokens": 1031826948.0, "sample_num_tokens": 9587.5, "step": 7802, "total_num_tokens": 1031865298.0, "z_loss": 0.0005009712185710669 }, { "copy_logits_max": -6.046046257019043, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.25, "epoch": 1.5937196834311973, "gen_logits_max": 3.5596795082092285, "gen_logits_mean": -16.570682525634766, "gen_logits_min": -28.602725982666016, "gen_logits_std": 3.2339630126953125, "gen_loss": 0.2988062798976898, "grad_norm": 0.39075707668529863, "learning_rate": 2.1073263157894737e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9969940632581711, "mean_gen_accuracy": 0.8758059591054916, "mean_token_accuracy": 0.9049980193376541, "num_tokens": 1032087303.0, "sample_num_tokens": 7939.75, "step": 7803, "total_num_tokens": 1032119062.0, "z_loss": 0.0004993761540390551 }, { "copy_logits_max": -7.5187273025512695, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.9375, "epoch": 1.5939239213683942, "gen_logits_max": 3.7669241428375244, "gen_logits_mean": -16.78436851501465, "gen_logits_min": -28.53178596496582, "gen_logits_std": 3.233372449874878, "gen_loss": 0.28394371271133423, "grad_norm": 0.36900007665745477, "learning_rate": 2.1072e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9963908940553665, "mean_gen_accuracy": 0.8751849681138992, "mean_token_accuracy": 0.9016987234354019, "num_tokens": 1032363312.0, "sample_num_tokens": 7873.5, "step": 7804, "total_num_tokens": 1032394806.0, "z_loss": 0.00048023482668213546 }, { "copy_logits_max": -6.79669713973999, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.25, "epoch": 1.594128159305591, "gen_logits_max": 3.5394129753112793, "gen_logits_mean": -17.088443756103516, "gen_logits_min": -29.020082473754883, "gen_logits_std": 3.2376794815063477, "gen_loss": 0.23737260699272156, "grad_norm": 0.3560159672880999, "learning_rate": 2.1070736842105262e-05, "loss": 0.266, "mean_copy_accuracy": 0.9970891028642654, "mean_gen_accuracy": 0.8793739974498749, "mean_token_accuracy": 0.9097075462341309, "num_tokens": 1032634445.0, "sample_num_tokens": 8945.25, "step": 7805, "total_num_tokens": 1032670226.0, "z_loss": 0.00041902787052094936 }, { "copy_logits_max": -6.572936058044434, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.125, "epoch": 1.5943323972427879, "gen_logits_max": 3.993698835372925, "gen_logits_mean": -14.894357681274414, "gen_logits_min": -26.80706024169922, "gen_logits_std": 3.1225407123565674, "gen_loss": 0.2696491479873657, "grad_norm": 0.32219618986429954, "learning_rate": 2.1069473684210526e-05, "loss": 0.2595, "mean_copy_accuracy": 0.9971036612987518, "mean_gen_accuracy": 0.881863608956337, "mean_token_accuracy": 0.9132378846406937, "num_tokens": 1032923698.0, "sample_num_tokens": 8406.5, "step": 7806, "total_num_tokens": 1032957324.0, "z_loss": 0.0004968395223841071 }, { "copy_logits_max": -7.672129154205322, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.25, "epoch": 1.5945366351799848, "gen_logits_max": 4.227448463439941, "gen_logits_mean": -16.758525848388672, "gen_logits_min": -27.979408264160156, "gen_logits_std": 3.1803464889526367, "gen_loss": 0.2927647531032562, "grad_norm": 0.3476241394810883, "learning_rate": 2.1068210526315787e-05, "loss": 0.2592, "mean_copy_accuracy": 0.9969248473644257, "mean_gen_accuracy": 0.8867196440696716, "mean_token_accuracy": 0.9118797332048416, "num_tokens": 1033218023.0, "sample_num_tokens": 8667.75, "step": 7807, "total_num_tokens": 1033252694.0, "z_loss": 0.0004881601198576391 }, { "copy_logits_max": -5.8905720710754395, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.5, "epoch": 1.5947408731171815, "gen_logits_max": 3.868199586868286, "gen_logits_mean": -16.198753356933594, "gen_logits_min": -28.360960006713867, "gen_logits_std": 3.2032382488250732, "gen_loss": 0.3146704137325287, "grad_norm": 0.32628989655063095, "learning_rate": 2.1066947368421055e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9975338131189346, "mean_gen_accuracy": 0.8745572417974472, "mean_token_accuracy": 0.9061670899391174, "num_tokens": 1033528533.0, "sample_num_tokens": 8788.25, "step": 7808, "total_num_tokens": 1033563686.0, "z_loss": 0.0005768097471445799 }, { "copy_logits_max": -4.751142978668213, "copy_logits_min": -687500032.0, "copy_num_tokens": 558.375, "epoch": 1.5949451110543782, "gen_logits_max": 3.7593348026275635, "gen_logits_mean": -15.35338306427002, "gen_logits_min": -27.34111785888672, "gen_logits_std": 3.1481897830963135, "gen_loss": 0.295717716217041, "grad_norm": 0.34392236791753134, "learning_rate": 2.1065684210526316e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9970195293426514, "mean_gen_accuracy": 0.8728659152984619, "mean_token_accuracy": 0.9077065736055374, "num_tokens": 1033809316.0, "sample_num_tokens": 9809.5, "step": 7809, "total_num_tokens": 1033848554.0, "z_loss": 0.0005556655814871192 }, { "copy_logits_max": -5.4723124504089355, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.125, "epoch": 1.5951493489915753, "gen_logits_max": 4.025815010070801, "gen_logits_mean": -15.968152046203613, "gen_logits_min": -27.522825241088867, "gen_logits_std": 3.109501838684082, "gen_loss": 0.3058999180793762, "grad_norm": 0.3350793492923983, "learning_rate": 2.106442105263158e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9962102323770523, "mean_gen_accuracy": 0.8785082846879959, "mean_token_accuracy": 0.9044784754514694, "num_tokens": 1034092155.0, "sample_num_tokens": 7574.25, "step": 7810, "total_num_tokens": 1034122452.0, "z_loss": 0.0005886730505153537 }, { "copy_logits_max": -9.208383560180664, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.0, "epoch": 1.595353586928772, "gen_logits_max": 4.017449378967285, "gen_logits_mean": -18.25269317626953, "gen_logits_min": -29.61736488342285, "gen_logits_std": 3.2209837436676025, "gen_loss": 0.30805492401123047, "grad_norm": 0.3604607765160403, "learning_rate": 2.1063157894736844e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9970135986804962, "mean_gen_accuracy": 0.880444809794426, "mean_token_accuracy": 0.9018925726413727, "num_tokens": 1034333716.0, "sample_num_tokens": 7084.0, "step": 7811, "total_num_tokens": 1034362052.0, "z_loss": 0.0005866287974640727 }, { "copy_logits_max": -6.365453720092773, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.125, "epoch": 1.5955578248659688, "gen_logits_max": 3.592888832092285, "gen_logits_mean": -15.125862121582031, "gen_logits_min": -26.873884201049805, "gen_logits_std": 3.0363285541534424, "gen_loss": 0.27240023016929626, "grad_norm": 0.3403996743008517, "learning_rate": 2.1061894736842105e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9975246489048004, "mean_gen_accuracy": 0.8759074658155441, "mean_token_accuracy": 0.9057895243167877, "num_tokens": 1034612235.0, "sample_num_tokens": 8071.75, "step": 7812, "total_num_tokens": 1034644522.0, "z_loss": 0.0005013495683670044 }, { "copy_logits_max": -6.738315582275391, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.5, "epoch": 1.5957620628031657, "gen_logits_max": 3.766031265258789, "gen_logits_mean": -16.542896270751953, "gen_logits_min": -28.492938995361328, "gen_logits_std": 3.1560473442077637, "gen_loss": 0.30891862511634827, "grad_norm": 0.35856066755978855, "learning_rate": 2.106063157894737e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9962327927350998, "mean_gen_accuracy": 0.8706519156694412, "mean_token_accuracy": 0.8999589383602142, "num_tokens": 1034879477.0, "sample_num_tokens": 7452.75, "step": 7813, "total_num_tokens": 1034909288.0, "z_loss": 0.000556639046408236 }, { "copy_logits_max": -6.698472023010254, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.875, "epoch": 1.5959663007403626, "gen_logits_max": 2.8493194580078125, "gen_logits_mean": -17.00341033935547, "gen_logits_min": -28.393295288085938, "gen_logits_std": 3.135908842086792, "gen_loss": 0.27596014738082886, "grad_norm": 0.3359851422204242, "learning_rate": 2.105936842105263e-05, "loss": 0.2483, "mean_copy_accuracy": 0.9974628984928131, "mean_gen_accuracy": 0.8821024298667908, "mean_token_accuracy": 0.9155387282371521, "num_tokens": 1035176611.0, "sample_num_tokens": 9275.75, "step": 7814, "total_num_tokens": 1035213714.0, "z_loss": 0.00044855521991848946 }, { "copy_logits_max": -5.782712459564209, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.75, "epoch": 1.5961705386775593, "gen_logits_max": 3.2437705993652344, "gen_logits_mean": -16.50631332397461, "gen_logits_min": -28.928974151611328, "gen_logits_std": 3.228686571121216, "gen_loss": 0.2588210105895996, "grad_norm": 0.34072735694562034, "learning_rate": 2.1058105263157895e-05, "loss": 0.2762, "mean_copy_accuracy": 0.996928483247757, "mean_gen_accuracy": 0.8758567869663239, "mean_token_accuracy": 0.9061266183853149, "num_tokens": 1035458623.0, "sample_num_tokens": 9309.25, "step": 7815, "total_num_tokens": 1035495860.0, "z_loss": 0.0004480034695006907 }, { "copy_logits_max": -6.861542701721191, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.625, "epoch": 1.5963747766147562, "gen_logits_max": 3.268782615661621, "gen_logits_mean": -17.086259841918945, "gen_logits_min": -28.923961639404297, "gen_logits_std": 3.2081050872802734, "gen_loss": 0.28193047642707825, "grad_norm": 0.3363525426621796, "learning_rate": 2.105684210526316e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9974662810564041, "mean_gen_accuracy": 0.87159164249897, "mean_token_accuracy": 0.9038727283477783, "num_tokens": 1035738909.0, "sample_num_tokens": 8402.75, "step": 7816, "total_num_tokens": 1035772520.0, "z_loss": 0.00045682565541937947 }, { "copy_logits_max": -6.684952735900879, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.875, "epoch": 1.5965790145519532, "gen_logits_max": 3.4875640869140625, "gen_logits_mean": -16.080162048339844, "gen_logits_min": -27.962459564208984, "gen_logits_std": 3.1509885787963867, "gen_loss": 0.26826241612434387, "grad_norm": 0.3442975157504411, "learning_rate": 2.1055578947368423e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9962801337242126, "mean_gen_accuracy": 0.8752738386392593, "mean_token_accuracy": 0.9043801128864288, "num_tokens": 1036002020.0, "sample_num_tokens": 9309.0, "step": 7817, "total_num_tokens": 1036039256.0, "z_loss": 0.0004865066730417311 }, { "copy_logits_max": -5.483526229858398, "copy_logits_min": -625000064.0, "copy_num_tokens": 437.5, "epoch": 1.5967832524891499, "gen_logits_max": 2.752927780151367, "gen_logits_mean": -17.104595184326172, "gen_logits_min": -29.050878524780273, "gen_logits_std": 3.243764638900757, "gen_loss": 0.2777477502822876, "grad_norm": 0.3649641460253304, "learning_rate": 2.1054315789473684e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9965080320835114, "mean_gen_accuracy": 0.8748224079608917, "mean_token_accuracy": 0.9017997235059738, "num_tokens": 1036274259.0, "sample_num_tokens": 7696.75, "step": 7818, "total_num_tokens": 1036305046.0, "z_loss": 0.00048009169404394925 }, { "copy_logits_max": -6.564728260040283, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.9375, "epoch": 1.5969874904263466, "gen_logits_max": 2.6469953060150146, "gen_logits_mean": -16.8627872467041, "gen_logits_min": -28.672775268554688, "gen_logits_std": 3.2260499000549316, "gen_loss": 0.2543060779571533, "grad_norm": 0.3271340888029507, "learning_rate": 2.105305263157895e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9979532361030579, "mean_gen_accuracy": 0.8807799518108368, "mean_token_accuracy": 0.9126002937555313, "num_tokens": 1036540065.0, "sample_num_tokens": 8842.75, "step": 7819, "total_num_tokens": 1036575436.0, "z_loss": 0.0004093719180673361 }, { "copy_logits_max": -8.286069869995117, "copy_logits_min": -750000064.0, "copy_num_tokens": 465.75, "epoch": 1.5971917283635435, "gen_logits_max": 4.256766319274902, "gen_logits_mean": -15.3436918258667, "gen_logits_min": -26.962963104248047, "gen_logits_std": 3.0996599197387695, "gen_loss": 0.2631843686103821, "grad_norm": 0.34266319378864335, "learning_rate": 2.105178947368421e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9976062178611755, "mean_gen_accuracy": 0.8766049742698669, "mean_token_accuracy": 0.9084632992744446, "num_tokens": 1036807719.0, "sample_num_tokens": 8556.25, "step": 7820, "total_num_tokens": 1036841944.0, "z_loss": 0.00038529865560121834 }, { "copy_logits_max": -5.945521354675293, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.1875, "epoch": 1.5973959663007404, "gen_logits_max": 3.966765880584717, "gen_logits_mean": -15.284051895141602, "gen_logits_min": -26.962238311767578, "gen_logits_std": 3.0874624252319336, "gen_loss": 0.3295843005180359, "grad_norm": 0.33663811178602865, "learning_rate": 2.1050526315789474e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9966644495725632, "mean_gen_accuracy": 0.8811125159263611, "mean_token_accuracy": 0.907529667019844, "num_tokens": 1037077912.0, "sample_num_tokens": 7485.0, "step": 7821, "total_num_tokens": 1037107852.0, "z_loss": 0.0005098623223602772 }, { "copy_logits_max": -7.603638648986816, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.5625, "epoch": 1.5976002042379371, "gen_logits_max": 3.3542351722717285, "gen_logits_mean": -17.181331634521484, "gen_logits_min": -28.910247802734375, "gen_logits_std": 3.213895797729492, "gen_loss": 0.29664546251296997, "grad_norm": 0.37639366297236526, "learning_rate": 2.1049263157894735e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9973534047603607, "mean_gen_accuracy": 0.8732495605945587, "mean_token_accuracy": 0.9044062346220016, "num_tokens": 1037330231.0, "sample_num_tokens": 7452.75, "step": 7822, "total_num_tokens": 1037360042.0, "z_loss": 0.00045790383592247963 }, { "copy_logits_max": -7.653450012207031, "copy_logits_min": -750000064.0, "copy_num_tokens": 367.75, "epoch": 1.597804442175134, "gen_logits_max": 3.4716176986694336, "gen_logits_mean": -17.397544860839844, "gen_logits_min": -28.832857131958008, "gen_logits_std": 3.2267184257507324, "gen_loss": 0.2786834239959717, "grad_norm": 0.3554385998828916, "learning_rate": 2.1048e-05, "loss": 0.282, "mean_copy_accuracy": 0.9966370016336441, "mean_gen_accuracy": 0.8750740885734558, "mean_token_accuracy": 0.9030820727348328, "num_tokens": 1037605459.0, "sample_num_tokens": 8047.25, "step": 7823, "total_num_tokens": 1037637648.0, "z_loss": 0.00041292200330644846 }, { "copy_logits_max": -4.371410846710205, "copy_logits_min": -750000128.0, "copy_num_tokens": 470.75, "epoch": 1.598008680112331, "gen_logits_max": 2.913079261779785, "gen_logits_mean": -16.601478576660156, "gen_logits_min": -28.315568923950195, "gen_logits_std": 3.2038307189941406, "gen_loss": 0.2755770981311798, "grad_norm": 0.35000264837432127, "learning_rate": 2.1046736842105267e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9970130175352097, "mean_gen_accuracy": 0.8798221796751022, "mean_token_accuracy": 0.9106350839138031, "num_tokens": 1037869646.0, "sample_num_tokens": 7716.0, "step": 7824, "total_num_tokens": 1037900510.0, "z_loss": 0.00043230692972429097 }, { "copy_logits_max": -3.859915256500244, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.75, "epoch": 1.5982129180495277, "gen_logits_max": 3.310472011566162, "gen_logits_mean": -16.715299606323242, "gen_logits_min": -28.35575294494629, "gen_logits_std": 3.2055399417877197, "gen_loss": 0.2840083837509155, "grad_norm": 0.351281574895315, "learning_rate": 2.1045473684210528e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9970983564853668, "mean_gen_accuracy": 0.8719366639852524, "mean_token_accuracy": 0.9078968018293381, "num_tokens": 1038151022.0, "sample_num_tokens": 8309.5, "step": 7825, "total_num_tokens": 1038184260.0, "z_loss": 0.0004467214457690716 }, { "copy_logits_max": -3.0145504474639893, "copy_logits_min": -750000128.0, "copy_num_tokens": 681.5, "epoch": 1.5984171559867244, "gen_logits_max": 3.67800235748291, "gen_logits_mean": -15.486238479614258, "gen_logits_min": -27.102252960205078, "gen_logits_std": 3.1242573261260986, "gen_loss": 0.27298909425735474, "grad_norm": 0.3783879287864515, "learning_rate": 2.1044210526315792e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9972921013832092, "mean_gen_accuracy": 0.8744556158781052, "mean_token_accuracy": 0.9058493226766586, "num_tokens": 1038423957.0, "sample_num_tokens": 9819.75, "step": 7826, "total_num_tokens": 1038463236.0, "z_loss": 0.000468292273581028 }, { "copy_logits_max": -6.419546127319336, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.9375, "epoch": 1.5986213939239213, "gen_logits_max": 3.3584959506988525, "gen_logits_mean": -15.528202056884766, "gen_logits_min": -27.075580596923828, "gen_logits_std": 3.1314451694488525, "gen_loss": 0.2538948655128479, "grad_norm": 0.34870617878180166, "learning_rate": 2.1042947368421053e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9965010583400726, "mean_gen_accuracy": 0.8825551271438599, "mean_token_accuracy": 0.9095330983400345, "num_tokens": 1038697028.0, "sample_num_tokens": 8032.0, "step": 7827, "total_num_tokens": 1038729156.0, "z_loss": 0.0004040779313072562 }, { "copy_logits_max": -6.694077491760254, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.375, "epoch": 1.5988256318611183, "gen_logits_max": 3.8569672107696533, "gen_logits_mean": -16.176044464111328, "gen_logits_min": -27.49793243408203, "gen_logits_std": 3.14786696434021, "gen_loss": 0.2825697362422943, "grad_norm": 0.38039654929611144, "learning_rate": 2.1041684210526317e-05, "loss": 0.3041, "mean_copy_accuracy": 0.9956040233373642, "mean_gen_accuracy": 0.8641560524702072, "mean_token_accuracy": 0.8982934504747391, "num_tokens": 1038972235.0, "sample_num_tokens": 8579.75, "step": 7828, "total_num_tokens": 1039006554.0, "z_loss": 0.00040900433668866754 }, { "copy_logits_max": -5.67637825012207, "copy_logits_min": -750000064.0, "copy_num_tokens": 312.6875, "epoch": 1.599029869798315, "gen_logits_max": 3.3615853786468506, "gen_logits_mean": -17.40723991394043, "gen_logits_min": -28.697004318237305, "gen_logits_std": 3.221444606781006, "gen_loss": 0.2949535548686981, "grad_norm": 0.35571383698636233, "learning_rate": 2.1040421052631578e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9968358129262924, "mean_gen_accuracy": 0.879946231842041, "mean_token_accuracy": 0.9064624160528183, "num_tokens": 1039243834.0, "sample_num_tokens": 7291.5, "step": 7829, "total_num_tokens": 1039273000.0, "z_loss": 0.00045259855687618256 }, { "copy_logits_max": -3.0736563205718994, "copy_logits_min": -750000000.0, "copy_num_tokens": 640.6875, "epoch": 1.599234107735512, "gen_logits_max": 2.8114919662475586, "gen_logits_mean": -17.39016342163086, "gen_logits_min": -28.967287063598633, "gen_logits_std": 3.251558303833008, "gen_loss": 0.2378850132226944, "grad_norm": 0.3700047185906146, "learning_rate": 2.1039157894736842e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9974013715982437, "mean_gen_accuracy": 0.8848151564598083, "mean_token_accuracy": 0.9101250916719437, "num_tokens": 1039515267.0, "sample_num_tokens": 9729.75, "step": 7830, "total_num_tokens": 1039554186.0, "z_loss": 0.0003763721033465117 }, { "copy_logits_max": -5.932426452636719, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.8125, "epoch": 1.5994383456727088, "gen_logits_max": 2.259882926940918, "gen_logits_mean": -18.463300704956055, "gen_logits_min": -30.379850387573242, "gen_logits_std": 3.31451153755188, "gen_loss": 0.2537185549736023, "grad_norm": 0.33405733510441693, "learning_rate": 2.1037894736842103e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9974430799484253, "mean_gen_accuracy": 0.8745729178190231, "mean_token_accuracy": 0.9060849547386169, "num_tokens": 1039798532.0, "sample_num_tokens": 9175.5, "step": 7831, "total_num_tokens": 1039835234.0, "z_loss": 0.00038907461566850543 }, { "copy_logits_max": -6.6529765129089355, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.0625, "epoch": 1.5996425836099055, "gen_logits_max": 3.847017288208008, "gen_logits_mean": -16.9390869140625, "gen_logits_min": -28.020713806152344, "gen_logits_std": 3.11814546585083, "gen_loss": 0.32068416476249695, "grad_norm": 0.340740500207594, "learning_rate": 2.103663157894737e-05, "loss": 0.3033, "mean_copy_accuracy": 0.9965339303016663, "mean_gen_accuracy": 0.8701957762241364, "mean_token_accuracy": 0.8954886794090271, "num_tokens": 1040077669.0, "sample_num_tokens": 7580.75, "step": 7832, "total_num_tokens": 1040107992.0, "z_loss": 0.0005007109721191227 }, { "copy_logits_max": -1.599273681640625, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.8125, "epoch": 1.5998468215471022, "gen_logits_max": 3.175851821899414, "gen_logits_mean": -15.65230655670166, "gen_logits_min": -27.323205947875977, "gen_logits_std": 3.1404709815979004, "gen_loss": 0.2815716564655304, "grad_norm": 0.35950346315535653, "learning_rate": 2.1035368421052632e-05, "loss": 0.2627, "mean_copy_accuracy": 0.996319442987442, "mean_gen_accuracy": 0.8798216432332993, "mean_token_accuracy": 0.909808948636055, "num_tokens": 1040337349.0, "sample_num_tokens": 8472.75, "step": 7833, "total_num_tokens": 1040371240.0, "z_loss": 0.0004080514772795141 }, { "copy_logits_max": -3.825547218322754, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.4375, "epoch": 1.6000510594842992, "gen_logits_max": 3.9340717792510986, "gen_logits_mean": -16.83786392211914, "gen_logits_min": -28.326576232910156, "gen_logits_std": 3.190746307373047, "gen_loss": 0.30125129222869873, "grad_norm": 0.3837324369222951, "learning_rate": 2.1034105263157896e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9959519505500793, "mean_gen_accuracy": 0.8773309737443924, "mean_token_accuracy": 0.9029847979545593, "num_tokens": 1040613593.0, "sample_num_tokens": 7045.25, "step": 7834, "total_num_tokens": 1040641774.0, "z_loss": 0.00047420585178770125 }, { "copy_logits_max": -3.679762125015259, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.5, "epoch": 1.600255297421496, "gen_logits_max": 3.170731782913208, "gen_logits_mean": -17.430606842041016, "gen_logits_min": -28.97618865966797, "gen_logits_std": 3.223332405090332, "gen_loss": 0.3012612462043762, "grad_norm": 0.37058106511089023, "learning_rate": 2.1032842105263157e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9973665177822113, "mean_gen_accuracy": 0.8717007637023926, "mean_token_accuracy": 0.9032019078731537, "num_tokens": 1040880905.0, "sample_num_tokens": 7831.25, "step": 7835, "total_num_tokens": 1040912230.0, "z_loss": 0.0004911855794489384 }, { "copy_logits_max": -4.150904655456543, "copy_logits_min": -750000000.0, "copy_num_tokens": 499.0625, "epoch": 1.6004595353586928, "gen_logits_max": 3.100259304046631, "gen_logits_mean": -17.197593688964844, "gen_logits_min": -28.90283966064453, "gen_logits_std": 3.284437894821167, "gen_loss": 0.26968032121658325, "grad_norm": 0.3964051828814023, "learning_rate": 2.103157894736842e-05, "loss": 0.2928, "mean_copy_accuracy": 0.9963874071836472, "mean_gen_accuracy": 0.8708591461181641, "mean_token_accuracy": 0.9010840505361557, "num_tokens": 1041126129.0, "sample_num_tokens": 8615.75, "step": 7836, "total_num_tokens": 1041160592.0, "z_loss": 0.0004117653297726065 }, { "copy_logits_max": -2.068777322769165, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.1875, "epoch": 1.6006637732958897, "gen_logits_max": 3.15915584564209, "gen_logits_mean": -16.319412231445312, "gen_logits_min": -28.163137435913086, "gen_logits_std": 3.1812055110931396, "gen_loss": 0.25266915559768677, "grad_norm": 0.36599940829642363, "learning_rate": 2.1030315789473686e-05, "loss": 0.2613, "mean_copy_accuracy": 0.9973567724227905, "mean_gen_accuracy": 0.8777567148208618, "mean_token_accuracy": 0.9095668643712997, "num_tokens": 1041389223.0, "sample_num_tokens": 8236.75, "step": 7837, "total_num_tokens": 1041422170.0, "z_loss": 0.00036926090251654387 }, { "copy_logits_max": -2.016758680343628, "copy_logits_min": -687500032.0, "copy_num_tokens": 634.625, "epoch": 1.6008680112330866, "gen_logits_max": 3.4548473358154297, "gen_logits_mean": -15.917024612426758, "gen_logits_min": -28.331172943115234, "gen_logits_std": 3.1802358627319336, "gen_loss": 0.2502020597457886, "grad_norm": 0.33624070002547707, "learning_rate": 2.1029052631578947e-05, "loss": 0.251, "mean_copy_accuracy": 0.9971987456083298, "mean_gen_accuracy": 0.8825060129165649, "mean_token_accuracy": 0.9116677492856979, "num_tokens": 1041660212.0, "sample_num_tokens": 9626.5, "step": 7838, "total_num_tokens": 1041698718.0, "z_loss": 0.0004135626368224621 }, { "copy_logits_max": -2.4454867839813232, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.8125, "epoch": 1.6010722491702833, "gen_logits_max": 3.6315417289733887, "gen_logits_mean": -15.426231384277344, "gen_logits_min": -27.608734130859375, "gen_logits_std": 3.164231777191162, "gen_loss": 0.24526020884513855, "grad_norm": 0.336833217855615, "learning_rate": 2.102778947368421e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9968673139810562, "mean_gen_accuracy": 0.878829762339592, "mean_token_accuracy": 0.9105156809091568, "num_tokens": 1041944654.0, "sample_num_tokens": 9379.0, "step": 7839, "total_num_tokens": 1041982170.0, "z_loss": 0.00042092642979696393 }, { "copy_logits_max": -5.795052528381348, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.1875, "epoch": 1.60127648710748, "gen_logits_max": 2.499164581298828, "gen_logits_mean": -17.676313400268555, "gen_logits_min": -29.897663116455078, "gen_logits_std": 3.2046146392822266, "gen_loss": 0.28127503395080566, "grad_norm": 0.36012632780995985, "learning_rate": 2.1026526315789475e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9970561116933823, "mean_gen_accuracy": 0.8702886402606964, "mean_token_accuracy": 0.9027362614870071, "num_tokens": 1042215151.0, "sample_num_tokens": 8575.25, "step": 7840, "total_num_tokens": 1042249452.0, "z_loss": 0.0004224217846058309 }, { "copy_logits_max": -2.712503671646118, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.625, "epoch": 1.6014807250446772, "gen_logits_max": 4.317756652832031, "gen_logits_mean": -16.249021530151367, "gen_logits_min": -28.313720703125, "gen_logits_std": 3.206170082092285, "gen_loss": 0.26709216833114624, "grad_norm": 0.3870128066530907, "learning_rate": 2.102526315789474e-05, "loss": 0.285, "mean_copy_accuracy": 0.996985524892807, "mean_gen_accuracy": 0.8735585659742355, "mean_token_accuracy": 0.9029659777879715, "num_tokens": 1042472753.0, "sample_num_tokens": 6699.25, "step": 7841, "total_num_tokens": 1042499550.0, "z_loss": 0.00041065915138460696 }, { "copy_logits_max": -6.157993793487549, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.875, "epoch": 1.601684962981874, "gen_logits_max": 3.965254545211792, "gen_logits_mean": -17.208515167236328, "gen_logits_min": -28.888341903686523, "gen_logits_std": 3.245823860168457, "gen_loss": 0.2805275619029999, "grad_norm": 0.36958017737427895, "learning_rate": 2.1024e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9967408031225204, "mean_gen_accuracy": 0.8772429823875427, "mean_token_accuracy": 0.9033424705266953, "num_tokens": 1042735435.0, "sample_num_tokens": 6652.25, "step": 7842, "total_num_tokens": 1042762044.0, "z_loss": 0.00044095926568843424 }, { "copy_logits_max": -5.0139875411987305, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.5625, "epoch": 1.6018892009190706, "gen_logits_max": 3.962707281112671, "gen_logits_mean": -16.783126831054688, "gen_logits_min": -29.01858901977539, "gen_logits_std": 3.2528581619262695, "gen_loss": 0.25349855422973633, "grad_norm": 0.3904797883514239, "learning_rate": 2.1022736842105265e-05, "loss": 0.2853, "mean_copy_accuracy": 0.996152475476265, "mean_gen_accuracy": 0.8744246810674667, "mean_token_accuracy": 0.9031461477279663, "num_tokens": 1042995811.0, "sample_num_tokens": 7382.75, "step": 7843, "total_num_tokens": 1043025342.0, "z_loss": 0.00041307046194560826 }, { "copy_logits_max": -3.536655902862549, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.1875, "epoch": 1.6020934388562675, "gen_logits_max": 4.070992469787598, "gen_logits_mean": -15.867441177368164, "gen_logits_min": -28.170021057128906, "gen_logits_std": 3.2272934913635254, "gen_loss": 0.27580371499061584, "grad_norm": 0.3572679894658371, "learning_rate": 2.1021473684210526e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9967196434736252, "mean_gen_accuracy": 0.8789105862379074, "mean_token_accuracy": 0.906453549861908, "num_tokens": 1043259147.0, "sample_num_tokens": 7187.25, "step": 7844, "total_num_tokens": 1043287896.0, "z_loss": 0.00043893710244446993 }, { "copy_logits_max": -5.123218536376953, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.375, "epoch": 1.6022976767934645, "gen_logits_max": 3.8956422805786133, "gen_logits_mean": -16.637271881103516, "gen_logits_min": -28.967082977294922, "gen_logits_std": 3.256577491760254, "gen_loss": 0.28091609477996826, "grad_norm": 0.37859643976551377, "learning_rate": 2.102021052631579e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9973463714122772, "mean_gen_accuracy": 0.8750568926334381, "mean_token_accuracy": 0.9058851897716522, "num_tokens": 1043524561.0, "sample_num_tokens": 8541.25, "step": 7845, "total_num_tokens": 1043558726.0, "z_loss": 0.0004705431929323822 }, { "copy_logits_max": -3.6262476444244385, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.3125, "epoch": 1.6025019147306612, "gen_logits_max": 4.491937637329102, "gen_logits_mean": -15.515649795532227, "gen_logits_min": -27.820411682128906, "gen_logits_std": 3.212059736251831, "gen_loss": 0.2777753472328186, "grad_norm": 0.35128351415924486, "learning_rate": 2.101894736842105e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9970844089984894, "mean_gen_accuracy": 0.8794932812452316, "mean_token_accuracy": 0.9089771509170532, "num_tokens": 1043800909.0, "sample_num_tokens": 9303.25, "step": 7846, "total_num_tokens": 1043838122.0, "z_loss": 0.0004635101358871907 }, { "copy_logits_max": -3.0447099208831787, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.5625, "epoch": 1.602706152667858, "gen_logits_max": 4.294862747192383, "gen_logits_mean": -15.075182914733887, "gen_logits_min": -27.766130447387695, "gen_logits_std": 3.2160487174987793, "gen_loss": 0.26664450764656067, "grad_norm": 0.3522217263217575, "learning_rate": 2.1017684210526315e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9970391392707825, "mean_gen_accuracy": 0.8739960938692093, "mean_token_accuracy": 0.9085560888051987, "num_tokens": 1044092603.0, "sample_num_tokens": 8377.75, "step": 7847, "total_num_tokens": 1044126114.0, "z_loss": 0.00045022365520708263 }, { "copy_logits_max": -7.019105434417725, "copy_logits_min": -750000000.0, "copy_num_tokens": 235.3125, "epoch": 1.602910390605055, "gen_logits_max": 4.875323295593262, "gen_logits_mean": -17.472370147705078, "gen_logits_min": -29.23483657836914, "gen_logits_std": 3.2748334407806396, "gen_loss": 0.26949819922447205, "grad_norm": 0.3499502009952898, "learning_rate": 2.101642105263158e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9970685094594955, "mean_gen_accuracy": 0.883688747882843, "mean_token_accuracy": 0.9105742871761322, "num_tokens": 1044371141.0, "sample_num_tokens": 7511.75, "step": 7848, "total_num_tokens": 1044401188.0, "z_loss": 0.0004417808377183974 }, { "copy_logits_max": -5.116761207580566, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.1875, "epoch": 1.6031146285422517, "gen_logits_max": 4.849148750305176, "gen_logits_mean": -15.717426300048828, "gen_logits_min": -27.514774322509766, "gen_logits_std": 3.1907269954681396, "gen_loss": 0.3099842369556427, "grad_norm": 0.37336951341802577, "learning_rate": 2.1015157894736844e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9965489208698273, "mean_gen_accuracy": 0.8798686563968658, "mean_token_accuracy": 0.910061240196228, "num_tokens": 1044639397.0, "sample_num_tokens": 8392.25, "step": 7849, "total_num_tokens": 1044672966.0, "z_loss": 0.0005028512678109109 }, { "copy_logits_max": -4.905714988708496, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.5625, "epoch": 1.6033188664794484, "gen_logits_max": 3.4469823837280273, "gen_logits_mean": -16.619091033935547, "gen_logits_min": -28.374916076660156, "gen_logits_std": 3.24222993850708, "gen_loss": 0.26894286274909973, "grad_norm": 0.3544539693474131, "learning_rate": 2.1013894736842105e-05, "loss": 0.2638, "mean_copy_accuracy": 0.996475487947464, "mean_gen_accuracy": 0.8819243311882019, "mean_token_accuracy": 0.9106543660163879, "num_tokens": 1044903861.0, "sample_num_tokens": 7754.25, "step": 7850, "total_num_tokens": 1044934878.0, "z_loss": 0.00043838683632202446 }, { "copy_logits_max": -4.682485580444336, "copy_logits_min": -750000064.0, "copy_num_tokens": 492.3125, "epoch": 1.6035231044166454, "gen_logits_max": 3.2508704662323, "gen_logits_mean": -17.497573852539062, "gen_logits_min": -29.08977508544922, "gen_logits_std": 3.2214455604553223, "gen_loss": 0.26519128680229187, "grad_norm": 0.34026751769723806, "learning_rate": 2.101263157894737e-05, "loss": 0.2692, "mean_copy_accuracy": 0.9972341507673264, "mean_gen_accuracy": 0.880325511097908, "mean_token_accuracy": 0.908769428730011, "num_tokens": 1045176519.0, "sample_num_tokens": 8506.25, "step": 7851, "total_num_tokens": 1045210544.0, "z_loss": 0.0004420395998749882 }, { "copy_logits_max": -4.992920875549316, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.875, "epoch": 1.6037273423538423, "gen_logits_max": 3.8248038291931152, "gen_logits_mean": -15.390045166015625, "gen_logits_min": -26.779489517211914, "gen_logits_std": 3.0642147064208984, "gen_loss": 0.25933560729026794, "grad_norm": 0.36669787532299525, "learning_rate": 2.1011368421052634e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9973162263631821, "mean_gen_accuracy": 0.8773506879806519, "mean_token_accuracy": 0.9052160680294037, "num_tokens": 1045443566.0, "sample_num_tokens": 8698.5, "step": 7852, "total_num_tokens": 1045478360.0, "z_loss": 0.0004637030651792884 }, { "copy_logits_max": -4.761962890625, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.125, "epoch": 1.603931580291039, "gen_logits_max": 3.671290159225464, "gen_logits_mean": -16.481233596801758, "gen_logits_min": -28.485321044921875, "gen_logits_std": 3.206528663635254, "gen_loss": 0.29938915371894836, "grad_norm": 0.3644085594753236, "learning_rate": 2.1010105263157895e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9962913542985916, "mean_gen_accuracy": 0.871582955121994, "mean_token_accuracy": 0.8999984115362167, "num_tokens": 1045714333.0, "sample_num_tokens": 9895.25, "step": 7853, "total_num_tokens": 1045753914.0, "z_loss": 0.0005093180225230753 }, { "copy_logits_max": -5.210419654846191, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.375, "epoch": 1.604135818228236, "gen_logits_max": 3.89264178276062, "gen_logits_mean": -16.179340362548828, "gen_logits_min": -28.119110107421875, "gen_logits_std": 3.2116336822509766, "gen_loss": 0.2778148651123047, "grad_norm": 0.3575793616719288, "learning_rate": 2.100884210526316e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9965539127588272, "mean_gen_accuracy": 0.8796468675136566, "mean_token_accuracy": 0.9054586440324783, "num_tokens": 1045980099.0, "sample_num_tokens": 7334.25, "step": 7854, "total_num_tokens": 1046009436.0, "z_loss": 0.00046286144061014056 }, { "copy_logits_max": -4.2847442626953125, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5, "epoch": 1.6043400561654328, "gen_logits_max": 4.056243896484375, "gen_logits_mean": -16.061172485351562, "gen_logits_min": -27.67507553100586, "gen_logits_std": 3.1878769397735596, "gen_loss": 0.2843555212020874, "grad_norm": 0.3436417987665064, "learning_rate": 2.100757894736842e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9966976791620255, "mean_gen_accuracy": 0.8751901984214783, "mean_token_accuracy": 0.9032000005245209, "num_tokens": 1046246675.0, "sample_num_tokens": 8568.75, "step": 7855, "total_num_tokens": 1046280950.0, "z_loss": 0.00047454796731472015 }, { "copy_logits_max": -5.650856018066406, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.75, "epoch": 1.6045442941026296, "gen_logits_max": 3.8453783988952637, "gen_logits_mean": -17.36989974975586, "gen_logits_min": -28.77375602722168, "gen_logits_std": 3.252969264984131, "gen_loss": 0.30280908942222595, "grad_norm": 0.35835586156355154, "learning_rate": 2.1006315789473684e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9967372417449951, "mean_gen_accuracy": 0.8731544017791748, "mean_token_accuracy": 0.9020668566226959, "num_tokens": 1046501854.0, "sample_num_tokens": 8159.0, "step": 7856, "total_num_tokens": 1046534490.0, "z_loss": 0.00048271947889588773 }, { "copy_logits_max": -5.68775749206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.625, "epoch": 1.6047485320398263, "gen_logits_max": 4.353374004364014, "gen_logits_mean": -14.916116714477539, "gen_logits_min": -25.9666748046875, "gen_logits_std": 3.065734624862671, "gen_loss": 0.2563885450363159, "grad_norm": 0.3509072357761506, "learning_rate": 2.100505263157895e-05, "loss": 0.268, "mean_copy_accuracy": 0.9967055916786194, "mean_gen_accuracy": 0.8828315138816833, "mean_token_accuracy": 0.9102048426866531, "num_tokens": 1046765760.0, "sample_num_tokens": 7579.5, "step": 7857, "total_num_tokens": 1046796078.0, "z_loss": 0.0004094165051355958 }, { "copy_logits_max": -4.863913059234619, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.3125, "epoch": 1.6049527699770232, "gen_logits_max": 3.8637542724609375, "gen_logits_mean": -16.621124267578125, "gen_logits_min": -28.089920043945312, "gen_logits_std": 3.1847996711730957, "gen_loss": 0.2781423330307007, "grad_norm": 0.3667801245549258, "learning_rate": 2.1003789473684213e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9960720092058182, "mean_gen_accuracy": 0.8741769641637802, "mean_token_accuracy": 0.8995592147111893, "num_tokens": 1047013156.0, "sample_num_tokens": 8119.5, "step": 7858, "total_num_tokens": 1047045634.0, "z_loss": 0.0004596645012497902 }, { "copy_logits_max": -1.198603868484497, "copy_logits_min": -687500032.0, "copy_num_tokens": 756.5, "epoch": 1.6051570079142201, "gen_logits_max": 3.6356093883514404, "gen_logits_mean": -14.588802337646484, "gen_logits_min": -26.974334716796875, "gen_logits_std": 3.1780967712402344, "gen_loss": 0.2595669627189636, "grad_norm": 0.36736901988977083, "learning_rate": 2.1002526315789474e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9971868991851807, "mean_gen_accuracy": 0.8683437407016754, "mean_token_accuracy": 0.9028331637382507, "num_tokens": 1047276249.0, "sample_num_tokens": 9878.75, "step": 7859, "total_num_tokens": 1047315764.0, "z_loss": 0.000448131759185344 }, { "copy_logits_max": -7.299410820007324, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.0625, "epoch": 1.6053612458514168, "gen_logits_max": 2.763819694519043, "gen_logits_mean": -18.06797981262207, "gen_logits_min": -30.041807174682617, "gen_logits_std": 3.2955281734466553, "gen_loss": 0.27502357959747314, "grad_norm": 0.33525968658475563, "learning_rate": 2.1001263157894738e-05, "loss": 0.2767, "mean_copy_accuracy": 0.996369332075119, "mean_gen_accuracy": 0.8769548833370209, "mean_token_accuracy": 0.9057538658380508, "num_tokens": 1047572165.0, "sample_num_tokens": 9782.75, "step": 7860, "total_num_tokens": 1047611296.0, "z_loss": 0.0004650181799661368 }, { "copy_logits_max": -4.5642828941345215, "copy_logits_min": -687500032.0, "copy_num_tokens": 753.625, "epoch": 1.6055654837886137, "gen_logits_max": 3.82206654548645, "gen_logits_mean": -15.787529945373535, "gen_logits_min": -28.27375030517578, "gen_logits_std": 3.232232093811035, "gen_loss": 0.26315730810165405, "grad_norm": 0.39570200321866483, "learning_rate": 2.1e-05, "loss": 0.2618, "mean_copy_accuracy": 0.997293546795845, "mean_gen_accuracy": 0.8857906013727188, "mean_token_accuracy": 0.913467064499855, "num_tokens": 1047873459.0, "sample_num_tokens": 10815.75, "step": 7861, "total_num_tokens": 1047916722.0, "z_loss": 0.00046422926243394613 }, { "copy_logits_max": -6.170265197753906, "copy_logits_min": -750000000.0, "copy_num_tokens": 262.25, "epoch": 1.6057697217258107, "gen_logits_max": 3.6484761238098145, "gen_logits_mean": -17.376663208007812, "gen_logits_min": -29.225933074951172, "gen_logits_std": 3.2591798305511475, "gen_loss": 0.3151930570602417, "grad_norm": 0.39390062192362413, "learning_rate": 2.0998736842105263e-05, "loss": 0.2962, "mean_copy_accuracy": 0.994712308049202, "mean_gen_accuracy": 0.8732081204652786, "mean_token_accuracy": 0.8989462107419968, "num_tokens": 1048136429.0, "sample_num_tokens": 6607.25, "step": 7862, "total_num_tokens": 1048162858.0, "z_loss": 0.0004436856252141297 }, { "copy_logits_max": 1.574849247932434, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.3125, "epoch": 1.6059739596630074, "gen_logits_max": 5.064844608306885, "gen_logits_mean": -15.067268371582031, "gen_logits_min": -27.223447799682617, "gen_logits_std": 3.19219708442688, "gen_loss": 0.31389355659484863, "grad_norm": 0.3951621228377069, "learning_rate": 2.0997473684210524e-05, "loss": 0.2946, "mean_copy_accuracy": 0.9963784068822861, "mean_gen_accuracy": 0.8743106126785278, "mean_token_accuracy": 0.8987827152013779, "num_tokens": 1048386372.0, "sample_num_tokens": 8228.5, "step": 7863, "total_num_tokens": 1048419286.0, "z_loss": 0.0004970574518665671 }, { "copy_logits_max": -3.4788875579833984, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.4375, "epoch": 1.606178197600204, "gen_logits_max": 3.886948347091675, "gen_logits_mean": -16.693553924560547, "gen_logits_min": -29.22004508972168, "gen_logits_std": 3.2743067741394043, "gen_loss": 0.2774701714515686, "grad_norm": 0.35457076326933334, "learning_rate": 2.099621052631579e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9968500435352325, "mean_gen_accuracy": 0.8789511173963547, "mean_token_accuracy": 0.9082440584897995, "num_tokens": 1048674272.0, "sample_num_tokens": 8765.5, "step": 7864, "total_num_tokens": 1048709334.0, "z_loss": 0.00047673674998804927 }, { "copy_logits_max": 0.043075233697891235, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.9375, "epoch": 1.6063824355374012, "gen_logits_max": 3.757688045501709, "gen_logits_mean": -16.184803009033203, "gen_logits_min": -28.667224884033203, "gen_logits_std": 3.268965721130371, "gen_loss": 0.29813945293426514, "grad_norm": 0.3500411728608418, "learning_rate": 2.0994947368421056e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9962729811668396, "mean_gen_accuracy": 0.8793741017580032, "mean_token_accuracy": 0.9078971892595291, "num_tokens": 1048941780.0, "sample_num_tokens": 7794.0, "step": 7865, "total_num_tokens": 1048972956.0, "z_loss": 0.0005327281542122364 }, { "copy_logits_max": -3.3625636100769043, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.875, "epoch": 1.606586673474598, "gen_logits_max": 3.3328354358673096, "gen_logits_mean": -17.63692283630371, "gen_logits_min": -30.022239685058594, "gen_logits_std": 3.3141069412231445, "gen_loss": 0.24574801325798035, "grad_norm": 0.3990215753342925, "learning_rate": 2.0993684210526317e-05, "loss": 0.2848, "mean_copy_accuracy": 0.9962436258792877, "mean_gen_accuracy": 0.8750389516353607, "mean_token_accuracy": 0.9014967381954193, "num_tokens": 1049183871.0, "sample_num_tokens": 7184.25, "step": 7866, "total_num_tokens": 1049212608.0, "z_loss": 0.0004045104724355042 }, { "copy_logits_max": 0.03141963481903076, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.0625, "epoch": 1.6067909114117946, "gen_logits_max": 4.733665466308594, "gen_logits_mean": -16.030214309692383, "gen_logits_min": -28.617774963378906, "gen_logits_std": 3.245479106903076, "gen_loss": 0.26229003071784973, "grad_norm": 0.36401313209791347, "learning_rate": 2.099242105263158e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9959010034799576, "mean_gen_accuracy": 0.8791246116161346, "mean_token_accuracy": 0.9098092168569565, "num_tokens": 1049467511.0, "sample_num_tokens": 9434.25, "step": 7867, "total_num_tokens": 1049505248.0, "z_loss": 0.00041681522270664573 }, { "copy_logits_max": -4.616650581359863, "copy_logits_min": -687500032.0, "copy_num_tokens": 634.0625, "epoch": 1.6069951493489916, "gen_logits_max": 2.2779130935668945, "gen_logits_mean": -18.19136619567871, "gen_logits_min": -30.529542922973633, "gen_logits_std": 3.3343372344970703, "gen_loss": 0.2770616114139557, "grad_norm": 0.3515074493931415, "learning_rate": 2.0991157894736842e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9967412352561951, "mean_gen_accuracy": 0.8735101968050003, "mean_token_accuracy": 0.905343621969223, "num_tokens": 1049733354.0, "sample_num_tokens": 9470.5, "step": 7868, "total_num_tokens": 1049771236.0, "z_loss": 0.0004698190314229578 }, { "copy_logits_max": -2.885789632797241, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.6875, "epoch": 1.6071993872861885, "gen_logits_max": 2.5531058311462402, "gen_logits_mean": -18.143354415893555, "gen_logits_min": -30.427539825439453, "gen_logits_std": 3.340439796447754, "gen_loss": 0.289833128452301, "grad_norm": 0.33042921460228725, "learning_rate": 2.0989894736842107e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9977254420518875, "mean_gen_accuracy": 0.8746804147958755, "mean_token_accuracy": 0.9068184942007065, "num_tokens": 1050030965.0, "sample_num_tokens": 8340.25, "step": 7869, "total_num_tokens": 1050064326.0, "z_loss": 0.0004927380359731615 }, { "copy_logits_max": -3.7739906311035156, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.6875, "epoch": 1.6074036252233852, "gen_logits_max": 3.1082468032836914, "gen_logits_mean": -17.114093780517578, "gen_logits_min": -29.20550537109375, "gen_logits_std": 3.2837510108947754, "gen_loss": 0.2947046160697937, "grad_norm": 0.35181633022599207, "learning_rate": 2.0988631578947367e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9970985502004623, "mean_gen_accuracy": 0.871269628405571, "mean_token_accuracy": 0.9013250917196274, "num_tokens": 1050309781.0, "sample_num_tokens": 7614.25, "step": 7870, "total_num_tokens": 1050340238.0, "z_loss": 0.00045254750875756145 }, { "copy_logits_max": -2.1964762210845947, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.25, "epoch": 1.6076078631605821, "gen_logits_max": 3.854769229888916, "gen_logits_mean": -16.448253631591797, "gen_logits_min": -28.975915908813477, "gen_logits_std": 3.291840076446533, "gen_loss": 0.2996964752674103, "grad_norm": 0.39219232812243177, "learning_rate": 2.0987368421052632e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9967537820339203, "mean_gen_accuracy": 0.8772697746753693, "mean_token_accuracy": 0.9057119488716125, "num_tokens": 1050560443.0, "sample_num_tokens": 9080.25, "step": 7871, "total_num_tokens": 1050596764.0, "z_loss": 0.0004948027199134231 }, { "copy_logits_max": -4.649052143096924, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.0625, "epoch": 1.607812101097779, "gen_logits_max": 3.460700035095215, "gen_logits_mean": -17.342472076416016, "gen_logits_min": -29.851409912109375, "gen_logits_std": 3.316692352294922, "gen_loss": 0.29659944772720337, "grad_norm": 0.3659307764166434, "learning_rate": 2.0986105263157893e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9961389452219009, "mean_gen_accuracy": 0.8712555468082428, "mean_token_accuracy": 0.90056012570858, "num_tokens": 1050837912.0, "sample_num_tokens": 9163.5, "step": 7872, "total_num_tokens": 1050874566.0, "z_loss": 0.00051524443551898 }, { "copy_logits_max": -3.7111144065856934, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.125, "epoch": 1.6080163390349758, "gen_logits_max": 4.2831315994262695, "gen_logits_mean": -15.993396759033203, "gen_logits_min": -28.056594848632812, "gen_logits_std": 3.2778682708740234, "gen_loss": 0.2822650671005249, "grad_norm": 0.36741855936932655, "learning_rate": 2.098484210526316e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9968470185995102, "mean_gen_accuracy": 0.8749644011259079, "mean_token_accuracy": 0.906451478600502, "num_tokens": 1051104336.0, "sample_num_tokens": 8171.0, "step": 7873, "total_num_tokens": 1051137020.0, "z_loss": 0.00044580549001693726 }, { "copy_logits_max": -4.825397491455078, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.8125, "epoch": 1.6082205769721725, "gen_logits_max": 3.286621570587158, "gen_logits_mean": -17.672019958496094, "gen_logits_min": -29.771385192871094, "gen_logits_std": 3.310631275177002, "gen_loss": 0.29507023096084595, "grad_norm": 0.36678476621905803, "learning_rate": 2.098357894736842e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9972963482141495, "mean_gen_accuracy": 0.8738609701395035, "mean_token_accuracy": 0.9062102437019348, "num_tokens": 1051365201.0, "sample_num_tokens": 8642.75, "step": 7874, "total_num_tokens": 1051399772.0, "z_loss": 0.0004955419572070241 }, { "copy_logits_max": -3.843245506286621, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.9375, "epoch": 1.6084248149093694, "gen_logits_max": 3.812842607498169, "gen_logits_mean": -15.569395065307617, "gen_logits_min": -27.9592227935791, "gen_logits_std": 3.266042709350586, "gen_loss": 0.25032928586006165, "grad_norm": 0.41821613781052125, "learning_rate": 2.0982315789473686e-05, "loss": 0.2763, "mean_copy_accuracy": 0.996397003531456, "mean_gen_accuracy": 0.8774895071983337, "mean_token_accuracy": 0.9048750102519989, "num_tokens": 1051620231.0, "sample_num_tokens": 9343.25, "step": 7875, "total_num_tokens": 1051657604.0, "z_loss": 0.00040100410114973783 }, { "copy_logits_max": -4.280548095703125, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.6875, "epoch": 1.6086290528465663, "gen_logits_max": 4.066308975219727, "gen_logits_mean": -16.441768646240234, "gen_logits_min": -28.486730575561523, "gen_logits_std": 3.2893176078796387, "gen_loss": 0.32051101326942444, "grad_norm": 0.3524861378239593, "learning_rate": 2.0981052631578947e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9969015419483185, "mean_gen_accuracy": 0.8687741905450821, "mean_token_accuracy": 0.9011108875274658, "num_tokens": 1051914547.0, "sample_num_tokens": 9102.25, "step": 7876, "total_num_tokens": 1051950956.0, "z_loss": 0.0005199499428272247 }, { "copy_logits_max": -5.397461414337158, "copy_logits_min": -687500032.0, "copy_num_tokens": 591.125, "epoch": 1.608833290783763, "gen_logits_max": 3.667268991470337, "gen_logits_mean": -15.502415657043457, "gen_logits_min": -28.031652450561523, "gen_logits_std": 3.3097805976867676, "gen_loss": 0.24092458188533783, "grad_norm": 0.3642818686120525, "learning_rate": 2.097978947368421e-05, "loss": 0.277, "mean_copy_accuracy": 0.9968120008707047, "mean_gen_accuracy": 0.8800683468580246, "mean_token_accuracy": 0.9073712229728699, "num_tokens": 1052176342.0, "sample_num_tokens": 9277.5, "step": 7877, "total_num_tokens": 1052213452.0, "z_loss": 0.0004207261954434216 }, { "copy_logits_max": -5.093253135681152, "copy_logits_min": -625000064.0, "copy_num_tokens": 709.1875, "epoch": 1.60903752872096, "gen_logits_max": 2.1089539527893066, "gen_logits_mean": -18.142818450927734, "gen_logits_min": -30.846023559570312, "gen_logits_std": 3.3875036239624023, "gen_loss": 0.23035278916358948, "grad_norm": 0.3439904254037956, "learning_rate": 2.0978526315789475e-05, "loss": 0.244, "mean_copy_accuracy": 0.997660830616951, "mean_gen_accuracy": 0.8832852095365524, "mean_token_accuracy": 0.9169855117797852, "num_tokens": 1052468663.0, "sample_num_tokens": 9569.75, "step": 7878, "total_num_tokens": 1052506942.0, "z_loss": 0.00043567409738898277 }, { "copy_logits_max": -2.3313443660736084, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.5, "epoch": 1.6092417666581569, "gen_logits_max": 4.672040939331055, "gen_logits_mean": -16.178306579589844, "gen_logits_min": -28.223224639892578, "gen_logits_std": 3.2847776412963867, "gen_loss": 0.27666881680488586, "grad_norm": 0.34911098771351023, "learning_rate": 2.0977263157894736e-05, "loss": 0.295, "mean_copy_accuracy": 0.9971723109483719, "mean_gen_accuracy": 0.8693723827600479, "mean_token_accuracy": 0.8998271226882935, "num_tokens": 1052741274.0, "sample_num_tokens": 7665.0, "step": 7879, "total_num_tokens": 1052771934.0, "z_loss": 0.0004467919352464378 }, { "copy_logits_max": -4.104887962341309, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.5, "epoch": 1.6094460045953536, "gen_logits_max": 4.121603965759277, "gen_logits_mean": -15.647956848144531, "gen_logits_min": -27.90768814086914, "gen_logits_std": 3.2815091609954834, "gen_loss": 0.24870967864990234, "grad_norm": 0.3515752496505385, "learning_rate": 2.0976e-05, "loss": 0.2759, "mean_copy_accuracy": 0.997918426990509, "mean_gen_accuracy": 0.8709468394517899, "mean_token_accuracy": 0.9066347777843475, "num_tokens": 1053032659.0, "sample_num_tokens": 7185.25, "step": 7880, "total_num_tokens": 1053061400.0, "z_loss": 0.0004288841155357659 }, { "copy_logits_max": -4.007800579071045, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.875, "epoch": 1.6096502425325503, "gen_logits_max": 3.4279866218566895, "gen_logits_mean": -17.08625030517578, "gen_logits_min": -29.154193878173828, "gen_logits_std": 3.341475248336792, "gen_loss": 0.2789137363433838, "grad_norm": 0.3480133906603749, "learning_rate": 2.0974736842105265e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9972919672727585, "mean_gen_accuracy": 0.88343945145607, "mean_token_accuracy": 0.9116585552692413, "num_tokens": 1053292813.0, "sample_num_tokens": 8779.75, "step": 7881, "total_num_tokens": 1053327932.0, "z_loss": 0.00045458131353370845 }, { "copy_logits_max": -5.323864459991455, "copy_logits_min": -750000064.0, "copy_num_tokens": 444.5, "epoch": 1.6098544804697472, "gen_logits_max": 3.2897109985351562, "gen_logits_mean": -18.25467300415039, "gen_logits_min": -30.242904663085938, "gen_logits_std": 3.349360704421997, "gen_loss": 0.3148232400417328, "grad_norm": 0.32649225246708713, "learning_rate": 2.097347368421053e-05, "loss": 0.283, "mean_copy_accuracy": 0.997159019112587, "mean_gen_accuracy": 0.875381663441658, "mean_token_accuracy": 0.9026327580213547, "num_tokens": 1053566333.0, "sample_num_tokens": 8629.25, "step": 7882, "total_num_tokens": 1053600850.0, "z_loss": 0.000518559361808002 }, { "copy_logits_max": -5.051663398742676, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.1875, "epoch": 1.6100587184069441, "gen_logits_max": 3.8479092121124268, "gen_logits_mean": -17.086977005004883, "gen_logits_min": -29.451257705688477, "gen_logits_std": 3.336646795272827, "gen_loss": 0.2866734564304352, "grad_norm": 0.33201187774639374, "learning_rate": 2.097221052631579e-05, "loss": 0.273, "mean_copy_accuracy": 0.997055396437645, "mean_gen_accuracy": 0.8771016150712967, "mean_token_accuracy": 0.9057772904634476, "num_tokens": 1053841731.0, "sample_num_tokens": 8800.25, "step": 7883, "total_num_tokens": 1053876932.0, "z_loss": 0.0004575749335344881 }, { "copy_logits_max": -1.9163730144500732, "copy_logits_min": -750000064.0, "copy_num_tokens": 367.875, "epoch": 1.6102629563441409, "gen_logits_max": 4.380164623260498, "gen_logits_mean": -15.707404136657715, "gen_logits_min": -28.206039428710938, "gen_logits_std": 3.304490566253662, "gen_loss": 0.2516319751739502, "grad_norm": 0.3382619359979342, "learning_rate": 2.0970947368421054e-05, "loss": 0.248, "mean_copy_accuracy": 0.9983272105455399, "mean_gen_accuracy": 0.8864018619060516, "mean_token_accuracy": 0.915311336517334, "num_tokens": 1054141378.0, "sample_num_tokens": 7351.0, "step": 7884, "total_num_tokens": 1054170782.0, "z_loss": 0.00042781897354871035 }, { "copy_logits_max": -4.574835777282715, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.25, "epoch": 1.6104671942813378, "gen_logits_max": 3.874908447265625, "gen_logits_mean": -16.700180053710938, "gen_logits_min": -28.841279983520508, "gen_logits_std": 3.3133575916290283, "gen_loss": 0.27872443199157715, "grad_norm": 0.3786030164028577, "learning_rate": 2.0969684210526315e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9970797151327133, "mean_gen_accuracy": 0.8805843144655228, "mean_token_accuracy": 0.9066844135522842, "num_tokens": 1054389311.0, "sample_num_tokens": 8081.75, "step": 7885, "total_num_tokens": 1054421638.0, "z_loss": 0.00043802213622257113 }, { "copy_logits_max": -5.1968183517456055, "copy_logits_min": -750000128.0, "copy_num_tokens": 274.0625, "epoch": 1.6106714322185347, "gen_logits_max": 4.1546430587768555, "gen_logits_mean": -17.44668960571289, "gen_logits_min": -29.590471267700195, "gen_logits_std": 3.347301959991455, "gen_loss": 0.2725173830986023, "grad_norm": 0.3327691362640886, "learning_rate": 2.096842105263158e-05, "loss": 0.268, "mean_copy_accuracy": 0.9970874786376953, "mean_gen_accuracy": 0.8832093924283981, "mean_token_accuracy": 0.9077228307723999, "num_tokens": 1054653033.0, "sample_num_tokens": 7245.25, "step": 7886, "total_num_tokens": 1054682014.0, "z_loss": 0.0004281663568690419 }, { "copy_logits_max": -1.5999014377593994, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.4375, "epoch": 1.6108756701557314, "gen_logits_max": 4.633591651916504, "gen_logits_mean": -14.964978218078613, "gen_logits_min": -28.0834903717041, "gen_logits_std": 3.277174949645996, "gen_loss": 0.30732661485671997, "grad_norm": 0.368024316723757, "learning_rate": 2.096715789473684e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9973347932100296, "mean_gen_accuracy": 0.8718235343694687, "mean_token_accuracy": 0.9035723954439163, "num_tokens": 1054928820.0, "sample_num_tokens": 7619.0, "step": 7887, "total_num_tokens": 1054959296.0, "z_loss": 0.0005166373448446393 }, { "copy_logits_max": -1.6614019870758057, "copy_logits_min": -750000000.0, "copy_num_tokens": 637.5, "epoch": 1.6110799080929281, "gen_logits_max": 3.3729724884033203, "gen_logits_mean": -16.631040573120117, "gen_logits_min": -29.649721145629883, "gen_logits_std": 3.343651056289673, "gen_loss": 0.25800928473472595, "grad_norm": 0.3406531082912278, "learning_rate": 2.0965894736842105e-05, "loss": 0.2474, "mean_copy_accuracy": 0.9973613917827606, "mean_gen_accuracy": 0.8855865150690079, "mean_token_accuracy": 0.916339248418808, "num_tokens": 1055203149.0, "sample_num_tokens": 8913.75, "step": 7888, "total_num_tokens": 1055238804.0, "z_loss": 0.0003955347347073257 }, { "copy_logits_max": -5.417309761047363, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.0625, "epoch": 1.611284146030125, "gen_logits_max": 4.382856845855713, "gen_logits_mean": -15.907441139221191, "gen_logits_min": -29.13735580444336, "gen_logits_std": 3.3147644996643066, "gen_loss": 0.26701831817626953, "grad_norm": 0.3370659404975708, "learning_rate": 2.096463157894737e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9973197430372238, "mean_gen_accuracy": 0.8803548663854599, "mean_token_accuracy": 0.9082406610250473, "num_tokens": 1055475826.0, "sample_num_tokens": 7544.5, "step": 7889, "total_num_tokens": 1055506004.0, "z_loss": 0.000434412679169327 }, { "copy_logits_max": -4.306480407714844, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.125, "epoch": 1.611488383967322, "gen_logits_max": 3.486506938934326, "gen_logits_mean": -17.74408721923828, "gen_logits_min": -30.125293731689453, "gen_logits_std": 3.350942611694336, "gen_loss": 0.27629411220550537, "grad_norm": 0.4140712808700915, "learning_rate": 2.0963368421052633e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9973113685846329, "mean_gen_accuracy": 0.8782334178686142, "mean_token_accuracy": 0.9075639098882675, "num_tokens": 1055737636.0, "sample_num_tokens": 7709.0, "step": 7890, "total_num_tokens": 1055768472.0, "z_loss": 0.0004824755305889994 }, { "copy_logits_max": -3.7069759368896484, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.9375, "epoch": 1.6116926219045187, "gen_logits_max": 3.144857406616211, "gen_logits_mean": -16.88582420349121, "gen_logits_min": -29.090274810791016, "gen_logits_std": 3.328376054763794, "gen_loss": 0.29058656096458435, "grad_norm": 0.3929472690348217, "learning_rate": 2.0962105263157898e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9974910169839859, "mean_gen_accuracy": 0.875017061829567, "mean_token_accuracy": 0.9046147465705872, "num_tokens": 1056003029.0, "sample_num_tokens": 9923.25, "step": 7891, "total_num_tokens": 1056042722.0, "z_loss": 0.00047281227307394147 }, { "copy_logits_max": -4.139577865600586, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.9375, "epoch": 1.6118968598417156, "gen_logits_max": 3.618450164794922, "gen_logits_mean": -16.786163330078125, "gen_logits_min": -28.789230346679688, "gen_logits_std": 3.312869071960449, "gen_loss": 0.3145100176334381, "grad_norm": 0.3746978652602566, "learning_rate": 2.096084210526316e-05, "loss": 0.284, "mean_copy_accuracy": 0.9980465769767761, "mean_gen_accuracy": 0.8747012168169022, "mean_token_accuracy": 0.9066474139690399, "num_tokens": 1056287355.0, "sample_num_tokens": 7935.75, "step": 7892, "total_num_tokens": 1056319098.0, "z_loss": 0.0004859042528551072 }, { "copy_logits_max": -7.459068298339844, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.0, "epoch": 1.6121010977789125, "gen_logits_max": 4.50064754486084, "gen_logits_mean": -15.853378295898438, "gen_logits_min": -28.25072479248047, "gen_logits_std": 3.3200314044952393, "gen_loss": 0.25932377576828003, "grad_norm": 0.39420501653421547, "learning_rate": 2.0959578947368423e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9968273639678955, "mean_gen_accuracy": 0.8699676692485809, "mean_token_accuracy": 0.9008422642946243, "num_tokens": 1056539600.0, "sample_num_tokens": 9060.0, "step": 7893, "total_num_tokens": 1056575840.0, "z_loss": 0.0004119699588045478 }, { "copy_logits_max": -4.122516632080078, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.625, "epoch": 1.6123053357161092, "gen_logits_max": 3.9979920387268066, "gen_logits_mean": -16.70586395263672, "gen_logits_min": -28.868877410888672, "gen_logits_std": 3.3117990493774414, "gen_loss": 0.30191659927368164, "grad_norm": 0.34352770429432555, "learning_rate": 2.0958315789473684e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9963647872209549, "mean_gen_accuracy": 0.8772810399532318, "mean_token_accuracy": 0.9057930707931519, "num_tokens": 1056840269.0, "sample_num_tokens": 8420.25, "step": 7894, "total_num_tokens": 1056873950.0, "z_loss": 0.0004650502814911306 }, { "copy_logits_max": -3.1985890865325928, "copy_logits_min": -687500032.0, "copy_num_tokens": 287.125, "epoch": 1.612509573653306, "gen_logits_max": 3.695716381072998, "gen_logits_mean": -17.552181243896484, "gen_logits_min": -29.864831924438477, "gen_logits_std": 3.324009656906128, "gen_loss": 0.3231687843799591, "grad_norm": 0.3695409866681052, "learning_rate": 2.0957052631578948e-05, "loss": 0.3079, "mean_copy_accuracy": 0.9962850213050842, "mean_gen_accuracy": 0.869503915309906, "mean_token_accuracy": 0.8985274434089661, "num_tokens": 1057108876.0, "sample_num_tokens": 6758.5, "step": 7895, "total_num_tokens": 1057135910.0, "z_loss": 0.00048611394595354795 }, { "copy_logits_max": -3.755495071411133, "copy_logits_min": -687500032.0, "copy_num_tokens": 289.6875, "epoch": 1.612713811590503, "gen_logits_max": 3.9531641006469727, "gen_logits_mean": -17.561626434326172, "gen_logits_min": -29.693880081176758, "gen_logits_std": 3.3231024742126465, "gen_loss": 0.2968904972076416, "grad_norm": 0.37984935316905827, "learning_rate": 2.095578947368421e-05, "loss": 0.2791, "mean_copy_accuracy": 0.997045710682869, "mean_gen_accuracy": 0.8779963105916977, "mean_token_accuracy": 0.9050450474023819, "num_tokens": 1057363508.0, "sample_num_tokens": 6982.5, "step": 7896, "total_num_tokens": 1057391438.0, "z_loss": 0.0004817199660465121 }, { "copy_logits_max": -2.9213080406188965, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.125, "epoch": 1.6129180495276998, "gen_logits_max": 3.5808169841766357, "gen_logits_mean": -16.60256576538086, "gen_logits_min": -29.127681732177734, "gen_logits_std": 3.321866989135742, "gen_loss": 0.2716710865497589, "grad_norm": 0.39186025544147113, "learning_rate": 2.0954526315789477e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9967572838068008, "mean_gen_accuracy": 0.8784114569425583, "mean_token_accuracy": 0.9088175743818283, "num_tokens": 1057624313.0, "sample_num_tokens": 8893.25, "step": 7897, "total_num_tokens": 1057659886.0, "z_loss": 0.0004745326586998999 }, { "copy_logits_max": -5.160752296447754, "copy_logits_min": -687500032.0, "copy_num_tokens": 611.625, "epoch": 1.6131222874648965, "gen_logits_max": 3.9428067207336426, "gen_logits_mean": -15.496382713317871, "gen_logits_min": -28.098186492919922, "gen_logits_std": 3.2627367973327637, "gen_loss": 0.2610991597175598, "grad_norm": 0.336067590615368, "learning_rate": 2.0953263157894738e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9976662397384644, "mean_gen_accuracy": 0.8786098957061768, "mean_token_accuracy": 0.9100830852985382, "num_tokens": 1057915358.0, "sample_num_tokens": 9674.5, "step": 7898, "total_num_tokens": 1057954056.0, "z_loss": 0.00041075405897572637 }, { "copy_logits_max": -4.420400619506836, "copy_logits_min": -750000064.0, "copy_num_tokens": 607.4375, "epoch": 1.6133265254020934, "gen_logits_max": 2.891254425048828, "gen_logits_mean": -17.816600799560547, "gen_logits_min": -30.297040939331055, "gen_logits_std": 3.35994291305542, "gen_loss": 0.2668567895889282, "grad_norm": 0.3655244236106413, "learning_rate": 2.0952000000000002e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9971831291913986, "mean_gen_accuracy": 0.8732752799987793, "mean_token_accuracy": 0.9061259180307388, "num_tokens": 1058183507.0, "sample_num_tokens": 9662.25, "step": 7899, "total_num_tokens": 1058222156.0, "z_loss": 0.000456215173471719 }, { "copy_logits_max": -5.547183990478516, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.625, "epoch": 1.6135307633392904, "gen_logits_max": 3.834982395172119, "gen_logits_mean": -16.996828079223633, "gen_logits_min": -29.424339294433594, "gen_logits_std": 3.304330825805664, "gen_loss": 0.2623301148414612, "grad_norm": 0.3735895663612365, "learning_rate": 2.0950736842105263e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9960108995437622, "mean_gen_accuracy": 0.883182168006897, "mean_token_accuracy": 0.9051848948001862, "num_tokens": 1058419999.0, "sample_num_tokens": 8854.25, "step": 7900, "total_num_tokens": 1058455416.0, "z_loss": 0.00040771113708615303 }, { "copy_logits_max": -2.3925130367279053, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.625, "epoch": 1.613735001276487, "gen_logits_max": 4.692111015319824, "gen_logits_mean": -14.742193222045898, "gen_logits_min": -28.0826473236084, "gen_logits_std": 3.267580986022949, "gen_loss": 0.2716367542743683, "grad_norm": 0.6887536203800229, "learning_rate": 2.0949473684210527e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9971011281013489, "mean_gen_accuracy": 0.8673844039440155, "mean_token_accuracy": 0.9045589417219162, "num_tokens": 1058695319.0, "sample_num_tokens": 8938.75, "step": 7901, "total_num_tokens": 1058731074.0, "z_loss": 0.0004891875432804227 }, { "copy_logits_max": -5.21555233001709, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.9375, "epoch": 1.613939239213684, "gen_logits_max": 5.165158748626709, "gen_logits_mean": -16.022672653198242, "gen_logits_min": -28.775657653808594, "gen_logits_std": 3.2787814140319824, "gen_loss": 0.23445427417755127, "grad_norm": 0.3679103129098196, "learning_rate": 2.0948210526315788e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9966347068548203, "mean_gen_accuracy": 0.8850406259298325, "mean_token_accuracy": 0.9083592146635056, "num_tokens": 1058970490.0, "sample_num_tokens": 8267.5, "step": 7902, "total_num_tokens": 1059003560.0, "z_loss": 0.0004073099698871374 }, { "copy_logits_max": -2.788055419921875, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.5625, "epoch": 1.614143477150881, "gen_logits_max": 5.060544967651367, "gen_logits_mean": -14.868292808532715, "gen_logits_min": -27.433961868286133, "gen_logits_std": 3.25079083442688, "gen_loss": 0.29905641078948975, "grad_norm": 0.3276309149084153, "learning_rate": 2.0946947368421052e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9965344071388245, "mean_gen_accuracy": 0.8841596096754074, "mean_token_accuracy": 0.9100654125213623, "num_tokens": 1059227280.0, "sample_num_tokens": 7584.0, "step": 7903, "total_num_tokens": 1059257616.0, "z_loss": 0.0004816734872292727 }, { "copy_logits_max": -5.168827056884766, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.6875, "epoch": 1.6143477150880776, "gen_logits_max": 4.744945526123047, "gen_logits_mean": -15.807095527648926, "gen_logits_min": -27.939027786254883, "gen_logits_std": 3.250950574874878, "gen_loss": 0.2750438153743744, "grad_norm": 0.3875024582003998, "learning_rate": 2.0945684210526313e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9959875196218491, "mean_gen_accuracy": 0.8746262192726135, "mean_token_accuracy": 0.9013899713754654, "num_tokens": 1059481843.0, "sample_num_tokens": 7704.75, "step": 7904, "total_num_tokens": 1059512662.0, "z_loss": 0.0004414910217747092 }, { "copy_logits_max": -4.247494697570801, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.875, "epoch": 1.6145519530252743, "gen_logits_max": 3.5468971729278564, "gen_logits_mean": -17.15770721435547, "gen_logits_min": -29.459566116333008, "gen_logits_std": 3.305727005004883, "gen_loss": 0.2707166075706482, "grad_norm": 0.38408040229505414, "learning_rate": 2.0944421052631578e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9969030767679214, "mean_gen_accuracy": 0.8677959591150284, "mean_token_accuracy": 0.8994526565074921, "num_tokens": 1059758782.0, "sample_num_tokens": 8005.5, "step": 7905, "total_num_tokens": 1059790804.0, "z_loss": 0.0004742298915516585 }, { "copy_logits_max": -1.791507363319397, "copy_logits_min": -750000064.0, "copy_num_tokens": 643.9375, "epoch": 1.6147561909624712, "gen_logits_max": 3.2582437992095947, "gen_logits_mean": -16.16279411315918, "gen_logits_min": -28.683958053588867, "gen_logits_std": 3.2718005180358887, "gen_loss": 0.2430771440267563, "grad_norm": 0.3431987532927418, "learning_rate": 2.0943157894736845e-05, "loss": 0.2527, "mean_copy_accuracy": 0.9979081749916077, "mean_gen_accuracy": 0.8812588304281235, "mean_token_accuracy": 0.9137883335351944, "num_tokens": 1060028537.0, "sample_num_tokens": 8921.75, "step": 7906, "total_num_tokens": 1060064224.0, "z_loss": 0.00042878161184489727 }, { "copy_logits_max": -4.774484634399414, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.6875, "epoch": 1.6149604288996682, "gen_logits_max": 4.481590747833252, "gen_logits_mean": -16.1257381439209, "gen_logits_min": -28.15427589416504, "gen_logits_std": 3.2598090171813965, "gen_loss": 0.25895237922668457, "grad_norm": 0.3328981119160466, "learning_rate": 2.0941894736842106e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9966604858636856, "mean_gen_accuracy": 0.882540762424469, "mean_token_accuracy": 0.9116054773330688, "num_tokens": 1060306629.0, "sample_num_tokens": 7798.75, "step": 7907, "total_num_tokens": 1060337824.0, "z_loss": 0.000492501596454531 }, { "copy_logits_max": -5.666726112365723, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.6875, "epoch": 1.6151646668368649, "gen_logits_max": 3.9249489307403564, "gen_logits_mean": -17.93982696533203, "gen_logits_min": -29.952224731445312, "gen_logits_std": 3.354316234588623, "gen_loss": 0.2795644998550415, "grad_norm": 0.350183444312014, "learning_rate": 2.094063157894737e-05, "loss": 0.2712, "mean_copy_accuracy": 0.99709652364254, "mean_gen_accuracy": 0.8798671662807465, "mean_token_accuracy": 0.9069645404815674, "num_tokens": 1060563296.0, "sample_num_tokens": 7097.0, "step": 7908, "total_num_tokens": 1060591684.0, "z_loss": 0.0004495979519560933 }, { "copy_logits_max": -3.520636558532715, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.25, "epoch": 1.6153689047740618, "gen_logits_max": 3.9609265327453613, "gen_logits_mean": -16.296058654785156, "gen_logits_min": -29.034221649169922, "gen_logits_std": 3.3014707565307617, "gen_loss": 0.2543131113052368, "grad_norm": 0.35044526418754185, "learning_rate": 2.093936842105263e-05, "loss": 0.2649, "mean_copy_accuracy": 0.997252881526947, "mean_gen_accuracy": 0.879470631480217, "mean_token_accuracy": 0.9128839820623398, "num_tokens": 1060859968.0, "sample_num_tokens": 9361.5, "step": 7909, "total_num_tokens": 1060897414.0, "z_loss": 0.0003934241540264338 }, { "copy_logits_max": -4.940017223358154, "copy_logits_min": -750000064.0, "copy_num_tokens": 425.1875, "epoch": 1.6155731427112587, "gen_logits_max": 5.761301040649414, "gen_logits_mean": -15.010542869567871, "gen_logits_min": -27.118427276611328, "gen_logits_std": 3.216249942779541, "gen_loss": 0.27446460723876953, "grad_norm": 0.3571316144156152, "learning_rate": 2.0938105263157896e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9957352876663208, "mean_gen_accuracy": 0.8794565200805664, "mean_token_accuracy": 0.905890479683876, "num_tokens": 1061134497.0, "sample_num_tokens": 8293.75, "step": 7910, "total_num_tokens": 1061167672.0, "z_loss": 0.00041992179467342794 }, { "copy_logits_max": -3.9179728031158447, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.625, "epoch": 1.6157773806484554, "gen_logits_max": 5.776985168457031, "gen_logits_mean": -14.295799255371094, "gen_logits_min": -26.365814208984375, "gen_logits_std": 3.2009730339050293, "gen_loss": 0.3469400703907013, "grad_norm": 0.37688498511180424, "learning_rate": 2.0936842105263157e-05, "loss": 0.2972, "mean_copy_accuracy": 0.9966076761484146, "mean_gen_accuracy": 0.8737421482801437, "mean_token_accuracy": 0.8995886743068695, "num_tokens": 1061394831.0, "sample_num_tokens": 8266.75, "step": 7911, "total_num_tokens": 1061427898.0, "z_loss": 0.0005116817192174494 }, { "copy_logits_max": -3.298823833465576, "copy_logits_min": -687500032.0, "copy_num_tokens": 546.25, "epoch": 1.6159816185856521, "gen_logits_max": 3.8281912803649902, "gen_logits_mean": -16.755599975585938, "gen_logits_min": -29.094715118408203, "gen_logits_std": 3.298567771911621, "gen_loss": 0.24715206027030945, "grad_norm": 0.36092030719722484, "learning_rate": 2.093557894736842e-05, "loss": 0.2559, "mean_copy_accuracy": 0.9973741173744202, "mean_gen_accuracy": 0.8850178271532059, "mean_token_accuracy": 0.9139778167009354, "num_tokens": 1061663340.0, "sample_num_tokens": 9093.0, "step": 7912, "total_num_tokens": 1061699712.0, "z_loss": 0.00038907575071789324 }, { "copy_logits_max": -2.2935240268707275, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.9375, "epoch": 1.616185856522849, "gen_logits_max": 3.9562716484069824, "gen_logits_mean": -16.89581298828125, "gen_logits_min": -28.98712158203125, "gen_logits_std": 3.2937123775482178, "gen_loss": 0.30018505454063416, "grad_norm": 0.3446432934808997, "learning_rate": 2.0934315789473682e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9967307299375534, "mean_gen_accuracy": 0.879945918917656, "mean_token_accuracy": 0.9092660248279572, "num_tokens": 1061935979.0, "sample_num_tokens": 7604.25, "step": 7913, "total_num_tokens": 1061966396.0, "z_loss": 0.00044422553037293255 }, { "copy_logits_max": -4.007551193237305, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.5625, "epoch": 1.616390094460046, "gen_logits_max": 4.448949813842773, "gen_logits_mean": -16.376205444335938, "gen_logits_min": -29.317949295043945, "gen_logits_std": 3.3101718425750732, "gen_loss": 0.31763720512390137, "grad_norm": 0.3433641536423914, "learning_rate": 2.093305263157895e-05, "loss": 0.2879, "mean_copy_accuracy": 0.997491791844368, "mean_gen_accuracy": 0.8724239468574524, "mean_token_accuracy": 0.9023363143205643, "num_tokens": 1062218333.0, "sample_num_tokens": 8784.75, "step": 7914, "total_num_tokens": 1062253472.0, "z_loss": 0.0004984582192264497 }, { "copy_logits_max": -2.7532401084899902, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.0625, "epoch": 1.6165943323972427, "gen_logits_max": 4.359124660491943, "gen_logits_mean": -14.989532470703125, "gen_logits_min": -27.32868766784668, "gen_logits_std": 3.248182773590088, "gen_loss": 0.26026299595832825, "grad_norm": 0.378745009999451, "learning_rate": 2.093178947368421e-05, "loss": 0.284, "mean_copy_accuracy": 0.9970883876085281, "mean_gen_accuracy": 0.8738122135400772, "mean_token_accuracy": 0.9040324538946152, "num_tokens": 1062489171.0, "sample_num_tokens": 9574.75, "step": 7915, "total_num_tokens": 1062527470.0, "z_loss": 0.0003843232989311218 }, { "copy_logits_max": -5.927779197692871, "copy_logits_min": -750000000.0, "copy_num_tokens": 688.0625, "epoch": 1.6167985703344396, "gen_logits_max": 3.4621694087982178, "gen_logits_mean": -17.161752700805664, "gen_logits_min": -29.334861755371094, "gen_logits_std": 3.3000407218933105, "gen_loss": 0.266046404838562, "grad_norm": 0.3590237588113895, "learning_rate": 2.0930526315789475e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9969208240509033, "mean_gen_accuracy": 0.8698188066482544, "mean_token_accuracy": 0.904192715883255, "num_tokens": 1062783843.0, "sample_num_tokens": 10340.75, "step": 7916, "total_num_tokens": 1062825206.0, "z_loss": 0.0003862605954054743 }, { "copy_logits_max": -3.3807764053344727, "copy_logits_min": -750000064.0, "copy_num_tokens": 509.5625, "epoch": 1.6170028082716366, "gen_logits_max": 4.251426696777344, "gen_logits_mean": -15.053916931152344, "gen_logits_min": -27.40338706970215, "gen_logits_std": 3.2570886611938477, "gen_loss": 0.24542030692100525, "grad_norm": 0.34938996772042286, "learning_rate": 2.0929263157894736e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9968787133693695, "mean_gen_accuracy": 0.8810890763998032, "mean_token_accuracy": 0.9110472351312637, "num_tokens": 1063067606.0, "sample_num_tokens": 8033.0, "step": 7917, "total_num_tokens": 1063099738.0, "z_loss": 0.00046955596189945936 }, { "copy_logits_max": -5.496075630187988, "copy_logits_min": -750000000.0, "copy_num_tokens": 221.3125, "epoch": 1.6172070462088333, "gen_logits_max": 5.275965690612793, "gen_logits_mean": -15.898208618164062, "gen_logits_min": -28.168373107910156, "gen_logits_std": 3.2692322731018066, "gen_loss": 0.2946851849555969, "grad_norm": 0.33776456250835235, "learning_rate": 2.0928e-05, "loss": 0.27, "mean_copy_accuracy": 0.9968959391117096, "mean_gen_accuracy": 0.8830189406871796, "mean_token_accuracy": 0.9069674611091614, "num_tokens": 1063344493.0, "sample_num_tokens": 6133.75, "step": 7918, "total_num_tokens": 1063369028.0, "z_loss": 0.000490501057356596 }, { "copy_logits_max": -3.569852828979492, "copy_logits_min": -750000064.0, "copy_num_tokens": 382.125, "epoch": 1.61741128414603, "gen_logits_max": 4.75295352935791, "gen_logits_mean": -17.095348358154297, "gen_logits_min": -28.94145965576172, "gen_logits_std": 3.303865909576416, "gen_loss": 0.29733777046203613, "grad_norm": 0.38112395265699467, "learning_rate": 2.0926736842105265e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9981738477945328, "mean_gen_accuracy": 0.8806636035442352, "mean_token_accuracy": 0.9127801358699799, "num_tokens": 1063600527.0, "sample_num_tokens": 8019.25, "step": 7919, "total_num_tokens": 1063632604.0, "z_loss": 0.0005099527770653367 }, { "copy_logits_max": -2.754542827606201, "copy_logits_min": -750000000.0, "copy_num_tokens": 683.5625, "epoch": 1.6176155220832271, "gen_logits_max": 3.7027201652526855, "gen_logits_mean": -16.42115020751953, "gen_logits_min": -29.593830108642578, "gen_logits_std": 3.3108794689178467, "gen_loss": 0.25310689210891724, "grad_norm": 0.3331570688478305, "learning_rate": 2.0925473684210525e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9971375018358231, "mean_gen_accuracy": 0.8756315261125565, "mean_token_accuracy": 0.9105152934789658, "num_tokens": 1063898555.0, "sample_num_tokens": 9191.25, "step": 7920, "total_num_tokens": 1063935320.0, "z_loss": 0.0004225789161864668 }, { "copy_logits_max": -3.4876646995544434, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.25, "epoch": 1.6178197600204238, "gen_logits_max": 4.6542840003967285, "gen_logits_mean": -15.966140747070312, "gen_logits_min": -28.209089279174805, "gen_logits_std": 3.279331922531128, "gen_loss": 0.26375359296798706, "grad_norm": 0.383482858505327, "learning_rate": 2.092421052631579e-05, "loss": 0.2724, "mean_copy_accuracy": 0.997150719165802, "mean_gen_accuracy": 0.8771715462207794, "mean_token_accuracy": 0.9083492010831833, "num_tokens": 1064160458.0, "sample_num_tokens": 7686.5, "step": 7921, "total_num_tokens": 1064191204.0, "z_loss": 0.000440408504800871 }, { "copy_logits_max": -3.7455811500549316, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.9375, "epoch": 1.6180239979576205, "gen_logits_max": 6.296173572540283, "gen_logits_mean": -14.393413543701172, "gen_logits_min": -26.777450561523438, "gen_logits_std": 3.2419567108154297, "gen_loss": 0.2438965141773224, "grad_norm": 0.36468640290954685, "learning_rate": 2.0922947368421054e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9959707707166672, "mean_gen_accuracy": 0.8840577155351639, "mean_token_accuracy": 0.9087445139884949, "num_tokens": 1064415530.0, "sample_num_tokens": 8216.5, "step": 7922, "total_num_tokens": 1064448396.0, "z_loss": 0.0004126482817810029 }, { "copy_logits_max": -1.4766921997070312, "copy_logits_min": -687500032.0, "copy_num_tokens": 644.75, "epoch": 1.6182282358948175, "gen_logits_max": 4.6436285972595215, "gen_logits_mean": -15.210914611816406, "gen_logits_min": -27.665998458862305, "gen_logits_std": 3.2813925743103027, "gen_loss": 0.27698245644569397, "grad_norm": 0.3441437557287859, "learning_rate": 2.092168421052632e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9959143102169037, "mean_gen_accuracy": 0.8787740916013718, "mean_token_accuracy": 0.9068858176469803, "num_tokens": 1064686915.0, "sample_num_tokens": 9937.25, "step": 7923, "total_num_tokens": 1064726664.0, "z_loss": 0.00048145424807444215 }, { "copy_logits_max": -5.948543548583984, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.5625, "epoch": 1.6184324738320144, "gen_logits_max": 3.563917398452759, "gen_logits_mean": -18.39980125427246, "gen_logits_min": -30.586376190185547, "gen_logits_std": 3.366016387939453, "gen_loss": 0.28126856684684753, "grad_norm": 0.3674720987631626, "learning_rate": 2.092042105263158e-05, "loss": 0.2801, "mean_copy_accuracy": 0.997711107134819, "mean_gen_accuracy": 0.8767937868833542, "mean_token_accuracy": 0.9042498767375946, "num_tokens": 1064955407.0, "sample_num_tokens": 8939.25, "step": 7924, "total_num_tokens": 1064991164.0, "z_loss": 0.00049593549920246 }, { "copy_logits_max": -7.480238437652588, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.75, "epoch": 1.618636711769211, "gen_logits_max": 3.714634418487549, "gen_logits_mean": -18.332794189453125, "gen_logits_min": -30.05529022216797, "gen_logits_std": 3.343984603881836, "gen_loss": 0.25694310665130615, "grad_norm": 0.35716451086074535, "learning_rate": 2.0919157894736844e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9971271604299545, "mean_gen_accuracy": 0.8746492266654968, "mean_token_accuracy": 0.9061234295368195, "num_tokens": 1065229453.0, "sample_num_tokens": 7920.25, "step": 7925, "total_num_tokens": 1065261134.0, "z_loss": 0.0004201349802315235 }, { "copy_logits_max": -5.528904438018799, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.5, "epoch": 1.618840949706408, "gen_logits_max": 3.9456276893615723, "gen_logits_mean": -18.410520553588867, "gen_logits_min": -30.021442413330078, "gen_logits_std": 3.3107123374938965, "gen_loss": 0.2885473966598511, "grad_norm": 0.3605890697564501, "learning_rate": 2.0917894736842105e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9970492571592331, "mean_gen_accuracy": 0.8875905722379684, "mean_token_accuracy": 0.9078553467988968, "num_tokens": 1065477406.0, "sample_num_tokens": 7834.0, "step": 7926, "total_num_tokens": 1065508742.0, "z_loss": 0.0004628462775144726 }, { "copy_logits_max": -0.4982627332210541, "copy_logits_min": -750000064.0, "copy_num_tokens": 654.25, "epoch": 1.619045187643605, "gen_logits_max": 3.2806448936462402, "gen_logits_mean": -17.126506805419922, "gen_logits_min": -29.782672882080078, "gen_logits_std": 3.322136402130127, "gen_loss": 0.2603607475757599, "grad_norm": 0.3386170856213365, "learning_rate": 2.091663157894737e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9970765113830566, "mean_gen_accuracy": 0.8747348189353943, "mean_token_accuracy": 0.9055652618408203, "num_tokens": 1065744916.0, "sample_num_tokens": 9408.5, "step": 7927, "total_num_tokens": 1065782550.0, "z_loss": 0.0004818916495423764 }, { "copy_logits_max": 0.0686144232749939, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.625, "epoch": 1.6192494255808016, "gen_logits_max": 5.80712366104126, "gen_logits_mean": -13.555825233459473, "gen_logits_min": -26.142118453979492, "gen_logits_std": 3.2002623081207275, "gen_loss": 0.24987691640853882, "grad_norm": 0.3438776234483432, "learning_rate": 2.091536842105263e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9970155954360962, "mean_gen_accuracy": 0.8758348226547241, "mean_token_accuracy": 0.9093438237905502, "num_tokens": 1066026343.0, "sample_num_tokens": 8269.25, "step": 7928, "total_num_tokens": 1066059420.0, "z_loss": 0.000445665413280949 }, { "copy_logits_max": -1.7639436721801758, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.4375, "epoch": 1.6194536635179984, "gen_logits_max": 4.24713134765625, "gen_logits_mean": -15.641260147094727, "gen_logits_min": -27.832595825195312, "gen_logits_std": 3.2516980171203613, "gen_loss": 0.29358404874801636, "grad_norm": 0.33868515752099815, "learning_rate": 2.0914105263157894e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9961969256401062, "mean_gen_accuracy": 0.8777129501104355, "mean_token_accuracy": 0.9055501371622086, "num_tokens": 1066281324.0, "sample_num_tokens": 7857.0, "step": 7929, "total_num_tokens": 1066312752.0, "z_loss": 0.0005395475309342146 }, { "copy_logits_max": -2.692382335662842, "copy_logits_min": -750000128.0, "copy_num_tokens": 463.8125, "epoch": 1.6196579014551953, "gen_logits_max": 4.452584266662598, "gen_logits_mean": -14.845160484313965, "gen_logits_min": -26.648378372192383, "gen_logits_std": 3.1895289421081543, "gen_loss": 0.2444271445274353, "grad_norm": 0.4231340413879936, "learning_rate": 2.091284210526316e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9972567409276962, "mean_gen_accuracy": 0.8722838312387466, "mean_token_accuracy": 0.8990017622709274, "num_tokens": 1066564827.0, "sample_num_tokens": 8639.25, "step": 7930, "total_num_tokens": 1066599384.0, "z_loss": 0.00039077294059097767 }, { "copy_logits_max": -2.295444965362549, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.75, "epoch": 1.6198621393923922, "gen_logits_max": 3.9344184398651123, "gen_logits_mean": -16.784259796142578, "gen_logits_min": -28.305274963378906, "gen_logits_std": 3.262044668197632, "gen_loss": 0.3300377130508423, "grad_norm": 0.32472783747764516, "learning_rate": 2.0911578947368423e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9977887570858002, "mean_gen_accuracy": 0.8710728585720062, "mean_token_accuracy": 0.9050624221563339, "num_tokens": 1066841765.0, "sample_num_tokens": 8793.75, "step": 7931, "total_num_tokens": 1066876940.0, "z_loss": 0.0005389775615185499 }, { "copy_logits_max": -2.2552521228790283, "copy_logits_min": -750000064.0, "copy_num_tokens": 403.375, "epoch": 1.620066377329589, "gen_logits_max": 4.933486461639404, "gen_logits_mean": -16.092275619506836, "gen_logits_min": -28.28643226623535, "gen_logits_std": 3.2726898193359375, "gen_loss": 0.2820366621017456, "grad_norm": 0.38232435239805324, "learning_rate": 2.0910315789473687e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9964839220046997, "mean_gen_accuracy": 0.8707098960876465, "mean_token_accuracy": 0.9009346663951874, "num_tokens": 1067104876.0, "sample_num_tokens": 7931.5, "step": 7932, "total_num_tokens": 1067136602.0, "z_loss": 0.0004523946554400027 }, { "copy_logits_max": -1.5632084608078003, "copy_logits_min": -687500032.0, "copy_num_tokens": 413.8125, "epoch": 1.6202706152667858, "gen_logits_max": 5.281302452087402, "gen_logits_mean": -15.222347259521484, "gen_logits_min": -27.62417984008789, "gen_logits_std": 3.2507102489471436, "gen_loss": 0.2565048933029175, "grad_norm": 0.4077859630083063, "learning_rate": 2.0909052631578948e-05, "loss": 0.2998, "mean_copy_accuracy": 0.9961639940738678, "mean_gen_accuracy": 0.8735156208276749, "mean_token_accuracy": 0.8977860510349274, "num_tokens": 1067350053.0, "sample_num_tokens": 7610.25, "step": 7933, "total_num_tokens": 1067380494.0, "z_loss": 0.0003970840771216899 }, { "copy_logits_max": -1.2708311080932617, "copy_logits_min": -625000064.0, "copy_num_tokens": 638.4375, "epoch": 1.6204748532039828, "gen_logits_max": 4.479102611541748, "gen_logits_mean": -15.002164840698242, "gen_logits_min": -27.437976837158203, "gen_logits_std": 3.2544679641723633, "gen_loss": 0.2821961045265198, "grad_norm": 0.34617387714208075, "learning_rate": 2.0907789473684212e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9970192164182663, "mean_gen_accuracy": 0.8677738755941391, "mean_token_accuracy": 0.902869239449501, "num_tokens": 1067618288.0, "sample_num_tokens": 9312.5, "step": 7934, "total_num_tokens": 1067655538.0, "z_loss": 0.0004434300062712282 }, { "copy_logits_max": -4.7772626876831055, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.4375, "epoch": 1.6206790911411795, "gen_logits_max": 3.934175729751587, "gen_logits_mean": -17.412275314331055, "gen_logits_min": -29.565444946289062, "gen_logits_std": 3.3254942893981934, "gen_loss": 0.2499016523361206, "grad_norm": 0.3307822025856279, "learning_rate": 2.0906526315789473e-05, "loss": 0.2629, "mean_copy_accuracy": 0.997368261218071, "mean_gen_accuracy": 0.8801488727331161, "mean_token_accuracy": 0.9102816730737686, "num_tokens": 1067908803.0, "sample_num_tokens": 6832.25, "step": 7935, "total_num_tokens": 1067936132.0, "z_loss": 0.00043659083894453943 }, { "copy_logits_max": -3.4129638671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 740.0625, "epoch": 1.6208833290783762, "gen_logits_max": 3.045559883117676, "gen_logits_mean": -17.34166717529297, "gen_logits_min": -29.802261352539062, "gen_logits_std": 3.326375722885132, "gen_loss": 0.2566961348056793, "grad_norm": 0.3731496796073553, "learning_rate": 2.0905263157894737e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9966647028923035, "mean_gen_accuracy": 0.8700539618730545, "mean_token_accuracy": 0.9038868248462677, "num_tokens": 1068173121.0, "sample_num_tokens": 10205.25, "step": 7936, "total_num_tokens": 1068213942.0, "z_loss": 0.0004267423937562853 }, { "copy_logits_max": -3.757674217224121, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.4375, "epoch": 1.621087567015573, "gen_logits_max": 3.027973175048828, "gen_logits_mean": -17.95336151123047, "gen_logits_min": -29.942089080810547, "gen_logits_std": 3.342684745788574, "gen_loss": 0.2626733183860779, "grad_norm": 0.3558449673849344, "learning_rate": 2.0904e-05, "loss": 0.283, "mean_copy_accuracy": 0.9970558136701584, "mean_gen_accuracy": 0.8744083046913147, "mean_token_accuracy": 0.903361588716507, "num_tokens": 1068453055.0, "sample_num_tokens": 7666.75, "step": 7937, "total_num_tokens": 1068483722.0, "z_loss": 0.0004048959235660732 }, { "copy_logits_max": -3.835678815841675, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.75, "epoch": 1.62129180495277, "gen_logits_max": 4.020848751068115, "gen_logits_mean": -15.734682083129883, "gen_logits_min": -28.50177764892578, "gen_logits_std": 3.246683359146118, "gen_loss": 0.28770947456359863, "grad_norm": 0.3508886304779763, "learning_rate": 2.0902736842105266e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9973828494548798, "mean_gen_accuracy": 0.8763404041528702, "mean_token_accuracy": 0.9086838066577911, "num_tokens": 1068728547.0, "sample_num_tokens": 9486.25, "step": 7938, "total_num_tokens": 1068766492.0, "z_loss": 0.00044245293247513473 }, { "copy_logits_max": -2.5491154193878174, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.0625, "epoch": 1.6214960428899667, "gen_logits_max": 4.190376281738281, "gen_logits_mean": -16.871341705322266, "gen_logits_min": -28.800941467285156, "gen_logits_std": 3.292203903198242, "gen_loss": 0.292760968208313, "grad_norm": 0.3770513481320167, "learning_rate": 2.0901473684210527e-05, "loss": 0.302, "mean_copy_accuracy": 0.9971729069948196, "mean_gen_accuracy": 0.870701253414154, "mean_token_accuracy": 0.8985377550125122, "num_tokens": 1068980905.0, "sample_num_tokens": 7045.25, "step": 7939, "total_num_tokens": 1069009086.0, "z_loss": 0.0004310371878091246 }, { "copy_logits_max": -2.623323440551758, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0625, "epoch": 1.6217002808271637, "gen_logits_max": 4.648536205291748, "gen_logits_mean": -16.156970977783203, "gen_logits_min": -27.969982147216797, "gen_logits_std": 3.261953830718994, "gen_loss": 0.2804115414619446, "grad_norm": 0.3771831835902463, "learning_rate": 2.090021052631579e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9966671466827393, "mean_gen_accuracy": 0.8757910281419754, "mean_token_accuracy": 0.9057405889034271, "num_tokens": 1069238581.0, "sample_num_tokens": 8186.25, "step": 7940, "total_num_tokens": 1069271326.0, "z_loss": 0.0004061471263412386 }, { "copy_logits_max": -5.622678756713867, "copy_logits_min": -687500032.0, "copy_num_tokens": 327.9375, "epoch": 1.6219045187643606, "gen_logits_max": 3.226660966873169, "gen_logits_mean": -18.760677337646484, "gen_logits_min": -30.430866241455078, "gen_logits_std": 3.317373752593994, "gen_loss": 0.29712361097335815, "grad_norm": 0.37575140093031967, "learning_rate": 2.0898947368421052e-05, "loss": 0.278, "mean_copy_accuracy": 0.9971394836902618, "mean_gen_accuracy": 0.8772526681423187, "mean_token_accuracy": 0.9065594673156738, "num_tokens": 1069501861.0, "sample_num_tokens": 7946.25, "step": 7941, "total_num_tokens": 1069533646.0, "z_loss": 0.0004443479119800031 }, { "copy_logits_max": -2.3321008682250977, "copy_logits_min": -750000000.0, "copy_num_tokens": 637.4375, "epoch": 1.6221087567015573, "gen_logits_max": 2.236013889312744, "gen_logits_mean": -17.659805297851562, "gen_logits_min": -29.849925994873047, "gen_logits_std": 3.31303071975708, "gen_loss": 0.2788759171962738, "grad_norm": 0.34385127222925466, "learning_rate": 2.0897684210526317e-05, "loss": 0.2539, "mean_copy_accuracy": 0.9973787516355515, "mean_gen_accuracy": 0.8771171569824219, "mean_token_accuracy": 0.9125703275203705, "num_tokens": 1069789192.0, "sample_num_tokens": 9081.5, "step": 7942, "total_num_tokens": 1069825518.0, "z_loss": 0.00044088729191571474 }, { "copy_logits_max": 0.600796639919281, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.5625, "epoch": 1.622312994638754, "gen_logits_max": 4.228862762451172, "gen_logits_mean": -15.881622314453125, "gen_logits_min": -27.729446411132812, "gen_logits_std": 3.285079002380371, "gen_loss": 0.2674121558666229, "grad_norm": 0.37319680846692205, "learning_rate": 2.0896421052631577e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9958910346031189, "mean_gen_accuracy": 0.8763194680213928, "mean_token_accuracy": 0.9028633683919907, "num_tokens": 1070040683.0, "sample_num_tokens": 8898.25, "step": 7943, "total_num_tokens": 1070076276.0, "z_loss": 0.00040999759221449494 }, { "copy_logits_max": -3.0589938163757324, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.6875, "epoch": 1.622517232575951, "gen_logits_max": 3.4864320755004883, "gen_logits_mean": -16.82428741455078, "gen_logits_min": -29.404903411865234, "gen_logits_std": 3.283921241760254, "gen_loss": 0.30506399273872375, "grad_norm": 0.3660914067787032, "learning_rate": 2.0895157894736842e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9955730885267258, "mean_gen_accuracy": 0.8730808645486832, "mean_token_accuracy": 0.9031398445367813, "num_tokens": 1070305579.0, "sample_num_tokens": 9369.75, "step": 7944, "total_num_tokens": 1070343058.0, "z_loss": 0.0005333633162081242 }, { "copy_logits_max": 1.511459231376648, "copy_logits_min": -750000000.0, "copy_num_tokens": 676.5625, "epoch": 1.6227214705131479, "gen_logits_max": 4.560813903808594, "gen_logits_mean": -14.934382438659668, "gen_logits_min": -27.52617835998535, "gen_logits_std": 3.2733054161071777, "gen_loss": 0.2590565085411072, "grad_norm": 0.3906940342220162, "learning_rate": 2.0893894736842106e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9969262927770615, "mean_gen_accuracy": 0.8833461701869965, "mean_token_accuracy": 0.9099190831184387, "num_tokens": 1070562950.0, "sample_num_tokens": 10212.0, "step": 7945, "total_num_tokens": 1070603798.0, "z_loss": 0.00047892224392853677 }, { "copy_logits_max": -3.3314156532287598, "copy_logits_min": -687500032.0, "copy_num_tokens": 466.75, "epoch": 1.6229257084503446, "gen_logits_max": 3.02964448928833, "gen_logits_mean": -17.87710952758789, "gen_logits_min": -29.92942237854004, "gen_logits_std": 3.3653125762939453, "gen_loss": 0.24537710845470428, "grad_norm": 0.36506563710374695, "learning_rate": 2.089263157894737e-05, "loss": 0.288, "mean_copy_accuracy": 0.9975929111242294, "mean_gen_accuracy": 0.8764625191688538, "mean_token_accuracy": 0.9019425362348557, "num_tokens": 1070835368.0, "sample_num_tokens": 8208.5, "step": 7946, "total_num_tokens": 1070868202.0, "z_loss": 0.0004483847296796739 }, { "copy_logits_max": -1.096726894378662, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.8125, "epoch": 1.6231299463875415, "gen_logits_max": 5.070364952087402, "gen_logits_mean": -13.852493286132812, "gen_logits_min": -26.526538848876953, "gen_logits_std": 3.2162094116210938, "gen_loss": 0.30205512046813965, "grad_norm": 0.34874481618178954, "learning_rate": 2.0891368421052635e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9964870363473892, "mean_gen_accuracy": 0.8720909357070923, "mean_token_accuracy": 0.9041583091020584, "num_tokens": 1071099005.0, "sample_num_tokens": 8817.25, "step": 7947, "total_num_tokens": 1071134274.0, "z_loss": 0.00047820014879107475 }, { "copy_logits_max": -0.49475207924842834, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.5625, "epoch": 1.6233341843247384, "gen_logits_max": 2.9737939834594727, "gen_logits_mean": -16.969900131225586, "gen_logits_min": -29.228389739990234, "gen_logits_std": 3.335679292678833, "gen_loss": 0.2744050920009613, "grad_norm": 0.35723308047577446, "learning_rate": 2.0890105263157896e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9970972090959549, "mean_gen_accuracy": 0.8739438503980637, "mean_token_accuracy": 0.9062015116214752, "num_tokens": 1071385470.0, "sample_num_tokens": 9008.0, "step": 7948, "total_num_tokens": 1071421502.0, "z_loss": 0.0004963210085406899 }, { "copy_logits_max": -0.8267040848731995, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.875, "epoch": 1.6235384222619351, "gen_logits_max": 5.3371429443359375, "gen_logits_mean": -14.466809272766113, "gen_logits_min": -26.9383487701416, "gen_logits_std": 3.233454465866089, "gen_loss": 0.28757765889167786, "grad_norm": 0.373455601082107, "learning_rate": 2.088884210526316e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9963818341493607, "mean_gen_accuracy": 0.8823983520269394, "mean_token_accuracy": 0.9081705808639526, "num_tokens": 1071639960.0, "sample_num_tokens": 8163.5, "step": 7949, "total_num_tokens": 1071672614.0, "z_loss": 0.0005343754310160875 }, { "copy_logits_max": -2.436274290084839, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.5625, "epoch": 1.6237426601991318, "gen_logits_max": 4.3455891609191895, "gen_logits_mean": -15.784499168395996, "gen_logits_min": -27.7907772064209, "gen_logits_std": 3.275942802429199, "gen_loss": 0.27346423268318176, "grad_norm": 0.3586018864908367, "learning_rate": 2.088757894736842e-05, "loss": 0.2754, "mean_copy_accuracy": 0.996706560254097, "mean_gen_accuracy": 0.8817391693592072, "mean_token_accuracy": 0.908247321844101, "num_tokens": 1071916212.0, "sample_num_tokens": 9093.5, "step": 7950, "total_num_tokens": 1071952586.0, "z_loss": 0.00046835088869556785 }, { "copy_logits_max": -2.5590386390686035, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.9375, "epoch": 1.623946898136329, "gen_logits_max": 3.899974822998047, "gen_logits_mean": -16.769115447998047, "gen_logits_min": -28.486169815063477, "gen_logits_std": 3.2531228065490723, "gen_loss": 0.3382246196269989, "grad_norm": 0.32425712513440386, "learning_rate": 2.0886315789473685e-05, "loss": 0.2898, "mean_copy_accuracy": 0.998378112912178, "mean_gen_accuracy": 0.8720548748970032, "mean_token_accuracy": 0.9046416580677032, "num_tokens": 1072215786.0, "sample_num_tokens": 8797.0, "step": 7951, "total_num_tokens": 1072250974.0, "z_loss": 0.0005874894559383392 }, { "copy_logits_max": -1.6528931856155396, "copy_logits_min": -750000064.0, "copy_num_tokens": 391.75, "epoch": 1.6241511360735257, "gen_logits_max": 4.756311416625977, "gen_logits_mean": -15.930079460144043, "gen_logits_min": -27.531768798828125, "gen_logits_std": 3.244356632232666, "gen_loss": 0.2900885045528412, "grad_norm": 0.35676412554887277, "learning_rate": 2.0885052631578946e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9970822334289551, "mean_gen_accuracy": 0.881334125995636, "mean_token_accuracy": 0.9081474840641022, "num_tokens": 1072513979.0, "sample_num_tokens": 8718.25, "step": 7952, "total_num_tokens": 1072548852.0, "z_loss": 0.0004666093736886978 }, { "copy_logits_max": -0.8494446873664856, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.625, "epoch": 1.6243553740107224, "gen_logits_max": 4.740959167480469, "gen_logits_mean": -15.804593086242676, "gen_logits_min": -28.251789093017578, "gen_logits_std": 3.2663588523864746, "gen_loss": 0.3195687532424927, "grad_norm": 0.3788209343287403, "learning_rate": 2.088378947368421e-05, "loss": 0.2841, "mean_copy_accuracy": 0.996455043554306, "mean_gen_accuracy": 0.8769977986812592, "mean_token_accuracy": 0.9038843661546707, "num_tokens": 1072783343.0, "sample_num_tokens": 7053.75, "step": 7953, "total_num_tokens": 1072811558.0, "z_loss": 0.000549488584510982 }, { "copy_logits_max": -1.9278980493545532, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.0, "epoch": 1.6245596119479193, "gen_logits_max": 3.179837226867676, "gen_logits_mean": -17.561002731323242, "gen_logits_min": -29.498310089111328, "gen_logits_std": 3.3421101570129395, "gen_loss": 0.2550683617591858, "grad_norm": 0.3764294832921576, "learning_rate": 2.088252631578947e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9968164712190628, "mean_gen_accuracy": 0.873886227607727, "mean_token_accuracy": 0.902625635266304, "num_tokens": 1073030223.0, "sample_num_tokens": 7863.25, "step": 7954, "total_num_tokens": 1073061676.0, "z_loss": 0.00042632175609469414 }, { "copy_logits_max": 0.4441944360733032, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.5625, "epoch": 1.6247638498851162, "gen_logits_max": 4.426028251647949, "gen_logits_mean": -15.10943603515625, "gen_logits_min": -27.038379669189453, "gen_logits_std": 3.2097296714782715, "gen_loss": 0.258161723613739, "grad_norm": 0.35580879193330317, "learning_rate": 2.088126315789474e-05, "loss": 0.263, "mean_copy_accuracy": 0.9965340942144394, "mean_gen_accuracy": 0.8853423893451691, "mean_token_accuracy": 0.9111196994781494, "num_tokens": 1073283507.0, "sample_num_tokens": 8265.25, "step": 7955, "total_num_tokens": 1073316568.0, "z_loss": 0.0004131686291657388 }, { "copy_logits_max": -0.5938490629196167, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.1875, "epoch": 1.624968087822313, "gen_logits_max": 4.066233158111572, "gen_logits_mean": -15.700403213500977, "gen_logits_min": -28.02346420288086, "gen_logits_std": 3.296360492706299, "gen_loss": 0.24971990287303925, "grad_norm": 0.3629554743876212, "learning_rate": 2.088e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9966874271631241, "mean_gen_accuracy": 0.8727952837944031, "mean_token_accuracy": 0.9037607312202454, "num_tokens": 1073545142.0, "sample_num_tokens": 8629.5, "step": 7956, "total_num_tokens": 1073579660.0, "z_loss": 0.00041585482540540397 }, { "copy_logits_max": -4.578411102294922, "copy_logits_min": -750000128.0, "copy_num_tokens": 482.0625, "epoch": 1.6251723257595099, "gen_logits_max": 3.5132203102111816, "gen_logits_mean": -16.573509216308594, "gen_logits_min": -28.9674072265625, "gen_logits_std": 3.3253848552703857, "gen_loss": 0.2790464162826538, "grad_norm": 0.3904222250062846, "learning_rate": 2.0878736842105264e-05, "loss": 0.3017, "mean_copy_accuracy": 0.9963802248239517, "mean_gen_accuracy": 0.8707141727209091, "mean_token_accuracy": 0.8993688076734543, "num_tokens": 1073794354.0, "sample_num_tokens": 8250.0, "step": 7957, "total_num_tokens": 1073827354.0, "z_loss": 0.0004687837208621204 }, { "copy_logits_max": -0.1899707317352295, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.375, "epoch": 1.6253765636967068, "gen_logits_max": 3.8440842628479004, "gen_logits_mean": -16.01919937133789, "gen_logits_min": -28.24762725830078, "gen_logits_std": 3.277803421020508, "gen_loss": 0.24919097125530243, "grad_norm": 0.3643301667741284, "learning_rate": 2.087747368421053e-05, "loss": 0.2601, "mean_copy_accuracy": 0.9969924688339233, "mean_gen_accuracy": 0.8830075860023499, "mean_token_accuracy": 0.9100588858127594, "num_tokens": 1074048883.0, "sample_num_tokens": 9544.25, "step": 7958, "total_num_tokens": 1074087060.0, "z_loss": 0.00047586593427695334 }, { "copy_logits_max": -1.893039345741272, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.9375, "epoch": 1.6255808016339035, "gen_logits_max": 4.225632667541504, "gen_logits_mean": -16.82273292541504, "gen_logits_min": -28.61174201965332, "gen_logits_std": 3.2563529014587402, "gen_loss": 0.2797028720378876, "grad_norm": 0.3260496823775372, "learning_rate": 2.087621052631579e-05, "loss": 0.2584, "mean_copy_accuracy": 0.9966006726026535, "mean_gen_accuracy": 0.8827288448810577, "mean_token_accuracy": 0.9120080471038818, "num_tokens": 1074338184.0, "sample_num_tokens": 7144.0, "step": 7959, "total_num_tokens": 1074366760.0, "z_loss": 0.0005103695439174771 }, { "copy_logits_max": -0.8646299839019775, "copy_logits_min": -687500032.0, "copy_num_tokens": 737.625, "epoch": 1.6257850395711002, "gen_logits_max": 2.8504858016967773, "gen_logits_mean": -17.86935043334961, "gen_logits_min": -29.78778648376465, "gen_logits_std": 3.337381362915039, "gen_loss": 0.2858373820781708, "grad_norm": 0.3671400084644726, "learning_rate": 2.0874947368421054e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9971380978822708, "mean_gen_accuracy": 0.8751049786806107, "mean_token_accuracy": 0.9077488034963608, "num_tokens": 1074607432.0, "sample_num_tokens": 11090.0, "step": 7960, "total_num_tokens": 1074651792.0, "z_loss": 0.0004948112182319164 }, { "copy_logits_max": -0.25334858894348145, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.6875, "epoch": 1.6259892775082971, "gen_logits_max": 3.536921501159668, "gen_logits_mean": -17.06606101989746, "gen_logits_min": -28.85773468017578, "gen_logits_std": 3.302776575088501, "gen_loss": 0.2560877799987793, "grad_norm": 0.37890417918093616, "learning_rate": 2.0873684210526315e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9976591169834137, "mean_gen_accuracy": 0.87364362180233, "mean_token_accuracy": 0.9065369963645935, "num_tokens": 1074894791.0, "sample_num_tokens": 9014.75, "step": 7961, "total_num_tokens": 1074930850.0, "z_loss": 0.00046635602484457195 }, { "copy_logits_max": 0.6642252206802368, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.9375, "epoch": 1.626193515445494, "gen_logits_max": 4.2167439460754395, "gen_logits_mean": -15.546014785766602, "gen_logits_min": -27.68686294555664, "gen_logits_std": 3.2712299823760986, "gen_loss": 0.2522526979446411, "grad_norm": 0.36626258456321065, "learning_rate": 2.087242105263158e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9975599050521851, "mean_gen_accuracy": 0.871594250202179, "mean_token_accuracy": 0.901726171374321, "num_tokens": 1075167016.0, "sample_num_tokens": 8741.0, "step": 7962, "total_num_tokens": 1075201980.0, "z_loss": 0.00041803662315942347 }, { "copy_logits_max": -0.6421777009963989, "copy_logits_min": -687500032.0, "copy_num_tokens": 378.625, "epoch": 1.6263977533826908, "gen_logits_max": 4.437442779541016, "gen_logits_mean": -16.12139129638672, "gen_logits_min": -28.242107391357422, "gen_logits_std": 3.2808756828308105, "gen_loss": 0.3045000433921814, "grad_norm": 0.3721597951225719, "learning_rate": 2.0871157894736843e-05, "loss": 0.2939, "mean_copy_accuracy": 0.9958628118038177, "mean_gen_accuracy": 0.8706532120704651, "mean_token_accuracy": 0.8997126221656799, "num_tokens": 1075419797.0, "sample_num_tokens": 7394.75, "step": 7963, "total_num_tokens": 1075449376.0, "z_loss": 0.0005065407021902502 }, { "copy_logits_max": -2.858231544494629, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.0625, "epoch": 1.6266019913198877, "gen_logits_max": 3.387889862060547, "gen_logits_mean": -16.634021759033203, "gen_logits_min": -28.983800888061523, "gen_logits_std": 3.3144421577453613, "gen_loss": 0.27266424894332886, "grad_norm": 0.3818974553380615, "learning_rate": 2.0869894736842108e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9964743107557297, "mean_gen_accuracy": 0.8783939629793167, "mean_token_accuracy": 0.905116081237793, "num_tokens": 1075675349.0, "sample_num_tokens": 8442.75, "step": 7964, "total_num_tokens": 1075709120.0, "z_loss": 0.0004910033894702792 }, { "copy_logits_max": -2.366762638092041, "copy_logits_min": -687500032.0, "copy_num_tokens": 600.8125, "epoch": 1.6268062292570846, "gen_logits_max": 1.956287145614624, "gen_logits_mean": -18.695114135742188, "gen_logits_min": -30.956954956054688, "gen_logits_std": 3.40010404586792, "gen_loss": 0.23718243837356567, "grad_norm": 0.3765486504458126, "learning_rate": 2.086863157894737e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9972302466630936, "mean_gen_accuracy": 0.8670104891061783, "mean_token_accuracy": 0.9052948504686356, "num_tokens": 1075959762.0, "sample_num_tokens": 9097.0, "step": 7965, "total_num_tokens": 1075996150.0, "z_loss": 0.0004248619079589844 }, { "copy_logits_max": -1.6017963886260986, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.0, "epoch": 1.6270104671942813, "gen_logits_max": 3.5656862258911133, "gen_logits_mean": -16.264204025268555, "gen_logits_min": -28.452545166015625, "gen_logits_std": 3.3314340114593506, "gen_loss": 0.2643095850944519, "grad_norm": 0.35789372718782253, "learning_rate": 2.0867368421052633e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9965064227581024, "mean_gen_accuracy": 0.879870593547821, "mean_token_accuracy": 0.9075288027524948, "num_tokens": 1076227319.0, "sample_num_tokens": 7028.75, "step": 7966, "total_num_tokens": 1076255434.0, "z_loss": 0.0004463529330678284 }, { "copy_logits_max": -2.00985050201416, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.375, "epoch": 1.627214705131478, "gen_logits_max": 3.671321153640747, "gen_logits_mean": -16.864215850830078, "gen_logits_min": -29.12757682800293, "gen_logits_std": 3.305438995361328, "gen_loss": 0.2824956476688385, "grad_norm": 0.41824784599153286, "learning_rate": 2.0866105263157894e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9954937249422073, "mean_gen_accuracy": 0.8781542778015137, "mean_token_accuracy": 0.9029414057731628, "num_tokens": 1076483432.0, "sample_num_tokens": 9769.5, "step": 7967, "total_num_tokens": 1076522510.0, "z_loss": 0.0004449287662282586 }, { "copy_logits_max": -4.407524108886719, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.625, "epoch": 1.627418943068675, "gen_logits_max": 3.1759214401245117, "gen_logits_mean": -17.590179443359375, "gen_logits_min": -29.644807815551758, "gen_logits_std": 3.3116767406463623, "gen_loss": 0.25808238983154297, "grad_norm": 0.3484206181242869, "learning_rate": 2.0864842105263158e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9955274909734726, "mean_gen_accuracy": 0.8806783109903336, "mean_token_accuracy": 0.9086687117815018, "num_tokens": 1076749576.0, "sample_num_tokens": 7946.5, "step": 7968, "total_num_tokens": 1076781362.0, "z_loss": 0.00036213145358487964 }, { "copy_logits_max": -1.3271100521087646, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.5, "epoch": 1.6276231810058719, "gen_logits_max": 2.9493165016174316, "gen_logits_mean": -17.014469146728516, "gen_logits_min": -29.342723846435547, "gen_logits_std": 3.3200137615203857, "gen_loss": 0.275479793548584, "grad_norm": 0.3670581319409181, "learning_rate": 2.086357894736842e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9960403293371201, "mean_gen_accuracy": 0.8800535798072815, "mean_token_accuracy": 0.9075699597597122, "num_tokens": 1077018787.0, "sample_num_tokens": 9191.75, "step": 7969, "total_num_tokens": 1077055554.0, "z_loss": 0.000388260290492326 }, { "copy_logits_max": -3.2660813331604004, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.75, "epoch": 1.6278274189430686, "gen_logits_max": 3.069397449493408, "gen_logits_mean": -17.261859893798828, "gen_logits_min": -29.21537208557129, "gen_logits_std": 3.328616142272949, "gen_loss": 0.28557488322257996, "grad_norm": 0.3995615323556416, "learning_rate": 2.0862315789473683e-05, "loss": 0.29, "mean_copy_accuracy": 0.9957079738378525, "mean_gen_accuracy": 0.8698406219482422, "mean_token_accuracy": 0.9007568806409836, "num_tokens": 1077275474.0, "sample_num_tokens": 8949.5, "step": 7970, "total_num_tokens": 1077311272.0, "z_loss": 0.00039201395702548325 }, { "copy_logits_max": -2.1248528957366943, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.125, "epoch": 1.6280316568802655, "gen_logits_max": 2.4266715049743652, "gen_logits_mean": -18.18354606628418, "gen_logits_min": -30.26970863342285, "gen_logits_std": 3.3732006549835205, "gen_loss": 0.27384862303733826, "grad_norm": 0.3738082405471843, "learning_rate": 2.0861052631578948e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9961907416582108, "mean_gen_accuracy": 0.8798175156116486, "mean_token_accuracy": 0.9080613702535629, "num_tokens": 1077554407.0, "sample_num_tokens": 8722.75, "step": 7971, "total_num_tokens": 1077589298.0, "z_loss": 0.00041408967808820307 }, { "copy_logits_max": -3.4743590354919434, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.625, "epoch": 1.6282358948174624, "gen_logits_max": 3.0111541748046875, "gen_logits_mean": -18.13518524169922, "gen_logits_min": -30.313018798828125, "gen_logits_std": 3.383899688720703, "gen_loss": 0.2819781005382538, "grad_norm": 0.32893055173323194, "learning_rate": 2.0859789473684212e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9968259483575821, "mean_gen_accuracy": 0.8776424080133438, "mean_token_accuracy": 0.9116575568914413, "num_tokens": 1077840933.0, "sample_num_tokens": 8711.25, "step": 7972, "total_num_tokens": 1077875778.0, "z_loss": 0.0004454191366676241 }, { "copy_logits_max": -3.687751054763794, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.3125, "epoch": 1.6284401327546592, "gen_logits_max": 3.836583375930786, "gen_logits_mean": -16.670307159423828, "gen_logits_min": -28.865005493164062, "gen_logits_std": 3.3239047527313232, "gen_loss": 0.240957573056221, "grad_norm": 0.39121816399843545, "learning_rate": 2.0858526315789476e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9961375445127487, "mean_gen_accuracy": 0.8754754662513733, "mean_token_accuracy": 0.9036271572113037, "num_tokens": 1078097599.0, "sample_num_tokens": 8448.75, "step": 7973, "total_num_tokens": 1078131394.0, "z_loss": 0.0003902848402503878 }, { "copy_logits_max": -2.2223525047302246, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.875, "epoch": 1.6286443706918559, "gen_logits_max": 4.392169952392578, "gen_logits_mean": -15.613037109375, "gen_logits_min": -27.800642013549805, "gen_logits_std": 3.2694406509399414, "gen_loss": 0.27664607763290405, "grad_norm": 0.37595718888050594, "learning_rate": 2.0857263157894737e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9971138834953308, "mean_gen_accuracy": 0.8736472129821777, "mean_token_accuracy": 0.9042424112558365, "num_tokens": 1078351707.0, "sample_num_tokens": 7642.75, "step": 7974, "total_num_tokens": 1078382278.0, "z_loss": 0.0004573388141579926 }, { "copy_logits_max": -3.9167468547821045, "copy_logits_min": -687500032.0, "copy_num_tokens": 462.5625, "epoch": 1.628848608629053, "gen_logits_max": 4.740145683288574, "gen_logits_mean": -16.07152557373047, "gen_logits_min": -28.844532012939453, "gen_logits_std": 3.3065195083618164, "gen_loss": 0.2973235249519348, "grad_norm": 0.4053863412948982, "learning_rate": 2.0856e-05, "loss": 0.299, "mean_copy_accuracy": 0.9961506873369217, "mean_gen_accuracy": 0.870435357093811, "mean_token_accuracy": 0.8984895348548889, "num_tokens": 1078597485.0, "sample_num_tokens": 7906.25, "step": 7975, "total_num_tokens": 1078629110.0, "z_loss": 0.0005161271546967328 }, { "copy_logits_max": -4.077312469482422, "copy_logits_min": -687500032.0, "copy_num_tokens": 345.875, "epoch": 1.6290528465662497, "gen_logits_max": 4.965053558349609, "gen_logits_mean": -16.40757942199707, "gen_logits_min": -28.643592834472656, "gen_logits_std": 3.314727306365967, "gen_loss": 0.3021116256713867, "grad_norm": 0.3563911431846493, "learning_rate": 2.0854736842105262e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9958315342664719, "mean_gen_accuracy": 0.8802541196346283, "mean_token_accuracy": 0.9070032685995102, "num_tokens": 1078872031.0, "sample_num_tokens": 8115.75, "step": 7976, "total_num_tokens": 1078904494.0, "z_loss": 0.0004530142468865961 }, { "copy_logits_max": -5.536452770233154, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.4375, "epoch": 1.6292570845034464, "gen_logits_max": 5.825946807861328, "gen_logits_mean": -14.38159465789795, "gen_logits_min": -26.70857048034668, "gen_logits_std": 3.206206798553467, "gen_loss": 0.2517739236354828, "grad_norm": 0.36297510229363095, "learning_rate": 2.0853473684210527e-05, "loss": 0.2566, "mean_copy_accuracy": 0.9971429109573364, "mean_gen_accuracy": 0.8852662444114685, "mean_token_accuracy": 0.9132835268974304, "num_tokens": 1079143143.0, "sample_num_tokens": 7020.75, "step": 7977, "total_num_tokens": 1079171226.0, "z_loss": 0.0003869821666739881 }, { "copy_logits_max": -2.74204158782959, "copy_logits_min": -625000064.0, "copy_num_tokens": 467.9375, "epoch": 1.6294613224406433, "gen_logits_max": 4.418093681335449, "gen_logits_mean": -15.851654052734375, "gen_logits_min": -28.659833908081055, "gen_logits_std": 3.3122873306274414, "gen_loss": 0.28328126668930054, "grad_norm": 0.3638875534427242, "learning_rate": 2.0852210526315788e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9962496012449265, "mean_gen_accuracy": 0.8739487826824188, "mean_token_accuracy": 0.90354123711586, "num_tokens": 1079414290.0, "sample_num_tokens": 8260.5, "step": 7978, "total_num_tokens": 1079447332.0, "z_loss": 0.0005089156329631805 }, { "copy_logits_max": -5.7726593017578125, "copy_logits_min": -750000064.0, "copy_num_tokens": 445.25, "epoch": 1.6296655603778403, "gen_logits_max": 4.953797340393066, "gen_logits_mean": -16.059036254882812, "gen_logits_min": -28.551471710205078, "gen_logits_std": 3.3141732215881348, "gen_loss": 0.23699888586997986, "grad_norm": 0.36768028902899824, "learning_rate": 2.0850947368421055e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9962558746337891, "mean_gen_accuracy": 0.8798237591981888, "mean_token_accuracy": 0.9087274074554443, "num_tokens": 1079691238.0, "sample_num_tokens": 8547.5, "step": 7979, "total_num_tokens": 1079725428.0, "z_loss": 0.0003883293247781694 }, { "copy_logits_max": -3.865490198135376, "copy_logits_min": -687500096.0, "copy_num_tokens": 783.0, "epoch": 1.629869798315037, "gen_logits_max": 3.4437174797058105, "gen_logits_mean": -16.729408264160156, "gen_logits_min": -29.377288818359375, "gen_logits_std": 3.349848747253418, "gen_loss": 0.2597006559371948, "grad_norm": 0.3859803756468468, "learning_rate": 2.0849684210526316e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9968954622745514, "mean_gen_accuracy": 0.8704620599746704, "mean_token_accuracy": 0.9079288989305496, "num_tokens": 1079950603.0, "sample_num_tokens": 10759.75, "step": 7980, "total_num_tokens": 1079993642.0, "z_loss": 0.00043860834557563066 }, { "copy_logits_max": -5.148004055023193, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.5, "epoch": 1.630074036252234, "gen_logits_max": 5.754970550537109, "gen_logits_mean": -13.819172859191895, "gen_logits_min": -26.298564910888672, "gen_logits_std": 3.2285304069519043, "gen_loss": 0.28989702463150024, "grad_norm": 0.34242465789980486, "learning_rate": 2.084842105263158e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9966291189193726, "mean_gen_accuracy": 0.8833858668804169, "mean_token_accuracy": 0.9091273993253708, "num_tokens": 1080219517.0, "sample_num_tokens": 8489.25, "step": 7981, "total_num_tokens": 1080253474.0, "z_loss": 0.0004997070645913482 }, { "copy_logits_max": -3.8592731952667236, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.3125, "epoch": 1.6302782741894308, "gen_logits_max": 5.519713401794434, "gen_logits_mean": -14.994733810424805, "gen_logits_min": -27.344772338867188, "gen_logits_std": 3.244229316711426, "gen_loss": 0.2937483489513397, "grad_norm": 0.3535120953762211, "learning_rate": 2.084715789473684e-05, "loss": 0.2651, "mean_copy_accuracy": 0.9955331981182098, "mean_gen_accuracy": 0.8858431726694107, "mean_token_accuracy": 0.9104000478982925, "num_tokens": 1080497451.0, "sample_num_tokens": 7632.25, "step": 7982, "total_num_tokens": 1080527980.0, "z_loss": 0.0004916326142847538 }, { "copy_logits_max": -3.9098944664001465, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.25, "epoch": 1.6304825121266275, "gen_logits_max": 4.538110256195068, "gen_logits_mean": -17.21030044555664, "gen_logits_min": -29.457956314086914, "gen_logits_std": 3.3134536743164062, "gen_loss": 0.30425897240638733, "grad_norm": 0.3445970043551882, "learning_rate": 2.0845894736842106e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9964751899242401, "mean_gen_accuracy": 0.8753330707550049, "mean_token_accuracy": 0.9068881571292877, "num_tokens": 1080753298.0, "sample_num_tokens": 7377.0, "step": 7983, "total_num_tokens": 1080782806.0, "z_loss": 0.0004884954541921616 }, { "copy_logits_max": -1.6373794078826904, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.0625, "epoch": 1.6306867500638242, "gen_logits_max": 4.638725280761719, "gen_logits_mean": -16.126008987426758, "gen_logits_min": -28.751264572143555, "gen_logits_std": 3.3219094276428223, "gen_loss": 0.2605525851249695, "grad_norm": 0.3621315533821866, "learning_rate": 2.0844631578947367e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9963869601488113, "mean_gen_accuracy": 0.8783509582281113, "mean_token_accuracy": 0.9085662215948105, "num_tokens": 1081012950.0, "sample_num_tokens": 7534.5, "step": 7984, "total_num_tokens": 1081043088.0, "z_loss": 0.00046693888725712895 }, { "copy_logits_max": 0.8450058698654175, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.5625, "epoch": 1.6308909880010212, "gen_logits_max": 3.744107961654663, "gen_logits_mean": -16.176002502441406, "gen_logits_min": -28.85101318359375, "gen_logits_std": 3.3172998428344727, "gen_loss": 0.22799061238765717, "grad_norm": 0.35018147722313214, "learning_rate": 2.084336842105263e-05, "loss": 0.268, "mean_copy_accuracy": 0.9973550289869308, "mean_gen_accuracy": 0.8804055899381638, "mean_token_accuracy": 0.9094529151916504, "num_tokens": 1081299590.0, "sample_num_tokens": 9545.5, "step": 7985, "total_num_tokens": 1081337772.0, "z_loss": 0.00044925170368514955 }, { "copy_logits_max": -0.8964452147483826, "copy_logits_min": -687500096.0, "copy_num_tokens": 488.125, "epoch": 1.631095225938218, "gen_logits_max": 4.849029064178467, "gen_logits_mean": -14.952787399291992, "gen_logits_min": -27.263858795166016, "gen_logits_std": 3.213650941848755, "gen_loss": 0.31601423025131226, "grad_norm": 0.33569320837790345, "learning_rate": 2.0842105263157895e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9969504624605179, "mean_gen_accuracy": 0.8751636445522308, "mean_token_accuracy": 0.9038870632648468, "num_tokens": 1081580496.0, "sample_num_tokens": 8711.5, "step": 7986, "total_num_tokens": 1081615342.0, "z_loss": 0.0006070113740861416 }, { "copy_logits_max": -2.169829845428467, "copy_logits_min": -750000000.0, "copy_num_tokens": 658.125, "epoch": 1.6312994638754148, "gen_logits_max": 5.329290390014648, "gen_logits_mean": -14.592449188232422, "gen_logits_min": -27.110239028930664, "gen_logits_std": 3.2780327796936035, "gen_loss": 0.2563302218914032, "grad_norm": 0.37202797209860344, "learning_rate": 2.084084210526316e-05, "loss": 0.2746, "mean_copy_accuracy": 0.9959260523319244, "mean_gen_accuracy": 0.8782885819673538, "mean_token_accuracy": 0.9071786254644394, "num_tokens": 1081865234.0, "sample_num_tokens": 9540.0, "step": 7987, "total_num_tokens": 1081903394.0, "z_loss": 0.00040145538514479995 }, { "copy_logits_max": -4.466498851776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.25, "epoch": 1.6315037018126117, "gen_logits_max": 4.452286243438721, "gen_logits_mean": -17.114025115966797, "gen_logits_min": -29.09958267211914, "gen_logits_std": 3.292171001434326, "gen_loss": 0.2812744379043579, "grad_norm": 0.32901113435639784, "learning_rate": 2.0839578947368424e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9959355890750885, "mean_gen_accuracy": 0.8778405785560608, "mean_token_accuracy": 0.9029859006404877, "num_tokens": 1082143303.0, "sample_num_tokens": 7315.25, "step": 7988, "total_num_tokens": 1082172564.0, "z_loss": 0.0004723727179225534 }, { "copy_logits_max": -1.420663595199585, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.375, "epoch": 1.6317079397498087, "gen_logits_max": 5.18028450012207, "gen_logits_mean": -15.896032333374023, "gen_logits_min": -28.588058471679688, "gen_logits_std": 3.2834367752075195, "gen_loss": 0.3048055171966553, "grad_norm": 0.372851325534498, "learning_rate": 2.0838315789473685e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9953117370605469, "mean_gen_accuracy": 0.8831754922866821, "mean_token_accuracy": 0.9082762151956558, "num_tokens": 1082419442.0, "sample_num_tokens": 8695.5, "step": 7989, "total_num_tokens": 1082454224.0, "z_loss": 0.0004893423756584525 }, { "copy_logits_max": -2.3301191329956055, "copy_logits_min": -625000064.0, "copy_num_tokens": 703.8125, "epoch": 1.6319121776870054, "gen_logits_max": 3.1955912113189697, "gen_logits_mean": -16.18526840209961, "gen_logits_min": -28.886043548583984, "gen_logits_std": 3.3242745399475098, "gen_loss": 0.2609216868877411, "grad_norm": 0.3225488297987221, "learning_rate": 2.083705263157895e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9976691156625748, "mean_gen_accuracy": 0.8788557201623917, "mean_token_accuracy": 0.9133416414260864, "num_tokens": 1082731066.0, "sample_num_tokens": 9072.0, "step": 7990, "total_num_tokens": 1082767354.0, "z_loss": 0.00044867972610518336 }, { "copy_logits_max": -2.704164505004883, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.1875, "epoch": 1.632116415624202, "gen_logits_max": 3.5503368377685547, "gen_logits_mean": -16.98526382446289, "gen_logits_min": -29.55935287475586, "gen_logits_std": 3.3596138954162598, "gen_loss": 0.24232995510101318, "grad_norm": 0.36105067631285864, "learning_rate": 2.083578947368421e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9972274601459503, "mean_gen_accuracy": 0.8809787184000015, "mean_token_accuracy": 0.9099355190992355, "num_tokens": 1082997849.0, "sample_num_tokens": 9288.25, "step": 7991, "total_num_tokens": 1083035002.0, "z_loss": 0.00041932950261980295 }, { "copy_logits_max": -2.0741217136383057, "copy_logits_min": -687500032.0, "copy_num_tokens": 594.4375, "epoch": 1.632320653561399, "gen_logits_max": 2.695359230041504, "gen_logits_mean": -17.578842163085938, "gen_logits_min": -30.609981536865234, "gen_logits_std": 3.3850150108337402, "gen_loss": 0.24323618412017822, "grad_norm": 0.3365270998372922, "learning_rate": 2.0834526315789475e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9964323937892914, "mean_gen_accuracy": 0.8768799155950546, "mean_token_accuracy": 0.9058203250169754, "num_tokens": 1083266696.0, "sample_num_tokens": 9158.0, "step": 7992, "total_num_tokens": 1083303328.0, "z_loss": 0.0003833998925983906 }, { "copy_logits_max": -4.211538314819336, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.3125, "epoch": 1.632524891498596, "gen_logits_max": 2.690464496612549, "gen_logits_mean": -18.722749710083008, "gen_logits_min": -31.044021606445312, "gen_logits_std": 3.403787612915039, "gen_loss": 0.2610841989517212, "grad_norm": 0.319090446968671, "learning_rate": 2.0833263157894735e-05, "loss": 0.2463, "mean_copy_accuracy": 0.9972540438175201, "mean_gen_accuracy": 0.8915531039237976, "mean_token_accuracy": 0.9165162295103073, "num_tokens": 1083542096.0, "sample_num_tokens": 7675.0, "step": 7993, "total_num_tokens": 1083572796.0, "z_loss": 0.0004026526876259595 }, { "copy_logits_max": -4.420691967010498, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.25, "epoch": 1.6327291294357926, "gen_logits_max": 3.2620797157287598, "gen_logits_mean": -16.89926528930664, "gen_logits_min": -29.6275634765625, "gen_logits_std": 3.3434500694274902, "gen_loss": 0.2605131268501282, "grad_norm": 0.3667577182952859, "learning_rate": 2.0832e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9964725375175476, "mean_gen_accuracy": 0.8731991499662399, "mean_token_accuracy": 0.9050918072462082, "num_tokens": 1083799183.0, "sample_num_tokens": 9562.25, "step": 7994, "total_num_tokens": 1083837432.0, "z_loss": 0.0004033391014672816 }, { "copy_logits_max": -3.4833872318267822, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.125, "epoch": 1.6329333673729896, "gen_logits_max": 3.588052272796631, "gen_logits_mean": -16.273427963256836, "gen_logits_min": -29.2534122467041, "gen_logits_std": 3.312905788421631, "gen_loss": 0.25803613662719727, "grad_norm": 0.35447628879966336, "learning_rate": 2.0830736842105264e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9965947568416595, "mean_gen_accuracy": 0.8819159418344498, "mean_token_accuracy": 0.9098463207483292, "num_tokens": 1084079228.0, "sample_num_tokens": 9292.5, "step": 7995, "total_num_tokens": 1084116398.0, "z_loss": 0.0003907052450813353 }, { "copy_logits_max": -5.815823078155518, "copy_logits_min": -687500032.0, "copy_num_tokens": 257.8125, "epoch": 1.6331376053101865, "gen_logits_max": 3.12003755569458, "gen_logits_mean": -19.410755157470703, "gen_logits_min": -31.544883728027344, "gen_logits_std": 3.3779640197753906, "gen_loss": 0.30696791410446167, "grad_norm": 0.35278039282544627, "learning_rate": 2.082947368421053e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9972621202468872, "mean_gen_accuracy": 0.8805644363164902, "mean_token_accuracy": 0.9044878333806992, "num_tokens": 1084351904.0, "sample_num_tokens": 6216.5, "step": 7996, "total_num_tokens": 1084376770.0, "z_loss": 0.0004501563962548971 }, { "copy_logits_max": -1.8652241230010986, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.1875, "epoch": 1.6333418432473832, "gen_logits_max": 4.0184125900268555, "gen_logits_mean": -15.992169380187988, "gen_logits_min": -28.86543083190918, "gen_logits_std": 3.301546335220337, "gen_loss": 0.2887817621231079, "grad_norm": 0.37892669743872825, "learning_rate": 2.082821052631579e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9964715391397476, "mean_gen_accuracy": 0.8744456321001053, "mean_token_accuracy": 0.9072316586971283, "num_tokens": 1084608697.0, "sample_num_tokens": 8646.75, "step": 7997, "total_num_tokens": 1084643284.0, "z_loss": 0.00047859150799922645 }, { "copy_logits_max": -3.2775914669036865, "copy_logits_min": -625000064.0, "copy_num_tokens": 549.0625, "epoch": 1.63354608118458, "gen_logits_max": 3.6490299701690674, "gen_logits_mean": -17.006160736083984, "gen_logits_min": -29.882221221923828, "gen_logits_std": 3.3344318866729736, "gen_loss": 0.24086876213550568, "grad_norm": 0.3656438832115054, "learning_rate": 2.0826947368421054e-05, "loss": 0.2825, "mean_copy_accuracy": 0.99748095870018, "mean_gen_accuracy": 0.8757355064153671, "mean_token_accuracy": 0.9043465703725815, "num_tokens": 1084870230.0, "sample_num_tokens": 9767.0, "step": 7998, "total_num_tokens": 1084909298.0, "z_loss": 0.0004100981750525534 }, { "copy_logits_max": -4.645265579223633, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.8125, "epoch": 1.6337503191217768, "gen_logits_max": 3.7162280082702637, "gen_logits_mean": -16.34111976623535, "gen_logits_min": -28.534873962402344, "gen_logits_std": 3.2709574699401855, "gen_loss": 0.2713131308555603, "grad_norm": 0.33335159335233927, "learning_rate": 2.0825684210526318e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9972971528768539, "mean_gen_accuracy": 0.8776674270629883, "mean_token_accuracy": 0.9118958562612534, "num_tokens": 1085145868.0, "sample_num_tokens": 8730.0, "step": 7999, "total_num_tokens": 1085180788.0, "z_loss": 0.00043302110861986876 }, { "epoch": 1.6339545570589737, "grad_norm": 0.3752694037781568, "learning_rate": 2.082442105263158e-05, "loss": 0.2893, "step": 8000 }, { "epoch": 1.6339545570589737, "eval_copy_logits_max": -8.583415985107422, "eval_copy_logits_min": -86.23593139648438, "eval_gen_logits_max": 2.2257795333862305, "eval_gen_logits_mean": -22.191434860229492, "eval_gen_logits_min": -33.438270568847656, "eval_gen_logits_std": 3.436770439147949, "eval_gen_loss": 0.3170195519924164, "eval_loss": 0.2829743027687073, "eval_mean_copy_accuracy": 0.9929614067077637, "eval_mean_gen_accuracy": 0.884803295135498, "eval_mean_token_accuracy": 0.8987873792648315, "eval_num_tokens": 1085434194.0, "eval_runtime": 0.6944, "eval_samples_per_second": 11.521, "eval_steps_per_second": 2.88, "eval_total_num_tokens": 1085434194.0, "eval_z_loss": 0.0004429419641382992, "step": 8000 }, { "copy_logits_max": -5.752274513244629, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.5625, "epoch": 1.6341587949961704, "gen_logits_max": 2.736419916152954, "gen_logits_mean": -17.72519874572754, "gen_logits_min": -29.884353637695312, "gen_logits_std": 3.312685966491699, "gen_loss": 0.2620474696159363, "grad_norm": 0.363159201608976, "learning_rate": 2.0823157894736843e-05, "loss": 0.2546, "mean_copy_accuracy": 0.9968311488628387, "mean_gen_accuracy": 0.8798714205622673, "mean_token_accuracy": 0.9066683351993561, "num_tokens": 1085670232.0, "sample_num_tokens": 8397.5, "step": 8001, "total_num_tokens": 1085703822.0, "z_loss": 0.0004185803991276771 }, { "copy_logits_max": -4.486787796020508, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.75, "epoch": 1.6343630329333674, "gen_logits_max": 3.86257266998291, "gen_logits_mean": -16.766450881958008, "gen_logits_min": -29.1379337310791, "gen_logits_std": 3.277306079864502, "gen_loss": 0.2830200493335724, "grad_norm": 0.36098692696370266, "learning_rate": 2.0821894736842104e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9964452683925629, "mean_gen_accuracy": 0.8806540369987488, "mean_token_accuracy": 0.9070268124341965, "num_tokens": 1085941688.0, "sample_num_tokens": 8961.5, "step": 8002, "total_num_tokens": 1085977534.0, "z_loss": 0.0005013201152905822 }, { "copy_logits_max": -1.954741358757019, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.6875, "epoch": 1.6345672708705643, "gen_logits_max": 3.393117904663086, "gen_logits_mean": -16.573863983154297, "gen_logits_min": -29.281217575073242, "gen_logits_std": 3.2760934829711914, "gen_loss": 0.3214147686958313, "grad_norm": 0.35441324594148593, "learning_rate": 2.082063157894737e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9971620291471481, "mean_gen_accuracy": 0.8686161786317825, "mean_token_accuracy": 0.9024292677640915, "num_tokens": 1086198177.0, "sample_num_tokens": 7882.75, "step": 8003, "total_num_tokens": 1086229708.0, "z_loss": 0.0006210354622453451 }, { "copy_logits_max": -0.29268014430999756, "copy_logits_min": -750000128.0, "copy_num_tokens": 544.1875, "epoch": 1.634771508807761, "gen_logits_max": 4.651693344116211, "gen_logits_mean": -14.917557716369629, "gen_logits_min": -27.764915466308594, "gen_logits_std": 3.2242767810821533, "gen_loss": 0.25390753149986267, "grad_norm": 0.4049105381038913, "learning_rate": 2.0819368421052633e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9976344406604767, "mean_gen_accuracy": 0.8789958506822586, "mean_token_accuracy": 0.9060186296701431, "num_tokens": 1086458906.0, "sample_num_tokens": 8816.5, "step": 8004, "total_num_tokens": 1086494172.0, "z_loss": 0.0005547034088522196 }, { "copy_logits_max": -0.5663827657699585, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.5, "epoch": 1.6349757467449577, "gen_logits_max": 4.3096466064453125, "gen_logits_mean": -15.746416091918945, "gen_logits_min": -28.221942901611328, "gen_logits_std": 3.2536449432373047, "gen_loss": 0.3132052421569824, "grad_norm": 0.3528195448458247, "learning_rate": 2.0818105263157897e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9979605078697205, "mean_gen_accuracy": 0.8820609301328659, "mean_token_accuracy": 0.9106680303812027, "num_tokens": 1086739660.0, "sample_num_tokens": 8197.5, "step": 8005, "total_num_tokens": 1086772450.0, "z_loss": 0.0006121791084297001 }, { "copy_logits_max": -1.3177525997161865, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.0625, "epoch": 1.6351799846821549, "gen_logits_max": 4.20253849029541, "gen_logits_mean": -15.862165451049805, "gen_logits_min": -28.037822723388672, "gen_logits_std": 3.23750638961792, "gen_loss": 0.2917048931121826, "grad_norm": 0.37635528936397034, "learning_rate": 2.0816842105263158e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9971216470003128, "mean_gen_accuracy": 0.87393818795681, "mean_token_accuracy": 0.9033795893192291, "num_tokens": 1087006833.0, "sample_num_tokens": 8646.25, "step": 8006, "total_num_tokens": 1087041418.0, "z_loss": 0.0005378008354455233 }, { "copy_logits_max": 0.6741489171981812, "copy_logits_min": -687500032.0, "copy_num_tokens": 832.5625, "epoch": 1.6353842226193516, "gen_logits_max": 3.017038583755493, "gen_logits_mean": -15.644546508789062, "gen_logits_min": -28.244325637817383, "gen_logits_std": 3.261964797973633, "gen_loss": 0.25372540950775146, "grad_norm": 0.3404004336985849, "learning_rate": 2.0815578947368422e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9960046261548996, "mean_gen_accuracy": 0.8812840580940247, "mean_token_accuracy": 0.9107104539871216, "num_tokens": 1087303722.0, "sample_num_tokens": 10762.5, "step": 8007, "total_num_tokens": 1087346772.0, "z_loss": 0.0005366871482692659 }, { "copy_logits_max": 0.48443329334259033, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.25, "epoch": 1.6355884605565483, "gen_logits_max": 4.009983062744141, "gen_logits_mean": -14.931632995605469, "gen_logits_min": -26.953136444091797, "gen_logits_std": 3.158754825592041, "gen_loss": 0.25897669792175293, "grad_norm": 0.3546565299562736, "learning_rate": 2.0814315789473683e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9962303787469864, "mean_gen_accuracy": 0.8749684989452362, "mean_token_accuracy": 0.904775395989418, "num_tokens": 1087564683.0, "sample_num_tokens": 8251.25, "step": 8008, "total_num_tokens": 1087597688.0, "z_loss": 0.00046684819972142577 }, { "copy_logits_max": -1.2440662384033203, "copy_logits_min": -687500096.0, "copy_num_tokens": 514.5625, "epoch": 1.6357926984937452, "gen_logits_max": 3.2277934551239014, "gen_logits_mean": -17.056529998779297, "gen_logits_min": -29.186599731445312, "gen_logits_std": 3.260223627090454, "gen_loss": 0.26717403531074524, "grad_norm": 0.3941503482555653, "learning_rate": 2.0813052631578947e-05, "loss": 0.3068, "mean_copy_accuracy": 0.996445968747139, "mean_gen_accuracy": 0.8668946176767349, "mean_token_accuracy": 0.8958492577075958, "num_tokens": 1087817474.0, "sample_num_tokens": 8462.0, "step": 8009, "total_num_tokens": 1087851322.0, "z_loss": 0.0004861871129833162 }, { "copy_logits_max": 1.0518708229064941, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.375, "epoch": 1.6359969364309421, "gen_logits_max": 5.411460876464844, "gen_logits_mean": -15.083272933959961, "gen_logits_min": -27.60921859741211, "gen_logits_std": 3.216747760772705, "gen_loss": 0.2792935073375702, "grad_norm": 0.3582058997066238, "learning_rate": 2.081178947368421e-05, "loss": 0.271, "mean_copy_accuracy": 0.9969653189182281, "mean_gen_accuracy": 0.8801397085189819, "mean_token_accuracy": 0.9074038118124008, "num_tokens": 1088101749.0, "sample_num_tokens": 8225.25, "step": 8010, "total_num_tokens": 1088134650.0, "z_loss": 0.00044921640073880553 }, { "copy_logits_max": 2.029726028442383, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.8125, "epoch": 1.6362011743681388, "gen_logits_max": 4.960330486297607, "gen_logits_mean": -13.937041282653809, "gen_logits_min": -26.510709762573242, "gen_logits_std": 3.1932003498077393, "gen_loss": 0.2791248559951782, "grad_norm": 0.3495687712153529, "learning_rate": 2.0810526315789473e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9976833462715149, "mean_gen_accuracy": 0.8703381717205048, "mean_token_accuracy": 0.9020663946866989, "num_tokens": 1088383935.0, "sample_num_tokens": 9122.25, "step": 8011, "total_num_tokens": 1088420424.0, "z_loss": 0.0004610667529050261 }, { "copy_logits_max": -2.579012870788574, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.3125, "epoch": 1.6364054123053358, "gen_logits_max": 4.273447036743164, "gen_logits_mean": -17.11797523498535, "gen_logits_min": -29.248252868652344, "gen_logits_std": 3.29691219329834, "gen_loss": 0.2828839421272278, "grad_norm": 0.37589086919072745, "learning_rate": 2.080926315789474e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9970418214797974, "mean_gen_accuracy": 0.8725071549415588, "mean_token_accuracy": 0.898368701338768, "num_tokens": 1088639004.0, "sample_num_tokens": 7760.0, "step": 8012, "total_num_tokens": 1088670044.0, "z_loss": 0.00046494498383253813 }, { "copy_logits_max": -4.205883026123047, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.3125, "epoch": 1.6366096502425327, "gen_logits_max": 2.7840681076049805, "gen_logits_mean": -17.811241149902344, "gen_logits_min": -29.977752685546875, "gen_logits_std": 3.304243326187134, "gen_loss": 0.25922542810440063, "grad_norm": 0.4169198037522495, "learning_rate": 2.0808e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9967304021120071, "mean_gen_accuracy": 0.8772965669631958, "mean_token_accuracy": 0.9068996608257294, "num_tokens": 1088889320.0, "sample_num_tokens": 9339.5, "step": 8013, "total_num_tokens": 1088926678.0, "z_loss": 0.00040684526902623475 }, { "copy_logits_max": -2.3854751586914062, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.0625, "epoch": 1.6368138881797294, "gen_logits_max": 2.988570213317871, "gen_logits_mean": -17.685388565063477, "gen_logits_min": -30.414766311645508, "gen_logits_std": 3.3134400844573975, "gen_loss": 0.2844603955745697, "grad_norm": 0.3206674096635391, "learning_rate": 2.0806736842105266e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9960663020610809, "mean_gen_accuracy": 0.8765380680561066, "mean_token_accuracy": 0.9083998054265976, "num_tokens": 1089175129.0, "sample_num_tokens": 8089.25, "step": 8014, "total_num_tokens": 1089207486.0, "z_loss": 0.00046760428813286126 }, { "copy_logits_max": -1.656140685081482, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.1875, "epoch": 1.637018126116926, "gen_logits_max": 3.5155138969421387, "gen_logits_mean": -17.24860382080078, "gen_logits_min": -29.641448974609375, "gen_logits_std": 3.3070130348205566, "gen_loss": 0.25722768902778625, "grad_norm": 0.35200968460595145, "learning_rate": 2.0805473684210527e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9976665675640106, "mean_gen_accuracy": 0.8744165599346161, "mean_token_accuracy": 0.902580976486206, "num_tokens": 1089440602.0, "sample_num_tokens": 8564.0, "step": 8015, "total_num_tokens": 1089474858.0, "z_loss": 0.0003939092857763171 }, { "copy_logits_max": -3.286464214324951, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.75, "epoch": 1.637222364054123, "gen_logits_max": 3.7350165843963623, "gen_logits_mean": -15.94478988647461, "gen_logits_min": -27.91729736328125, "gen_logits_std": 3.1941466331481934, "gen_loss": 0.2755652964115143, "grad_norm": 0.3789703390321092, "learning_rate": 2.080421052631579e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9971735775470734, "mean_gen_accuracy": 0.874439463019371, "mean_token_accuracy": 0.905589148402214, "num_tokens": 1089706658.0, "sample_num_tokens": 9987.5, "step": 8016, "total_num_tokens": 1089746608.0, "z_loss": 0.00044639792758971453 }, { "copy_logits_max": -3.9790022373199463, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.125, "epoch": 1.63742660199132, "gen_logits_max": 3.809939384460449, "gen_logits_mean": -17.480159759521484, "gen_logits_min": -29.40280532836914, "gen_logits_std": 3.2849655151367188, "gen_loss": 0.29874682426452637, "grad_norm": 0.3483359856504406, "learning_rate": 2.0802947368421052e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9966137111186981, "mean_gen_accuracy": 0.8803702741861343, "mean_token_accuracy": 0.9053698629140854, "num_tokens": 1089972101.0, "sample_num_tokens": 8337.75, "step": 8017, "total_num_tokens": 1090005452.0, "z_loss": 0.0004279228742234409 }, { "copy_logits_max": -1.669736385345459, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.0, "epoch": 1.6376308399285167, "gen_logits_max": 3.011976957321167, "gen_logits_mean": -16.975942611694336, "gen_logits_min": -29.233903884887695, "gen_logits_std": 3.271587371826172, "gen_loss": 0.26701995730400085, "grad_norm": 0.3673714638818003, "learning_rate": 2.0801684210526316e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9967844486236572, "mean_gen_accuracy": 0.8757328540086746, "mean_token_accuracy": 0.9091730266809464, "num_tokens": 1090250324.0, "sample_num_tokens": 7536.5, "step": 8018, "total_num_tokens": 1090280470.0, "z_loss": 0.0004162349214311689 }, { "copy_logits_max": -2.126425266265869, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.0625, "epoch": 1.6378350778657136, "gen_logits_max": 3.5349345207214355, "gen_logits_mean": -17.57986831665039, "gen_logits_min": -29.698301315307617, "gen_logits_std": 3.287956476211548, "gen_loss": 0.3015206754207611, "grad_norm": 0.38244150740497207, "learning_rate": 2.0800421052631577e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9978256225585938, "mean_gen_accuracy": 0.8725840747356415, "mean_token_accuracy": 0.901198998093605, "num_tokens": 1090495142.0, "sample_num_tokens": 6935.0, "step": 8019, "total_num_tokens": 1090522882.0, "z_loss": 0.0004622788983397186 }, { "copy_logits_max": -1.8956516981124878, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.5, "epoch": 1.6380393158029105, "gen_logits_max": 3.9668221473693848, "gen_logits_mean": -15.367019653320312, "gen_logits_min": -27.197307586669922, "gen_logits_std": 3.194169044494629, "gen_loss": 0.2901571989059448, "grad_norm": 0.3837721963352889, "learning_rate": 2.0799157894736845e-05, "loss": 0.2947, "mean_copy_accuracy": 0.9970577955245972, "mean_gen_accuracy": 0.8685509115457535, "mean_token_accuracy": 0.8998629301786423, "num_tokens": 1090770082.0, "sample_num_tokens": 8400.0, "step": 8020, "total_num_tokens": 1090803682.0, "z_loss": 0.0004509363789111376 }, { "copy_logits_max": -2.669002056121826, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.5, "epoch": 1.6382435537401072, "gen_logits_max": 3.82446551322937, "gen_logits_mean": -17.042831420898438, "gen_logits_min": -29.610830307006836, "gen_logits_std": 3.2957677841186523, "gen_loss": 0.266573429107666, "grad_norm": 0.3434647927206834, "learning_rate": 2.0797894736842106e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9969999343156815, "mean_gen_accuracy": 0.8748926073312759, "mean_token_accuracy": 0.9072580933570862, "num_tokens": 1091047451.0, "sample_num_tokens": 7248.25, "step": 8021, "total_num_tokens": 1091076444.0, "z_loss": 0.00040911725955083966 }, { "copy_logits_max": -3.7057738304138184, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.25, "epoch": 1.638447791677304, "gen_logits_max": 3.909442186355591, "gen_logits_mean": -15.887813568115234, "gen_logits_min": -28.012088775634766, "gen_logits_std": 3.2151756286621094, "gen_loss": 0.3010317087173462, "grad_norm": 0.38372822877334933, "learning_rate": 2.079663157894737e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9976160675287247, "mean_gen_accuracy": 0.8745889812707901, "mean_token_accuracy": 0.9048519134521484, "num_tokens": 1091287613.0, "sample_num_tokens": 7273.75, "step": 8022, "total_num_tokens": 1091316708.0, "z_loss": 0.00047327764332294464 }, { "copy_logits_max": -0.11785471439361572, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.25, "epoch": 1.6386520296145008, "gen_logits_max": 4.087597370147705, "gen_logits_mean": -15.528032302856445, "gen_logits_min": -27.8330078125, "gen_logits_std": 3.2378885746002197, "gen_loss": 0.23724043369293213, "grad_norm": 0.3798339058219212, "learning_rate": 2.079536842105263e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9965731352567673, "mean_gen_accuracy": 0.8734604865312576, "mean_token_accuracy": 0.906289204955101, "num_tokens": 1091550646.0, "sample_num_tokens": 8425.5, "step": 8023, "total_num_tokens": 1091584348.0, "z_loss": 0.0003624984237831086 }, { "copy_logits_max": -5.371728420257568, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.625, "epoch": 1.6388562675516978, "gen_logits_max": 3.124622344970703, "gen_logits_mean": -18.270427703857422, "gen_logits_min": -30.143556594848633, "gen_logits_std": 3.2878260612487793, "gen_loss": 0.33817338943481445, "grad_norm": 0.3931944071999263, "learning_rate": 2.0794105263157895e-05, "loss": 0.2987, "mean_copy_accuracy": 0.9963763803243637, "mean_gen_accuracy": 0.8707205653190613, "mean_token_accuracy": 0.8990933448076248, "num_tokens": 1091820798.0, "sample_num_tokens": 8825.5, "step": 8024, "total_num_tokens": 1091856100.0, "z_loss": 0.0004991508321836591 }, { "copy_logits_max": -4.143320560455322, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.4375, "epoch": 1.6390605054888945, "gen_logits_max": 3.33351469039917, "gen_logits_mean": -17.438232421875, "gen_logits_min": -29.4492130279541, "gen_logits_std": 3.270951747894287, "gen_loss": 0.26840826869010925, "grad_norm": 0.357207265702031, "learning_rate": 2.0792842105263156e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9971268028020859, "mean_gen_accuracy": 0.8753021657466888, "mean_token_accuracy": 0.9036792367696762, "num_tokens": 1092086921.0, "sample_num_tokens": 9037.75, "step": 8025, "total_num_tokens": 1092123072.0, "z_loss": 0.00044836130109615624 }, { "copy_logits_max": -4.031649589538574, "copy_logits_min": -687500032.0, "copy_num_tokens": 553.1875, "epoch": 1.6392647434260914, "gen_logits_max": 3.033843994140625, "gen_logits_mean": -17.47195816040039, "gen_logits_min": -29.43351173400879, "gen_logits_std": 3.2849371433258057, "gen_loss": 0.3042886257171631, "grad_norm": 0.3592897741092235, "learning_rate": 2.079157894736842e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9968300461769104, "mean_gen_accuracy": 0.8693093210458755, "mean_token_accuracy": 0.9017366915941238, "num_tokens": 1092366005.0, "sample_num_tokens": 9536.75, "step": 8026, "total_num_tokens": 1092404152.0, "z_loss": 0.0005053025088272989 }, { "copy_logits_max": -2.805469036102295, "copy_logits_min": -687500032.0, "copy_num_tokens": 344.0625, "epoch": 1.6394689813632883, "gen_logits_max": 4.110454559326172, "gen_logits_mean": -16.68149185180664, "gen_logits_min": -29.20132827758789, "gen_logits_std": 3.29140305519104, "gen_loss": 0.29152464866638184, "grad_norm": 0.3396454054005376, "learning_rate": 2.0790315789473685e-05, "loss": 0.2745, "mean_copy_accuracy": 0.996993288397789, "mean_gen_accuracy": 0.8784739822149277, "mean_token_accuracy": 0.9067069888114929, "num_tokens": 1092624153.0, "sample_num_tokens": 7532.75, "step": 8027, "total_num_tokens": 1092654284.0, "z_loss": 0.00046383647713810205 }, { "copy_logits_max": -3.264878273010254, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.25, "epoch": 1.639673219300485, "gen_logits_max": 2.917346954345703, "gen_logits_mean": -17.283443450927734, "gen_logits_min": -29.50477409362793, "gen_logits_std": 3.3095507621765137, "gen_loss": 0.2900690734386444, "grad_norm": 0.37598796931048695, "learning_rate": 2.078905263157895e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9975029528141022, "mean_gen_accuracy": 0.8732926696538925, "mean_token_accuracy": 0.9046170860528946, "num_tokens": 1092878055.0, "sample_num_tokens": 8759.25, "step": 8028, "total_num_tokens": 1092913092.0, "z_loss": 0.0004965108819305897 }, { "copy_logits_max": -4.387337684631348, "copy_logits_min": -687500032.0, "copy_num_tokens": 395.125, "epoch": 1.6398774572376817, "gen_logits_max": 3.4511806964874268, "gen_logits_mean": -17.154685974121094, "gen_logits_min": -29.804855346679688, "gen_logits_std": 3.3078255653381348, "gen_loss": 0.2854021191596985, "grad_norm": 0.33883418585094643, "learning_rate": 2.0787789473684213e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9971070736646652, "mean_gen_accuracy": 0.8760427236557007, "mean_token_accuracy": 0.9046800136566162, "num_tokens": 1093149781.0, "sample_num_tokens": 7709.25, "step": 8029, "total_num_tokens": 1093180618.0, "z_loss": 0.0004838623572140932 }, { "copy_logits_max": -5.558315277099609, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 1.640081695174879, "gen_logits_max": 2.3668346405029297, "gen_logits_mean": -19.305004119873047, "gen_logits_min": -31.216459274291992, "gen_logits_std": 3.37267804145813, "gen_loss": 0.28667187690734863, "grad_norm": 0.3730870237599703, "learning_rate": 2.0786526315789474e-05, "loss": 0.2973, "mean_copy_accuracy": 0.9968802183866501, "mean_gen_accuracy": 0.8717764616012573, "mean_token_accuracy": 0.8998804837465286, "num_tokens": 1093406583.0, "sample_num_tokens": 7631.75, "step": 8030, "total_num_tokens": 1093437110.0, "z_loss": 0.00048389655421487987 }, { "copy_logits_max": -3.634561777114868, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.625, "epoch": 1.6402859331120756, "gen_logits_max": 3.8969311714172363, "gen_logits_mean": -17.643632888793945, "gen_logits_min": -29.619644165039062, "gen_logits_std": 3.3193304538726807, "gen_loss": 0.23075586557388306, "grad_norm": 0.32065633506075464, "learning_rate": 2.078526315789474e-05, "loss": 0.2536, "mean_copy_accuracy": 0.9981668144464493, "mean_gen_accuracy": 0.8793440908193588, "mean_token_accuracy": 0.9135446101427078, "num_tokens": 1093698705.0, "sample_num_tokens": 7173.25, "step": 8031, "total_num_tokens": 1093727398.0, "z_loss": 0.00035215733805671334 }, { "copy_logits_max": -5.6463141441345215, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.1875, "epoch": 1.6404901710492723, "gen_logits_max": 4.185853481292725, "gen_logits_mean": -16.08488655090332, "gen_logits_min": -28.475910186767578, "gen_logits_std": 3.2656257152557373, "gen_loss": 0.25570595264434814, "grad_norm": 0.34334066458448836, "learning_rate": 2.0784e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9970953613519669, "mean_gen_accuracy": 0.8796369582414627, "mean_token_accuracy": 0.9103062003850937, "num_tokens": 1093989438.0, "sample_num_tokens": 8995.5, "step": 8032, "total_num_tokens": 1094025420.0, "z_loss": 0.0003911165986210108 }, { "copy_logits_max": -5.850634574890137, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.9375, "epoch": 1.6406944089864692, "gen_logits_max": 3.010572910308838, "gen_logits_mean": -18.476444244384766, "gen_logits_min": -30.241130828857422, "gen_logits_std": 3.3456339836120605, "gen_loss": 0.2862411141395569, "grad_norm": 0.3373589743147702, "learning_rate": 2.0782736842105264e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9973196685314178, "mean_gen_accuracy": 0.8757750689983368, "mean_token_accuracy": 0.9072894752025604, "num_tokens": 1094260512.0, "sample_num_tokens": 7807.0, "step": 8033, "total_num_tokens": 1094291740.0, "z_loss": 0.0004280306166037917 }, { "copy_logits_max": -4.090190410614014, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.125, "epoch": 1.6408986469236662, "gen_logits_max": 4.105975151062012, "gen_logits_mean": -17.279582977294922, "gen_logits_min": -29.22027015686035, "gen_logits_std": 3.2988204956054688, "gen_loss": 0.2767663896083832, "grad_norm": 0.3741922889935823, "learning_rate": 2.0781473684210525e-05, "loss": 0.2774, "mean_copy_accuracy": 0.99579256772995, "mean_gen_accuracy": 0.8833477795124054, "mean_token_accuracy": 0.9062472134828568, "num_tokens": 1094500185.0, "sample_num_tokens": 6826.75, "step": 8034, "total_num_tokens": 1094527492.0, "z_loss": 0.0004099010839127004 }, { "copy_logits_max": -4.356239318847656, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.5625, "epoch": 1.6411028848608629, "gen_logits_max": 3.6163330078125, "gen_logits_mean": -18.039236068725586, "gen_logits_min": -29.97373390197754, "gen_logits_std": 3.3255977630615234, "gen_loss": 0.2551521062850952, "grad_norm": 0.4031333151538667, "learning_rate": 2.078021052631579e-05, "loss": 0.2785, "mean_copy_accuracy": 0.9961569756269455, "mean_gen_accuracy": 0.8785210698843002, "mean_token_accuracy": 0.9031549543142319, "num_tokens": 1094750192.0, "sample_num_tokens": 8041.0, "step": 8035, "total_num_tokens": 1094782356.0, "z_loss": 0.0003917373833246529 }, { "copy_logits_max": -5.585220813751221, "copy_logits_min": -687500032.0, "copy_num_tokens": 298.3125, "epoch": 1.6413071227980598, "gen_logits_max": 3.7578461170196533, "gen_logits_mean": -18.451129913330078, "gen_logits_min": -30.45340347290039, "gen_logits_std": 3.346754550933838, "gen_loss": 0.29579877853393555, "grad_norm": 0.33576523766652394, "learning_rate": 2.0778947368421053e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9972529262304306, "mean_gen_accuracy": 0.8819920569658279, "mean_token_accuracy": 0.9100502580404282, "num_tokens": 1095020179.0, "sample_num_tokens": 7629.25, "step": 8036, "total_num_tokens": 1095050696.0, "z_loss": 0.0004758767317980528 }, { "copy_logits_max": -4.704953193664551, "copy_logits_min": -750000128.0, "copy_num_tokens": 418.625, "epoch": 1.6415113607352567, "gen_logits_max": 3.169762134552002, "gen_logits_mean": -18.098918914794922, "gen_logits_min": -30.17927360534668, "gen_logits_std": 3.3391294479370117, "gen_loss": 0.2670690417289734, "grad_norm": 0.35973051301897274, "learning_rate": 2.0777684210526318e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9963571578264236, "mean_gen_accuracy": 0.8811526745557785, "mean_token_accuracy": 0.9094544351100922, "num_tokens": 1095282022.0, "sample_num_tokens": 7855.0, "step": 8037, "total_num_tokens": 1095313442.0, "z_loss": 0.0004113841278012842 }, { "copy_logits_max": -1.0652247667312622, "copy_logits_min": -625000064.0, "copy_num_tokens": 511.9375, "epoch": 1.6417155986724534, "gen_logits_max": 3.633881092071533, "gen_logits_mean": -16.377117156982422, "gen_logits_min": -28.82715606689453, "gen_logits_std": 3.3002376556396484, "gen_loss": 0.2621399462223053, "grad_norm": 0.35675355625862054, "learning_rate": 2.077642105263158e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9964417666196823, "mean_gen_accuracy": 0.8751193732023239, "mean_token_accuracy": 0.9019101411104202, "num_tokens": 1095533621.0, "sample_num_tokens": 7903.75, "step": 8038, "total_num_tokens": 1095565236.0, "z_loss": 0.0004184445133432746 }, { "copy_logits_max": -3.7395291328430176, "copy_logits_min": -687500032.0, "copy_num_tokens": 362.3125, "epoch": 1.6419198366096501, "gen_logits_max": 4.404285430908203, "gen_logits_mean": -16.161231994628906, "gen_logits_min": -28.492341995239258, "gen_logits_std": 3.2532296180725098, "gen_loss": 0.3041101396083832, "grad_norm": 0.35535725578051625, "learning_rate": 2.0775157894736843e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9972109645605087, "mean_gen_accuracy": 0.87895168364048, "mean_token_accuracy": 0.9077194333076477, "num_tokens": 1095796590.0, "sample_num_tokens": 8274.5, "step": 8039, "total_num_tokens": 1095829688.0, "z_loss": 0.0004923450760543346 }, { "copy_logits_max": -3.8360745906829834, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.625, "epoch": 1.642124074546847, "gen_logits_max": 3.7957570552825928, "gen_logits_mean": -17.347991943359375, "gen_logits_min": -29.758934020996094, "gen_logits_std": 3.328153371810913, "gen_loss": 0.27432987093925476, "grad_norm": 0.3553946312257852, "learning_rate": 2.0773894736842107e-05, "loss": 0.2653, "mean_copy_accuracy": 0.9971944838762283, "mean_gen_accuracy": 0.8802213072776794, "mean_token_accuracy": 0.9103894084692001, "num_tokens": 1096061073.0, "sample_num_tokens": 9233.25, "step": 8040, "total_num_tokens": 1096098006.0, "z_loss": 0.0004721618606708944 }, { "copy_logits_max": -4.331214427947998, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.5, "epoch": 1.642328312484044, "gen_logits_max": 4.465327262878418, "gen_logits_mean": -16.293014526367188, "gen_logits_min": -28.662918090820312, "gen_logits_std": 3.2789087295532227, "gen_loss": 0.2810930609703064, "grad_norm": 0.3738133130409109, "learning_rate": 2.0772631578947368e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9960217326879501, "mean_gen_accuracy": 0.8798908442258835, "mean_token_accuracy": 0.9050234705209732, "num_tokens": 1096305056.0, "sample_num_tokens": 7899.0, "step": 8041, "total_num_tokens": 1096336652.0, "z_loss": 0.0004629736067727208 }, { "copy_logits_max": -6.132275581359863, "copy_logits_min": -687500032.0, "copy_num_tokens": 603.4375, "epoch": 1.6425325504212407, "gen_logits_max": 3.0188286304473877, "gen_logits_mean": -17.491729736328125, "gen_logits_min": -30.14525604248047, "gen_logits_std": 3.361114263534546, "gen_loss": 0.2426498979330063, "grad_norm": 0.3220501138689804, "learning_rate": 2.0771368421052632e-05, "loss": 0.2484, "mean_copy_accuracy": 0.9982209354639053, "mean_gen_accuracy": 0.8839857876300812, "mean_token_accuracy": 0.9153759777545929, "num_tokens": 1096600089.0, "sample_num_tokens": 9349.25, "step": 8042, "total_num_tokens": 1096637486.0, "z_loss": 0.00038367946399375796 }, { "copy_logits_max": -6.173385143280029, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.375, "epoch": 1.6427367883584376, "gen_logits_max": 3.1971187591552734, "gen_logits_mean": -17.851524353027344, "gen_logits_min": -30.7020320892334, "gen_logits_std": 3.352059841156006, "gen_loss": 0.2962496876716614, "grad_norm": 0.3120634590862795, "learning_rate": 2.0770105263157893e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9971535503864288, "mean_gen_accuracy": 0.8808454424142838, "mean_token_accuracy": 0.9078685194253922, "num_tokens": 1096896789.0, "sample_num_tokens": 8969.75, "step": 8043, "total_num_tokens": 1096932668.0, "z_loss": 0.0004913450684398413 }, { "copy_logits_max": -5.548360824584961, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.5625, "epoch": 1.6429410262956345, "gen_logits_max": 2.1396524906158447, "gen_logits_mean": -19.30701446533203, "gen_logits_min": -31.635440826416016, "gen_logits_std": 3.410705804824829, "gen_loss": 0.2887049615383148, "grad_norm": 0.37823398595704594, "learning_rate": 2.0768842105263158e-05, "loss": 0.2556, "mean_copy_accuracy": 0.9974908530712128, "mean_gen_accuracy": 0.8755403459072113, "mean_token_accuracy": 0.9121200144290924, "num_tokens": 1097172630.0, "sample_num_tokens": 8563.0, "step": 8044, "total_num_tokens": 1097206882.0, "z_loss": 0.0004929103888571262 }, { "copy_logits_max": -4.544341564178467, "copy_logits_min": -687500032.0, "copy_num_tokens": 423.0, "epoch": 1.6431452642328312, "gen_logits_max": 3.770413875579834, "gen_logits_mean": -16.421226501464844, "gen_logits_min": -30.013917922973633, "gen_logits_std": 3.3172762393951416, "gen_loss": 0.2788689136505127, "grad_norm": 0.3364921375978377, "learning_rate": 2.0767578947368422e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9977864474058151, "mean_gen_accuracy": 0.8814442753791809, "mean_token_accuracy": 0.9108799546957016, "num_tokens": 1097443893.0, "sample_num_tokens": 7649.25, "step": 8045, "total_num_tokens": 1097474490.0, "z_loss": 0.0005221200990490615 }, { "copy_logits_max": -5.380483627319336, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.5, "epoch": 1.643349502170028, "gen_logits_max": 4.154117584228516, "gen_logits_mean": -16.194087982177734, "gen_logits_min": -28.511634826660156, "gen_logits_std": 3.272486686706543, "gen_loss": 0.2748901844024658, "grad_norm": 0.3530494528762042, "learning_rate": 2.0766315789473686e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9979541152715683, "mean_gen_accuracy": 0.8737139701843262, "mean_token_accuracy": 0.9077388495206833, "num_tokens": 1097744665.0, "sample_num_tokens": 8958.25, "step": 8046, "total_num_tokens": 1097780498.0, "z_loss": 0.0005572012159973383 }, { "copy_logits_max": -5.144146919250488, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.75, "epoch": 1.6435537401072249, "gen_logits_max": 3.8990609645843506, "gen_logits_mean": -17.040380477905273, "gen_logits_min": -29.409059524536133, "gen_logits_std": 3.3184986114501953, "gen_loss": 0.28432872891426086, "grad_norm": 0.3601192837633148, "learning_rate": 2.0765052631578947e-05, "loss": 0.2761, "mean_copy_accuracy": 0.997182086110115, "mean_gen_accuracy": 0.8752624690532684, "mean_token_accuracy": 0.9075778871774673, "num_tokens": 1098022430.0, "sample_num_tokens": 8287.5, "step": 8047, "total_num_tokens": 1098055580.0, "z_loss": 0.0004918143386021256 }, { "copy_logits_max": -6.297146320343018, "copy_logits_min": -750000000.0, "copy_num_tokens": 276.125, "epoch": 1.6437579780444218, "gen_logits_max": 4.367086410522461, "gen_logits_mean": -16.804256439208984, "gen_logits_min": -29.439071655273438, "gen_logits_std": 3.3164167404174805, "gen_loss": 0.303625226020813, "grad_norm": 0.3607559904932093, "learning_rate": 2.076378947368421e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9970123022794724, "mean_gen_accuracy": 0.8764153718948364, "mean_token_accuracy": 0.9027704149484634, "num_tokens": 1098274639.0, "sample_num_tokens": 6909.25, "step": 8048, "total_num_tokens": 1098302276.0, "z_loss": 0.00050612777704373 }, { "copy_logits_max": -7.375267028808594, "copy_logits_min": -687500032.0, "copy_num_tokens": 288.0625, "epoch": 1.6439622159816185, "gen_logits_max": 4.1748247146606445, "gen_logits_mean": -17.609615325927734, "gen_logits_min": -30.180908203125, "gen_logits_std": 3.334815502166748, "gen_loss": 0.33008289337158203, "grad_norm": 0.37075977419786665, "learning_rate": 2.0762526315789473e-05, "loss": 0.2959, "mean_copy_accuracy": 0.9973254948854446, "mean_gen_accuracy": 0.8729792088270187, "mean_token_accuracy": 0.8989987820386887, "num_tokens": 1098530698.0, "sample_num_tokens": 7819.0, "step": 8049, "total_num_tokens": 1098561974.0, "z_loss": 0.0005191767122596502 }, { "copy_logits_max": -7.126926898956299, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.75, "epoch": 1.6441664539188154, "gen_logits_max": 4.140490531921387, "gen_logits_mean": -17.659626007080078, "gen_logits_min": -30.45100212097168, "gen_logits_std": 3.342458724975586, "gen_loss": 0.277651846408844, "grad_norm": 0.37607640021309746, "learning_rate": 2.0761263157894737e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9960577636957169, "mean_gen_accuracy": 0.8802700340747833, "mean_token_accuracy": 0.9070278257131577, "num_tokens": 1098791687.0, "sample_num_tokens": 8088.25, "step": 8050, "total_num_tokens": 1098824040.0, "z_loss": 0.000456536712590605 }, { "copy_logits_max": -6.034519195556641, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.5625, "epoch": 1.6443706918560124, "gen_logits_max": 5.095597267150879, "gen_logits_mean": -16.19552230834961, "gen_logits_min": -28.873981475830078, "gen_logits_std": 3.3157191276550293, "gen_loss": 0.27122828364372253, "grad_norm": 0.37422757467705614, "learning_rate": 2.0759999999999998e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9969261139631271, "mean_gen_accuracy": 0.8777221143245697, "mean_token_accuracy": 0.9051996469497681, "num_tokens": 1099049933.0, "sample_num_tokens": 8515.75, "step": 8051, "total_num_tokens": 1099083996.0, "z_loss": 0.0004735974071081728 }, { "copy_logits_max": -7.445425033569336, "copy_logits_min": -750000000.0, "copy_num_tokens": 315.625, "epoch": 1.644574929793209, "gen_logits_max": 3.775017261505127, "gen_logits_mean": -17.90024185180664, "gen_logits_min": -30.526256561279297, "gen_logits_std": 3.3588929176330566, "gen_loss": 0.2974315881729126, "grad_norm": 0.348447346308193, "learning_rate": 2.0758736842105262e-05, "loss": 0.263, "mean_copy_accuracy": 0.9965724647045135, "mean_gen_accuracy": 0.8830307871103287, "mean_token_accuracy": 0.9115615785121918, "num_tokens": 1099316571.0, "sample_num_tokens": 7007.75, "step": 8052, "total_num_tokens": 1099344602.0, "z_loss": 0.0004504915268626064 }, { "copy_logits_max": -4.3822550773620605, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.875, "epoch": 1.6447791677304058, "gen_logits_max": 4.162661552429199, "gen_logits_mean": -15.265785217285156, "gen_logits_min": -28.30514144897461, "gen_logits_std": 3.2882328033447266, "gen_loss": 0.2865373492240906, "grad_norm": 0.5556301106477087, "learning_rate": 2.075747368421053e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9977086335420609, "mean_gen_accuracy": 0.8736129701137543, "mean_token_accuracy": 0.9107245057821274, "num_tokens": 1099580647.0, "sample_num_tokens": 7428.75, "step": 8053, "total_num_tokens": 1099610362.0, "z_loss": 0.0004180411924608052 }, { "copy_logits_max": -7.117029190063477, "copy_logits_min": -750000064.0, "copy_num_tokens": 511.0625, "epoch": 1.6449834056676027, "gen_logits_max": 3.745048999786377, "gen_logits_mean": -16.64828109741211, "gen_logits_min": -29.240543365478516, "gen_logits_std": 3.298671245574951, "gen_loss": 0.28656327724456787, "grad_norm": 0.40439472830742446, "learning_rate": 2.075621052631579e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9965462535619736, "mean_gen_accuracy": 0.8808697164058685, "mean_token_accuracy": 0.9103781580924988, "num_tokens": 1099842149.0, "sample_num_tokens": 8342.25, "step": 8054, "total_num_tokens": 1099875518.0, "z_loss": 0.00040431381785310805 }, { "copy_logits_max": -6.376359939575195, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.0, "epoch": 1.6451876436047996, "gen_logits_max": 3.5859789848327637, "gen_logits_mean": -16.463340759277344, "gen_logits_min": -29.02735137939453, "gen_logits_std": 3.309674024581909, "gen_loss": 0.26968812942504883, "grad_norm": 0.36930307759399855, "learning_rate": 2.0754947368421055e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9975030869245529, "mean_gen_accuracy": 0.8715942203998566, "mean_token_accuracy": 0.90141861140728, "num_tokens": 1100117891.0, "sample_num_tokens": 8710.75, "step": 8055, "total_num_tokens": 1100152734.0, "z_loss": 0.00039919448317959905 }, { "copy_logits_max": -8.466114044189453, "copy_logits_min": -750000000.0, "copy_num_tokens": 308.6875, "epoch": 1.6453918815419963, "gen_logits_max": 3.4165380001068115, "gen_logits_mean": -18.654003143310547, "gen_logits_min": -30.673999786376953, "gen_logits_std": 3.3497071266174316, "gen_loss": 0.3065338134765625, "grad_norm": 0.36565592291929394, "learning_rate": 2.0753684210526316e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9970247596502304, "mean_gen_accuracy": 0.8802265524864197, "mean_token_accuracy": 0.9052214324474335, "num_tokens": 1100366676.0, "sample_num_tokens": 7312.0, "step": 8056, "total_num_tokens": 1100395924.0, "z_loss": 0.00047443274524994195 }, { "copy_logits_max": -7.685197353363037, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.9375, "epoch": 1.6455961194791933, "gen_logits_max": 2.8559823036193848, "gen_logits_mean": -19.063541412353516, "gen_logits_min": -31.40631103515625, "gen_logits_std": 3.406933069229126, "gen_loss": 0.26559001207351685, "grad_norm": 0.38809503781353316, "learning_rate": 2.075242105263158e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9966265261173248, "mean_gen_accuracy": 0.8769704401493073, "mean_token_accuracy": 0.9027747511863708, "num_tokens": 1100621027.0, "sample_num_tokens": 7420.25, "step": 8057, "total_num_tokens": 1100650708.0, "z_loss": 0.0003976903681177646 }, { "copy_logits_max": -8.145195007324219, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.0625, "epoch": 1.6458003574163902, "gen_logits_max": 3.7148921489715576, "gen_logits_mean": -16.217483520507812, "gen_logits_min": -29.074573516845703, "gen_logits_std": 3.330138683319092, "gen_loss": 0.23775038123130798, "grad_norm": 0.3751700443342817, "learning_rate": 2.075115789473684e-05, "loss": 0.2526, "mean_copy_accuracy": 0.9981176108121872, "mean_gen_accuracy": 0.8837774991989136, "mean_token_accuracy": 0.9148551821708679, "num_tokens": 1100897181.0, "sample_num_tokens": 8467.25, "step": 8058, "total_num_tokens": 1100931050.0, "z_loss": 0.0003616329049691558 }, { "copy_logits_max": -7.35629940032959, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.875, "epoch": 1.646004595353587, "gen_logits_max": 3.8982152938842773, "gen_logits_mean": -17.497955322265625, "gen_logits_min": -30.111312866210938, "gen_logits_std": 3.340259552001953, "gen_loss": 0.2932283282279968, "grad_norm": 0.37141876991997974, "learning_rate": 2.0749894736842105e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9971742630004883, "mean_gen_accuracy": 0.8770395964384079, "mean_token_accuracy": 0.9043219685554504, "num_tokens": 1101167285.0, "sample_num_tokens": 7375.25, "step": 8059, "total_num_tokens": 1101196786.0, "z_loss": 0.00041176570812240243 }, { "copy_logits_max": -6.201437950134277, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.4375, "epoch": 1.6462088332907836, "gen_logits_max": 4.195828437805176, "gen_logits_mean": -16.260055541992188, "gen_logits_min": -29.204090118408203, "gen_logits_std": 3.3193485736846924, "gen_loss": 0.2436782419681549, "grad_norm": 0.39842897537753486, "learning_rate": 2.0748631578947366e-05, "loss": 0.2517, "mean_copy_accuracy": 0.9971081763505936, "mean_gen_accuracy": 0.8855688571929932, "mean_token_accuracy": 0.9141180366277695, "num_tokens": 1101437652.0, "sample_num_tokens": 7613.5, "step": 8060, "total_num_tokens": 1101468106.0, "z_loss": 0.00040288761374540627 }, { "copy_logits_max": -6.916943550109863, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.3125, "epoch": 1.6464130712279808, "gen_logits_max": 2.7252197265625, "gen_logits_mean": -19.129379272460938, "gen_logits_min": -31.410978317260742, "gen_logits_std": 3.3964967727661133, "gen_loss": 0.3171301484107971, "grad_norm": 0.349669883545544, "learning_rate": 2.0747368421052634e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9967980533838272, "mean_gen_accuracy": 0.8805011063814163, "mean_token_accuracy": 0.9080384224653244, "num_tokens": 1101687070.0, "sample_num_tokens": 8246.5, "step": 8061, "total_num_tokens": 1101720056.0, "z_loss": 0.0004686190513893962 }, { "copy_logits_max": -0.9223107099533081, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.6875, "epoch": 1.6466173091651775, "gen_logits_max": 3.780381679534912, "gen_logits_mean": -15.768234252929688, "gen_logits_min": -28.207286834716797, "gen_logits_std": 3.2966690063476562, "gen_loss": 0.2571364641189575, "grad_norm": 0.37988284108036235, "learning_rate": 2.0746105263157895e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9976648241281509, "mean_gen_accuracy": 0.8781221807003021, "mean_token_accuracy": 0.9106298685073853, "num_tokens": 1101967305.0, "sample_num_tokens": 7527.25, "step": 8062, "total_num_tokens": 1101997414.0, "z_loss": 0.00046552892308682203 }, { "copy_logits_max": -3.6038529872894287, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.75, "epoch": 1.6468215471023742, "gen_logits_max": 4.579999923706055, "gen_logits_mean": -15.295419692993164, "gen_logits_min": -28.199752807617188, "gen_logits_std": 3.2778658866882324, "gen_loss": 0.2857006788253784, "grad_norm": 0.3607755724903347, "learning_rate": 2.074484210526316e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9969524592161179, "mean_gen_accuracy": 0.8658323734998703, "mean_token_accuracy": 0.9013501852750778, "num_tokens": 1102238491.0, "sample_num_tokens": 6780.25, "step": 8063, "total_num_tokens": 1102265612.0, "z_loss": 0.0004601827240549028 }, { "copy_logits_max": -4.097255229949951, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.9375, "epoch": 1.647025785039571, "gen_logits_max": 2.540674924850464, "gen_logits_mean": -18.000362396240234, "gen_logits_min": -30.602378845214844, "gen_logits_std": 3.3713390827178955, "gen_loss": 0.2579008936882019, "grad_norm": 0.37668395796023485, "learning_rate": 2.074357894736842e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9971732050180435, "mean_gen_accuracy": 0.8788866549730301, "mean_token_accuracy": 0.9076468795537949, "num_tokens": 1102504656.0, "sample_num_tokens": 8478.0, "step": 8064, "total_num_tokens": 1102538568.0, "z_loss": 0.0004566044663079083 }, { "copy_logits_max": -5.152789115905762, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.9375, "epoch": 1.647230022976768, "gen_logits_max": 3.8214211463928223, "gen_logits_mean": -17.17885971069336, "gen_logits_min": -29.84041976928711, "gen_logits_std": 3.3206706047058105, "gen_loss": 0.29274165630340576, "grad_norm": 0.33972524599147835, "learning_rate": 2.0742315789473685e-05, "loss": 0.288, "mean_copy_accuracy": 0.9975069612264633, "mean_gen_accuracy": 0.8724994510412216, "mean_token_accuracy": 0.904025673866272, "num_tokens": 1102786986.0, "sample_num_tokens": 7005.5, "step": 8065, "total_num_tokens": 1102815008.0, "z_loss": 0.0004965622792951763 }, { "copy_logits_max": -4.684414386749268, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.0, "epoch": 1.6474342609139647, "gen_logits_max": 4.2387375831604, "gen_logits_mean": -16.587615966796875, "gen_logits_min": -29.391590118408203, "gen_logits_std": 3.2936432361602783, "gen_loss": 0.2654085159301758, "grad_norm": 0.34543085780507826, "learning_rate": 2.074105263157895e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9963754564523697, "mean_gen_accuracy": 0.8733474314212799, "mean_token_accuracy": 0.9084243774414062, "num_tokens": 1103074328.0, "sample_num_tokens": 9047.0, "step": 8066, "total_num_tokens": 1103110516.0, "z_loss": 0.00045491004129871726 }, { "copy_logits_max": -4.5749125480651855, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.0625, "epoch": 1.6476384988511616, "gen_logits_max": 3.78456974029541, "gen_logits_mean": -16.987220764160156, "gen_logits_min": -29.342557907104492, "gen_logits_std": 3.2961952686309814, "gen_loss": 0.3136660158634186, "grad_norm": 0.36373747114653737, "learning_rate": 2.073978947368421e-05, "loss": 0.285, "mean_copy_accuracy": 0.9969237893819809, "mean_gen_accuracy": 0.8744023591279984, "mean_token_accuracy": 0.903082937002182, "num_tokens": 1103330426.0, "sample_num_tokens": 8227.5, "step": 8067, "total_num_tokens": 1103363336.0, "z_loss": 0.0005752663710154593 }, { "copy_logits_max": -4.15641975402832, "copy_logits_min": -750000000.0, "copy_num_tokens": 687.875, "epoch": 1.6478427367883586, "gen_logits_max": 2.742575168609619, "gen_logits_mean": -17.222667694091797, "gen_logits_min": -29.58580780029297, "gen_logits_std": 3.344484567642212, "gen_loss": 0.24854938685894012, "grad_norm": 0.42837391798717617, "learning_rate": 2.0738526315789474e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9969396740198135, "mean_gen_accuracy": 0.8733667880296707, "mean_token_accuracy": 0.903827890753746, "num_tokens": 1103590595.0, "sample_num_tokens": 9370.75, "step": 8068, "total_num_tokens": 1103628078.0, "z_loss": 0.0004459564806893468 }, { "copy_logits_max": -4.772636890411377, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.8125, "epoch": 1.6480469747255553, "gen_logits_max": 4.008205413818359, "gen_logits_mean": -17.312442779541016, "gen_logits_min": -29.13758659362793, "gen_logits_std": 3.3096394538879395, "gen_loss": 0.269588828086853, "grad_norm": 0.37238355310495763, "learning_rate": 2.073726315789474e-05, "loss": 0.29, "mean_copy_accuracy": 0.9972718209028244, "mean_gen_accuracy": 0.8718052357435226, "mean_token_accuracy": 0.9000792801380157, "num_tokens": 1103849561.0, "sample_num_tokens": 7673.25, "step": 8069, "total_num_tokens": 1103880254.0, "z_loss": 0.00048450956819579005 }, { "copy_logits_max": -2.079042911529541, "copy_logits_min": -750000000.0, "copy_num_tokens": 751.1875, "epoch": 1.648251212662752, "gen_logits_max": 2.296520233154297, "gen_logits_mean": -16.804336547851562, "gen_logits_min": -29.516338348388672, "gen_logits_std": 3.346038818359375, "gen_loss": 0.21893760561943054, "grad_norm": 0.3432574553370033, "learning_rate": 2.0736000000000003e-05, "loss": 0.2454, "mean_copy_accuracy": 0.9973001778125763, "mean_gen_accuracy": 0.877778947353363, "mean_token_accuracy": 0.9160162657499313, "num_tokens": 1104140099.0, "sample_num_tokens": 9757.25, "step": 8070, "total_num_tokens": 1104179128.0, "z_loss": 0.00043246912537142634 }, { "copy_logits_max": -7.103179454803467, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.3125, "epoch": 1.648455450599949, "gen_logits_max": 4.793092727661133, "gen_logits_mean": -15.913840293884277, "gen_logits_min": -28.17115592956543, "gen_logits_std": 3.247847080230713, "gen_loss": 0.30718111991882324, "grad_norm": 0.35097553099197865, "learning_rate": 2.0734736842105264e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9972607642412186, "mean_gen_accuracy": 0.8739931285381317, "mean_token_accuracy": 0.9019415527582169, "num_tokens": 1104407757.0, "sample_num_tokens": 8596.25, "step": 8071, "total_num_tokens": 1104442142.0, "z_loss": 0.0005003863479942083 }, { "copy_logits_max": -3.7726564407348633, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.8125, "epoch": 1.6486596885371458, "gen_logits_max": 5.331389427185059, "gen_logits_mean": -14.115877151489258, "gen_logits_min": -26.71597671508789, "gen_logits_std": 3.214381694793701, "gen_loss": 0.2592583894729614, "grad_norm": 0.34820476025420594, "learning_rate": 2.0733473684210528e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9964482486248016, "mean_gen_accuracy": 0.8792310655117035, "mean_token_accuracy": 0.9063964188098907, "num_tokens": 1104672936.0, "sample_num_tokens": 7145.0, "step": 8072, "total_num_tokens": 1104701516.0, "z_loss": 0.00045632367255166173 }, { "copy_logits_max": -3.6143598556518555, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.75, "epoch": 1.6488639264743425, "gen_logits_max": 5.139106750488281, "gen_logits_mean": -14.035632133483887, "gen_logits_min": -26.855134963989258, "gen_logits_std": 3.2157230377197266, "gen_loss": 0.26393577456474304, "grad_norm": 0.33876339972411307, "learning_rate": 2.073221052631579e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9968438297510147, "mean_gen_accuracy": 0.8775741159915924, "mean_token_accuracy": 0.9099778383970261, "num_tokens": 1104940808.0, "sample_num_tokens": 8060.0, "step": 8073, "total_num_tokens": 1104973048.0, "z_loss": 0.00044725785846821964 }, { "copy_logits_max": -3.828119993209839, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.8125, "epoch": 1.6490681644115395, "gen_logits_max": 3.500046491622925, "gen_logits_mean": -16.100631713867188, "gen_logits_min": -28.37087631225586, "gen_logits_std": 3.279111862182617, "gen_loss": 0.248067706823349, "grad_norm": 0.3540429878961944, "learning_rate": 2.0730947368421053e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9969498366117477, "mean_gen_accuracy": 0.8756072670221329, "mean_token_accuracy": 0.9061510264873505, "num_tokens": 1105231272.0, "sample_num_tokens": 9813.5, "step": 8074, "total_num_tokens": 1105270526.0, "z_loss": 0.0004382654733490199 }, { "copy_logits_max": -3.8521618843078613, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.6875, "epoch": 1.6492724023487364, "gen_logits_max": 3.0719475746154785, "gen_logits_mean": -16.884553909301758, "gen_logits_min": -29.8349609375, "gen_logits_std": 3.321620464324951, "gen_loss": 0.2760130763053894, "grad_norm": 0.3686381958484408, "learning_rate": 2.0729684210526314e-05, "loss": 0.288, "mean_copy_accuracy": 0.9956652522087097, "mean_gen_accuracy": 0.8703545033931732, "mean_token_accuracy": 0.9015212059020996, "num_tokens": 1105492704.0, "sample_num_tokens": 9124.5, "step": 8075, "total_num_tokens": 1105529202.0, "z_loss": 0.0004959828220307827 }, { "copy_logits_max": -3.7427735328674316, "copy_logits_min": -750000064.0, "copy_num_tokens": 530.8125, "epoch": 1.649476640285933, "gen_logits_max": 3.050100326538086, "gen_logits_mean": -17.02690315246582, "gen_logits_min": -29.123912811279297, "gen_logits_std": 3.291485071182251, "gen_loss": 0.24912820756435394, "grad_norm": 0.3547063199216295, "learning_rate": 2.072842105263158e-05, "loss": 0.2556, "mean_copy_accuracy": 0.9977810084819794, "mean_gen_accuracy": 0.8804533034563065, "mean_token_accuracy": 0.9114343225955963, "num_tokens": 1105767682.0, "sample_num_tokens": 8206.5, "step": 8076, "total_num_tokens": 1105800508.0, "z_loss": 0.0004319602157920599 }, { "copy_logits_max": -3.628220319747925, "copy_logits_min": -750000064.0, "copy_num_tokens": 403.0625, "epoch": 1.6496808782231298, "gen_logits_max": 2.472330331802368, "gen_logits_mean": -18.229093551635742, "gen_logits_min": -30.843612670898438, "gen_logits_std": 3.3412230014801025, "gen_loss": 0.25343057513237, "grad_norm": 0.34562597165379155, "learning_rate": 2.0727157894736843e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9966725409030914, "mean_gen_accuracy": 0.8797069936990738, "mean_token_accuracy": 0.9104124307632446, "num_tokens": 1106040987.0, "sample_num_tokens": 7749.75, "step": 8077, "total_num_tokens": 1106071986.0, "z_loss": 0.00044421415077522397 }, { "copy_logits_max": -5.806339263916016, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.3125, "epoch": 1.6498851161603267, "gen_logits_max": 3.724285840988159, "gen_logits_mean": -16.544605255126953, "gen_logits_min": -28.75657844543457, "gen_logits_std": 3.246159553527832, "gen_loss": 0.30325713753700256, "grad_norm": 0.3954487897089405, "learning_rate": 2.0725894736842107e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9967934936285019, "mean_gen_accuracy": 0.8761425763368607, "mean_token_accuracy": 0.9018694758415222, "num_tokens": 1106296488.0, "sample_num_tokens": 8480.0, "step": 8078, "total_num_tokens": 1106330408.0, "z_loss": 0.0004986252752132714 }, { "copy_logits_max": -3.959536075592041, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.4375, "epoch": 1.6500893540975237, "gen_logits_max": 3.5668692588806152, "gen_logits_mean": -15.784348487854004, "gen_logits_min": -27.694774627685547, "gen_logits_std": 3.1785190105438232, "gen_loss": 0.3175468444824219, "grad_norm": 0.3669946345392013, "learning_rate": 2.072463157894737e-05, "loss": 0.2983, "mean_copy_accuracy": 0.9961417466402054, "mean_gen_accuracy": 0.8752328604459763, "mean_token_accuracy": 0.899377778172493, "num_tokens": 1106546571.0, "sample_num_tokens": 7769.25, "step": 8079, "total_num_tokens": 1106577648.0, "z_loss": 0.0005325298407115042 }, { "copy_logits_max": -4.429547309875488, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.375, "epoch": 1.6502935920347204, "gen_logits_max": 2.9203429222106934, "gen_logits_mean": -16.738096237182617, "gen_logits_min": -30.000560760498047, "gen_logits_std": 3.322730779647827, "gen_loss": 0.26649850606918335, "grad_norm": 0.3675586502925319, "learning_rate": 2.0723368421052632e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9966926872730255, "mean_gen_accuracy": 0.873625710606575, "mean_token_accuracy": 0.9038146883249283, "num_tokens": 1106804396.0, "sample_num_tokens": 7676.5, "step": 8080, "total_num_tokens": 1106835102.0, "z_loss": 0.00039621698670089245 }, { "copy_logits_max": -3.6050310134887695, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.3125, "epoch": 1.6504978299719173, "gen_logits_max": 3.100548028945923, "gen_logits_mean": -17.099197387695312, "gen_logits_min": -29.90210723876953, "gen_logits_std": 3.336862802505493, "gen_loss": 0.259918212890625, "grad_norm": 0.3693065681638059, "learning_rate": 2.0722105263157897e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9970404803752899, "mean_gen_accuracy": 0.8812086582183838, "mean_token_accuracy": 0.9083783626556396, "num_tokens": 1107050013.0, "sample_num_tokens": 9737.25, "step": 8081, "total_num_tokens": 1107088962.0, "z_loss": 0.00041436898754909635 }, { "copy_logits_max": -1.2719441652297974, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.4375, "epoch": 1.6507020679091142, "gen_logits_max": 4.106895446777344, "gen_logits_mean": -14.530074119567871, "gen_logits_min": -26.20941925048828, "gen_logits_std": 3.140552043914795, "gen_loss": 0.2581486105918884, "grad_norm": 0.37546436807921474, "learning_rate": 2.0720842105263158e-05, "loss": 0.2558, "mean_copy_accuracy": 0.9958425611257553, "mean_gen_accuracy": 0.887151837348938, "mean_token_accuracy": 0.9132528305053711, "num_tokens": 1107311379.0, "sample_num_tokens": 8337.25, "step": 8082, "total_num_tokens": 1107344728.0, "z_loss": 0.00040548265678808093 }, { "copy_logits_max": -0.5280524492263794, "copy_logits_min": -687500032.0, "copy_num_tokens": 535.5, "epoch": 1.650906305846311, "gen_logits_max": 3.2682437896728516, "gen_logits_mean": -15.334965705871582, "gen_logits_min": -27.612693786621094, "gen_logits_std": 3.1997175216674805, "gen_loss": 0.25626498460769653, "grad_norm": 0.3409823341873553, "learning_rate": 2.0719578947368422e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9962943196296692, "mean_gen_accuracy": 0.8756883889436722, "mean_token_accuracy": 0.9053813517093658, "num_tokens": 1107588545.0, "sample_num_tokens": 8402.75, "step": 8083, "total_num_tokens": 1107622156.0, "z_loss": 0.0005180853186175227 }, { "copy_logits_max": -1.5277249813079834, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.375, "epoch": 1.6511105437835076, "gen_logits_max": 4.711177825927734, "gen_logits_mean": -13.99172592163086, "gen_logits_min": -25.680131912231445, "gen_logits_std": 3.1502718925476074, "gen_loss": 0.28998661041259766, "grad_norm": 0.38516716290466746, "learning_rate": 2.0718315789473683e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9969384074211121, "mean_gen_accuracy": 0.87078657746315, "mean_token_accuracy": 0.9000532031059265, "num_tokens": 1107862587.0, "sample_num_tokens": 9943.25, "step": 8084, "total_num_tokens": 1107902360.0, "z_loss": 0.00048690749099478126 }, { "copy_logits_max": -1.8221348524093628, "copy_logits_min": -687500032.0, "copy_num_tokens": 679.8125, "epoch": 1.6513147817207048, "gen_logits_max": 4.567176342010498, "gen_logits_mean": -14.201770782470703, "gen_logits_min": -26.435850143432617, "gen_logits_std": 3.174370050430298, "gen_loss": 0.22501100599765778, "grad_norm": 0.35887300114662707, "learning_rate": 2.071705263157895e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9969406574964523, "mean_gen_accuracy": 0.8761729747056961, "mean_token_accuracy": 0.9094266444444656, "num_tokens": 1108146510.0, "sample_num_tokens": 9211.5, "step": 8085, "total_num_tokens": 1108183356.0, "z_loss": 0.0003682548413053155 }, { "copy_logits_max": -3.7263453006744385, "copy_logits_min": -687500032.0, "copy_num_tokens": 425.25, "epoch": 1.6515190196579015, "gen_logits_max": 2.9959306716918945, "gen_logits_mean": -17.93258285522461, "gen_logits_min": -29.99396514892578, "gen_logits_std": 3.3458025455474854, "gen_loss": 0.2842179536819458, "grad_norm": 0.3115585757747984, "learning_rate": 2.071578947368421e-05, "loss": 0.2591, "mean_copy_accuracy": 0.9968252182006836, "mean_gen_accuracy": 0.8814739882946014, "mean_token_accuracy": 0.9110547006130219, "num_tokens": 1108438622.0, "sample_num_tokens": 8195.0, "step": 8086, "total_num_tokens": 1108471402.0, "z_loss": 0.0004131383029744029 }, { "copy_logits_max": -4.953123092651367, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.6875, "epoch": 1.6517232575950982, "gen_logits_max": 3.5866518020629883, "gen_logits_mean": -16.353649139404297, "gen_logits_min": -28.1761474609375, "gen_logits_std": 3.2235522270202637, "gen_loss": 0.30072221159935, "grad_norm": 0.3663341665495884, "learning_rate": 2.0714526315789476e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9968211203813553, "mean_gen_accuracy": 0.8777723163366318, "mean_token_accuracy": 0.9077898859977722, "num_tokens": 1108698600.0, "sample_num_tokens": 8030.0, "step": 8087, "total_num_tokens": 1108730720.0, "z_loss": 0.0004742756427731365 }, { "copy_logits_max": -4.92548942565918, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.0625, "epoch": 1.6519274955322951, "gen_logits_max": 3.3179514408111572, "gen_logits_mean": -17.321388244628906, "gen_logits_min": -29.29242706298828, "gen_logits_std": 3.320134162902832, "gen_loss": 0.28982311487197876, "grad_norm": 0.3577923200621124, "learning_rate": 2.0713263157894737e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9972748160362244, "mean_gen_accuracy": 0.8718561381101608, "mean_token_accuracy": 0.9022602140903473, "num_tokens": 1108969708.0, "sample_num_tokens": 9315.0, "step": 8088, "total_num_tokens": 1109006968.0, "z_loss": 0.0004005608498118818 }, { "copy_logits_max": -6.297427177429199, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.375, "epoch": 1.652131733469492, "gen_logits_max": 3.2857227325439453, "gen_logits_mean": -17.773122787475586, "gen_logits_min": -30.104543685913086, "gen_logits_std": 3.373988151550293, "gen_loss": 0.2856265604496002, "grad_norm": 0.33021528113476156, "learning_rate": 2.0712e-05, "loss": 0.2753, "mean_copy_accuracy": 0.997281551361084, "mean_gen_accuracy": 0.8696386814117432, "mean_token_accuracy": 0.9057996571063995, "num_tokens": 1109269513.0, "sample_num_tokens": 8004.75, "step": 8089, "total_num_tokens": 1109301532.0, "z_loss": 0.0004340567102190107 }, { "copy_logits_max": -1.752186894416809, "copy_logits_min": -750000000.0, "copy_num_tokens": 769.8125, "epoch": 1.6523359714066888, "gen_logits_max": 3.105417251586914, "gen_logits_mean": -15.825682640075684, "gen_logits_min": -28.20452880859375, "gen_logits_std": 3.294137954711914, "gen_loss": 0.22417430579662323, "grad_norm": 0.36357675750192914, "learning_rate": 2.0710736842105262e-05, "loss": 0.2414, "mean_copy_accuracy": 0.9981913268566132, "mean_gen_accuracy": 0.8830870687961578, "mean_token_accuracy": 0.9189957529306412, "num_tokens": 1109562925.0, "sample_num_tokens": 10075.75, "step": 8090, "total_num_tokens": 1109603228.0, "z_loss": 0.0003591703425627202 }, { "copy_logits_max": -6.614539623260498, "copy_logits_min": -750000000.0, "copy_num_tokens": 728.125, "epoch": 1.6525402093438857, "gen_logits_max": 2.869497299194336, "gen_logits_mean": -17.408000946044922, "gen_logits_min": -30.18850326538086, "gen_logits_std": 3.3881258964538574, "gen_loss": 0.254161536693573, "grad_norm": 0.34322682737557864, "learning_rate": 2.0709473684210526e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9961613416671753, "mean_gen_accuracy": 0.8810397684574127, "mean_token_accuracy": 0.9108871966600418, "num_tokens": 1109844038.0, "sample_num_tokens": 10382.0, "step": 8091, "total_num_tokens": 1109885566.0, "z_loss": 0.0003579035110305995 }, { "copy_logits_max": -5.178833961486816, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.125, "epoch": 1.6527444472810826, "gen_logits_max": 4.573579788208008, "gen_logits_mean": -14.681900024414062, "gen_logits_min": -27.07763671875, "gen_logits_std": 3.292104721069336, "gen_loss": 0.2581038475036621, "grad_norm": 0.36573221703100783, "learning_rate": 2.0708210526315787e-05, "loss": 0.263, "mean_copy_accuracy": 0.9961991906166077, "mean_gen_accuracy": 0.8816515356302261, "mean_token_accuracy": 0.9098621606826782, "num_tokens": 1110117454.0, "sample_num_tokens": 7699.0, "step": 8092, "total_num_tokens": 1110148250.0, "z_loss": 0.0003499477752484381 }, { "copy_logits_max": -6.347824573516846, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.4375, "epoch": 1.6529486852182793, "gen_logits_max": 3.0955052375793457, "gen_logits_mean": -18.655174255371094, "gen_logits_min": -30.147029876708984, "gen_logits_std": 3.35646390914917, "gen_loss": 0.26616597175598145, "grad_norm": 0.42313238215303156, "learning_rate": 2.070694736842105e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9966304302215576, "mean_gen_accuracy": 0.878692239522934, "mean_token_accuracy": 0.9036554843187332, "num_tokens": 1110376742.0, "sample_num_tokens": 7332.0, "step": 8093, "total_num_tokens": 1110406070.0, "z_loss": 0.0003993623540736735 }, { "copy_logits_max": -4.107058048248291, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.75, "epoch": 1.653152923155476, "gen_logits_max": 3.111710548400879, "gen_logits_mean": -16.650615692138672, "gen_logits_min": -28.568225860595703, "gen_logits_std": 3.283947229385376, "gen_loss": 0.315607488155365, "grad_norm": 0.3513111283057964, "learning_rate": 2.070568421052632e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9968502372503281, "mean_gen_accuracy": 0.8678154200315475, "mean_token_accuracy": 0.9005117416381836, "num_tokens": 1110636229.0, "sample_num_tokens": 7899.25, "step": 8094, "total_num_tokens": 1110667826.0, "z_loss": 0.0005131273064762354 }, { "copy_logits_max": -5.33796501159668, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.5, "epoch": 1.653357161092673, "gen_logits_max": 3.8314242362976074, "gen_logits_mean": -17.615324020385742, "gen_logits_min": -29.559314727783203, "gen_logits_std": 3.348543167114258, "gen_loss": 0.2791992425918579, "grad_norm": 0.38021541305770246, "learning_rate": 2.070442105263158e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9952456951141357, "mean_gen_accuracy": 0.8797300010919571, "mean_token_accuracy": 0.9033742249011993, "num_tokens": 1110886518.0, "sample_num_tokens": 7887.0, "step": 8095, "total_num_tokens": 1110918066.0, "z_loss": 0.00042945850873366 }, { "copy_logits_max": -6.886955261230469, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.4375, "epoch": 1.6535613990298699, "gen_logits_max": 3.6556241512298584, "gen_logits_mean": -17.184829711914062, "gen_logits_min": -29.035171508789062, "gen_logits_std": 3.3222196102142334, "gen_loss": 0.294689416885376, "grad_norm": 0.33738804501715725, "learning_rate": 2.0703157894736844e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9969383925199509, "mean_gen_accuracy": 0.8811389058828354, "mean_token_accuracy": 0.9072638303041458, "num_tokens": 1111144973.0, "sample_num_tokens": 8617.75, "step": 8096, "total_num_tokens": 1111179444.0, "z_loss": 0.0004271562211215496 }, { "copy_logits_max": -7.750321865081787, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.4375, "epoch": 1.6537656369670666, "gen_logits_max": 3.699228525161743, "gen_logits_mean": -17.021678924560547, "gen_logits_min": -29.020998001098633, "gen_logits_std": 3.335367202758789, "gen_loss": 0.2925459146499634, "grad_norm": 0.3950120664242138, "learning_rate": 2.0701894736842105e-05, "loss": 0.312, "mean_copy_accuracy": 0.9959551841020584, "mean_gen_accuracy": 0.8673002123832703, "mean_token_accuracy": 0.8951757401227951, "num_tokens": 1111396122.0, "sample_num_tokens": 8985.5, "step": 8097, "total_num_tokens": 1111432064.0, "z_loss": 0.00041852702270261943 }, { "copy_logits_max": -6.631829738616943, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.6875, "epoch": 1.6539698749042635, "gen_logits_max": 3.04097580909729, "gen_logits_mean": -18.617919921875, "gen_logits_min": -30.755138397216797, "gen_logits_std": 3.4111263751983643, "gen_loss": 0.27491384744644165, "grad_norm": 0.355086083902189, "learning_rate": 2.070063157894737e-05, "loss": 0.2459, "mean_copy_accuracy": 0.9976658374071121, "mean_gen_accuracy": 0.8855850398540497, "mean_token_accuracy": 0.9161663800477982, "num_tokens": 1111692101.0, "sample_num_tokens": 8787.25, "step": 8098, "total_num_tokens": 1111727250.0, "z_loss": 0.0004591216566041112 }, { "copy_logits_max": -5.872613906860352, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.6875, "epoch": 1.6541741128414604, "gen_logits_max": 3.8647828102111816, "gen_logits_mean": -15.61590576171875, "gen_logits_min": -28.154705047607422, "gen_logits_std": 3.3269829750061035, "gen_loss": 0.22244447469711304, "grad_norm": 0.33996013041287426, "learning_rate": 2.069936842105263e-05, "loss": 0.2451, "mean_copy_accuracy": 0.997847780585289, "mean_gen_accuracy": 0.8847395479679108, "mean_token_accuracy": 0.9156480431556702, "num_tokens": 1111984229.0, "sample_num_tokens": 9194.25, "step": 8099, "total_num_tokens": 1112021006.0, "z_loss": 0.0003702891990542412 }, { "copy_logits_max": -6.421411514282227, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.8125, "epoch": 1.6543783507786571, "gen_logits_max": 3.06730318069458, "gen_logits_mean": -18.07525634765625, "gen_logits_min": -30.16129493713379, "gen_logits_std": 3.360717296600342, "gen_loss": 0.2983631193637848, "grad_norm": 0.3518167317152933, "learning_rate": 2.0698105263157895e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9972192645072937, "mean_gen_accuracy": 0.8740929663181305, "mean_token_accuracy": 0.9071721881628036, "num_tokens": 1112280434.0, "sample_num_tokens": 9115.5, "step": 8100, "total_num_tokens": 1112316896.0, "z_loss": 0.0004867225361522287 }, { "copy_logits_max": -5.908915042877197, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.5, "epoch": 1.6545825887158538, "gen_logits_max": 3.5553290843963623, "gen_logits_mean": -16.875654220581055, "gen_logits_min": -29.084373474121094, "gen_logits_std": 3.3633127212524414, "gen_loss": 0.2632254958152771, "grad_norm": 0.36190680072839526, "learning_rate": 2.0696842105263156e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9971403330564499, "mean_gen_accuracy": 0.8738194406032562, "mean_token_accuracy": 0.9059728533029556, "num_tokens": 1112545187.0, "sample_num_tokens": 8360.25, "step": 8101, "total_num_tokens": 1112578628.0, "z_loss": 0.00041411243728362024 }, { "copy_logits_max": -6.757219314575195, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.6875, "epoch": 1.6547868266530508, "gen_logits_max": 4.760536193847656, "gen_logits_mean": -15.555460929870605, "gen_logits_min": -28.01714515686035, "gen_logits_std": 3.324213981628418, "gen_loss": 0.2296549677848816, "grad_norm": 0.33837855188993576, "learning_rate": 2.0695578947368423e-05, "loss": 0.2585, "mean_copy_accuracy": 0.9968767315149307, "mean_gen_accuracy": 0.8825474828481674, "mean_token_accuracy": 0.9133020639419556, "num_tokens": 1112837987.0, "sample_num_tokens": 9920.75, "step": 8102, "total_num_tokens": 1112877670.0, "z_loss": 0.0003889003419317305 }, { "copy_logits_max": -5.892038822174072, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.5, "epoch": 1.6549910645902477, "gen_logits_max": 3.939638376235962, "gen_logits_mean": -15.882211685180664, "gen_logits_min": -28.05139923095703, "gen_logits_std": 3.2996859550476074, "gen_loss": 0.3010670840740204, "grad_norm": 0.31996745463434473, "learning_rate": 2.0694315789473684e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9965187758207321, "mean_gen_accuracy": 0.8788087069988251, "mean_token_accuracy": 0.9104181379079819, "num_tokens": 1113123282.0, "sample_num_tokens": 8553.0, "step": 8103, "total_num_tokens": 1113157494.0, "z_loss": 0.0004924485692754388 }, { "copy_logits_max": -4.258038520812988, "copy_logits_min": -687500032.0, "copy_num_tokens": 320.125, "epoch": 1.6551953025274444, "gen_logits_max": 4.348487854003906, "gen_logits_mean": -15.611302375793457, "gen_logits_min": -28.469207763671875, "gen_logits_std": 3.280545234680176, "gen_loss": 0.27487558126449585, "grad_norm": 0.34595307469741887, "learning_rate": 2.069305263157895e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9972817301750183, "mean_gen_accuracy": 0.8798305243253708, "mean_token_accuracy": 0.9100096523761749, "num_tokens": 1113398072.0, "sample_num_tokens": 6360.0, "step": 8104, "total_num_tokens": 1113423512.0, "z_loss": 0.000436305592302233 }, { "copy_logits_max": -5.994019031524658, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.875, "epoch": 1.6553995404646413, "gen_logits_max": 3.5935757160186768, "gen_logits_mean": -16.586313247680664, "gen_logits_min": -29.195262908935547, "gen_logits_std": 3.3542680740356445, "gen_loss": 0.24009881913661957, "grad_norm": 0.37526125146373784, "learning_rate": 2.069178947368421e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9976159483194351, "mean_gen_accuracy": 0.8781412839889526, "mean_token_accuracy": 0.9085263758897781, "num_tokens": 1113649446.0, "sample_num_tokens": 7900.0, "step": 8105, "total_num_tokens": 1113681046.0, "z_loss": 0.00040473550325259566 }, { "copy_logits_max": -8.349764823913574, "copy_logits_min": -687500032.0, "copy_num_tokens": 383.1875, "epoch": 1.6556037784018383, "gen_logits_max": 3.598261833190918, "gen_logits_mean": -17.720129013061523, "gen_logits_min": -29.809255599975586, "gen_logits_std": 3.36307954788208, "gen_loss": 0.3002382516860962, "grad_norm": 0.3487427992789907, "learning_rate": 2.0690526315789474e-05, "loss": 0.2678, "mean_copy_accuracy": 0.9962590783834457, "mean_gen_accuracy": 0.8780042976140976, "mean_token_accuracy": 0.9063052237033844, "num_tokens": 1113913994.0, "sample_num_tokens": 7921.0, "step": 8106, "total_num_tokens": 1113945678.0, "z_loss": 0.00047463655937463045 }, { "copy_logits_max": -7.560357093811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.9375, "epoch": 1.655808016339035, "gen_logits_max": 3.2396106719970703, "gen_logits_mean": -18.01958465576172, "gen_logits_min": -30.483827590942383, "gen_logits_std": 3.3781847953796387, "gen_loss": 0.2699354290962219, "grad_norm": 0.3546834865472716, "learning_rate": 2.0689263157894738e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9978556931018829, "mean_gen_accuracy": 0.8793982118368149, "mean_token_accuracy": 0.9061394929885864, "num_tokens": 1114190445.0, "sample_num_tokens": 9300.75, "step": 8107, "total_num_tokens": 1114227648.0, "z_loss": 0.000446692225523293 }, { "copy_logits_max": -7.115146636962891, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.125, "epoch": 1.6560122542762317, "gen_logits_max": 2.9847044944763184, "gen_logits_mean": -17.5858154296875, "gen_logits_min": -30.118976593017578, "gen_logits_std": 3.3803224563598633, "gen_loss": 0.2903352677822113, "grad_norm": 0.3187847746710602, "learning_rate": 2.0688e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9965639561414719, "mean_gen_accuracy": 0.8743902891874313, "mean_token_accuracy": 0.9049627631902695, "num_tokens": 1114465486.0, "sample_num_tokens": 9258.0, "step": 8108, "total_num_tokens": 1114502518.0, "z_loss": 0.0005315699381753802 }, { "copy_logits_max": -7.185068607330322, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.8125, "epoch": 1.6562164922134286, "gen_logits_max": 3.3520584106445312, "gen_logits_mean": -17.115148544311523, "gen_logits_min": -29.333454132080078, "gen_logits_std": 3.3531646728515625, "gen_loss": 0.2642378807067871, "grad_norm": 0.352680877233384, "learning_rate": 2.0686736842105263e-05, "loss": 0.2607, "mean_copy_accuracy": 0.9969627410173416, "mean_gen_accuracy": 0.8809136301279068, "mean_token_accuracy": 0.9109219461679459, "num_tokens": 1114727211.0, "sample_num_tokens": 7220.25, "step": 8109, "total_num_tokens": 1114756092.0, "z_loss": 0.000447528378572315 }, { "copy_logits_max": -8.012593269348145, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.625, "epoch": 1.6564207301506255, "gen_logits_max": 4.33256721496582, "gen_logits_mean": -15.316805839538574, "gen_logits_min": -27.64443016052246, "gen_logits_std": 3.301051616668701, "gen_loss": 0.26470327377319336, "grad_norm": 0.3968476014194043, "learning_rate": 2.0685473684210528e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9950753450393677, "mean_gen_accuracy": 0.8789497911930084, "mean_token_accuracy": 0.9009544104337692, "num_tokens": 1114974927.0, "sample_num_tokens": 9206.75, "step": 8110, "total_num_tokens": 1115011754.0, "z_loss": 0.00047036269097588956 }, { "copy_logits_max": -7.578135967254639, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.0, "epoch": 1.6566249680878222, "gen_logits_max": 3.4530913829803467, "gen_logits_mean": -17.35755157470703, "gen_logits_min": -29.237140655517578, "gen_logits_std": 3.359492540359497, "gen_loss": 0.30284377932548523, "grad_norm": 0.3368082878909992, "learning_rate": 2.0684210526315792e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9962888211011887, "mean_gen_accuracy": 0.8811123967170715, "mean_token_accuracy": 0.9096596539020538, "num_tokens": 1115244675.0, "sample_num_tokens": 8096.75, "step": 8111, "total_num_tokens": 1115277062.0, "z_loss": 0.0005298830801621079 }, { "copy_logits_max": -9.273955345153809, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.8125, "epoch": 1.6568292060250192, "gen_logits_max": 4.011735916137695, "gen_logits_mean": -16.600393295288086, "gen_logits_min": -28.906452178955078, "gen_logits_std": 3.333578109741211, "gen_loss": 0.2749646008014679, "grad_norm": 0.3670025272331934, "learning_rate": 2.0682947368421053e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9971634894609451, "mean_gen_accuracy": 0.8765824437141418, "mean_token_accuracy": 0.9054907411336899, "num_tokens": 1115516472.0, "sample_num_tokens": 7920.5, "step": 8112, "total_num_tokens": 1115548154.0, "z_loss": 0.0004556643543764949 }, { "copy_logits_max": -11.008330345153809, "copy_logits_min": -749999936.0, "copy_num_tokens": 522.5625, "epoch": 1.657033443962216, "gen_logits_max": 3.970418691635132, "gen_logits_mean": -16.16958236694336, "gen_logits_min": -28.554616928100586, "gen_logits_std": 3.3449079990386963, "gen_loss": 0.24220538139343262, "grad_norm": 0.333375320259781, "learning_rate": 2.0681684210526317e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9969742596149445, "mean_gen_accuracy": 0.8893072009086609, "mean_token_accuracy": 0.9109067022800446, "num_tokens": 1115787497.0, "sample_num_tokens": 10493.75, "step": 8113, "total_num_tokens": 1115829472.0, "z_loss": 0.00036810775054618716 }, { "copy_logits_max": -8.115550994873047, "copy_logits_min": -687500032.0, "copy_num_tokens": 600.25, "epoch": 1.6572376818994128, "gen_logits_max": 2.5731334686279297, "gen_logits_mean": -17.4611873626709, "gen_logits_min": -29.920001983642578, "gen_logits_std": 3.3806588649749756, "gen_loss": 0.28368374705314636, "grad_norm": 0.3127658527324917, "learning_rate": 2.0680421052631578e-05, "loss": 0.2605, "mean_copy_accuracy": 0.997782826423645, "mean_gen_accuracy": 0.8784657120704651, "mean_token_accuracy": 0.911787211894989, "num_tokens": 1116088681.0, "sample_num_tokens": 8781.25, "step": 8114, "total_num_tokens": 1116123806.0, "z_loss": 0.0004747462226077914 }, { "copy_logits_max": -10.587202072143555, "copy_logits_min": -625000064.0, "copy_num_tokens": 394.8125, "epoch": 1.6574419198366095, "gen_logits_max": 4.967029571533203, "gen_logits_mean": -15.762575149536133, "gen_logits_min": -27.59050750732422, "gen_logits_std": 3.308537721633911, "gen_loss": 0.29009881615638733, "grad_norm": 0.3324906812410642, "learning_rate": 2.0679157894736843e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9972075670957565, "mean_gen_accuracy": 0.8849171251058578, "mean_token_accuracy": 0.9103651642799377, "num_tokens": 1116364269.0, "sample_num_tokens": 8473.75, "step": 8115, "total_num_tokens": 1116398164.0, "z_loss": 0.0004882665816694498 }, { "copy_logits_max": -7.966524600982666, "copy_logits_min": -625000064.0, "copy_num_tokens": 604.4375, "epoch": 1.6576461577738066, "gen_logits_max": 3.198354721069336, "gen_logits_mean": -16.775943756103516, "gen_logits_min": -28.951122283935547, "gen_logits_std": 3.381878137588501, "gen_loss": 0.25012901425361633, "grad_norm": 0.360688951156876, "learning_rate": 2.0677894736842103e-05, "loss": 0.2839, "mean_copy_accuracy": 0.9974655359983444, "mean_gen_accuracy": 0.8720131814479828, "mean_token_accuracy": 0.9054602086544037, "num_tokens": 1116640837.0, "sample_num_tokens": 9418.25, "step": 8116, "total_num_tokens": 1116678510.0, "z_loss": 0.0004246810858603567 }, { "copy_logits_max": -6.264311790466309, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.6875, "epoch": 1.6578503957110033, "gen_logits_max": 2.121370315551758, "gen_logits_mean": -18.57916259765625, "gen_logits_min": -31.278873443603516, "gen_logits_std": 3.441455364227295, "gen_loss": 0.257968544960022, "grad_norm": 0.37010522708621524, "learning_rate": 2.0676631578947368e-05, "loss": 0.2691, "mean_copy_accuracy": 0.9964354336261749, "mean_gen_accuracy": 0.8795680999755859, "mean_token_accuracy": 0.9080648869276047, "num_tokens": 1116895176.0, "sample_num_tokens": 9327.0, "step": 8117, "total_num_tokens": 1116932484.0, "z_loss": 0.0004170314350631088 }, { "copy_logits_max": -8.059795379638672, "copy_logits_min": -750000064.0, "copy_num_tokens": 512.8125, "epoch": 1.6580546336482, "gen_logits_max": 2.7972288131713867, "gen_logits_mean": -17.332752227783203, "gen_logits_min": -29.83563995361328, "gen_logits_std": 3.382493734359741, "gen_loss": 0.2876869738101959, "grad_norm": 0.3635030307676993, "learning_rate": 2.0675368421052632e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9964562803506851, "mean_gen_accuracy": 0.8723825812339783, "mean_token_accuracy": 0.9038666784763336, "num_tokens": 1117155410.0, "sample_num_tokens": 8247.0, "step": 8118, "total_num_tokens": 1117188398.0, "z_loss": 0.0004687401233240962 }, { "copy_logits_max": -9.03663158416748, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.8125, "epoch": 1.658258871585397, "gen_logits_max": 3.3843250274658203, "gen_logits_mean": -17.16773223876953, "gen_logits_min": -29.219343185424805, "gen_logits_std": 3.3748931884765625, "gen_loss": 0.2641112804412842, "grad_norm": 0.36998068953410895, "learning_rate": 2.0674105263157896e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9974913597106934, "mean_gen_accuracy": 0.8760019093751907, "mean_token_accuracy": 0.9056397974491119, "num_tokens": 1117415939.0, "sample_num_tokens": 7693.75, "step": 8119, "total_num_tokens": 1117446714.0, "z_loss": 0.00041251350194215775 }, { "copy_logits_max": -7.9041290283203125, "copy_logits_min": -750000064.0, "copy_num_tokens": 349.3125, "epoch": 1.658463109522594, "gen_logits_max": 5.313141822814941, "gen_logits_mean": -14.64886474609375, "gen_logits_min": -27.367902755737305, "gen_logits_std": 3.2839508056640625, "gen_loss": 0.30111968517303467, "grad_norm": 0.3109069163649439, "learning_rate": 2.067284210526316e-05, "loss": 0.25, "mean_copy_accuracy": 0.996259942650795, "mean_gen_accuracy": 0.8858994394540787, "mean_token_accuracy": 0.9141744822263718, "num_tokens": 1117689556.0, "sample_num_tokens": 7561.5, "step": 8120, "total_num_tokens": 1117719802.0, "z_loss": 0.0005059333052486181 }, { "copy_logits_max": -8.16758918762207, "copy_logits_min": -750000000.0, "copy_num_tokens": 655.125, "epoch": 1.6586673474597906, "gen_logits_max": 4.317618370056152, "gen_logits_mean": -15.250507354736328, "gen_logits_min": -27.885547637939453, "gen_logits_std": 3.3137905597686768, "gen_loss": 0.24362191557884216, "grad_norm": 0.4449797232557269, "learning_rate": 2.067157894736842e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9970381110906601, "mean_gen_accuracy": 0.8688836097717285, "mean_token_accuracy": 0.9020756036043167, "num_tokens": 1117948089.0, "sample_num_tokens": 9755.25, "step": 8121, "total_num_tokens": 1117987110.0, "z_loss": 0.00036555767292156816 }, { "copy_logits_max": -10.288656234741211, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.625, "epoch": 1.6588715853969875, "gen_logits_max": 3.7944629192352295, "gen_logits_mean": -17.381839752197266, "gen_logits_min": -29.306640625, "gen_logits_std": 3.305558204650879, "gen_loss": 0.31380176544189453, "grad_norm": 0.3459528042142444, "learning_rate": 2.0670315789473686e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9959284216165543, "mean_gen_accuracy": 0.878672182559967, "mean_token_accuracy": 0.9063911437988281, "num_tokens": 1118218099.0, "sample_num_tokens": 7034.25, "step": 8122, "total_num_tokens": 1118246236.0, "z_loss": 0.000436041911598295 }, { "copy_logits_max": -7.117570400238037, "copy_logits_min": -687500032.0, "copy_num_tokens": 644.3125, "epoch": 1.6590758233341845, "gen_logits_max": 2.849498748779297, "gen_logits_mean": -16.661651611328125, "gen_logits_min": -29.17698097229004, "gen_logits_std": 3.3456594944000244, "gen_loss": 0.28798356652259827, "grad_norm": 0.3707997967663038, "learning_rate": 2.0669052631578947e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9969505369663239, "mean_gen_accuracy": 0.873696818947792, "mean_token_accuracy": 0.9069723039865494, "num_tokens": 1118498775.0, "sample_num_tokens": 9269.25, "step": 8123, "total_num_tokens": 1118535852.0, "z_loss": 0.00044857111060991883 }, { "copy_logits_max": -9.21060562133789, "copy_logits_min": -625000064.0, "copy_num_tokens": 511.8125, "epoch": 1.6592800612713812, "gen_logits_max": 4.086209774017334, "gen_logits_mean": -15.721839904785156, "gen_logits_min": -27.998754501342773, "gen_logits_std": 3.313441276550293, "gen_loss": 0.2689793109893799, "grad_norm": 0.3828325307097854, "learning_rate": 2.066778947368421e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9963184595108032, "mean_gen_accuracy": 0.8793867230415344, "mean_token_accuracy": 0.9057111740112305, "num_tokens": 1118749595.0, "sample_num_tokens": 8348.75, "step": 8124, "total_num_tokens": 1118782990.0, "z_loss": 0.0004240799171384424 }, { "copy_logits_max": -8.993776321411133, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.0625, "epoch": 1.6594842992085779, "gen_logits_max": 3.0596938133239746, "gen_logits_mean": -18.62480354309082, "gen_logits_min": -30.278480529785156, "gen_logits_std": 3.384119987487793, "gen_loss": 0.31080061197280884, "grad_norm": 0.3448247870313052, "learning_rate": 2.0666526315789472e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9969203174114227, "mean_gen_accuracy": 0.872430294752121, "mean_token_accuracy": 0.901793509721756, "num_tokens": 1119025698.0, "sample_num_tokens": 7873.5, "step": 8125, "total_num_tokens": 1119057192.0, "z_loss": 0.0004314403631724417 }, { "copy_logits_max": -9.011893272399902, "copy_logits_min": -687500032.0, "copy_num_tokens": 584.75, "epoch": 1.6596885371457748, "gen_logits_max": 4.727416038513184, "gen_logits_mean": -14.20285701751709, "gen_logits_min": -26.910898208618164, "gen_logits_std": 3.236790657043457, "gen_loss": 0.2948412299156189, "grad_norm": 0.36786643617008935, "learning_rate": 2.066526315789474e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9969998747110367, "mean_gen_accuracy": 0.8694134801626205, "mean_token_accuracy": 0.9006949067115784, "num_tokens": 1119285893.0, "sample_num_tokens": 8720.75, "step": 8126, "total_num_tokens": 1119320776.0, "z_loss": 0.0005453092162497342 }, { "copy_logits_max": -9.36849594116211, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5, "epoch": 1.6598927750829717, "gen_logits_max": 3.8259878158569336, "gen_logits_mean": -17.978797912597656, "gen_logits_min": -29.930286407470703, "gen_logits_std": 3.364136219024658, "gen_loss": 0.28351640701293945, "grad_norm": 0.3696016778153439, "learning_rate": 2.0664e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9972702860832214, "mean_gen_accuracy": 0.8741893619298935, "mean_token_accuracy": 0.9028939753770828, "num_tokens": 1119565268.0, "sample_num_tokens": 9120.5, "step": 8127, "total_num_tokens": 1119601750.0, "z_loss": 0.0004679582780227065 }, { "copy_logits_max": -9.200307846069336, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.3125, "epoch": 1.6600970130201684, "gen_logits_max": 2.880380153656006, "gen_logits_mean": -18.517057418823242, "gen_logits_min": -30.495777130126953, "gen_logits_std": 3.393043279647827, "gen_loss": 0.26669567823410034, "grad_norm": 0.34964338702140463, "learning_rate": 2.0662736842105265e-05, "loss": 0.259, "mean_copy_accuracy": 0.9963254928588867, "mean_gen_accuracy": 0.8846825808286667, "mean_token_accuracy": 0.9114897847175598, "num_tokens": 1119850940.0, "sample_num_tokens": 10004.5, "step": 8128, "total_num_tokens": 1119890958.0, "z_loss": 0.0004021986969746649 }, { "copy_logits_max": -9.847504615783691, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.875, "epoch": 1.6603012509573654, "gen_logits_max": 3.424241304397583, "gen_logits_mean": -18.649906158447266, "gen_logits_min": -30.47543716430664, "gen_logits_std": 3.3812296390533447, "gen_loss": 0.2976799011230469, "grad_norm": 0.3319893359790857, "learning_rate": 2.0661473684210526e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9967125207185745, "mean_gen_accuracy": 0.8754648566246033, "mean_token_accuracy": 0.9037764966487885, "num_tokens": 1120136782.0, "sample_num_tokens": 7763.0, "step": 8129, "total_num_tokens": 1120167834.0, "z_loss": 0.00045479938853532076 }, { "copy_logits_max": -10.991459846496582, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.625, "epoch": 1.6605054888945623, "gen_logits_max": 3.8126134872436523, "gen_logits_mean": -17.329692840576172, "gen_logits_min": -29.346376419067383, "gen_logits_std": 3.3307909965515137, "gen_loss": 0.2619028687477112, "grad_norm": 0.3349494058245303, "learning_rate": 2.066021052631579e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9974127560853958, "mean_gen_accuracy": 0.8806903213262558, "mean_token_accuracy": 0.9111895114183426, "num_tokens": 1120397280.0, "sample_num_tokens": 8180.5, "step": 8130, "total_num_tokens": 1120430002.0, "z_loss": 0.0004175131325609982 }, { "copy_logits_max": -7.911961078643799, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.5, "epoch": 1.660709726831759, "gen_logits_max": 5.0227274894714355, "gen_logits_mean": -14.21676254272461, "gen_logits_min": -26.912553787231445, "gen_logits_std": 3.270754337310791, "gen_loss": 0.24764615297317505, "grad_norm": 0.35693178030195344, "learning_rate": 2.065894736842105e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9973436295986176, "mean_gen_accuracy": 0.8747621476650238, "mean_token_accuracy": 0.9088557660579681, "num_tokens": 1120660217.0, "sample_num_tokens": 7849.25, "step": 8131, "total_num_tokens": 1120691614.0, "z_loss": 0.00037729801260866225 }, { "copy_logits_max": -7.347139358520508, "copy_logits_min": -687500032.0, "copy_num_tokens": 469.4375, "epoch": 1.6609139647689557, "gen_logits_max": 3.4386982917785645, "gen_logits_mean": -16.675765991210938, "gen_logits_min": -29.005386352539062, "gen_logits_std": 3.3313887119293213, "gen_loss": 0.30079513788223267, "grad_norm": 0.3361294162278953, "learning_rate": 2.0657684210526315e-05, "loss": 0.2684, "mean_copy_accuracy": 0.996976912021637, "mean_gen_accuracy": 0.8777758628129959, "mean_token_accuracy": 0.9078023433685303, "num_tokens": 1120924903.0, "sample_num_tokens": 8304.75, "step": 8132, "total_num_tokens": 1120958122.0, "z_loss": 0.00047134520718827844 }, { "copy_logits_max": -9.797683715820312, "copy_logits_min": -687500032.0, "copy_num_tokens": 542.125, "epoch": 1.6611182027061526, "gen_logits_max": 3.520051956176758, "gen_logits_mean": -17.13064193725586, "gen_logits_min": -29.200708389282227, "gen_logits_std": 3.342531204223633, "gen_loss": 0.25276413559913635, "grad_norm": 0.3625283391875426, "learning_rate": 2.065642105263158e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9965669363737106, "mean_gen_accuracy": 0.8785363882780075, "mean_token_accuracy": 0.9084569960832596, "num_tokens": 1121186768.0, "sample_num_tokens": 8876.5, "step": 8133, "total_num_tokens": 1121222274.0, "z_loss": 0.00041627930477261543 }, { "copy_logits_max": -10.069441795349121, "copy_logits_min": -750000128.0, "copy_num_tokens": 681.75, "epoch": 1.6613224406433496, "gen_logits_max": 3.3931894302368164, "gen_logits_mean": -16.697063446044922, "gen_logits_min": -28.930591583251953, "gen_logits_std": 3.3421630859375, "gen_loss": 0.2238456904888153, "grad_norm": 0.6158768297168409, "learning_rate": 2.0655157894736844e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9974790960550308, "mean_gen_accuracy": 0.8789578676223755, "mean_token_accuracy": 0.9089480638504028, "num_tokens": 1121454064.0, "sample_num_tokens": 8948.0, "step": 8134, "total_num_tokens": 1121489856.0, "z_loss": 0.0003963843046221882 }, { "copy_logits_max": -8.180258750915527, "copy_logits_min": -625000064.0, "copy_num_tokens": 535.3125, "epoch": 1.6615266785805463, "gen_logits_max": 3.097074031829834, "gen_logits_mean": -17.615642547607422, "gen_logits_min": -29.64111328125, "gen_logits_std": 3.360233783721924, "gen_loss": 0.26567989587783813, "grad_norm": 0.31956881690448846, "learning_rate": 2.065389473684211e-05, "loss": 0.2504, "mean_copy_accuracy": 0.9968608021736145, "mean_gen_accuracy": 0.8894984722137451, "mean_token_accuracy": 0.9153922945261002, "num_tokens": 1121747666.0, "sample_num_tokens": 9206.0, "step": 8135, "total_num_tokens": 1121784490.0, "z_loss": 0.00047094374895095825 }, { "copy_logits_max": -10.02276611328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.75, "epoch": 1.6617309165177432, "gen_logits_max": 3.5332963466644287, "gen_logits_mean": -15.629054069519043, "gen_logits_min": -27.7919921875, "gen_logits_std": 3.2549521923065186, "gen_loss": 0.25216537714004517, "grad_norm": 0.33802373113579826, "learning_rate": 2.065263157894737e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9968880861997604, "mean_gen_accuracy": 0.8793308436870575, "mean_token_accuracy": 0.9091152995824814, "num_tokens": 1122026761.0, "sample_num_tokens": 8426.75, "step": 8136, "total_num_tokens": 1122060468.0, "z_loss": 0.0004494249587878585 }, { "copy_logits_max": -8.718843460083008, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.9375, "epoch": 1.6619351544549401, "gen_logits_max": 3.788639545440674, "gen_logits_mean": -16.84958267211914, "gen_logits_min": -28.93326187133789, "gen_logits_std": 3.302466869354248, "gen_loss": 0.2633460760116577, "grad_norm": 0.8642452462176684, "learning_rate": 2.0651368421052634e-05, "loss": 0.2682, "mean_copy_accuracy": 0.9961972683668137, "mean_gen_accuracy": 0.8819276541471481, "mean_token_accuracy": 0.9086377322673798, "num_tokens": 1122279170.0, "sample_num_tokens": 6973.5, "step": 8137, "total_num_tokens": 1122307064.0, "z_loss": 0.0005489106988534331 }, { "copy_logits_max": -7.649229049682617, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.125, "epoch": 1.6621393923921368, "gen_logits_max": 3.6631736755371094, "gen_logits_mean": -17.422624588012695, "gen_logits_min": -29.492748260498047, "gen_logits_std": 3.359832286834717, "gen_loss": 0.2677753269672394, "grad_norm": 0.3650552156798738, "learning_rate": 2.0650105263157895e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9962656199932098, "mean_gen_accuracy": 0.8817722052335739, "mean_token_accuracy": 0.9092583358287811, "num_tokens": 1122529090.0, "sample_num_tokens": 7586.5, "step": 8138, "total_num_tokens": 1122559436.0, "z_loss": 0.0006237231427803636 }, { "copy_logits_max": -7.876909255981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.4375, "epoch": 1.6623436303293335, "gen_logits_max": 5.712285995483398, "gen_logits_mean": -13.77333927154541, "gen_logits_min": -25.996440887451172, "gen_logits_std": 3.1985321044921875, "gen_loss": 0.32603663206100464, "grad_norm": 0.40806475252751173, "learning_rate": 2.064884210526316e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9948258548974991, "mean_gen_accuracy": 0.8757147043943405, "mean_token_accuracy": 0.8977538645267487, "num_tokens": 1122776236.0, "sample_num_tokens": 8763.5, "step": 8139, "total_num_tokens": 1122811290.0, "z_loss": 0.0007447267998941243 }, { "copy_logits_max": -7.638171195983887, "copy_logits_min": -687500032.0, "copy_num_tokens": 361.8125, "epoch": 1.6625478682665307, "gen_logits_max": 4.402316093444824, "gen_logits_mean": -16.635440826416016, "gen_logits_min": -28.837692260742188, "gen_logits_std": 3.3083951473236084, "gen_loss": 0.3006760776042938, "grad_norm": 0.32687828327050406, "learning_rate": 2.064757894736842e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9970015734434128, "mean_gen_accuracy": 0.8821713477373123, "mean_token_accuracy": 0.9079705327749252, "num_tokens": 1123049518.0, "sample_num_tokens": 8165.5, "step": 8140, "total_num_tokens": 1123082180.0, "z_loss": 0.0006791182095184922 }, { "copy_logits_max": -6.9038190841674805, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.875, "epoch": 1.6627521062037274, "gen_logits_max": 3.9283459186553955, "gen_logits_mean": -17.60419464111328, "gen_logits_min": -29.677318572998047, "gen_logits_std": 3.3462796211242676, "gen_loss": 0.2725544571876526, "grad_norm": 0.38141622196620234, "learning_rate": 2.0646315789473684e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9977268278598785, "mean_gen_accuracy": 0.8742640614509583, "mean_token_accuracy": 0.9031058102846146, "num_tokens": 1123318006.0, "sample_num_tokens": 8141.5, "step": 8141, "total_num_tokens": 1123350572.0, "z_loss": 0.0005908156745135784 }, { "copy_logits_max": -6.200839042663574, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.8125, "epoch": 1.662956344140924, "gen_logits_max": 3.579407215118408, "gen_logits_mean": -17.161169052124023, "gen_logits_min": -29.188772201538086, "gen_logits_std": 3.308478355407715, "gen_loss": 0.3183131217956543, "grad_norm": 0.3106714584611944, "learning_rate": 2.064505263157895e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9968698173761368, "mean_gen_accuracy": 0.8790589421987534, "mean_token_accuracy": 0.9095402806997299, "num_tokens": 1123607093.0, "sample_num_tokens": 8242.75, "step": 8142, "total_num_tokens": 1123640064.0, "z_loss": 0.0005927510210312903 }, { "copy_logits_max": -4.858875751495361, "copy_logits_min": -687500096.0, "copy_num_tokens": 884.0, "epoch": 1.663160582078121, "gen_logits_max": 2.867694139480591, "gen_logits_mean": -15.788430213928223, "gen_logits_min": -28.67496109008789, "gen_logits_std": 3.3352651596069336, "gen_loss": 0.2089768499135971, "grad_norm": 0.3265905964541088, "learning_rate": 2.0643789473684213e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9966404438018799, "mean_gen_accuracy": 0.8805592656135559, "mean_token_accuracy": 0.9088623970746994, "num_tokens": 1123910515.0, "sample_num_tokens": 11385.75, "step": 8143, "total_num_tokens": 1123956058.0, "z_loss": 0.0005053987260907888 }, { "copy_logits_max": -7.639424800872803, "copy_logits_min": -625000064.0, "copy_num_tokens": 534.375, "epoch": 1.663364820015318, "gen_logits_max": 2.6655216217041016, "gen_logits_mean": -19.085981369018555, "gen_logits_min": -30.91181182861328, "gen_logits_std": 3.4161536693573, "gen_loss": 0.2568875253200531, "grad_norm": 0.3479554326682072, "learning_rate": 2.0642526315789474e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9966686815023422, "mean_gen_accuracy": 0.8788640648126602, "mean_token_accuracy": 0.9061640501022339, "num_tokens": 1124186141.0, "sample_num_tokens": 9567.75, "step": 8144, "total_num_tokens": 1124224412.0, "z_loss": 0.00044910365249961615 }, { "copy_logits_max": -7.211872577667236, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.4375, "epoch": 1.6635690579525146, "gen_logits_max": 3.915459394454956, "gen_logits_mean": -17.00611686706543, "gen_logits_min": -29.263442993164062, "gen_logits_std": 3.3274028301239014, "gen_loss": 0.28792133927345276, "grad_norm": 0.3499734268771351, "learning_rate": 2.0641263157894738e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9961710721254349, "mean_gen_accuracy": 0.8775454163551331, "mean_token_accuracy": 0.9050180315971375, "num_tokens": 1124468464.0, "sample_num_tokens": 7924.5, "step": 8145, "total_num_tokens": 1124500162.0, "z_loss": 0.00047707458725199103 }, { "copy_logits_max": -5.776627063751221, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.1875, "epoch": 1.6637732958897116, "gen_logits_max": 3.1150996685028076, "gen_logits_mean": -18.11939239501953, "gen_logits_min": -29.88602638244629, "gen_logits_std": 3.3673834800720215, "gen_loss": 0.2642264664173126, "grad_norm": 0.39782877408023803, "learning_rate": 2.064e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9963793456554413, "mean_gen_accuracy": 0.8767568022012711, "mean_token_accuracy": 0.9027395993471146, "num_tokens": 1124731598.0, "sample_num_tokens": 8128.5, "step": 8146, "total_num_tokens": 1124764112.0, "z_loss": 0.00041891675209626555 }, { "copy_logits_max": -5.453210353851318, "copy_logits_min": -750000000.0, "copy_num_tokens": 663.875, "epoch": 1.6639775338269085, "gen_logits_max": 3.075791358947754, "gen_logits_mean": -17.111507415771484, "gen_logits_min": -29.317481994628906, "gen_logits_std": 3.3500311374664307, "gen_loss": 0.23929643630981445, "grad_norm": 0.3443588167415257, "learning_rate": 2.0638736842105263e-05, "loss": 0.286, "mean_copy_accuracy": 0.997025802731514, "mean_gen_accuracy": 0.8689311593770981, "mean_token_accuracy": 0.9031320214271545, "num_tokens": 1125012069.0, "sample_num_tokens": 9462.75, "step": 8147, "total_num_tokens": 1125049920.0, "z_loss": 0.0003824777086265385 }, { "copy_logits_max": -7.794216156005859, "copy_logits_min": -750000000.0, "copy_num_tokens": 458.125, "epoch": 1.6641817717641052, "gen_logits_max": 4.415413856506348, "gen_logits_mean": -17.253976821899414, "gen_logits_min": -28.841222763061523, "gen_logits_std": 3.3086581230163574, "gen_loss": 0.3234556019306183, "grad_norm": 0.34372434238839844, "learning_rate": 2.0637473684210528e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9968182593584061, "mean_gen_accuracy": 0.8763288706541061, "mean_token_accuracy": 0.9033747315406799, "num_tokens": 1125292088.0, "sample_num_tokens": 9774.0, "step": 8148, "total_num_tokens": 1125331184.0, "z_loss": 0.0005285951774567366 }, { "copy_logits_max": -3.3625264167785645, "copy_logits_min": -750000000.0, "copy_num_tokens": 814.75, "epoch": 1.664386009701302, "gen_logits_max": 2.3443260192871094, "gen_logits_mean": -17.711849212646484, "gen_logits_min": -29.86630630493164, "gen_logits_std": 3.3583383560180664, "gen_loss": 0.2856108546257019, "grad_norm": 0.3808211596395642, "learning_rate": 2.063621052631579e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9972221404314041, "mean_gen_accuracy": 0.8676060736179352, "mean_token_accuracy": 0.901836484670639, "num_tokens": 1125564511.0, "sample_num_tokens": 10339.75, "step": 8149, "total_num_tokens": 1125605870.0, "z_loss": 0.0005012620822526515 }, { "copy_logits_max": -5.7904253005981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.25, "epoch": 1.6645902476384988, "gen_logits_max": 3.520277500152588, "gen_logits_mean": -17.517690658569336, "gen_logits_min": -29.284400939941406, "gen_logits_std": 3.3261163234710693, "gen_loss": 0.26416945457458496, "grad_norm": 0.35803051697799054, "learning_rate": 2.0634947368421053e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9973367601633072, "mean_gen_accuracy": 0.8852590769529343, "mean_token_accuracy": 0.9117226153612137, "num_tokens": 1125847299.0, "sample_num_tokens": 8329.75, "step": 8150, "total_num_tokens": 1125880618.0, "z_loss": 0.0003926834324374795 }, { "copy_logits_max": -5.960423469543457, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.8125, "epoch": 1.6647944855756958, "gen_logits_max": 2.981152057647705, "gen_logits_mean": -18.366836547851562, "gen_logits_min": -30.513404846191406, "gen_logits_std": 3.357041597366333, "gen_loss": 0.27081000804901123, "grad_norm": 0.34682074049250283, "learning_rate": 2.0633684210526317e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9963422268629074, "mean_gen_accuracy": 0.8761600255966187, "mean_token_accuracy": 0.907605916261673, "num_tokens": 1126121281.0, "sample_num_tokens": 8130.25, "step": 8151, "total_num_tokens": 1126153802.0, "z_loss": 0.00044538601650856435 }, { "copy_logits_max": -8.018497467041016, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.3125, "epoch": 1.6649987235128925, "gen_logits_max": 4.28391170501709, "gen_logits_mean": -16.924335479736328, "gen_logits_min": -29.174245834350586, "gen_logits_std": 3.3208017349243164, "gen_loss": 0.28459006547927856, "grad_norm": 0.3521252325287246, "learning_rate": 2.063242105263158e-05, "loss": 0.263, "mean_copy_accuracy": 0.9966631084680557, "mean_gen_accuracy": 0.8862676322460175, "mean_token_accuracy": 0.9103815704584122, "num_tokens": 1126382169.0, "sample_num_tokens": 7353.25, "step": 8152, "total_num_tokens": 1126411582.0, "z_loss": 0.00044822716154158115 }, { "copy_logits_max": -2.7519338130950928, "copy_logits_min": -750000000.0, "copy_num_tokens": 289.0, "epoch": 1.6652029614500894, "gen_logits_max": 4.3553361892700195, "gen_logits_mean": -16.63631820678711, "gen_logits_min": -27.773324966430664, "gen_logits_std": 3.170220136642456, "gen_loss": 0.32186129689216614, "grad_norm": 0.35912493479985386, "learning_rate": 2.0631157894736842e-05, "loss": 0.2852, "mean_copy_accuracy": 0.996548056602478, "mean_gen_accuracy": 0.8744979202747345, "mean_token_accuracy": 0.8996889889240265, "num_tokens": 1126645942.0, "sample_num_tokens": 6964.0, "step": 8153, "total_num_tokens": 1126673798.0, "z_loss": 0.0005278325988911092 }, { "copy_logits_max": -1.5364389419555664, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.125, "epoch": 1.6654071993872863, "gen_logits_max": 4.003958702087402, "gen_logits_mean": -15.64111614227295, "gen_logits_min": -27.17494773864746, "gen_logits_std": 3.1781396865844727, "gen_loss": 0.28279057145118713, "grad_norm": 0.3717183617372411, "learning_rate": 2.0629894736842107e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9969049543142319, "mean_gen_accuracy": 0.8786854147911072, "mean_token_accuracy": 0.9082725793123245, "num_tokens": 1126930093.0, "sample_num_tokens": 9339.25, "step": 8154, "total_num_tokens": 1126967450.0, "z_loss": 0.00043021931196562946 }, { "copy_logits_max": -0.025377392768859863, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.875, "epoch": 1.665611437324483, "gen_logits_max": 3.523451805114746, "gen_logits_mean": -16.230571746826172, "gen_logits_min": -27.80291748046875, "gen_logits_std": 3.204741954803467, "gen_loss": 0.24698545038700104, "grad_norm": 0.33696153039252114, "learning_rate": 2.0628631578947368e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9974607974290848, "mean_gen_accuracy": 0.8783890902996063, "mean_token_accuracy": 0.9093201458454132, "num_tokens": 1127218179.0, "sample_num_tokens": 8030.25, "step": 8155, "total_num_tokens": 1127250300.0, "z_loss": 0.00038933096220716834 }, { "copy_logits_max": -0.40059900283813477, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.75, "epoch": 1.6658156752616797, "gen_logits_max": 3.509810447692871, "gen_logits_mean": -15.559354782104492, "gen_logits_min": -27.442907333374023, "gen_logits_std": 3.206214666366577, "gen_loss": 0.2891145646572113, "grad_norm": 0.32547593913849127, "learning_rate": 2.0627368421052632e-05, "loss": 0.273, "mean_copy_accuracy": 0.9970585703849792, "mean_gen_accuracy": 0.8765077441930771, "mean_token_accuracy": 0.9070635885000229, "num_tokens": 1127489863.0, "sample_num_tokens": 7828.25, "step": 8156, "total_num_tokens": 1127521176.0, "z_loss": 0.0004738092829938978 }, { "copy_logits_max": -3.2187657356262207, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.8125, "epoch": 1.6660199131988767, "gen_logits_max": 3.625883102416992, "gen_logits_mean": -15.642410278320312, "gen_logits_min": -27.307851791381836, "gen_logits_std": 3.1960840225219727, "gen_loss": 0.312033474445343, "grad_norm": 0.3569016015727208, "learning_rate": 2.0626105263157893e-05, "loss": 0.2825, "mean_copy_accuracy": 0.996965616941452, "mean_gen_accuracy": 0.8733461201190948, "mean_token_accuracy": 0.9055529236793518, "num_tokens": 1127763519.0, "sample_num_tokens": 8637.25, "step": 8157, "total_num_tokens": 1127798068.0, "z_loss": 0.0005349002312868834 }, { "copy_logits_max": -1.7869549989700317, "copy_logits_min": -750000064.0, "copy_num_tokens": 435.25, "epoch": 1.6662241511360736, "gen_logits_max": 3.840172290802002, "gen_logits_mean": -15.319560050964355, "gen_logits_min": -26.63905143737793, "gen_logits_std": 3.134643316268921, "gen_loss": 0.3121509552001953, "grad_norm": 0.35992852998296654, "learning_rate": 2.0624842105263157e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9962612688541412, "mean_gen_accuracy": 0.8752082288265228, "mean_token_accuracy": 0.9046895503997803, "num_tokens": 1128020495.0, "sample_num_tokens": 8167.25, "step": 8158, "total_num_tokens": 1128053164.0, "z_loss": 0.0005615376285277307 }, { "copy_logits_max": -4.720531463623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 200.125, "epoch": 1.6664283890732703, "gen_logits_max": 4.900900363922119, "gen_logits_mean": -14.719377517700195, "gen_logits_min": -25.580711364746094, "gen_logits_std": 2.9529988765716553, "gen_loss": 0.34064340591430664, "grad_norm": 0.3933060946256253, "learning_rate": 2.062357894736842e-05, "loss": 0.3127, "mean_copy_accuracy": 0.9961724877357483, "mean_gen_accuracy": 0.8647222518920898, "mean_token_accuracy": 0.8922938108444214, "num_tokens": 1128268816.0, "sample_num_tokens": 6101.0, "step": 8159, "total_num_tokens": 1128293220.0, "z_loss": 0.0005915118963457644 }, { "copy_logits_max": -1.000354528427124, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.1875, "epoch": 1.6666326270104672, "gen_logits_max": 4.3070268630981445, "gen_logits_mean": -15.05211353302002, "gen_logits_min": -26.347965240478516, "gen_logits_std": 3.16471004486084, "gen_loss": 0.2623816430568695, "grad_norm": 0.3732901630558335, "learning_rate": 2.0622315789473686e-05, "loss": 0.2485, "mean_copy_accuracy": 0.9975778311491013, "mean_gen_accuracy": 0.8822872787714005, "mean_token_accuracy": 0.914043977856636, "num_tokens": 1128529232.0, "sample_num_tokens": 9108.0, "step": 8160, "total_num_tokens": 1128565664.0, "z_loss": 0.0004887506365776062 }, { "copy_logits_max": -2.7333197593688965, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.125, "epoch": 1.6668368649476641, "gen_logits_max": 3.1952691078186035, "gen_logits_mean": -16.493179321289062, "gen_logits_min": -27.996051788330078, "gen_logits_std": 3.213479518890381, "gen_loss": 0.23593290150165558, "grad_norm": 0.34900783711235567, "learning_rate": 2.062105263157895e-05, "loss": 0.2595, "mean_copy_accuracy": 0.997188150882721, "mean_gen_accuracy": 0.8829543590545654, "mean_token_accuracy": 0.9123914688825607, "num_tokens": 1128812370.0, "sample_num_tokens": 9374.0, "step": 8161, "total_num_tokens": 1128849866.0, "z_loss": 0.00042312638834118843 }, { "copy_logits_max": -2.137820243835449, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.5, "epoch": 1.6670411028848608, "gen_logits_max": 3.0642311573028564, "gen_logits_mean": -16.963268280029297, "gen_logits_min": -28.70798110961914, "gen_logits_std": 3.2541592121124268, "gen_loss": 0.2750856876373291, "grad_norm": 0.33171481313427154, "learning_rate": 2.061978947368421e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9976528733968735, "mean_gen_accuracy": 0.8780248761177063, "mean_token_accuracy": 0.9107542335987091, "num_tokens": 1129104033.0, "sample_num_tokens": 8274.75, "step": 8162, "total_num_tokens": 1129137132.0, "z_loss": 0.0004469634732231498 }, { "copy_logits_max": -1.9138591289520264, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.3125, "epoch": 1.6672453408220576, "gen_logits_max": 3.3563218116760254, "gen_logits_mean": -15.943960189819336, "gen_logits_min": -27.334613800048828, "gen_logits_std": 3.1305789947509766, "gen_loss": 0.26156294345855713, "grad_norm": 0.36897154057916176, "learning_rate": 2.0618526315789475e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9976606369018555, "mean_gen_accuracy": 0.8764611184597015, "mean_token_accuracy": 0.909171387553215, "num_tokens": 1129398075.0, "sample_num_tokens": 8939.25, "step": 8163, "total_num_tokens": 1129433832.0, "z_loss": 0.0004572700709104538 }, { "copy_logits_max": -4.235922813415527, "copy_logits_min": -750000000.0, "copy_num_tokens": 230.5, "epoch": 1.6674495787592545, "gen_logits_max": 4.281006813049316, "gen_logits_mean": -15.96302318572998, "gen_logits_min": -27.02252769470215, "gen_logits_std": 3.1287589073181152, "gen_loss": 0.28251880407333374, "grad_norm": 0.3324296402113643, "learning_rate": 2.0617263157894736e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9960662573575974, "mean_gen_accuracy": 0.8811897188425064, "mean_token_accuracy": 0.9068514108657837, "num_tokens": 1129674667.0, "sample_num_tokens": 6903.75, "step": 8164, "total_num_tokens": 1129702282.0, "z_loss": 0.0005021886900067329 }, { "copy_logits_max": -2.760758399963379, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.0, "epoch": 1.6676538166964514, "gen_logits_max": 3.3773977756500244, "gen_logits_mean": -16.537761688232422, "gen_logits_min": -27.82696533203125, "gen_logits_std": 3.1976242065429688, "gen_loss": 0.26191675662994385, "grad_norm": 0.3640544590972184, "learning_rate": 2.0616e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9962061643600464, "mean_gen_accuracy": 0.8802106082439423, "mean_token_accuracy": 0.9063223451375961, "num_tokens": 1129928362.0, "sample_num_tokens": 8610.5, "step": 8165, "total_num_tokens": 1129962804.0, "z_loss": 0.0004516115295700729 }, { "copy_logits_max": -3.951476812362671, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.875, "epoch": 1.6678580546336481, "gen_logits_max": 3.456305980682373, "gen_logits_mean": -16.561141967773438, "gen_logits_min": -27.93025016784668, "gen_logits_std": 3.1524527072906494, "gen_loss": 0.28166723251342773, "grad_norm": 0.390687552175064, "learning_rate": 2.061473684210526e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9956588298082352, "mean_gen_accuracy": 0.8842184096574783, "mean_token_accuracy": 0.9091173708438873, "num_tokens": 1130183649.0, "sample_num_tokens": 7927.75, "step": 8166, "total_num_tokens": 1130215360.0, "z_loss": 0.0004891565768048167 }, { "copy_logits_max": -3.5571556091308594, "copy_logits_min": -750000064.0, "copy_num_tokens": 496.5, "epoch": 1.668062292570845, "gen_logits_max": 2.481454610824585, "gen_logits_mean": -18.145864486694336, "gen_logits_min": -29.713741302490234, "gen_logits_std": 3.307093381881714, "gen_loss": 0.2880321443080902, "grad_norm": 0.37223285599296485, "learning_rate": 2.061347368421053e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9967071413993835, "mean_gen_accuracy": 0.8764512836933136, "mean_token_accuracy": 0.9053103178739548, "num_tokens": 1130449797.0, "sample_num_tokens": 8258.25, "step": 8167, "total_num_tokens": 1130482830.0, "z_loss": 0.00043610300053842366 }, { "copy_logits_max": -2.4819626808166504, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.375, "epoch": 1.668266530508042, "gen_logits_max": 2.470104932785034, "gen_logits_mean": -17.383420944213867, "gen_logits_min": -28.858360290527344, "gen_logits_std": 3.221473217010498, "gen_loss": 0.26170122623443604, "grad_norm": 0.3713964466586051, "learning_rate": 2.061221052631579e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9967214912176132, "mean_gen_accuracy": 0.8737484365701675, "mean_token_accuracy": 0.9054432660341263, "num_tokens": 1130716089.0, "sample_num_tokens": 8688.25, "step": 8168, "total_num_tokens": 1130750842.0, "z_loss": 0.0004139032680541277 }, { "copy_logits_max": -5.058905601501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.5625, "epoch": 1.6684707684452387, "gen_logits_max": 2.884065866470337, "gen_logits_mean": -17.098949432373047, "gen_logits_min": -28.265609741210938, "gen_logits_std": 3.2232863903045654, "gen_loss": 0.25004392862319946, "grad_norm": 0.41503111955959043, "learning_rate": 2.0610947368421054e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9971217215061188, "mean_gen_accuracy": 0.8794734925031662, "mean_token_accuracy": 0.9084568023681641, "num_tokens": 1130991359.0, "sample_num_tokens": 8250.75, "step": 8169, "total_num_tokens": 1131024362.0, "z_loss": 0.00037879805313423276 }, { "copy_logits_max": -5.095340728759766, "copy_logits_min": -687500032.0, "copy_num_tokens": 510.625, "epoch": 1.6686750063824354, "gen_logits_max": 3.3437368869781494, "gen_logits_mean": -16.58828353881836, "gen_logits_min": -27.623626708984375, "gen_logits_std": 3.1502280235290527, "gen_loss": 0.27110064029693604, "grad_norm": 0.37780680729130234, "learning_rate": 2.0609684210526315e-05, "loss": 0.2756, "mean_copy_accuracy": 0.996427446603775, "mean_gen_accuracy": 0.8792498558759689, "mean_token_accuracy": 0.9047485142946243, "num_tokens": 1131243597.0, "sample_num_tokens": 9478.75, "step": 8170, "total_num_tokens": 1131281512.0, "z_loss": 0.00041232083458453417 }, { "copy_logits_max": -3.28855299949646, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.0, "epoch": 1.6688792443196325, "gen_logits_max": 3.6484339237213135, "gen_logits_mean": -15.526000022888184, "gen_logits_min": -26.613502502441406, "gen_logits_std": 3.1232969760894775, "gen_loss": 0.28389978408813477, "grad_norm": 0.3286566910019118, "learning_rate": 2.060842105263158e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9972065389156342, "mean_gen_accuracy": 0.8830733448266983, "mean_token_accuracy": 0.9151877164840698, "num_tokens": 1131544210.0, "sample_num_tokens": 8925.0, "step": 8171, "total_num_tokens": 1131579910.0, "z_loss": 0.0003890325897373259 }, { "copy_logits_max": -6.139829635620117, "copy_logits_min": -687500032.0, "copy_num_tokens": 262.3125, "epoch": 1.6690834822568292, "gen_logits_max": 2.916746139526367, "gen_logits_mean": -19.49248504638672, "gen_logits_min": -31.186674118041992, "gen_logits_std": 3.369114637374878, "gen_loss": 0.27118727564811707, "grad_norm": 0.3544584892763111, "learning_rate": 2.060715789473684e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9967909306287766, "mean_gen_accuracy": 0.8809823542833328, "mean_token_accuracy": 0.9110801070928574, "num_tokens": 1131837242.0, "sample_num_tokens": 7005.0, "step": 8172, "total_num_tokens": 1131865262.0, "z_loss": 0.00037916400469839573 }, { "copy_logits_max": -4.407715797424316, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.1875, "epoch": 1.669287720194026, "gen_logits_max": 2.6710686683654785, "gen_logits_mean": -16.999488830566406, "gen_logits_min": -28.144092559814453, "gen_logits_std": 3.1854782104492188, "gen_loss": 0.2389938235282898, "grad_norm": 0.3443642685443232, "learning_rate": 2.0605894736842105e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9973078966140747, "mean_gen_accuracy": 0.8827681839466095, "mean_token_accuracy": 0.9106602966785431, "num_tokens": 1132114378.0, "sample_num_tokens": 8463.5, "step": 8173, "total_num_tokens": 1132148232.0, "z_loss": 0.00030175899155437946 }, { "copy_logits_max": -5.051093101501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.3125, "epoch": 1.6694919581312229, "gen_logits_max": 3.4296836853027344, "gen_logits_mean": -16.730209350585938, "gen_logits_min": -28.31314468383789, "gen_logits_std": 3.178544282913208, "gen_loss": 0.31123659014701843, "grad_norm": 0.33368538217616894, "learning_rate": 2.060463157894737e-05, "loss": 0.2881, "mean_copy_accuracy": 0.9968788623809814, "mean_gen_accuracy": 0.8711962848901749, "mean_token_accuracy": 0.9041933119297028, "num_tokens": 1132402656.0, "sample_num_tokens": 7708.5, "step": 8174, "total_num_tokens": 1132433490.0, "z_loss": 0.0004173479392193258 }, { "copy_logits_max": -8.851202011108398, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.1875, "epoch": 1.6696961960684198, "gen_logits_max": 3.765988826751709, "gen_logits_mean": -16.577974319458008, "gen_logits_min": -28.857322692871094, "gen_logits_std": 3.2803683280944824, "gen_loss": 0.27372026443481445, "grad_norm": 0.35287587571673085, "learning_rate": 2.0603368421052633e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9972447603940964, "mean_gen_accuracy": 0.8692374676465988, "mean_token_accuracy": 0.9038428068161011, "num_tokens": 1132680295.0, "sample_num_tokens": 8579.75, "step": 8175, "total_num_tokens": 1132714614.0, "z_loss": 0.0003874031826853752 }, { "copy_logits_max": -6.044064521789551, "copy_logits_min": -750000128.0, "copy_num_tokens": 443.25, "epoch": 1.6699004340056165, "gen_logits_max": 2.2292428016662598, "gen_logits_mean": -19.06511878967285, "gen_logits_min": -30.438230514526367, "gen_logits_std": 3.3439714908599854, "gen_loss": 0.24233439564704895, "grad_norm": 0.33996396733354867, "learning_rate": 2.0602105263157898e-05, "loss": 0.2443, "mean_copy_accuracy": 0.9975759088993073, "mean_gen_accuracy": 0.8900743722915649, "mean_token_accuracy": 0.9151750802993774, "num_tokens": 1132925475.0, "sample_num_tokens": 8909.75, "step": 8176, "total_num_tokens": 1132961114.0, "z_loss": 0.0003213495365343988 }, { "copy_logits_max": -7.852985382080078, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.9375, "epoch": 1.6701046719428134, "gen_logits_max": 5.045318126678467, "gen_logits_mean": -15.578447341918945, "gen_logits_min": -27.744678497314453, "gen_logits_std": 3.225588798522949, "gen_loss": 0.2886020839214325, "grad_norm": 0.35532335075262467, "learning_rate": 2.060084210526316e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9966255724430084, "mean_gen_accuracy": 0.8752311766147614, "mean_token_accuracy": 0.9042470008134842, "num_tokens": 1133192186.0, "sample_num_tokens": 8154.0, "step": 8177, "total_num_tokens": 1133224802.0, "z_loss": 0.0003898385912179947 }, { "copy_logits_max": -5.603512763977051, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.125, "epoch": 1.6703089098800104, "gen_logits_max": 2.611678123474121, "gen_logits_mean": -18.683452606201172, "gen_logits_min": -30.218700408935547, "gen_logits_std": 3.2963032722473145, "gen_loss": 0.30698513984680176, "grad_norm": 0.36206016699472005, "learning_rate": 2.0599578947368423e-05, "loss": 0.294, "mean_copy_accuracy": 0.9962573200464249, "mean_gen_accuracy": 0.8734233379364014, "mean_token_accuracy": 0.9002279490232468, "num_tokens": 1133440926.0, "sample_num_tokens": 9030.0, "step": 8178, "total_num_tokens": 1133477046.0, "z_loss": 0.0004285684262868017 }, { "copy_logits_max": -4.502302169799805, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.375, "epoch": 1.670513147817207, "gen_logits_max": 2.750544786453247, "gen_logits_mean": -17.838218688964844, "gen_logits_min": -29.82970428466797, "gen_logits_std": 3.3087403774261475, "gen_loss": 0.25721168518066406, "grad_norm": 0.35987407959205264, "learning_rate": 2.0598315789473684e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9970764815807343, "mean_gen_accuracy": 0.8763651698827744, "mean_token_accuracy": 0.9077541828155518, "num_tokens": 1133698286.0, "sample_num_tokens": 8071.5, "step": 8179, "total_num_tokens": 1133730572.0, "z_loss": 0.0004193695494905114 }, { "copy_logits_max": -6.091902256011963, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.5, "epoch": 1.6707173857544038, "gen_logits_max": 3.6078383922576904, "gen_logits_mean": -16.87021255493164, "gen_logits_min": -28.203203201293945, "gen_logits_std": 3.142810583114624, "gen_loss": 0.2765582203865051, "grad_norm": 0.3403242958780901, "learning_rate": 2.0597052631578948e-05, "loss": 0.259, "mean_copy_accuracy": 0.9967945665121078, "mean_gen_accuracy": 0.8827748596668243, "mean_token_accuracy": 0.9102111160755157, "num_tokens": 1133969970.0, "sample_num_tokens": 8547.0, "step": 8180, "total_num_tokens": 1134004158.0, "z_loss": 0.00044233203516341746 }, { "copy_logits_max": -3.3779211044311523, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.0, "epoch": 1.6709216236916007, "gen_logits_max": 3.4271323680877686, "gen_logits_mean": -15.881508827209473, "gen_logits_min": -27.22209930419922, "gen_logits_std": 3.1578426361083984, "gen_loss": 0.27716994285583496, "grad_norm": 0.33734570178837, "learning_rate": 2.059578947368421e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9962359815835953, "mean_gen_accuracy": 0.8719038516283035, "mean_token_accuracy": 0.9033827483654022, "num_tokens": 1134251234.0, "sample_num_tokens": 8310.0, "step": 8181, "total_num_tokens": 1134284474.0, "z_loss": 0.0004912856966257095 }, { "copy_logits_max": -6.831328392028809, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.1875, "epoch": 1.6711258616287976, "gen_logits_max": 3.354644536972046, "gen_logits_mean": -16.638565063476562, "gen_logits_min": -27.802608489990234, "gen_logits_std": 3.1255874633789062, "gen_loss": 0.29549598693847656, "grad_norm": 0.3542676086160015, "learning_rate": 2.0594526315789473e-05, "loss": 0.2855, "mean_copy_accuracy": 0.996413066983223, "mean_gen_accuracy": 0.8744329810142517, "mean_token_accuracy": 0.9000727832317352, "num_tokens": 1134523818.0, "sample_num_tokens": 9314.5, "step": 8182, "total_num_tokens": 1134561076.0, "z_loss": 0.0004805788630619645 }, { "copy_logits_max": -6.857869625091553, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.0625, "epoch": 1.6713300995659943, "gen_logits_max": 4.167910575866699, "gen_logits_mean": -15.417757034301758, "gen_logits_min": -26.94411849975586, "gen_logits_std": 3.077944755554199, "gen_loss": 0.2870330810546875, "grad_norm": 0.3318258910275179, "learning_rate": 2.0593263157894738e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9963317811489105, "mean_gen_accuracy": 0.8825257271528244, "mean_token_accuracy": 0.9081982970237732, "num_tokens": 1134805875.0, "sample_num_tokens": 7611.75, "step": 8183, "total_num_tokens": 1134836322.0, "z_loss": 0.0005003141122870147 }, { "copy_logits_max": -5.338771820068359, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.5625, "epoch": 1.6715343375031912, "gen_logits_max": 4.580153942108154, "gen_logits_mean": -14.85743236541748, "gen_logits_min": -25.960355758666992, "gen_logits_std": 3.0474252700805664, "gen_loss": 0.29834431409835815, "grad_norm": 0.3369404232340408, "learning_rate": 2.0592000000000002e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9973180890083313, "mean_gen_accuracy": 0.8827547132968903, "mean_token_accuracy": 0.9070139825344086, "num_tokens": 1135077220.0, "sample_num_tokens": 8670.5, "step": 8184, "total_num_tokens": 1135111902.0, "z_loss": 0.0004575761849991977 }, { "copy_logits_max": -6.6214165687561035, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.5625, "epoch": 1.6717385754403882, "gen_logits_max": 3.5574591159820557, "gen_logits_mean": -18.096860885620117, "gen_logits_min": -29.396270751953125, "gen_logits_std": 3.2527570724487305, "gen_loss": 0.2915307581424713, "grad_norm": 0.34327268602781696, "learning_rate": 2.0590736842105263e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9964134693145752, "mean_gen_accuracy": 0.8706050664186478, "mean_token_accuracy": 0.9008604884147644, "num_tokens": 1135368335.0, "sample_num_tokens": 7562.25, "step": 8185, "total_num_tokens": 1135398584.0, "z_loss": 0.00046722133993171155 }, { "copy_logits_max": -4.713469505310059, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.0, "epoch": 1.6719428133775849, "gen_logits_max": 3.634449005126953, "gen_logits_mean": -15.098512649536133, "gen_logits_min": -26.68832778930664, "gen_logits_std": 3.0185251235961914, "gen_loss": 0.28475868701934814, "grad_norm": 0.3227782795260401, "learning_rate": 2.0589473684210527e-05, "loss": 0.2824, "mean_copy_accuracy": 0.9975738525390625, "mean_gen_accuracy": 0.8742287904024124, "mean_token_accuracy": 0.9038163125514984, "num_tokens": 1135659765.0, "sample_num_tokens": 8494.75, "step": 8186, "total_num_tokens": 1135693744.0, "z_loss": 0.000467123172711581 }, { "copy_logits_max": -4.399505138397217, "copy_logits_min": -687500032.0, "copy_num_tokens": 620.875, "epoch": 1.6721470513147816, "gen_logits_max": 2.28623104095459, "gen_logits_mean": -17.53406524658203, "gen_logits_min": -29.28404426574707, "gen_logits_std": 3.211380958557129, "gen_loss": 0.26207292079925537, "grad_norm": 0.34235296982684055, "learning_rate": 2.058821052631579e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9971056580543518, "mean_gen_accuracy": 0.8759318590164185, "mean_token_accuracy": 0.906396210193634, "num_tokens": 1135938441.0, "sample_num_tokens": 9300.25, "step": 8187, "total_num_tokens": 1135975642.0, "z_loss": 0.000443093478679657 }, { "copy_logits_max": -2.2842023372650146, "copy_logits_min": -687500096.0, "copy_num_tokens": 891.125, "epoch": 1.6723512892519785, "gen_logits_max": 2.6047167778015137, "gen_logits_mean": -15.981474876403809, "gen_logits_min": -27.841184616088867, "gen_logits_std": 3.1582982540130615, "gen_loss": 0.26444074511528015, "grad_norm": 0.3605134842965056, "learning_rate": 2.0586947368421053e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9981691241264343, "mean_gen_accuracy": 0.8722475916147232, "mean_token_accuracy": 0.908376932144165, "num_tokens": 1136220129.0, "sample_num_tokens": 10783.75, "step": 8188, "total_num_tokens": 1136263264.0, "z_loss": 0.000424085243139416 }, { "copy_logits_max": -4.498132705688477, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.0, "epoch": 1.6725555271891754, "gen_logits_max": 4.320474624633789, "gen_logits_mean": -15.352312088012695, "gen_logits_min": -27.476974487304688, "gen_logits_std": 3.1325817108154297, "gen_loss": 0.28926485776901245, "grad_norm": 0.3724917752637343, "learning_rate": 2.0585684210526317e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9966372698545456, "mean_gen_accuracy": 0.8749375939369202, "mean_token_accuracy": 0.9021247774362564, "num_tokens": 1136487176.0, "sample_num_tokens": 9882.0, "step": 8189, "total_num_tokens": 1136526704.0, "z_loss": 0.00047033108421601355 }, { "copy_logits_max": -5.21661376953125, "copy_logits_min": -750000064.0, "copy_num_tokens": 444.75, "epoch": 1.6727597651263721, "gen_logits_max": 3.2752625942230225, "gen_logits_mean": -17.71518898010254, "gen_logits_min": -29.360214233398438, "gen_logits_std": 3.269110679626465, "gen_loss": 0.2745324969291687, "grad_norm": 0.34187586571762146, "learning_rate": 2.0584421052631578e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9966088980436325, "mean_gen_accuracy": 0.8870446085929871, "mean_token_accuracy": 0.9106937795877457, "num_tokens": 1136774084.0, "sample_num_tokens": 9319.0, "step": 8190, "total_num_tokens": 1136811360.0, "z_loss": 0.0004404656065162271 }, { "copy_logits_max": -5.929923057556152, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.5625, "epoch": 1.672964003063569, "gen_logits_max": 3.446256399154663, "gen_logits_mean": -17.132705688476562, "gen_logits_min": -28.78898048400879, "gen_logits_std": 3.2117669582366943, "gen_loss": 0.2897495627403259, "grad_norm": 0.3242714478644415, "learning_rate": 2.0583157894736842e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9973464161157608, "mean_gen_accuracy": 0.8770444393157959, "mean_token_accuracy": 0.9080158323049545, "num_tokens": 1137080307.0, "sample_num_tokens": 7329.25, "step": 8191, "total_num_tokens": 1137109624.0, "z_loss": 0.00043461949098855257 }, { "copy_logits_max": -2.529627799987793, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.25, "epoch": 1.673168241000766, "gen_logits_max": 3.283041000366211, "gen_logits_mean": -16.16561508178711, "gen_logits_min": -27.470081329345703, "gen_logits_std": 3.1103029251098633, "gen_loss": 0.2802765965461731, "grad_norm": 0.37077836120303775, "learning_rate": 2.0581894736842106e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9973639994859695, "mean_gen_accuracy": 0.8736939430236816, "mean_token_accuracy": 0.9070135951042175, "num_tokens": 1137358035.0, "sample_num_tokens": 9525.25, "step": 8192, "total_num_tokens": 1137396136.0, "z_loss": 0.0004416760348249227 }, { "copy_logits_max": -5.811959743499756, "copy_logits_min": -750000000.0, "copy_num_tokens": 284.75, "epoch": 1.6733724789379627, "gen_logits_max": 4.1068220138549805, "gen_logits_mean": -17.795825958251953, "gen_logits_min": -29.194271087646484, "gen_logits_std": 3.2779409885406494, "gen_loss": 0.29628485441207886, "grad_norm": 0.40121506685563585, "learning_rate": 2.058063157894737e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9964118152856827, "mean_gen_accuracy": 0.8819293975830078, "mean_token_accuracy": 0.9074336886405945, "num_tokens": 1137627438.0, "sample_num_tokens": 7852.5, "step": 8193, "total_num_tokens": 1137658848.0, "z_loss": 0.00047111083404161036 }, { "copy_logits_max": -5.764063835144043, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.75, "epoch": 1.6735767168751594, "gen_logits_max": 3.8418288230895996, "gen_logits_mean": -17.31221580505371, "gen_logits_min": -29.080894470214844, "gen_logits_std": 3.235060214996338, "gen_loss": 0.28342247009277344, "grad_norm": 0.36199213023268084, "learning_rate": 2.057936842105263e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9959534555673599, "mean_gen_accuracy": 0.8875206857919693, "mean_token_accuracy": 0.9095738083124161, "num_tokens": 1137888050.0, "sample_num_tokens": 6994.5, "step": 8194, "total_num_tokens": 1137916028.0, "z_loss": 0.0004374018753878772 }, { "copy_logits_max": -0.7370961904525757, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.125, "epoch": 1.6737809548123566, "gen_logits_max": 4.865408897399902, "gen_logits_mean": -13.906022071838379, "gen_logits_min": -25.61764907836914, "gen_logits_std": 3.109436511993408, "gen_loss": 0.2282026708126068, "grad_norm": 0.36943906728679665, "learning_rate": 2.0578105263157896e-05, "loss": 0.2622, "mean_copy_accuracy": 0.995555192232132, "mean_gen_accuracy": 0.8855665922164917, "mean_token_accuracy": 0.9091561287641525, "num_tokens": 1138144607.0, "sample_num_tokens": 8083.75, "step": 8195, "total_num_tokens": 1138176942.0, "z_loss": 0.0003459009458310902 }, { "copy_logits_max": -2.6138057708740234, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.125, "epoch": 1.6739851927495533, "gen_logits_max": 4.685856819152832, "gen_logits_mean": -15.143584251403809, "gen_logits_min": -26.90686798095703, "gen_logits_std": 3.1345150470733643, "gen_loss": 0.32200223207473755, "grad_norm": 0.3643465725662191, "learning_rate": 2.0576842105263157e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9960060119628906, "mean_gen_accuracy": 0.8707979470491409, "mean_token_accuracy": 0.8988295644521713, "num_tokens": 1138410276.0, "sample_num_tokens": 7783.5, "step": 8196, "total_num_tokens": 1138441410.0, "z_loss": 0.0005117273540236056 }, { "copy_logits_max": -2.283144235610962, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.5, "epoch": 1.67418943068675, "gen_logits_max": 3.9004266262054443, "gen_logits_mean": -15.416648864746094, "gen_logits_min": -27.013206481933594, "gen_logits_std": 3.1541032791137695, "gen_loss": 0.2597465217113495, "grad_norm": 0.3646597917984078, "learning_rate": 2.057557894736842e-05, "loss": 0.269, "mean_copy_accuracy": 0.9959825426340103, "mean_gen_accuracy": 0.8759030699729919, "mean_token_accuracy": 0.9079983532428741, "num_tokens": 1138676784.0, "sample_num_tokens": 8655.5, "step": 8197, "total_num_tokens": 1138711406.0, "z_loss": 0.00043532310519367456 }, { "copy_logits_max": -1.8545650243759155, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.6875, "epoch": 1.674393668623947, "gen_logits_max": 4.637174606323242, "gen_logits_mean": -15.281952857971191, "gen_logits_min": -27.486717224121094, "gen_logits_std": 3.166934013366699, "gen_loss": 0.28838038444519043, "grad_norm": 0.36645080846104444, "learning_rate": 2.0574315789473682e-05, "loss": 0.2802, "mean_copy_accuracy": 0.996578112244606, "mean_gen_accuracy": 0.8737771660089493, "mean_token_accuracy": 0.9029273241758347, "num_tokens": 1138946435.0, "sample_num_tokens": 7237.75, "step": 8198, "total_num_tokens": 1138975386.0, "z_loss": 0.0005300324410200119 }, { "copy_logits_max": -2.8312931060791016, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.3125, "epoch": 1.6745979065611438, "gen_logits_max": 3.1842048168182373, "gen_logits_mean": -17.688907623291016, "gen_logits_min": -29.937416076660156, "gen_logits_std": 3.335926055908203, "gen_loss": 0.28195950388908386, "grad_norm": 0.3821028413546117, "learning_rate": 2.0573052631578946e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9963622987270355, "mean_gen_accuracy": 0.8765782117843628, "mean_token_accuracy": 0.9046540260314941, "num_tokens": 1139218097.0, "sample_num_tokens": 8566.75, "step": 8199, "total_num_tokens": 1139252364.0, "z_loss": 0.00042803536052815616 }, { "copy_logits_max": -4.529232025146484, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.875, "epoch": 1.6748021444983405, "gen_logits_max": 3.19016695022583, "gen_logits_mean": -18.109237670898438, "gen_logits_min": -29.815841674804688, "gen_logits_std": 3.2824673652648926, "gen_loss": 0.30726030468940735, "grad_norm": 0.42513037652821556, "learning_rate": 2.0571789473684214e-05, "loss": 0.305, "mean_copy_accuracy": 0.9971299767494202, "mean_gen_accuracy": 0.8652806878089905, "mean_token_accuracy": 0.8980601727962494, "num_tokens": 1139484293.0, "sample_num_tokens": 8287.75, "step": 8200, "total_num_tokens": 1139517444.0, "z_loss": 0.0004447671235539019 }, { "copy_logits_max": -7.496662616729736, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.6875, "epoch": 1.6750063824355375, "gen_logits_max": 4.933051109313965, "gen_logits_mean": -16.126049041748047, "gen_logits_min": -28.095008850097656, "gen_logits_std": 3.276693820953369, "gen_loss": 0.3085862398147583, "grad_norm": 0.3454804946198365, "learning_rate": 2.0570526315789475e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9962181895971298, "mean_gen_accuracy": 0.8760652542114258, "mean_token_accuracy": 0.9018762111663818, "num_tokens": 1139760384.0, "sample_num_tokens": 8137.5, "step": 8201, "total_num_tokens": 1139792934.0, "z_loss": 0.0004909965209662914 }, { "copy_logits_max": -5.293621063232422, "copy_logits_min": -687500032.0, "copy_num_tokens": 518.0, "epoch": 1.6752106203727344, "gen_logits_max": 3.362302780151367, "gen_logits_mean": -17.135560989379883, "gen_logits_min": -29.03005599975586, "gen_logits_std": 3.301090717315674, "gen_loss": 0.29111093282699585, "grad_norm": 0.3756164429590179, "learning_rate": 2.056926315789474e-05, "loss": 0.2908, "mean_copy_accuracy": 0.996251255273819, "mean_gen_accuracy": 0.8689730614423752, "mean_token_accuracy": 0.9014201611280441, "num_tokens": 1140045102.0, "sample_num_tokens": 8661.0, "step": 8202, "total_num_tokens": 1140079746.0, "z_loss": 0.0005036816583015025 }, { "copy_logits_max": -5.352579116821289, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.0625, "epoch": 1.675414858309931, "gen_logits_max": 3.1263539791107178, "gen_logits_mean": -18.3070125579834, "gen_logits_min": -30.49309539794922, "gen_logits_std": 3.3928284645080566, "gen_loss": 0.252685010433197, "grad_norm": 0.36369310972946767, "learning_rate": 2.0568e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9962771981954575, "mean_gen_accuracy": 0.8748842030763626, "mean_token_accuracy": 0.9053224623203278, "num_tokens": 1140313779.0, "sample_num_tokens": 8209.25, "step": 8203, "total_num_tokens": 1140346616.0, "z_loss": 0.00047715468099340796 }, { "copy_logits_max": -6.399452209472656, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.375, "epoch": 1.6756190962471278, "gen_logits_max": 3.606520175933838, "gen_logits_mean": -17.98524284362793, "gen_logits_min": -29.8812198638916, "gen_logits_std": 3.394861936569214, "gen_loss": 0.2687774896621704, "grad_norm": 0.34728512574811626, "learning_rate": 2.0566736842105265e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9973669946193695, "mean_gen_accuracy": 0.8829087913036346, "mean_token_accuracy": 0.9099635481834412, "num_tokens": 1140608537.0, "sample_num_tokens": 7538.25, "step": 8204, "total_num_tokens": 1140638690.0, "z_loss": 0.0004260519635863602 }, { "copy_logits_max": -5.0805463790893555, "copy_logits_min": -750000064.0, "copy_num_tokens": 447.75, "epoch": 1.6758233341843247, "gen_logits_max": 3.375894546508789, "gen_logits_mean": -16.804031372070312, "gen_logits_min": -29.086585998535156, "gen_logits_std": 3.327775239944458, "gen_loss": 0.2944020926952362, "grad_norm": 0.3440225121720731, "learning_rate": 2.0565473684210525e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9977137893438339, "mean_gen_accuracy": 0.877209261059761, "mean_token_accuracy": 0.9054411798715591, "num_tokens": 1140875534.0, "sample_num_tokens": 8583.0, "step": 8205, "total_num_tokens": 1140909866.0, "z_loss": 0.0004812777624465525 }, { "copy_logits_max": -5.982386589050293, "copy_logits_min": -687500032.0, "copy_num_tokens": 385.0625, "epoch": 1.6760275721215216, "gen_logits_max": 4.1131391525268555, "gen_logits_mean": -16.893268585205078, "gen_logits_min": -28.881879806518555, "gen_logits_std": 3.329723358154297, "gen_loss": 0.3116523027420044, "grad_norm": 0.4088486003041288, "learning_rate": 2.056421052631579e-05, "loss": 0.3113, "mean_copy_accuracy": 0.9968836754560471, "mean_gen_accuracy": 0.8652876168489456, "mean_token_accuracy": 0.896672710776329, "num_tokens": 1141140811.0, "sample_num_tokens": 7947.25, "step": 8206, "total_num_tokens": 1141172600.0, "z_loss": 0.00046081634354777634 }, { "copy_logits_max": -5.989382743835449, "copy_logits_min": -687500032.0, "copy_num_tokens": 446.75, "epoch": 1.6762318100587184, "gen_logits_max": 3.4882524013519287, "gen_logits_mean": -17.17770767211914, "gen_logits_min": -29.132638931274414, "gen_logits_std": 3.2840023040771484, "gen_loss": 0.2662197947502136, "grad_norm": 0.3719583011687447, "learning_rate": 2.056294736842105e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9963464885950089, "mean_gen_accuracy": 0.877746194601059, "mean_token_accuracy": 0.9079970866441727, "num_tokens": 1141419650.0, "sample_num_tokens": 8247.0, "step": 8207, "total_num_tokens": 1141452638.0, "z_loss": 0.000454217049991712 }, { "copy_logits_max": -6.5071611404418945, "copy_logits_min": -750000064.0, "copy_num_tokens": 416.3125, "epoch": 1.6764360479959153, "gen_logits_max": 3.5831596851348877, "gen_logits_mean": -17.57077407836914, "gen_logits_min": -29.40279769897461, "gen_logits_std": 3.3256750106811523, "gen_loss": 0.350823312997818, "grad_norm": 0.3618586369973775, "learning_rate": 2.056168421052632e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9972139298915863, "mean_gen_accuracy": 0.8669766038656235, "mean_token_accuracy": 0.9018429666757584, "num_tokens": 1141705103.0, "sample_num_tokens": 8570.75, "step": 8208, "total_num_tokens": 1141739386.0, "z_loss": 0.0005176309496164322 }, { "copy_logits_max": -3.236229181289673, "copy_logits_min": -750000064.0, "copy_num_tokens": 538.3125, "epoch": 1.6766402859331122, "gen_logits_max": 3.9417102336883545, "gen_logits_mean": -16.601253509521484, "gen_logits_min": -28.700790405273438, "gen_logits_std": 3.342337131500244, "gen_loss": 0.2519453167915344, "grad_norm": 0.38969753111722577, "learning_rate": 2.056042105263158e-05, "loss": 0.2753, "mean_copy_accuracy": 0.995681568980217, "mean_gen_accuracy": 0.8829984664916992, "mean_token_accuracy": 0.9063610881567001, "num_tokens": 1141953653.0, "sample_num_tokens": 9185.25, "step": 8209, "total_num_tokens": 1141990394.0, "z_loss": 0.0004337476857472211 }, { "copy_logits_max": -4.762872695922852, "copy_logits_min": -687500032.0, "copy_num_tokens": 342.5, "epoch": 1.676844523870309, "gen_logits_max": 4.66357946395874, "gen_logits_mean": -16.018142700195312, "gen_logits_min": -27.38323974609375, "gen_logits_std": 3.2457034587860107, "gen_loss": 0.2982012629508972, "grad_norm": 0.35605774844528226, "learning_rate": 2.0559157894736844e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9973982125520706, "mean_gen_accuracy": 0.8753195852041245, "mean_token_accuracy": 0.9043565392494202, "num_tokens": 1142233291.0, "sample_num_tokens": 8120.75, "step": 8210, "total_num_tokens": 1142265774.0, "z_loss": 0.00044067209819331765 }, { "copy_logits_max": -6.0508880615234375, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.5, "epoch": 1.6770487618075056, "gen_logits_max": 2.5094354152679443, "gen_logits_mean": -18.925689697265625, "gen_logits_min": -30.790721893310547, "gen_logits_std": 3.419879913330078, "gen_loss": 0.25064265727996826, "grad_norm": 0.3402825846494301, "learning_rate": 2.0557894736842105e-05, "loss": 0.249, "mean_copy_accuracy": 0.9975953549146652, "mean_gen_accuracy": 0.8849723786115646, "mean_token_accuracy": 0.9155685752630234, "num_tokens": 1142521253.0, "sample_num_tokens": 7757.25, "step": 8211, "total_num_tokens": 1142552282.0, "z_loss": 0.0003750511968974024 }, { "copy_logits_max": -5.186527252197266, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.375, "epoch": 1.6772529997447025, "gen_logits_max": 3.2915005683898926, "gen_logits_mean": -18.604429244995117, "gen_logits_min": -30.607393264770508, "gen_logits_std": 3.4002017974853516, "gen_loss": 0.2740895450115204, "grad_norm": 0.3760368123913748, "learning_rate": 2.055663157894737e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9967807084321976, "mean_gen_accuracy": 0.8795524537563324, "mean_token_accuracy": 0.9091554582118988, "num_tokens": 1142783328.0, "sample_num_tokens": 8370.5, "step": 8212, "total_num_tokens": 1142816810.0, "z_loss": 0.0004297445993870497 }, { "copy_logits_max": -1.5892373323440552, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.9375, "epoch": 1.6774572376818995, "gen_logits_max": 4.379636287689209, "gen_logits_mean": -15.743257522583008, "gen_logits_min": -28.636962890625, "gen_logits_std": 3.3276748657226562, "gen_loss": 0.2607932686805725, "grad_norm": 0.37293839555925606, "learning_rate": 2.055536842105263e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9963872134685516, "mean_gen_accuracy": 0.8802198767662048, "mean_token_accuracy": 0.9093039780855179, "num_tokens": 1143057029.0, "sample_num_tokens": 8814.75, "step": 8213, "total_num_tokens": 1143092288.0, "z_loss": 0.00041791642433963716 }, { "copy_logits_max": -3.704221725463867, "copy_logits_min": -750000000.0, "copy_num_tokens": 319.0625, "epoch": 1.6776614756190962, "gen_logits_max": 3.626127243041992, "gen_logits_mean": -17.917579650878906, "gen_logits_min": -29.885944366455078, "gen_logits_std": 3.3983349800109863, "gen_loss": 0.26729461550712585, "grad_norm": 0.38737320706169975, "learning_rate": 2.0554105263157894e-05, "loss": 0.2941, "mean_copy_accuracy": 0.996211051940918, "mean_gen_accuracy": 0.8733119368553162, "mean_token_accuracy": 0.9010954946279526, "num_tokens": 1143309612.0, "sample_num_tokens": 7373.5, "step": 8214, "total_num_tokens": 1143339106.0, "z_loss": 0.00041980674723163247 }, { "copy_logits_max": -2.4627280235290527, "copy_logits_min": -687500032.0, "copy_num_tokens": 461.9375, "epoch": 1.677865713556293, "gen_logits_max": 4.323683261871338, "gen_logits_mean": -16.951839447021484, "gen_logits_min": -29.25522804260254, "gen_logits_std": 3.3559975624084473, "gen_loss": 0.3047751188278198, "grad_norm": 0.32044903963042987, "learning_rate": 2.055284210526316e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9978064298629761, "mean_gen_accuracy": 0.8757492452859879, "mean_token_accuracy": 0.9084415137767792, "num_tokens": 1143615783.0, "sample_num_tokens": 9313.25, "step": 8215, "total_num_tokens": 1143653036.0, "z_loss": 0.0005386166158132255 }, { "copy_logits_max": -0.6501750946044922, "copy_logits_min": -625000064.0, "copy_num_tokens": 422.375, "epoch": 1.67806995149349, "gen_logits_max": 4.399524688720703, "gen_logits_mean": -16.376205444335938, "gen_logits_min": -29.370908737182617, "gen_logits_std": 3.353044271469116, "gen_loss": 0.2524702548980713, "grad_norm": 0.37686446986370903, "learning_rate": 2.0551578947368423e-05, "loss": 0.249, "mean_copy_accuracy": 0.9968479126691818, "mean_gen_accuracy": 0.8897161781787872, "mean_token_accuracy": 0.9158885031938553, "num_tokens": 1143898699.0, "sample_num_tokens": 8415.75, "step": 8216, "total_num_tokens": 1143932362.0, "z_loss": 0.0004536714986898005 }, { "copy_logits_max": -3.5611658096313477, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.5, "epoch": 1.6782741894306867, "gen_logits_max": 3.955726146697998, "gen_logits_mean": -16.928560256958008, "gen_logits_min": -29.35369873046875, "gen_logits_std": 3.338167667388916, "gen_loss": 0.32510218024253845, "grad_norm": 0.36841231348765296, "learning_rate": 2.0550315789473687e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9974068701267242, "mean_gen_accuracy": 0.8699475079774857, "mean_token_accuracy": 0.9021884649991989, "num_tokens": 1144175767.0, "sample_num_tokens": 7016.25, "step": 8217, "total_num_tokens": 1144203832.0, "z_loss": 0.0004868335381615907 }, { "copy_logits_max": -3.445422649383545, "copy_logits_min": -750000000.0, "copy_num_tokens": 654.25, "epoch": 1.6784784273678834, "gen_logits_max": 3.298029899597168, "gen_logits_mean": -16.782384872436523, "gen_logits_min": -29.83446502685547, "gen_logits_std": 3.392601490020752, "gen_loss": 0.21854162216186523, "grad_norm": 0.4164511449244847, "learning_rate": 2.0549052631578948e-05, "loss": 0.2739, "mean_copy_accuracy": 0.995651975274086, "mean_gen_accuracy": 0.8789428025484085, "mean_token_accuracy": 0.9085679650306702, "num_tokens": 1144444273.0, "sample_num_tokens": 9090.25, "step": 8218, "total_num_tokens": 1144480634.0, "z_loss": 0.00032679957803338766 }, { "copy_logits_max": -5.358447551727295, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.875, "epoch": 1.6786826653050804, "gen_logits_max": 4.3756818771362305, "gen_logits_mean": -15.661198616027832, "gen_logits_min": -27.93605613708496, "gen_logits_std": 3.2934086322784424, "gen_loss": 0.2765941023826599, "grad_norm": 0.39533840261367575, "learning_rate": 2.0547789473684212e-05, "loss": 0.2885, "mean_copy_accuracy": 0.9960623234510422, "mean_gen_accuracy": 0.8736267536878586, "mean_token_accuracy": 0.9010264128446579, "num_tokens": 1144713045.0, "sample_num_tokens": 7930.25, "step": 8219, "total_num_tokens": 1144744766.0, "z_loss": 0.00041830859845504165 }, { "copy_logits_max": -3.9582626819610596, "copy_logits_min": -687500032.0, "copy_num_tokens": 356.75, "epoch": 1.6788869032422773, "gen_logits_max": 5.4181952476501465, "gen_logits_mean": -14.768781661987305, "gen_logits_min": -27.284637451171875, "gen_logits_std": 3.2844440937042236, "gen_loss": 0.307125985622406, "grad_norm": 0.3592009883935108, "learning_rate": 2.0546526315789473e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9964887797832489, "mean_gen_accuracy": 0.8759189993143082, "mean_token_accuracy": 0.9039999097585678, "num_tokens": 1144971413.0, "sample_num_tokens": 8382.25, "step": 8220, "total_num_tokens": 1145004942.0, "z_loss": 0.0004746161284856498 }, { "copy_logits_max": -5.289966583251953, "copy_logits_min": -750000000.0, "copy_num_tokens": 311.8125, "epoch": 1.679091141179474, "gen_logits_max": 4.431057929992676, "gen_logits_mean": -16.870418548583984, "gen_logits_min": -29.100223541259766, "gen_logits_std": 3.3444981575012207, "gen_loss": 0.3157905340194702, "grad_norm": 0.43284269392837943, "learning_rate": 2.0545263157894738e-05, "loss": 0.3005, "mean_copy_accuracy": 0.9958347082138062, "mean_gen_accuracy": 0.8716518878936768, "mean_token_accuracy": 0.8977543860673904, "num_tokens": 1145216650.0, "sample_num_tokens": 7307.0, "step": 8221, "total_num_tokens": 1145245878.0, "z_loss": 0.00048092924407683313 }, { "copy_logits_max": -2.8280727863311768, "copy_logits_min": -687500032.0, "copy_num_tokens": 617.75, "epoch": 1.679295379116671, "gen_logits_max": 3.8910982608795166, "gen_logits_mean": -15.685519218444824, "gen_logits_min": -28.37997055053711, "gen_logits_std": 3.3216421604156494, "gen_loss": 0.2580656111240387, "grad_norm": 0.3937050776996021, "learning_rate": 2.0544e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9970925152301788, "mean_gen_accuracy": 0.8806215822696686, "mean_token_accuracy": 0.9091160446405411, "num_tokens": 1145470566.0, "sample_num_tokens": 8862.0, "step": 8222, "total_num_tokens": 1145506014.0, "z_loss": 0.00044781563337892294 }, { "copy_logits_max": -5.203220367431641, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.3125, "epoch": 1.6794996170538679, "gen_logits_max": 3.0858616828918457, "gen_logits_mean": -18.447093963623047, "gen_logits_min": -30.85993194580078, "gen_logits_std": 3.422086238861084, "gen_loss": 0.25740453600883484, "grad_norm": 0.35704555341635674, "learning_rate": 2.0542736842105263e-05, "loss": 0.2609, "mean_copy_accuracy": 0.996637687087059, "mean_gen_accuracy": 0.8753412663936615, "mean_token_accuracy": 0.9110030978918076, "num_tokens": 1145759886.0, "sample_num_tokens": 7618.5, "step": 8223, "total_num_tokens": 1145790360.0, "z_loss": 0.00042329845018684864 }, { "copy_logits_max": -2.807171583175659, "copy_logits_min": -750000064.0, "copy_num_tokens": 560.4375, "epoch": 1.6797038549910646, "gen_logits_max": 4.9844207763671875, "gen_logits_mean": -15.144791603088379, "gen_logits_min": -27.296260833740234, "gen_logits_std": 3.2944791316986084, "gen_loss": 0.26304298639297485, "grad_norm": 0.3938069721984945, "learning_rate": 2.0541473684210527e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9967458099126816, "mean_gen_accuracy": 0.8801734298467636, "mean_token_accuracy": 0.9088018983602524, "num_tokens": 1146009403.0, "sample_num_tokens": 8869.75, "step": 8224, "total_num_tokens": 1146044882.0, "z_loss": 0.00044524591066874564 }, { "copy_logits_max": -5.6146345138549805, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.875, "epoch": 1.6799080929282613, "gen_logits_max": 4.06240177154541, "gen_logits_mean": -16.824230194091797, "gen_logits_min": -28.933597564697266, "gen_logits_std": 3.318521499633789, "gen_loss": 0.3029966652393341, "grad_norm": 0.35354597238704066, "learning_rate": 2.054021052631579e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9970007240772247, "mean_gen_accuracy": 0.8819511085748672, "mean_token_accuracy": 0.9106007963418961, "num_tokens": 1146297907.0, "sample_num_tokens": 6949.75, "step": 8225, "total_num_tokens": 1146325706.0, "z_loss": 0.00045260094339028 }, { "copy_logits_max": -4.9836506843566895, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.0625, "epoch": 1.6801123308654584, "gen_logits_max": 4.1603546142578125, "gen_logits_mean": -16.80689811706543, "gen_logits_min": -29.044208526611328, "gen_logits_std": 3.3477623462677, "gen_loss": 0.26852428913116455, "grad_norm": 0.34465570538822654, "learning_rate": 2.0538947368421052e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9973713010549545, "mean_gen_accuracy": 0.8775710910558701, "mean_token_accuracy": 0.910201832652092, "num_tokens": 1146596457.0, "sample_num_tokens": 8215.75, "step": 8226, "total_num_tokens": 1146629320.0, "z_loss": 0.0004912345902994275 }, { "copy_logits_max": -2.756845235824585, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.75, "epoch": 1.6803165688026551, "gen_logits_max": 3.868504524230957, "gen_logits_mean": -16.666501998901367, "gen_logits_min": -28.995784759521484, "gen_logits_std": 3.339580535888672, "gen_loss": 0.30657899379730225, "grad_norm": 0.37756123756132043, "learning_rate": 2.0537684210526317e-05, "loss": 0.2926, "mean_copy_accuracy": 0.9972324520349503, "mean_gen_accuracy": 0.8649787455797195, "mean_token_accuracy": 0.9003821164369583, "num_tokens": 1146876461.0, "sample_num_tokens": 8230.25, "step": 8227, "total_num_tokens": 1146909382.0, "z_loss": 0.0005222512409090996 }, { "copy_logits_max": -6.752532958984375, "copy_logits_min": -687500032.0, "copy_num_tokens": 393.6875, "epoch": 1.6805208067398518, "gen_logits_max": 4.102501392364502, "gen_logits_mean": -17.282197952270508, "gen_logits_min": -29.515106201171875, "gen_logits_std": 3.3709969520568848, "gen_loss": 0.2665109634399414, "grad_norm": 0.35544002896514365, "learning_rate": 2.053642105263158e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9966685175895691, "mean_gen_accuracy": 0.8787999302148819, "mean_token_accuracy": 0.9099352806806564, "num_tokens": 1147147976.0, "sample_num_tokens": 8320.0, "step": 8228, "total_num_tokens": 1147181256.0, "z_loss": 0.00043357431422919035 }, { "copy_logits_max": -3.6604137420654297, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.125, "epoch": 1.6807250446770488, "gen_logits_max": 4.858090400695801, "gen_logits_mean": -14.99901008605957, "gen_logits_min": -27.517967224121094, "gen_logits_std": 3.284580707550049, "gen_loss": 0.30400350689888, "grad_norm": 0.36691355146187105, "learning_rate": 2.0535157894736842e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9964759796857834, "mean_gen_accuracy": 0.876231238245964, "mean_token_accuracy": 0.907933235168457, "num_tokens": 1147406461.0, "sample_num_tokens": 8438.75, "step": 8229, "total_num_tokens": 1147440216.0, "z_loss": 0.0005354449385777116 }, { "copy_logits_max": -4.461588382720947, "copy_logits_min": -750000064.0, "copy_num_tokens": 671.5625, "epoch": 1.6809292826142457, "gen_logits_max": 2.7867767810821533, "gen_logits_mean": -17.467735290527344, "gen_logits_min": -29.495101928710938, "gen_logits_std": 3.352689027786255, "gen_loss": 0.2718534469604492, "grad_norm": 0.37506230164675597, "learning_rate": 2.0533894736842106e-05, "loss": 0.271, "mean_copy_accuracy": 0.9974234104156494, "mean_gen_accuracy": 0.874852791428566, "mean_token_accuracy": 0.9096558541059494, "num_tokens": 1147700692.0, "sample_num_tokens": 10051.5, "step": 8230, "total_num_tokens": 1147740898.0, "z_loss": 0.0004593189514707774 }, { "copy_logits_max": -1.7858154773712158, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.125, "epoch": 1.6811335205514424, "gen_logits_max": 3.3109943866729736, "gen_logits_mean": -17.750551223754883, "gen_logits_min": -30.240161895751953, "gen_logits_std": 3.365006923675537, "gen_loss": 0.29062116146087646, "grad_norm": 0.3502990901732234, "learning_rate": 2.0532631578947367e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9969349205493927, "mean_gen_accuracy": 0.8768039047718048, "mean_token_accuracy": 0.9071451276540756, "num_tokens": 1147979675.0, "sample_num_tokens": 8200.25, "step": 8231, "total_num_tokens": 1148012476.0, "z_loss": 0.000503353716339916 }, { "copy_logits_max": -6.022306442260742, "copy_logits_min": -750000128.0, "copy_num_tokens": 551.125, "epoch": 1.6813377584886393, "gen_logits_max": 4.211860656738281, "gen_logits_mean": -16.76108169555664, "gen_logits_min": -29.240867614746094, "gen_logits_std": 3.325489044189453, "gen_loss": 0.3034801483154297, "grad_norm": 0.3773309203191006, "learning_rate": 2.0531368421052635e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9976091235876083, "mean_gen_accuracy": 0.8739330470561981, "mean_token_accuracy": 0.9065361768007278, "num_tokens": 1148257603.0, "sample_num_tokens": 9758.25, "step": 8232, "total_num_tokens": 1148296636.0, "z_loss": 0.0005576441762968898 }, { "copy_logits_max": -4.154462814331055, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.125, "epoch": 1.6815419964258362, "gen_logits_max": 4.361343860626221, "gen_logits_mean": -16.607772827148438, "gen_logits_min": -28.5582275390625, "gen_logits_std": 3.3186941146850586, "gen_loss": 0.30335497856140137, "grad_norm": 0.34605081010500927, "learning_rate": 2.0530105263157896e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9970305860042572, "mean_gen_accuracy": 0.8779721558094025, "mean_token_accuracy": 0.904809445142746, "num_tokens": 1148519123.0, "sample_num_tokens": 7765.25, "step": 8233, "total_num_tokens": 1148550184.0, "z_loss": 0.0005374327884055674 }, { "copy_logits_max": -2.132355213165283, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.3125, "epoch": 1.681746234363033, "gen_logits_max": 2.8851099014282227, "gen_logits_mean": -17.88911247253418, "gen_logits_min": -30.264373779296875, "gen_logits_std": 3.4059391021728516, "gen_loss": 0.28898805379867554, "grad_norm": 0.3694179762060236, "learning_rate": 2.052884210526316e-05, "loss": 0.272, "mean_copy_accuracy": 0.9961500316858292, "mean_gen_accuracy": 0.8807254433631897, "mean_token_accuracy": 0.9078763723373413, "num_tokens": 1148764362.0, "sample_num_tokens": 8305.5, "step": 8234, "total_num_tokens": 1148797584.0, "z_loss": 0.0004584955458994955 }, { "copy_logits_max": -7.5191779136657715, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.3125, "epoch": 1.6819504723002296, "gen_logits_max": 4.401108741760254, "gen_logits_mean": -16.319393157958984, "gen_logits_min": -28.190059661865234, "gen_logits_std": 3.2889459133148193, "gen_loss": 0.29592129588127136, "grad_norm": 0.379975651675776, "learning_rate": 2.052757894736842e-05, "loss": 0.2892, "mean_copy_accuracy": 0.9962074011564255, "mean_gen_accuracy": 0.875076562166214, "mean_token_accuracy": 0.901453286409378, "num_tokens": 1149021906.0, "sample_num_tokens": 7802.0, "step": 8235, "total_num_tokens": 1149053114.0, "z_loss": 0.0004559046064969152 }, { "copy_logits_max": -4.2854509353637695, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.75, "epoch": 1.6821547102374266, "gen_logits_max": 3.0983996391296387, "gen_logits_mean": -18.355262756347656, "gen_logits_min": -30.517391204833984, "gen_logits_std": 3.4056801795959473, "gen_loss": 0.27007195353507996, "grad_norm": 0.3799695125831794, "learning_rate": 2.0526315789473685e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9961453825235367, "mean_gen_accuracy": 0.8774641305208206, "mean_token_accuracy": 0.9048268646001816, "num_tokens": 1149277459.0, "sample_num_tokens": 9306.75, "step": 8236, "total_num_tokens": 1149314686.0, "z_loss": 0.0004479558556340635 }, { "copy_logits_max": -6.040467262268066, "copy_logits_min": -687500032.0, "copy_num_tokens": 318.5, "epoch": 1.6823589481746235, "gen_logits_max": 3.4543581008911133, "gen_logits_mean": -17.71908950805664, "gen_logits_min": -29.578907012939453, "gen_logits_std": 3.3515286445617676, "gen_loss": 0.2859247326850891, "grad_norm": 0.3498425262414907, "learning_rate": 2.0525052631578946e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9973855018615723, "mean_gen_accuracy": 0.8792091906070709, "mean_token_accuracy": 0.9085994958877563, "num_tokens": 1149554687.0, "sample_num_tokens": 6983.75, "step": 8237, "total_num_tokens": 1149582622.0, "z_loss": 0.00047639288823120296 }, { "copy_logits_max": -4.612471103668213, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.875, "epoch": 1.6825631861118202, "gen_logits_max": 3.8250508308410645, "gen_logits_mean": -16.47529411315918, "gen_logits_min": -29.067611694335938, "gen_logits_std": 3.3642215728759766, "gen_loss": 0.22046248614788055, "grad_norm": 0.3856675708588482, "learning_rate": 2.052378947368421e-05, "loss": 0.268, "mean_copy_accuracy": 0.9961224049329758, "mean_gen_accuracy": 0.8836730718612671, "mean_token_accuracy": 0.9090800881385803, "num_tokens": 1149816919.0, "sample_num_tokens": 9927.25, "step": 8238, "total_num_tokens": 1149856628.0, "z_loss": 0.0003875692200381309 }, { "copy_logits_max": -3.9625871181488037, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.8125, "epoch": 1.6827674240490171, "gen_logits_max": 2.7296857833862305, "gen_logits_mean": -17.698444366455078, "gen_logits_min": -29.7542781829834, "gen_logits_std": 3.3546321392059326, "gen_loss": 0.28481224179267883, "grad_norm": 0.34973131127475277, "learning_rate": 2.052252631578947e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9981088042259216, "mean_gen_accuracy": 0.8691748976707458, "mean_token_accuracy": 0.9055671095848083, "num_tokens": 1150106369.0, "sample_num_tokens": 9133.75, "step": 8239, "total_num_tokens": 1150142904.0, "z_loss": 0.0004572385805658996 }, { "copy_logits_max": -6.175239562988281, "copy_logits_min": -750000064.0, "copy_num_tokens": 348.625, "epoch": 1.682971661986214, "gen_logits_max": 4.156391620635986, "gen_logits_mean": -16.91656494140625, "gen_logits_min": -29.131805419921875, "gen_logits_std": 3.3424293994903564, "gen_loss": 0.2813687324523926, "grad_norm": 0.31981360507321654, "learning_rate": 2.0521263157894736e-05, "loss": 0.2551, "mean_copy_accuracy": 0.9966040849685669, "mean_gen_accuracy": 0.8839512914419174, "mean_token_accuracy": 0.912318617105484, "num_tokens": 1150391540.0, "sample_num_tokens": 7886.5, "step": 8240, "total_num_tokens": 1150423086.0, "z_loss": 0.00045580242294818163 }, { "copy_logits_max": -5.554155349731445, "copy_logits_min": -750000000.0, "copy_num_tokens": 599.625, "epoch": 1.6831758999234108, "gen_logits_max": 3.23746657371521, "gen_logits_mean": -16.350482940673828, "gen_logits_min": -28.774662017822266, "gen_logits_std": 3.351710796356201, "gen_loss": 0.27108365297317505, "grad_norm": 0.3434172841387237, "learning_rate": 2.0520000000000003e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9974627494812012, "mean_gen_accuracy": 0.8753017783164978, "mean_token_accuracy": 0.9095466434955597, "num_tokens": 1150678631.0, "sample_num_tokens": 8948.25, "step": 8241, "total_num_tokens": 1150714424.0, "z_loss": 0.00040270877070724964 }, { "copy_logits_max": -4.722699165344238, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.0, "epoch": 1.6833801378606075, "gen_logits_max": 3.0322864055633545, "gen_logits_mean": -17.492557525634766, "gen_logits_min": -29.729616165161133, "gen_logits_std": 3.389514684677124, "gen_loss": 0.2727358341217041, "grad_norm": 0.3707554607032796, "learning_rate": 2.0518736842105264e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9967127442359924, "mean_gen_accuracy": 0.8737427890300751, "mean_token_accuracy": 0.9031244665384293, "num_tokens": 1150950447.0, "sample_num_tokens": 8087.25, "step": 8242, "total_num_tokens": 1150982796.0, "z_loss": 0.0004492606967687607 }, { "copy_logits_max": -6.312248229980469, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.3125, "epoch": 1.6835843757978044, "gen_logits_max": 3.01225209236145, "gen_logits_mean": -18.184877395629883, "gen_logits_min": -30.138992309570312, "gen_logits_std": 3.3710343837738037, "gen_loss": 0.2596902847290039, "grad_norm": 0.3488056441356924, "learning_rate": 2.051747368421053e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9966086000204086, "mean_gen_accuracy": 0.882075697183609, "mean_token_accuracy": 0.9133685380220413, "num_tokens": 1151231683.0, "sample_num_tokens": 8977.75, "step": 8243, "total_num_tokens": 1151267594.0, "z_loss": 0.0004534173058345914 }, { "copy_logits_max": -6.038987636566162, "copy_logits_min": -687500032.0, "copy_num_tokens": 325.5, "epoch": 1.6837886137350013, "gen_logits_max": 4.018481731414795, "gen_logits_mean": -16.69774627685547, "gen_logits_min": -29.15740394592285, "gen_logits_std": 3.3100039958953857, "gen_loss": 0.3113436698913574, "grad_norm": 0.3598178678570097, "learning_rate": 2.051621052631579e-05, "loss": 0.2759, "mean_copy_accuracy": 0.996527224779129, "mean_gen_accuracy": 0.8762057423591614, "mean_token_accuracy": 0.9036523252725601, "num_tokens": 1151517397.0, "sample_num_tokens": 7063.75, "step": 8244, "total_num_tokens": 1151545652.0, "z_loss": 0.000517861801199615 }, { "copy_logits_max": -5.931985855102539, "copy_logits_min": -687500032.0, "copy_num_tokens": 674.0625, "epoch": 1.683992851672198, "gen_logits_max": 3.363982915878296, "gen_logits_mean": -16.930601119995117, "gen_logits_min": -29.533279418945312, "gen_logits_std": 3.3885879516601562, "gen_loss": 0.24746502935886383, "grad_norm": 0.3434869824518476, "learning_rate": 2.0514947368421054e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9971566200256348, "mean_gen_accuracy": 0.8784917891025543, "mean_token_accuracy": 0.9118087291717529, "num_tokens": 1151821803.0, "sample_num_tokens": 10372.25, "step": 8245, "total_num_tokens": 1151863292.0, "z_loss": 0.00040704733692109585 }, { "copy_logits_max": -5.209617614746094, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.5625, "epoch": 1.684197089609395, "gen_logits_max": 3.285053253173828, "gen_logits_mean": -17.481037139892578, "gen_logits_min": -29.884281158447266, "gen_logits_std": 3.3866395950317383, "gen_loss": 0.2769877314567566, "grad_norm": 0.3106234860238174, "learning_rate": 2.0513684210526315e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9966592490673065, "mean_gen_accuracy": 0.8852586448192596, "mean_token_accuracy": 0.9106758683919907, "num_tokens": 1152106060.0, "sample_num_tokens": 9581.5, "step": 8246, "total_num_tokens": 1152144386.0, "z_loss": 0.0004893805598840117 }, { "copy_logits_max": -5.873089790344238, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.125, "epoch": 1.6844013275465919, "gen_logits_max": 3.1332316398620605, "gen_logits_mean": -17.694538116455078, "gen_logits_min": -30.16506576538086, "gen_logits_std": 3.393721580505371, "gen_loss": 0.2699483335018158, "grad_norm": 0.33130656558987975, "learning_rate": 2.051242105263158e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9978746920824051, "mean_gen_accuracy": 0.8775904327630997, "mean_token_accuracy": 0.9077795743942261, "num_tokens": 1152386408.0, "sample_num_tokens": 7544.0, "step": 8247, "total_num_tokens": 1152416584.0, "z_loss": 0.00047206919407472014 }, { "copy_logits_max": -5.771336078643799, "copy_logits_min": -750000000.0, "copy_num_tokens": 667.8125, "epoch": 1.6846055654837886, "gen_logits_max": 2.745030403137207, "gen_logits_mean": -17.464458465576172, "gen_logits_min": -30.31739044189453, "gen_logits_std": 3.408362865447998, "gen_loss": 0.2994600534439087, "grad_norm": 0.38494787216309684, "learning_rate": 2.051115789473684e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9978061020374298, "mean_gen_accuracy": 0.8739073127508163, "mean_token_accuracy": 0.9059126973152161, "num_tokens": 1152656110.0, "sample_num_tokens": 9794.0, "step": 8248, "total_num_tokens": 1152695286.0, "z_loss": 0.00046730315079912543 }, { "copy_logits_max": -7.7528486251831055, "copy_logits_min": -750000064.0, "copy_num_tokens": 539.375, "epoch": 1.6848098034209853, "gen_logits_max": 4.0952372550964355, "gen_logits_mean": -15.595852851867676, "gen_logits_min": -27.060142517089844, "gen_logits_std": 3.1479318141937256, "gen_loss": 0.31312528252601624, "grad_norm": 0.34765788432768396, "learning_rate": 2.0509894736842108e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9970472455024719, "mean_gen_accuracy": 0.8787122964859009, "mean_token_accuracy": 0.9061229228973389, "num_tokens": 1152946664.0, "sample_num_tokens": 10805.0, "step": 8249, "total_num_tokens": 1152989884.0, "z_loss": 0.0004793088883161545 }, { "copy_logits_max": -6.201194763183594, "copy_logits_min": -687500032.0, "copy_num_tokens": 585.0, "epoch": 1.6850140413581824, "gen_logits_max": 3.118342161178589, "gen_logits_mean": -16.951778411865234, "gen_logits_min": -29.17498207092285, "gen_logits_std": 3.3053231239318848, "gen_loss": 0.25512102246284485, "grad_norm": 0.3738174506278595, "learning_rate": 2.050863157894737e-05, "loss": 0.2818, "mean_copy_accuracy": 0.996138408780098, "mean_gen_accuracy": 0.8802371770143509, "mean_token_accuracy": 0.9053909778594971, "num_tokens": 1153211064.0, "sample_num_tokens": 8698.5, "step": 8250, "total_num_tokens": 1153245858.0, "z_loss": 0.0003884859324898571 }, { "copy_logits_max": -6.778848648071289, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.3125, "epoch": 1.6852182792953792, "gen_logits_max": 3.544248104095459, "gen_logits_mean": -17.15228271484375, "gen_logits_min": -29.23662567138672, "gen_logits_std": 3.3157289028167725, "gen_loss": 0.30768883228302, "grad_norm": 0.33194757991127205, "learning_rate": 2.0507368421052633e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9970546364784241, "mean_gen_accuracy": 0.8820671737194061, "mean_token_accuracy": 0.9073287695646286, "num_tokens": 1153468159.0, "sample_num_tokens": 7720.75, "step": 8251, "total_num_tokens": 1153499042.0, "z_loss": 0.0004027341492474079 }, { "copy_logits_max": -7.837837219238281, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.6875, "epoch": 1.6854225172325759, "gen_logits_max": 3.7603139877319336, "gen_logits_mean": -18.092830657958984, "gen_logits_min": -29.828815460205078, "gen_logits_std": 3.353400468826294, "gen_loss": 0.30986732244491577, "grad_norm": 0.38044319964046064, "learning_rate": 2.0506105263157894e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9966374486684799, "mean_gen_accuracy": 0.8768058121204376, "mean_token_accuracy": 0.9031434506177902, "num_tokens": 1153721034.0, "sample_num_tokens": 6879.0, "step": 8252, "total_num_tokens": 1153748550.0, "z_loss": 0.0004164597485214472 }, { "copy_logits_max": -8.357771873474121, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.25, "epoch": 1.6856267551697728, "gen_logits_max": 4.075425148010254, "gen_logits_mean": -16.404457092285156, "gen_logits_min": -28.745445251464844, "gen_logits_std": 3.2914085388183594, "gen_loss": 0.271287202835083, "grad_norm": 0.35123676802789067, "learning_rate": 2.0504842105263158e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9972671270370483, "mean_gen_accuracy": 0.8808192759752274, "mean_token_accuracy": 0.9102924317121506, "num_tokens": 1153980992.0, "sample_num_tokens": 8315.5, "step": 8253, "total_num_tokens": 1154014254.0, "z_loss": 0.0003888513892889023 }, { "copy_logits_max": -6.471789360046387, "copy_logits_min": -750000000.0, "copy_num_tokens": 423.5, "epoch": 1.6858309931069697, "gen_logits_max": 3.49519419670105, "gen_logits_mean": -17.672237396240234, "gen_logits_min": -29.84813690185547, "gen_logits_std": 3.3530430793762207, "gen_loss": 0.27764779329299927, "grad_norm": 0.33306646093168635, "learning_rate": 2.0503578947368423e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9965074211359024, "mean_gen_accuracy": 0.8836212754249573, "mean_token_accuracy": 0.9113707095384598, "num_tokens": 1154269002.0, "sample_num_tokens": 8440.0, "step": 8254, "total_num_tokens": 1154302762.0, "z_loss": 0.00042490160558372736 }, { "copy_logits_max": -5.7988457679748535, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.375, "epoch": 1.6860352310441664, "gen_logits_max": 3.4002556800842285, "gen_logits_mean": -17.083202362060547, "gen_logits_min": -29.69957733154297, "gen_logits_std": 3.3599276542663574, "gen_loss": 0.2712518572807312, "grad_norm": 0.3345476346206954, "learning_rate": 2.0502315789473683e-05, "loss": 0.26, "mean_copy_accuracy": 0.9971601217985153, "mean_gen_accuracy": 0.8803713768720627, "mean_token_accuracy": 0.911362886428833, "num_tokens": 1154570210.0, "sample_num_tokens": 8498.5, "step": 8255, "total_num_tokens": 1154604204.0, "z_loss": 0.0004061806248500943 }, { "copy_logits_max": -4.338284969329834, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.8125, "epoch": 1.6862394689813633, "gen_logits_max": 3.800966739654541, "gen_logits_mean": -16.487377166748047, "gen_logits_min": -28.568164825439453, "gen_logits_std": 3.29793381690979, "gen_loss": 0.27187877893447876, "grad_norm": 0.3246002069126473, "learning_rate": 2.0501052631578948e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9977282136678696, "mean_gen_accuracy": 0.8767107725143433, "mean_token_accuracy": 0.9079241752624512, "num_tokens": 1154842286.0, "sample_num_tokens": 8078.5, "step": 8256, "total_num_tokens": 1154874600.0, "z_loss": 0.0003889403014909476 }, { "copy_logits_max": -6.135512351989746, "copy_logits_min": -750000064.0, "copy_num_tokens": 364.25, "epoch": 1.6864437069185603, "gen_logits_max": 4.4082489013671875, "gen_logits_mean": -16.69835662841797, "gen_logits_min": -29.179513931274414, "gen_logits_std": 3.332014560699463, "gen_loss": 0.286316841840744, "grad_norm": 0.3448597582661139, "learning_rate": 2.0499789473684212e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9963324069976807, "mean_gen_accuracy": 0.8785802870988846, "mean_token_accuracy": 0.9050611704587936, "num_tokens": 1155100086.0, "sample_num_tokens": 8124.5, "step": 8257, "total_num_tokens": 1155132584.0, "z_loss": 0.0004167259903624654 }, { "copy_logits_max": -5.5407819747924805, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.8125, "epoch": 1.686647944855757, "gen_logits_max": 5.207796096801758, "gen_logits_mean": -15.614527702331543, "gen_logits_min": -27.34300994873047, "gen_logits_std": 3.20284366607666, "gen_loss": 0.3044891059398651, "grad_norm": 0.3735463291270551, "learning_rate": 2.0498526315789476e-05, "loss": 0.2891, "mean_copy_accuracy": 0.9966520816087723, "mean_gen_accuracy": 0.8720084726810455, "mean_token_accuracy": 0.9002201557159424, "num_tokens": 1155352373.0, "sample_num_tokens": 6798.75, "step": 8258, "total_num_tokens": 1155379568.0, "z_loss": 0.0004516730550676584 }, { "copy_logits_max": -5.938749313354492, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.0, "epoch": 1.6868521827929537, "gen_logits_max": 4.029515266418457, "gen_logits_mean": -17.278087615966797, "gen_logits_min": -29.471712112426758, "gen_logits_std": 3.351501226425171, "gen_loss": 0.23853078484535217, "grad_norm": 0.3505980718915301, "learning_rate": 2.0497263157894737e-05, "loss": 0.2567, "mean_copy_accuracy": 0.9960642158985138, "mean_gen_accuracy": 0.8849174380302429, "mean_token_accuracy": 0.9121348410844803, "num_tokens": 1155601462.0, "sample_num_tokens": 8061.0, "step": 8259, "total_num_tokens": 1155633706.0, "z_loss": 0.00037905131466686726 }, { "copy_logits_max": -6.620524883270264, "copy_logits_min": -750000128.0, "copy_num_tokens": 505.875, "epoch": 1.6870564207301506, "gen_logits_max": 3.0452754497528076, "gen_logits_mean": -17.5588436126709, "gen_logits_min": -29.582862854003906, "gen_logits_std": 3.359335422515869, "gen_loss": 0.2804790139198303, "grad_norm": 0.42276398009941757, "learning_rate": 2.0496e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9969499111175537, "mean_gen_accuracy": 0.8716858625411987, "mean_token_accuracy": 0.9065219461917877, "num_tokens": 1155861965.0, "sample_num_tokens": 8238.25, "step": 8260, "total_num_tokens": 1155894918.0, "z_loss": 0.000436953705502674 }, { "copy_logits_max": -5.819127559661865, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.25, "epoch": 1.6872606586673475, "gen_logits_max": 3.5592124462127686, "gen_logits_mean": -16.910266876220703, "gen_logits_min": -29.180042266845703, "gen_logits_std": 3.3117876052856445, "gen_loss": 0.29647737741470337, "grad_norm": 0.34515624349359114, "learning_rate": 2.0494736842105263e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9977991580963135, "mean_gen_accuracy": 0.8690039366483688, "mean_token_accuracy": 0.9047453701496124, "num_tokens": 1156149534.0, "sample_num_tokens": 7584.5, "step": 8261, "total_num_tokens": 1156179872.0, "z_loss": 0.0004956746706739068 }, { "copy_logits_max": -7.794872760772705, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.125, "epoch": 1.6874648966045442, "gen_logits_max": 4.033450603485107, "gen_logits_mean": -17.201967239379883, "gen_logits_min": -28.94855308532715, "gen_logits_std": 3.3030810356140137, "gen_loss": 0.3333060145378113, "grad_norm": 0.35192765823684885, "learning_rate": 2.0493473684210527e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9972844272851944, "mean_gen_accuracy": 0.8704774081707001, "mean_token_accuracy": 0.9018224477767944, "num_tokens": 1156417527.0, "sample_num_tokens": 8544.75, "step": 8262, "total_num_tokens": 1156451706.0, "z_loss": 0.0004955604090355337 }, { "copy_logits_max": -9.236878395080566, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.0625, "epoch": 1.6876691345417412, "gen_logits_max": 4.022873401641846, "gen_logits_mean": -18.2282772064209, "gen_logits_min": -30.354251861572266, "gen_logits_std": 3.3747265338897705, "gen_loss": 0.29331016540527344, "grad_norm": 0.36305400859721304, "learning_rate": 2.0492210526315788e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9961292445659637, "mean_gen_accuracy": 0.8789587169885635, "mean_token_accuracy": 0.9014025181531906, "num_tokens": 1156693962.0, "sample_num_tokens": 8574.0, "step": 8263, "total_num_tokens": 1156728258.0, "z_loss": 0.00045788497664034367 }, { "copy_logits_max": -6.058593273162842, "copy_logits_min": -750000000.0, "copy_num_tokens": 604.8125, "epoch": 1.687873372478938, "gen_logits_max": 2.6115567684173584, "gen_logits_mean": -16.835002899169922, "gen_logits_min": -29.572433471679688, "gen_logits_std": 3.3005223274230957, "gen_loss": 0.2565385103225708, "grad_norm": 0.35097105691545866, "learning_rate": 2.0490947368421052e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9963878095149994, "mean_gen_accuracy": 0.8764912039041519, "mean_token_accuracy": 0.905920997262001, "num_tokens": 1156970810.0, "sample_num_tokens": 10185.5, "step": 8264, "total_num_tokens": 1157011552.0, "z_loss": 0.0004547174903564155 }, { "copy_logits_max": -5.655115604400635, "copy_logits_min": -687500032.0, "copy_num_tokens": 411.25, "epoch": 1.6880776104161348, "gen_logits_max": 3.2875468730926514, "gen_logits_mean": -16.96984100341797, "gen_logits_min": -28.967384338378906, "gen_logits_std": 3.2615206241607666, "gen_loss": 0.2848731577396393, "grad_norm": 0.33770949934266914, "learning_rate": 2.0489684210526316e-05, "loss": 0.287, "mean_copy_accuracy": 0.9972182065248489, "mean_gen_accuracy": 0.8726030737161636, "mean_token_accuracy": 0.9023586213588715, "num_tokens": 1157248072.0, "sample_num_tokens": 7616.0, "step": 8265, "total_num_tokens": 1157278536.0, "z_loss": 0.0004813002306036651 }, { "copy_logits_max": -7.492129325866699, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.6875, "epoch": 1.6882818483533315, "gen_logits_max": 4.7445478439331055, "gen_logits_mean": -16.157703399658203, "gen_logits_min": -27.882829666137695, "gen_logits_std": 3.2315449714660645, "gen_loss": 0.28330761194229126, "grad_norm": 0.3545569265542532, "learning_rate": 2.048842105263158e-05, "loss": 0.266, "mean_copy_accuracy": 0.9974149167537689, "mean_gen_accuracy": 0.8807280212640762, "mean_token_accuracy": 0.9089561551809311, "num_tokens": 1157495169.0, "sample_num_tokens": 7978.75, "step": 8266, "total_num_tokens": 1157527084.0, "z_loss": 0.0005332991713657975 }, { "copy_logits_max": -6.503146648406982, "copy_logits_min": -687500032.0, "copy_num_tokens": 580.75, "epoch": 1.6884860862905284, "gen_logits_max": 2.6474363803863525, "gen_logits_mean": -17.62091064453125, "gen_logits_min": -29.427902221679688, "gen_logits_std": 3.2913734912872314, "gen_loss": 0.2532115578651428, "grad_norm": 0.3850042004492342, "learning_rate": 2.048715789473684e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9974546134471893, "mean_gen_accuracy": 0.8770150244235992, "mean_token_accuracy": 0.9055308848619461, "num_tokens": 1157752882.0, "sample_num_tokens": 9027.5, "step": 8267, "total_num_tokens": 1157788992.0, "z_loss": 0.0004535545303951949 }, { "copy_logits_max": -7.782057762145996, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.125, "epoch": 1.6886903242277254, "gen_logits_max": 3.1723055839538574, "gen_logits_mean": -16.21562385559082, "gen_logits_min": -28.163497924804688, "gen_logits_std": 3.2725555896759033, "gen_loss": 0.2816539704799652, "grad_norm": 0.3485908749138262, "learning_rate": 2.0485894736842106e-05, "loss": 0.2792, "mean_copy_accuracy": 0.99660424888134, "mean_gen_accuracy": 0.8763386905193329, "mean_token_accuracy": 0.905211552977562, "num_tokens": 1158007652.0, "sample_num_tokens": 9697.0, "step": 8268, "total_num_tokens": 1158046440.0, "z_loss": 0.0004963710671290755 }, { "copy_logits_max": -7.350739479064941, "copy_logits_min": -750000128.0, "copy_num_tokens": 468.8125, "epoch": 1.688894562164922, "gen_logits_max": 4.007617473602295, "gen_logits_mean": -14.652482032775879, "gen_logits_min": -26.39959716796875, "gen_logits_std": 3.1306416988372803, "gen_loss": 0.2629821300506592, "grad_norm": 0.3335739426348015, "learning_rate": 2.048463157894737e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9976857602596283, "mean_gen_accuracy": 0.875687450170517, "mean_token_accuracy": 0.9093663543462753, "num_tokens": 1158288223.0, "sample_num_tokens": 7935.75, "step": 8269, "total_num_tokens": 1158319966.0, "z_loss": 0.0004278198175597936 }, { "copy_logits_max": -6.778487682342529, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.6875, "epoch": 1.689098800102119, "gen_logits_max": 3.742083787918091, "gen_logits_mean": -16.341800689697266, "gen_logits_min": -28.02408218383789, "gen_logits_std": 3.20590877532959, "gen_loss": 0.33061420917510986, "grad_norm": 0.35479474491085755, "learning_rate": 2.048336842105263e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9956772476434708, "mean_gen_accuracy": 0.8685440123081207, "mean_token_accuracy": 0.901897057890892, "num_tokens": 1158563033.0, "sample_num_tokens": 7205.75, "step": 8270, "total_num_tokens": 1158591856.0, "z_loss": 0.0005593742243945599 }, { "copy_logits_max": -6.670641899108887, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.125, "epoch": 1.689303038039316, "gen_logits_max": 3.07796573638916, "gen_logits_mean": -17.613468170166016, "gen_logits_min": -29.37981414794922, "gen_logits_std": 3.2920477390289307, "gen_loss": 0.26406329870224, "grad_norm": 0.36436241122688157, "learning_rate": 2.0482105263157896e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9968833178281784, "mean_gen_accuracy": 0.871898740530014, "mean_token_accuracy": 0.9044669270515442, "num_tokens": 1158841561.0, "sample_num_tokens": 6922.75, "step": 8271, "total_num_tokens": 1158869252.0, "z_loss": 0.00045679803588427603 }, { "copy_logits_max": -6.793889045715332, "copy_logits_min": -625000064.0, "copy_num_tokens": 743.4375, "epoch": 1.6895072759765126, "gen_logits_max": 2.950470447540283, "gen_logits_mean": -16.266054153442383, "gen_logits_min": -28.123306274414062, "gen_logits_std": 3.243070602416992, "gen_loss": 0.254100501537323, "grad_norm": 0.3504762544258927, "learning_rate": 2.0480842105263156e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9975886046886444, "mean_gen_accuracy": 0.8770599216222763, "mean_token_accuracy": 0.9076387286186218, "num_tokens": 1159128779.0, "sample_num_tokens": 11018.75, "step": 8272, "total_num_tokens": 1159172854.0, "z_loss": 0.00044638922554440796 }, { "copy_logits_max": -6.406131744384766, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.625, "epoch": 1.6897115139137093, "gen_logits_max": 3.7105743885040283, "gen_logits_mean": -16.771249771118164, "gen_logits_min": -28.311050415039062, "gen_logits_std": 3.249768018722534, "gen_loss": 0.28939998149871826, "grad_norm": 0.33711582460232653, "learning_rate": 2.0479578947368424e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9976823478937149, "mean_gen_accuracy": 0.8739374577999115, "mean_token_accuracy": 0.9038698375225067, "num_tokens": 1159415023.0, "sample_num_tokens": 7240.75, "step": 8273, "total_num_tokens": 1159443986.0, "z_loss": 0.00043064708006568253 }, { "copy_logits_max": -7.0094451904296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.125, "epoch": 1.6899157518509063, "gen_logits_max": 3.8911056518554688, "gen_logits_mean": -16.15475082397461, "gen_logits_min": -27.862308502197266, "gen_logits_std": 3.2353780269622803, "gen_loss": 0.29875698685646057, "grad_norm": 0.3898299520332533, "learning_rate": 2.0478315789473685e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9973766952753067, "mean_gen_accuracy": 0.873571902513504, "mean_token_accuracy": 0.9051861315965652, "num_tokens": 1159688709.0, "sample_num_tokens": 7997.75, "step": 8274, "total_num_tokens": 1159720700.0, "z_loss": 0.00047284812899306417 }, { "copy_logits_max": -5.8082356452941895, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.5625, "epoch": 1.6901199897881032, "gen_logits_max": 3.6116652488708496, "gen_logits_mean": -16.921859741210938, "gen_logits_min": -28.55167579650879, "gen_logits_std": 3.2708497047424316, "gen_loss": 0.3068639636039734, "grad_norm": 0.343684183748794, "learning_rate": 2.047705263157895e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9965534657239914, "mean_gen_accuracy": 0.8801470249891281, "mean_token_accuracy": 0.9070321321487427, "num_tokens": 1159960358.0, "sample_num_tokens": 9165.5, "step": 8275, "total_num_tokens": 1159997020.0, "z_loss": 0.0004908445989713073 }, { "copy_logits_max": -8.3114595413208, "copy_logits_min": -625000064.0, "copy_num_tokens": 538.0625, "epoch": 1.6903242277252999, "gen_logits_max": 3.587864875793457, "gen_logits_mean": -16.535266876220703, "gen_logits_min": -28.36151885986328, "gen_logits_std": 3.2474889755249023, "gen_loss": 0.3185914158821106, "grad_norm": 0.34108835606306687, "learning_rate": 2.047578947368421e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9975128173828125, "mean_gen_accuracy": 0.8775413781404495, "mean_token_accuracy": 0.9109195321798325, "num_tokens": 1160247632.0, "sample_num_tokens": 9442.5, "step": 8276, "total_num_tokens": 1160285402.0, "z_loss": 0.0004875802551396191 }, { "copy_logits_max": -6.726160526275635, "copy_logits_min": -687500032.0, "copy_num_tokens": 502.75, "epoch": 1.6905284656624968, "gen_logits_max": 4.222026824951172, "gen_logits_mean": -15.669805526733398, "gen_logits_min": -28.39034080505371, "gen_logits_std": 3.2076869010925293, "gen_loss": 0.24449753761291504, "grad_norm": 0.35993543888319507, "learning_rate": 2.0474526315789475e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9966167211532593, "mean_gen_accuracy": 0.8786210119724274, "mean_token_accuracy": 0.904712438583374, "num_tokens": 1160513720.0, "sample_num_tokens": 8636.5, "step": 8277, "total_num_tokens": 1160548266.0, "z_loss": 0.0004184952122159302 }, { "copy_logits_max": -7.822855472564697, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.1875, "epoch": 1.6907327035996937, "gen_logits_max": 3.46030330657959, "gen_logits_mean": -17.96021270751953, "gen_logits_min": -29.401784896850586, "gen_logits_std": 3.2943215370178223, "gen_loss": 0.285102516412735, "grad_norm": 0.32799913360744026, "learning_rate": 2.0473263157894736e-05, "loss": 0.274, "mean_copy_accuracy": 0.9975552260875702, "mean_gen_accuracy": 0.879555344581604, "mean_token_accuracy": 0.9056871384382248, "num_tokens": 1160778384.0, "sample_num_tokens": 7843.5, "step": 8278, "total_num_tokens": 1160809758.0, "z_loss": 0.0004464532248675823 }, { "copy_logits_max": -6.5662407875061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.625, "epoch": 1.6909369415368904, "gen_logits_max": 3.2457005977630615, "gen_logits_mean": -16.00516700744629, "gen_logits_min": -27.845619201660156, "gen_logits_std": 3.19122052192688, "gen_loss": 0.23351146280765533, "grad_norm": 0.33113901145307684, "learning_rate": 2.0472e-05, "loss": 0.2548, "mean_copy_accuracy": 0.9977590590715408, "mean_gen_accuracy": 0.8845552653074265, "mean_token_accuracy": 0.9140390753746033, "num_tokens": 1161075644.0, "sample_num_tokens": 8921.5, "step": 8279, "total_num_tokens": 1161111330.0, "z_loss": 0.00042008794844150543 }, { "copy_logits_max": -5.926766395568848, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.1875, "epoch": 1.6911411794740872, "gen_logits_max": 1.6098482608795166, "gen_logits_mean": -18.44127655029297, "gen_logits_min": -30.564714431762695, "gen_logits_std": 3.3364086151123047, "gen_loss": 0.29287010431289673, "grad_norm": 0.3652096773939325, "learning_rate": 2.047073684210526e-05, "loss": 0.2797, "mean_copy_accuracy": 0.996689185500145, "mean_gen_accuracy": 0.8724566549062729, "mean_token_accuracy": 0.904378354549408, "num_tokens": 1161338007.0, "sample_num_tokens": 9290.75, "step": 8280, "total_num_tokens": 1161375170.0, "z_loss": 0.00047703058226034045 }, { "copy_logits_max": -9.05148696899414, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.75, "epoch": 1.6913454174112843, "gen_logits_max": 3.5115013122558594, "gen_logits_mean": -16.936790466308594, "gen_logits_min": -28.691421508789062, "gen_logits_std": 3.2631990909576416, "gen_loss": 0.2876008152961731, "grad_norm": 0.37423502566219163, "learning_rate": 2.046947368421053e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9974548071622849, "mean_gen_accuracy": 0.8725945502519608, "mean_token_accuracy": 0.9030697792768478, "num_tokens": 1161604744.0, "sample_num_tokens": 8438.5, "step": 8281, "total_num_tokens": 1161638498.0, "z_loss": 0.00041972141480073333 }, { "copy_logits_max": -9.22978401184082, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.375, "epoch": 1.691549655348481, "gen_logits_max": 3.1308093070983887, "gen_logits_mean": -18.191429138183594, "gen_logits_min": -29.915607452392578, "gen_logits_std": 3.345906972885132, "gen_loss": 0.2800283432006836, "grad_norm": 0.306821488704548, "learning_rate": 2.0468210526315793e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9974816888570786, "mean_gen_accuracy": 0.8830078095197678, "mean_token_accuracy": 0.9134804010391235, "num_tokens": 1161903214.0, "sample_num_tokens": 8959.0, "step": 8282, "total_num_tokens": 1161939050.0, "z_loss": 0.00040023672045208514 }, { "copy_logits_max": -7.895787239074707, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.875, "epoch": 1.6917538932856777, "gen_logits_max": 2.82924747467041, "gen_logits_mean": -17.51538848876953, "gen_logits_min": -29.779857635498047, "gen_logits_std": 3.336150646209717, "gen_loss": 0.2949732542037964, "grad_norm": 0.37222405628304983, "learning_rate": 2.0466947368421054e-05, "loss": 0.281, "mean_copy_accuracy": 0.9964819550514221, "mean_gen_accuracy": 0.8774595558643341, "mean_token_accuracy": 0.9034586250782013, "num_tokens": 1162192829.0, "sample_num_tokens": 8488.75, "step": 8283, "total_num_tokens": 1162226784.0, "z_loss": 0.0004340216109994799 }, { "copy_logits_max": -7.471280574798584, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.8125, "epoch": 1.6919581312228746, "gen_logits_max": 4.056360721588135, "gen_logits_mean": -16.337310791015625, "gen_logits_min": -27.966093063354492, "gen_logits_std": 3.2766730785369873, "gen_loss": 0.3065033555030823, "grad_norm": 0.36916823128558485, "learning_rate": 2.0465684210526318e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9971056580543518, "mean_gen_accuracy": 0.876502588391304, "mean_token_accuracy": 0.9070308208465576, "num_tokens": 1162492803.0, "sample_num_tokens": 8948.75, "step": 8284, "total_num_tokens": 1162528598.0, "z_loss": 0.00039637688314542174 }, { "copy_logits_max": -6.551726341247559, "copy_logits_min": -687500032.0, "copy_num_tokens": 407.75, "epoch": 1.6921623691600716, "gen_logits_max": 2.451972484588623, "gen_logits_mean": -18.938241958618164, "gen_logits_min": -30.751495361328125, "gen_logits_std": 3.394118547439575, "gen_loss": 0.23384934663772583, "grad_norm": 0.3511941234623668, "learning_rate": 2.046442105263158e-05, "loss": 0.2495, "mean_copy_accuracy": 0.9971334487199783, "mean_gen_accuracy": 0.8888432830572128, "mean_token_accuracy": 0.91327965259552, "num_tokens": 1162761769.0, "sample_num_tokens": 8284.25, "step": 8285, "total_num_tokens": 1162794906.0, "z_loss": 0.0003561661869753152 }, { "copy_logits_max": -6.75588321685791, "copy_logits_min": -750000000.0, "copy_num_tokens": 261.25, "epoch": 1.6923666070972683, "gen_logits_max": 3.873612880706787, "gen_logits_mean": -17.384136199951172, "gen_logits_min": -29.050718307495117, "gen_logits_std": 3.2998580932617188, "gen_loss": 0.3048638105392456, "grad_norm": 0.366653783958256, "learning_rate": 2.0463157894736843e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9962109327316284, "mean_gen_accuracy": 0.8821248710155487, "mean_token_accuracy": 0.9055345803499222, "num_tokens": 1163000263.0, "sample_num_tokens": 7403.75, "step": 8286, "total_num_tokens": 1163029878.0, "z_loss": 0.0004701030266005546 }, { "copy_logits_max": -9.820768356323242, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.5625, "epoch": 1.6925708450344652, "gen_logits_max": 3.6991868019104004, "gen_logits_mean": -18.844968795776367, "gen_logits_min": -30.211877822875977, "gen_logits_std": 3.3295772075653076, "gen_loss": 0.29236525297164917, "grad_norm": 0.37815329758580113, "learning_rate": 2.0461894736842104e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9969931840896606, "mean_gen_accuracy": 0.8766742199659348, "mean_token_accuracy": 0.9015423655509949, "num_tokens": 1163252228.0, "sample_num_tokens": 8146.0, "step": 8287, "total_num_tokens": 1163284812.0, "z_loss": 0.00046686659334227443 }, { "copy_logits_max": -6.7763671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.8125, "epoch": 1.6927750829716621, "gen_logits_max": 3.131165027618408, "gen_logits_mean": -16.805530548095703, "gen_logits_min": -28.60170555114746, "gen_logits_std": 3.2460145950317383, "gen_loss": 0.26660412549972534, "grad_norm": 0.3489997821021954, "learning_rate": 2.046063157894737e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9973721355199814, "mean_gen_accuracy": 0.8823384493589401, "mean_token_accuracy": 0.9104553312063217, "num_tokens": 1163527829.0, "sample_num_tokens": 8538.25, "step": 8288, "total_num_tokens": 1163561982.0, "z_loss": 0.000449245679192245 }, { "copy_logits_max": -7.922982215881348, "copy_logits_min": -750000064.0, "copy_num_tokens": 412.625, "epoch": 1.6929793209088588, "gen_logits_max": 3.1124587059020996, "gen_logits_mean": -18.025251388549805, "gen_logits_min": -29.983020782470703, "gen_logits_std": 3.3540172576904297, "gen_loss": 0.2696491777896881, "grad_norm": 0.36057503512950223, "learning_rate": 2.045936842105263e-05, "loss": 0.2611, "mean_copy_accuracy": 0.996327817440033, "mean_gen_accuracy": 0.8833798319101334, "mean_token_accuracy": 0.9117330014705658, "num_tokens": 1163795586.0, "sample_num_tokens": 7707.0, "step": 8289, "total_num_tokens": 1163826414.0, "z_loss": 0.0004125864652451128 }, { "copy_logits_max": -6.770857810974121, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.875, "epoch": 1.6931835588460555, "gen_logits_max": 5.0725932121276855, "gen_logits_mean": -13.824136734008789, "gen_logits_min": -25.809635162353516, "gen_logits_std": 3.124840259552002, "gen_loss": 0.2542896866798401, "grad_norm": 0.37400698453883247, "learning_rate": 2.0458105263157897e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9967644959688187, "mean_gen_accuracy": 0.873809814453125, "mean_token_accuracy": 0.9057357013225555, "num_tokens": 1164062723.0, "sample_num_tokens": 8858.25, "step": 8290, "total_num_tokens": 1164098156.0, "z_loss": 0.0004284608585294336 }, { "copy_logits_max": -8.169756889343262, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.0625, "epoch": 1.6933877967832525, "gen_logits_max": 3.684497356414795, "gen_logits_mean": -17.543479919433594, "gen_logits_min": -29.30527687072754, "gen_logits_std": 3.2881810665130615, "gen_loss": 0.27968287467956543, "grad_norm": 0.3720508449799664, "learning_rate": 2.0456842105263158e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9965428411960602, "mean_gen_accuracy": 0.8730057179927826, "mean_token_accuracy": 0.9022855758666992, "num_tokens": 1164332420.0, "sample_num_tokens": 8145.5, "step": 8291, "total_num_tokens": 1164365002.0, "z_loss": 0.00046216690680012107 }, { "copy_logits_max": -6.20451021194458, "copy_logits_min": -750000000.0, "copy_num_tokens": 718.25, "epoch": 1.6935920347204494, "gen_logits_max": 4.256604194641113, "gen_logits_mean": -14.893716812133789, "gen_logits_min": -26.833728790283203, "gen_logits_std": 3.2203450202941895, "gen_loss": 0.2558964490890503, "grad_norm": 0.33949523739951054, "learning_rate": 2.0455578947368422e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9973231852054596, "mean_gen_accuracy": 0.8702111542224884, "mean_token_accuracy": 0.902137815952301, "num_tokens": 1164622716.0, "sample_num_tokens": 9724.0, "step": 8292, "total_num_tokens": 1164661612.0, "z_loss": 0.00041572292684577405 }, { "copy_logits_max": -6.85758638381958, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.375, "epoch": 1.693796272657646, "gen_logits_max": 4.185830116271973, "gen_logits_mean": -16.714229583740234, "gen_logits_min": -28.495513916015625, "gen_logits_std": 3.287592649459839, "gen_loss": 0.3201647698879242, "grad_norm": 0.3736793956868959, "learning_rate": 2.0454315789473683e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9966229200363159, "mean_gen_accuracy": 0.875572681427002, "mean_token_accuracy": 0.9034314602613449, "num_tokens": 1164894810.0, "sample_num_tokens": 8205.0, "step": 8293, "total_num_tokens": 1164927630.0, "z_loss": 0.0005341041833162308 }, { "copy_logits_max": -8.464323043823242, "copy_logits_min": -750000064.0, "copy_num_tokens": 568.75, "epoch": 1.694000510594843, "gen_logits_max": 4.322154998779297, "gen_logits_mean": -16.553638458251953, "gen_logits_min": -28.294387817382812, "gen_logits_std": 3.2919116020202637, "gen_loss": 0.24767987430095673, "grad_norm": 0.3682479609284805, "learning_rate": 2.0453052631578948e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9973463416099548, "mean_gen_accuracy": 0.8753882497549057, "mean_token_accuracy": 0.9085124433040619, "num_tokens": 1165178603.0, "sample_num_tokens": 9114.25, "step": 8294, "total_num_tokens": 1165215060.0, "z_loss": 0.0004147293511778116 }, { "copy_logits_max": -6.586540222167969, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.9375, "epoch": 1.69420474853204, "gen_logits_max": 4.055117607116699, "gen_logits_mean": -15.407491683959961, "gen_logits_min": -27.580184936523438, "gen_logits_std": 3.2211642265319824, "gen_loss": 0.2618098258972168, "grad_norm": 0.3579365699423172, "learning_rate": 2.0451789473684212e-05, "loss": 0.293, "mean_copy_accuracy": 0.9971340596675873, "mean_gen_accuracy": 0.8693379759788513, "mean_token_accuracy": 0.899906188249588, "num_tokens": 1165449661.0, "sample_num_tokens": 10218.75, "step": 8295, "total_num_tokens": 1165490536.0, "z_loss": 0.00041922341915778816 }, { "copy_logits_max": -5.191864013671875, "copy_logits_min": -750000064.0, "copy_num_tokens": 595.0625, "epoch": 1.6944089864692367, "gen_logits_max": 3.6896705627441406, "gen_logits_mean": -16.85364532470703, "gen_logits_min": -28.657081604003906, "gen_logits_std": 3.289736270904541, "gen_loss": 0.2771684527397156, "grad_norm": 0.3449855532811406, "learning_rate": 2.0450526315789473e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9968578070402145, "mean_gen_accuracy": 0.876389890909195, "mean_token_accuracy": 0.9055635631084442, "num_tokens": 1165714355.0, "sample_num_tokens": 9577.25, "step": 8296, "total_num_tokens": 1165752664.0, "z_loss": 0.00044094977783970535 }, { "copy_logits_max": -7.4269914627075195, "copy_logits_min": -750000064.0, "copy_num_tokens": 348.625, "epoch": 1.6946132244064334, "gen_logits_max": 4.201738357543945, "gen_logits_mean": -17.343320846557617, "gen_logits_min": -29.17211151123047, "gen_logits_std": 3.3099095821380615, "gen_loss": 0.304934024810791, "grad_norm": 0.33117717988498063, "learning_rate": 2.0449263157894737e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9969111829996109, "mean_gen_accuracy": 0.8766311705112457, "mean_token_accuracy": 0.9045449048280716, "num_tokens": 1165985359.0, "sample_num_tokens": 8166.75, "step": 8297, "total_num_tokens": 1166018026.0, "z_loss": 0.0004614099161699414 }, { "copy_logits_max": -7.3515214920043945, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.1875, "epoch": 1.6948174623436303, "gen_logits_max": 3.721970558166504, "gen_logits_mean": -16.64190673828125, "gen_logits_min": -28.50119400024414, "gen_logits_std": 3.266350269317627, "gen_loss": 0.27121156454086304, "grad_norm": 0.3770310750212903, "learning_rate": 2.0448e-05, "loss": 0.2754, "mean_copy_accuracy": 0.996069997549057, "mean_gen_accuracy": 0.8723955005407333, "mean_token_accuracy": 0.9066089242696762, "num_tokens": 1166267943.0, "sample_num_tokens": 8093.75, "step": 8298, "total_num_tokens": 1166300318.0, "z_loss": 0.00047344822087325156 }, { "copy_logits_max": -7.76611852645874, "copy_logits_min": -687500032.0, "copy_num_tokens": 757.4375, "epoch": 1.6950217002808272, "gen_logits_max": 3.078571319580078, "gen_logits_mean": -17.59520721435547, "gen_logits_min": -29.871095657348633, "gen_logits_std": 3.3502047061920166, "gen_loss": 0.22853578627109528, "grad_norm": 0.3815495128286117, "learning_rate": 2.0446736842105266e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9971557557582855, "mean_gen_accuracy": 0.8715731054544449, "mean_token_accuracy": 0.9050825834274292, "num_tokens": 1166543471.0, "sample_num_tokens": 10607.75, "step": 8299, "total_num_tokens": 1166585902.0, "z_loss": 0.00041893008165061474 }, { "copy_logits_max": -8.453604698181152, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.875, "epoch": 1.695225938218024, "gen_logits_max": 3.467900037765503, "gen_logits_mean": -16.949893951416016, "gen_logits_min": -29.303272247314453, "gen_logits_std": 3.3150861263275146, "gen_loss": 0.2533988356590271, "grad_norm": 0.3609979393102259, "learning_rate": 2.0445473684210527e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9973641335964203, "mean_gen_accuracy": 0.8756072521209717, "mean_token_accuracy": 0.9077173620462418, "num_tokens": 1166820501.0, "sample_num_tokens": 8921.25, "step": 8300, "total_num_tokens": 1166856186.0, "z_loss": 0.0004181967815384269 }, { "copy_logits_max": -7.180959701538086, "copy_logits_min": -750000064.0, "copy_num_tokens": 490.4375, "epoch": 1.6954301761552208, "gen_logits_max": 3.2397713661193848, "gen_logits_mean": -17.850595474243164, "gen_logits_min": -29.684612274169922, "gen_logits_std": 3.3390121459960938, "gen_loss": 0.24152490496635437, "grad_norm": 0.3725214897730538, "learning_rate": 2.044421052631579e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9959621876478195, "mean_gen_accuracy": 0.8775551319122314, "mean_token_accuracy": 0.9045786410570145, "num_tokens": 1167080028.0, "sample_num_tokens": 8700.0, "step": 8301, "total_num_tokens": 1167114828.0, "z_loss": 0.00045574200339615345 }, { "copy_logits_max": -7.862028121948242, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.4375, "epoch": 1.6956344140924178, "gen_logits_max": 3.8875036239624023, "gen_logits_mean": -17.742380142211914, "gen_logits_min": -29.37746238708496, "gen_logits_std": 3.3123090267181396, "gen_loss": 0.2795235812664032, "grad_norm": 0.3306759581362153, "learning_rate": 2.0442947368421052e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9961612075567245, "mean_gen_accuracy": 0.8812899440526962, "mean_token_accuracy": 0.910527691245079, "num_tokens": 1167353903.0, "sample_num_tokens": 7735.25, "step": 8302, "total_num_tokens": 1167384844.0, "z_loss": 0.00048164610052481294 }, { "copy_logits_max": -6.385537147521973, "copy_logits_min": -750000064.0, "copy_num_tokens": 500.125, "epoch": 1.6958386520296145, "gen_logits_max": 3.621023178100586, "gen_logits_mean": -16.046314239501953, "gen_logits_min": -28.09257698059082, "gen_logits_std": 3.2585840225219727, "gen_loss": 0.28093236684799194, "grad_norm": 0.39230691709760845, "learning_rate": 2.0441684210526316e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9965043216943741, "mean_gen_accuracy": 0.8766753673553467, "mean_token_accuracy": 0.908845528960228, "num_tokens": 1167630609.0, "sample_num_tokens": 8576.75, "step": 8303, "total_num_tokens": 1167664916.0, "z_loss": 0.00048766747931949794 }, { "copy_logits_max": -8.35258960723877, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.375, "epoch": 1.6960428899668112, "gen_logits_max": 3.429220676422119, "gen_logits_mean": -17.6749267578125, "gen_logits_min": -29.805644989013672, "gen_logits_std": 3.346877098083496, "gen_loss": 0.27240699529647827, "grad_norm": 0.3599557920629197, "learning_rate": 2.0440421052631577e-05, "loss": 0.2525, "mean_copy_accuracy": 0.997034564614296, "mean_gen_accuracy": 0.8869036585092545, "mean_token_accuracy": 0.9162727296352386, "num_tokens": 1167914999.0, "sample_num_tokens": 8661.75, "step": 8304, "total_num_tokens": 1167949646.0, "z_loss": 0.00045149694778956473 }, { "copy_logits_max": -5.925955295562744, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.875, "epoch": 1.6962471279040083, "gen_logits_max": 3.670746326446533, "gen_logits_mean": -17.391422271728516, "gen_logits_min": -29.668720245361328, "gen_logits_std": 3.3855974674224854, "gen_loss": 0.29833120107650757, "grad_norm": 0.36259148799909036, "learning_rate": 2.043915789473684e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9960601627826691, "mean_gen_accuracy": 0.8748635798692703, "mean_token_accuracy": 0.9053975045681, "num_tokens": 1168172828.0, "sample_num_tokens": 8136.0, "step": 8305, "total_num_tokens": 1168205372.0, "z_loss": 0.0005191070958971977 }, { "copy_logits_max": -7.394173622131348, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.125, "epoch": 1.696451365841205, "gen_logits_max": 3.4207639694213867, "gen_logits_mean": -17.66048812866211, "gen_logits_min": -29.933666229248047, "gen_logits_std": 3.3938491344451904, "gen_loss": 0.25596535205841064, "grad_norm": 0.38985464249660445, "learning_rate": 2.0437894736842106e-05, "loss": 0.2623, "mean_copy_accuracy": 0.9979641139507294, "mean_gen_accuracy": 0.8807112723588943, "mean_token_accuracy": 0.9106597453355789, "num_tokens": 1168455448.0, "sample_num_tokens": 10207.5, "step": 8306, "total_num_tokens": 1168496278.0, "z_loss": 0.00043284642742946744 }, { "copy_logits_max": -4.907598972320557, "copy_logits_min": -687500032.0, "copy_num_tokens": 537.3125, "epoch": 1.6966556037784017, "gen_logits_max": 4.057206630706787, "gen_logits_mean": -15.735613822937012, "gen_logits_min": -28.099390029907227, "gen_logits_std": 3.323626756668091, "gen_loss": 0.25840499997138977, "grad_norm": 0.36724252662082824, "learning_rate": 2.043663157894737e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9964058995246887, "mean_gen_accuracy": 0.8767063468694687, "mean_token_accuracy": 0.9048635214567184, "num_tokens": 1168738784.0, "sample_num_tokens": 8485.0, "step": 8307, "total_num_tokens": 1168772724.0, "z_loss": 0.0003932739491574466 }, { "copy_logits_max": -6.47831392288208, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.3125, "epoch": 1.6968598417155987, "gen_logits_max": 3.6682028770446777, "gen_logits_mean": -17.184545516967773, "gen_logits_min": -29.164304733276367, "gen_logits_std": 3.373915195465088, "gen_loss": 0.27771228551864624, "grad_norm": 0.3808028959397802, "learning_rate": 2.0435368421052634e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9958651065826416, "mean_gen_accuracy": 0.8774590492248535, "mean_token_accuracy": 0.901292696595192, "num_tokens": 1168995181.0, "sample_num_tokens": 8218.25, "step": 8308, "total_num_tokens": 1169028054.0, "z_loss": 0.00042003198177553713 }, { "copy_logits_max": -7.080907344818115, "copy_logits_min": -750000000.0, "copy_num_tokens": 270.0, "epoch": 1.6970640796527956, "gen_logits_max": 5.104775428771973, "gen_logits_mean": -15.800430297851562, "gen_logits_min": -27.55970001220703, "gen_logits_std": 3.289860725402832, "gen_loss": 0.317624032497406, "grad_norm": 0.3347589909454802, "learning_rate": 2.0434105263157895e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9971098601818085, "mean_gen_accuracy": 0.8776171654462814, "mean_token_accuracy": 0.9077609777450562, "num_tokens": 1169268182.0, "sample_num_tokens": 6587.0, "step": 8309, "total_num_tokens": 1169294530.0, "z_loss": 0.0004737667622976005 }, { "copy_logits_max": -5.965547561645508, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.1875, "epoch": 1.6972683175899923, "gen_logits_max": 2.230721950531006, "gen_logits_mean": -18.946720123291016, "gen_logits_min": -31.08226776123047, "gen_logits_std": 3.447204113006592, "gen_loss": 0.2601093649864197, "grad_norm": 0.3703041083652625, "learning_rate": 2.043284210526316e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9966456294059753, "mean_gen_accuracy": 0.8764069080352783, "mean_token_accuracy": 0.90678970515728, "num_tokens": 1169530297.0, "sample_num_tokens": 9052.75, "step": 8310, "total_num_tokens": 1169566508.0, "z_loss": 0.0004203427815809846 }, { "copy_logits_max": -5.543079376220703, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5625, "epoch": 1.6974725555271892, "gen_logits_max": 3.6158297061920166, "gen_logits_mean": -17.146297454833984, "gen_logits_min": -29.16547203063965, "gen_logits_std": 3.372661828994751, "gen_loss": 0.2583656907081604, "grad_norm": 0.3394587280977905, "learning_rate": 2.043157894736842e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9969366192817688, "mean_gen_accuracy": 0.8800633996725082, "mean_token_accuracy": 0.906163215637207, "num_tokens": 1169810592.0, "sample_num_tokens": 7604.0, "step": 8311, "total_num_tokens": 1169841008.0, "z_loss": 0.00042750174179673195 }, { "copy_logits_max": -4.85600471496582, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.0, "epoch": 1.6976767934643862, "gen_logits_max": 3.5493907928466797, "gen_logits_mean": -17.154300689697266, "gen_logits_min": -29.129730224609375, "gen_logits_std": 3.3826403617858887, "gen_loss": 0.28112322092056274, "grad_norm": 0.3864191885933086, "learning_rate": 2.0430315789473685e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9962757080793381, "mean_gen_accuracy": 0.870283380150795, "mean_token_accuracy": 0.8992041498422623, "num_tokens": 1170076543.0, "sample_num_tokens": 8619.75, "step": 8312, "total_num_tokens": 1170111022.0, "z_loss": 0.0004208622267469764 }, { "copy_logits_max": -8.357568740844727, "copy_logits_min": -750000064.0, "copy_num_tokens": 399.4375, "epoch": 1.6978810314015829, "gen_logits_max": 3.579679489135742, "gen_logits_mean": -17.539859771728516, "gen_logits_min": -29.46314239501953, "gen_logits_std": 3.3723702430725098, "gen_loss": 0.29163897037506104, "grad_norm": 0.3516054660026695, "learning_rate": 2.0429052631578946e-05, "loss": 0.2725, "mean_copy_accuracy": 0.997252807021141, "mean_gen_accuracy": 0.8769359141588211, "mean_token_accuracy": 0.9069278389215469, "num_tokens": 1170338130.0, "sample_num_tokens": 8020.0, "step": 8313, "total_num_tokens": 1170370210.0, "z_loss": 0.0004620932159014046 }, { "copy_logits_max": -4.323063850402832, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.25, "epoch": 1.6980852693387796, "gen_logits_max": 2.8659236431121826, "gen_logits_mean": -17.341949462890625, "gen_logits_min": -29.947612762451172, "gen_logits_std": 3.3987479209899902, "gen_loss": 0.2953224778175354, "grad_norm": 0.37159881634898245, "learning_rate": 2.0427789473684213e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9967690855264664, "mean_gen_accuracy": 0.8636255264282227, "mean_token_accuracy": 0.8979814052581787, "num_tokens": 1170622116.0, "sample_num_tokens": 8853.5, "step": 8314, "total_num_tokens": 1170657530.0, "z_loss": 0.0005003241822123528 }, { "copy_logits_max": -6.036675930023193, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.5625, "epoch": 1.6982895072759765, "gen_logits_max": 4.3994245529174805, "gen_logits_mean": -16.300811767578125, "gen_logits_min": -28.72895622253418, "gen_logits_std": 3.331373691558838, "gen_loss": 0.313448965549469, "grad_norm": 0.3680466891701838, "learning_rate": 2.0426526315789474e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9967222809791565, "mean_gen_accuracy": 0.8779115825891495, "mean_token_accuracy": 0.9069749414920807, "num_tokens": 1170891740.0, "sample_num_tokens": 8831.5, "step": 8315, "total_num_tokens": 1170927066.0, "z_loss": 0.0005682266782969236 }, { "copy_logits_max": -7.00629997253418, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.8125, "epoch": 1.6984937452131734, "gen_logits_max": 3.9104909896850586, "gen_logits_mean": -17.643232345581055, "gen_logits_min": -29.797252655029297, "gen_logits_std": 3.3591842651367188, "gen_loss": 0.27401936054229736, "grad_norm": 0.3628394930875911, "learning_rate": 2.042526315789474e-05, "loss": 0.2767, "mean_copy_accuracy": 0.997051939368248, "mean_gen_accuracy": 0.8777750581502914, "mean_token_accuracy": 0.9055618047714233, "num_tokens": 1171153322.0, "sample_num_tokens": 8229.0, "step": 8316, "total_num_tokens": 1171186238.0, "z_loss": 0.0004507272387854755 }, { "copy_logits_max": -4.036290645599365, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.0, "epoch": 1.6986979831503701, "gen_logits_max": 4.414989948272705, "gen_logits_mean": -15.832077980041504, "gen_logits_min": -27.899354934692383, "gen_logits_std": 3.2733774185180664, "gen_loss": 0.31143540143966675, "grad_norm": 0.3708230829800822, "learning_rate": 2.0424e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9972671866416931, "mean_gen_accuracy": 0.8690205067396164, "mean_token_accuracy": 0.8987304866313934, "num_tokens": 1171402333.0, "sample_num_tokens": 7383.25, "step": 8317, "total_num_tokens": 1171431866.0, "z_loss": 0.00048455706564709544 }, { "copy_logits_max": -6.828962802886963, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.5625, "epoch": 1.698902221087567, "gen_logits_max": 3.401275634765625, "gen_logits_mean": -17.058917999267578, "gen_logits_min": -29.012413024902344, "gen_logits_std": 3.313910722732544, "gen_loss": 0.2894994020462036, "grad_norm": 0.3621535975769497, "learning_rate": 2.0422736842105264e-05, "loss": 0.276, "mean_copy_accuracy": 0.9965230822563171, "mean_gen_accuracy": 0.879147857427597, "mean_token_accuracy": 0.9066163301467896, "num_tokens": 1171658193.0, "sample_num_tokens": 8923.75, "step": 8318, "total_num_tokens": 1171693888.0, "z_loss": 0.0004875444865319878 }, { "copy_logits_max": -6.9045491218566895, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.875, "epoch": 1.699106459024764, "gen_logits_max": 3.6075525283813477, "gen_logits_mean": -17.00680923461914, "gen_logits_min": -28.95747947692871, "gen_logits_std": 3.296034097671509, "gen_loss": 0.2775406241416931, "grad_norm": 0.3311299727493273, "learning_rate": 2.0421473684210525e-05, "loss": 0.254, "mean_copy_accuracy": 0.9972636848688126, "mean_gen_accuracy": 0.8849999457597733, "mean_token_accuracy": 0.9134643375873566, "num_tokens": 1171950116.0, "sample_num_tokens": 7724.0, "step": 8319, "total_num_tokens": 1171981012.0, "z_loss": 0.00043999776244163513 }, { "copy_logits_max": -7.169536590576172, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.1875, "epoch": 1.6993106969619607, "gen_logits_max": 4.083199977874756, "gen_logits_mean": -16.43288803100586, "gen_logits_min": -28.13481903076172, "gen_logits_std": 3.246730327606201, "gen_loss": 0.25124630331993103, "grad_norm": 0.33939732812313234, "learning_rate": 2.042021052631579e-05, "loss": 0.252, "mean_copy_accuracy": 0.9973474740982056, "mean_gen_accuracy": 0.8828727751970291, "mean_token_accuracy": 0.9154192805290222, "num_tokens": 1172259797.0, "sample_num_tokens": 8731.25, "step": 8320, "total_num_tokens": 1172294722.0, "z_loss": 0.0004255179374013096 }, { "copy_logits_max": -6.787887096405029, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.125, "epoch": 1.6995149348991574, "gen_logits_max": 3.6304807662963867, "gen_logits_mean": -16.10089111328125, "gen_logits_min": -27.904008865356445, "gen_logits_std": 3.2591159343719482, "gen_loss": 0.26755544543266296, "grad_norm": 0.3452845049732471, "learning_rate": 2.0418947368421053e-05, "loss": 0.263, "mean_copy_accuracy": 0.9974571913480759, "mean_gen_accuracy": 0.8826977610588074, "mean_token_accuracy": 0.9102333188056946, "num_tokens": 1172537566.0, "sample_num_tokens": 7641.0, "step": 8321, "total_num_tokens": 1172568130.0, "z_loss": 0.000426250888267532 }, { "copy_logits_max": -4.759032249450684, "copy_logits_min": -750000000.0, "copy_num_tokens": 704.3125, "epoch": 1.6997191728363543, "gen_logits_max": 3.3167953491210938, "gen_logits_mean": -16.722089767456055, "gen_logits_min": -28.874149322509766, "gen_logits_std": 3.32527494430542, "gen_loss": 0.26936495304107666, "grad_norm": 0.3538386396445643, "learning_rate": 2.0417684210526318e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9960729479789734, "mean_gen_accuracy": 0.8758205473423004, "mean_token_accuracy": 0.9058175534009933, "num_tokens": 1172803047.0, "sample_num_tokens": 9941.75, "step": 8322, "total_num_tokens": 1172842814.0, "z_loss": 0.0004401012556627393 }, { "copy_logits_max": -7.7953338623046875, "copy_logits_min": -750000064.0, "copy_num_tokens": 307.25, "epoch": 1.6999234107735512, "gen_logits_max": 3.3692941665649414, "gen_logits_mean": -18.313859939575195, "gen_logits_min": -29.8095703125, "gen_logits_std": 3.3286044597625732, "gen_loss": 0.29251617193222046, "grad_norm": 0.3306081025543531, "learning_rate": 2.0416421052631582e-05, "loss": 0.2684, "mean_copy_accuracy": 0.997303918004036, "mean_gen_accuracy": 0.880286306142807, "mean_token_accuracy": 0.9088078141212463, "num_tokens": 1173113877.0, "sample_num_tokens": 7335.25, "step": 8323, "total_num_tokens": 1173143218.0, "z_loss": 0.0004153330228291452 }, { "copy_logits_max": -5.764364242553711, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.1875, "epoch": 1.700127648710748, "gen_logits_max": 4.7076826095581055, "gen_logits_mean": -15.439355850219727, "gen_logits_min": -27.555543899536133, "gen_logits_std": 3.233325958251953, "gen_loss": 0.291100412607193, "grad_norm": 0.39333585851029657, "learning_rate": 2.0415157894736843e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9965258836746216, "mean_gen_accuracy": 0.8756839036941528, "mean_token_accuracy": 0.9066156297922134, "num_tokens": 1173391990.0, "sample_num_tokens": 8241.5, "step": 8324, "total_num_tokens": 1173424956.0, "z_loss": 0.0004812762781511992 }, { "copy_logits_max": -7.957548141479492, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.0625, "epoch": 1.7003318866479449, "gen_logits_max": 4.124667167663574, "gen_logits_mean": -15.441266059875488, "gen_logits_min": -27.361526489257812, "gen_logits_std": 3.269716262817383, "gen_loss": 0.24811892211437225, "grad_norm": 0.3549008640088886, "learning_rate": 2.0413894736842107e-05, "loss": 0.255, "mean_copy_accuracy": 0.997056856751442, "mean_gen_accuracy": 0.8849527686834335, "mean_token_accuracy": 0.9130463302135468, "num_tokens": 1173645351.0, "sample_num_tokens": 8143.75, "step": 8325, "total_num_tokens": 1173677926.0, "z_loss": 0.0003939212765544653 }, { "copy_logits_max": -8.087096214294434, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.75, "epoch": 1.7005361245851418, "gen_logits_max": 2.394120216369629, "gen_logits_mean": -18.224750518798828, "gen_logits_min": -30.297327041625977, "gen_logits_std": 3.340916633605957, "gen_loss": 0.2870047092437744, "grad_norm": 0.3538572151077202, "learning_rate": 2.0412631578947368e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9974858164787292, "mean_gen_accuracy": 0.8714772015810013, "mean_token_accuracy": 0.9030662477016449, "num_tokens": 1173916285.0, "sample_num_tokens": 8565.25, "step": 8326, "total_num_tokens": 1173950546.0, "z_loss": 0.00045251072151586413 }, { "copy_logits_max": -5.658147811889648, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.9375, "epoch": 1.7007403625223385, "gen_logits_max": 3.685408592224121, "gen_logits_mean": -14.547565460205078, "gen_logits_min": -26.655996322631836, "gen_logits_std": 3.1630172729492188, "gen_loss": 0.24778468906879425, "grad_norm": 0.44960413465822496, "learning_rate": 2.0411368421052633e-05, "loss": 0.2545, "mean_copy_accuracy": 0.9974077343940735, "mean_gen_accuracy": 0.8806828111410141, "mean_token_accuracy": 0.9143669158220291, "num_tokens": 1174192977.0, "sample_num_tokens": 8636.25, "step": 8327, "total_num_tokens": 1174227522.0, "z_loss": 0.00044085527770221233 }, { "copy_logits_max": -7.438769817352295, "copy_logits_min": -750000064.0, "copy_num_tokens": 684.5625, "epoch": 1.7009446004595352, "gen_logits_max": 2.717191219329834, "gen_logits_mean": -16.933109283447266, "gen_logits_min": -28.71350860595703, "gen_logits_std": 3.291032552719116, "gen_loss": 0.2502855658531189, "grad_norm": 0.8167902197095833, "learning_rate": 2.0410105263157893e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9965350180864334, "mean_gen_accuracy": 0.8696979731321335, "mean_token_accuracy": 0.8984121382236481, "num_tokens": 1174448961.0, "sample_num_tokens": 10075.75, "step": 8328, "total_num_tokens": 1174489264.0, "z_loss": 0.00044984789565205574 }, { "copy_logits_max": -6.733330726623535, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.4375, "epoch": 1.7011488383967321, "gen_logits_max": 4.747828483581543, "gen_logits_mean": -14.71074390411377, "gen_logits_min": -26.476707458496094, "gen_logits_std": 3.1846327781677246, "gen_loss": 0.2915566861629486, "grad_norm": 0.36401863141823576, "learning_rate": 2.0408842105263158e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9972686767578125, "mean_gen_accuracy": 0.8753061294555664, "mean_token_accuracy": 0.9065699428319931, "num_tokens": 1174734395.0, "sample_num_tokens": 7692.75, "step": 8329, "total_num_tokens": 1174765166.0, "z_loss": 0.0004761412274092436 }, { "copy_logits_max": -7.24031925201416, "copy_logits_min": -687500032.0, "copy_num_tokens": 475.875, "epoch": 1.701353076333929, "gen_logits_max": 3.2511777877807617, "gen_logits_mean": -16.602691650390625, "gen_logits_min": -28.756813049316406, "gen_logits_std": 3.335608959197998, "gen_loss": 0.25476914644241333, "grad_norm": 0.37287802951375554, "learning_rate": 2.0407578947368422e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9967648088932037, "mean_gen_accuracy": 0.8784704357385635, "mean_token_accuracy": 0.9072428941726685, "num_tokens": 1174987659.0, "sample_num_tokens": 8345.75, "step": 8330, "total_num_tokens": 1175021042.0, "z_loss": 0.00039354281034320593 }, { "copy_logits_max": -8.612890243530273, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.6875, "epoch": 1.7015573142711258, "gen_logits_max": 3.9181885719299316, "gen_logits_mean": -15.41934585571289, "gen_logits_min": -27.453723907470703, "gen_logits_std": 3.2751893997192383, "gen_loss": 0.3129809498786926, "grad_norm": 0.3425336102314052, "learning_rate": 2.0406315789473686e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9966005235910416, "mean_gen_accuracy": 0.8743486851453781, "mean_token_accuracy": 0.9028364270925522, "num_tokens": 1175256406.0, "sample_num_tokens": 9413.5, "step": 8331, "total_num_tokens": 1175294060.0, "z_loss": 0.0004655019147321582 }, { "copy_logits_max": -8.209883689880371, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.3125, "epoch": 1.7017615522083227, "gen_logits_max": 3.2002737522125244, "gen_logits_mean": -17.906265258789062, "gen_logits_min": -30.19378662109375, "gen_logits_std": 3.38448429107666, "gen_loss": 0.27960205078125, "grad_norm": 0.34801417956468145, "learning_rate": 2.0405052631578947e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9965212643146515, "mean_gen_accuracy": 0.8782989084720612, "mean_token_accuracy": 0.9104441702365875, "num_tokens": 1175551375.0, "sample_num_tokens": 8313.75, "step": 8332, "total_num_tokens": 1175584630.0, "z_loss": 0.0004959087236784399 }, { "copy_logits_max": -9.953699111938477, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.25, "epoch": 1.7019657901455196, "gen_logits_max": 4.003331184387207, "gen_logits_mean": -17.090370178222656, "gen_logits_min": -28.865400314331055, "gen_logits_std": 3.340195655822754, "gen_loss": 0.2768133282661438, "grad_norm": 0.37533109721380287, "learning_rate": 2.040378947368421e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9968455731868744, "mean_gen_accuracy": 0.8743636608123779, "mean_token_accuracy": 0.9056603759527206, "num_tokens": 1175822767.0, "sample_num_tokens": 9334.75, "step": 8333, "total_num_tokens": 1175860106.0, "z_loss": 0.0004293733218219131 }, { "copy_logits_max": -5.516731262207031, "copy_logits_min": -687500032.0, "copy_num_tokens": 460.3125, "epoch": 1.7021700280827163, "gen_logits_max": 3.3391056060791016, "gen_logits_mean": -17.319782257080078, "gen_logits_min": -29.860448837280273, "gen_logits_std": 3.384186267852783, "gen_loss": 0.2877877950668335, "grad_norm": 0.3735458041995101, "learning_rate": 2.0402526315789473e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9973074197769165, "mean_gen_accuracy": 0.8735315054655075, "mean_token_accuracy": 0.8985090106725693, "num_tokens": 1176091277.0, "sample_num_tokens": 8204.75, "step": 8334, "total_num_tokens": 1176124096.0, "z_loss": 0.0004302602319512516 }, { "copy_logits_max": -7.886246681213379, "copy_logits_min": -750000000.0, "copy_num_tokens": 352.6875, "epoch": 1.702374266019913, "gen_logits_max": 3.477741003036499, "gen_logits_mean": -18.060192108154297, "gen_logits_min": -29.86004638671875, "gen_logits_std": 3.3925588130950928, "gen_loss": 0.3084312081336975, "grad_norm": 0.36916590705158847, "learning_rate": 2.0401263157894737e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9980239421129227, "mean_gen_accuracy": 0.8696657717227936, "mean_token_accuracy": 0.900008887052536, "num_tokens": 1176366694.0, "sample_num_tokens": 7337.5, "step": 8335, "total_num_tokens": 1176396044.0, "z_loss": 0.00045850384049117565 }, { "copy_logits_max": -5.8632588386535645, "copy_logits_min": -750000000.0, "copy_num_tokens": 746.9375, "epoch": 1.7025785039571102, "gen_logits_max": 3.836860179901123, "gen_logits_mean": -15.719017028808594, "gen_logits_min": -28.256494522094727, "gen_logits_std": 3.3443851470947266, "gen_loss": 0.2473842203617096, "grad_norm": 0.36390335815078895, "learning_rate": 2.04e-05, "loss": 0.247, "mean_copy_accuracy": 0.997973769903183, "mean_gen_accuracy": 0.8823418170213699, "mean_token_accuracy": 0.9153458774089813, "num_tokens": 1176654978.0, "sample_num_tokens": 10525.0, "step": 8336, "total_num_tokens": 1176697078.0, "z_loss": 0.00038486855919472873 }, { "copy_logits_max": -7.358608245849609, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.9375, "epoch": 1.702782741894307, "gen_logits_max": 4.780614852905273, "gen_logits_mean": -15.2576265335083, "gen_logits_min": -27.475339889526367, "gen_logits_std": 3.2565994262695312, "gen_loss": 0.29655227065086365, "grad_norm": 0.3661321828967744, "learning_rate": 2.0398736842105262e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9973854869604111, "mean_gen_accuracy": 0.872629925608635, "mean_token_accuracy": 0.9043251872062683, "num_tokens": 1176934863.0, "sample_num_tokens": 7809.75, "step": 8337, "total_num_tokens": 1176966102.0, "z_loss": 0.0004525036201812327 }, { "copy_logits_max": -6.628252029418945, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.1875, "epoch": 1.7029869798315036, "gen_logits_max": 3.9890341758728027, "gen_logits_mean": -16.820026397705078, "gen_logits_min": -29.152488708496094, "gen_logits_std": 3.369255781173706, "gen_loss": 0.27309226989746094, "grad_norm": 0.39597493336945144, "learning_rate": 2.0397473684210526e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9952484220266342, "mean_gen_accuracy": 0.8785114884376526, "mean_token_accuracy": 0.9076273143291473, "num_tokens": 1177203624.0, "sample_num_tokens": 9261.5, "step": 8338, "total_num_tokens": 1177240670.0, "z_loss": 0.0004288142663426697 }, { "copy_logits_max": -5.479252815246582, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 1.7031912177687005, "gen_logits_max": 3.8940672874450684, "gen_logits_mean": -16.33071517944336, "gen_logits_min": -28.117225646972656, "gen_logits_std": 3.314042806625366, "gen_loss": 0.2806134521961212, "grad_norm": 0.3740182685557147, "learning_rate": 2.039621052631579e-05, "loss": 0.283, "mean_copy_accuracy": 0.9966602325439453, "mean_gen_accuracy": 0.8781900405883789, "mean_token_accuracy": 0.905221700668335, "num_tokens": 1177467162.0, "sample_num_tokens": 7662.0, "step": 8339, "total_num_tokens": 1177497810.0, "z_loss": 0.0004323878965806216 }, { "copy_logits_max": -5.966598987579346, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.4375, "epoch": 1.7033954557058975, "gen_logits_max": 2.540964126586914, "gen_logits_mean": -19.31353759765625, "gen_logits_min": -31.344932556152344, "gen_logits_std": 3.4737088680267334, "gen_loss": 0.25453782081604004, "grad_norm": 0.3621413461652956, "learning_rate": 2.0394947368421055e-05, "loss": 0.2695, "mean_copy_accuracy": 0.996454730629921, "mean_gen_accuracy": 0.8802812695503235, "mean_token_accuracy": 0.9066145271062851, "num_tokens": 1177744352.0, "sample_num_tokens": 8864.5, "step": 8340, "total_num_tokens": 1177779810.0, "z_loss": 0.00043365973397158086 }, { "copy_logits_max": -4.087089538574219, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.5625, "epoch": 1.7035996936430942, "gen_logits_max": 3.885523796081543, "gen_logits_mean": -15.782794952392578, "gen_logits_min": -28.02831268310547, "gen_logits_std": 3.313776969909668, "gen_loss": 0.25281116366386414, "grad_norm": 0.3580042705700442, "learning_rate": 2.0393684210526316e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9973130822181702, "mean_gen_accuracy": 0.8819349259138107, "mean_token_accuracy": 0.9117775857448578, "num_tokens": 1178032436.0, "sample_num_tokens": 8631.0, "step": 8341, "total_num_tokens": 1178066960.0, "z_loss": 0.0004744543693959713 }, { "copy_logits_max": -6.561245441436768, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.1875, "epoch": 1.703803931580291, "gen_logits_max": 3.9251465797424316, "gen_logits_mean": -16.948387145996094, "gen_logits_min": -29.209064483642578, "gen_logits_std": 3.374904155731201, "gen_loss": 0.2549041509628296, "grad_norm": 0.36928948740277134, "learning_rate": 2.039242105263158e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9973379373550415, "mean_gen_accuracy": 0.8770843744277954, "mean_token_accuracy": 0.9071172475814819, "num_tokens": 1178284977.0, "sample_num_tokens": 7630.25, "step": 8342, "total_num_tokens": 1178315498.0, "z_loss": 0.0004112125316169113 }, { "copy_logits_max": -5.406898498535156, "copy_logits_min": -750000000.0, "copy_num_tokens": 557.625, "epoch": 1.704008169517488, "gen_logits_max": 3.428704261779785, "gen_logits_mean": -15.861309051513672, "gen_logits_min": -28.02269744873047, "gen_logits_std": 3.322638988494873, "gen_loss": 0.2774118185043335, "grad_norm": 0.3337583679179924, "learning_rate": 2.039115789473684e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9978136569261551, "mean_gen_accuracy": 0.8704351037740707, "mean_token_accuracy": 0.9026219993829727, "num_tokens": 1178547184.0, "sample_num_tokens": 7969.5, "step": 8343, "total_num_tokens": 1178579062.0, "z_loss": 0.00041875371243804693 }, { "copy_logits_max": -6.271174907684326, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.1875, "epoch": 1.7042124074546847, "gen_logits_max": 3.3008689880371094, "gen_logits_mean": -17.321483612060547, "gen_logits_min": -29.45039939880371, "gen_logits_std": 3.3519630432128906, "gen_loss": 0.25756511092185974, "grad_norm": 0.3576413526172683, "learning_rate": 2.0389894736842106e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9964159727096558, "mean_gen_accuracy": 0.8741658627986908, "mean_token_accuracy": 0.9041976630687714, "num_tokens": 1178815467.0, "sample_num_tokens": 7760.75, "step": 8344, "total_num_tokens": 1178846510.0, "z_loss": 0.0004021301865577698 }, { "copy_logits_max": -5.957047939300537, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.0, "epoch": 1.7044166453918814, "gen_logits_max": 3.824467658996582, "gen_logits_mean": -15.704228401184082, "gen_logits_min": -28.24515151977539, "gen_logits_std": 3.307792901992798, "gen_loss": 0.2538363039493561, "grad_norm": 0.3904542455523046, "learning_rate": 2.0388631578947366e-05, "loss": 0.271, "mean_copy_accuracy": 0.9972001910209656, "mean_gen_accuracy": 0.8760634511709213, "mean_token_accuracy": 0.9081413894891739, "num_tokens": 1179067552.0, "sample_num_tokens": 8533.5, "step": 8345, "total_num_tokens": 1179101686.0, "z_loss": 0.00039159573498182 }, { "copy_logits_max": -4.470124244689941, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.3125, "epoch": 1.7046208833290784, "gen_logits_max": 4.321232795715332, "gen_logits_mean": -15.361297607421875, "gen_logits_min": -27.506999969482422, "gen_logits_std": 3.298795461654663, "gen_loss": 0.3057368993759155, "grad_norm": 0.3516950642577472, "learning_rate": 2.038736842105263e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9966288506984711, "mean_gen_accuracy": 0.8787132352590561, "mean_token_accuracy": 0.9036529809236526, "num_tokens": 1179328394.0, "sample_num_tokens": 8660.0, "step": 8346, "total_num_tokens": 1179363034.0, "z_loss": 0.0005201627500355244 }, { "copy_logits_max": -5.322192192077637, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.375, "epoch": 1.7048251212662753, "gen_logits_max": 3.538784980773926, "gen_logits_mean": -17.60497283935547, "gen_logits_min": -30.057422637939453, "gen_logits_std": 3.386561870574951, "gen_loss": 0.302289754152298, "grad_norm": 0.33554946689418585, "learning_rate": 2.0386105263157895e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9977103620767593, "mean_gen_accuracy": 0.8746281117200851, "mean_token_accuracy": 0.9065306484699249, "num_tokens": 1179606315.0, "sample_num_tokens": 9719.25, "step": 8347, "total_num_tokens": 1179645192.0, "z_loss": 0.00047237903345376253 }, { "copy_logits_max": -5.710592269897461, "copy_logits_min": -687500032.0, "copy_num_tokens": 625.0625, "epoch": 1.705029359203472, "gen_logits_max": 2.848297119140625, "gen_logits_mean": -17.211767196655273, "gen_logits_min": -29.48196792602539, "gen_logits_std": 3.3915200233459473, "gen_loss": 0.2220497727394104, "grad_norm": 0.3552054278968525, "learning_rate": 2.038484210526316e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9978044033050537, "mean_gen_accuracy": 0.8786957263946533, "mean_token_accuracy": 0.9083487540483475, "num_tokens": 1179875985.0, "sample_num_tokens": 9559.25, "step": 8348, "total_num_tokens": 1179914222.0, "z_loss": 0.000377967837266624 }, { "copy_logits_max": -5.455585479736328, "copy_logits_min": -625000064.0, "copy_num_tokens": 487.1875, "epoch": 1.705233597140669, "gen_logits_max": 4.626157760620117, "gen_logits_mean": -15.194005012512207, "gen_logits_min": -27.802818298339844, "gen_logits_std": 3.29386043548584, "gen_loss": 0.27371734380722046, "grad_norm": 0.3611415036179338, "learning_rate": 2.0383578947368424e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9971317201852798, "mean_gen_accuracy": 0.8792937844991684, "mean_token_accuracy": 0.9066726416349411, "num_tokens": 1180137992.0, "sample_num_tokens": 8696.0, "step": 8349, "total_num_tokens": 1180172776.0, "z_loss": 0.00046213431051000953 }, { "copy_logits_max": -7.5292534828186035, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.3125, "epoch": 1.7054378350778658, "gen_logits_max": 4.789141654968262, "gen_logits_mean": -17.34600830078125, "gen_logits_min": -29.088031768798828, "gen_logits_std": 3.343944787979126, "gen_loss": 0.27436238527297974, "grad_norm": 0.3356263795755209, "learning_rate": 2.0382315789473685e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9967568665742874, "mean_gen_accuracy": 0.8799828886985779, "mean_token_accuracy": 0.9078605622053146, "num_tokens": 1180408212.0, "sample_num_tokens": 8633.5, "step": 8350, "total_num_tokens": 1180442746.0, "z_loss": 0.0004916598554700613 }, { "copy_logits_max": -3.9438414573669434, "copy_logits_min": -750000064.0, "copy_num_tokens": 588.5, "epoch": 1.7056420730150625, "gen_logits_max": 3.218276023864746, "gen_logits_mean": -17.324012756347656, "gen_logits_min": -29.754295349121094, "gen_logits_std": 3.374216079711914, "gen_loss": 0.29766809940338135, "grad_norm": 0.37579694001768543, "learning_rate": 2.038105263157895e-05, "loss": 0.3016, "mean_copy_accuracy": 0.9964433759450912, "mean_gen_accuracy": 0.86520916223526, "mean_token_accuracy": 0.8983861207962036, "num_tokens": 1180660652.0, "sample_num_tokens": 8769.5, "step": 8351, "total_num_tokens": 1180695730.0, "z_loss": 0.0005609173676930368 }, { "copy_logits_max": -5.915567398071289, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.75, "epoch": 1.7058463109522592, "gen_logits_max": 5.261499404907227, "gen_logits_mean": -15.778250694274902, "gen_logits_min": -28.06767463684082, "gen_logits_std": 3.3111515045166016, "gen_loss": 0.2930479347705841, "grad_norm": 0.33580505860391974, "learning_rate": 2.037978947368421e-05, "loss": 0.2747, "mean_copy_accuracy": 0.997494712471962, "mean_gen_accuracy": 0.8783951997756958, "mean_token_accuracy": 0.9062131643295288, "num_tokens": 1180935364.0, "sample_num_tokens": 7382.5, "step": 8352, "total_num_tokens": 1180964894.0, "z_loss": 0.0004957193741574883 }, { "copy_logits_max": -6.385636329650879, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.8125, "epoch": 1.7060505488894562, "gen_logits_max": 4.920529365539551, "gen_logits_mean": -15.852510452270508, "gen_logits_min": -27.7325382232666, "gen_logits_std": 3.2973880767822266, "gen_loss": 0.2819155156612396, "grad_norm": 0.35324387424329107, "learning_rate": 2.0378526315789474e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9972448498010635, "mean_gen_accuracy": 0.8768314719200134, "mean_token_accuracy": 0.9033196419477463, "num_tokens": 1181186475.0, "sample_num_tokens": 8033.25, "step": 8353, "total_num_tokens": 1181218608.0, "z_loss": 0.0004527799319475889 }, { "copy_logits_max": -2.0000438690185547, "copy_logits_min": -750000000.0, "copy_num_tokens": 736.75, "epoch": 1.706254786826653, "gen_logits_max": 5.107382774353027, "gen_logits_mean": -15.564380645751953, "gen_logits_min": -28.234859466552734, "gen_logits_std": 3.3306427001953125, "gen_loss": 0.24696210026741028, "grad_norm": 0.3443152214258508, "learning_rate": 2.0377263157894735e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9975638538599014, "mean_gen_accuracy": 0.8752666860818863, "mean_token_accuracy": 0.9088373333215714, "num_tokens": 1181481222.0, "sample_num_tokens": 10269.5, "step": 8354, "total_num_tokens": 1181522300.0, "z_loss": 0.00041601344128139317 }, { "copy_logits_max": -3.4807679653167725, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.75, "epoch": 1.7064590247638498, "gen_logits_max": 3.819183826446533, "gen_logits_mean": -16.232219696044922, "gen_logits_min": -28.208229064941406, "gen_logits_std": 3.3106799125671387, "gen_loss": 0.276638925075531, "grad_norm": 0.3346300818785318, "learning_rate": 2.0376000000000003e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9970266222953796, "mean_gen_accuracy": 0.87574402987957, "mean_token_accuracy": 0.9048497974872589, "num_tokens": 1181758912.0, "sample_num_tokens": 8427.5, "step": 8355, "total_num_tokens": 1181792622.0, "z_loss": 0.0004052543081343174 }, { "copy_logits_max": -4.0149946212768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.0, "epoch": 1.7066632627010467, "gen_logits_max": 3.170156955718994, "gen_logits_mean": -17.01978302001953, "gen_logits_min": -28.77315902709961, "gen_logits_std": 3.2851438522338867, "gen_loss": 0.30435246229171753, "grad_norm": 0.3231958382546193, "learning_rate": 2.0374736842105264e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9972914010286331, "mean_gen_accuracy": 0.8763773739337921, "mean_token_accuracy": 0.9089474827051163, "num_tokens": 1182034650.0, "sample_num_tokens": 8754.0, "step": 8356, "total_num_tokens": 1182069666.0, "z_loss": 0.00047995487693697214 }, { "copy_logits_max": -1.562245488166809, "copy_logits_min": -687500032.0, "copy_num_tokens": 382.0625, "epoch": 1.7068675006382437, "gen_logits_max": 3.5431289672851562, "gen_logits_mean": -16.487884521484375, "gen_logits_min": -28.19045639038086, "gen_logits_std": 3.294877529144287, "gen_loss": 0.24184544384479523, "grad_norm": 0.3373946771093435, "learning_rate": 2.0373473684210528e-05, "loss": 0.2593, "mean_copy_accuracy": 0.997736245393753, "mean_gen_accuracy": 0.878485381603241, "mean_token_accuracy": 0.9117356240749359, "num_tokens": 1182337811.0, "sample_num_tokens": 8061.25, "step": 8357, "total_num_tokens": 1182370056.0, "z_loss": 0.0003629002021625638 }, { "copy_logits_max": -4.266389846801758, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.6875, "epoch": 1.7070717385754404, "gen_logits_max": 3.709643840789795, "gen_logits_mean": -17.216127395629883, "gen_logits_min": -29.05445098876953, "gen_logits_std": 3.2829113006591797, "gen_loss": 0.32004523277282715, "grad_norm": 0.342495724840909, "learning_rate": 2.037221052631579e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9963312745094299, "mean_gen_accuracy": 0.8704253733158112, "mean_token_accuracy": 0.9022321999073029, "num_tokens": 1182607167.0, "sample_num_tokens": 8329.75, "step": 8358, "total_num_tokens": 1182640486.0, "z_loss": 0.0005316099268384278 }, { "copy_logits_max": -4.521198749542236, "copy_logits_min": -750000064.0, "copy_num_tokens": 269.0, "epoch": 1.707275976512637, "gen_logits_max": 4.4492034912109375, "gen_logits_mean": -17.341060638427734, "gen_logits_min": -29.301254272460938, "gen_logits_std": 3.311544418334961, "gen_loss": 0.2823637127876282, "grad_norm": 0.3564704231266942, "learning_rate": 2.0370947368421053e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9965080469846725, "mean_gen_accuracy": 0.8765008896589279, "mean_token_accuracy": 0.9012843072414398, "num_tokens": 1182869081.0, "sample_num_tokens": 6605.75, "step": 8359, "total_num_tokens": 1182895504.0, "z_loss": 0.0004906440153717995 }, { "copy_logits_max": -4.400539875030518, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.25, "epoch": 1.7074802144498342, "gen_logits_max": 3.3706984519958496, "gen_logits_mean": -17.789993286132812, "gen_logits_min": -29.56484603881836, "gen_logits_std": 3.324620008468628, "gen_loss": 0.3098990321159363, "grad_norm": 0.3403855271313405, "learning_rate": 2.0369684210526314e-05, "loss": 0.28, "mean_copy_accuracy": 0.9969295412302017, "mean_gen_accuracy": 0.8760631382465363, "mean_token_accuracy": 0.9032322615385056, "num_tokens": 1183140335.0, "sample_num_tokens": 7289.75, "step": 8360, "total_num_tokens": 1183169494.0, "z_loss": 0.00044870839337818325 }, { "copy_logits_max": -2.7503063678741455, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.4375, "epoch": 1.707684452387031, "gen_logits_max": 3.9503750801086426, "gen_logits_mean": -17.441577911376953, "gen_logits_min": -29.58334732055664, "gen_logits_std": 3.3321051597595215, "gen_loss": 0.2999580502510071, "grad_norm": 0.3491809977679281, "learning_rate": 2.036842105263158e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9971908777952194, "mean_gen_accuracy": 0.87185999751091, "mean_token_accuracy": 0.9033215641975403, "num_tokens": 1183424738.0, "sample_num_tokens": 8636.5, "step": 8361, "total_num_tokens": 1183459284.0, "z_loss": 0.0004841546469833702 }, { "copy_logits_max": -4.537487030029297, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.125, "epoch": 1.7078886903242276, "gen_logits_max": 4.8462371826171875, "gen_logits_mean": -16.239381790161133, "gen_logits_min": -28.26006317138672, "gen_logits_std": 3.308475971221924, "gen_loss": 0.2529764175415039, "grad_norm": 0.34026728342002155, "learning_rate": 2.0367157894736843e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9973815232515335, "mean_gen_accuracy": 0.8765313029289246, "mean_token_accuracy": 0.9063023179769516, "num_tokens": 1183695590.0, "sample_num_tokens": 7932.0, "step": 8362, "total_num_tokens": 1183727318.0, "z_loss": 0.00039946523611433804 }, { "copy_logits_max": -3.8004398345947266, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.1875, "epoch": 1.7080929282614246, "gen_logits_max": 3.570042848587036, "gen_logits_mean": -18.026554107666016, "gen_logits_min": -29.957416534423828, "gen_logits_std": 3.3375821113586426, "gen_loss": 0.2814699411392212, "grad_norm": 0.3348658275216889, "learning_rate": 2.0365894736842107e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9966486245393753, "mean_gen_accuracy": 0.8791507929563522, "mean_token_accuracy": 0.9075184762477875, "num_tokens": 1183961397.0, "sample_num_tokens": 6957.25, "step": 8363, "total_num_tokens": 1183989226.0, "z_loss": 0.0004736489208880812 }, { "copy_logits_max": -5.9460039138793945, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.3125, "epoch": 1.7082971661986215, "gen_logits_max": 3.6376190185546875, "gen_logits_mean": -16.646007537841797, "gen_logits_min": -28.740360260009766, "gen_logits_std": 3.287795066833496, "gen_loss": 0.26415300369262695, "grad_norm": 0.32258066249043255, "learning_rate": 2.036463157894737e-05, "loss": 0.2617, "mean_copy_accuracy": 0.996815025806427, "mean_gen_accuracy": 0.8826226890087128, "mean_token_accuracy": 0.9108990430831909, "num_tokens": 1184243739.0, "sample_num_tokens": 8633.25, "step": 8364, "total_num_tokens": 1184278272.0, "z_loss": 0.0004334392142482102 }, { "copy_logits_max": -5.017653465270996, "copy_logits_min": -750000064.0, "copy_num_tokens": 574.875, "epoch": 1.7085014041358182, "gen_logits_max": 3.7951409816741943, "gen_logits_mean": -16.49588966369629, "gen_logits_min": -28.61968994140625, "gen_logits_std": 3.301316738128662, "gen_loss": 0.26819291710853577, "grad_norm": 0.3416653287905255, "learning_rate": 2.0363368421052632e-05, "loss": 0.2519, "mean_copy_accuracy": 0.997573971748352, "mean_gen_accuracy": 0.8827639222145081, "mean_token_accuracy": 0.9135665148496628, "num_tokens": 1184516141.0, "sample_num_tokens": 9143.75, "step": 8365, "total_num_tokens": 1184552716.0, "z_loss": 0.0004808788071386516 }, { "copy_logits_max": -7.031954765319824, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.9375, "epoch": 1.708705642073015, "gen_logits_max": 3.827296018600464, "gen_logits_mean": -18.335918426513672, "gen_logits_min": -30.03350830078125, "gen_logits_std": 3.353753089904785, "gen_loss": 0.30310961604118347, "grad_norm": 0.3512352073235701, "learning_rate": 2.0362105263157897e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9969618022441864, "mean_gen_accuracy": 0.883425161242485, "mean_token_accuracy": 0.909414678812027, "num_tokens": 1184789288.0, "sample_num_tokens": 8183.0, "step": 8366, "total_num_tokens": 1184822020.0, "z_loss": 0.0004829044337384403 }, { "copy_logits_max": -3.0867486000061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 693.125, "epoch": 1.708909880010212, "gen_logits_max": 3.167323589324951, "gen_logits_mean": -15.31020450592041, "gen_logits_min": -27.270488739013672, "gen_logits_std": 3.2082886695861816, "gen_loss": 0.24831177294254303, "grad_norm": 0.36694695754249845, "learning_rate": 2.0360842105263158e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9983110725879669, "mean_gen_accuracy": 0.8730264753103256, "mean_token_accuracy": 0.9110386818647385, "num_tokens": 1185084164.0, "sample_num_tokens": 9638.5, "step": 8367, "total_num_tokens": 1185122718.0, "z_loss": 0.0003767288872040808 }, { "copy_logits_max": -6.562417030334473, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.5625, "epoch": 1.7091141179474088, "gen_logits_max": 2.9458022117614746, "gen_logits_mean": -18.57354736328125, "gen_logits_min": -30.692943572998047, "gen_logits_std": 3.39558482170105, "gen_loss": 0.2359258383512497, "grad_norm": 0.34051561387568663, "learning_rate": 2.0359578947368422e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9972202032804489, "mean_gen_accuracy": 0.886762946844101, "mean_token_accuracy": 0.9101790487766266, "num_tokens": 1185343750.0, "sample_num_tokens": 7534.5, "step": 8368, "total_num_tokens": 1185373888.0, "z_loss": 0.0003685390984173864 }, { "copy_logits_max": -4.746881484985352, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.875, "epoch": 1.7093183558846055, "gen_logits_max": 2.530994176864624, "gen_logits_mean": -17.941022872924805, "gen_logits_min": -30.033939361572266, "gen_logits_std": 3.3587732315063477, "gen_loss": 0.25972980260849, "grad_norm": 0.35269463769641324, "learning_rate": 2.0358315789473683e-05, "loss": 0.264, "mean_copy_accuracy": 0.9969237297773361, "mean_gen_accuracy": 0.8767445981502533, "mean_token_accuracy": 0.9116853326559067, "num_tokens": 1185619555.0, "sample_num_tokens": 8870.75, "step": 8369, "total_num_tokens": 1185655038.0, "z_loss": 0.0003892554668709636 }, { "copy_logits_max": -6.973783493041992, "copy_logits_min": -750000000.0, "copy_num_tokens": 304.8125, "epoch": 1.7095225938218024, "gen_logits_max": 3.820082664489746, "gen_logits_mean": -17.130176544189453, "gen_logits_min": -28.964536666870117, "gen_logits_std": 3.2908935546875, "gen_loss": 0.2862202227115631, "grad_norm": 0.3798177416428168, "learning_rate": 2.0357052631578947e-05, "loss": 0.2855, "mean_copy_accuracy": 0.996655598282814, "mean_gen_accuracy": 0.8761868625879288, "mean_token_accuracy": 0.9032881855964661, "num_tokens": 1185908635.0, "sample_num_tokens": 7539.75, "step": 8370, "total_num_tokens": 1185938794.0, "z_loss": 0.000408721825806424 }, { "copy_logits_max": -5.195758819580078, "copy_logits_min": -625000064.0, "copy_num_tokens": 333.125, "epoch": 1.7097268317589993, "gen_logits_max": 3.3886711597442627, "gen_logits_mean": -17.919757843017578, "gen_logits_min": -29.70682716369629, "gen_logits_std": 3.3181676864624023, "gen_loss": 0.2883807122707367, "grad_norm": 0.3589116226419269, "learning_rate": 2.035578947368421e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9968083798885345, "mean_gen_accuracy": 0.8817398995161057, "mean_token_accuracy": 0.9067143052816391, "num_tokens": 1186176731.0, "sample_num_tokens": 6936.75, "step": 8371, "total_num_tokens": 1186204478.0, "z_loss": 0.00043535925215110183 }, { "copy_logits_max": -6.010084629058838, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.25, "epoch": 1.709931069696196, "gen_logits_max": 3.1089086532592773, "gen_logits_mean": -17.635902404785156, "gen_logits_min": -29.839344024658203, "gen_logits_std": 3.3079657554626465, "gen_loss": 0.2524510622024536, "grad_norm": 0.39842426066974956, "learning_rate": 2.0354526315789476e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9962699711322784, "mean_gen_accuracy": 0.8714181184768677, "mean_token_accuracy": 0.8948351591825485, "num_tokens": 1186406647.0, "sample_num_tokens": 7582.75, "step": 8372, "total_num_tokens": 1186436978.0, "z_loss": 0.0004323016619309783 }, { "copy_logits_max": -7.616302967071533, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.9375, "epoch": 1.710135307633393, "gen_logits_max": 3.177793502807617, "gen_logits_mean": -18.397552490234375, "gen_logits_min": -30.21446990966797, "gen_logits_std": 3.376431941986084, "gen_loss": 0.25609293580055237, "grad_norm": 0.37618317093894044, "learning_rate": 2.0353263157894737e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9964866489171982, "mean_gen_accuracy": 0.873350664973259, "mean_token_accuracy": 0.9009487628936768, "num_tokens": 1186668102.0, "sample_num_tokens": 8841.5, "step": 8373, "total_num_tokens": 1186703468.0, "z_loss": 0.0003887774655595422 }, { "copy_logits_max": -6.723194122314453, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.125, "epoch": 1.7103395455705899, "gen_logits_max": 3.6999740600585938, "gen_logits_mean": -16.405622482299805, "gen_logits_min": -28.4211483001709, "gen_logits_std": 3.252376079559326, "gen_loss": 0.3250980079174042, "grad_norm": 0.37365377581739695, "learning_rate": 2.0352e-05, "loss": 0.294, "mean_copy_accuracy": 0.9965101182460785, "mean_gen_accuracy": 0.8716166019439697, "mean_token_accuracy": 0.9004863351583481, "num_tokens": 1186942651.0, "sample_num_tokens": 8477.25, "step": 8374, "total_num_tokens": 1186976560.0, "z_loss": 0.0005310861160978675 }, { "copy_logits_max": -7.443714618682861, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.0625, "epoch": 1.7105437835077866, "gen_logits_max": 2.506436347961426, "gen_logits_mean": -18.304960250854492, "gen_logits_min": -30.06372833251953, "gen_logits_std": 3.3536500930786133, "gen_loss": 0.26175159215927124, "grad_norm": 0.37514310864354417, "learning_rate": 2.0350736842105265e-05, "loss": 0.2669, "mean_copy_accuracy": 0.997645691037178, "mean_gen_accuracy": 0.882529154419899, "mean_token_accuracy": 0.9092304110527039, "num_tokens": 1187209260.0, "sample_num_tokens": 8950.5, "step": 8375, "total_num_tokens": 1187245062.0, "z_loss": 0.00042306992691010237 }, { "copy_logits_max": -8.744758605957031, "copy_logits_min": -687500032.0, "copy_num_tokens": 258.0625, "epoch": 1.7107480214449833, "gen_logits_max": 4.072947025299072, "gen_logits_mean": -16.819957733154297, "gen_logits_min": -28.572725296020508, "gen_logits_std": 3.258655309677124, "gen_loss": 0.2819661796092987, "grad_norm": 0.4170450934814318, "learning_rate": 2.0349473684210526e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9971847534179688, "mean_gen_accuracy": 0.876289039850235, "mean_token_accuracy": 0.9055275768041611, "num_tokens": 1187478671.0, "sample_num_tokens": 7450.75, "step": 8376, "total_num_tokens": 1187508474.0, "z_loss": 0.0003928913502022624 }, { "copy_logits_max": -6.6431450843811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.0625, "epoch": 1.7109522593821802, "gen_logits_max": 3.6655688285827637, "gen_logits_mean": -16.730419158935547, "gen_logits_min": -29.08331298828125, "gen_logits_std": 3.320525646209717, "gen_loss": 0.28886669874191284, "grad_norm": 0.37821383384152857, "learning_rate": 2.034821052631579e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9975504577159882, "mean_gen_accuracy": 0.8771635740995407, "mean_token_accuracy": 0.9052849113941193, "num_tokens": 1187734610.0, "sample_num_tokens": 7882.5, "step": 8377, "total_num_tokens": 1187766140.0, "z_loss": 0.00045332848094403744 }, { "copy_logits_max": -6.35324764251709, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.75, "epoch": 1.7111564973193771, "gen_logits_max": 4.547637462615967, "gen_logits_mean": -15.930803298950195, "gen_logits_min": -28.734718322753906, "gen_logits_std": 3.279205560684204, "gen_loss": 0.2975231111049652, "grad_norm": 0.34767100863172606, "learning_rate": 2.034694736842105e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9975175261497498, "mean_gen_accuracy": 0.8722853809595108, "mean_token_accuracy": 0.9029197543859482, "num_tokens": 1188008387.0, "sample_num_tokens": 8055.75, "step": 8378, "total_num_tokens": 1188040610.0, "z_loss": 0.00043910215026699007 }, { "copy_logits_max": -5.296643257141113, "copy_logits_min": -750000064.0, "copy_num_tokens": 489.4375, "epoch": 1.7113607352565738, "gen_logits_max": 3.1687192916870117, "gen_logits_mean": -17.209409713745117, "gen_logits_min": -29.85262680053711, "gen_logits_std": 3.3450117111206055, "gen_loss": 0.2654907703399658, "grad_norm": 0.4443715322717535, "learning_rate": 2.0345684210526316e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9962930828332901, "mean_gen_accuracy": 0.8788480311632156, "mean_token_accuracy": 0.907106950879097, "num_tokens": 1188265423.0, "sample_num_tokens": 8172.75, "step": 8379, "total_num_tokens": 1188298114.0, "z_loss": 0.00047253709635697305 }, { "copy_logits_max": -7.303168296813965, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.4375, "epoch": 1.7115649731937708, "gen_logits_max": 3.32836651802063, "gen_logits_mean": -16.938491821289062, "gen_logits_min": -28.746471405029297, "gen_logits_std": 3.3292973041534424, "gen_loss": 0.2609459459781647, "grad_norm": 0.37118257705315144, "learning_rate": 2.034442105263158e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9973536133766174, "mean_gen_accuracy": 0.8802807927131653, "mean_token_accuracy": 0.9089820235967636, "num_tokens": 1188536730.0, "sample_num_tokens": 10095.0, "step": 8380, "total_num_tokens": 1188577110.0, "z_loss": 0.00046051840763539076 }, { "copy_logits_max": -5.834811687469482, "copy_logits_min": -687500032.0, "copy_num_tokens": 390.625, "epoch": 1.7117692111309677, "gen_logits_max": 4.3139801025390625, "gen_logits_mean": -15.444127082824707, "gen_logits_min": -27.302371978759766, "gen_logits_std": 3.225754737854004, "gen_loss": 0.31537091732025146, "grad_norm": 0.34145030472640264, "learning_rate": 2.0343157894736844e-05, "loss": 0.2924, "mean_copy_accuracy": 0.9960366487503052, "mean_gen_accuracy": 0.8716522306203842, "mean_token_accuracy": 0.9008792042732239, "num_tokens": 1188807357.0, "sample_num_tokens": 6986.75, "step": 8381, "total_num_tokens": 1188835304.0, "z_loss": 0.0005395332118496299 }, { "copy_logits_max": -6.6997175216674805, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.9375, "epoch": 1.7119734490681644, "gen_logits_max": 4.905794620513916, "gen_logits_mean": -15.91191291809082, "gen_logits_min": -28.527233123779297, "gen_logits_std": 3.284496784210205, "gen_loss": 0.25507959723472595, "grad_norm": 0.3497353950040986, "learning_rate": 2.0341894736842105e-05, "loss": 0.2537, "mean_copy_accuracy": 0.9969801008701324, "mean_gen_accuracy": 0.8816919475793839, "mean_token_accuracy": 0.9149351865053177, "num_tokens": 1189079645.0, "sample_num_tokens": 8812.25, "step": 8382, "total_num_tokens": 1189114894.0, "z_loss": 0.0004421506600920111 }, { "copy_logits_max": -7.091081142425537, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 1.712177687005361, "gen_logits_max": 4.539017677307129, "gen_logits_mean": -15.898938179016113, "gen_logits_min": -27.855388641357422, "gen_logits_std": 3.2693867683410645, "gen_loss": 0.2954752445220947, "grad_norm": 0.39136844648971175, "learning_rate": 2.034063157894737e-05, "loss": 0.2671, "mean_copy_accuracy": 0.994971752166748, "mean_gen_accuracy": 0.8836781680583954, "mean_token_accuracy": 0.908583477139473, "num_tokens": 1189333648.0, "sample_num_tokens": 7761.0, "step": 8383, "total_num_tokens": 1189364692.0, "z_loss": 0.00045481749111786485 }, { "copy_logits_max": -7.331377983093262, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.0, "epoch": 1.712381924942558, "gen_logits_max": 3.7207515239715576, "gen_logits_mean": -16.112041473388672, "gen_logits_min": -28.407318115234375, "gen_logits_std": 3.3007450103759766, "gen_loss": 0.2577955722808838, "grad_norm": 0.38403765344128465, "learning_rate": 2.033936842105263e-05, "loss": 0.2733, "mean_copy_accuracy": 0.996650829911232, "mean_gen_accuracy": 0.8797877579927444, "mean_token_accuracy": 0.9077327400445938, "num_tokens": 1189598777.0, "sample_num_tokens": 8629.25, "step": 8384, "total_num_tokens": 1189633294.0, "z_loss": 0.0004363380721770227 }, { "copy_logits_max": -4.670288562774658, "copy_logits_min": -750000000.0, "copy_num_tokens": 605.375, "epoch": 1.712586162879755, "gen_logits_max": 3.8395140171051025, "gen_logits_mean": -15.217428207397461, "gen_logits_min": -28.02517318725586, "gen_logits_std": 3.268056869506836, "gen_loss": 0.26831740140914917, "grad_norm": 0.37982968985241805, "learning_rate": 2.0338105263157895e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9979881942272186, "mean_gen_accuracy": 0.8673080503940582, "mean_token_accuracy": 0.9063113182783127, "num_tokens": 1189878731.0, "sample_num_tokens": 8653.25, "step": 8385, "total_num_tokens": 1189913344.0, "z_loss": 0.00045149223296903074 }, { "copy_logits_max": -5.7938737869262695, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.5625, "epoch": 1.7127904008169517, "gen_logits_max": 2.7965474128723145, "gen_logits_mean": -17.96845054626465, "gen_logits_min": -29.97264289855957, "gen_logits_std": 3.390702962875366, "gen_loss": 0.2624172866344452, "grad_norm": 0.3330021882527583, "learning_rate": 2.0336842105263156e-05, "loss": 0.2776, "mean_copy_accuracy": 0.996732085943222, "mean_gen_accuracy": 0.8747518360614777, "mean_token_accuracy": 0.9057286977767944, "num_tokens": 1190157319.0, "sample_num_tokens": 8771.75, "step": 8386, "total_num_tokens": 1190192406.0, "z_loss": 0.00045047904131934047 }, { "copy_logits_max": -6.311758995056152, "copy_logits_min": -750000000.0, "copy_num_tokens": 570.625, "epoch": 1.7129946387541486, "gen_logits_max": 3.220167875289917, "gen_logits_mean": -17.389789581298828, "gen_logits_min": -29.560077667236328, "gen_logits_std": 3.333806037902832, "gen_loss": 0.26590538024902344, "grad_norm": 0.345921677187449, "learning_rate": 2.033557894736842e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9974820464849472, "mean_gen_accuracy": 0.8771404922008514, "mean_token_accuracy": 0.9034411907196045, "num_tokens": 1190443220.0, "sample_num_tokens": 9239.5, "step": 8387, "total_num_tokens": 1190480178.0, "z_loss": 0.0005151215009391308 }, { "copy_logits_max": -5.603961944580078, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.875, "epoch": 1.7131988766913455, "gen_logits_max": 4.05552864074707, "gen_logits_mean": -16.870262145996094, "gen_logits_min": -28.776500701904297, "gen_logits_std": 3.3145008087158203, "gen_loss": 0.3212851285934448, "grad_norm": 0.34463412670128263, "learning_rate": 2.0334315789473684e-05, "loss": 0.2808, "mean_copy_accuracy": 0.99718177318573, "mean_gen_accuracy": 0.875879555940628, "mean_token_accuracy": 0.9044279456138611, "num_tokens": 1190714978.0, "sample_num_tokens": 6854.0, "step": 8388, "total_num_tokens": 1190742394.0, "z_loss": 0.0005226025241427124 }, { "copy_logits_max": -4.672994613647461, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.1875, "epoch": 1.7134031146285422, "gen_logits_max": 3.577545642852783, "gen_logits_mean": -17.037853240966797, "gen_logits_min": -28.933921813964844, "gen_logits_std": 3.3198933601379395, "gen_loss": 0.2706323266029358, "grad_norm": 0.3683904211687977, "learning_rate": 2.033305263157895e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9972639381885529, "mean_gen_accuracy": 0.8816241025924683, "mean_token_accuracy": 0.9097691178321838, "num_tokens": 1190970185.0, "sample_num_tokens": 8217.25, "step": 8389, "total_num_tokens": 1191003054.0, "z_loss": 0.0005296816816553473 }, { "copy_logits_max": -2.9076499938964844, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.125, "epoch": 1.713607352565739, "gen_logits_max": 3.4280803203582764, "gen_logits_mean": -16.662723541259766, "gen_logits_min": -29.32845687866211, "gen_logits_std": 3.329845428466797, "gen_loss": 0.2890584468841553, "grad_norm": 0.36263861522123353, "learning_rate": 2.0331789473684213e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9961562156677246, "mean_gen_accuracy": 0.8701620697975159, "mean_token_accuracy": 0.9022320210933685, "num_tokens": 1191215989.0, "sample_num_tokens": 7803.75, "step": 8390, "total_num_tokens": 1191247204.0, "z_loss": 0.0004868099931627512 }, { "copy_logits_max": -5.16416072845459, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.625, "epoch": 1.713811590502936, "gen_logits_max": 4.983664035797119, "gen_logits_mean": -15.616434097290039, "gen_logits_min": -27.69672966003418, "gen_logits_std": 3.274655342102051, "gen_loss": 0.27398455142974854, "grad_norm": 0.3570499164594224, "learning_rate": 2.0330526315789474e-05, "loss": 0.2659, "mean_copy_accuracy": 0.996737003326416, "mean_gen_accuracy": 0.8838569819927216, "mean_token_accuracy": 0.9085258990526199, "num_tokens": 1191492956.0, "sample_num_tokens": 8223.5, "step": 8391, "total_num_tokens": 1191525850.0, "z_loss": 0.0004500577342696488 }, { "copy_logits_max": -6.871417999267578, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.625, "epoch": 1.7140158284401328, "gen_logits_max": 3.9022555351257324, "gen_logits_mean": -17.58418083190918, "gen_logits_min": -29.49043846130371, "gen_logits_std": 3.3562490940093994, "gen_loss": 0.30707359313964844, "grad_norm": 0.3490417705850652, "learning_rate": 2.0329263157894738e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9963608384132385, "mean_gen_accuracy": 0.8730409741401672, "mean_token_accuracy": 0.9031893312931061, "num_tokens": 1191761458.0, "sample_num_tokens": 8155.0, "step": 8392, "total_num_tokens": 1191794078.0, "z_loss": 0.0004919954808428884 }, { "copy_logits_max": -4.0438947677612305, "copy_logits_min": -750000000.0, "copy_num_tokens": 639.9375, "epoch": 1.7142200663773295, "gen_logits_max": 2.473069906234741, "gen_logits_mean": -17.697193145751953, "gen_logits_min": -29.699058532714844, "gen_logits_std": 3.422830581665039, "gen_loss": 0.22960031032562256, "grad_norm": 0.3593063308144685, "learning_rate": 2.0328e-05, "loss": 0.2507, "mean_copy_accuracy": 0.9974653720855713, "mean_gen_accuracy": 0.8864801228046417, "mean_token_accuracy": 0.9151690006256104, "num_tokens": 1192029064.0, "sample_num_tokens": 9377.5, "step": 8393, "total_num_tokens": 1192066574.0, "z_loss": 0.0003401893191039562 }, { "copy_logits_max": -5.44587516784668, "copy_logits_min": -750000000.0, "copy_num_tokens": 631.25, "epoch": 1.7144243043145264, "gen_logits_max": 2.455233573913574, "gen_logits_mean": -17.161949157714844, "gen_logits_min": -29.529935836791992, "gen_logits_std": 3.3651437759399414, "gen_loss": 0.28128260374069214, "grad_norm": 0.353973123905079, "learning_rate": 2.0326736842105263e-05, "loss": 0.2986, "mean_copy_accuracy": 0.9971653968095779, "mean_gen_accuracy": 0.8644383102655411, "mean_token_accuracy": 0.8987501412630081, "num_tokens": 1192308249.0, "sample_num_tokens": 9353.25, "step": 8394, "total_num_tokens": 1192345662.0, "z_loss": 0.0004255334206391126 }, { "copy_logits_max": -6.08803653717041, "copy_logits_min": -750000128.0, "copy_num_tokens": 538.5, "epoch": 1.7146285422517233, "gen_logits_max": 2.9719443321228027, "gen_logits_mean": -17.434246063232422, "gen_logits_min": -29.544185638427734, "gen_logits_std": 3.392956018447876, "gen_loss": 0.2543071508407593, "grad_norm": 0.3721901430409133, "learning_rate": 2.0325473684210524e-05, "loss": 0.2735, "mean_copy_accuracy": 0.99715256690979, "mean_gen_accuracy": 0.8735867291688919, "mean_token_accuracy": 0.9052546918392181, "num_tokens": 1192571225.0, "sample_num_tokens": 8977.75, "step": 8395, "total_num_tokens": 1192607136.0, "z_loss": 0.000394003203837201 }, { "copy_logits_max": -5.5034589767456055, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.0, "epoch": 1.71483278018892, "gen_logits_max": 4.092076778411865, "gen_logits_mean": -16.42845344543457, "gen_logits_min": -28.792892456054688, "gen_logits_std": 3.3356194496154785, "gen_loss": 0.27637779712677, "grad_norm": 0.3245941475668788, "learning_rate": 2.0324210526315792e-05, "loss": 0.2536, "mean_copy_accuracy": 0.9970587193965912, "mean_gen_accuracy": 0.8822340816259384, "mean_token_accuracy": 0.9138709157705307, "num_tokens": 1192852557.0, "sample_num_tokens": 7864.25, "step": 8396, "total_num_tokens": 1192884014.0, "z_loss": 0.00042661232873797417 }, { "copy_logits_max": -2.32546329498291, "copy_logits_min": -687500032.0, "copy_num_tokens": 639.5625, "epoch": 1.715037018126117, "gen_logits_max": 3.2670183181762695, "gen_logits_mean": -16.053829193115234, "gen_logits_min": -28.513046264648438, "gen_logits_std": 3.3282361030578613, "gen_loss": 0.28458288311958313, "grad_norm": 0.36169904366121647, "learning_rate": 2.0322947368421053e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9973819404840469, "mean_gen_accuracy": 0.8738392442464828, "mean_token_accuracy": 0.9072151929140091, "num_tokens": 1193129246.0, "sample_num_tokens": 9037.0, "step": 8397, "total_num_tokens": 1193165394.0, "z_loss": 0.00042209032108075917 }, { "copy_logits_max": -7.865373611450195, "copy_logits_min": -750000064.0, "copy_num_tokens": 333.25, "epoch": 1.715241256063314, "gen_logits_max": 4.417270660400391, "gen_logits_mean": -16.43647003173828, "gen_logits_min": -28.570728302001953, "gen_logits_std": 3.335824728012085, "gen_loss": 0.28022998571395874, "grad_norm": 0.3507159390822949, "learning_rate": 2.0321684210526317e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9970224946737289, "mean_gen_accuracy": 0.8742484003305435, "mean_token_accuracy": 0.9031483232975006, "num_tokens": 1193402317.0, "sample_num_tokens": 7096.25, "step": 8398, "total_num_tokens": 1193430702.0, "z_loss": 0.0004178717499598861 }, { "copy_logits_max": -5.838567733764648, "copy_logits_min": -687500032.0, "copy_num_tokens": 485.4375, "epoch": 1.7154454940005106, "gen_logits_max": 4.153825283050537, "gen_logits_mean": -15.762338638305664, "gen_logits_min": -28.557872772216797, "gen_logits_std": 3.307490825653076, "gen_loss": 0.30506476759910583, "grad_norm": 0.36535663027981935, "learning_rate": 2.0320421052631578e-05, "loss": 0.292, "mean_copy_accuracy": 0.9973629266023636, "mean_gen_accuracy": 0.8701923340559006, "mean_token_accuracy": 0.9025136977434158, "num_tokens": 1193673866.0, "sample_num_tokens": 8815.5, "step": 8399, "total_num_tokens": 1193709128.0, "z_loss": 0.0004854290164075792 }, { "copy_logits_max": -5.136748790740967, "copy_logits_min": -687500032.0, "copy_num_tokens": 527.6875, "epoch": 1.7156497319377073, "gen_logits_max": 2.8836145401000977, "gen_logits_mean": -17.234037399291992, "gen_logits_min": -29.792964935302734, "gen_logits_std": 3.359318256378174, "gen_loss": 0.279349684715271, "grad_norm": 0.37156311849425716, "learning_rate": 2.0319157894736843e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9970790296792984, "mean_gen_accuracy": 0.8718966692686081, "mean_token_accuracy": 0.9038326591253281, "num_tokens": 1193949718.0, "sample_num_tokens": 8856.5, "step": 8400, "total_num_tokens": 1193985144.0, "z_loss": 0.0004783541080541909 }, { "copy_logits_max": -8.131823539733887, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.375, "epoch": 1.7158539698749042, "gen_logits_max": 4.359434127807617, "gen_logits_mean": -16.55132293701172, "gen_logits_min": -29.138687133789062, "gen_logits_std": 3.312930107116699, "gen_loss": 0.31774866580963135, "grad_norm": 0.36656858909236495, "learning_rate": 2.0317894736842103e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9971425235271454, "mean_gen_accuracy": 0.8785019516944885, "mean_token_accuracy": 0.9064570963382721, "num_tokens": 1194223559.0, "sample_num_tokens": 7822.25, "step": 8401, "total_num_tokens": 1194254848.0, "z_loss": 0.0005309961270540953 }, { "copy_logits_max": -7.546075344085693, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.4375, "epoch": 1.7160582078121012, "gen_logits_max": 4.407193183898926, "gen_logits_mean": -16.26274871826172, "gen_logits_min": -28.72162628173828, "gen_logits_std": 3.3443210124969482, "gen_loss": 0.2325301617383957, "grad_norm": 0.356880293985582, "learning_rate": 2.0316631578947368e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9979657977819443, "mean_gen_accuracy": 0.8838888108730316, "mean_token_accuracy": 0.9109904021024704, "num_tokens": 1194486864.0, "sample_num_tokens": 9395.0, "step": 8402, "total_num_tokens": 1194524444.0, "z_loss": 0.00041109282756224275 }, { "copy_logits_max": -7.8731865882873535, "copy_logits_min": -750000000.0, "copy_num_tokens": 317.0, "epoch": 1.7162624457492979, "gen_logits_max": 4.657020568847656, "gen_logits_mean": -15.642160415649414, "gen_logits_min": -28.12256622314453, "gen_logits_std": 3.301151990890503, "gen_loss": 0.2942357659339905, "grad_norm": 0.35497805764391355, "learning_rate": 2.0315368421052632e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9971584379673004, "mean_gen_accuracy": 0.8719569593667984, "mean_token_accuracy": 0.9017766863107681, "num_tokens": 1194751494.0, "sample_num_tokens": 6924.0, "step": 8403, "total_num_tokens": 1194779190.0, "z_loss": 0.0004671512870118022 }, { "copy_logits_max": -7.888059616088867, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.5625, "epoch": 1.7164666836864948, "gen_logits_max": 4.165798187255859, "gen_logits_mean": -16.426572799682617, "gen_logits_min": -29.035717010498047, "gen_logits_std": 3.3460161685943604, "gen_loss": 0.26145923137664795, "grad_norm": 0.3820725838668328, "learning_rate": 2.0314105263157896e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9961009919643402, "mean_gen_accuracy": 0.8778112679719925, "mean_token_accuracy": 0.9086443483829498, "num_tokens": 1195000516.0, "sample_num_tokens": 7744.5, "step": 8404, "total_num_tokens": 1195031494.0, "z_loss": 0.00045308665721677244 }, { "copy_logits_max": -8.398295402526855, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.5, "epoch": 1.7166709216236917, "gen_logits_max": 3.1613223552703857, "gen_logits_mean": -18.897262573242188, "gen_logits_min": -30.631818771362305, "gen_logits_std": 3.41400408744812, "gen_loss": 0.2842724323272705, "grad_norm": 0.36534152235058354, "learning_rate": 2.031284210526316e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9964513629674911, "mean_gen_accuracy": 0.8716343939304352, "mean_token_accuracy": 0.9023513942956924, "num_tokens": 1195278443.0, "sample_num_tokens": 9282.25, "step": 8405, "total_num_tokens": 1195315572.0, "z_loss": 0.00046815600944682956 }, { "copy_logits_max": -7.998361110687256, "copy_logits_min": -750000064.0, "copy_num_tokens": 486.875, "epoch": 1.7168751595608884, "gen_logits_max": 3.525451421737671, "gen_logits_mean": -15.764113426208496, "gen_logits_min": -28.235517501831055, "gen_logits_std": 3.2831902503967285, "gen_loss": 0.24103617668151855, "grad_norm": 0.3303299065004284, "learning_rate": 2.031157894736842e-05, "loss": 0.2633, "mean_copy_accuracy": 0.9973139464855194, "mean_gen_accuracy": 0.8806054145097733, "mean_token_accuracy": 0.9082913249731064, "num_tokens": 1195549263.0, "sample_num_tokens": 7818.25, "step": 8406, "total_num_tokens": 1195580536.0, "z_loss": 0.00040752231143414974 }, { "copy_logits_max": -8.63737678527832, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.9375, "epoch": 1.7170793974980851, "gen_logits_max": 4.238811492919922, "gen_logits_mean": -16.795055389404297, "gen_logits_min": -29.043224334716797, "gen_logits_std": 3.328547954559326, "gen_loss": 0.3049432635307312, "grad_norm": 0.33329483023252504, "learning_rate": 2.0310315789473686e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9971987307071686, "mean_gen_accuracy": 0.8776622116565704, "mean_token_accuracy": 0.90726338326931, "num_tokens": 1195822421.0, "sample_num_tokens": 8076.75, "step": 8407, "total_num_tokens": 1195854728.0, "z_loss": 0.0005163163878023624 }, { "copy_logits_max": -8.370009422302246, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.0625, "epoch": 1.717283635435282, "gen_logits_max": 4.201779365539551, "gen_logits_mean": -15.154375076293945, "gen_logits_min": -27.89301300048828, "gen_logits_std": 3.2898406982421875, "gen_loss": 0.2437225580215454, "grad_norm": 0.39356729129753376, "learning_rate": 2.0309052631578947e-05, "loss": 0.2701, "mean_copy_accuracy": 0.997702419757843, "mean_gen_accuracy": 0.8724714070558548, "mean_token_accuracy": 0.9072451889514923, "num_tokens": 1196091644.0, "sample_num_tokens": 8615.5, "step": 8408, "total_num_tokens": 1196126106.0, "z_loss": 0.00042064377339556813 }, { "copy_logits_max": -8.960262298583984, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.6875, "epoch": 1.717487873372479, "gen_logits_max": 3.4528322219848633, "gen_logits_mean": -17.255924224853516, "gen_logits_min": -29.068988800048828, "gen_logits_std": 3.2909631729125977, "gen_loss": 0.28349488973617554, "grad_norm": 0.3637802127557036, "learning_rate": 2.030778947368421e-05, "loss": 0.2859, "mean_copy_accuracy": 0.9971445798873901, "mean_gen_accuracy": 0.875552773475647, "mean_token_accuracy": 0.9034456014633179, "num_tokens": 1196362713.0, "sample_num_tokens": 8584.75, "step": 8409, "total_num_tokens": 1196397052.0, "z_loss": 0.0004590027965605259 }, { "copy_logits_max": -7.749765396118164, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.1875, "epoch": 1.7176921113096757, "gen_logits_max": 2.4768471717834473, "gen_logits_mean": -18.203359603881836, "gen_logits_min": -30.619762420654297, "gen_logits_std": 3.4068472385406494, "gen_loss": 0.24683785438537598, "grad_norm": 0.3831196083503304, "learning_rate": 2.0306526315789472e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9971111714839935, "mean_gen_accuracy": 0.8765366673469543, "mean_token_accuracy": 0.9104458093643188, "num_tokens": 1196630312.0, "sample_num_tokens": 8421.5, "step": 8410, "total_num_tokens": 1196663998.0, "z_loss": 0.00042739612399600446 }, { "copy_logits_max": -7.182822227478027, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.875, "epoch": 1.7178963492468726, "gen_logits_max": 3.6895360946655273, "gen_logits_mean": -16.414714813232422, "gen_logits_min": -27.84457015991211, "gen_logits_std": 3.23245906829834, "gen_loss": 0.2813209593296051, "grad_norm": 0.4058013911011991, "learning_rate": 2.0305263157894736e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9976305514574051, "mean_gen_accuracy": 0.8703996241092682, "mean_token_accuracy": 0.9046191722154617, "num_tokens": 1196921860.0, "sample_num_tokens": 8268.0, "step": 8411, "total_num_tokens": 1196954932.0, "z_loss": 0.0005165180773474276 }, { "copy_logits_max": -6.935061454772949, "copy_logits_min": -750000000.0, "copy_num_tokens": 618.375, "epoch": 1.7181005871840695, "gen_logits_max": 3.1332671642303467, "gen_logits_mean": -17.547828674316406, "gen_logits_min": -30.120014190673828, "gen_logits_std": 3.399057626724243, "gen_loss": 0.2660086154937744, "grad_norm": 0.35541295416678165, "learning_rate": 2.0304e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9969534873962402, "mean_gen_accuracy": 0.8734440803527832, "mean_token_accuracy": 0.9044354259967804, "num_tokens": 1197189638.0, "sample_num_tokens": 9943.0, "step": 8412, "total_num_tokens": 1197229410.0, "z_loss": 0.0004283288726583123 }, { "copy_logits_max": -8.134073257446289, "copy_logits_min": -687500032.0, "copy_num_tokens": 512.5625, "epoch": 1.7183048251212663, "gen_logits_max": 3.771301746368408, "gen_logits_mean": -16.314117431640625, "gen_logits_min": -28.414453506469727, "gen_logits_std": 3.2990102767944336, "gen_loss": 0.2823101580142975, "grad_norm": 0.3334529748843032, "learning_rate": 2.0302736842105265e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9980957955121994, "mean_gen_accuracy": 0.8721312284469604, "mean_token_accuracy": 0.9069134593009949, "num_tokens": 1197487070.0, "sample_num_tokens": 8342.5, "step": 8413, "total_num_tokens": 1197520440.0, "z_loss": 0.00046537391608580947 }, { "copy_logits_max": -6.243657112121582, "copy_logits_min": -687500032.0, "copy_num_tokens": 486.9375, "epoch": 1.718509063058463, "gen_logits_max": 2.883338451385498, "gen_logits_mean": -18.239253997802734, "gen_logits_min": -30.945659637451172, "gen_logits_std": 3.4462437629699707, "gen_loss": 0.24985133111476898, "grad_norm": 0.3551272628436805, "learning_rate": 2.0301473684210526e-05, "loss": 0.2442, "mean_copy_accuracy": 0.9975669234991074, "mean_gen_accuracy": 0.8849693834781647, "mean_token_accuracy": 0.9166475534439087, "num_tokens": 1197751651.0, "sample_num_tokens": 8335.75, "step": 8414, "total_num_tokens": 1197784994.0, "z_loss": 0.00042909581679850817 }, { "copy_logits_max": -5.333491325378418, "copy_logits_min": -750000000.0, "copy_num_tokens": 777.625, "epoch": 1.71871330099566, "gen_logits_max": 3.3824462890625, "gen_logits_mean": -14.917865753173828, "gen_logits_min": -27.273025512695312, "gen_logits_std": 3.256962537765503, "gen_loss": 0.24670574069023132, "grad_norm": 0.3423632384628218, "learning_rate": 2.030021052631579e-05, "loss": 0.2488, "mean_copy_accuracy": 0.9968406856060028, "mean_gen_accuracy": 0.8804077208042145, "mean_token_accuracy": 0.9155018180608749, "num_tokens": 1198041420.0, "sample_num_tokens": 10452.5, "step": 8415, "total_num_tokens": 1198083230.0, "z_loss": 0.00040267404983751476 }, { "copy_logits_max": -4.964913845062256, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.6875, "epoch": 1.7189175389328568, "gen_logits_max": 3.2487521171569824, "gen_logits_mean": -16.6281795501709, "gen_logits_min": -29.78697967529297, "gen_logits_std": 3.39357590675354, "gen_loss": 0.2522810697555542, "grad_norm": 0.3575790509768701, "learning_rate": 2.0298947368421055e-05, "loss": 0.2548, "mean_copy_accuracy": 0.9971519708633423, "mean_gen_accuracy": 0.8818299770355225, "mean_token_accuracy": 0.9124152511358261, "num_tokens": 1198313696.0, "sample_num_tokens": 8991.0, "step": 8416, "total_num_tokens": 1198349660.0, "z_loss": 0.00043888273648917675 }, { "copy_logits_max": -6.937124252319336, "copy_logits_min": -687500032.0, "copy_num_tokens": 266.125, "epoch": 1.7191217768700535, "gen_logits_max": 4.743824005126953, "gen_logits_mean": -16.248130798339844, "gen_logits_min": -28.218488693237305, "gen_logits_std": 3.280458450317383, "gen_loss": 0.3215652108192444, "grad_norm": 0.33542188303335896, "learning_rate": 2.0297684210526316e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9966978132724762, "mean_gen_accuracy": 0.8748122602701187, "mean_token_accuracy": 0.9026407450437546, "num_tokens": 1198571865.0, "sample_num_tokens": 7002.25, "step": 8417, "total_num_tokens": 1198599874.0, "z_loss": 0.0004909336566925049 }, { "copy_logits_max": -5.92926549911499, "copy_logits_min": -750000064.0, "copy_num_tokens": 232.1875, "epoch": 1.7193260148072504, "gen_logits_max": 5.129115104675293, "gen_logits_mean": -15.47420597076416, "gen_logits_min": -27.88829231262207, "gen_logits_std": 3.3009932041168213, "gen_loss": 0.3052598834037781, "grad_norm": 0.3633412425722769, "learning_rate": 2.029642105263158e-05, "loss": 0.2955, "mean_copy_accuracy": 0.9966756999492645, "mean_gen_accuracy": 0.8740949034690857, "mean_token_accuracy": 0.8999030292034149, "num_tokens": 1198844096.0, "sample_num_tokens": 7041.5, "step": 8418, "total_num_tokens": 1198872262.0, "z_loss": 0.0005500325351022184 }, { "copy_logits_max": -7.988058567047119, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.3125, "epoch": 1.7195302527444474, "gen_logits_max": 3.782905101776123, "gen_logits_mean": -16.6456241607666, "gen_logits_min": -29.042421340942383, "gen_logits_std": 3.3807883262634277, "gen_loss": 0.2856661379337311, "grad_norm": 0.35637099463895, "learning_rate": 2.029515789473684e-05, "loss": 0.2692, "mean_copy_accuracy": 0.9974534511566162, "mean_gen_accuracy": 0.8772399425506592, "mean_token_accuracy": 0.9078275859355927, "num_tokens": 1199111164.0, "sample_num_tokens": 8753.0, "step": 8419, "total_num_tokens": 1199146176.0, "z_loss": 0.00047367691877298057 }, { "copy_logits_max": -8.080042839050293, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.4375, "epoch": 1.719734490681644, "gen_logits_max": 3.642014980316162, "gen_logits_mean": -16.892860412597656, "gen_logits_min": -29.58708381652832, "gen_logits_std": 3.3787872791290283, "gen_loss": 0.28217172622680664, "grad_norm": 0.34419474472428097, "learning_rate": 2.029389473684211e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9965200275182724, "mean_gen_accuracy": 0.8730200678110123, "mean_token_accuracy": 0.9050889164209366, "num_tokens": 1199386537.0, "sample_num_tokens": 8102.75, "step": 8420, "total_num_tokens": 1199418948.0, "z_loss": 0.000458639144198969 }, { "copy_logits_max": -7.7189764976501465, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.8125, "epoch": 1.7199387286188408, "gen_logits_max": 4.050715446472168, "gen_logits_mean": -17.68661880493164, "gen_logits_min": -29.8721923828125, "gen_logits_std": 3.3828258514404297, "gen_loss": 0.3086327314376831, "grad_norm": 0.39216581323457117, "learning_rate": 2.029263157894737e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9955465346574783, "mean_gen_accuracy": 0.874517410993576, "mean_token_accuracy": 0.8994891196489334, "num_tokens": 1199643570.0, "sample_num_tokens": 7982.5, "step": 8421, "total_num_tokens": 1199675500.0, "z_loss": 0.0004935641190968454 }, { "copy_logits_max": -7.635730266571045, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.625, "epoch": 1.720142966556038, "gen_logits_max": 4.431595802307129, "gen_logits_mean": -16.10342025756836, "gen_logits_min": -28.88150405883789, "gen_logits_std": 3.359564781188965, "gen_loss": 0.25898832082748413, "grad_norm": 0.35899734741031714, "learning_rate": 2.0291368421052634e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9973173588514328, "mean_gen_accuracy": 0.8859151154756546, "mean_token_accuracy": 0.9126194566488266, "num_tokens": 1199916400.0, "sample_num_tokens": 8059.0, "step": 8422, "total_num_tokens": 1199948636.0, "z_loss": 0.0004555374034680426 }, { "copy_logits_max": -5.706515312194824, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.25, "epoch": 1.7203472044932346, "gen_logits_max": 3.1333210468292236, "gen_logits_mean": -16.93996810913086, "gen_logits_min": -29.399272918701172, "gen_logits_std": 3.327500343322754, "gen_loss": 0.27618134021759033, "grad_norm": 0.3351014928508003, "learning_rate": 2.0290105263157895e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9975903183221817, "mean_gen_accuracy": 0.8780705034732819, "mean_token_accuracy": 0.9074009209871292, "num_tokens": 1200167730.0, "sample_num_tokens": 7453.5, "step": 8423, "total_num_tokens": 1200197544.0, "z_loss": 0.0004817954613827169 }, { "copy_logits_max": -6.724384307861328, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.875, "epoch": 1.7205514424304313, "gen_logits_max": 3.3648481369018555, "gen_logits_mean": -17.73438262939453, "gen_logits_min": -30.768089294433594, "gen_logits_std": 3.422943353652954, "gen_loss": 0.28242021799087524, "grad_norm": 0.33082730923959736, "learning_rate": 2.028884210526316e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9976983964443207, "mean_gen_accuracy": 0.8766092509031296, "mean_token_accuracy": 0.9084167927503586, "num_tokens": 1200432962.0, "sample_num_tokens": 8346.5, "step": 8424, "total_num_tokens": 1200466348.0, "z_loss": 0.0004499576170928776 }, { "copy_logits_max": -5.377882480621338, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.3125, "epoch": 1.7207556803676283, "gen_logits_max": 3.146897315979004, "gen_logits_mean": -17.729759216308594, "gen_logits_min": -30.44283676147461, "gen_logits_std": 3.387908697128296, "gen_loss": 0.3073433041572571, "grad_norm": 0.3635799748134705, "learning_rate": 2.028757894736842e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9970986396074295, "mean_gen_accuracy": 0.8727333694696426, "mean_token_accuracy": 0.9047500491142273, "num_tokens": 1200697945.0, "sample_num_tokens": 8401.75, "step": 8425, "total_num_tokens": 1200731552.0, "z_loss": 0.000498318113386631 }, { "copy_logits_max": -7.298317909240723, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.6875, "epoch": 1.7209599183048252, "gen_logits_max": 3.875004768371582, "gen_logits_mean": -16.982444763183594, "gen_logits_min": -29.45009994506836, "gen_logits_std": 3.3097708225250244, "gen_loss": 0.31032419204711914, "grad_norm": 0.37657898397763145, "learning_rate": 2.0286315789473684e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9969275146722794, "mean_gen_accuracy": 0.8703665733337402, "mean_token_accuracy": 0.9016915559768677, "num_tokens": 1200967714.0, "sample_num_tokens": 7856.0, "step": 8426, "total_num_tokens": 1200999138.0, "z_loss": 0.000503498304169625 }, { "copy_logits_max": -6.424517631530762, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.3125, "epoch": 1.721164156242022, "gen_logits_max": 3.591247320175171, "gen_logits_mean": -16.551753997802734, "gen_logits_min": -28.669401168823242, "gen_logits_std": 3.317214012145996, "gen_loss": 0.2971746623516083, "grad_norm": 0.35514222322707606, "learning_rate": 2.0285052631578945e-05, "loss": 0.2828, "mean_copy_accuracy": 0.996328204870224, "mean_gen_accuracy": 0.8734770119190216, "mean_token_accuracy": 0.90425705909729, "num_tokens": 1201218658.0, "sample_num_tokens": 8244.5, "step": 8427, "total_num_tokens": 1201251636.0, "z_loss": 0.00045923396828584373 }, { "copy_logits_max": -7.787702560424805, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.9375, "epoch": 1.7213683941792188, "gen_logits_max": 4.599510192871094, "gen_logits_mean": -14.765922546386719, "gen_logits_min": -28.68813133239746, "gen_logits_std": 3.3221888542175293, "gen_loss": 0.2717342972755432, "grad_norm": 0.353864489787094, "learning_rate": 2.0283789473684213e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9976872801780701, "mean_gen_accuracy": 0.8716152459383011, "mean_token_accuracy": 0.9056554138660431, "num_tokens": 1201487550.0, "sample_num_tokens": 8514.0, "step": 8428, "total_num_tokens": 1201521606.0, "z_loss": 0.0004202073614578694 }, { "copy_logits_max": -7.333010673522949, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.3125, "epoch": 1.7215726321164158, "gen_logits_max": 4.082372665405273, "gen_logits_mean": -16.005468368530273, "gen_logits_min": -28.81318473815918, "gen_logits_std": 3.3140413761138916, "gen_loss": 0.25052595138549805, "grad_norm": 0.3449339142231738, "learning_rate": 2.0282526315789477e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9975403398275375, "mean_gen_accuracy": 0.8761800974607468, "mean_token_accuracy": 0.9081981629133224, "num_tokens": 1201767671.0, "sample_num_tokens": 10382.25, "step": 8429, "total_num_tokens": 1201809200.0, "z_loss": 0.0004254727391526103 }, { "copy_logits_max": -6.340646743774414, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.1875, "epoch": 1.7217768700536125, "gen_logits_max": 3.305603504180908, "gen_logits_mean": -17.075984954833984, "gen_logits_min": -29.243896484375, "gen_logits_std": 3.360072135925293, "gen_loss": 0.274162232875824, "grad_norm": 0.3646748176943201, "learning_rate": 2.0281263157894738e-05, "loss": 0.3031, "mean_copy_accuracy": 0.9968761056661606, "mean_gen_accuracy": 0.86760613322258, "mean_token_accuracy": 0.8986743837594986, "num_tokens": 1202041793.0, "sample_num_tokens": 8227.25, "step": 8430, "total_num_tokens": 1202074702.0, "z_loss": 0.00040842522867023945 }, { "copy_logits_max": -7.5309576988220215, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.0625, "epoch": 1.7219811079908092, "gen_logits_max": 3.6819865703582764, "gen_logits_mean": -16.360610961914062, "gen_logits_min": -28.9052734375, "gen_logits_std": 3.325165271759033, "gen_loss": 0.28103548288345337, "grad_norm": 0.35470692919240787, "learning_rate": 2.0280000000000002e-05, "loss": 0.274, "mean_copy_accuracy": 0.9967718422412872, "mean_gen_accuracy": 0.8781618177890778, "mean_token_accuracy": 0.9085230827331543, "num_tokens": 1202316315.0, "sample_num_tokens": 9293.75, "step": 8431, "total_num_tokens": 1202353490.0, "z_loss": 0.00042489796760492027 }, { "copy_logits_max": -8.151289939880371, "copy_logits_min": -750000000.0, "copy_num_tokens": 258.6875, "epoch": 1.722185345928006, "gen_logits_max": 3.8987553119659424, "gen_logits_mean": -17.277515411376953, "gen_logits_min": -29.34227752685547, "gen_logits_std": 3.315457344055176, "gen_loss": 0.2862894535064697, "grad_norm": 0.3108570800692241, "learning_rate": 2.0278736842105263e-05, "loss": 0.2576, "mean_copy_accuracy": 0.9969605505466461, "mean_gen_accuracy": 0.8818569630384445, "mean_token_accuracy": 0.9117713272571564, "num_tokens": 1202592838.0, "sample_num_tokens": 6698.0, "step": 8432, "total_num_tokens": 1202619630.0, "z_loss": 0.0004702926962636411 }, { "copy_logits_max": -6.998324394226074, "copy_logits_min": -750000128.0, "copy_num_tokens": 451.4375, "epoch": 1.722389583865203, "gen_logits_max": 4.010462284088135, "gen_logits_mean": -16.37108039855957, "gen_logits_min": -29.02625274658203, "gen_logits_std": 3.308220863342285, "gen_loss": 0.29804715514183044, "grad_norm": 0.33218622536253317, "learning_rate": 2.0277473684210528e-05, "loss": 0.2622, "mean_copy_accuracy": 0.997454822063446, "mean_gen_accuracy": 0.8795728534460068, "mean_token_accuracy": 0.912236675620079, "num_tokens": 1202871595.0, "sample_num_tokens": 7145.75, "step": 8433, "total_num_tokens": 1202900178.0, "z_loss": 0.00047782936599105597 }, { "copy_logits_max": -7.118658542633057, "copy_logits_min": -687500096.0, "copy_num_tokens": 702.625, "epoch": 1.7225938218023997, "gen_logits_max": 3.6738905906677246, "gen_logits_mean": -15.385153770446777, "gen_logits_min": -28.12847137451172, "gen_logits_std": 3.2875800132751465, "gen_loss": 0.24757111072540283, "grad_norm": 0.35538535956274814, "learning_rate": 2.027621052631579e-05, "loss": 0.251, "mean_copy_accuracy": 0.9969824850559235, "mean_gen_accuracy": 0.8840920627117157, "mean_token_accuracy": 0.9147888123989105, "num_tokens": 1203129144.0, "sample_num_tokens": 9507.5, "step": 8434, "total_num_tokens": 1203167174.0, "z_loss": 0.0004472594300750643 }, { "copy_logits_max": -7.104165554046631, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.3125, "epoch": 1.7227980597395967, "gen_logits_max": 4.055335998535156, "gen_logits_mean": -15.94275188446045, "gen_logits_min": -28.188495635986328, "gen_logits_std": 3.296088218688965, "gen_loss": 0.277532696723938, "grad_norm": 0.3621409620451129, "learning_rate": 2.0274947368421053e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9969099164009094, "mean_gen_accuracy": 0.8818312585353851, "mean_token_accuracy": 0.9062721282243729, "num_tokens": 1203382302.0, "sample_num_tokens": 8262.5, "step": 8435, "total_num_tokens": 1203415352.0, "z_loss": 0.0004445661907084286 }, { "copy_logits_max": -7.932681560516357, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.0625, "epoch": 1.7230022976767936, "gen_logits_max": 3.540733575820923, "gen_logits_mean": -16.299755096435547, "gen_logits_min": -28.21372413635254, "gen_logits_std": 3.3099308013916016, "gen_loss": 0.27601709961891174, "grad_norm": 0.3498778559626919, "learning_rate": 2.0273684210526314e-05, "loss": 0.271, "mean_copy_accuracy": 0.9976707845926285, "mean_gen_accuracy": 0.8760169744491577, "mean_token_accuracy": 0.9069783985614777, "num_tokens": 1203663296.0, "sample_num_tokens": 8451.0, "step": 8436, "total_num_tokens": 1203697100.0, "z_loss": 0.0004273373924661428 }, { "copy_logits_max": -9.368721008300781, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.75, "epoch": 1.7232065356139903, "gen_logits_max": 5.354402542114258, "gen_logits_mean": -15.590982437133789, "gen_logits_min": -27.498882293701172, "gen_logits_std": 3.262354850769043, "gen_loss": 0.28284752368927, "grad_norm": 0.3587325199516578, "learning_rate": 2.027242105263158e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9964430034160614, "mean_gen_accuracy": 0.8837514817714691, "mean_token_accuracy": 0.9105497896671295, "num_tokens": 1203948145.0, "sample_num_tokens": 8085.25, "step": 8437, "total_num_tokens": 1203980486.0, "z_loss": 0.0004055363533552736 }, { "copy_logits_max": -7.759027004241943, "copy_logits_min": -750000128.0, "copy_num_tokens": 530.4375, "epoch": 1.723410773551187, "gen_logits_max": 3.970010995864868, "gen_logits_mean": -16.21949005126953, "gen_logits_min": -28.10511016845703, "gen_logits_std": 3.3069865703582764, "gen_loss": 0.31026047468185425, "grad_norm": 0.3569915633139083, "learning_rate": 2.0271157894736842e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9966756999492645, "mean_gen_accuracy": 0.8737805485725403, "mean_token_accuracy": 0.9052044302225113, "num_tokens": 1204220566.0, "sample_num_tokens": 9060.0, "step": 8438, "total_num_tokens": 1204256806.0, "z_loss": 0.0004661301209125668 }, { "copy_logits_max": -8.176461219787598, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.8125, "epoch": 1.723615011488384, "gen_logits_max": 2.5001776218414307, "gen_logits_mean": -19.209110260009766, "gen_logits_min": -31.10132598876953, "gen_logits_std": 3.4387171268463135, "gen_loss": 0.24184954166412354, "grad_norm": 0.35640947330523914, "learning_rate": 2.0269894736842107e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9958342015743256, "mean_gen_accuracy": 0.879009023308754, "mean_token_accuracy": 0.9042805284261703, "num_tokens": 1204477688.0, "sample_num_tokens": 8120.0, "step": 8439, "total_num_tokens": 1204510168.0, "z_loss": 0.00040059350430965424 }, { "copy_logits_max": -6.366672039031982, "copy_logits_min": -687500032.0, "copy_num_tokens": 494.0625, "epoch": 1.7238192494255808, "gen_logits_max": 2.922973155975342, "gen_logits_mean": -18.185108184814453, "gen_logits_min": -30.239166259765625, "gen_logits_std": 3.371767520904541, "gen_loss": 0.27765214443206787, "grad_norm": 0.3618828218219908, "learning_rate": 2.0268631578947368e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9973561316728592, "mean_gen_accuracy": 0.8734309375286102, "mean_token_accuracy": 0.9048508703708649, "num_tokens": 1204760192.0, "sample_num_tokens": 8827.5, "step": 8440, "total_num_tokens": 1204795502.0, "z_loss": 0.00045502925058826804 }, { "copy_logits_max": -7.939745903015137, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.5, "epoch": 1.7240234873627776, "gen_logits_max": 4.641124248504639, "gen_logits_mean": -15.366673469543457, "gen_logits_min": -27.198036193847656, "gen_logits_std": 3.2577388286590576, "gen_loss": 0.28588902950286865, "grad_norm": 0.36964507822442944, "learning_rate": 2.0267368421052632e-05, "loss": 0.2743, "mean_copy_accuracy": 0.997271716594696, "mean_gen_accuracy": 0.879566952586174, "mean_token_accuracy": 0.9075221419334412, "num_tokens": 1205049724.0, "sample_num_tokens": 9791.0, "step": 8441, "total_num_tokens": 1205088888.0, "z_loss": 0.0004649795009754598 }, { "copy_logits_max": -5.443868637084961, "copy_logits_min": -687500032.0, "copy_num_tokens": 663.125, "epoch": 1.7242277252999745, "gen_logits_max": 2.815190076828003, "gen_logits_mean": -17.095020294189453, "gen_logits_min": -29.122589111328125, "gen_logits_std": 3.3377132415771484, "gen_loss": 0.28405940532684326, "grad_norm": 0.41423964837668087, "learning_rate": 2.0266105263157896e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9966866672039032, "mean_gen_accuracy": 0.8757357895374298, "mean_token_accuracy": 0.9058478027582169, "num_tokens": 1205310276.0, "sample_num_tokens": 9248.5, "step": 8442, "total_num_tokens": 1205347270.0, "z_loss": 0.0004721095028799027 }, { "copy_logits_max": -6.550300598144531, "copy_logits_min": -687500032.0, "copy_num_tokens": 396.6875, "epoch": 1.7244319632371714, "gen_logits_max": 3.9240994453430176, "gen_logits_mean": -15.560176849365234, "gen_logits_min": -27.60048484802246, "gen_logits_std": 3.3282485008239746, "gen_loss": 0.2884230613708496, "grad_norm": 0.3292641821349556, "learning_rate": 2.0264842105263157e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9967975914478302, "mean_gen_accuracy": 0.8810464292764664, "mean_token_accuracy": 0.9113510847091675, "num_tokens": 1205586146.0, "sample_num_tokens": 7275.5, "step": 8443, "total_num_tokens": 1205615248.0, "z_loss": 0.0004744892648886889 }, { "copy_logits_max": -7.470087051391602, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.0, "epoch": 1.724636201174368, "gen_logits_max": 3.417259693145752, "gen_logits_mean": -17.516563415527344, "gen_logits_min": -29.320268630981445, "gen_logits_std": 3.358157157897949, "gen_loss": 0.271513968706131, "grad_norm": 0.35701114923285443, "learning_rate": 2.026357894736842e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9960716813802719, "mean_gen_accuracy": 0.8792137205600739, "mean_token_accuracy": 0.9079292267560959, "num_tokens": 1205846818.0, "sample_num_tokens": 7972.5, "step": 8444, "total_num_tokens": 1205878708.0, "z_loss": 0.00045115227112546563 }, { "copy_logits_max": -8.50230884552002, "copy_logits_min": -750000000.0, "copy_num_tokens": 286.4375, "epoch": 1.7248404391115648, "gen_logits_max": 3.8203890323638916, "gen_logits_mean": -18.35894775390625, "gen_logits_min": -29.53131866455078, "gen_logits_std": 3.3222038745880127, "gen_loss": 0.2743701934814453, "grad_norm": 0.38154114326411814, "learning_rate": 2.0262315789473686e-05, "loss": 0.2927, "mean_copy_accuracy": 0.9965922981500626, "mean_gen_accuracy": 0.8737594336271286, "mean_token_accuracy": 0.900651142001152, "num_tokens": 1206099605.0, "sample_num_tokens": 7202.75, "step": 8445, "total_num_tokens": 1206128416.0, "z_loss": 0.00040168131818063557 }, { "copy_logits_max": -8.023469924926758, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.1875, "epoch": 1.725044677048762, "gen_logits_max": 4.109156131744385, "gen_logits_mean": -16.36553192138672, "gen_logits_min": -28.18759536743164, "gen_logits_std": 3.2453453540802, "gen_loss": 0.2966979146003723, "grad_norm": 0.35150237223347164, "learning_rate": 2.026105263157895e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9979356974363327, "mean_gen_accuracy": 0.8738310635089874, "mean_token_accuracy": 0.9030769616365433, "num_tokens": 1206366721.0, "sample_num_tokens": 9070.25, "step": 8446, "total_num_tokens": 1206403002.0, "z_loss": 0.0004954163450747728 }, { "copy_logits_max": -8.8226318359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 264.625, "epoch": 1.7252489149859587, "gen_logits_max": 4.122380256652832, "gen_logits_mean": -17.947437286376953, "gen_logits_min": -29.14398956298828, "gen_logits_std": 3.278517484664917, "gen_loss": 0.2721605598926544, "grad_norm": 0.357575459249733, "learning_rate": 2.025978947368421e-05, "loss": 0.2775, "mean_copy_accuracy": 0.9955552071332932, "mean_gen_accuracy": 0.8769699931144714, "mean_token_accuracy": 0.9041925370693207, "num_tokens": 1206644191.0, "sample_num_tokens": 7013.25, "step": 8447, "total_num_tokens": 1206672244.0, "z_loss": 0.0004470859421417117 }, { "copy_logits_max": -6.8741607666015625, "copy_logits_min": -687500032.0, "copy_num_tokens": 482.125, "epoch": 1.7254531529231554, "gen_logits_max": 4.278087615966797, "gen_logits_mean": -15.978732109069824, "gen_logits_min": -27.563705444335938, "gen_logits_std": 3.2373600006103516, "gen_loss": 0.30465608835220337, "grad_norm": 0.32662553885737855, "learning_rate": 2.0258526315789475e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9965149611234665, "mean_gen_accuracy": 0.877895176410675, "mean_token_accuracy": 0.9061343520879745, "num_tokens": 1206928418.0, "sample_num_tokens": 8874.0, "step": 8448, "total_num_tokens": 1206963914.0, "z_loss": 0.0004985605482943356 }, { "copy_logits_max": -6.8876953125, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.1875, "epoch": 1.7256573908603523, "gen_logits_max": 3.770581007003784, "gen_logits_mean": -16.310684204101562, "gen_logits_min": -27.715839385986328, "gen_logits_std": 3.2321200370788574, "gen_loss": 0.2793189287185669, "grad_norm": 0.34434414118897044, "learning_rate": 2.0257263157894736e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9967200756072998, "mean_gen_accuracy": 0.8758263140916824, "mean_token_accuracy": 0.9066195636987686, "num_tokens": 1207199738.0, "sample_num_tokens": 8929.0, "step": 8449, "total_num_tokens": 1207235454.0, "z_loss": 0.00048531804350204766 }, { "copy_logits_max": -6.271331787109375, "copy_logits_min": -687500032.0, "copy_num_tokens": 441.5625, "epoch": 1.7258616287975492, "gen_logits_max": 3.588921546936035, "gen_logits_mean": -15.40163516998291, "gen_logits_min": -26.775691986083984, "gen_logits_std": 3.164527416229248, "gen_loss": 0.265510618686676, "grad_norm": 0.3697084388070283, "learning_rate": 2.0256e-05, "loss": 0.28, "mean_copy_accuracy": 0.9963077902793884, "mean_gen_accuracy": 0.8739257007837296, "mean_token_accuracy": 0.9031768590211868, "num_tokens": 1207463155.0, "sample_num_tokens": 7602.25, "step": 8450, "total_num_tokens": 1207493564.0, "z_loss": 0.00044999635429121554 }, { "copy_logits_max": -5.980068683624268, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.0, "epoch": 1.726065866734746, "gen_logits_max": 3.7298479080200195, "gen_logits_mean": -15.683358192443848, "gen_logits_min": -27.95311737060547, "gen_logits_std": 3.2949516773223877, "gen_loss": 0.2513883709907532, "grad_norm": 0.35590839365142324, "learning_rate": 2.025473684210526e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9971145391464233, "mean_gen_accuracy": 0.8773664832115173, "mean_token_accuracy": 0.9081418067216873, "num_tokens": 1207716947.0, "sample_num_tokens": 8975.25, "step": 8451, "total_num_tokens": 1207752848.0, "z_loss": 0.0004765388439409435 }, { "copy_logits_max": -7.426941871643066, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.9375, "epoch": 1.7262701046719429, "gen_logits_max": 4.900981426239014, "gen_logits_mean": -13.974472045898438, "gen_logits_min": -25.77437400817871, "gen_logits_std": 3.167160987854004, "gen_loss": 0.2726992964744568, "grad_norm": 0.3379261433295484, "learning_rate": 2.0253473684210526e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9976892918348312, "mean_gen_accuracy": 0.8784181773662567, "mean_token_accuracy": 0.9112849682569504, "num_tokens": 1207999377.0, "sample_num_tokens": 8312.25, "step": 8452, "total_num_tokens": 1208032626.0, "z_loss": 0.0005213301628828049 }, { "copy_logits_max": -6.945672988891602, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.125, "epoch": 1.7264743426091398, "gen_logits_max": 5.256570339202881, "gen_logits_mean": -14.790864944458008, "gen_logits_min": -26.062543869018555, "gen_logits_std": 3.153460741043091, "gen_loss": 0.31486982107162476, "grad_norm": 0.36100551462738845, "learning_rate": 2.025221052631579e-05, "loss": 0.3015, "mean_copy_accuracy": 0.9976425170898438, "mean_gen_accuracy": 0.8694792091846466, "mean_token_accuracy": 0.8984449505805969, "num_tokens": 1208280149.0, "sample_num_tokens": 8572.25, "step": 8453, "total_num_tokens": 1208314438.0, "z_loss": 0.000551733945030719 }, { "copy_logits_max": -7.0839009284973145, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.125, "epoch": 1.7266785805463365, "gen_logits_max": 3.550478935241699, "gen_logits_mean": -16.73451042175293, "gen_logits_min": -28.113544464111328, "gen_logits_std": 3.2414448261260986, "gen_loss": 0.27020272612571716, "grad_norm": 0.34682218135493453, "learning_rate": 2.0250947368421054e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9968772679567337, "mean_gen_accuracy": 0.8784512728452682, "mean_token_accuracy": 0.9090182036161423, "num_tokens": 1208555666.0, "sample_num_tokens": 7653.0, "step": 8454, "total_num_tokens": 1208586278.0, "z_loss": 0.0004349262162577361 }, { "copy_logits_max": -6.633697509765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5, "epoch": 1.7268828184835332, "gen_logits_max": 3.3742733001708984, "gen_logits_mean": -18.16118049621582, "gen_logits_min": -29.785110473632812, "gen_logits_std": 3.36883807182312, "gen_loss": 0.2952624261379242, "grad_norm": 0.35893760943759173, "learning_rate": 2.0249684210526315e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9960605204105377, "mean_gen_accuracy": 0.8721913695335388, "mean_token_accuracy": 0.9030581712722778, "num_tokens": 1208825020.0, "sample_num_tokens": 7765.5, "step": 8455, "total_num_tokens": 1208856082.0, "z_loss": 0.000463100936030969 }, { "copy_logits_max": -6.590192794799805, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.5, "epoch": 1.7270870564207301, "gen_logits_max": 4.322012901306152, "gen_logits_mean": -15.622618675231934, "gen_logits_min": -27.005538940429688, "gen_logits_std": 3.1830763816833496, "gen_loss": 0.2811052203178406, "grad_norm": 0.3508606558425681, "learning_rate": 2.024842105263158e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9974463433027267, "mean_gen_accuracy": 0.8750306069850922, "mean_token_accuracy": 0.9070303291082382, "num_tokens": 1209095541.0, "sample_num_tokens": 8347.75, "step": 8456, "total_num_tokens": 1209128932.0, "z_loss": 0.0004323233151808381 }, { "copy_logits_max": -5.643527030944824, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.75, "epoch": 1.727291294357927, "gen_logits_max": 4.4491777420043945, "gen_logits_mean": -15.546634674072266, "gen_logits_min": -27.325468063354492, "gen_logits_std": 3.2322568893432617, "gen_loss": 0.25845474004745483, "grad_norm": 0.3423321776143722, "learning_rate": 2.0247157894736844e-05, "loss": 0.2658, "mean_copy_accuracy": 0.9969393163919449, "mean_gen_accuracy": 0.877321407198906, "mean_token_accuracy": 0.9088362604379654, "num_tokens": 1209375750.0, "sample_num_tokens": 7207.0, "step": 8457, "total_num_tokens": 1209404578.0, "z_loss": 0.00045204354682937264 }, { "copy_logits_max": -6.27813196182251, "copy_logits_min": -750000000.0, "copy_num_tokens": 342.0625, "epoch": 1.7274955322951238, "gen_logits_max": 3.5425496101379395, "gen_logits_mean": -17.589399337768555, "gen_logits_min": -29.267913818359375, "gen_logits_std": 3.3002333641052246, "gen_loss": 0.27405911684036255, "grad_norm": 0.35478772372022205, "learning_rate": 2.0245894736842105e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9976502358913422, "mean_gen_accuracy": 0.8790781646966934, "mean_token_accuracy": 0.9086024612188339, "num_tokens": 1209624170.0, "sample_num_tokens": 7696.5, "step": 8458, "total_num_tokens": 1209654956.0, "z_loss": 0.0004259725974407047 }, { "copy_logits_max": -7.653567790985107, "copy_logits_min": -687500032.0, "copy_num_tokens": 286.0625, "epoch": 1.7276997702323207, "gen_logits_max": 4.219235420227051, "gen_logits_mean": -17.500823974609375, "gen_logits_min": -29.072246551513672, "gen_logits_std": 3.291421413421631, "gen_loss": 0.27750253677368164, "grad_norm": 0.3869844889768308, "learning_rate": 2.024463157894737e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9974717050790787, "mean_gen_accuracy": 0.8830555081367493, "mean_token_accuracy": 0.9081604927778244, "num_tokens": 1209874396.0, "sample_num_tokens": 7916.0, "step": 8459, "total_num_tokens": 1209906060.0, "z_loss": 0.0004266469622962177 }, { "copy_logits_max": -5.797283172607422, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.625, "epoch": 1.7279040081695176, "gen_logits_max": 3.488945722579956, "gen_logits_mean": -16.210494995117188, "gen_logits_min": -28.268718719482422, "gen_logits_std": 3.2581944465637207, "gen_loss": 0.2623245120048523, "grad_norm": 0.34244028252547903, "learning_rate": 2.024336842105263e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9968544393777847, "mean_gen_accuracy": 0.872982069849968, "mean_token_accuracy": 0.9013690203428268, "num_tokens": 1210140621.0, "sample_num_tokens": 9499.75, "step": 8460, "total_num_tokens": 1210178620.0, "z_loss": 0.00042691713315434754 }, { "copy_logits_max": -5.3402323722839355, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.9375, "epoch": 1.7281082461067143, "gen_logits_max": 4.622173309326172, "gen_logits_mean": -15.671201705932617, "gen_logits_min": -27.690818786621094, "gen_logits_std": 3.2551088333129883, "gen_loss": 0.2795369327068329, "grad_norm": 0.33757388355286594, "learning_rate": 2.0242105263157898e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9961521327495575, "mean_gen_accuracy": 0.8750516772270203, "mean_token_accuracy": 0.9066103994846344, "num_tokens": 1210410158.0, "sample_num_tokens": 7826.5, "step": 8461, "total_num_tokens": 1210441464.0, "z_loss": 0.0004235305532347411 }, { "copy_logits_max": -5.012386322021484, "copy_logits_min": -687500032.0, "copy_num_tokens": 472.8125, "epoch": 1.728312484043911, "gen_logits_max": 2.418862819671631, "gen_logits_mean": -18.342134475708008, "gen_logits_min": -30.550235748291016, "gen_logits_std": 3.414246082305908, "gen_loss": 0.2584173381328583, "grad_norm": 0.3838674862951764, "learning_rate": 2.024084210526316e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9964804798364639, "mean_gen_accuracy": 0.8727462887763977, "mean_token_accuracy": 0.900719940662384, "num_tokens": 1210673791.0, "sample_num_tokens": 7821.75, "step": 8462, "total_num_tokens": 1210705078.0, "z_loss": 0.0004515302716754377 }, { "copy_logits_max": -5.9581618309021, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.8125, "epoch": 1.728516721981108, "gen_logits_max": 4.1446099281311035, "gen_logits_mean": -17.881031036376953, "gen_logits_min": -29.666948318481445, "gen_logits_std": 3.3427891731262207, "gen_loss": 0.3234676718711853, "grad_norm": 0.38827747306031946, "learning_rate": 2.0239578947368423e-05, "loss": 0.296, "mean_copy_accuracy": 0.9969581216573715, "mean_gen_accuracy": 0.8717635273933411, "mean_token_accuracy": 0.8994078785181046, "num_tokens": 1210931364.0, "sample_num_tokens": 6947.5, "step": 8463, "total_num_tokens": 1210959154.0, "z_loss": 0.00047546037239953876 }, { "copy_logits_max": -1.4769561290740967, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.0625, "epoch": 1.7287209599183049, "gen_logits_max": 5.147397994995117, "gen_logits_mean": -14.029671669006348, "gen_logits_min": -26.369579315185547, "gen_logits_std": 3.247537851333618, "gen_loss": 0.23331375420093536, "grad_norm": 0.39047178522570186, "learning_rate": 2.0238315789473684e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9975587427616119, "mean_gen_accuracy": 0.8764287829399109, "mean_token_accuracy": 0.9083525687456131, "num_tokens": 1211189923.0, "sample_num_tokens": 8936.25, "step": 8464, "total_num_tokens": 1211225668.0, "z_loss": 0.0004401675541885197 }, { "copy_logits_max": -5.671489715576172, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.6875, "epoch": 1.7289251978555016, "gen_logits_max": 4.25148868560791, "gen_logits_mean": -15.723381996154785, "gen_logits_min": -28.43291473388672, "gen_logits_std": 3.3112218379974365, "gen_loss": 0.25654512643814087, "grad_norm": 0.365920596517852, "learning_rate": 2.0237052631578948e-05, "loss": 0.2748, "mean_copy_accuracy": 0.996537059545517, "mean_gen_accuracy": 0.8808150142431259, "mean_token_accuracy": 0.9081424921751022, "num_tokens": 1211460274.0, "sample_num_tokens": 8418.5, "step": 8465, "total_num_tokens": 1211493948.0, "z_loss": 0.0004423487407620996 }, { "copy_logits_max": -5.06667423248291, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.3125, "epoch": 1.7291294357926985, "gen_logits_max": 4.460091590881348, "gen_logits_mean": -16.13560676574707, "gen_logits_min": -28.025043487548828, "gen_logits_std": 3.2985129356384277, "gen_loss": 0.26890841126441956, "grad_norm": 0.3600481350507598, "learning_rate": 2.023578947368421e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9958008229732513, "mean_gen_accuracy": 0.8713472038507462, "mean_token_accuracy": 0.8981005251407623, "num_tokens": 1211712450.0, "sample_num_tokens": 7750.0, "step": 8466, "total_num_tokens": 1211743450.0, "z_loss": 0.00040920215542428195 }, { "copy_logits_max": -5.990665435791016, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.25, "epoch": 1.7293336737298954, "gen_logits_max": 4.096114158630371, "gen_logits_mean": -17.048934936523438, "gen_logits_min": -29.360252380371094, "gen_logits_std": 3.3512020111083984, "gen_loss": 0.30615413188934326, "grad_norm": 0.3895178771331782, "learning_rate": 2.0234526315789474e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9955124109983444, "mean_gen_accuracy": 0.8758183270692825, "mean_token_accuracy": 0.9033050984144211, "num_tokens": 1211956145.0, "sample_num_tokens": 8163.75, "step": 8467, "total_num_tokens": 1211988800.0, "z_loss": 0.0004806717042811215 }, { "copy_logits_max": -6.135941028594971, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.1875, "epoch": 1.7295379116670921, "gen_logits_max": 2.4693639278411865, "gen_logits_mean": -19.214061737060547, "gen_logits_min": -31.615215301513672, "gen_logits_std": 3.457820415496826, "gen_loss": 0.2806016504764557, "grad_norm": 0.36899172219243115, "learning_rate": 2.0233263157894734e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9967999160289764, "mean_gen_accuracy": 0.8827188909053802, "mean_token_accuracy": 0.9074197262525558, "num_tokens": 1212218143.0, "sample_num_tokens": 9055.75, "step": 8468, "total_num_tokens": 1212254366.0, "z_loss": 0.00043905360507778823 }, { "copy_logits_max": -8.119976043701172, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.375, "epoch": 1.7297421496042888, "gen_logits_max": 3.8496193885803223, "gen_logits_mean": -18.742097854614258, "gen_logits_min": -30.420394897460938, "gen_logits_std": 3.3874258995056152, "gen_loss": 0.3060857057571411, "grad_norm": 0.3938494535831173, "learning_rate": 2.0232000000000002e-05, "loss": 0.286, "mean_copy_accuracy": 0.9961428344249725, "mean_gen_accuracy": 0.8761117309331894, "mean_token_accuracy": 0.9032350182533264, "num_tokens": 1212463767.0, "sample_num_tokens": 8265.75, "step": 8469, "total_num_tokens": 1212496830.0, "z_loss": 0.0005054422654211521 }, { "copy_logits_max": -5.777470111846924, "copy_logits_min": -687500032.0, "copy_num_tokens": 309.3125, "epoch": 1.729946387541486, "gen_logits_max": 4.522692680358887, "gen_logits_mean": -16.705928802490234, "gen_logits_min": -28.895618438720703, "gen_logits_std": 3.3380932807922363, "gen_loss": 0.2935144007205963, "grad_norm": 0.3575043462407148, "learning_rate": 2.0230736842105266e-05, "loss": 0.2874, "mean_copy_accuracy": 0.99696384370327, "mean_gen_accuracy": 0.8751476258039474, "mean_token_accuracy": 0.9030360579490662, "num_tokens": 1212722899.0, "sample_num_tokens": 7035.75, "step": 8470, "total_num_tokens": 1212751042.0, "z_loss": 0.0004967458662576973 }, { "copy_logits_max": -5.805464744567871, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.4375, "epoch": 1.7301506254786827, "gen_logits_max": 4.419620037078857, "gen_logits_mean": -15.792490005493164, "gen_logits_min": -28.39321517944336, "gen_logits_std": 3.3156352043151855, "gen_loss": 0.2710117995738983, "grad_norm": 0.3608180540563139, "learning_rate": 2.0229473684210527e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9972269237041473, "mean_gen_accuracy": 0.8688062727451324, "mean_token_accuracy": 0.9046341776847839, "num_tokens": 1213002461.0, "sample_num_tokens": 8167.75, "step": 8471, "total_num_tokens": 1213035132.0, "z_loss": 0.000457396061392501 }, { "copy_logits_max": -5.803706169128418, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.5, "epoch": 1.7303548634158794, "gen_logits_max": 4.7432146072387695, "gen_logits_mean": -15.620965003967285, "gen_logits_min": -28.472274780273438, "gen_logits_std": 3.348432779312134, "gen_loss": 0.24800002574920654, "grad_norm": 0.34297721443738993, "learning_rate": 2.022821052631579e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9977308362722397, "mean_gen_accuracy": 0.8706096708774567, "mean_token_accuracy": 0.9048059582710266, "num_tokens": 1213276047.0, "sample_num_tokens": 9305.75, "step": 8472, "total_num_tokens": 1213313270.0, "z_loss": 0.00040333555079996586 }, { "copy_logits_max": -7.59262228012085, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.125, "epoch": 1.7305591013530763, "gen_logits_max": 4.372976303100586, "gen_logits_mean": -16.323394775390625, "gen_logits_min": -28.784791946411133, "gen_logits_std": 3.333482265472412, "gen_loss": 0.30573970079421997, "grad_norm": 0.3601372367830848, "learning_rate": 2.0226947368421053e-05, "loss": 0.3078, "mean_copy_accuracy": 0.9952742606401443, "mean_gen_accuracy": 0.8662830889225006, "mean_token_accuracy": 0.8955797404050827, "num_tokens": 1213553414.0, "sample_num_tokens": 7281.5, "step": 8473, "total_num_tokens": 1213582540.0, "z_loss": 0.0004976552445441484 }, { "copy_logits_max": -4.725404739379883, "copy_logits_min": -687500032.0, "copy_num_tokens": 579.75, "epoch": 1.7307633392902733, "gen_logits_max": 4.176909446716309, "gen_logits_mean": -16.20140266418457, "gen_logits_min": -28.611976623535156, "gen_logits_std": 3.3545541763305664, "gen_loss": 0.28217440843582153, "grad_norm": 0.3728286716861852, "learning_rate": 2.0225684210526317e-05, "loss": 0.267, "mean_copy_accuracy": 0.9963820278644562, "mean_gen_accuracy": 0.8746767491102219, "mean_token_accuracy": 0.908221960067749, "num_tokens": 1213842617.0, "sample_num_tokens": 9850.75, "step": 8474, "total_num_tokens": 1213882020.0, "z_loss": 0.0004487655241973698 }, { "copy_logits_max": -4.138737678527832, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.0625, "epoch": 1.73096757722747, "gen_logits_max": 4.367795944213867, "gen_logits_mean": -15.820493698120117, "gen_logits_min": -28.59130859375, "gen_logits_std": 3.3300986289978027, "gen_loss": 0.2660295367240906, "grad_norm": 0.3347844169692492, "learning_rate": 2.0224421052631578e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9979220330715179, "mean_gen_accuracy": 0.8774484097957611, "mean_token_accuracy": 0.908097580075264, "num_tokens": 1214112041.0, "sample_num_tokens": 8076.25, "step": 8475, "total_num_tokens": 1214144346.0, "z_loss": 0.000427853170549497 }, { "copy_logits_max": -6.105208396911621, "copy_logits_min": -687500032.0, "copy_num_tokens": 383.0625, "epoch": 1.7311718151646667, "gen_logits_max": 2.855959415435791, "gen_logits_mean": -18.653663635253906, "gen_logits_min": -30.628164291381836, "gen_logits_std": 3.41717267036438, "gen_loss": 0.2758389115333557, "grad_norm": 0.35388446106140703, "learning_rate": 2.0223157894736842e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9974375814199448, "mean_gen_accuracy": 0.8734224438667297, "mean_token_accuracy": 0.903458908200264, "num_tokens": 1214368180.0, "sample_num_tokens": 7938.0, "step": 8476, "total_num_tokens": 1214399932.0, "z_loss": 0.00046039343578740954 }, { "copy_logits_max": -5.162610054016113, "copy_logits_min": -750000000.0, "copy_num_tokens": 348.9375, "epoch": 1.7313760531018638, "gen_logits_max": 4.726192951202393, "gen_logits_mean": -15.336735725402832, "gen_logits_min": -27.334575653076172, "gen_logits_std": 3.293675422668457, "gen_loss": 0.2938489317893982, "grad_norm": 0.34780703310133065, "learning_rate": 2.0221894736842106e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9967972040176392, "mean_gen_accuracy": 0.8746150583028793, "mean_token_accuracy": 0.9046162813901901, "num_tokens": 1214636043.0, "sample_num_tokens": 7505.75, "step": 8477, "total_num_tokens": 1214666066.0, "z_loss": 0.000432923756306991 }, { "copy_logits_max": -6.75595760345459, "copy_logits_min": -750000000.0, "copy_num_tokens": 291.0625, "epoch": 1.7315802910390605, "gen_logits_max": 3.8086109161376953, "gen_logits_mean": -18.84161376953125, "gen_logits_min": -30.684858322143555, "gen_logits_std": 3.4303627014160156, "gen_loss": 0.27002209424972534, "grad_norm": 0.3583905597358079, "learning_rate": 2.022063157894737e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9965335875749588, "mean_gen_accuracy": 0.8809919655323029, "mean_token_accuracy": 0.9069985151290894, "num_tokens": 1214902442.0, "sample_num_tokens": 7158.5, "step": 8478, "total_num_tokens": 1214931076.0, "z_loss": 0.00040401663864031434 }, { "copy_logits_max": -7.590790271759033, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.25, "epoch": 1.7317845289762572, "gen_logits_max": 4.021298408508301, "gen_logits_mean": -15.704405784606934, "gen_logits_min": -28.120189666748047, "gen_logits_std": 3.349517345428467, "gen_loss": 0.27259212732315063, "grad_norm": 0.34909918815298774, "learning_rate": 2.021936842105263e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9967916905879974, "mean_gen_accuracy": 0.8767005950212479, "mean_token_accuracy": 0.9032018482685089, "num_tokens": 1215162623.0, "sample_num_tokens": 8728.25, "step": 8479, "total_num_tokens": 1215197536.0, "z_loss": 0.0004462831129785627 }, { "copy_logits_max": -7.549816131591797, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.4375, "epoch": 1.7319887669134542, "gen_logits_max": 2.480523109436035, "gen_logits_mean": -19.032888412475586, "gen_logits_min": -31.11321258544922, "gen_logits_std": 3.4650299549102783, "gen_loss": 0.24242788553237915, "grad_norm": 0.3408447410249773, "learning_rate": 2.0218105263157896e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9969256669282913, "mean_gen_accuracy": 0.8781255334615707, "mean_token_accuracy": 0.9096790552139282, "num_tokens": 1215441568.0, "sample_num_tokens": 7514.5, "step": 8480, "total_num_tokens": 1215471626.0, "z_loss": 0.0004264203307684511 }, { "copy_logits_max": -5.772216320037842, "copy_logits_min": -750000000.0, "copy_num_tokens": 666.4375, "epoch": 1.732193004850651, "gen_logits_max": 3.048027753829956, "gen_logits_mean": -17.30759048461914, "gen_logits_min": -30.11785316467285, "gen_logits_std": 3.428262948989868, "gen_loss": 0.259260892868042, "grad_norm": 0.3417217255961257, "learning_rate": 2.0216842105263157e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9973659068346024, "mean_gen_accuracy": 0.8767335116863251, "mean_token_accuracy": 0.9094887971878052, "num_tokens": 1215711967.0, "sample_num_tokens": 9821.75, "step": 8481, "total_num_tokens": 1215751254.0, "z_loss": 0.0005297142779454589 }, { "copy_logits_max": -9.187143325805664, "copy_logits_min": -687500032.0, "copy_num_tokens": 311.625, "epoch": 1.7323972427878478, "gen_logits_max": 3.4597716331481934, "gen_logits_mean": -18.72811508178711, "gen_logits_min": -30.615097045898438, "gen_logits_std": 3.442495822906494, "gen_loss": 0.27597546577453613, "grad_norm": 0.37147864632975547, "learning_rate": 2.021557894736842e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9962116628885269, "mean_gen_accuracy": 0.8777609914541245, "mean_token_accuracy": 0.9008399993181229, "num_tokens": 1215960651.0, "sample_num_tokens": 7302.75, "step": 8482, "total_num_tokens": 1215989862.0, "z_loss": 0.0004224043805152178 }, { "copy_logits_max": -7.787311553955078, "copy_logits_min": -625000064.0, "copy_num_tokens": 562.0, "epoch": 1.7326014807250447, "gen_logits_max": 4.393289566040039, "gen_logits_mean": -16.651695251464844, "gen_logits_min": -29.197795867919922, "gen_logits_std": 3.3685102462768555, "gen_loss": 0.27967119216918945, "grad_norm": 0.3582693800028382, "learning_rate": 2.0214315789473686e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9976590573787689, "mean_gen_accuracy": 0.8775664120912552, "mean_token_accuracy": 0.9091816246509552, "num_tokens": 1216233153.0, "sample_num_tokens": 9056.75, "step": 8483, "total_num_tokens": 1216269380.0, "z_loss": 0.0004946059780195355 }, { "copy_logits_max": -8.327936172485352, "copy_logits_min": -687500032.0, "copy_num_tokens": 492.6875, "epoch": 1.7328057186622416, "gen_logits_max": 4.86368465423584, "gen_logits_mean": -14.695014953613281, "gen_logits_min": -27.07280731201172, "gen_logits_std": 3.273261070251465, "gen_loss": 0.2832053303718567, "grad_norm": 0.36803597383860953, "learning_rate": 2.0213052631578946e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9967973083257675, "mean_gen_accuracy": 0.8734262436628342, "mean_token_accuracy": 0.9035029411315918, "num_tokens": 1216520743.0, "sample_num_tokens": 8699.25, "step": 8484, "total_num_tokens": 1216555540.0, "z_loss": 0.00046881314483471215 }, { "copy_logits_max": -7.449179649353027, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.0625, "epoch": 1.7330099565994384, "gen_logits_max": 4.492926120758057, "gen_logits_mean": -15.897493362426758, "gen_logits_min": -28.242782592773438, "gen_logits_std": 3.3013219833374023, "gen_loss": 0.3056066930294037, "grad_norm": 0.39824681608211554, "learning_rate": 2.021178947368421e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9963031560182571, "mean_gen_accuracy": 0.8695380240678787, "mean_token_accuracy": 0.9024202823638916, "num_tokens": 1216787747.0, "sample_num_tokens": 8443.25, "step": 8485, "total_num_tokens": 1216821520.0, "z_loss": 0.0005288891261443496 }, { "copy_logits_max": -8.102790832519531, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.0625, "epoch": 1.733214194536635, "gen_logits_max": 3.8076813220977783, "gen_logits_mean": -17.62359619140625, "gen_logits_min": -29.383878707885742, "gen_logits_std": 3.3549959659576416, "gen_loss": 0.2650289535522461, "grad_norm": 0.32755583540778305, "learning_rate": 2.0210526315789475e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9972462058067322, "mean_gen_accuracy": 0.8826228082180023, "mean_token_accuracy": 0.9127385318279266, "num_tokens": 1217066082.0, "sample_num_tokens": 7871.5, "step": 8486, "total_num_tokens": 1217097568.0, "z_loss": 0.00045057665556669235 }, { "copy_logits_max": -8.445001602172852, "copy_logits_min": -750000000.0, "copy_num_tokens": 213.125, "epoch": 1.733418432473832, "gen_logits_max": 4.71938943862915, "gen_logits_mean": -16.735458374023438, "gen_logits_min": -28.688369750976562, "gen_logits_std": 3.3095641136169434, "gen_loss": 0.26967722177505493, "grad_norm": 0.3404033490781336, "learning_rate": 2.020926315789474e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9969480931758881, "mean_gen_accuracy": 0.8898699879646301, "mean_token_accuracy": 0.9118436276912689, "num_tokens": 1217321807.0, "sample_num_tokens": 7009.75, "step": 8487, "total_num_tokens": 1217349846.0, "z_loss": 0.0004376363067422062 }, { "copy_logits_max": -7.483908653259277, "copy_logits_min": -625000064.0, "copy_num_tokens": 374.5, "epoch": 1.733622670411029, "gen_logits_max": 3.918694019317627, "gen_logits_mean": -16.69283103942871, "gen_logits_min": -28.731990814208984, "gen_logits_std": 3.3336243629455566, "gen_loss": 0.28334468603134155, "grad_norm": 0.37069389682677656, "learning_rate": 2.0208e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9970990866422653, "mean_gen_accuracy": 0.8747478425502777, "mean_token_accuracy": 0.9047513455152512, "num_tokens": 1217596439.0, "sample_num_tokens": 8022.25, "step": 8488, "total_num_tokens": 1217628528.0, "z_loss": 0.00046080275205895305 }, { "copy_logits_max": -8.393774032592773, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.0625, "epoch": 1.7338269083482256, "gen_logits_max": 3.46384334564209, "gen_logits_mean": -16.926715850830078, "gen_logits_min": -28.673471450805664, "gen_logits_std": 3.3032593727111816, "gen_loss": 0.2559906542301178, "grad_norm": 0.32434923891180234, "learning_rate": 2.0206736842105265e-05, "loss": 0.2507, "mean_copy_accuracy": 0.9969127923250198, "mean_gen_accuracy": 0.8865527212619781, "mean_token_accuracy": 0.9136275500059128, "num_tokens": 1217865681.0, "sample_num_tokens": 9447.25, "step": 8489, "total_num_tokens": 1217903470.0, "z_loss": 0.000360820529749617 }, { "copy_logits_max": -8.69222354888916, "copy_logits_min": -750000064.0, "copy_num_tokens": 442.3125, "epoch": 1.7340311462854225, "gen_logits_max": 4.951671600341797, "gen_logits_mean": -14.90023136138916, "gen_logits_min": -27.042346954345703, "gen_logits_std": 3.292391300201416, "gen_loss": 0.32319504022598267, "grad_norm": 0.3511770544164662, "learning_rate": 2.0205473684210526e-05, "loss": 0.3028, "mean_copy_accuracy": 0.9963988214731216, "mean_gen_accuracy": 0.8699037134647369, "mean_token_accuracy": 0.8974645286798477, "num_tokens": 1218134610.0, "sample_num_tokens": 9212.0, "step": 8490, "total_num_tokens": 1218171458.0, "z_loss": 0.0005276022711768746 }, { "copy_logits_max": -7.835379600524902, "copy_logits_min": -687500032.0, "copy_num_tokens": 713.375, "epoch": 1.7342353842226195, "gen_logits_max": 2.8528246879577637, "gen_logits_mean": -16.45175552368164, "gen_logits_min": -28.465225219726562, "gen_logits_std": 3.3109893798828125, "gen_loss": 0.23602592945098877, "grad_norm": 0.36545011799269134, "learning_rate": 2.020421052631579e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9975068718194962, "mean_gen_accuracy": 0.8755470812320709, "mean_token_accuracy": 0.9074641764163971, "num_tokens": 1218408737.0, "sample_num_tokens": 9948.75, "step": 8491, "total_num_tokens": 1218448532.0, "z_loss": 0.0003904705517925322 }, { "copy_logits_max": -7.927092552185059, "copy_logits_min": -625000000.0, "copy_num_tokens": 387.3125, "epoch": 1.7344396221598162, "gen_logits_max": 3.0794360637664795, "gen_logits_mean": -19.03403091430664, "gen_logits_min": -31.044248580932617, "gen_logits_std": 3.485215902328491, "gen_loss": 0.25574442744255066, "grad_norm": 0.31635510947117085, "learning_rate": 2.020294736842105e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9960557371377945, "mean_gen_accuracy": 0.8807056695222855, "mean_token_accuracy": 0.9076816141605377, "num_tokens": 1218691388.0, "sample_num_tokens": 7854.0, "step": 8492, "total_num_tokens": 1218722804.0, "z_loss": 0.00039200077299028635 }, { "copy_logits_max": -9.903335571289062, "copy_logits_min": -750000064.0, "copy_num_tokens": 351.8125, "epoch": 1.7346438600970129, "gen_logits_max": 3.887932300567627, "gen_logits_mean": -17.69525146484375, "gen_logits_min": -29.941608428955078, "gen_logits_std": 3.383728504180908, "gen_loss": 0.2935755252838135, "grad_norm": 0.34542165962718224, "learning_rate": 2.0201684210526315e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9970238506793976, "mean_gen_accuracy": 0.8793717175722122, "mean_token_accuracy": 0.9038678407669067, "num_tokens": 1218940362.0, "sample_num_tokens": 7751.0, "step": 8493, "total_num_tokens": 1218971366.0, "z_loss": 0.00038955535273998976 }, { "copy_logits_max": -8.708096504211426, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.5, "epoch": 1.7348480980342098, "gen_logits_max": 3.6611459255218506, "gen_logits_mean": -15.906208038330078, "gen_logits_min": -28.076828002929688, "gen_logits_std": 3.348440647125244, "gen_loss": 0.2684337794780731, "grad_norm": 0.3578490834701585, "learning_rate": 2.020042105263158e-05, "loss": 0.272, "mean_copy_accuracy": 0.9971330463886261, "mean_gen_accuracy": 0.8751303553581238, "mean_token_accuracy": 0.9059107601642609, "num_tokens": 1219202963.0, "sample_num_tokens": 8654.75, "step": 8494, "total_num_tokens": 1219237582.0, "z_loss": 0.0003817219694610685 }, { "copy_logits_max": -9.187525749206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.875, "epoch": 1.7350523359714067, "gen_logits_max": 4.007246971130371, "gen_logits_mean": -17.471778869628906, "gen_logits_min": -29.7216854095459, "gen_logits_std": 3.4186244010925293, "gen_loss": 0.2223559468984604, "grad_norm": 0.34173323690737817, "learning_rate": 2.0199157894736844e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9970786571502686, "mean_gen_accuracy": 0.8816671967506409, "mean_token_accuracy": 0.9097800850868225, "num_tokens": 1219472588.0, "sample_num_tokens": 9064.0, "step": 8495, "total_num_tokens": 1219508844.0, "z_loss": 0.00034491304541006684 }, { "copy_logits_max": -9.868209838867188, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.3125, "epoch": 1.7352565739086034, "gen_logits_max": 4.523018836975098, "gen_logits_mean": -15.104814529418945, "gen_logits_min": -27.225486755371094, "gen_logits_std": 3.269570827484131, "gen_loss": 0.27159321308135986, "grad_norm": 0.39631522550264603, "learning_rate": 2.0197894736842108e-05, "loss": 0.2771, "mean_copy_accuracy": 0.997117429971695, "mean_gen_accuracy": 0.876897856593132, "mean_token_accuracy": 0.9056959599256516, "num_tokens": 1219729957.0, "sample_num_tokens": 9264.75, "step": 8496, "total_num_tokens": 1219767016.0, "z_loss": 0.00042500137351453304 }, { "copy_logits_max": -9.296632766723633, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.3125, "epoch": 1.7354608118458004, "gen_logits_max": 3.95810866355896, "gen_logits_mean": -15.977375030517578, "gen_logits_min": -28.49755096435547, "gen_logits_std": 3.3253073692321777, "gen_loss": 0.30362528562545776, "grad_norm": 0.32087290518155315, "learning_rate": 2.019663157894737e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9973534345626831, "mean_gen_accuracy": 0.8781743198633194, "mean_token_accuracy": 0.9085209965705872, "num_tokens": 1220009894.0, "sample_num_tokens": 7732.0, "step": 8497, "total_num_tokens": 1220040822.0, "z_loss": 0.0004889846313744783 }, { "copy_logits_max": -8.769428253173828, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.6875, "epoch": 1.7356650497829973, "gen_logits_max": 5.084563255310059, "gen_logits_mean": -15.820189476013184, "gen_logits_min": -28.270790100097656, "gen_logits_std": 3.3218183517456055, "gen_loss": 0.2911149859428406, "grad_norm": 0.34225915047932404, "learning_rate": 2.0195368421052633e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9964036345481873, "mean_gen_accuracy": 0.8775300681591034, "mean_token_accuracy": 0.9068592488765717, "num_tokens": 1220261934.0, "sample_num_tokens": 8314.5, "step": 8498, "total_num_tokens": 1220295192.0, "z_loss": 0.0004632821655832231 }, { "copy_logits_max": -9.349084854125977, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.0625, "epoch": 1.735869287720194, "gen_logits_max": 3.799135684967041, "gen_logits_mean": -17.422992706298828, "gen_logits_min": -29.513290405273438, "gen_logits_std": 3.391735315322876, "gen_loss": 0.25811082124710083, "grad_norm": 0.36904864180332764, "learning_rate": 2.0194105263157894e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9959666132926941, "mean_gen_accuracy": 0.8748032450675964, "mean_token_accuracy": 0.9029803276062012, "num_tokens": 1220518887.0, "sample_num_tokens": 8286.25, "step": 8499, "total_num_tokens": 1220552032.0, "z_loss": 0.000393529306165874 }, { "epoch": 1.7360735256573907, "grad_norm": 0.34649834485775066, "learning_rate": 2.019284210526316e-05, "loss": 0.2893, "step": 8500 }, { "epoch": 1.7360735256573907, "eval_copy_logits_max": -8.621410369873047, "eval_copy_logits_min": -85.4028091430664, "eval_gen_logits_max": 2.1312527656555176, "eval_gen_logits_mean": -22.48244857788086, "eval_gen_logits_min": -33.74103546142578, "eval_gen_logits_std": 3.5220913887023926, "eval_gen_loss": 0.3015654683113098, "eval_loss": 0.2673618495464325, "eval_mean_copy_accuracy": 0.9921765029430389, "eval_mean_gen_accuracy": 0.8905404806137085, "eval_mean_token_accuracy": 0.903855711221695, "eval_num_tokens": 1220813526.0, "eval_runtime": 0.6737, "eval_samples_per_second": 11.875, "eval_steps_per_second": 2.969, "eval_total_num_tokens": 1220813526.0, "eval_z_loss": 0.00044124911073595285, "step": 8500 }, { "copy_logits_max": -9.941936492919922, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.75, "epoch": 1.7360735256573907, "gen_logits_max": 4.894941806793213, "gen_logits_mean": -16.656814575195312, "gen_logits_min": -28.40203857421875, "gen_logits_std": 3.333533763885498, "gen_loss": 0.332329660654068, "grad_norm": 0.3436415087140621, "learning_rate": 2.019157894736842e-05, "loss": 0.2774, "mean_copy_accuracy": 0.997034952044487, "mean_gen_accuracy": 0.877020388841629, "mean_token_accuracy": 0.9049402177333832, "num_tokens": 228127.0, "sample_num_tokens": 8341.75, "step": 8501, "total_num_tokens": 261494.0, "z_loss": 0.0005007336148992181 }, { "copy_logits_max": -7.786556243896484, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.25, "epoch": 1.7362777635945879, "gen_logits_max": 3.5853192806243896, "gen_logits_mean": -17.782854080200195, "gen_logits_min": -30.040803909301758, "gen_logits_std": 3.3862409591674805, "gen_loss": 0.307740718126297, "grad_norm": 0.3564141538876567, "learning_rate": 2.0190315789473687e-05, "loss": 0.2926, "mean_copy_accuracy": 0.9958224147558212, "mean_gen_accuracy": 0.8704845905303955, "mean_token_accuracy": 0.8998609036207199, "num_tokens": 479884.0, "sample_num_tokens": 8638.5, "step": 8502, "total_num_tokens": 514438.0, "z_loss": 0.00047951267333701253 }, { "copy_logits_max": -6.384957313537598, "copy_logits_min": -625000064.0, "copy_num_tokens": 797.625, "epoch": 1.7364820015317846, "gen_logits_max": 2.878444194793701, "gen_logits_mean": -17.04638671875, "gen_logits_min": -29.490182876586914, "gen_logits_std": 3.4057936668395996, "gen_loss": 0.2530950605869293, "grad_norm": 0.34470388002212604, "learning_rate": 2.0189052631578948e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9959936439990997, "mean_gen_accuracy": 0.8790351748466492, "mean_token_accuracy": 0.9034820795059204, "num_tokens": 738688.0, "sample_num_tokens": 10540.5, "step": 8503, "total_num_tokens": 780850.0, "z_loss": 0.00047103402903303504 }, { "copy_logits_max": -4.660237789154053, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.625, "epoch": 1.7366862394689813, "gen_logits_max": 3.0105929374694824, "gen_logits_mean": -16.62063980102539, "gen_logits_min": -29.749420166015625, "gen_logits_std": 3.4036757946014404, "gen_loss": 0.23264114558696747, "grad_norm": 0.3556006911085105, "learning_rate": 2.0187789473684212e-05, "loss": 0.279, "mean_copy_accuracy": 0.9962573498487473, "mean_gen_accuracy": 0.8710781037807465, "mean_token_accuracy": 0.9042713195085526, "num_tokens": 1012889.0, "sample_num_tokens": 8437.25, "step": 8504, "total_num_tokens": 1046638.0, "z_loss": 0.0004804349737241864 }, { "copy_logits_max": -7.426990032196045, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.375, "epoch": 1.7368904774061782, "gen_logits_max": 4.696101665496826, "gen_logits_mean": -16.64008140563965, "gen_logits_min": -29.554424285888672, "gen_logits_std": 3.3761274814605713, "gen_loss": 0.280466765165329, "grad_norm": 0.3615790034191042, "learning_rate": 2.0186526315789473e-05, "loss": 0.2859, "mean_copy_accuracy": 0.99733567237854, "mean_gen_accuracy": 0.8697689175605774, "mean_token_accuracy": 0.9020209759473801, "num_tokens": 1299342.0, "sample_num_tokens": 7672.0, "step": 8505, "total_num_tokens": 1330030.0, "z_loss": 0.0005024807760491967 }, { "copy_logits_max": -6.009876728057861, "copy_logits_min": -750000064.0, "copy_num_tokens": 552.75, "epoch": 1.7370947153433751, "gen_logits_max": 3.817939043045044, "gen_logits_mean": -15.962860107421875, "gen_logits_min": -28.636215209960938, "gen_logits_std": 3.3362743854522705, "gen_loss": 0.2780556082725525, "grad_norm": 0.352448910256492, "learning_rate": 2.0185263157894738e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9977343678474426, "mean_gen_accuracy": 0.8718505650758743, "mean_token_accuracy": 0.9099367558956146, "num_tokens": 1592184.0, "sample_num_tokens": 8942.5, "step": 8506, "total_num_tokens": 1627954.0, "z_loss": 0.0005441705579869449 }, { "copy_logits_max": -7.154953956604004, "copy_logits_min": -750000128.0, "copy_num_tokens": 429.375, "epoch": 1.7372989532805718, "gen_logits_max": 2.5408973693847656, "gen_logits_mean": -19.81495475769043, "gen_logits_min": -31.828279495239258, "gen_logits_std": 3.484299898147583, "gen_loss": 0.2768060564994812, "grad_norm": 0.3994952482995486, "learning_rate": 2.0184e-05, "loss": 0.2857, "mean_copy_accuracy": 0.9978561401367188, "mean_gen_accuracy": 0.8732662349939346, "mean_token_accuracy": 0.9022817313671112, "num_tokens": 1843159.0, "sample_num_tokens": 8321.75, "step": 8507, "total_num_tokens": 1876446.0, "z_loss": 0.0004863714275415987 }, { "copy_logits_max": -8.394638061523438, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.1875, "epoch": 1.7375031912177687, "gen_logits_max": 4.8587846755981445, "gen_logits_mean": -15.680171966552734, "gen_logits_min": -28.669660568237305, "gen_logits_std": 3.334540843963623, "gen_loss": 0.2680432200431824, "grad_norm": 0.36504766553535056, "learning_rate": 2.0182736842105263e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9959048181772232, "mean_gen_accuracy": 0.8804390281438828, "mean_token_accuracy": 0.9058229625225067, "num_tokens": 2100750.0, "sample_num_tokens": 7788.5, "step": 8508, "total_num_tokens": 2131904.0, "z_loss": 0.00046127603854984045 }, { "copy_logits_max": -10.213811874389648, "copy_logits_min": -750000064.0, "copy_num_tokens": 319.75, "epoch": 1.7377074291549657, "gen_logits_max": 3.787762403488159, "gen_logits_mean": -17.86631965637207, "gen_logits_min": -30.176471710205078, "gen_logits_std": 3.38143253326416, "gen_loss": 0.2797239422798157, "grad_norm": 0.33586432919478654, "learning_rate": 2.0181473684210524e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9955659508705139, "mean_gen_accuracy": 0.8835161924362183, "mean_token_accuracy": 0.9100765585899353, "num_tokens": 2375305.0, "sample_num_tokens": 7904.25, "step": 8509, "total_num_tokens": 2406922.0, "z_loss": 0.00041862393845804036 }, { "copy_logits_max": -6.6758503913879395, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.125, "epoch": 1.7379116670921624, "gen_logits_max": 3.2060012817382812, "gen_logits_mean": -17.253036499023438, "gen_logits_min": -29.710464477539062, "gen_logits_std": 3.4114859104156494, "gen_loss": 0.26281511783599854, "grad_norm": 0.32161421231653475, "learning_rate": 2.018021052631579e-05, "loss": 0.2475, "mean_copy_accuracy": 0.9968093782663345, "mean_gen_accuracy": 0.8830104470252991, "mean_token_accuracy": 0.9154670387506485, "num_tokens": 2656275.0, "sample_num_tokens": 6712.25, "step": 8510, "total_num_tokens": 2683124.0, "z_loss": 0.00042954791570082307 }, { "copy_logits_max": -6.652646541595459, "copy_logits_min": -687500032.0, "copy_num_tokens": 515.5, "epoch": 1.738115905029359, "gen_logits_max": 3.0758588314056396, "gen_logits_mean": -17.407859802246094, "gen_logits_min": -29.627965927124023, "gen_logits_std": 3.4167747497558594, "gen_loss": 0.29846203327178955, "grad_norm": 0.35581833385661854, "learning_rate": 2.0178947368421056e-05, "loss": 0.2844, "mean_copy_accuracy": 0.9972537904977798, "mean_gen_accuracy": 0.8718870729207993, "mean_token_accuracy": 0.9022168964147568, "num_tokens": 2906009.0, "sample_num_tokens": 8468.25, "step": 8511, "total_num_tokens": 2939882.0, "z_loss": 0.0004517827765084803 }, { "copy_logits_max": -7.962619781494141, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.1875, "epoch": 1.738320142966556, "gen_logits_max": 3.455787420272827, "gen_logits_mean": -18.036678314208984, "gen_logits_min": -30.058626174926758, "gen_logits_std": 3.4154629707336426, "gen_loss": 0.265280157327652, "grad_norm": 0.3452442474381182, "learning_rate": 2.0177684210526317e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9979548156261444, "mean_gen_accuracy": 0.878798171877861, "mean_token_accuracy": 0.9118737280368805, "num_tokens": 3172902.0, "sample_num_tokens": 8251.0, "step": 8512, "total_num_tokens": 3205906.0, "z_loss": 0.0003971016267314553 }, { "copy_logits_max": -7.674164295196533, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.4375, "epoch": 1.738524380903753, "gen_logits_max": 3.707406520843506, "gen_logits_mean": -17.169445037841797, "gen_logits_min": -29.267578125, "gen_logits_std": 3.3875608444213867, "gen_loss": 0.2699676752090454, "grad_norm": 0.36794444582490504, "learning_rate": 2.017642105263158e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9974422007799149, "mean_gen_accuracy": 0.8774060904979706, "mean_token_accuracy": 0.9074308574199677, "num_tokens": 3452580.0, "sample_num_tokens": 8093.0, "step": 8513, "total_num_tokens": 3484952.0, "z_loss": 0.00042108912020921707 }, { "copy_logits_max": -3.203838586807251, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.625, "epoch": 1.7387286188409496, "gen_logits_max": 3.010451316833496, "gen_logits_mean": -16.76755142211914, "gen_logits_min": -29.970409393310547, "gen_logits_std": 3.387378692626953, "gen_loss": 0.2664983570575714, "grad_norm": 0.3841000716813305, "learning_rate": 2.0175157894736842e-05, "loss": 0.2686, "mean_copy_accuracy": 0.995974987745285, "mean_gen_accuracy": 0.8805834054946899, "mean_token_accuracy": 0.9089059382677078, "num_tokens": 3712848.0, "sample_num_tokens": 9240.5, "step": 8514, "total_num_tokens": 3749810.0, "z_loss": 0.0004186006844975054 }, { "copy_logits_max": -8.14001178741455, "copy_logits_min": -687500032.0, "copy_num_tokens": 526.625, "epoch": 1.7389328567781466, "gen_logits_max": 3.1273140907287598, "gen_logits_mean": -17.12759017944336, "gen_logits_min": -29.263355255126953, "gen_logits_std": 3.418367385864258, "gen_loss": 0.2431449592113495, "grad_norm": 0.33564928829434204, "learning_rate": 2.0173894736842106e-05, "loss": 0.2514, "mean_copy_accuracy": 0.9973707795143127, "mean_gen_accuracy": 0.8852763175964355, "mean_token_accuracy": 0.9145249128341675, "num_tokens": 3988115.0, "sample_num_tokens": 8360.75, "step": 8515, "total_num_tokens": 4021558.0, "z_loss": 0.000391014211345464 }, { "copy_logits_max": -8.507246017456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.125, "epoch": 1.7391370947153435, "gen_logits_max": 3.1825950145721436, "gen_logits_mean": -17.736614227294922, "gen_logits_min": -29.593318939208984, "gen_logits_std": 3.421844959259033, "gen_loss": 0.24804671108722687, "grad_norm": 0.34810557802362035, "learning_rate": 2.0172631578947367e-05, "loss": 0.2772, "mean_copy_accuracy": 0.996799886226654, "mean_gen_accuracy": 0.8788953572511673, "mean_token_accuracy": 0.9031132310628891, "num_tokens": 4245319.0, "sample_num_tokens": 7644.25, "step": 8516, "total_num_tokens": 4275896.0, "z_loss": 0.00037238324875943363 }, { "copy_logits_max": -8.85323715209961, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.4375, "epoch": 1.7393413326525402, "gen_logits_max": 4.027299880981445, "gen_logits_mean": -16.884737014770508, "gen_logits_min": -29.001522064208984, "gen_logits_std": 3.3776679039001465, "gen_loss": 0.28714320063591003, "grad_norm": 0.3784097409896855, "learning_rate": 2.017136842105263e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9972575306892395, "mean_gen_accuracy": 0.8765529990196228, "mean_token_accuracy": 0.9062756896018982, "num_tokens": 4509168.0, "sample_num_tokens": 8531.5, "step": 8517, "total_num_tokens": 4543294.0, "z_loss": 0.00043362152064219117 }, { "copy_logits_max": -5.680016040802002, "copy_logits_min": -687500032.0, "copy_num_tokens": 410.3125, "epoch": 1.739545570589737, "gen_logits_max": 2.621455669403076, "gen_logits_mean": -18.525609970092773, "gen_logits_min": -30.700546264648438, "gen_logits_std": 3.4435625076293945, "gen_loss": 0.2715386748313904, "grad_norm": 0.3492251378489904, "learning_rate": 2.0170105263157896e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9972277283668518, "mean_gen_accuracy": 0.874878779053688, "mean_token_accuracy": 0.9052830189466476, "num_tokens": 4787629.0, "sample_num_tokens": 7874.75, "step": 8518, "total_num_tokens": 4819128.0, "z_loss": 0.00046967066009528935 }, { "copy_logits_max": -7.832767486572266, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.0, "epoch": 1.7397498085269338, "gen_logits_max": 3.5681803226470947, "gen_logits_mean": -16.818761825561523, "gen_logits_min": -28.971101760864258, "gen_logits_std": 3.3735768795013428, "gen_loss": 0.2926512062549591, "grad_norm": 0.3925778968973474, "learning_rate": 2.016884210526316e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9962223917245865, "mean_gen_accuracy": 0.8714600801467896, "mean_token_accuracy": 0.9000017195940018, "num_tokens": 5040959.0, "sample_num_tokens": 8420.75, "step": 8519, "total_num_tokens": 5074642.0, "z_loss": 0.00046618827036581933 }, { "copy_logits_max": -6.577791213989258, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.875, "epoch": 1.7399540464641308, "gen_logits_max": 5.521553993225098, "gen_logits_mean": -14.867480278015137, "gen_logits_min": -27.461244583129883, "gen_logits_std": 3.296994209289551, "gen_loss": 0.32336360216140747, "grad_norm": 0.36283950118652347, "learning_rate": 2.016757894736842e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9957683831453323, "mean_gen_accuracy": 0.873679056763649, "mean_token_accuracy": 0.9046574085950851, "num_tokens": 5320787.0, "sample_num_tokens": 7532.25, "step": 8520, "total_num_tokens": 5350916.0, "z_loss": 0.00047884148079901934 }, { "copy_logits_max": -7.319021224975586, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.9375, "epoch": 1.7401582844013275, "gen_logits_max": 3.7669248580932617, "gen_logits_mean": -16.50149917602539, "gen_logits_min": -28.789628982543945, "gen_logits_std": 3.3856263160705566, "gen_loss": 0.2680978775024414, "grad_norm": 0.35919531807061966, "learning_rate": 2.0166315789473685e-05, "loss": 0.262, "mean_copy_accuracy": 0.9967507421970367, "mean_gen_accuracy": 0.8823212683200836, "mean_token_accuracy": 0.9115157723426819, "num_tokens": 5580098.0, "sample_num_tokens": 7555.0, "step": 8521, "total_num_tokens": 5610318.0, "z_loss": 0.0004484444798436016 }, { "copy_logits_max": -6.722012519836426, "copy_logits_min": -687500096.0, "copy_num_tokens": 355.0625, "epoch": 1.7403625223385244, "gen_logits_max": 4.498138427734375, "gen_logits_mean": -16.352087020874023, "gen_logits_min": -28.4776554107666, "gen_logits_std": 3.357997417449951, "gen_loss": 0.29512646794319153, "grad_norm": 0.36188167503499225, "learning_rate": 2.0165052631578946e-05, "loss": 0.2861, "mean_copy_accuracy": 0.9972608685493469, "mean_gen_accuracy": 0.8708631694316864, "mean_token_accuracy": 0.9018082916736603, "num_tokens": 5851342.0, "sample_num_tokens": 7861.5, "step": 8522, "total_num_tokens": 5882788.0, "z_loss": 0.0005416248459368944 }, { "copy_logits_max": -2.74027681350708, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.875, "epoch": 1.7405667602757213, "gen_logits_max": 3.6141245365142822, "gen_logits_mean": -15.93533706665039, "gen_logits_min": -28.42841148376465, "gen_logits_std": 3.3674261569976807, "gen_loss": 0.2842479348182678, "grad_norm": 0.32347056175415034, "learning_rate": 2.016378947368421e-05, "loss": 0.2502, "mean_copy_accuracy": 0.9980010837316513, "mean_gen_accuracy": 0.8779498189687729, "mean_token_accuracy": 0.9144850224256516, "num_tokens": 6137915.0, "sample_num_tokens": 9686.75, "step": 8523, "total_num_tokens": 6176662.0, "z_loss": 0.00048302265349775553 }, { "copy_logits_max": -2.9093990325927734, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.125, "epoch": 1.740770998212918, "gen_logits_max": 3.738579750061035, "gen_logits_mean": -16.992233276367188, "gen_logits_min": -28.99492835998535, "gen_logits_std": 3.3566510677337646, "gen_loss": 0.3199303150177002, "grad_norm": 0.3643621903238914, "learning_rate": 2.0162526315789475e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9964182376861572, "mean_gen_accuracy": 0.8744861781597137, "mean_token_accuracy": 0.9020875245332718, "num_tokens": 6413139.0, "sample_num_tokens": 7951.75, "step": 8524, "total_num_tokens": 6444946.0, "z_loss": 0.0005509977345354855 }, { "copy_logits_max": -0.8887803554534912, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.375, "epoch": 1.7409752361501147, "gen_logits_max": 3.971503734588623, "gen_logits_mean": -16.606876373291016, "gen_logits_min": -28.838436126708984, "gen_logits_std": 3.3788723945617676, "gen_loss": 0.29696959257125854, "grad_norm": 0.3437586004945625, "learning_rate": 2.0161263157894736e-05, "loss": 0.2588, "mean_copy_accuracy": 0.9972953796386719, "mean_gen_accuracy": 0.8818872421979904, "mean_token_accuracy": 0.9103827625513077, "num_tokens": 6670490.0, "sample_num_tokens": 7487.0, "step": 8525, "total_num_tokens": 6700438.0, "z_loss": 0.000479145091958344 }, { "copy_logits_max": -1.5053361654281616, "copy_logits_min": -750000000.0, "copy_num_tokens": 382.1875, "epoch": 1.7411794740873117, "gen_logits_max": 4.11768102645874, "gen_logits_mean": -16.674327850341797, "gen_logits_min": -28.706653594970703, "gen_logits_std": 3.361924648284912, "gen_loss": 0.26464179158210754, "grad_norm": 0.3625131092208755, "learning_rate": 2.016e-05, "loss": 0.273, "mean_copy_accuracy": 0.9967110306024551, "mean_gen_accuracy": 0.8783143013715744, "mean_token_accuracy": 0.9057158082723618, "num_tokens": 6916169.0, "sample_num_tokens": 7100.25, "step": 8526, "total_num_tokens": 6944570.0, "z_loss": 0.0004299379652366042 }, { "copy_logits_max": -4.612797737121582, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.0625, "epoch": 1.7413837120245086, "gen_logits_max": 3.5685620307922363, "gen_logits_mean": -16.97571563720703, "gen_logits_min": -28.99270248413086, "gen_logits_std": 3.392815113067627, "gen_loss": 0.27035725116729736, "grad_norm": 0.31993923519617345, "learning_rate": 2.0158736842105264e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9972863793373108, "mean_gen_accuracy": 0.8797369748353958, "mean_token_accuracy": 0.9105353206396103, "num_tokens": 7196613.0, "sample_num_tokens": 9720.25, "step": 8527, "total_num_tokens": 7235494.0, "z_loss": 0.00043591586290858686 }, { "copy_logits_max": -0.9142928123474121, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.6875, "epoch": 1.7415879499617053, "gen_logits_max": 3.9784722328186035, "gen_logits_mean": -16.84311294555664, "gen_logits_min": -29.108613967895508, "gen_logits_std": 3.3808844089508057, "gen_loss": 0.2719561457633972, "grad_norm": 0.35326400543891306, "learning_rate": 2.015747368421053e-05, "loss": 0.2681, "mean_copy_accuracy": 0.996998131275177, "mean_gen_accuracy": 0.8796838223934174, "mean_token_accuracy": 0.9095809459686279, "num_tokens": 7463293.0, "sample_num_tokens": 8833.75, "step": 8528, "total_num_tokens": 7498628.0, "z_loss": 0.00047422866919077933 }, { "copy_logits_max": -2.364138603210449, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.75, "epoch": 1.7417921878989022, "gen_logits_max": 5.37592077255249, "gen_logits_mean": -14.500082015991211, "gen_logits_min": -26.92444610595703, "gen_logits_std": 3.300703525543213, "gen_loss": 0.28855961561203003, "grad_norm": 0.3469725864552118, "learning_rate": 2.015621052631579e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9972075074911118, "mean_gen_accuracy": 0.8765375465154648, "mean_token_accuracy": 0.9076264053583145, "num_tokens": 7731922.0, "sample_num_tokens": 8342.5, "step": 8529, "total_num_tokens": 7765292.0, "z_loss": 0.0004894022713415325 }, { "copy_logits_max": -0.08508884906768799, "copy_logits_min": -750000000.0, "copy_num_tokens": 913.125, "epoch": 1.7419964258360991, "gen_logits_max": 2.993162155151367, "gen_logits_mean": -17.178192138671875, "gen_logits_min": -29.544775009155273, "gen_logits_std": 3.410379409790039, "gen_loss": 0.21813619136810303, "grad_norm": 0.36653999987619523, "learning_rate": 2.0154947368421054e-05, "loss": 0.2589, "mean_copy_accuracy": 0.9971686750650406, "mean_gen_accuracy": 0.8831125050783157, "mean_token_accuracy": 0.9121603071689606, "num_tokens": 8000428.0, "sample_num_tokens": 11831.5, "step": 8530, "total_num_tokens": 8047754.0, "z_loss": 0.00040421285666525364 }, { "copy_logits_max": -2.412276268005371, "copy_logits_min": -750000000.0, "copy_num_tokens": 730.4375, "epoch": 1.7422006637732959, "gen_logits_max": 0.8206533193588257, "gen_logits_mean": -20.3416805267334, "gen_logits_min": -32.54789733886719, "gen_logits_std": 3.5183286666870117, "gen_loss": 0.2637917995452881, "grad_norm": 0.32838742725059217, "learning_rate": 2.0153684210526315e-05, "loss": 0.2552, "mean_copy_accuracy": 0.9976327568292618, "mean_gen_accuracy": 0.8807482570409775, "mean_token_accuracy": 0.9133755713701248, "num_tokens": 8297272.0, "sample_num_tokens": 9941.5, "step": 8531, "total_num_tokens": 8337038.0, "z_loss": 0.0004730996733997017 }, { "copy_logits_max": -3.4378809928894043, "copy_logits_min": -750000064.0, "copy_num_tokens": 585.5625, "epoch": 1.7424049017104926, "gen_logits_max": 4.79673957824707, "gen_logits_mean": -15.061223983764648, "gen_logits_min": -27.21046257019043, "gen_logits_std": 3.2966392040252686, "gen_loss": 0.2666005790233612, "grad_norm": 0.37450282686076575, "learning_rate": 2.015242105263158e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9969727098941803, "mean_gen_accuracy": 0.876665785908699, "mean_token_accuracy": 0.9083470851182938, "num_tokens": 8584160.0, "sample_num_tokens": 10663.5, "step": 8532, "total_num_tokens": 8626814.0, "z_loss": 0.0004507105622906238 }, { "copy_logits_max": -1.3733224868774414, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.375, "epoch": 1.7426091396476897, "gen_logits_max": 4.2603230476379395, "gen_logits_mean": -16.465803146362305, "gen_logits_min": -28.529144287109375, "gen_logits_std": 3.3380653858184814, "gen_loss": 0.2974322438240051, "grad_norm": 0.36404571297371596, "learning_rate": 2.015115789473684e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9967056512832642, "mean_gen_accuracy": 0.8830104917287827, "mean_token_accuracy": 0.9098362624645233, "num_tokens": 8865607.0, "sample_num_tokens": 7271.75, "step": 8533, "total_num_tokens": 8894694.0, "z_loss": 0.0005078887334093451 }, { "copy_logits_max": -7.3471879959106445, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.4375, "epoch": 1.7428133775848864, "gen_logits_max": 3.8683037757873535, "gen_logits_mean": -16.806716918945312, "gen_logits_min": -28.74774169921875, "gen_logits_std": 3.3789072036743164, "gen_loss": 0.25215673446655273, "grad_norm": 0.3423767702990286, "learning_rate": 2.0149894736842104e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9968716204166412, "mean_gen_accuracy": 0.8794799000024796, "mean_token_accuracy": 0.9092027395963669, "num_tokens": 9134161.0, "sample_num_tokens": 7994.75, "step": 8534, "total_num_tokens": 9166140.0, "z_loss": 0.0004383863997645676 }, { "copy_logits_max": -8.566972732543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.9375, "epoch": 1.7430176155220831, "gen_logits_max": 3.855786085128784, "gen_logits_mean": -17.976974487304688, "gen_logits_min": -29.792972564697266, "gen_logits_std": 3.3971598148345947, "gen_loss": 0.3146671652793884, "grad_norm": 0.34957987270722296, "learning_rate": 2.014863157894737e-05, "loss": 0.3021, "mean_copy_accuracy": 0.9973300248384476, "mean_gen_accuracy": 0.8701514452695847, "mean_token_accuracy": 0.8957670629024506, "num_tokens": 9400291.0, "sample_num_tokens": 7542.25, "step": 8535, "total_num_tokens": 9430460.0, "z_loss": 0.000497428176458925 }, { "copy_logits_max": -8.026304244995117, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.4375, "epoch": 1.74322185345928, "gen_logits_max": 3.891120433807373, "gen_logits_mean": -15.755706787109375, "gen_logits_min": -27.77277374267578, "gen_logits_std": 3.313185930252075, "gen_loss": 0.26815545558929443, "grad_norm": 0.3641181074902207, "learning_rate": 2.0147368421052633e-05, "loss": 0.2859, "mean_copy_accuracy": 0.99763423204422, "mean_gen_accuracy": 0.8760959506034851, "mean_token_accuracy": 0.9035278409719467, "num_tokens": 9657214.0, "sample_num_tokens": 8222.0, "step": 8536, "total_num_tokens": 9690102.0, "z_loss": 0.00039197748992592096 }, { "copy_logits_max": -7.459018707275391, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.4375, "epoch": 1.743426091396477, "gen_logits_max": 3.3427374362945557, "gen_logits_mean": -17.242603302001953, "gen_logits_min": -29.35944366455078, "gen_logits_std": 3.3677802085876465, "gen_loss": 0.3204956650733948, "grad_norm": 0.3730125582239947, "learning_rate": 2.0146105263157897e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9964382946491241, "mean_gen_accuracy": 0.8734367042779922, "mean_token_accuracy": 0.9002250134944916, "num_tokens": 9924807.0, "sample_num_tokens": 7980.25, "step": 8537, "total_num_tokens": 9956728.0, "z_loss": 0.0004875599406659603 }, { "copy_logits_max": -5.610658645629883, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.625, "epoch": 1.7436303293336737, "gen_logits_max": 3.940927028656006, "gen_logits_mean": -15.446151733398438, "gen_logits_min": -27.58271598815918, "gen_logits_std": 3.330352783203125, "gen_loss": 0.23446118831634521, "grad_norm": 0.3887570757576507, "learning_rate": 2.0144842105263158e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9962888062000275, "mean_gen_accuracy": 0.8789117336273193, "mean_token_accuracy": 0.9050488620996475, "num_tokens": 10186610.0, "sample_num_tokens": 8402.5, "step": 8538, "total_num_tokens": 10220220.0, "z_loss": 0.0003270941087976098 }, { "copy_logits_max": -4.736770153045654, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.5, "epoch": 1.7438345672708706, "gen_logits_max": 3.1250767707824707, "gen_logits_mean": -16.267364501953125, "gen_logits_min": -28.746183395385742, "gen_logits_std": 3.3470458984375, "gen_loss": 0.2570621371269226, "grad_norm": 0.3756980777487554, "learning_rate": 2.0143578947368423e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9976852685213089, "mean_gen_accuracy": 0.8741513192653656, "mean_token_accuracy": 0.9086773544549942, "num_tokens": 10448875.0, "sample_num_tokens": 7659.25, "step": 8539, "total_num_tokens": 10479512.0, "z_loss": 0.0003746420843526721 }, { "copy_logits_max": -4.5208048820495605, "copy_logits_min": -750000000.0, "copy_num_tokens": 616.625, "epoch": 1.7440388052080675, "gen_logits_max": 3.4281325340270996, "gen_logits_mean": -15.25238037109375, "gen_logits_min": -28.171463012695312, "gen_logits_std": 3.3169429302215576, "gen_loss": 0.26413363218307495, "grad_norm": 0.3548329889180539, "learning_rate": 2.0142315789473684e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9977284669876099, "mean_gen_accuracy": 0.8726216405630112, "mean_token_accuracy": 0.9092966169118881, "num_tokens": 10740867.0, "sample_num_tokens": 8269.75, "step": 8540, "total_num_tokens": 10773946.0, "z_loss": 0.0004131228197365999 }, { "copy_logits_max": -7.3285298347473145, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.375, "epoch": 1.7442430431452642, "gen_logits_max": 4.685756683349609, "gen_logits_mean": -14.580385208129883, "gen_logits_min": -27.078662872314453, "gen_logits_std": 3.2564163208007812, "gen_loss": 0.24705073237419128, "grad_norm": 0.39343471625553045, "learning_rate": 2.0141052631578948e-05, "loss": 0.2797, "mean_copy_accuracy": 0.9969782829284668, "mean_gen_accuracy": 0.8781450986862183, "mean_token_accuracy": 0.9048190265893936, "num_tokens": 11010252.0, "sample_num_tokens": 8379.0, "step": 8541, "total_num_tokens": 11043768.0, "z_loss": 0.00036868435563519597 }, { "copy_logits_max": -6.810009002685547, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.5, "epoch": 1.744447281082461, "gen_logits_max": 3.986318588256836, "gen_logits_mean": -17.245098114013672, "gen_logits_min": -29.412071228027344, "gen_logits_std": 3.3435683250427246, "gen_loss": 0.2768722176551819, "grad_norm": 0.38803323524343863, "learning_rate": 2.013978947368421e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9953565299510956, "mean_gen_accuracy": 0.8816266506910324, "mean_token_accuracy": 0.9074377864599228, "num_tokens": 11268979.0, "sample_num_tokens": 7339.25, "step": 8542, "total_num_tokens": 11298336.0, "z_loss": 0.0004291537625249475 }, { "copy_logits_max": -6.656250953674316, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.9375, "epoch": 1.7446515190196579, "gen_logits_max": 3.215888023376465, "gen_logits_mean": -17.82998275756836, "gen_logits_min": -30.003829956054688, "gen_logits_std": 3.388218879699707, "gen_loss": 0.2654181122779846, "grad_norm": 0.3638705036354597, "learning_rate": 2.0138526315789476e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9962560385465622, "mean_gen_accuracy": 0.8786107897758484, "mean_token_accuracy": 0.9061926305294037, "num_tokens": 11538477.0, "sample_num_tokens": 8557.25, "step": 8543, "total_num_tokens": 11572706.0, "z_loss": 0.0003917141875717789 }, { "copy_logits_max": -6.136555194854736, "copy_logits_min": -687500032.0, "copy_num_tokens": 607.6875, "epoch": 1.7448557569568548, "gen_logits_max": 3.964296817779541, "gen_logits_mean": -16.423608779907227, "gen_logits_min": -28.494029998779297, "gen_logits_std": 3.3087587356567383, "gen_loss": 0.27193912863731384, "grad_norm": 0.35884821150913204, "learning_rate": 2.0137263157894737e-05, "loss": 0.2607, "mean_copy_accuracy": 0.9962978512048721, "mean_gen_accuracy": 0.8817514330148697, "mean_token_accuracy": 0.9101499170064926, "num_tokens": 11829867.0, "sample_num_tokens": 10162.75, "step": 8544, "total_num_tokens": 11870518.0, "z_loss": 0.00043050525709986687 }, { "copy_logits_max": -8.526022911071777, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.25, "epoch": 1.7450599948940515, "gen_logits_max": 3.527744770050049, "gen_logits_mean": -18.503957748413086, "gen_logits_min": -30.483501434326172, "gen_logits_std": 3.389324903488159, "gen_loss": 0.24376912415027618, "grad_norm": 0.36931906916471563, "learning_rate": 2.0136e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9967270493507385, "mean_gen_accuracy": 0.882757231593132, "mean_token_accuracy": 0.9053773283958435, "num_tokens": 12087173.0, "sample_num_tokens": 7651.75, "step": 8545, "total_num_tokens": 12117780.0, "z_loss": 0.0003675901098176837 }, { "copy_logits_max": -7.529492378234863, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.875, "epoch": 1.7452642328312484, "gen_logits_max": 3.9183053970336914, "gen_logits_mean": -15.997459411621094, "gen_logits_min": -28.50625991821289, "gen_logits_std": 3.306046485900879, "gen_loss": 0.30557721853256226, "grad_norm": 0.379397775428725, "learning_rate": 2.0134736842105263e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9969437122344971, "mean_gen_accuracy": 0.8747864365577698, "mean_token_accuracy": 0.909963458776474, "num_tokens": 12364838.0, "sample_num_tokens": 7640.0, "step": 8546, "total_num_tokens": 12395398.0, "z_loss": 0.0005240632453933358 }, { "copy_logits_max": -6.561875820159912, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.625, "epoch": 1.7454684707684454, "gen_logits_max": 3.5231261253356934, "gen_logits_mean": -16.397491455078125, "gen_logits_min": -28.74083137512207, "gen_logits_std": 3.3271138668060303, "gen_loss": 0.24759769439697266, "grad_norm": 0.32886114733401645, "learning_rate": 2.0133473684210527e-05, "loss": 0.2505, "mean_copy_accuracy": 0.997031956911087, "mean_gen_accuracy": 0.8791761249303818, "mean_token_accuracy": 0.9135865718126297, "num_tokens": 12648360.0, "sample_num_tokens": 9092.5, "step": 8547, "total_num_tokens": 12684730.0, "z_loss": 0.00037227850407361984 }, { "copy_logits_max": -5.984211444854736, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.3125, "epoch": 1.745672708705642, "gen_logits_max": 4.238215446472168, "gen_logits_mean": -15.86910629272461, "gen_logits_min": -28.47661590576172, "gen_logits_std": 3.3231658935546875, "gen_loss": 0.28374022245407104, "grad_norm": 0.36152169918034543, "learning_rate": 2.0132210526315788e-05, "loss": 0.297, "mean_copy_accuracy": 0.9976553916931152, "mean_gen_accuracy": 0.8682599663734436, "mean_token_accuracy": 0.9002943187952042, "num_tokens": 12916566.0, "sample_num_tokens": 9000.5, "step": 8548, "total_num_tokens": 12952568.0, "z_loss": 0.00044347887160256505 }, { "copy_logits_max": -5.201632022857666, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.5, "epoch": 1.7458769466428388, "gen_logits_max": 2.385507583618164, "gen_logits_mean": -18.229036331176758, "gen_logits_min": -30.36829948425293, "gen_logits_std": 3.407029628753662, "gen_loss": 0.26433509588241577, "grad_norm": 0.33827690902605273, "learning_rate": 2.0130947368421052e-05, "loss": 0.2534, "mean_copy_accuracy": 0.9977850317955017, "mean_gen_accuracy": 0.8832971900701523, "mean_token_accuracy": 0.9129379242658615, "num_tokens": 13172093.0, "sample_num_tokens": 8994.25, "step": 8549, "total_num_tokens": 13208070.0, "z_loss": 0.00047645054291933775 }, { "copy_logits_max": -7.7902703285217285, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.5, "epoch": 1.7460811845800357, "gen_logits_max": 3.570721387863159, "gen_logits_mean": -17.699722290039062, "gen_logits_min": -29.66518211364746, "gen_logits_std": 3.3528738021850586, "gen_loss": 0.3051337003707886, "grad_norm": 0.3540936567891082, "learning_rate": 2.0129684210526316e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9976402670145035, "mean_gen_accuracy": 0.8749977946281433, "mean_token_accuracy": 0.9033064544200897, "num_tokens": 13472454.0, "sample_num_tokens": 9424.0, "step": 8550, "total_num_tokens": 13510150.0, "z_loss": 0.00045931609929539263 }, { "copy_logits_max": -5.986483573913574, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.875, "epoch": 1.7462854225172326, "gen_logits_max": 4.69989013671875, "gen_logits_mean": -15.119161605834961, "gen_logits_min": -26.75703239440918, "gen_logits_std": 3.2416093349456787, "gen_loss": 0.2842254638671875, "grad_norm": 0.36056388013236035, "learning_rate": 2.012842105263158e-05, "loss": 0.2874, "mean_copy_accuracy": 0.997175008058548, "mean_gen_accuracy": 0.8798379600048065, "mean_token_accuracy": 0.9032596498727798, "num_tokens": 13742998.0, "sample_num_tokens": 7759.5, "step": 8551, "total_num_tokens": 13774036.0, "z_loss": 0.00042054790537804365 }, { "copy_logits_max": -7.298245429992676, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.8125, "epoch": 1.7464896604544293, "gen_logits_max": 2.4207382202148438, "gen_logits_mean": -18.105640411376953, "gen_logits_min": -29.750688552856445, "gen_logits_std": 3.349148750305176, "gen_loss": 0.2723756432533264, "grad_norm": 0.3531681588449802, "learning_rate": 2.0127157894736845e-05, "loss": 0.266, "mean_copy_accuracy": 0.9950184226036072, "mean_gen_accuracy": 0.8820512890815735, "mean_token_accuracy": 0.9100025594234467, "num_tokens": 13993763.0, "sample_num_tokens": 8275.75, "step": 8552, "total_num_tokens": 14026866.0, "z_loss": 0.0004081752267666161 }, { "copy_logits_max": -7.17994499206543, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.6875, "epoch": 1.7466938983916263, "gen_logits_max": 3.8508431911468506, "gen_logits_mean": -17.123937606811523, "gen_logits_min": -29.048229217529297, "gen_logits_std": 3.365968704223633, "gen_loss": 0.286385178565979, "grad_norm": 0.34965320724793864, "learning_rate": 2.0125894736842106e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9974070340394974, "mean_gen_accuracy": 0.8785482048988342, "mean_token_accuracy": 0.9047495275735855, "num_tokens": 14270037.0, "sample_num_tokens": 8324.75, "step": 8553, "total_num_tokens": 14303336.0, "z_loss": 0.0004301797307562083 }, { "copy_logits_max": -6.293277740478516, "copy_logits_min": -750000064.0, "copy_num_tokens": 510.875, "epoch": 1.7468981363288232, "gen_logits_max": 4.507491111755371, "gen_logits_mean": -15.20516300201416, "gen_logits_min": -27.539783477783203, "gen_logits_std": 3.274195671081543, "gen_loss": 0.25851985812187195, "grad_norm": 0.3740656343468477, "learning_rate": 2.012463157894737e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9976092576980591, "mean_gen_accuracy": 0.8740521818399429, "mean_token_accuracy": 0.9033852517604828, "num_tokens": 14516775.0, "sample_num_tokens": 8877.75, "step": 8554, "total_num_tokens": 14552286.0, "z_loss": 0.0004077478952240199 }, { "copy_logits_max": -8.243077278137207, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.6875, "epoch": 1.7471023742660199, "gen_logits_max": 4.3701581954956055, "gen_logits_mean": -16.77810287475586, "gen_logits_min": -28.387359619140625, "gen_logits_std": 3.3177177906036377, "gen_loss": 0.28360170125961304, "grad_norm": 0.35477988372704605, "learning_rate": 2.012336842105263e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9974042475223541, "mean_gen_accuracy": 0.8753921091556549, "mean_token_accuracy": 0.9015991240739822, "num_tokens": 14767800.0, "sample_num_tokens": 7397.5, "step": 8555, "total_num_tokens": 14797390.0, "z_loss": 0.0004366817884147167 }, { "copy_logits_max": -5.960762023925781, "copy_logits_min": -687500096.0, "copy_num_tokens": 324.875, "epoch": 1.7473066122032166, "gen_logits_max": 4.00418758392334, "gen_logits_mean": -17.08978843688965, "gen_logits_min": -29.299510955810547, "gen_logits_std": 3.372763156890869, "gen_loss": 0.33012592792510986, "grad_norm": 0.3832964057305922, "learning_rate": 2.0122105263157896e-05, "loss": 0.2868, "mean_copy_accuracy": 0.997119128704071, "mean_gen_accuracy": 0.8707302808761597, "mean_token_accuracy": 0.9030096679925919, "num_tokens": 15031642.0, "sample_num_tokens": 6831.0, "step": 8556, "total_num_tokens": 15058966.0, "z_loss": 0.0005374607862904668 }, { "copy_logits_max": -7.515463829040527, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.9375, "epoch": 1.7475108501404137, "gen_logits_max": 5.048995494842529, "gen_logits_mean": -15.760395050048828, "gen_logits_min": -27.65895652770996, "gen_logits_std": 3.2880780696868896, "gen_loss": 0.3017250895500183, "grad_norm": 0.35682430966645307, "learning_rate": 2.0120842105263156e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9964103698730469, "mean_gen_accuracy": 0.8818766623735428, "mean_token_accuracy": 0.9072118699550629, "num_tokens": 15288492.0, "sample_num_tokens": 7593.0, "step": 8557, "total_num_tokens": 15318864.0, "z_loss": 0.00048823095858097076 }, { "copy_logits_max": -9.360613822937012, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.3125, "epoch": 1.7477150880776104, "gen_logits_max": 3.8010506629943848, "gen_logits_mean": -17.427223205566406, "gen_logits_min": -29.529560089111328, "gen_logits_std": 3.3872437477111816, "gen_loss": 0.2860778570175171, "grad_norm": 0.36339515117022914, "learning_rate": 2.011957894736842e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9965570569038391, "mean_gen_accuracy": 0.8697319030761719, "mean_token_accuracy": 0.9006785452365875, "num_tokens": 15557719.0, "sample_num_tokens": 9348.75, "step": 8558, "total_num_tokens": 15595114.0, "z_loss": 0.0004175362118985504 }, { "copy_logits_max": -7.566244602203369, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.6875, "epoch": 1.7479193260148072, "gen_logits_max": 4.134076118469238, "gen_logits_mean": -16.607547760009766, "gen_logits_min": -28.528032302856445, "gen_logits_std": 3.3263440132141113, "gen_loss": 0.3162214457988739, "grad_norm": 0.38608984930332657, "learning_rate": 2.0118315789473685e-05, "loss": 0.3036, "mean_copy_accuracy": 0.9963034391403198, "mean_gen_accuracy": 0.8711429089307785, "mean_token_accuracy": 0.8962232917547226, "num_tokens": 15806434.0, "sample_num_tokens": 6822.0, "step": 8559, "total_num_tokens": 15833722.0, "z_loss": 0.000505869509652257 }, { "copy_logits_max": -5.620003700256348, "copy_logits_min": -687500032.0, "copy_num_tokens": 540.9375, "epoch": 1.748123563952004, "gen_logits_max": 3.592296600341797, "gen_logits_mean": -16.524188995361328, "gen_logits_min": -29.01117515563965, "gen_logits_std": 3.3597521781921387, "gen_loss": 0.28881534934043884, "grad_norm": 0.35921598665751764, "learning_rate": 2.011705263157895e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9967532157897949, "mean_gen_accuracy": 0.8757240325212479, "mean_token_accuracy": 0.9024530798196793, "num_tokens": 16054531.0, "sample_num_tokens": 9336.25, "step": 8560, "total_num_tokens": 16091876.0, "z_loss": 0.0005002087564207613 }, { "copy_logits_max": -4.511594772338867, "copy_logits_min": -750000064.0, "copy_num_tokens": 587.125, "epoch": 1.748327801889201, "gen_logits_max": 4.521860122680664, "gen_logits_mean": -15.462919235229492, "gen_logits_min": -27.79861831665039, "gen_logits_std": 3.349929094314575, "gen_loss": 0.2370935082435608, "grad_norm": 0.3770578199290717, "learning_rate": 2.011578947368421e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9974732846021652, "mean_gen_accuracy": 0.8787007927894592, "mean_token_accuracy": 0.9085199981927872, "num_tokens": 16322713.0, "sample_num_tokens": 8931.75, "step": 8561, "total_num_tokens": 16358440.0, "z_loss": 0.00039290799759328365 }, { "copy_logits_max": -5.85205078125, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.5625, "epoch": 1.7485320398263977, "gen_logits_max": 3.264065742492676, "gen_logits_mean": -17.981611251831055, "gen_logits_min": -30.165203094482422, "gen_logits_std": 3.4262776374816895, "gen_loss": 0.25092509388923645, "grad_norm": 0.3594717286203965, "learning_rate": 2.0114526315789475e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9971461296081543, "mean_gen_accuracy": 0.8805361390113831, "mean_token_accuracy": 0.9105758517980576, "num_tokens": 16580451.0, "sample_num_tokens": 7864.25, "step": 8562, "total_num_tokens": 16611908.0, "z_loss": 0.0004192751075606793 }, { "copy_logits_max": -5.904405117034912, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.0, "epoch": 1.7487362777635946, "gen_logits_max": 4.96005392074585, "gen_logits_mean": -14.89506721496582, "gen_logits_min": -27.397232055664062, "gen_logits_std": 3.2812256813049316, "gen_loss": 0.2922576367855072, "grad_norm": 0.38200318850130316, "learning_rate": 2.0113263157894736e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9968139380216599, "mean_gen_accuracy": 0.8772216886281967, "mean_token_accuracy": 0.9049701243638992, "num_tokens": 16841110.0, "sample_num_tokens": 8866.0, "step": 8563, "total_num_tokens": 16876574.0, "z_loss": 0.00048493180656805634 }, { "copy_logits_max": -7.00606107711792, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.8125, "epoch": 1.7489405157007916, "gen_logits_max": 4.240828037261963, "gen_logits_mean": -16.412736892700195, "gen_logits_min": -28.338882446289062, "gen_logits_std": 3.3425416946411133, "gen_loss": 0.26902151107788086, "grad_norm": 0.34982664870334507, "learning_rate": 2.0112e-05, "loss": 0.2598, "mean_copy_accuracy": 0.99714694917202, "mean_gen_accuracy": 0.8890836834907532, "mean_token_accuracy": 0.9118029028177261, "num_tokens": 17123482.0, "sample_num_tokens": 8703.0, "step": 8564, "total_num_tokens": 17158294.0, "z_loss": 0.0004756782727781683 }, { "copy_logits_max": -6.1947150230407715, "copy_logits_min": -750000000.0, "copy_num_tokens": 496.25, "epoch": 1.7491447536379883, "gen_logits_max": 4.507493019104004, "gen_logits_mean": -15.450075149536133, "gen_logits_min": -27.217403411865234, "gen_logits_std": 3.2960309982299805, "gen_loss": 0.27429091930389404, "grad_norm": 0.36130194713791053, "learning_rate": 2.0110736842105264e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9972173869609833, "mean_gen_accuracy": 0.8723528385162354, "mean_token_accuracy": 0.9048486202955246, "num_tokens": 17386226.0, "sample_num_tokens": 7901.0, "step": 8565, "total_num_tokens": 17417830.0, "z_loss": 0.00047227751929312944 }, { "copy_logits_max": -7.25889253616333, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.3125, "epoch": 1.749348991575185, "gen_logits_max": 2.924788475036621, "gen_logits_mean": -17.57155990600586, "gen_logits_min": -29.569713592529297, "gen_logits_std": 3.3726587295532227, "gen_loss": 0.26421689987182617, "grad_norm": 0.3541674593366789, "learning_rate": 2.0109473684210525e-05, "loss": 0.2829, "mean_copy_accuracy": 0.9974462240934372, "mean_gen_accuracy": 0.8711395412683487, "mean_token_accuracy": 0.9038159549236298, "num_tokens": 17652882.0, "sample_num_tokens": 8309.5, "step": 8566, "total_num_tokens": 17686120.0, "z_loss": 0.00046437059063464403 }, { "copy_logits_max": -6.5304059982299805, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.4375, "epoch": 1.749553229512382, "gen_logits_max": 4.710611343383789, "gen_logits_mean": -15.886460304260254, "gen_logits_min": -28.246658325195312, "gen_logits_std": 3.307389736175537, "gen_loss": 0.28529295325279236, "grad_norm": 0.35958754814892824, "learning_rate": 2.0108210526315793e-05, "loss": 0.2824, "mean_copy_accuracy": 0.996249258518219, "mean_gen_accuracy": 0.8787018358707428, "mean_token_accuracy": 0.9035785645246506, "num_tokens": 17916185.0, "sample_num_tokens": 7975.75, "step": 8567, "total_num_tokens": 17948088.0, "z_loss": 0.0004956074990332127 }, { "copy_logits_max": -7.560745716094971, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.4375, "epoch": 1.7497574674495788, "gen_logits_max": 3.504794120788574, "gen_logits_mean": -17.58700942993164, "gen_logits_min": -29.70877456665039, "gen_logits_std": 3.389265537261963, "gen_loss": 0.27195972204208374, "grad_norm": 0.34195374650929455, "learning_rate": 2.0106947368421054e-05, "loss": 0.2565, "mean_copy_accuracy": 0.9968418627977371, "mean_gen_accuracy": 0.8840387761592865, "mean_token_accuracy": 0.9109282344579697, "num_tokens": 18192281.0, "sample_num_tokens": 9063.75, "step": 8568, "total_num_tokens": 18228536.0, "z_loss": 0.000436863221693784 }, { "copy_logits_max": -5.434927940368652, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.5, "epoch": 1.7499617053867755, "gen_logits_max": 3.54068922996521, "gen_logits_mean": -16.301219940185547, "gen_logits_min": -28.55565071105957, "gen_logits_std": 3.3089663982391357, "gen_loss": 0.2699053883552551, "grad_norm": 0.3483116613695698, "learning_rate": 2.0105684210526318e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9970178753137589, "mean_gen_accuracy": 0.8720494508743286, "mean_token_accuracy": 0.9040992259979248, "num_tokens": 18477047.0, "sample_num_tokens": 8004.25, "step": 8569, "total_num_tokens": 18509064.0, "z_loss": 0.00044559879461303353 }, { "copy_logits_max": -6.060662746429443, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.5625, "epoch": 1.7501659433239725, "gen_logits_max": 4.175640106201172, "gen_logits_mean": -14.956182479858398, "gen_logits_min": -26.965421676635742, "gen_logits_std": 3.3013253211975098, "gen_loss": 0.25983723998069763, "grad_norm": 0.37620791816736066, "learning_rate": 2.010442105263158e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9971697777509689, "mean_gen_accuracy": 0.8771352469921112, "mean_token_accuracy": 0.907803475856781, "num_tokens": 18751507.0, "sample_num_tokens": 8332.75, "step": 8570, "total_num_tokens": 18784838.0, "z_loss": 0.00042506365571171045 }, { "copy_logits_max": -6.118927478790283, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.6875, "epoch": 1.7503701812611694, "gen_logits_max": 4.915051460266113, "gen_logits_mean": -15.523664474487305, "gen_logits_min": -27.210968017578125, "gen_logits_std": 3.3104774951934814, "gen_loss": 0.28655368089675903, "grad_norm": 0.34918475506400803, "learning_rate": 2.0103157894736843e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9977562427520752, "mean_gen_accuracy": 0.8792489618062973, "mean_token_accuracy": 0.9068493694067001, "num_tokens": 19032134.0, "sample_num_tokens": 9643.5, "step": 8571, "total_num_tokens": 19070708.0, "z_loss": 0.0004846103547606617 }, { "copy_logits_max": -5.149484634399414, "copy_logits_min": -750000000.0, "copy_num_tokens": 292.6875, "epoch": 1.750574419198366, "gen_logits_max": 4.646385192871094, "gen_logits_mean": -17.268558502197266, "gen_logits_min": -29.374662399291992, "gen_logits_std": 3.3773891925811768, "gen_loss": 0.3156934678554535, "grad_norm": 0.34620402804819034, "learning_rate": 2.0101894736842104e-05, "loss": 0.292, "mean_copy_accuracy": 0.9967198669910431, "mean_gen_accuracy": 0.8735033869743347, "mean_token_accuracy": 0.9017342031002045, "num_tokens": 19309149.0, "sample_num_tokens": 7292.75, "step": 8572, "total_num_tokens": 19338320.0, "z_loss": 0.0005095704109407961 }, { "copy_logits_max": -5.363367557525635, "copy_logits_min": -750000000.0, "copy_num_tokens": 427.4375, "epoch": 1.7507786571355628, "gen_logits_max": 4.324019432067871, "gen_logits_mean": -17.067995071411133, "gen_logits_min": -28.86138153076172, "gen_logits_std": 3.381697654724121, "gen_loss": 0.2732003927230835, "grad_norm": 0.3458035233997146, "learning_rate": 2.010063157894737e-05, "loss": 0.2743, "mean_copy_accuracy": 0.997173398733139, "mean_gen_accuracy": 0.8766350299119949, "mean_token_accuracy": 0.9064996838569641, "num_tokens": 19572980.0, "sample_num_tokens": 9217.5, "step": 8573, "total_num_tokens": 19609850.0, "z_loss": 0.0004342221945989877 }, { "copy_logits_max": -5.785280227661133, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.875, "epoch": 1.7509828950727597, "gen_logits_max": 4.872469425201416, "gen_logits_mean": -14.92951774597168, "gen_logits_min": -26.989723205566406, "gen_logits_std": 3.316406726837158, "gen_loss": 0.25529009103775024, "grad_norm": 0.35782669109779114, "learning_rate": 2.009936842105263e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9973105639219284, "mean_gen_accuracy": 0.8796404302120209, "mean_token_accuracy": 0.906132847070694, "num_tokens": 19837526.0, "sample_num_tokens": 7911.5, "step": 8574, "total_num_tokens": 19869172.0, "z_loss": 0.0004021544591523707 }, { "copy_logits_max": -6.840347766876221, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.0, "epoch": 1.7511871330099567, "gen_logits_max": 4.08519172668457, "gen_logits_mean": -16.547714233398438, "gen_logits_min": -28.813953399658203, "gen_logits_std": 3.3846397399902344, "gen_loss": 0.2811424434185028, "grad_norm": 0.36328716712668496, "learning_rate": 2.0098105263157894e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9969944506883621, "mean_gen_accuracy": 0.8733005821704865, "mean_token_accuracy": 0.9016413241624832, "num_tokens": 20093574.0, "sample_num_tokens": 8888.5, "step": 8575, "total_num_tokens": 20129128.0, "z_loss": 0.0004297965788282454 }, { "copy_logits_max": -5.430029392242432, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.125, "epoch": 1.7513913709471534, "gen_logits_max": 4.070662975311279, "gen_logits_mean": -15.611955642700195, "gen_logits_min": -27.5130615234375, "gen_logits_std": 3.317516803741455, "gen_loss": 0.2542840242385864, "grad_norm": 0.31734746114491813, "learning_rate": 2.0096842105263158e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9975616186857224, "mean_gen_accuracy": 0.8775817900896072, "mean_token_accuracy": 0.9093440175056458, "num_tokens": 20393419.0, "sample_num_tokens": 8793.75, "step": 8576, "total_num_tokens": 20428594.0, "z_loss": 0.00043755825026892126 }, { "copy_logits_max": -6.040493965148926, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.4375, "epoch": 1.7515956088843503, "gen_logits_max": 4.834869384765625, "gen_logits_mean": -15.271217346191406, "gen_logits_min": -26.679927825927734, "gen_logits_std": 3.228104591369629, "gen_loss": 0.30400893092155457, "grad_norm": 0.3869113520745174, "learning_rate": 2.0095578947368422e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9969809502363205, "mean_gen_accuracy": 0.8842364549636841, "mean_token_accuracy": 0.9092934280633926, "num_tokens": 20647163.0, "sample_num_tokens": 7059.75, "step": 8577, "total_num_tokens": 20675402.0, "z_loss": 0.00047776862629689276 }, { "copy_logits_max": -7.392289161682129, "copy_logits_min": -750000064.0, "copy_num_tokens": 464.8125, "epoch": 1.7517998468215472, "gen_logits_max": 4.122056007385254, "gen_logits_mean": -16.4453182220459, "gen_logits_min": -28.406265258789062, "gen_logits_std": 3.327840805053711, "gen_loss": 0.25504758954048157, "grad_norm": 0.36449393558898624, "learning_rate": 2.0094315789473687e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9970051795244217, "mean_gen_accuracy": 0.8740872740745544, "mean_token_accuracy": 0.9079993814229965, "num_tokens": 20920032.0, "sample_num_tokens": 8791.0, "step": 8578, "total_num_tokens": 20955196.0, "z_loss": 0.0004078072961419821 }, { "copy_logits_max": -6.829211711883545, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.125, "epoch": 1.752004084758744, "gen_logits_max": 3.5536656379699707, "gen_logits_mean": -16.75020408630371, "gen_logits_min": -28.814910888671875, "gen_logits_std": 3.378756046295166, "gen_loss": 0.2569192051887512, "grad_norm": 0.34604344292311023, "learning_rate": 2.0093052631578948e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9967280328273773, "mean_gen_accuracy": 0.8776363134384155, "mean_token_accuracy": 0.908992126584053, "num_tokens": 21191442.0, "sample_num_tokens": 8817.0, "step": 8579, "total_num_tokens": 21226710.0, "z_loss": 0.000391970737837255 }, { "copy_logits_max": -7.814203262329102, "copy_logits_min": -750000064.0, "copy_num_tokens": 331.1875, "epoch": 1.7522083226959406, "gen_logits_max": 3.6721034049987793, "gen_logits_mean": -17.884578704833984, "gen_logits_min": -29.890151977539062, "gen_logits_std": 3.424072742462158, "gen_loss": 0.2794322073459625, "grad_norm": 0.3566899769516399, "learning_rate": 2.0091789473684212e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9973111152648926, "mean_gen_accuracy": 0.8806857019662857, "mean_token_accuracy": 0.908510684967041, "num_tokens": 21470054.0, "sample_num_tokens": 7513.5, "step": 8580, "total_num_tokens": 21500108.0, "z_loss": 0.0004309098585508764 }, { "copy_logits_max": -5.9223504066467285, "copy_logits_min": -625000064.0, "copy_num_tokens": 531.3125, "epoch": 1.7524125606331376, "gen_logits_max": 3.2567715644836426, "gen_logits_mean": -17.803443908691406, "gen_logits_min": -29.961904525756836, "gen_logits_std": 3.432495355606079, "gen_loss": 0.24675656855106354, "grad_norm": 0.36529189273492846, "learning_rate": 2.0090526315789473e-05, "loss": 0.2693, "mean_copy_accuracy": 0.997146874666214, "mean_gen_accuracy": 0.8800513744354248, "mean_token_accuracy": 0.9085206538438797, "num_tokens": 21739914.0, "sample_num_tokens": 10085.5, "step": 8581, "total_num_tokens": 21780256.0, "z_loss": 0.0004022358334623277 }, { "copy_logits_max": -2.459425687789917, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.3125, "epoch": 1.7526167985703345, "gen_logits_max": 4.11277961730957, "gen_logits_mean": -15.62182903289795, "gen_logits_min": -27.910118103027344, "gen_logits_std": 3.3877079486846924, "gen_loss": 0.24439270794391632, "grad_norm": 0.34751182443332784, "learning_rate": 2.0089263157894737e-05, "loss": 0.2637, "mean_copy_accuracy": 0.997160017490387, "mean_gen_accuracy": 0.8771394938230515, "mean_token_accuracy": 0.9090896546840668, "num_tokens": 22019853.0, "sample_num_tokens": 8694.75, "step": 8582, "total_num_tokens": 22054632.0, "z_loss": 0.000364931533113122 }, { "copy_logits_max": -5.174744129180908, "copy_logits_min": -750000000.0, "copy_num_tokens": 422.6875, "epoch": 1.7528210365075312, "gen_logits_max": 4.11919641494751, "gen_logits_mean": -16.890668869018555, "gen_logits_min": -28.54738998413086, "gen_logits_std": 3.364288091659546, "gen_loss": 0.2867307960987091, "grad_norm": 0.36208720157421137, "learning_rate": 2.0087999999999998e-05, "loss": 0.2954, "mean_copy_accuracy": 0.9967954158782959, "mean_gen_accuracy": 0.8722700625658035, "mean_token_accuracy": 0.8998006731271744, "num_tokens": 22283345.0, "sample_num_tokens": 8744.75, "step": 8583, "total_num_tokens": 22318324.0, "z_loss": 0.00041527830762788653 }, { "copy_logits_max": -2.5855555534362793, "copy_logits_min": -625000064.0, "copy_num_tokens": 869.0625, "epoch": 1.753025274444728, "gen_logits_max": 3.680788278579712, "gen_logits_mean": -15.853626251220703, "gen_logits_min": -28.36855697631836, "gen_logits_std": 3.3854238986968994, "gen_loss": 0.2393653690814972, "grad_norm": 0.3414381616360993, "learning_rate": 2.0086736842105266e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9974541515111923, "mean_gen_accuracy": 0.8756292462348938, "mean_token_accuracy": 0.910310834646225, "num_tokens": 22542281.0, "sample_num_tokens": 10120.25, "step": 8584, "total_num_tokens": 22582762.0, "z_loss": 0.00035900622606277466 }, { "copy_logits_max": -3.301210880279541, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.125, "epoch": 1.753229512381925, "gen_logits_max": 5.48720645904541, "gen_logits_mean": -14.865873336791992, "gen_logits_min": -27.237550735473633, "gen_logits_std": 3.296025037765503, "gen_loss": 0.29895853996276855, "grad_norm": 0.3774803727519499, "learning_rate": 2.0085473684210527e-05, "loss": 0.2971, "mean_copy_accuracy": 0.9965390413999557, "mean_gen_accuracy": 0.8710021376609802, "mean_token_accuracy": 0.8986921161413193, "num_tokens": 22807257.0, "sample_num_tokens": 7648.25, "step": 8585, "total_num_tokens": 22837850.0, "z_loss": 0.0004309284267947078 }, { "copy_logits_max": -2.682401180267334, "copy_logits_min": -625000064.0, "copy_num_tokens": 601.5, "epoch": 1.7534337503191217, "gen_logits_max": 4.883736610412598, "gen_logits_mean": -13.882068634033203, "gen_logits_min": -26.239521026611328, "gen_logits_std": 3.281263589859009, "gen_loss": 0.2807905077934265, "grad_norm": 0.3551782206100305, "learning_rate": 2.008421052631579e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9971743673086166, "mean_gen_accuracy": 0.8746343702077866, "mean_token_accuracy": 0.9087195694446564, "num_tokens": 23075891.0, "sample_num_tokens": 9156.25, "step": 8586, "total_num_tokens": 23112516.0, "z_loss": 0.0005127268377691507 }, { "copy_logits_max": -3.054934501647949, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.25, "epoch": 1.7536379882563184, "gen_logits_max": 4.6137518882751465, "gen_logits_mean": -15.421056747436523, "gen_logits_min": -27.2636661529541, "gen_logits_std": 3.3041129112243652, "gen_loss": 0.26213955879211426, "grad_norm": 0.3358947521531659, "learning_rate": 2.0082947368421052e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9973048716783524, "mean_gen_accuracy": 0.8843024522066116, "mean_token_accuracy": 0.913183256983757, "num_tokens": 23376558.0, "sample_num_tokens": 8445.5, "step": 8587, "total_num_tokens": 23410340.0, "z_loss": 0.00043034832924604416 }, { "copy_logits_max": -1.3763371706008911, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.5625, "epoch": 1.7538422261935156, "gen_logits_max": 5.467194557189941, "gen_logits_mean": -14.521206855773926, "gen_logits_min": -26.546119689941406, "gen_logits_std": 3.2896664142608643, "gen_loss": 0.31010890007019043, "grad_norm": 0.3641639831704204, "learning_rate": 2.0081684210526316e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9974893182516098, "mean_gen_accuracy": 0.8752490878105164, "mean_token_accuracy": 0.9063587784767151, "num_tokens": 23630776.0, "sample_num_tokens": 6821.5, "step": 8588, "total_num_tokens": 23658062.0, "z_loss": 0.0005059503018856049 }, { "copy_logits_max": -4.998117446899414, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.75, "epoch": 1.7540464641307123, "gen_logits_max": 4.384798049926758, "gen_logits_mean": -16.472164154052734, "gen_logits_min": -28.44738006591797, "gen_logits_std": 3.375854015350342, "gen_loss": 0.290107786655426, "grad_norm": 0.3687056240339639, "learning_rate": 2.0080421052631577e-05, "loss": 0.2546, "mean_copy_accuracy": 0.9973608404397964, "mean_gen_accuracy": 0.8809258937835693, "mean_token_accuracy": 0.9116745442152023, "num_tokens": 23882191.0, "sample_num_tokens": 8284.75, "step": 8589, "total_num_tokens": 23915330.0, "z_loss": 0.00043245928827673197 }, { "copy_logits_max": -5.275462627410889, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.3125, "epoch": 1.754250702067909, "gen_logits_max": 4.993297100067139, "gen_logits_mean": -16.22739028930664, "gen_logits_min": -28.27587127685547, "gen_logits_std": 3.3764050006866455, "gen_loss": 0.2729977071285248, "grad_norm": 0.33835237896156406, "learning_rate": 2.007915789473684e-05, "loss": 0.247, "mean_copy_accuracy": 0.9974503070116043, "mean_gen_accuracy": 0.8893831521272659, "mean_token_accuracy": 0.9161648750305176, "num_tokens": 24154603.0, "sample_num_tokens": 7202.25, "step": 8590, "total_num_tokens": 24183412.0, "z_loss": 0.00039256009040400386 }, { "copy_logits_max": -3.3188302516937256, "copy_logits_min": -750000000.0, "copy_num_tokens": 596.9375, "epoch": 1.754454940005106, "gen_logits_max": 3.883044719696045, "gen_logits_mean": -16.275074005126953, "gen_logits_min": -28.52318572998047, "gen_logits_std": 3.3893845081329346, "gen_loss": 0.26128071546554565, "grad_norm": 0.3218172337859819, "learning_rate": 2.0077894736842106e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9974746704101562, "mean_gen_accuracy": 0.8745890855789185, "mean_token_accuracy": 0.9086562693119049, "num_tokens": 24444307.0, "sample_num_tokens": 8779.75, "step": 8591, "total_num_tokens": 24479426.0, "z_loss": 0.0004558498039841652 }, { "copy_logits_max": -5.449312210083008, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.5625, "epoch": 1.7546591779423029, "gen_logits_max": 5.260673522949219, "gen_logits_mean": -15.572288513183594, "gen_logits_min": -27.853790283203125, "gen_logits_std": 3.3347554206848145, "gen_loss": 0.2935760021209717, "grad_norm": 0.3375076748990638, "learning_rate": 2.007663157894737e-05, "loss": 0.2821, "mean_copy_accuracy": 0.997328981757164, "mean_gen_accuracy": 0.8744283765554428, "mean_token_accuracy": 0.9051494598388672, "num_tokens": 24713011.0, "sample_num_tokens": 7968.25, "step": 8592, "total_num_tokens": 24744884.0, "z_loss": 0.0004508638521656394 }, { "copy_logits_max": -3.452596664428711, "copy_logits_min": -750000128.0, "copy_num_tokens": 590.25, "epoch": 1.7548634158794996, "gen_logits_max": 4.2553229331970215, "gen_logits_mean": -15.535222053527832, "gen_logits_min": -28.497026443481445, "gen_logits_std": 3.391880512237549, "gen_loss": 0.22682029008865356, "grad_norm": 0.3412031453289828, "learning_rate": 2.0075368421052634e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9973630011081696, "mean_gen_accuracy": 0.8842450380325317, "mean_token_accuracy": 0.9118143171072006, "num_tokens": 24985267.0, "sample_num_tokens": 9387.75, "step": 8593, "total_num_tokens": 25022818.0, "z_loss": 0.00041900144424289465 }, { "copy_logits_max": -2.7178149223327637, "copy_logits_min": -687500032.0, "copy_num_tokens": 633.5, "epoch": 1.7550676538166965, "gen_logits_max": 4.345025539398193, "gen_logits_mean": -15.444122314453125, "gen_logits_min": -27.935869216918945, "gen_logits_std": 3.325016975402832, "gen_loss": 0.2902546226978302, "grad_norm": 0.35475726072780095, "learning_rate": 2.0074105263157895e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9971284866333008, "mean_gen_accuracy": 0.8769539147615433, "mean_token_accuracy": 0.9075680375099182, "num_tokens": 25269065.0, "sample_num_tokens": 11277.25, "step": 8594, "total_num_tokens": 25314174.0, "z_loss": 0.0004938308848068118 }, { "copy_logits_max": -5.069884300231934, "copy_logits_min": -750000000.0, "copy_num_tokens": 733.3125, "epoch": 1.7552718917538934, "gen_logits_max": 4.121549129486084, "gen_logits_mean": -15.904521942138672, "gen_logits_min": -28.089981079101562, "gen_logits_std": 3.4245309829711914, "gen_loss": 0.24118563532829285, "grad_norm": 0.3536682195109887, "learning_rate": 2.007284210526316e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9969962388277054, "mean_gen_accuracy": 0.880591094493866, "mean_token_accuracy": 0.9110907167196274, "num_tokens": 25560007.0, "sample_num_tokens": 10491.25, "step": 8595, "total_num_tokens": 25601972.0, "z_loss": 0.00040130247361958027 }, { "copy_logits_max": -6.0632123947143555, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.375, "epoch": 1.7554761296910901, "gen_logits_max": 2.933413028717041, "gen_logits_mean": -19.02631378173828, "gen_logits_min": -31.300132751464844, "gen_logits_std": 3.4804975986480713, "gen_loss": 0.2585856318473816, "grad_norm": 0.32578841392239855, "learning_rate": 2.007157894736842e-05, "loss": 0.2506, "mean_copy_accuracy": 0.998139500617981, "mean_gen_accuracy": 0.8816450238227844, "mean_token_accuracy": 0.9137384593486786, "num_tokens": 25854165.0, "sample_num_tokens": 7392.25, "step": 8596, "total_num_tokens": 25883734.0, "z_loss": 0.0004545334668364376 }, { "copy_logits_max": -6.805508613586426, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.0, "epoch": 1.7556803676282868, "gen_logits_max": 4.7628631591796875, "gen_logits_mean": -16.44209098815918, "gen_logits_min": -28.66048240661621, "gen_logits_std": 3.390695095062256, "gen_loss": 0.27092665433883667, "grad_norm": 0.35975443670501706, "learning_rate": 2.0070315789473685e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9971774965524673, "mean_gen_accuracy": 0.8819238096475601, "mean_token_accuracy": 0.9096777886152267, "num_tokens": 26127850.0, "sample_num_tokens": 8775.5, "step": 8597, "total_num_tokens": 26162952.0, "z_loss": 0.0004329443327151239 }, { "copy_logits_max": -5.081954002380371, "copy_logits_min": -750000000.0, "copy_num_tokens": 638.9375, "epoch": 1.7558846055654838, "gen_logits_max": 4.921380996704102, "gen_logits_mean": -15.259000778198242, "gen_logits_min": -27.504722595214844, "gen_logits_std": 3.362852096557617, "gen_loss": 0.26523730158805847, "grad_norm": 0.3289162507530674, "learning_rate": 2.0069052631578946e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9976077675819397, "mean_gen_accuracy": 0.8772604316473007, "mean_token_accuracy": 0.9081599563360214, "num_tokens": 26404466.0, "sample_num_tokens": 9802.5, "step": 8598, "total_num_tokens": 26443676.0, "z_loss": 0.0004803618066944182 }, { "copy_logits_max": -6.612086296081543, "copy_logits_min": -750000000.0, "copy_num_tokens": 489.875, "epoch": 1.7560888435026807, "gen_logits_max": 6.071149826049805, "gen_logits_mean": -13.730081558227539, "gen_logits_min": -27.20162010192871, "gen_logits_std": 3.315532684326172, "gen_loss": 0.2704474925994873, "grad_norm": 0.3651522846558959, "learning_rate": 2.006778947368421e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9967969655990601, "mean_gen_accuracy": 0.8702401518821716, "mean_token_accuracy": 0.9045396596193314, "num_tokens": 26677974.0, "sample_num_tokens": 8847.5, "step": 8599, "total_num_tokens": 26713364.0, "z_loss": 0.00043575919698923826 }, { "copy_logits_max": -6.253261089324951, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.875, "epoch": 1.7562930814398774, "gen_logits_max": 5.922443866729736, "gen_logits_mean": -14.221138954162598, "gen_logits_min": -27.039962768554688, "gen_logits_std": 3.3458456993103027, "gen_loss": 0.30142128467559814, "grad_norm": 0.35780183849738434, "learning_rate": 2.0066526315789474e-05, "loss": 0.2782, "mean_copy_accuracy": 0.9974204748868942, "mean_gen_accuracy": 0.8695842772722244, "mean_token_accuracy": 0.9051994979381561, "num_tokens": 26966141.0, "sample_num_tokens": 8282.75, "step": 8600, "total_num_tokens": 26999272.0, "z_loss": 0.0004845521179959178 }, { "copy_logits_max": -4.868476390838623, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.125, "epoch": 1.7564973193770743, "gen_logits_max": 5.809237957000732, "gen_logits_mean": -14.320833206176758, "gen_logits_min": -27.088769912719727, "gen_logits_std": 3.351989984512329, "gen_loss": 0.27291738986968994, "grad_norm": 0.3207522897723404, "learning_rate": 2.006526315789474e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9971322864294052, "mean_gen_accuracy": 0.8742793053388596, "mean_token_accuracy": 0.9029737114906311, "num_tokens": 27244278.0, "sample_num_tokens": 7701.0, "step": 8601, "total_num_tokens": 27275082.0, "z_loss": 0.0004431583802215755 }, { "copy_logits_max": -6.5604248046875, "copy_logits_min": -750000064.0, "copy_num_tokens": 488.25, "epoch": 1.7567015573142712, "gen_logits_max": 3.795963764190674, "gen_logits_mean": -17.25434112548828, "gen_logits_min": -29.36934471130371, "gen_logits_std": 3.415955066680908, "gen_loss": 0.2828371822834015, "grad_norm": 0.35405246974781474, "learning_rate": 2.0064e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9969413131475449, "mean_gen_accuracy": 0.8791979402303696, "mean_token_accuracy": 0.905269667506218, "num_tokens": 27511510.0, "sample_num_tokens": 9330.5, "step": 8602, "total_num_tokens": 27548832.0, "z_loss": 0.0004625743313226849 }, { "copy_logits_max": -3.6467702388763428, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.5625, "epoch": 1.756905795251468, "gen_logits_max": 4.713488578796387, "gen_logits_mean": -15.178518295288086, "gen_logits_min": -27.610519409179688, "gen_logits_std": 3.366441249847412, "gen_loss": 0.262625128030777, "grad_norm": 0.36643279742514695, "learning_rate": 2.0062736842105264e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9971324801445007, "mean_gen_accuracy": 0.8736993372440338, "mean_token_accuracy": 0.905039444565773, "num_tokens": 27808179.0, "sample_num_tokens": 9518.75, "step": 8603, "total_num_tokens": 27846254.0, "z_loss": 0.00043160736095160246 }, { "copy_logits_max": -6.976022243499756, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.6875, "epoch": 1.7571100331886647, "gen_logits_max": 5.833014965057373, "gen_logits_mean": -15.946900367736816, "gen_logits_min": -28.109691619873047, "gen_logits_std": 3.383852958679199, "gen_loss": 0.2903490364551544, "grad_norm": 0.3928751086251696, "learning_rate": 2.0061473684210528e-05, "loss": 0.287, "mean_copy_accuracy": 0.995682030916214, "mean_gen_accuracy": 0.8817011713981628, "mean_token_accuracy": 0.9027035534381866, "num_tokens": 28048233.0, "sample_num_tokens": 7302.25, "step": 8604, "total_num_tokens": 28077442.0, "z_loss": 0.00045347114792093635 }, { "copy_logits_max": -2.424208641052246, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.0625, "epoch": 1.7573142711258616, "gen_logits_max": 3.6500420570373535, "gen_logits_mean": -16.947858810424805, "gen_logits_min": -29.925098419189453, "gen_logits_std": 3.463763475418091, "gen_loss": 0.24892455339431763, "grad_norm": 0.339386906836641, "learning_rate": 2.006021052631579e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9970487803220749, "mean_gen_accuracy": 0.8824062049388885, "mean_token_accuracy": 0.9096047282218933, "num_tokens": 28301905.0, "sample_num_tokens": 8221.25, "step": 8605, "total_num_tokens": 28334790.0, "z_loss": 0.00041501654777675867 }, { "copy_logits_max": -6.5877251625061035, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.375, "epoch": 1.7575185090630585, "gen_logits_max": 6.085628509521484, "gen_logits_mean": -15.738751411437988, "gen_logits_min": -27.71080780029297, "gen_logits_std": 3.4114503860473633, "gen_loss": 0.2590826749801636, "grad_norm": 0.3669208839283549, "learning_rate": 2.0058947368421054e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9977027922868729, "mean_gen_accuracy": 0.8792978227138519, "mean_token_accuracy": 0.9080730974674225, "num_tokens": 28573814.0, "sample_num_tokens": 7842.5, "step": 8606, "total_num_tokens": 28605184.0, "z_loss": 0.0003776711237151176 }, { "copy_logits_max": -5.674082279205322, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.125, "epoch": 1.7577227470002552, "gen_logits_max": 4.94663667678833, "gen_logits_mean": -16.49152183532715, "gen_logits_min": -29.04357147216797, "gen_logits_std": 3.42275333404541, "gen_loss": 0.2692760229110718, "grad_norm": 0.36732874065049664, "learning_rate": 2.0057684210526314e-05, "loss": 0.269, "mean_copy_accuracy": 0.9968936145305634, "mean_gen_accuracy": 0.8805058002471924, "mean_token_accuracy": 0.9075850695371628, "num_tokens": 28832583.0, "sample_num_tokens": 7695.25, "step": 8607, "total_num_tokens": 28863364.0, "z_loss": 0.00039331259904429317 }, { "copy_logits_max": -6.953081130981445, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.3125, "epoch": 1.7579269849374521, "gen_logits_max": 6.449488162994385, "gen_logits_mean": -14.921492576599121, "gen_logits_min": -27.451805114746094, "gen_logits_std": 3.3862099647521973, "gen_loss": 0.26637089252471924, "grad_norm": 0.33741072932465954, "learning_rate": 2.0056421052631582e-05, "loss": 0.2509, "mean_copy_accuracy": 0.9978303462266922, "mean_gen_accuracy": 0.8858705908060074, "mean_token_accuracy": 0.915705218911171, "num_tokens": 29091522.0, "sample_num_tokens": 8360.0, "step": 8608, "total_num_tokens": 29124962.0, "z_loss": 0.0004018122854176909 }, { "copy_logits_max": -2.560999870300293, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.6875, "epoch": 1.758131222874649, "gen_logits_max": 3.7890896797180176, "gen_logits_mean": -17.0582332611084, "gen_logits_min": -29.632633209228516, "gen_logits_std": 3.456146717071533, "gen_loss": 0.2793201208114624, "grad_norm": 0.38252963034632226, "learning_rate": 2.0055157894736843e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9977340698242188, "mean_gen_accuracy": 0.871256873011589, "mean_token_accuracy": 0.90492944419384, "num_tokens": 29341169.0, "sample_num_tokens": 8775.25, "step": 8609, "total_num_tokens": 29376270.0, "z_loss": 0.0004377390432637185 }, { "copy_logits_max": -3.5757861137390137, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.375, "epoch": 1.7583354608118458, "gen_logits_max": 5.8436455726623535, "gen_logits_mean": -14.668207168579102, "gen_logits_min": -27.57168960571289, "gen_logits_std": 3.366032600402832, "gen_loss": 0.26855748891830444, "grad_norm": 0.36882939409326576, "learning_rate": 2.0053894736842107e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9967145472764969, "mean_gen_accuracy": 0.8723518252372742, "mean_token_accuracy": 0.9014150500297546, "num_tokens": 29596057.0, "sample_num_tokens": 8760.75, "step": 8610, "total_num_tokens": 29631100.0, "z_loss": 0.0004325009067542851 }, { "copy_logits_max": -5.409491539001465, "copy_logits_min": -750000064.0, "copy_num_tokens": 475.375, "epoch": 1.7585396987490425, "gen_logits_max": 5.452192783355713, "gen_logits_mean": -13.988115310668945, "gen_logits_min": -26.673532485961914, "gen_logits_std": 3.2901370525360107, "gen_loss": 0.24756957590579987, "grad_norm": 0.3821649525779185, "learning_rate": 2.0052631578947368e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9972743988037109, "mean_gen_accuracy": 0.8717174977064133, "mean_token_accuracy": 0.9047553837299347, "num_tokens": 29855225.0, "sample_num_tokens": 7664.25, "step": 8611, "total_num_tokens": 29885882.0, "z_loss": 0.00035083212424069643 }, { "copy_logits_max": -1.4726848602294922, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.3125, "epoch": 1.7587439366862396, "gen_logits_max": 6.1453046798706055, "gen_logits_mean": -14.742559432983398, "gen_logits_min": -27.099407196044922, "gen_logits_std": 3.3682315349578857, "gen_loss": 0.29648706316947937, "grad_norm": 0.37450230021112696, "learning_rate": 2.0051368421052633e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9965641647577286, "mean_gen_accuracy": 0.8777434378862381, "mean_token_accuracy": 0.9040045738220215, "num_tokens": 30097599.0, "sample_num_tokens": 7473.75, "step": 8612, "total_num_tokens": 30127494.0, "z_loss": 0.00041264673927798867 }, { "copy_logits_max": -3.440075635910034, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.5625, "epoch": 1.7589481746234363, "gen_logits_max": 3.5723626613616943, "gen_logits_mean": -17.420101165771484, "gen_logits_min": -29.85063934326172, "gen_logits_std": 3.4797139167785645, "gen_loss": 0.258334219455719, "grad_norm": 0.36130502336719006, "learning_rate": 2.0050105263157894e-05, "loss": 0.2813, "mean_copy_accuracy": 0.9970099031925201, "mean_gen_accuracy": 0.8691805005073547, "mean_token_accuracy": 0.903561070561409, "num_tokens": 30362796.0, "sample_num_tokens": 9095.0, "step": 8613, "total_num_tokens": 30399176.0, "z_loss": 0.0003936877765227109 }, { "copy_logits_max": -3.9949915409088135, "copy_logits_min": -687500032.0, "copy_num_tokens": 466.8125, "epoch": 1.759152412560633, "gen_logits_max": 3.732937812805176, "gen_logits_mean": -17.61534881591797, "gen_logits_min": -29.921730041503906, "gen_logits_std": 3.4483089447021484, "gen_loss": 0.2959291338920593, "grad_norm": 0.3738564243821943, "learning_rate": 2.0048842105263158e-05, "loss": 0.2765, "mean_copy_accuracy": 0.9974980652332306, "mean_gen_accuracy": 0.8747886568307877, "mean_token_accuracy": 0.9057015031576157, "num_tokens": 30634667.0, "sample_num_tokens": 8594.75, "step": 8614, "total_num_tokens": 30669046.0, "z_loss": 0.000459770584711805 }, { "copy_logits_max": -5.5063323974609375, "copy_logits_min": -750000064.0, "copy_num_tokens": 420.6875, "epoch": 1.75935665049783, "gen_logits_max": 5.171730995178223, "gen_logits_mean": -16.083614349365234, "gen_logits_min": -28.449390411376953, "gen_logits_std": 3.4110777378082275, "gen_loss": 0.27586182951927185, "grad_norm": 0.3753541054984893, "learning_rate": 2.004757894736842e-05, "loss": 0.2864, "mean_copy_accuracy": 0.9973945468664169, "mean_gen_accuracy": 0.8755562454462051, "mean_token_accuracy": 0.9043473154306412, "num_tokens": 30898999.0, "sample_num_tokens": 8512.25, "step": 8615, "total_num_tokens": 30933048.0, "z_loss": 0.0004056463949382305 }, { "copy_logits_max": -4.722955226898193, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.75, "epoch": 1.759560888435027, "gen_logits_max": 4.136686325073242, "gen_logits_mean": -17.118526458740234, "gen_logits_min": -29.654586791992188, "gen_logits_std": 3.4433510303497314, "gen_loss": 0.26340293884277344, "grad_norm": 0.37452584273095724, "learning_rate": 2.0046315789473686e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9957834184169769, "mean_gen_accuracy": 0.8791511803865433, "mean_token_accuracy": 0.9035981297492981, "num_tokens": 31154187.0, "sample_num_tokens": 7523.75, "step": 8616, "total_num_tokens": 31184282.0, "z_loss": 0.0004056230536662042 }, { "copy_logits_max": -4.72819709777832, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.75, "epoch": 1.7597651263722236, "gen_logits_max": 3.6623363494873047, "gen_logits_mean": -17.65966033935547, "gen_logits_min": -29.943252563476562, "gen_logits_std": 3.466136932373047, "gen_loss": 0.25779297947883606, "grad_norm": 0.34172301922621084, "learning_rate": 2.004505263157895e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9963003844022751, "mean_gen_accuracy": 0.8743396699428558, "mean_token_accuracy": 0.9051819145679474, "num_tokens": 31420345.0, "sample_num_tokens": 8694.25, "step": 8617, "total_num_tokens": 31455122.0, "z_loss": 0.00034875061828643084 }, { "copy_logits_max": -4.6955952644348145, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.125, "epoch": 1.7599693643094205, "gen_logits_max": 2.4961066246032715, "gen_logits_mean": -17.372020721435547, "gen_logits_min": -29.91424560546875, "gen_logits_std": 3.4488797187805176, "gen_loss": 0.23659256100654602, "grad_norm": 0.3339253705345239, "learning_rate": 2.0043789473684212e-05, "loss": 0.2587, "mean_copy_accuracy": 0.997078612446785, "mean_gen_accuracy": 0.874155655503273, "mean_token_accuracy": 0.9108423292636871, "num_tokens": 31707882.0, "sample_num_tokens": 8662.0, "step": 8618, "total_num_tokens": 31742530.0, "z_loss": 0.0003597161849029362 }, { "copy_logits_max": -5.716123580932617, "copy_logits_min": -687500032.0, "copy_num_tokens": 336.4375, "epoch": 1.7601736022466175, "gen_logits_max": 3.5254719257354736, "gen_logits_mean": -18.32968521118164, "gen_logits_min": -30.49603271484375, "gen_logits_std": 3.466421604156494, "gen_loss": 0.2742643356323242, "grad_norm": 0.369413937568435, "learning_rate": 2.0042526315789476e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9969095140695572, "mean_gen_accuracy": 0.8734479248523712, "mean_token_accuracy": 0.9000836610794067, "num_tokens": 31951780.0, "sample_num_tokens": 7877.0, "step": 8619, "total_num_tokens": 31983288.0, "z_loss": 0.0004119263612665236 }, { "copy_logits_max": -7.270573616027832, "copy_logits_min": -687500096.0, "copy_num_tokens": 341.3125, "epoch": 1.7603778401838142, "gen_logits_max": 5.38286018371582, "gen_logits_mean": -15.230953216552734, "gen_logits_min": -27.74419403076172, "gen_logits_std": 3.3518049716949463, "gen_loss": 0.2936117649078369, "grad_norm": 0.3748980182661977, "learning_rate": 2.0041263157894737e-05, "loss": 0.2906, "mean_copy_accuracy": 0.9973690807819366, "mean_gen_accuracy": 0.8695755302906036, "mean_token_accuracy": 0.902387261390686, "num_tokens": 32226943.0, "sample_num_tokens": 8376.75, "step": 8620, "total_num_tokens": 32260450.0, "z_loss": 0.0004423983918968588 }, { "copy_logits_max": -6.602761268615723, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.0625, "epoch": 1.7605820781210109, "gen_logits_max": 5.0561747550964355, "gen_logits_mean": -17.343658447265625, "gen_logits_min": -29.036848068237305, "gen_logits_std": 3.4068470001220703, "gen_loss": 0.33169859647750854, "grad_norm": 0.3553305629473345, "learning_rate": 2.004e-05, "loss": 0.2951, "mean_copy_accuracy": 0.9963603168725967, "mean_gen_accuracy": 0.8738898485898972, "mean_token_accuracy": 0.8985159695148468, "num_tokens": 32496540.0, "sample_num_tokens": 7972.0, "step": 8621, "total_num_tokens": 32528428.0, "z_loss": 0.0004975391202606261 }, { "copy_logits_max": -6.194924354553223, "copy_logits_min": -625000064.0, "copy_num_tokens": 399.1875, "epoch": 1.7607863160582078, "gen_logits_max": 3.342411756515503, "gen_logits_mean": -17.93201446533203, "gen_logits_min": -30.05176544189453, "gen_logits_std": 3.446502685546875, "gen_loss": 0.29598483443260193, "grad_norm": 0.3509004008329245, "learning_rate": 2.0038736842105262e-05, "loss": 0.276, "mean_copy_accuracy": 0.99770887196064, "mean_gen_accuracy": 0.8760148733854294, "mean_token_accuracy": 0.9057213068008423, "num_tokens": 32764284.0, "sample_num_tokens": 8533.0, "step": 8622, "total_num_tokens": 32798416.0, "z_loss": 0.0004178575472906232 }, { "copy_logits_max": -4.846682071685791, "copy_logits_min": -687500032.0, "copy_num_tokens": 388.5, "epoch": 1.7609905539954047, "gen_logits_max": 3.8040173053741455, "gen_logits_mean": -17.275550842285156, "gen_logits_min": -29.291427612304688, "gen_logits_std": 3.4266064167022705, "gen_loss": 0.2704131603240967, "grad_norm": 0.35196254347339145, "learning_rate": 2.0037473684210526e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9970654398202896, "mean_gen_accuracy": 0.875166267156601, "mean_token_accuracy": 0.908234491944313, "num_tokens": 33035969.0, "sample_num_tokens": 7739.25, "step": 8623, "total_num_tokens": 33066926.0, "z_loss": 0.0004069715505465865 }, { "copy_logits_max": -3.67370867729187, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.1875, "epoch": 1.7611947919326014, "gen_logits_max": 5.658975601196289, "gen_logits_mean": -13.285181999206543, "gen_logits_min": -25.646427154541016, "gen_logits_std": 3.3291563987731934, "gen_loss": 0.2558836340904236, "grad_norm": 0.35929475808771494, "learning_rate": 2.0036210526315787e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9969944059848785, "mean_gen_accuracy": 0.8817311674356461, "mean_token_accuracy": 0.9091458171606064, "num_tokens": 33302592.0, "sample_num_tokens": 9212.5, "step": 8624, "total_num_tokens": 33339442.0, "z_loss": 0.0003806372988037765 }, { "copy_logits_max": -3.443328380584717, "copy_logits_min": -687500032.0, "copy_num_tokens": 597.0625, "epoch": 1.7613990298697983, "gen_logits_max": 3.900418281555176, "gen_logits_mean": -15.663932800292969, "gen_logits_min": -27.79232406616211, "gen_logits_std": 3.3976125717163086, "gen_loss": 0.24268727004528046, "grad_norm": 0.3623836683793682, "learning_rate": 2.0034947368421055e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9970935434103012, "mean_gen_accuracy": 0.8812083899974823, "mean_token_accuracy": 0.9116410315036774, "num_tokens": 33585192.0, "sample_num_tokens": 9109.5, "step": 8625, "total_num_tokens": 33621630.0, "z_loss": 0.0003398681292310357 }, { "copy_logits_max": -3.9535791873931885, "copy_logits_min": -750000064.0, "copy_num_tokens": 361.9375, "epoch": 1.7616032678069953, "gen_logits_max": 5.411108016967773, "gen_logits_mean": -14.426529884338379, "gen_logits_min": -27.043575286865234, "gen_logits_std": 3.375823974609375, "gen_loss": 0.2783222198486328, "grad_norm": 0.3698242023378295, "learning_rate": 2.0033684210526316e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9954759478569031, "mean_gen_accuracy": 0.8856610953807831, "mean_token_accuracy": 0.9083801209926605, "num_tokens": 33829856.0, "sample_num_tokens": 7070.5, "step": 8626, "total_num_tokens": 33858138.0, "z_loss": 0.0004340072628110647 }, { "copy_logits_max": -6.040925025939941, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.125, "epoch": 1.761807505744192, "gen_logits_max": 3.0128283500671387, "gen_logits_mean": -17.704551696777344, "gen_logits_min": -29.821508407592773, "gen_logits_std": 3.4701929092407227, "gen_loss": 0.25250792503356934, "grad_norm": 0.32311195719013525, "learning_rate": 2.003242105263158e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9969582557678223, "mean_gen_accuracy": 0.8790562152862549, "mean_token_accuracy": 0.9101748764514923, "num_tokens": 34100669.0, "sample_num_tokens": 7969.25, "step": 8627, "total_num_tokens": 34132546.0, "z_loss": 0.0003674929612316191 }, { "copy_logits_max": -4.162942409515381, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.625, "epoch": 1.7620117436813887, "gen_logits_max": 3.8403587341308594, "gen_logits_mean": -17.063861846923828, "gen_logits_min": -28.902931213378906, "gen_logits_std": 3.3899714946746826, "gen_loss": 0.2661193013191223, "grad_norm": 0.32751529141058305, "learning_rate": 2.003115789473684e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9968057721853256, "mean_gen_accuracy": 0.8801550418138504, "mean_token_accuracy": 0.9072065353393555, "num_tokens": 34382599.0, "sample_num_tokens": 9084.75, "step": 8628, "total_num_tokens": 34418938.0, "z_loss": 0.0004226762684993446 }, { "copy_logits_max": -6.262388229370117, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.75, "epoch": 1.7622159816185856, "gen_logits_max": 4.559029579162598, "gen_logits_mean": -15.730331420898438, "gen_logits_min": -27.954418182373047, "gen_logits_std": 3.3730294704437256, "gen_loss": 0.25610819458961487, "grad_norm": 0.35985213375984004, "learning_rate": 2.0029894736842106e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9956938326358795, "mean_gen_accuracy": 0.8735799342393875, "mean_token_accuracy": 0.9016474485397339, "num_tokens": 34644862.0, "sample_num_tokens": 8622.0, "step": 8629, "total_num_tokens": 34679350.0, "z_loss": 0.000413482659496367 }, { "copy_logits_max": -4.721656322479248, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.4375, "epoch": 1.7624202195557825, "gen_logits_max": 3.868821144104004, "gen_logits_mean": -17.611248016357422, "gen_logits_min": -30.150779724121094, "gen_logits_std": 3.4353957176208496, "gen_loss": 0.28983038663864136, "grad_norm": 0.3598401501633551, "learning_rate": 2.0028631578947367e-05, "loss": 0.2911, "mean_copy_accuracy": 0.9969549477100372, "mean_gen_accuracy": 0.8735142946243286, "mean_token_accuracy": 0.9013145714998245, "num_tokens": 34908232.0, "sample_num_tokens": 8675.0, "step": 8630, "total_num_tokens": 34942932.0, "z_loss": 0.00045279195182956755 }, { "copy_logits_max": -3.6122097969055176, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.0625, "epoch": 1.7626244574929792, "gen_logits_max": 4.707494735717773, "gen_logits_mean": -16.16436767578125, "gen_logits_min": -28.027950286865234, "gen_logits_std": 3.3759536743164062, "gen_loss": 0.2708238363265991, "grad_norm": 0.30673226759443345, "learning_rate": 2.002736842105263e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9976219534873962, "mean_gen_accuracy": 0.8804126232862473, "mean_token_accuracy": 0.9111712723970413, "num_tokens": 35206066.0, "sample_num_tokens": 9338.0, "step": 8631, "total_num_tokens": 35243418.0, "z_loss": 0.0004344428307376802 }, { "copy_logits_max": -2.843095302581787, "copy_logits_min": -750000064.0, "copy_num_tokens": 557.625, "epoch": 1.7628286954301762, "gen_logits_max": 4.679451942443848, "gen_logits_mean": -15.537943840026855, "gen_logits_min": -27.91077423095703, "gen_logits_std": 3.36602520942688, "gen_loss": 0.26795417070388794, "grad_norm": 0.3514937068681826, "learning_rate": 2.0026105263157895e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9970138818025589, "mean_gen_accuracy": 0.8757444471120834, "mean_token_accuracy": 0.9058589041233063, "num_tokens": 35473470.0, "sample_num_tokens": 8304.5, "step": 8632, "total_num_tokens": 35506688.0, "z_loss": 0.00042896243394352496 }, { "copy_logits_max": -3.319734573364258, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.75, "epoch": 1.763032933367373, "gen_logits_max": 5.157227993011475, "gen_logits_mean": -14.32676887512207, "gen_logits_min": -26.697467803955078, "gen_logits_std": 3.313896894454956, "gen_loss": 0.3009248375892639, "grad_norm": 0.3544119665422847, "learning_rate": 2.002484210526316e-05, "loss": 0.2908, "mean_copy_accuracy": 0.9958184957504272, "mean_gen_accuracy": 0.870178833603859, "mean_token_accuracy": 0.9014745503664017, "num_tokens": 35751073.0, "sample_num_tokens": 9048.25, "step": 8633, "total_num_tokens": 35787266.0, "z_loss": 0.0005182208842597902 }, { "copy_logits_max": -5.925209045410156, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.3125, "epoch": 1.7632371713045698, "gen_logits_max": 4.132474899291992, "gen_logits_mean": -17.30434799194336, "gen_logits_min": -29.8009033203125, "gen_logits_std": 3.402250289916992, "gen_loss": 0.2872588038444519, "grad_norm": 0.35556529233458517, "learning_rate": 2.0023578947368424e-05, "loss": 0.2898, "mean_copy_accuracy": 0.997669979929924, "mean_gen_accuracy": 0.8695991635322571, "mean_token_accuracy": 0.9021437764167786, "num_tokens": 36035274.0, "sample_num_tokens": 8872.5, "step": 8634, "total_num_tokens": 36070764.0, "z_loss": 0.00047282499144785106 }, { "copy_logits_max": -5.929060935974121, "copy_logits_min": -687500032.0, "copy_num_tokens": 341.25, "epoch": 1.7634414092417665, "gen_logits_max": 4.445096015930176, "gen_logits_mean": -16.690540313720703, "gen_logits_min": -29.469310760498047, "gen_logits_std": 3.3732404708862305, "gen_loss": 0.3052961528301239, "grad_norm": 0.3440857935388281, "learning_rate": 2.0022315789473685e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9973708093166351, "mean_gen_accuracy": 0.8810762465000153, "mean_token_accuracy": 0.908125028014183, "num_tokens": 36300707.0, "sample_num_tokens": 7470.75, "step": 8635, "total_num_tokens": 36330590.0, "z_loss": 0.0004931599833071232 }, { "copy_logits_max": -5.321324348449707, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.8125, "epoch": 1.7636456471789634, "gen_logits_max": 3.5192971229553223, "gen_logits_mean": -17.258220672607422, "gen_logits_min": -29.795913696289062, "gen_logits_std": 3.387068271636963, "gen_loss": 0.27619802951812744, "grad_norm": 0.3264720874373961, "learning_rate": 2.002105263157895e-05, "loss": 0.2614, "mean_copy_accuracy": 0.996838852763176, "mean_gen_accuracy": 0.881101205945015, "mean_token_accuracy": 0.9111940413713455, "num_tokens": 36567394.0, "sample_num_tokens": 6727.0, "step": 8636, "total_num_tokens": 36594302.0, "z_loss": 0.00045263319043442607 }, { "copy_logits_max": -6.582800388336182, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.0, "epoch": 1.7638498851161604, "gen_logits_max": 4.210605621337891, "gen_logits_mean": -17.19147300720215, "gen_logits_min": -29.29977035522461, "gen_logits_std": 3.400665283203125, "gen_loss": 0.2803446650505066, "grad_norm": 0.37883240106048455, "learning_rate": 2.001978947368421e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9965039491653442, "mean_gen_accuracy": 0.8808761090040207, "mean_token_accuracy": 0.9056373238563538, "num_tokens": 36816751.0, "sample_num_tokens": 8128.25, "step": 8637, "total_num_tokens": 36849264.0, "z_loss": 0.00047289172653108835 }, { "copy_logits_max": -3.735764503479004, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.1875, "epoch": 1.764054123053357, "gen_logits_max": 4.338064193725586, "gen_logits_mean": -16.71479034423828, "gen_logits_min": -29.16929817199707, "gen_logits_std": 3.3757710456848145, "gen_loss": 0.2537764310836792, "grad_norm": 0.35420629869950343, "learning_rate": 2.0018526315789474e-05, "loss": 0.2825, "mean_copy_accuracy": 0.9961312264204025, "mean_gen_accuracy": 0.8736941814422607, "mean_token_accuracy": 0.9030403345823288, "num_tokens": 37072534.0, "sample_num_tokens": 8041.0, "step": 8638, "total_num_tokens": 37104698.0, "z_loss": 0.00042347738053649664 }, { "copy_logits_max": -1.8194785118103027, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.375, "epoch": 1.764258360990554, "gen_logits_max": 4.15897274017334, "gen_logits_mean": -15.82034683227539, "gen_logits_min": -27.83221435546875, "gen_logits_std": 3.3027279376983643, "gen_loss": 0.26014941930770874, "grad_norm": 0.3392271957980603, "learning_rate": 2.0017263157894735e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9969831854104996, "mean_gen_accuracy": 0.8805253356695175, "mean_token_accuracy": 0.9066612273454666, "num_tokens": 37338144.0, "sample_num_tokens": 9423.0, "step": 8639, "total_num_tokens": 37375836.0, "z_loss": 0.0004470526473596692 }, { "copy_logits_max": -1.4784085750579834, "copy_logits_min": -687500032.0, "copy_num_tokens": 440.3125, "epoch": 1.764462598927751, "gen_logits_max": 3.384748935699463, "gen_logits_mean": -17.672216415405273, "gen_logits_min": -29.923030853271484, "gen_logits_std": 3.4064555168151855, "gen_loss": 0.24910742044448853, "grad_norm": 0.3558606108273989, "learning_rate": 2.0016e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9979488253593445, "mean_gen_accuracy": 0.8781884759664536, "mean_token_accuracy": 0.9080200046300888, "num_tokens": 37612787.0, "sample_num_tokens": 7868.75, "step": 8640, "total_num_tokens": 37644262.0, "z_loss": 0.00041269653593190014 }, { "copy_logits_max": -1.2918286323547363, "copy_logits_min": -750000000.0, "copy_num_tokens": 635.0, "epoch": 1.7646668368649476, "gen_logits_max": 3.459371566772461, "gen_logits_mean": -16.733243942260742, "gen_logits_min": -28.968944549560547, "gen_logits_std": 3.368738889694214, "gen_loss": 0.2654341757297516, "grad_norm": 0.39528222813239655, "learning_rate": 2.0014736842105264e-05, "loss": 0.279, "mean_copy_accuracy": 0.9964577853679657, "mean_gen_accuracy": 0.8770636171102524, "mean_token_accuracy": 0.9077218025922775, "num_tokens": 37883321.0, "sample_num_tokens": 9125.75, "step": 8641, "total_num_tokens": 37919824.0, "z_loss": 0.0004692368966061622 }, { "copy_logits_max": -4.467755317687988, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0625, "epoch": 1.7648710748021443, "gen_logits_max": 3.201284885406494, "gen_logits_mean": -18.515193939208984, "gen_logits_min": -30.462684631347656, "gen_logits_std": 3.4409501552581787, "gen_loss": 0.3139149844646454, "grad_norm": 0.3749336945490563, "learning_rate": 2.0013473684210528e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9965602308511734, "mean_gen_accuracy": 0.8801752626895905, "mean_token_accuracy": 0.9080403745174408, "num_tokens": 38153659.0, "sample_num_tokens": 7649.75, "step": 8642, "total_num_tokens": 38184258.0, "z_loss": 0.0005030176835134625 }, { "copy_logits_max": 0.07179731130599976, "copy_logits_min": -750000064.0, "copy_num_tokens": 469.5625, "epoch": 1.7650753127393415, "gen_logits_max": 4.36566162109375, "gen_logits_mean": -15.38440990447998, "gen_logits_min": -27.908029556274414, "gen_logits_std": 3.317084312438965, "gen_loss": 0.2913416028022766, "grad_norm": 0.3529211559625423, "learning_rate": 2.001221052631579e-05, "loss": 0.2916, "mean_copy_accuracy": 0.9961746335029602, "mean_gen_accuracy": 0.8714392930269241, "mean_token_accuracy": 0.8994294106960297, "num_tokens": 38401873.0, "sample_num_tokens": 7826.25, "step": 8643, "total_num_tokens": 38433178.0, "z_loss": 0.00042772371671162546 }, { "copy_logits_max": -2.483046293258667, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.3125, "epoch": 1.7652795506765382, "gen_logits_max": 3.770214796066284, "gen_logits_mean": -16.25890350341797, "gen_logits_min": -28.444894790649414, "gen_logits_std": 3.3037633895874023, "gen_loss": 0.3116532564163208, "grad_norm": 0.3625093521467957, "learning_rate": 2.0010947368421053e-05, "loss": 0.2873, "mean_copy_accuracy": 0.9977860152721405, "mean_gen_accuracy": 0.872489407658577, "mean_token_accuracy": 0.901061624288559, "num_tokens": 38662068.0, "sample_num_tokens": 7671.5, "step": 8644, "total_num_tokens": 38692754.0, "z_loss": 0.00042664463398978114 }, { "copy_logits_max": -1.998655080795288, "copy_logits_min": -687500032.0, "copy_num_tokens": 458.6875, "epoch": 1.765483788613735, "gen_logits_max": 3.861849784851074, "gen_logits_mean": -16.041000366210938, "gen_logits_min": -28.59081268310547, "gen_logits_std": 3.3351635932922363, "gen_loss": 0.26825645565986633, "grad_norm": 0.3594568764881995, "learning_rate": 2.0009684210526318e-05, "loss": 0.2784, "mean_copy_accuracy": 0.99729123711586, "mean_gen_accuracy": 0.8742428123950958, "mean_token_accuracy": 0.9053080826997757, "num_tokens": 38920044.0, "sample_num_tokens": 8234.5, "step": 8645, "total_num_tokens": 38952982.0, "z_loss": 0.0004283144953660667 }, { "copy_logits_max": -3.0384926795959473, "copy_logits_min": -750000128.0, "copy_num_tokens": 359.3125, "epoch": 1.7656880265509318, "gen_logits_max": 4.068559169769287, "gen_logits_mean": -16.452377319335938, "gen_logits_min": -29.33758544921875, "gen_logits_std": 3.361780881881714, "gen_loss": 0.25008636713027954, "grad_norm": 0.34787395200910953, "learning_rate": 2.000842105263158e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9970133155584335, "mean_gen_accuracy": 0.8819571882486343, "mean_token_accuracy": 0.9070261269807816, "num_tokens": 39189291.0, "sample_num_tokens": 7818.25, "step": 8646, "total_num_tokens": 39220564.0, "z_loss": 0.0003652655868791044 }, { "copy_logits_max": -4.676628589630127, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.9375, "epoch": 1.7658922644881287, "gen_logits_max": 4.027191162109375, "gen_logits_mean": -16.53182029724121, "gen_logits_min": -29.529401779174805, "gen_logits_std": 3.3762195110321045, "gen_loss": 0.27813053131103516, "grad_norm": 0.3644278291828971, "learning_rate": 2.0007157894736843e-05, "loss": 0.288, "mean_copy_accuracy": 0.9962343275547028, "mean_gen_accuracy": 0.8736802935600281, "mean_token_accuracy": 0.9028062224388123, "num_tokens": 39461752.0, "sample_num_tokens": 8441.5, "step": 8647, "total_num_tokens": 39495518.0, "z_loss": 0.0004205536679364741 }, { "copy_logits_max": -3.2092490196228027, "copy_logits_min": -687500032.0, "copy_num_tokens": 506.3125, "epoch": 1.7660965024253255, "gen_logits_max": 2.614374876022339, "gen_logits_mean": -18.487377166748047, "gen_logits_min": -31.037525177001953, "gen_logits_std": 3.4446661472320557, "gen_loss": 0.27431952953338623, "grad_norm": 0.37717583642595964, "learning_rate": 2.0005894736842104e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9957981705665588, "mean_gen_accuracy": 0.8746128231287003, "mean_token_accuracy": 0.9018035233020782, "num_tokens": 39717316.0, "sample_num_tokens": 8147.0, "step": 8648, "total_num_tokens": 39749904.0, "z_loss": 0.00043522490886971354 }, { "copy_logits_max": -3.1795310974121094, "copy_logits_min": -687500032.0, "copy_num_tokens": 279.5, "epoch": 1.7663007403625224, "gen_logits_max": 5.135163307189941, "gen_logits_mean": -16.18598175048828, "gen_logits_min": -29.257160186767578, "gen_logits_std": 3.3482108116149902, "gen_loss": 0.2618805170059204, "grad_norm": 0.3546066766086349, "learning_rate": 2.000463157894737e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9955136328935623, "mean_gen_accuracy": 0.876554399728775, "mean_token_accuracy": 0.9011773020029068, "num_tokens": 39990734.0, "sample_num_tokens": 7189.5, "step": 8649, "total_num_tokens": 40019492.0, "z_loss": 0.0004185681464150548 }, { "copy_logits_max": -0.39362454414367676, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.375, "epoch": 1.7665049782997193, "gen_logits_max": 5.247165203094482, "gen_logits_mean": -14.962967872619629, "gen_logits_min": -28.04343032836914, "gen_logits_std": 3.292292594909668, "gen_loss": 0.310418963432312, "grad_norm": 0.3576141452568531, "learning_rate": 2.0003368421052632e-05, "loss": 0.2969, "mean_copy_accuracy": 0.9966429322957993, "mean_gen_accuracy": 0.8726000934839249, "mean_token_accuracy": 0.8989726155996323, "num_tokens": 40246364.0, "sample_num_tokens": 8073.5, "step": 8650, "total_num_tokens": 40278658.0, "z_loss": 0.0005119341658428311 }, { "copy_logits_max": -0.68539959192276, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.75, "epoch": 1.766709216236916, "gen_logits_max": 2.800107479095459, "gen_logits_mean": -17.58873748779297, "gen_logits_min": -30.805330276489258, "gen_logits_std": 3.424825668334961, "gen_loss": 0.24568697810173035, "grad_norm": 0.3499072165971208, "learning_rate": 2.0002105263157897e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9977335333824158, "mean_gen_accuracy": 0.8750298768281937, "mean_token_accuracy": 0.9056081175804138, "num_tokens": 40526376.0, "sample_num_tokens": 8813.5, "step": 8651, "total_num_tokens": 40561630.0, "z_loss": 0.00039903464494273067 }, { "copy_logits_max": -1.9863100051879883, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.0, "epoch": 1.7669134541741127, "gen_logits_max": 3.6821956634521484, "gen_logits_mean": -16.87038230895996, "gen_logits_min": -29.823198318481445, "gen_logits_std": 3.39957594871521, "gen_loss": 0.27920717000961304, "grad_norm": 0.36921003250556345, "learning_rate": 2.0000842105263158e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9969610124826431, "mean_gen_accuracy": 0.8827627599239349, "mean_token_accuracy": 0.9077499955892563, "num_tokens": 40788731.0, "sample_num_tokens": 8594.75, "step": 8652, "total_num_tokens": 40823110.0, "z_loss": 0.0004357295692898333 }, { "copy_logits_max": -1.0417933464050293, "copy_logits_min": -625000064.0, "copy_num_tokens": 397.625, "epoch": 1.7671176921113096, "gen_logits_max": 4.559295654296875, "gen_logits_mean": -15.066287994384766, "gen_logits_min": -28.412246704101562, "gen_logits_std": 3.2888994216918945, "gen_loss": 0.28145021200180054, "grad_norm": 0.37861431271703083, "learning_rate": 1.9999578947368422e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9960444569587708, "mean_gen_accuracy": 0.8775710910558701, "mean_token_accuracy": 0.9086267054080963, "num_tokens": 41081429.0, "sample_num_tokens": 7851.75, "step": 8653, "total_num_tokens": 41112836.0, "z_loss": 0.0004446934908628464 }, { "copy_logits_max": -0.42530763149261475, "copy_logits_min": -687500032.0, "copy_num_tokens": 451.6875, "epoch": 1.7673219300485066, "gen_logits_max": 4.3581085205078125, "gen_logits_mean": -15.68477725982666, "gen_logits_min": -28.335811614990234, "gen_logits_std": 3.3473763465881348, "gen_loss": 0.2692304849624634, "grad_norm": 0.3604378525980185, "learning_rate": 1.9998315789473683e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9968520998954773, "mean_gen_accuracy": 0.8755478709936142, "mean_token_accuracy": 0.9041159301996231, "num_tokens": 41343776.0, "sample_num_tokens": 8672.5, "step": 8654, "total_num_tokens": 41378466.0, "z_loss": 0.0004386363143566996 }, { "copy_logits_max": 2.716963768005371, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.25, "epoch": 1.7675261679857033, "gen_logits_max": 3.8168225288391113, "gen_logits_mean": -16.132831573486328, "gen_logits_min": -29.296985626220703, "gen_logits_std": 3.3260786533355713, "gen_loss": 0.27011972665786743, "grad_norm": 0.3757363561631016, "learning_rate": 1.9997052631578947e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9964506030082703, "mean_gen_accuracy": 0.8791621327400208, "mean_token_accuracy": 0.9084683209657669, "num_tokens": 41608384.0, "sample_num_tokens": 8273.0, "step": 8655, "total_num_tokens": 41641476.0, "z_loss": 0.0005134345265105367 }, { "copy_logits_max": -3.068546772003174, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.3125, "epoch": 1.7677304059229002, "gen_logits_max": 4.367166996002197, "gen_logits_mean": -16.59912872314453, "gen_logits_min": -28.95010757446289, "gen_logits_std": 3.3397250175476074, "gen_loss": 0.28344032168388367, "grad_norm": 0.34719273412686136, "learning_rate": 1.9995789473684208e-05, "loss": 0.277, "mean_copy_accuracy": 0.9974323660135269, "mean_gen_accuracy": 0.8727992475032806, "mean_token_accuracy": 0.9060389548540115, "num_tokens": 41906154.0, "sample_num_tokens": 7488.0, "step": 8656, "total_num_tokens": 41936106.0, "z_loss": 0.0004962623934261501 }, { "copy_logits_max": -1.3527764081954956, "copy_logits_min": -687500032.0, "copy_num_tokens": 621.4375, "epoch": 1.7679346438600971, "gen_logits_max": 3.630763292312622, "gen_logits_mean": -16.91521453857422, "gen_logits_min": -30.858964920043945, "gen_logits_std": 3.388639450073242, "gen_loss": 0.2423570305109024, "grad_norm": 0.33249700984450964, "learning_rate": 1.9994526315789476e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9967915266752243, "mean_gen_accuracy": 0.8832799047231674, "mean_token_accuracy": 0.911636084318161, "num_tokens": 42183525.0, "sample_num_tokens": 9625.75, "step": 8657, "total_num_tokens": 42222028.0, "z_loss": 0.0004657890822272748 }, { "copy_logits_max": -2.84720516204834, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5625, "epoch": 1.7681388817972938, "gen_logits_max": 2.894491195678711, "gen_logits_mean": -18.177471160888672, "gen_logits_min": -30.895275115966797, "gen_logits_std": 3.4352641105651855, "gen_loss": 0.28133782744407654, "grad_norm": 0.3474239740942142, "learning_rate": 1.999326315789474e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9976347535848618, "mean_gen_accuracy": 0.8791658580303192, "mean_token_accuracy": 0.910726398229599, "num_tokens": 42460143.0, "sample_num_tokens": 9401.25, "step": 8658, "total_num_tokens": 42497748.0, "z_loss": 0.0004645756271202117 }, { "copy_logits_max": -4.487854957580566, "copy_logits_min": -750000000.0, "copy_num_tokens": 259.25, "epoch": 1.7683431197344905, "gen_logits_max": 3.840879201889038, "gen_logits_mean": -17.937999725341797, "gen_logits_min": -30.730945587158203, "gen_logits_std": 3.410454511642456, "gen_loss": 0.2943372130393982, "grad_norm": 0.46482643708593785, "learning_rate": 1.9992e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9963223040103912, "mean_gen_accuracy": 0.8782082349061966, "mean_token_accuracy": 0.9048496633768082, "num_tokens": 42738180.0, "sample_num_tokens": 6741.0, "step": 8659, "total_num_tokens": 42765144.0, "z_loss": 0.0004772843385580927 }, { "copy_logits_max": -2.3016068935394287, "copy_logits_min": -687500032.0, "copy_num_tokens": 368.0625, "epoch": 1.7685473576716875, "gen_logits_max": 4.138309478759766, "gen_logits_mean": -16.411659240722656, "gen_logits_min": -29.413305282592773, "gen_logits_std": 3.3431949615478516, "gen_loss": 0.26057207584381104, "grad_norm": 0.35977355145186907, "learning_rate": 1.9990736842105265e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9971071928739548, "mean_gen_accuracy": 0.8821932077407837, "mean_token_accuracy": 0.9084676802158356, "num_tokens": 42996238.0, "sample_num_tokens": 8104.0, "step": 8660, "total_num_tokens": 43028654.0, "z_loss": 0.0004309244977775961 }, { "copy_logits_max": -1.012549877166748, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.1875, "epoch": 1.7687515956088844, "gen_logits_max": 4.208421230316162, "gen_logits_mean": -16.14263153076172, "gen_logits_min": -29.312641143798828, "gen_logits_std": 3.349113941192627, "gen_loss": 0.2823571562767029, "grad_norm": 0.3652639466510978, "learning_rate": 1.9989473684210526e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9976062476634979, "mean_gen_accuracy": 0.8772947937250137, "mean_token_accuracy": 0.9070868045091629, "num_tokens": 43263406.0, "sample_num_tokens": 7328.5, "step": 8661, "total_num_tokens": 43292720.0, "z_loss": 0.00044394918950274587 }, { "copy_logits_max": -5.809384346008301, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.6875, "epoch": 1.768955833546081, "gen_logits_max": 2.3370513916015625, "gen_logits_mean": -19.098276138305664, "gen_logits_min": -31.58501625061035, "gen_logits_std": 3.455277442932129, "gen_loss": 0.27628642320632935, "grad_norm": 0.3556988119345723, "learning_rate": 1.998821052631579e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9981765449047089, "mean_gen_accuracy": 0.8786024153232574, "mean_token_accuracy": 0.9061964750289917, "num_tokens": 43531586.0, "sample_num_tokens": 8165.5, "step": 8662, "total_num_tokens": 43564248.0, "z_loss": 0.0004733249661512673 }, { "copy_logits_max": -4.954972267150879, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.4375, "epoch": 1.769160071483278, "gen_logits_max": 2.1929240226745605, "gen_logits_mean": -17.89174461364746, "gen_logits_min": -31.16879653930664, "gen_logits_std": 3.3752026557922363, "gen_loss": 0.27064985036849976, "grad_norm": 0.3373342392596132, "learning_rate": 1.998694736842105e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9971392303705215, "mean_gen_accuracy": 0.8759728521108627, "mean_token_accuracy": 0.907293051481247, "num_tokens": 43810587.0, "sample_num_tokens": 8144.75, "step": 8663, "total_num_tokens": 43843166.0, "z_loss": 0.00043213475146330893 }, { "copy_logits_max": -5.416370868682861, "copy_logits_min": -750000000.0, "copy_num_tokens": 279.6875, "epoch": 1.769364309420475, "gen_logits_max": 3.8377833366394043, "gen_logits_mean": -17.904659271240234, "gen_logits_min": -30.49541473388672, "gen_logits_std": 3.3853230476379395, "gen_loss": 0.29667118191719055, "grad_norm": 0.35859474306247546, "learning_rate": 1.9985684210526316e-05, "loss": 0.2889, "mean_copy_accuracy": 0.9958657622337341, "mean_gen_accuracy": 0.8738092333078384, "mean_token_accuracy": 0.8997000902891159, "num_tokens": 44062073.0, "sample_num_tokens": 7398.25, "step": 8664, "total_num_tokens": 44091666.0, "z_loss": 0.00044940560474060476 }, { "copy_logits_max": -3.817232131958008, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.25, "epoch": 1.7695685473576717, "gen_logits_max": 3.578596591949463, "gen_logits_mean": -16.80463218688965, "gen_logits_min": -29.590290069580078, "gen_logits_std": 3.3623299598693848, "gen_loss": 0.2893827557563782, "grad_norm": 0.3490687642481259, "learning_rate": 1.998442105263158e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9970763325691223, "mean_gen_accuracy": 0.8781930208206177, "mean_token_accuracy": 0.9041256457567215, "num_tokens": 44330551.0, "sample_num_tokens": 9711.75, "step": 8665, "total_num_tokens": 44369398.0, "z_loss": 0.00046786508755758405 }, { "copy_logits_max": -4.830196857452393, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.0, "epoch": 1.7697727852948684, "gen_logits_max": 3.6381707191467285, "gen_logits_mean": -16.437406539916992, "gen_logits_min": -29.26144790649414, "gen_logits_std": 3.3944387435913086, "gen_loss": 0.25169509649276733, "grad_norm": 0.33896619223989083, "learning_rate": 1.9983157894736844e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9974713325500488, "mean_gen_accuracy": 0.8684190213680267, "mean_token_accuracy": 0.9042889326810837, "num_tokens": 44628587.0, "sample_num_tokens": 9010.25, "step": 8666, "total_num_tokens": 44664628.0, "z_loss": 0.00041610689368098974 }, { "copy_logits_max": -3.009281873703003, "copy_logits_min": -687500032.0, "copy_num_tokens": 509.1875, "epoch": 1.7699770232320655, "gen_logits_max": 3.9655230045318604, "gen_logits_mean": -16.17795753479004, "gen_logits_min": -29.02032470703125, "gen_logits_std": 3.39559268951416, "gen_loss": 0.20394942164421082, "grad_norm": 0.3480682839926006, "learning_rate": 1.9981894736842105e-05, "loss": 0.2507, "mean_copy_accuracy": 0.9973785877227783, "mean_gen_accuracy": 0.8892291635274887, "mean_token_accuracy": 0.9138292670249939, "num_tokens": 44895167.0, "sample_num_tokens": 8739.75, "step": 8667, "total_num_tokens": 44930126.0, "z_loss": 0.00034656579373404384 }, { "copy_logits_max": -6.937018394470215, "copy_logits_min": -625000064.0, "copy_num_tokens": 502.375, "epoch": 1.7701812611692622, "gen_logits_max": 3.7125229835510254, "gen_logits_mean": -16.80492401123047, "gen_logits_min": -29.573463439941406, "gen_logits_std": 3.35270357131958, "gen_loss": 0.260326623916626, "grad_norm": 0.31930890004131596, "learning_rate": 1.998063157894737e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9976062327623367, "mean_gen_accuracy": 0.8750860393047333, "mean_token_accuracy": 0.9058156460523605, "num_tokens": 45155617.0, "sample_num_tokens": 8749.75, "step": 8668, "total_num_tokens": 45190616.0, "z_loss": 0.00041240680729970336 }, { "copy_logits_max": -6.571041107177734, "copy_logits_min": -687500032.0, "copy_num_tokens": 322.25, "epoch": 1.770385499106459, "gen_logits_max": 3.731034278869629, "gen_logits_mean": -17.702682495117188, "gen_logits_min": -31.00078773498535, "gen_logits_std": 3.419260025024414, "gen_loss": 0.2664591073989868, "grad_norm": 0.3422492429606292, "learning_rate": 1.997936842105263e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9968208521604538, "mean_gen_accuracy": 0.8759686052799225, "mean_token_accuracy": 0.908038392663002, "num_tokens": 45429994.0, "sample_num_tokens": 6935.5, "step": 8669, "total_num_tokens": 45457736.0, "z_loss": 0.0004349381197243929 }, { "copy_logits_max": -5.79604434967041, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.4375, "epoch": 1.7705897370436559, "gen_logits_max": 4.804739952087402, "gen_logits_mean": -16.45625114440918, "gen_logits_min": -29.29895782470703, "gen_logits_std": 3.35467529296875, "gen_loss": 0.30945536494255066, "grad_norm": 0.36351678509455526, "learning_rate": 1.9978105263157895e-05, "loss": 0.294, "mean_copy_accuracy": 0.9970128834247589, "mean_gen_accuracy": 0.8744656443595886, "mean_token_accuracy": 0.9021463990211487, "num_tokens": 45680090.0, "sample_num_tokens": 8489.0, "step": 8670, "total_num_tokens": 45714046.0, "z_loss": 0.00048158440040424466 }, { "copy_logits_max": -5.10542106628418, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.9375, "epoch": 1.7707939749808528, "gen_logits_max": 4.052211761474609, "gen_logits_mean": -16.77191925048828, "gen_logits_min": -28.997478485107422, "gen_logits_std": 3.359243392944336, "gen_loss": 0.30477768182754517, "grad_norm": 0.3701223713670213, "learning_rate": 1.997684210526316e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9965027868747711, "mean_gen_accuracy": 0.8810071647167206, "mean_token_accuracy": 0.9073962569236755, "num_tokens": 45940675.0, "sample_num_tokens": 8099.25, "step": 8671, "total_num_tokens": 45973072.0, "z_loss": 0.0004588076553773135 }, { "copy_logits_max": -5.084147930145264, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.0625, "epoch": 1.7709982129180495, "gen_logits_max": 3.7785110473632812, "gen_logits_mean": -17.175214767456055, "gen_logits_min": -29.421314239501953, "gen_logits_std": 3.3818752765655518, "gen_loss": 0.2707662582397461, "grad_norm": 0.34286861397134, "learning_rate": 1.997557894736842e-05, "loss": 0.2585, "mean_copy_accuracy": 0.9960935115814209, "mean_gen_accuracy": 0.8877518624067307, "mean_token_accuracy": 0.9122398644685745, "num_tokens": 46210997.0, "sample_num_tokens": 8824.75, "step": 8672, "total_num_tokens": 46246296.0, "z_loss": 0.0004056571633554995 }, { "copy_logits_max": -0.5607945322990417, "copy_logits_min": -750000064.0, "copy_num_tokens": 859.9375, "epoch": 1.7712024508552464, "gen_logits_max": 2.238827705383301, "gen_logits_mean": -17.124744415283203, "gen_logits_min": -31.101741790771484, "gen_logits_std": 3.406761646270752, "gen_loss": 0.21667814254760742, "grad_norm": 0.35252900644856683, "learning_rate": 1.9974315789473684e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9975258260965347, "mean_gen_accuracy": 0.8791622072458267, "mean_token_accuracy": 0.9106256663799286, "num_tokens": 46487088.0, "sample_num_tokens": 10253.0, "step": 8673, "total_num_tokens": 46528100.0, "z_loss": 0.000343575346050784 }, { "copy_logits_max": -2.6665866374969482, "copy_logits_min": -750000064.0, "copy_num_tokens": 308.0625, "epoch": 1.7714066887924433, "gen_logits_max": 4.093965530395508, "gen_logits_mean": -16.644638061523438, "gen_logits_min": -29.46735954284668, "gen_logits_std": 3.3513238430023193, "gen_loss": 0.32046741247177124, "grad_norm": 0.38791222028114025, "learning_rate": 1.997305263157895e-05, "loss": 0.29, "mean_copy_accuracy": 0.996761217713356, "mean_gen_accuracy": 0.8706685602664948, "mean_token_accuracy": 0.9010173231363297, "num_tokens": 46760379.0, "sample_num_tokens": 7166.75, "step": 8674, "total_num_tokens": 46789046.0, "z_loss": 0.0004791005630977452 }, { "copy_logits_max": -4.141955852508545, "copy_logits_min": -750000128.0, "copy_num_tokens": 430.0625, "epoch": 1.77161092672964, "gen_logits_max": 2.401510000228882, "gen_logits_mean": -18.70631980895996, "gen_logits_min": -31.514802932739258, "gen_logits_std": 3.4532594680786133, "gen_loss": 0.25175994634628296, "grad_norm": 0.38795851758070327, "learning_rate": 1.9971789473684213e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9961618036031723, "mean_gen_accuracy": 0.8770907372236252, "mean_token_accuracy": 0.903672069311142, "num_tokens": 47019807.0, "sample_num_tokens": 8184.75, "step": 8675, "total_num_tokens": 47052546.0, "z_loss": 0.0003438744170125574 }, { "copy_logits_max": -0.4682575464248657, "copy_logits_min": -687500032.0, "copy_num_tokens": 392.9375, "epoch": 1.7718151646668368, "gen_logits_max": 2.98880672454834, "gen_logits_mean": -17.45991325378418, "gen_logits_min": -29.96327018737793, "gen_logits_std": 3.3846592903137207, "gen_loss": 0.2659083306789398, "grad_norm": 0.37157699707335395, "learning_rate": 1.9970526315789474e-05, "loss": 0.269, "mean_copy_accuracy": 0.9978043287992477, "mean_gen_accuracy": 0.8766591101884842, "mean_token_accuracy": 0.907874345779419, "num_tokens": 47275041.0, "sample_num_tokens": 7175.75, "step": 8676, "total_num_tokens": 47303744.0, "z_loss": 0.0003546334046404809 }, { "copy_logits_max": -0.46750882267951965, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.0625, "epoch": 1.7720194026040337, "gen_logits_max": 3.108260154724121, "gen_logits_mean": -17.062175750732422, "gen_logits_min": -30.19314956665039, "gen_logits_std": 3.391934871673584, "gen_loss": 0.27027127146720886, "grad_norm": 0.3513016671082264, "learning_rate": 1.996926315789474e-05, "loss": 0.263, "mean_copy_accuracy": 0.9976983815431595, "mean_gen_accuracy": 0.8808494210243225, "mean_token_accuracy": 0.9102877527475357, "num_tokens": 47528286.0, "sample_num_tokens": 8269.5, "step": 8677, "total_num_tokens": 47561364.0, "z_loss": 0.0004103320825379342 }, { "copy_logits_max": -0.862697422504425, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.4375, "epoch": 1.7722236405412306, "gen_logits_max": 3.8748745918273926, "gen_logits_mean": -16.236373901367188, "gen_logits_min": -29.59034538269043, "gen_logits_std": 3.344175100326538, "gen_loss": 0.2712555229663849, "grad_norm": 0.3543865541310096, "learning_rate": 1.9968e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9977372288703918, "mean_gen_accuracy": 0.8789264857769012, "mean_token_accuracy": 0.9063078314065933, "num_tokens": 47793306.0, "sample_num_tokens": 8414.5, "step": 8678, "total_num_tokens": 47826964.0, "z_loss": 0.0003903857432305813 }, { "copy_logits_max": -4.737139701843262, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.0625, "epoch": 1.7724278784784273, "gen_logits_max": 3.2521090507507324, "gen_logits_mean": -18.830785751342773, "gen_logits_min": -31.018577575683594, "gen_logits_std": 3.458514928817749, "gen_loss": 0.2851085066795349, "grad_norm": 0.3636725566396698, "learning_rate": 1.9966736842105264e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9979719519615173, "mean_gen_accuracy": 0.879631295800209, "mean_token_accuracy": 0.9104129672050476, "num_tokens": 48082795.0, "sample_num_tokens": 9091.25, "step": 8679, "total_num_tokens": 48119160.0, "z_loss": 0.000413810892496258 }, { "copy_logits_max": -8.020689010620117, "copy_logits_min": -750000000.0, "copy_num_tokens": 259.9375, "epoch": 1.7726321164156242, "gen_logits_max": 4.257205963134766, "gen_logits_mean": -17.090456008911133, "gen_logits_min": -28.79197883605957, "gen_logits_std": 3.3251659870147705, "gen_loss": 0.3070428967475891, "grad_norm": 0.3679314402588224, "learning_rate": 1.9965473684210524e-05, "loss": 0.2841, "mean_copy_accuracy": 0.9966650605201721, "mean_gen_accuracy": 0.878589540719986, "mean_token_accuracy": 0.9050826132297516, "num_tokens": 48357943.0, "sample_num_tokens": 7558.75, "step": 8680, "total_num_tokens": 48388178.0, "z_loss": 0.0004021706699859351 }, { "copy_logits_max": -4.751463890075684, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.8125, "epoch": 1.7728363543528212, "gen_logits_max": 3.709465265274048, "gen_logits_mean": -16.054244995117188, "gen_logits_min": -28.152416229248047, "gen_logits_std": 3.315607786178589, "gen_loss": 0.2576807141304016, "grad_norm": 0.3783541980691607, "learning_rate": 1.996421052631579e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9970906376838684, "mean_gen_accuracy": 0.8755949586629868, "mean_token_accuracy": 0.9039585739374161, "num_tokens": 48629131.0, "sample_num_tokens": 9899.25, "step": 8681, "total_num_tokens": 48668728.0, "z_loss": 0.00038836547173559666 }, { "copy_logits_max": -2.5264134407043457, "copy_logits_min": -687500032.0, "copy_num_tokens": 388.0625, "epoch": 1.7730405922900179, "gen_logits_max": 3.665480375289917, "gen_logits_mean": -17.874292373657227, "gen_logits_min": -30.544776916503906, "gen_logits_std": 3.4148943424224854, "gen_loss": 0.28819042444229126, "grad_norm": 0.3634257703427265, "learning_rate": 1.9962947368421053e-05, "loss": 0.265, "mean_copy_accuracy": 0.9965070337057114, "mean_gen_accuracy": 0.8803598880767822, "mean_token_accuracy": 0.9099474847316742, "num_tokens": 48906713.0, "sample_num_tokens": 8065.25, "step": 8682, "total_num_tokens": 48938974.0, "z_loss": 0.00046015792759135365 }, { "copy_logits_max": -4.387387752532959, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.4375, "epoch": 1.7732448302272146, "gen_logits_max": 3.5693395137786865, "gen_logits_mean": -16.936065673828125, "gen_logits_min": -29.319080352783203, "gen_logits_std": 3.3511009216308594, "gen_loss": 0.27307915687561035, "grad_norm": 0.3684726652913229, "learning_rate": 1.9961684210526317e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9961262047290802, "mean_gen_accuracy": 0.8801979571580887, "mean_token_accuracy": 0.9072312116622925, "num_tokens": 49167172.0, "sample_num_tokens": 7385.0, "step": 8683, "total_num_tokens": 49196712.0, "z_loss": 0.00044021251960657537 }, { "copy_logits_max": -3.9414312839508057, "copy_logits_min": -750000000.0, "copy_num_tokens": 481.125, "epoch": 1.7734490681644115, "gen_logits_max": 3.731572389602661, "gen_logits_mean": -17.4810848236084, "gen_logits_min": -29.958166122436523, "gen_logits_std": 3.4010396003723145, "gen_loss": 0.2789575159549713, "grad_norm": 0.3465938460422022, "learning_rate": 1.996042105263158e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9972805380821228, "mean_gen_accuracy": 0.8800818622112274, "mean_token_accuracy": 0.9064085930585861, "num_tokens": 49442180.0, "sample_num_tokens": 9249.0, "step": 8684, "total_num_tokens": 49479176.0, "z_loss": 0.00044385026558302343 }, { "copy_logits_max": -3.993157148361206, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.125, "epoch": 1.7736533061016084, "gen_logits_max": 4.253763675689697, "gen_logits_mean": -15.43819522857666, "gen_logits_min": -28.135454177856445, "gen_logits_std": 3.310788154602051, "gen_loss": 0.29097169637680054, "grad_norm": 0.39459037416880527, "learning_rate": 1.9959157894736843e-05, "loss": 0.2837, "mean_copy_accuracy": 0.997153252363205, "mean_gen_accuracy": 0.8756039440631866, "mean_token_accuracy": 0.9032006114721298, "num_tokens": 49702084.0, "sample_num_tokens": 8191.5, "step": 8685, "total_num_tokens": 49734850.0, "z_loss": 0.00046156064490787685 }, { "copy_logits_max": -2.8749594688415527, "copy_logits_min": -687500032.0, "copy_num_tokens": 526.0625, "epoch": 1.7738575440388051, "gen_logits_max": 2.9747867584228516, "gen_logits_mean": -17.507732391357422, "gen_logits_min": -30.152259826660156, "gen_logits_std": 3.413534641265869, "gen_loss": 0.2892886996269226, "grad_norm": 0.36131874912619816, "learning_rate": 1.9957894736842107e-05, "loss": 0.269, "mean_copy_accuracy": 0.9967849403619766, "mean_gen_accuracy": 0.8790348172187805, "mean_token_accuracy": 0.9089133441448212, "num_tokens": 49954635.0, "sample_num_tokens": 8892.75, "step": 8686, "total_num_tokens": 49990206.0, "z_loss": 0.0004606874717865139 }, { "copy_logits_max": -4.920006275177002, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.3125, "epoch": 1.774061781976002, "gen_logits_max": 3.373465061187744, "gen_logits_mean": -17.367984771728516, "gen_logits_min": -29.971845626831055, "gen_logits_std": 3.415060520172119, "gen_loss": 0.2745964527130127, "grad_norm": 0.3231872643893489, "learning_rate": 1.9956631578947368e-05, "loss": 0.272, "mean_copy_accuracy": 0.9972926080226898, "mean_gen_accuracy": 0.8798446506261826, "mean_token_accuracy": 0.9077616184949875, "num_tokens": 50233061.0, "sample_num_tokens": 8443.25, "step": 8687, "total_num_tokens": 50266834.0, "z_loss": 0.000424711499363184 }, { "copy_logits_max": -6.76387882232666, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.1875, "epoch": 1.774266019913199, "gen_logits_max": 2.7792160511016846, "gen_logits_mean": -19.06460189819336, "gen_logits_min": -31.31470489501953, "gen_logits_std": 3.434950113296509, "gen_loss": 0.2945944666862488, "grad_norm": 0.38809362122841945, "learning_rate": 1.9955368421052632e-05, "loss": 0.2924, "mean_copy_accuracy": 0.9951895475387573, "mean_gen_accuracy": 0.8725030571222305, "mean_token_accuracy": 0.9002241790294647, "num_tokens": 50491651.0, "sample_num_tokens": 8027.25, "step": 8688, "total_num_tokens": 50523760.0, "z_loss": 0.0004367585643194616 }, { "copy_logits_max": -3.8764379024505615, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.5, "epoch": 1.7744702578503957, "gen_logits_max": 3.339470863342285, "gen_logits_mean": -17.113021850585938, "gen_logits_min": -29.789003372192383, "gen_logits_std": 3.38102126121521, "gen_loss": 0.2967761158943176, "grad_norm": 0.3825569128612028, "learning_rate": 1.9954105263157893e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9970879852771759, "mean_gen_accuracy": 0.8697213679552078, "mean_token_accuracy": 0.9033139497041702, "num_tokens": 50775234.0, "sample_num_tokens": 7489.5, "step": 8689, "total_num_tokens": 50805192.0, "z_loss": 0.0004266466130502522 }, { "copy_logits_max": -2.235182285308838, "copy_logits_min": -687500032.0, "copy_num_tokens": 683.875, "epoch": 1.7746744957875924, "gen_logits_max": 2.1167385578155518, "gen_logits_mean": -18.51351547241211, "gen_logits_min": -31.28707504272461, "gen_logits_std": 3.4619100093841553, "gen_loss": 0.2831841707229614, "grad_norm": 0.3428452290402912, "learning_rate": 1.995284210526316e-05, "loss": 0.28, "mean_copy_accuracy": 0.9981623142957687, "mean_gen_accuracy": 0.8665732443332672, "mean_token_accuracy": 0.9058191478252411, "num_tokens": 51068371.0, "sample_num_tokens": 10229.25, "step": 8690, "total_num_tokens": 51109288.0, "z_loss": 0.00044474698370322585 }, { "copy_logits_max": -2.2362074851989746, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.4375, "epoch": 1.7748787337247893, "gen_logits_max": 5.063750743865967, "gen_logits_mean": -15.054882049560547, "gen_logits_min": -27.444406509399414, "gen_logits_std": 3.2843146324157715, "gen_loss": 0.30252766609191895, "grad_norm": 0.36478806585091067, "learning_rate": 1.9951578947368422e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9973950833082199, "mean_gen_accuracy": 0.8722133785486221, "mean_token_accuracy": 0.9001733511686325, "num_tokens": 51339100.0, "sample_num_tokens": 8215.0, "step": 8691, "total_num_tokens": 51371960.0, "z_loss": 0.0005030020838603377 }, { "copy_logits_max": -1.9645769596099854, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.875, "epoch": 1.7750829716619863, "gen_logits_max": 3.3010194301605225, "gen_logits_mean": -17.40685272216797, "gen_logits_min": -30.01763343811035, "gen_logits_std": 3.4250364303588867, "gen_loss": 0.2829180955886841, "grad_norm": 0.43599860978393956, "learning_rate": 1.9950315789473686e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9969179481267929, "mean_gen_accuracy": 0.8783342689275742, "mean_token_accuracy": 0.9097221046686172, "num_tokens": 51616686.0, "sample_num_tokens": 7616.0, "step": 8692, "total_num_tokens": 51647150.0, "z_loss": 0.0005123313167132437 }, { "copy_logits_max": -4.0323381423950195, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.0625, "epoch": 1.775287209599183, "gen_logits_max": 4.511037826538086, "gen_logits_mean": -16.681930541992188, "gen_logits_min": -29.554851531982422, "gen_logits_std": 3.375028371810913, "gen_loss": 0.29213735461235046, "grad_norm": 0.36817920352735956, "learning_rate": 1.9949052631578947e-05, "loss": 0.2622, "mean_copy_accuracy": 0.9966196119785309, "mean_gen_accuracy": 0.8849828243255615, "mean_token_accuracy": 0.9096235483884811, "num_tokens": 51880498.0, "sample_num_tokens": 6896.0, "step": 8693, "total_num_tokens": 51908082.0, "z_loss": 0.0005539905978366733 }, { "copy_logits_max": -1.968521237373352, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.4375, "epoch": 1.7754914475363799, "gen_logits_max": 4.797386169433594, "gen_logits_mean": -15.810972213745117, "gen_logits_min": -28.435134887695312, "gen_logits_std": 3.3206162452697754, "gen_loss": 0.3255476951599121, "grad_norm": 0.3762956722109772, "learning_rate": 1.994778947368421e-05, "loss": 0.2849, "mean_copy_accuracy": 0.9955418258905411, "mean_gen_accuracy": 0.8725262731313705, "mean_token_accuracy": 0.9009048342704773, "num_tokens": 52132328.0, "sample_num_tokens": 8237.0, "step": 8694, "total_num_tokens": 52165276.0, "z_loss": 0.0006387523608282208 }, { "copy_logits_max": 1.0984935760498047, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.125, "epoch": 1.7756956854735768, "gen_logits_max": 4.640644073486328, "gen_logits_mean": -14.318675994873047, "gen_logits_min": -27.12592315673828, "gen_logits_std": 3.2713334560394287, "gen_loss": 0.2545109987258911, "grad_norm": 0.3490953439243522, "learning_rate": 1.9946526315789472e-05, "loss": 0.278, "mean_copy_accuracy": 0.9967378824949265, "mean_gen_accuracy": 0.8750224411487579, "mean_token_accuracy": 0.9064632058143616, "num_tokens": 52408095.0, "sample_num_tokens": 8344.75, "step": 8695, "total_num_tokens": 52441474.0, "z_loss": 0.00044570869067683816 }, { "copy_logits_max": -1.360706090927124, "copy_logits_min": -687500032.0, "copy_num_tokens": 600.6875, "epoch": 1.7758999234107735, "gen_logits_max": 2.8976101875305176, "gen_logits_mean": -17.580455780029297, "gen_logits_min": -29.880647659301758, "gen_logits_std": 3.389169692993164, "gen_loss": 0.2692505121231079, "grad_norm": 0.34468658650541, "learning_rate": 1.9945263157894737e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9976745694875717, "mean_gen_accuracy": 0.8772046118974686, "mean_token_accuracy": 0.9090456366539001, "num_tokens": 52677504.0, "sample_num_tokens": 9582.0, "step": 8696, "total_num_tokens": 52715832.0, "z_loss": 0.0004903864464722574 }, { "copy_logits_max": -0.2776637673377991, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.5, "epoch": 1.7761041613479702, "gen_logits_max": 4.89440393447876, "gen_logits_mean": -14.540204048156738, "gen_logits_min": -27.130287170410156, "gen_logits_std": 3.2647008895874023, "gen_loss": 0.263700932264328, "grad_norm": 0.3541954450582853, "learning_rate": 1.9943999999999997e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9968292564153671, "mean_gen_accuracy": 0.8799116313457489, "mean_token_accuracy": 0.9035225659608841, "num_tokens": 52930813.0, "sample_num_tokens": 7277.25, "step": 8697, "total_num_tokens": 52959922.0, "z_loss": 0.00048382897512055933 }, { "copy_logits_max": 0.3047465980052948, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.3125, "epoch": 1.7763083992851674, "gen_logits_max": 3.7640862464904785, "gen_logits_mean": -16.383689880371094, "gen_logits_min": -29.057655334472656, "gen_logits_std": 3.3914012908935547, "gen_loss": 0.25828397274017334, "grad_norm": 0.3392226704689528, "learning_rate": 1.9942736842105265e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9968892484903336, "mean_gen_accuracy": 0.878187507390976, "mean_token_accuracy": 0.9089542776346207, "num_tokens": 53191298.0, "sample_num_tokens": 9078.0, "step": 8698, "total_num_tokens": 53227610.0, "z_loss": 0.00046690041199326515 }, { "copy_logits_max": -4.794038772583008, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.625, "epoch": 1.776512637222364, "gen_logits_max": 2.814241647720337, "gen_logits_mean": -19.552566528320312, "gen_logits_min": -31.4511661529541, "gen_logits_std": 3.4749786853790283, "gen_loss": 0.27103960514068604, "grad_norm": 0.3434269129871352, "learning_rate": 1.994147368421053e-05, "loss": 0.2562, "mean_copy_accuracy": 0.9977861642837524, "mean_gen_accuracy": 0.8824013471603394, "mean_token_accuracy": 0.9137276858091354, "num_tokens": 53485155.0, "sample_num_tokens": 7991.25, "step": 8699, "total_num_tokens": 53517120.0, "z_loss": 0.00047458260087296367 }, { "copy_logits_max": -2.690253734588623, "copy_logits_min": -750000064.0, "copy_num_tokens": 445.25, "epoch": 1.7767168751595608, "gen_logits_max": 4.043954849243164, "gen_logits_mean": -16.81443977355957, "gen_logits_min": -28.84309959411621, "gen_logits_std": 3.3713338375091553, "gen_loss": 0.26520660519599915, "grad_norm": 0.3381123663302531, "learning_rate": 1.994021052631579e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9972820430994034, "mean_gen_accuracy": 0.8729757368564606, "mean_token_accuracy": 0.9000924676656723, "num_tokens": 53760280.0, "sample_num_tokens": 8802.5, "step": 8700, "total_num_tokens": 53795490.0, "z_loss": 0.0004449745756573975 }, { "copy_logits_max": -2.470618963241577, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.0, "epoch": 1.7769211130967577, "gen_logits_max": 4.155500411987305, "gen_logits_mean": -17.176467895507812, "gen_logits_min": -29.1043701171875, "gen_logits_std": 3.362429141998291, "gen_loss": 0.28969454765319824, "grad_norm": 0.33861650563335094, "learning_rate": 1.9938947368421055e-05, "loss": 0.2705, "mean_copy_accuracy": 0.996976688504219, "mean_gen_accuracy": 0.8789209872484207, "mean_token_accuracy": 0.9068986773490906, "num_tokens": 54036855.0, "sample_num_tokens": 7349.75, "step": 8701, "total_num_tokens": 54066254.0, "z_loss": 0.0004952540621161461 }, { "copy_logits_max": -0.7669718861579895, "copy_logits_min": -750000000.0, "copy_num_tokens": 700.5, "epoch": 1.7771253510339546, "gen_logits_max": 2.9020566940307617, "gen_logits_mean": -17.220380783081055, "gen_logits_min": -29.458234786987305, "gen_logits_std": 3.4006805419921875, "gen_loss": 0.2727166712284088, "grad_norm": 0.31864006890355745, "learning_rate": 1.9937684210526316e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9976758062839508, "mean_gen_accuracy": 0.8766469210386276, "mean_token_accuracy": 0.9081427752971649, "num_tokens": 54310551.0, "sample_num_tokens": 9878.75, "step": 8702, "total_num_tokens": 54350066.0, "z_loss": 0.0004855263978242874 }, { "copy_logits_max": -2.129490852355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.875, "epoch": 1.7773295889711513, "gen_logits_max": 3.3616936206817627, "gen_logits_mean": -16.92101287841797, "gen_logits_min": -29.209766387939453, "gen_logits_std": 3.3572163581848145, "gen_loss": 0.2637428343296051, "grad_norm": 0.3382246161677861, "learning_rate": 1.993642105263158e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9974143803119659, "mean_gen_accuracy": 0.8769654631614685, "mean_token_accuracy": 0.9032801985740662, "num_tokens": 54579476.0, "sample_num_tokens": 8514.0, "step": 8703, "total_num_tokens": 54613532.0, "z_loss": 0.0004909370909444988 }, { "copy_logits_max": -0.34141677618026733, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.625, "epoch": 1.7775338269083483, "gen_logits_max": 3.8574469089508057, "gen_logits_mean": -16.093603134155273, "gen_logits_min": -29.010974884033203, "gen_logits_std": 3.334932327270508, "gen_loss": 0.26854121685028076, "grad_norm": 0.3470508049719836, "learning_rate": 1.993515789473684e-05, "loss": 0.2689, "mean_copy_accuracy": 0.9973490536212921, "mean_gen_accuracy": 0.8753503859043121, "mean_token_accuracy": 0.9077902734279633, "num_tokens": 54851300.0, "sample_num_tokens": 9335.5, "step": 8704, "total_num_tokens": 54888642.0, "z_loss": 0.00047041208017617464 }, { "copy_logits_max": -2.4683754444122314, "copy_logits_min": -750000128.0, "copy_num_tokens": 458.5, "epoch": 1.7777380648455452, "gen_logits_max": 4.1362481117248535, "gen_logits_mean": -16.789512634277344, "gen_logits_min": -29.261112213134766, "gen_logits_std": 3.3629703521728516, "gen_loss": 0.30511075258255005, "grad_norm": 0.37283696575335795, "learning_rate": 1.9933894736842105e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9969512969255447, "mean_gen_accuracy": 0.8669601231813431, "mean_token_accuracy": 0.8970940858125687, "num_tokens": 55122315.0, "sample_num_tokens": 9399.25, "step": 8705, "total_num_tokens": 55159912.0, "z_loss": 0.0004901990760117769 }, { "copy_logits_max": -3.9357471466064453, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.3125, "epoch": 1.777942302782742, "gen_logits_max": 3.6903252601623535, "gen_logits_mean": -17.109426498413086, "gen_logits_min": -29.386268615722656, "gen_logits_std": 3.4137282371520996, "gen_loss": 0.26085400581359863, "grad_norm": 0.36033529947447174, "learning_rate": 1.993263157894737e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9961771070957184, "mean_gen_accuracy": 0.876296877861023, "mean_token_accuracy": 0.9022971093654633, "num_tokens": 55387460.0, "sample_num_tokens": 8399.5, "step": 8706, "total_num_tokens": 55421058.0, "z_loss": 0.00043855042895302176 }, { "copy_logits_max": -4.603211402893066, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.875, "epoch": 1.7781465407199386, "gen_logits_max": 3.628662586212158, "gen_logits_mean": -16.784202575683594, "gen_logits_min": -28.90481948852539, "gen_logits_std": 3.3768951892852783, "gen_loss": 0.2703205943107605, "grad_norm": 0.3490296264628749, "learning_rate": 1.9931368421052634e-05, "loss": 0.283, "mean_copy_accuracy": 0.9963981658220291, "mean_gen_accuracy": 0.8752519488334656, "mean_token_accuracy": 0.9032618850469589, "num_tokens": 55658788.0, "sample_num_tokens": 8127.5, "step": 8707, "total_num_tokens": 55691298.0, "z_loss": 0.00046855327673256397 }, { "copy_logits_max": -3.313365936279297, "copy_logits_min": -750000128.0, "copy_num_tokens": 446.3125, "epoch": 1.7783507786571355, "gen_logits_max": 4.859076976776123, "gen_logits_mean": -15.445694923400879, "gen_logits_min": -27.704490661621094, "gen_logits_std": 3.332401990890503, "gen_loss": 0.3063565790653229, "grad_norm": 0.4203689306219258, "learning_rate": 1.9930105263157895e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9966227263212204, "mean_gen_accuracy": 0.876163437962532, "mean_token_accuracy": 0.9074209779500961, "num_tokens": 55933478.0, "sample_num_tokens": 9064.5, "step": 8708, "total_num_tokens": 55969736.0, "z_loss": 0.0005309507250785828 }, { "copy_logits_max": -4.321908950805664, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.0, "epoch": 1.7785550165943325, "gen_logits_max": 3.4257872104644775, "gen_logits_mean": -16.973722457885742, "gen_logits_min": -29.287492752075195, "gen_logits_std": 3.414271593093872, "gen_loss": 0.2620622515678406, "grad_norm": 0.33582829818862137, "learning_rate": 1.992884210526316e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9976812303066254, "mean_gen_accuracy": 0.8756963461637497, "mean_token_accuracy": 0.9072403013706207, "num_tokens": 56215762.0, "sample_num_tokens": 8905.0, "step": 8709, "total_num_tokens": 56251382.0, "z_loss": 0.0004484136588871479 }, { "copy_logits_max": -3.2467076778411865, "copy_logits_min": -750000064.0, "copy_num_tokens": 587.625, "epoch": 1.7787592545315292, "gen_logits_max": 3.7824463844299316, "gen_logits_mean": -16.371028900146484, "gen_logits_min": -28.872467041015625, "gen_logits_std": 3.387951374053955, "gen_loss": 0.2674984633922577, "grad_norm": 0.34113856920310276, "learning_rate": 1.992757894736842e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9970908164978027, "mean_gen_accuracy": 0.8772117793560028, "mean_token_accuracy": 0.9067706316709518, "num_tokens": 56487659.0, "sample_num_tokens": 9666.25, "step": 8710, "total_num_tokens": 56526324.0, "z_loss": 0.00044034497113898396 }, { "copy_logits_max": -1.1878385543823242, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.5625, "epoch": 1.778963492468726, "gen_logits_max": 5.215202331542969, "gen_logits_mean": -14.232255935668945, "gen_logits_min": -26.650299072265625, "gen_logits_std": 3.2956433296203613, "gen_loss": 0.2732775807380676, "grad_norm": 0.3370843771818439, "learning_rate": 1.9926315789473684e-05, "loss": 0.27, "mean_copy_accuracy": 0.9972812235355377, "mean_gen_accuracy": 0.8796114921569824, "mean_token_accuracy": 0.9077192097902298, "num_tokens": 56766387.0, "sample_num_tokens": 8681.25, "step": 8711, "total_num_tokens": 56801112.0, "z_loss": 0.00045373683678917587 }, { "copy_logits_max": -3.245206594467163, "copy_logits_min": -687500032.0, "copy_num_tokens": 573.0625, "epoch": 1.779167730405923, "gen_logits_max": 3.5596065521240234, "gen_logits_mean": -16.776880264282227, "gen_logits_min": -29.518627166748047, "gen_logits_std": 3.384070873260498, "gen_loss": 0.2794722020626068, "grad_norm": 0.3402963162971205, "learning_rate": 1.992505263157895e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9967862814664841, "mean_gen_accuracy": 0.8836532831192017, "mean_token_accuracy": 0.9127603769302368, "num_tokens": 57034767.0, "sample_num_tokens": 9426.75, "step": 8712, "total_num_tokens": 57072474.0, "z_loss": 0.00044859794434159994 }, { "copy_logits_max": -7.785252094268799, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.375, "epoch": 1.7793719683431197, "gen_logits_max": 3.524850845336914, "gen_logits_mean": -18.810775756835938, "gen_logits_min": -30.62215805053711, "gen_logits_std": 3.4500808715820312, "gen_loss": 0.29424405097961426, "grad_norm": 0.359948297701267, "learning_rate": 1.992378947368421e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9952299892902374, "mean_gen_accuracy": 0.8771660178899765, "mean_token_accuracy": 0.9054322689771652, "num_tokens": 57297130.0, "sample_num_tokens": 7599.5, "step": 8713, "total_num_tokens": 57327528.0, "z_loss": 0.0004392286064103246 }, { "copy_logits_max": -3.4714691638946533, "copy_logits_min": -750000000.0, "copy_num_tokens": 736.5625, "epoch": 1.7795762062803164, "gen_logits_max": 1.6274043321609497, "gen_logits_mean": -17.732805252075195, "gen_logits_min": -30.063995361328125, "gen_logits_std": 3.4487111568450928, "gen_loss": 0.2236499935388565, "grad_norm": 0.328511394183546, "learning_rate": 1.9922526315789477e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9966872036457062, "mean_gen_accuracy": 0.8774414509534836, "mean_token_accuracy": 0.9060066491365433, "num_tokens": 57569804.0, "sample_num_tokens": 9384.0, "step": 8714, "total_num_tokens": 57607340.0, "z_loss": 0.0003502846520859748 }, { "copy_logits_max": -2.997406244277954, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.1875, "epoch": 1.7797804442175134, "gen_logits_max": 2.584209680557251, "gen_logits_mean": -17.15500259399414, "gen_logits_min": -29.58152961730957, "gen_logits_std": 3.413013458251953, "gen_loss": 0.25027918815612793, "grad_norm": 0.35592853386349227, "learning_rate": 1.9921263157894738e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9968435168266296, "mean_gen_accuracy": 0.8777577430009842, "mean_token_accuracy": 0.9064754694700241, "num_tokens": 57854364.0, "sample_num_tokens": 8171.0, "step": 8715, "total_num_tokens": 57887048.0, "z_loss": 0.00040751483174972236 }, { "copy_logits_max": -3.752206802368164, "copy_logits_min": -687500032.0, "copy_num_tokens": 383.0, "epoch": 1.7799846821547103, "gen_logits_max": 3.2534799575805664, "gen_logits_mean": -16.62334632873535, "gen_logits_min": -28.78437614440918, "gen_logits_std": 3.3721256256103516, "gen_loss": 0.31643328070640564, "grad_norm": 0.37055052763015256, "learning_rate": 1.9920000000000002e-05, "loss": 0.2978, "mean_copy_accuracy": 0.9969190210103989, "mean_gen_accuracy": 0.8723967373371124, "mean_token_accuracy": 0.8989022225141525, "num_tokens": 58122456.0, "sample_num_tokens": 8109.0, "step": 8716, "total_num_tokens": 58154892.0, "z_loss": 0.0004898309707641602 }, { "copy_logits_max": -2.0756967067718506, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.1875, "epoch": 1.780188920091907, "gen_logits_max": 2.5578737258911133, "gen_logits_mean": -17.703712463378906, "gen_logits_min": -30.11907196044922, "gen_logits_std": 3.4249143600463867, "gen_loss": 0.26983556151390076, "grad_norm": 0.32205576884826553, "learning_rate": 1.9918736842105263e-05, "loss": 0.2509, "mean_copy_accuracy": 0.9971016049385071, "mean_gen_accuracy": 0.8857575058937073, "mean_token_accuracy": 0.9155154824256897, "num_tokens": 58402143.0, "sample_num_tokens": 8655.25, "step": 8717, "total_num_tokens": 58436764.0, "z_loss": 0.0004082340165041387 }, { "copy_logits_max": -3.261230945587158, "copy_logits_min": -750000000.0, "copy_num_tokens": 253.5625, "epoch": 1.780393158029104, "gen_logits_max": 3.921034097671509, "gen_logits_mean": -16.757829666137695, "gen_logits_min": -28.93084716796875, "gen_logits_std": 3.3816449642181396, "gen_loss": 0.26036378741264343, "grad_norm": 0.37633821740410295, "learning_rate": 1.9917473684210528e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9974687397480011, "mean_gen_accuracy": 0.8843352943658829, "mean_token_accuracy": 0.9066912978887558, "num_tokens": 58662364.0, "sample_num_tokens": 6886.0, "step": 8718, "total_num_tokens": 58689908.0, "z_loss": 0.0003751590847969055 }, { "copy_logits_max": -5.974696159362793, "copy_logits_min": -687500032.0, "copy_num_tokens": 325.0625, "epoch": 1.7805973959663008, "gen_logits_max": 2.647322177886963, "gen_logits_mean": -18.769384384155273, "gen_logits_min": -30.625329971313477, "gen_logits_std": 3.468381404876709, "gen_loss": 0.2683968245983124, "grad_norm": 0.3446538651117228, "learning_rate": 1.991621052631579e-05, "loss": 0.272, "mean_copy_accuracy": 0.9971702992916107, "mean_gen_accuracy": 0.8808756321668625, "mean_token_accuracy": 0.9095444679260254, "num_tokens": 58945075.0, "sample_num_tokens": 7472.75, "step": 8719, "total_num_tokens": 58974966.0, "z_loss": 0.00036266830284148455 }, { "copy_logits_max": -3.8265905380249023, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.875, "epoch": 1.7808016339034975, "gen_logits_max": 2.368826389312744, "gen_logits_mean": -17.599925994873047, "gen_logits_min": -29.84920883178711, "gen_logits_std": 3.4389891624450684, "gen_loss": 0.2860024571418762, "grad_norm": 0.35502545458483126, "learning_rate": 1.9914947368421053e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9971917271614075, "mean_gen_accuracy": 0.8741766065359116, "mean_token_accuracy": 0.905050590634346, "num_tokens": 59221236.0, "sample_num_tokens": 8561.5, "step": 8720, "total_num_tokens": 59255482.0, "z_loss": 0.0004128899017814547 }, { "copy_logits_max": -3.4391491413116455, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.1875, "epoch": 1.7810058718406943, "gen_logits_max": 3.4198460578918457, "gen_logits_mean": -16.93015480041504, "gen_logits_min": -29.666561126708984, "gen_logits_std": 3.4258906841278076, "gen_loss": 0.2880118787288666, "grad_norm": 0.3298743817583573, "learning_rate": 1.9913684210526314e-05, "loss": 0.2626, "mean_copy_accuracy": 0.996450275182724, "mean_gen_accuracy": 0.8815554231405258, "mean_token_accuracy": 0.9093474745750427, "num_tokens": 59488483.0, "sample_num_tokens": 9115.75, "step": 8721, "total_num_tokens": 59524946.0, "z_loss": 0.0004048151313327253 }, { "copy_logits_max": -4.254413604736328, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.9375, "epoch": 1.7812101097778914, "gen_logits_max": 3.331195831298828, "gen_logits_mean": -18.469125747680664, "gen_logits_min": -30.495819091796875, "gen_logits_std": 3.4708518981933594, "gen_loss": 0.2756413221359253, "grad_norm": 0.33638082197388447, "learning_rate": 1.9912421052631578e-05, "loss": 0.263, "mean_copy_accuracy": 0.9971751272678375, "mean_gen_accuracy": 0.8805360496044159, "mean_token_accuracy": 0.9109851717948914, "num_tokens": 59788222.0, "sample_num_tokens": 7822.0, "step": 8722, "total_num_tokens": 59819510.0, "z_loss": 0.00038970011519268155 }, { "copy_logits_max": -0.005281783640384674, "copy_logits_min": -687500096.0, "copy_num_tokens": 590.1875, "epoch": 1.781414347715088, "gen_logits_max": 3.682894468307495, "gen_logits_mean": -15.806374549865723, "gen_logits_min": -28.86012077331543, "gen_logits_std": 3.389895439147949, "gen_loss": 0.24886804819107056, "grad_norm": 0.32059779558003426, "learning_rate": 1.9911157894736842e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9975043535232544, "mean_gen_accuracy": 0.8743709772825241, "mean_token_accuracy": 0.9085048884153366, "num_tokens": 60075536.0, "sample_num_tokens": 8532.5, "step": 8723, "total_num_tokens": 60109666.0, "z_loss": 0.0003671253798529506 }, { "copy_logits_max": -2.8074593544006348, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.5, "epoch": 1.7816185856522848, "gen_logits_max": 3.673781394958496, "gen_logits_mean": -17.521900177001953, "gen_logits_min": -29.74457550048828, "gen_logits_std": 3.4317662715911865, "gen_loss": 0.2905353605747223, "grad_norm": 0.33368327312771445, "learning_rate": 1.9909894736842107e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9968083798885345, "mean_gen_accuracy": 0.8765021860599518, "mean_token_accuracy": 0.9081466197967529, "num_tokens": 60347735.0, "sample_num_tokens": 8897.75, "step": 8724, "total_num_tokens": 60383326.0, "z_loss": 0.0003898197610396892 }, { "copy_logits_max": -4.610833168029785, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.125, "epoch": 1.7818228235894817, "gen_logits_max": 3.0313401222229004, "gen_logits_mean": -18.45156478881836, "gen_logits_min": -30.546016693115234, "gen_logits_std": 3.4942100048065186, "gen_loss": 0.25774911046028137, "grad_norm": 0.3283501723225753, "learning_rate": 1.990863157894737e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9971227645874023, "mean_gen_accuracy": 0.8818250894546509, "mean_token_accuracy": 0.9070156067609787, "num_tokens": 60609491.0, "sample_num_tokens": 8836.25, "step": 8725, "total_num_tokens": 60644836.0, "z_loss": 0.00040118489414453506 }, { "copy_logits_max": -0.25629279017448425, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.6875, "epoch": 1.7820270615266787, "gen_logits_max": 3.732546806335449, "gen_logits_mean": -16.991065979003906, "gen_logits_min": -29.45233917236328, "gen_logits_std": 3.4269566535949707, "gen_loss": 0.273200124502182, "grad_norm": 0.3239276407374054, "learning_rate": 1.9907368421052632e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9974804520606995, "mean_gen_accuracy": 0.8746224641799927, "mean_token_accuracy": 0.905322790145874, "num_tokens": 60885284.0, "sample_num_tokens": 8673.5, "step": 8726, "total_num_tokens": 60919978.0, "z_loss": 0.0003871795197483152 }, { "copy_logits_max": -1.675889253616333, "copy_logits_min": -750000128.0, "copy_num_tokens": 423.8125, "epoch": 1.7822312994638754, "gen_logits_max": 3.883324146270752, "gen_logits_mean": -16.447315216064453, "gen_logits_min": -29.141307830810547, "gen_logits_std": 3.403690814971924, "gen_loss": 0.3114820122718811, "grad_norm": 0.3745843181264374, "learning_rate": 1.9906105263157896e-05, "loss": 0.2974, "mean_copy_accuracy": 0.9971806555986404, "mean_gen_accuracy": 0.8672651797533035, "mean_token_accuracy": 0.8997122347354889, "num_tokens": 61150423.0, "sample_num_tokens": 8695.75, "step": 8727, "total_num_tokens": 61185206.0, "z_loss": 0.00048022050759755075 }, { "copy_logits_max": -2.789212703704834, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.25, "epoch": 1.7824355374010723, "gen_logits_max": 3.716905117034912, "gen_logits_mean": -16.90436363220215, "gen_logits_min": -29.18454360961914, "gen_logits_std": 3.3612070083618164, "gen_loss": 0.290189266204834, "grad_norm": 0.3506808227462571, "learning_rate": 1.9904842105263157e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9966285228729248, "mean_gen_accuracy": 0.8747962713241577, "mean_token_accuracy": 0.9063081592321396, "num_tokens": 61430355.0, "sample_num_tokens": 7866.75, "step": 8728, "total_num_tokens": 61461822.0, "z_loss": 0.00042510160710662603 }, { "copy_logits_max": -2.785531997680664, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.9375, "epoch": 1.7826397753382692, "gen_logits_max": 3.2250149250030518, "gen_logits_mean": -17.614665985107422, "gen_logits_min": -29.898033142089844, "gen_logits_std": 3.4288735389709473, "gen_loss": 0.2715614438056946, "grad_norm": 0.34618925809171536, "learning_rate": 1.990357894736842e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9975336343050003, "mean_gen_accuracy": 0.8721517473459244, "mean_token_accuracy": 0.9029133319854736, "num_tokens": 61702402.0, "sample_num_tokens": 7698.0, "step": 8729, "total_num_tokens": 61733194.0, "z_loss": 0.00043078832095488906 }, { "copy_logits_max": -4.686856269836426, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.875, "epoch": 1.782844013275466, "gen_logits_max": 3.744615077972412, "gen_logits_mean": -18.53341293334961, "gen_logits_min": -30.67766761779785, "gen_logits_std": 3.4617881774902344, "gen_loss": 0.270369291305542, "grad_norm": 0.35628724511881554, "learning_rate": 1.9902315789473682e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9966580867767334, "mean_gen_accuracy": 0.8800780922174454, "mean_token_accuracy": 0.9057510048151016, "num_tokens": 61970149.0, "sample_num_tokens": 8384.75, "step": 8730, "total_num_tokens": 62003688.0, "z_loss": 0.00042069313349202275 }, { "copy_logits_max": -3.8806803226470947, "copy_logits_min": -687500096.0, "copy_num_tokens": 392.125, "epoch": 1.7830482512126626, "gen_logits_max": 2.4278767108917236, "gen_logits_mean": -19.82105255126953, "gen_logits_min": -32.033599853515625, "gen_logits_std": 3.5141682624816895, "gen_loss": 0.2797688841819763, "grad_norm": 0.3179159877063572, "learning_rate": 1.990105263157895e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9963474720716476, "mean_gen_accuracy": 0.8820589780807495, "mean_token_accuracy": 0.9052901864051819, "num_tokens": 62233215.0, "sample_num_tokens": 7642.25, "step": 8731, "total_num_tokens": 62263784.0, "z_loss": 0.0004597450024448335 }, { "copy_logits_max": -5.409634590148926, "copy_logits_min": -750000000.0, "copy_num_tokens": 247.25, "epoch": 1.7832524891498596, "gen_logits_max": 4.37626314163208, "gen_logits_mean": -18.11556053161621, "gen_logits_min": -30.005640029907227, "gen_logits_std": 3.438162326812744, "gen_loss": 0.29585057497024536, "grad_norm": 0.36971732662294726, "learning_rate": 1.989978947368421e-05, "loss": 0.2999, "mean_copy_accuracy": 0.9966547042131424, "mean_gen_accuracy": 0.8730297237634659, "mean_token_accuracy": 0.8984334319829941, "num_tokens": 62483645.0, "sample_num_tokens": 6737.75, "step": 8732, "total_num_tokens": 62510596.0, "z_loss": 0.0004986521671526134 }, { "copy_logits_max": -2.0890328884124756, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.75, "epoch": 1.7834567270870565, "gen_logits_max": 3.7492458820343018, "gen_logits_mean": -17.649646759033203, "gen_logits_min": -29.823768615722656, "gen_logits_std": 3.425529718399048, "gen_loss": 0.2877454161643982, "grad_norm": 0.34315457506740277, "learning_rate": 1.9898526315789475e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9966287016868591, "mean_gen_accuracy": 0.8786938935518265, "mean_token_accuracy": 0.9048694521188736, "num_tokens": 62747154.0, "sample_num_tokens": 8569.0, "step": 8733, "total_num_tokens": 62781430.0, "z_loss": 0.00043188914423808455 }, { "copy_logits_max": -3.831768035888672, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.3125, "epoch": 1.7836609650242532, "gen_logits_max": 3.104196071624756, "gen_logits_mean": -17.927474975585938, "gen_logits_min": -30.328777313232422, "gen_logits_std": 3.449183940887451, "gen_loss": 0.28812652826309204, "grad_norm": 0.3786090662888466, "learning_rate": 1.9897263157894736e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9967836737632751, "mean_gen_accuracy": 0.8762989342212677, "mean_token_accuracy": 0.9031940847635269, "num_tokens": 63015420.0, "sample_num_tokens": 8444.5, "step": 8734, "total_num_tokens": 63049198.0, "z_loss": 0.0004634774522855878 }, { "copy_logits_max": -2.465373992919922, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.125, "epoch": 1.7838652029614501, "gen_logits_max": 5.088110446929932, "gen_logits_mean": -14.315110206604004, "gen_logits_min": -27.241775512695312, "gen_logits_std": 3.3381807804107666, "gen_loss": 0.23866263031959534, "grad_norm": 0.3463038853179779, "learning_rate": 1.9896e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9970929473638535, "mean_gen_accuracy": 0.8795978426933289, "mean_token_accuracy": 0.9077358245849609, "num_tokens": 63281557.0, "sample_num_tokens": 9290.75, "step": 8735, "total_num_tokens": 63318720.0, "z_loss": 0.00035421023494563997 }, { "copy_logits_max": -0.45937201380729675, "copy_logits_min": -750000000.0, "copy_num_tokens": 740.875, "epoch": 1.784069440898647, "gen_logits_max": 2.6970677375793457, "gen_logits_mean": -17.315622329711914, "gen_logits_min": -29.661354064941406, "gen_logits_std": 3.4375596046447754, "gen_loss": 0.21669849753379822, "grad_norm": 0.324261404055038, "learning_rate": 1.989473684210526e-05, "loss": 0.2519, "mean_copy_accuracy": 0.9971114546060562, "mean_gen_accuracy": 0.8813156932592392, "mean_token_accuracy": 0.9132735431194305, "num_tokens": 63573279.0, "sample_num_tokens": 10908.25, "step": 8736, "total_num_tokens": 63616912.0, "z_loss": 0.0003745421417988837 }, { "copy_logits_max": -1.0329563617706299, "copy_logits_min": -750000064.0, "copy_num_tokens": 439.6875, "epoch": 1.7842736788358438, "gen_logits_max": 3.827331781387329, "gen_logits_mean": -17.05150032043457, "gen_logits_min": -29.42431640625, "gen_logits_std": 3.4049177169799805, "gen_loss": 0.3173988461494446, "grad_norm": 0.37155013516582763, "learning_rate": 1.9893473684210526e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9981360137462616, "mean_gen_accuracy": 0.8779609948396683, "mean_token_accuracy": 0.9091055244207382, "num_tokens": 63846234.0, "sample_num_tokens": 8498.5, "step": 8737, "total_num_tokens": 63880228.0, "z_loss": 0.0004946585977450013 }, { "copy_logits_max": 0.028914034366607666, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.875, "epoch": 1.7844779167730405, "gen_logits_max": 4.529488563537598, "gen_logits_mean": -14.33768367767334, "gen_logits_min": -26.964500427246094, "gen_logits_std": 3.2662620544433594, "gen_loss": 0.28808435797691345, "grad_norm": 0.3431278149508379, "learning_rate": 1.989221052631579e-05, "loss": 0.276, "mean_copy_accuracy": 0.9973278939723969, "mean_gen_accuracy": 0.8790376037359238, "mean_token_accuracy": 0.907069593667984, "num_tokens": 64118823.0, "sample_num_tokens": 9130.75, "step": 8738, "total_num_tokens": 64155346.0, "z_loss": 0.000439674302469939 }, { "copy_logits_max": -2.401639938354492, "copy_logits_min": -687500032.0, "copy_num_tokens": 566.875, "epoch": 1.7846821547102374, "gen_logits_max": 2.662243127822876, "gen_logits_mean": -16.746475219726562, "gen_logits_min": -29.595144271850586, "gen_logits_std": 3.3718652725219727, "gen_loss": 0.30589109659194946, "grad_norm": 0.3782686369708295, "learning_rate": 1.9890947368421054e-05, "loss": 0.2943, "mean_copy_accuracy": 0.99714794754982, "mean_gen_accuracy": 0.865791842341423, "mean_token_accuracy": 0.9004643112421036, "num_tokens": 64384395.0, "sample_num_tokens": 8080.25, "step": 8739, "total_num_tokens": 64416716.0, "z_loss": 0.0004759141302201897 }, { "copy_logits_max": -5.196699142456055, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.25, "epoch": 1.7848863926474343, "gen_logits_max": 3.01427960395813, "gen_logits_mean": -17.98727798461914, "gen_logits_min": -30.567142486572266, "gen_logits_std": 3.4347739219665527, "gen_loss": 0.28180554509162903, "grad_norm": 0.3711257927929809, "learning_rate": 1.988968421052632e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9968487918376923, "mean_gen_accuracy": 0.8767559081315994, "mean_token_accuracy": 0.9084901958703995, "num_tokens": 64663268.0, "sample_num_tokens": 7169.0, "step": 8740, "total_num_tokens": 64691944.0, "z_loss": 0.00041397137101739645 }, { "copy_logits_max": -3.5999770164489746, "copy_logits_min": -750000000.0, "copy_num_tokens": 282.875, "epoch": 1.785090630584631, "gen_logits_max": 4.602538108825684, "gen_logits_mean": -16.10384750366211, "gen_logits_min": -28.781665802001953, "gen_logits_std": 3.3530325889587402, "gen_loss": 0.2677004039287567, "grad_norm": 0.35036926354592046, "learning_rate": 1.988842105263158e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9974697381258011, "mean_gen_accuracy": 0.8828165084123611, "mean_token_accuracy": 0.9079404771327972, "num_tokens": 64906527.0, "sample_num_tokens": 6975.25, "step": 8741, "total_num_tokens": 64934428.0, "z_loss": 0.00039673419087193906 }, { "copy_logits_max": -4.612433433532715, "copy_logits_min": -750000000.0, "copy_num_tokens": 633.875, "epoch": 1.785294868521828, "gen_logits_max": 2.5599043369293213, "gen_logits_mean": -18.68334197998047, "gen_logits_min": -31.1214599609375, "gen_logits_std": 3.464298963546753, "gen_loss": 0.2797126770019531, "grad_norm": 0.34184632096068784, "learning_rate": 1.9887157894736844e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9977417439222336, "mean_gen_accuracy": 0.8757653534412384, "mean_token_accuracy": 0.9092036485671997, "num_tokens": 65175618.0, "sample_num_tokens": 9785.0, "step": 8742, "total_num_tokens": 65214758.0, "z_loss": 0.00043017376447096467 }, { "copy_logits_max": -1.2168176174163818, "copy_logits_min": -687500032.0, "copy_num_tokens": 590.4375, "epoch": 1.7854991064590249, "gen_logits_max": 3.436182975769043, "gen_logits_mean": -15.979705810546875, "gen_logits_min": -28.560134887695312, "gen_logits_std": 3.353224277496338, "gen_loss": 0.2606254518032074, "grad_norm": 0.3755469066968208, "learning_rate": 1.9885894736842105e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9966955184936523, "mean_gen_accuracy": 0.8702129423618317, "mean_token_accuracy": 0.9037742763757706, "num_tokens": 65435183.0, "sample_num_tokens": 8301.75, "step": 8743, "total_num_tokens": 65468390.0, "z_loss": 0.0004025754751637578 }, { "copy_logits_max": -6.769379138946533, "copy_logits_min": -750000000.0, "copy_num_tokens": 328.5625, "epoch": 1.7857033443962216, "gen_logits_max": 2.807805061340332, "gen_logits_mean": -19.060842514038086, "gen_logits_min": -31.062976837158203, "gen_logits_std": 3.465745210647583, "gen_loss": 0.2994391620159149, "grad_norm": 0.40871952065210543, "learning_rate": 1.988463157894737e-05, "loss": 0.2906, "mean_copy_accuracy": 0.996366873383522, "mean_gen_accuracy": 0.8711177557706833, "mean_token_accuracy": 0.9015260189771652, "num_tokens": 65685231.0, "sample_num_tokens": 7303.75, "step": 8744, "total_num_tokens": 65714446.0, "z_loss": 0.000411692017223686 }, { "copy_logits_max": -7.416865825653076, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.1875, "epoch": 1.7859075823334183, "gen_logits_max": 3.0617597103118896, "gen_logits_mean": -17.998729705810547, "gen_logits_min": -30.31307601928711, "gen_logits_std": 3.435178756713867, "gen_loss": 0.2915889620780945, "grad_norm": 0.3451772632362456, "learning_rate": 1.988336842105263e-05, "loss": 0.2564, "mean_copy_accuracy": 0.9975808262825012, "mean_gen_accuracy": 0.8823262155056, "mean_token_accuracy": 0.9127875566482544, "num_tokens": 65976424.0, "sample_num_tokens": 7540.0, "step": 8745, "total_num_tokens": 66006584.0, "z_loss": 0.0003900468582287431 }, { "copy_logits_max": -5.258201599121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.25, "epoch": 1.7861118202706152, "gen_logits_max": 3.087284564971924, "gen_logits_mean": -17.636676788330078, "gen_logits_min": -30.071266174316406, "gen_logits_std": 3.425466537475586, "gen_loss": 0.274649441242218, "grad_norm": 0.3597553691866038, "learning_rate": 1.9882105263157894e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9971529543399811, "mean_gen_accuracy": 0.8753882795572281, "mean_token_accuracy": 0.9078478813171387, "num_tokens": 66241924.0, "sample_num_tokens": 7784.0, "step": 8746, "total_num_tokens": 66273060.0, "z_loss": 0.00041614854126237333 }, { "copy_logits_max": -4.191214561462402, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.5625, "epoch": 1.7863160582078121, "gen_logits_max": 2.2880477905273438, "gen_logits_mean": -18.319110870361328, "gen_logits_min": -31.143310546875, "gen_logits_std": 3.45247220993042, "gen_loss": 0.24347518384456635, "grad_norm": 0.3746012885516959, "learning_rate": 1.988084210526316e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9975026696920395, "mean_gen_accuracy": 0.8811532557010651, "mean_token_accuracy": 0.9087662845849991, "num_tokens": 66506640.0, "sample_num_tokens": 8473.5, "step": 8747, "total_num_tokens": 66540534.0, "z_loss": 0.0003654558677226305 }, { "copy_logits_max": -5.081840515136719, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.625, "epoch": 1.7865202961450088, "gen_logits_max": 3.6418468952178955, "gen_logits_mean": -16.58733558654785, "gen_logits_min": -29.67409324645996, "gen_logits_std": 3.4023189544677734, "gen_loss": 0.25061142444610596, "grad_norm": 0.37447705514766916, "learning_rate": 1.9879578947368423e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9974315464496613, "mean_gen_accuracy": 0.8753688782453537, "mean_token_accuracy": 0.9015044420957565, "num_tokens": 66774452.0, "sample_num_tokens": 8095.0, "step": 8748, "total_num_tokens": 66806832.0, "z_loss": 0.0003762508858926594 }, { "copy_logits_max": -6.138042449951172, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.8125, "epoch": 1.7867245340822058, "gen_logits_max": 2.662471294403076, "gen_logits_mean": -18.221656799316406, "gen_logits_min": -30.25982666015625, "gen_logits_std": 3.4083595275878906, "gen_loss": 0.2975122332572937, "grad_norm": 0.3887207513509167, "learning_rate": 1.9878315789473684e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9973257035017014, "mean_gen_accuracy": 0.8781388401985168, "mean_token_accuracy": 0.9085483253002167, "num_tokens": 67039440.0, "sample_num_tokens": 8462.0, "step": 8749, "total_num_tokens": 67073288.0, "z_loss": 0.00042034080252051353 }, { "copy_logits_max": -3.5827462673187256, "copy_logits_min": -750000000.0, "copy_num_tokens": 581.375, "epoch": 1.7869287720194027, "gen_logits_max": 2.5287418365478516, "gen_logits_mean": -17.379629135131836, "gen_logits_min": -30.50295639038086, "gen_logits_std": 3.396726369857788, "gen_loss": 0.25755324959754944, "grad_norm": 0.33902490641182814, "learning_rate": 1.987705263157895e-05, "loss": 0.2766, "mean_copy_accuracy": 0.997601717710495, "mean_gen_accuracy": 0.8729861974716187, "mean_token_accuracy": 0.9068702459335327, "num_tokens": 67320185.0, "sample_num_tokens": 8779.25, "step": 8750, "total_num_tokens": 67355302.0, "z_loss": 0.00043128133984282613 }, { "copy_logits_max": -6.00988245010376, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.0625, "epoch": 1.7871330099565994, "gen_logits_max": 3.2577223777770996, "gen_logits_mean": -17.649578094482422, "gen_logits_min": -30.509567260742188, "gen_logits_std": 3.3992104530334473, "gen_loss": 0.25376570224761963, "grad_norm": 0.3490283893440752, "learning_rate": 1.987578947368421e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9974483102560043, "mean_gen_accuracy": 0.8801688700914383, "mean_token_accuracy": 0.9095643311738968, "num_tokens": 67598731.0, "sample_num_tokens": 7840.25, "step": 8751, "total_num_tokens": 67630092.0, "z_loss": 0.00041020801290869713 }, { "copy_logits_max": -4.65201997756958, "copy_logits_min": -687500032.0, "copy_num_tokens": 391.9375, "epoch": 1.787337247893796, "gen_logits_max": 4.209650039672852, "gen_logits_mean": -17.143592834472656, "gen_logits_min": -29.566007614135742, "gen_logits_std": 3.3635292053222656, "gen_loss": 0.3021450638771057, "grad_norm": 0.38255039595663315, "learning_rate": 1.9874526315789474e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9975902736186981, "mean_gen_accuracy": 0.8857971429824829, "mean_token_accuracy": 0.9094639718532562, "num_tokens": 67856905.0, "sample_num_tokens": 8444.25, "step": 8752, "total_num_tokens": 67890682.0, "z_loss": 0.0004938513739034534 }, { "copy_logits_max": -2.853614330291748, "copy_logits_min": -687500032.0, "copy_num_tokens": 371.6875, "epoch": 1.7875414858309933, "gen_logits_max": 3.3204474449157715, "gen_logits_mean": -17.91246223449707, "gen_logits_min": -30.429058074951172, "gen_logits_std": 3.4191336631774902, "gen_loss": 0.3028792142868042, "grad_norm": 0.3786817849049688, "learning_rate": 1.9873263157894738e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9976008832454681, "mean_gen_accuracy": 0.8764189928770065, "mean_token_accuracy": 0.904216393828392, "num_tokens": 68112307.0, "sample_num_tokens": 7329.25, "step": 8753, "total_num_tokens": 68141624.0, "z_loss": 0.0005584657774306834 }, { "copy_logits_max": -5.32396125793457, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 1.78774572376819, "gen_logits_max": 3.3243250846862793, "gen_logits_mean": -18.73194694519043, "gen_logits_min": -30.9530086517334, "gen_logits_std": 3.4295027256011963, "gen_loss": 0.2839883863925934, "grad_norm": 0.3520079141446098, "learning_rate": 1.9872e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9973781704902649, "mean_gen_accuracy": 0.8781297504901886, "mean_token_accuracy": 0.9059429168701172, "num_tokens": 68377498.0, "sample_num_tokens": 7754.0, "step": 8754, "total_num_tokens": 68408514.0, "z_loss": 0.00045546048204414546 }, { "copy_logits_max": -6.246697425842285, "copy_logits_min": -750000000.0, "copy_num_tokens": 245.6875, "epoch": 1.7879499617053867, "gen_logits_max": 4.238961219787598, "gen_logits_mean": -17.510072708129883, "gen_logits_min": -29.60310935974121, "gen_logits_std": 3.3779044151306152, "gen_loss": 0.3014669716358185, "grad_norm": 0.3679274531340126, "learning_rate": 1.9870736842105266e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9979610443115234, "mean_gen_accuracy": 0.8816185593605042, "mean_token_accuracy": 0.9033338725566864, "num_tokens": 68626456.0, "sample_num_tokens": 7304.5, "step": 8755, "total_num_tokens": 68655674.0, "z_loss": 0.00044567149598151445 }, { "copy_logits_max": -1.8596270084381104, "copy_logits_min": -750000064.0, "copy_num_tokens": 894.25, "epoch": 1.7881541996425836, "gen_logits_max": 2.2439804077148438, "gen_logits_mean": -16.767549514770508, "gen_logits_min": -29.91539192199707, "gen_logits_std": 3.423046827316284, "gen_loss": 0.20402367413043976, "grad_norm": 0.32974897450472684, "learning_rate": 1.9869473684210527e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9965623468160629, "mean_gen_accuracy": 0.8750646710395813, "mean_token_accuracy": 0.905613362789154, "num_tokens": 68919080.0, "sample_num_tokens": 11272.5, "step": 8756, "total_num_tokens": 68964170.0, "z_loss": 0.00032195591484196484 }, { "copy_logits_max": -4.375673770904541, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.5625, "epoch": 1.7883584375797805, "gen_logits_max": 4.429769515991211, "gen_logits_mean": -15.690591812133789, "gen_logits_min": -27.653182983398438, "gen_logits_std": 3.3151888847351074, "gen_loss": 0.2622109651565552, "grad_norm": 0.33388022049002175, "learning_rate": 1.9868210526315792e-05, "loss": 0.2681, "mean_copy_accuracy": 0.997236043214798, "mean_gen_accuracy": 0.8782037943601608, "mean_token_accuracy": 0.9079562723636627, "num_tokens": 69208072.0, "sample_num_tokens": 9037.0, "step": 8757, "total_num_tokens": 69244220.0, "z_loss": 0.0003999838954769075 }, { "copy_logits_max": -2.310245990753174, "copy_logits_min": -750000064.0, "copy_num_tokens": 547.875, "epoch": 1.7885626755169772, "gen_logits_max": 3.2970714569091797, "gen_logits_mean": -16.438804626464844, "gen_logits_min": -29.092981338500977, "gen_logits_std": 3.3574678897857666, "gen_loss": 0.26241639256477356, "grad_norm": 0.37473028934311203, "learning_rate": 1.9866947368421053e-05, "loss": 0.2663, "mean_copy_accuracy": 0.996941015124321, "mean_gen_accuracy": 0.8809899836778641, "mean_token_accuracy": 0.9089828133583069, "num_tokens": 69457913.0, "sample_num_tokens": 8594.25, "step": 8758, "total_num_tokens": 69492290.0, "z_loss": 0.0004494872409850359 }, { "copy_logits_max": -4.073315620422363, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.5, "epoch": 1.7887669134541742, "gen_logits_max": 2.4467318058013916, "gen_logits_mean": -17.144046783447266, "gen_logits_min": -30.223814010620117, "gen_logits_std": 3.3980202674865723, "gen_loss": 0.2691909372806549, "grad_norm": 0.36199961410985704, "learning_rate": 1.9865684210526317e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9975661188364029, "mean_gen_accuracy": 0.8760017156600952, "mean_token_accuracy": 0.9101230800151825, "num_tokens": 69732806.0, "sample_num_tokens": 8012.0, "step": 8759, "total_num_tokens": 69764854.0, "z_loss": 0.0004435188602656126 }, { "copy_logits_max": -5.942296028137207, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.875, "epoch": 1.788971151391371, "gen_logits_max": 3.2822303771972656, "gen_logits_mean": -18.442554473876953, "gen_logits_min": -30.581308364868164, "gen_logits_std": 3.432608127593994, "gen_loss": 0.29248905181884766, "grad_norm": 0.3943999223956066, "learning_rate": 1.9864421052631578e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9967199712991714, "mean_gen_accuracy": 0.8729960024356842, "mean_token_accuracy": 0.9015465676784515, "num_tokens": 70006632.0, "sample_num_tokens": 9203.5, "step": 8760, "total_num_tokens": 70043446.0, "z_loss": 0.00047864363295957446 }, { "copy_logits_max": -6.861423492431641, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.8125, "epoch": 1.7891753893285678, "gen_logits_max": 3.1631851196289062, "gen_logits_mean": -17.995695114135742, "gen_logits_min": -29.842594146728516, "gen_logits_std": 3.385714292526245, "gen_loss": 0.27893370389938354, "grad_norm": 0.32969656886577986, "learning_rate": 1.9863157894736842e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9975295811891556, "mean_gen_accuracy": 0.8720317929983139, "mean_token_accuracy": 0.9008650928735733, "num_tokens": 70280781.0, "sample_num_tokens": 9570.25, "step": 8761, "total_num_tokens": 70319062.0, "z_loss": 0.00047948904102668166 }, { "copy_logits_max": -5.4151835441589355, "copy_logits_min": -750000000.0, "copy_num_tokens": 344.4375, "epoch": 1.7893796272657645, "gen_logits_max": 3.6372220516204834, "gen_logits_mean": -17.501022338867188, "gen_logits_min": -29.788387298583984, "gen_logits_std": 3.3836684226989746, "gen_loss": 0.2809898257255554, "grad_norm": 0.3403735197097848, "learning_rate": 1.9861894736842103e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9977973848581314, "mean_gen_accuracy": 0.8742351979017258, "mean_token_accuracy": 0.9055034071207047, "num_tokens": 70546194.0, "sample_num_tokens": 7101.5, "step": 8762, "total_num_tokens": 70574600.0, "z_loss": 0.0004569543816614896 }, { "copy_logits_max": -5.645925521850586, "copy_logits_min": -687500032.0, "copy_num_tokens": 253.8125, "epoch": 1.7895838652029614, "gen_logits_max": 5.481655597686768, "gen_logits_mean": -14.618720054626465, "gen_logits_min": -27.383102416992188, "gen_logits_std": 3.266467571258545, "gen_loss": 0.2670885920524597, "grad_norm": 0.33008602497367845, "learning_rate": 1.986063157894737e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9970989227294922, "mean_gen_accuracy": 0.8838821351528168, "mean_token_accuracy": 0.9117400497198105, "num_tokens": 70813842.0, "sample_num_tokens": 6450.0, "step": 8763, "total_num_tokens": 70839642.0, "z_loss": 0.0004358800360932946 }, { "copy_logits_max": -7.197651386260986, "copy_logits_min": -750000000.0, "copy_num_tokens": 279.75, "epoch": 1.7897881031401583, "gen_logits_max": 4.442301273345947, "gen_logits_mean": -17.592458724975586, "gen_logits_min": -29.503782272338867, "gen_logits_std": 3.3847832679748535, "gen_loss": 0.3010575771331787, "grad_norm": 0.35962147549800255, "learning_rate": 1.9859368421052632e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9969374984502792, "mean_gen_accuracy": 0.8776339888572693, "mean_token_accuracy": 0.9043979197740555, "num_tokens": 71068415.0, "sample_num_tokens": 7739.25, "step": 8764, "total_num_tokens": 71099372.0, "z_loss": 0.0005007496802136302 }, { "copy_logits_max": -5.689314842224121, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.9375, "epoch": 1.789992341077355, "gen_logits_max": 3.1808691024780273, "gen_logits_mean": -18.142658233642578, "gen_logits_min": -30.159997940063477, "gen_logits_std": 3.3996124267578125, "gen_loss": 0.28135937452316284, "grad_norm": 0.32201247735777916, "learning_rate": 1.9858105263157896e-05, "loss": 0.283, "mean_copy_accuracy": 0.9980231076478958, "mean_gen_accuracy": 0.8760168552398682, "mean_token_accuracy": 0.9037584662437439, "num_tokens": 71343268.0, "sample_num_tokens": 8643.5, "step": 8765, "total_num_tokens": 71377842.0, "z_loss": 0.000514709041453898 }, { "copy_logits_max": -3.974180221557617, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.875, "epoch": 1.790196579014552, "gen_logits_max": 3.389354944229126, "gen_logits_mean": -17.056852340698242, "gen_logits_min": -28.9086971282959, "gen_logits_std": 3.3748223781585693, "gen_loss": 0.2784547209739685, "grad_norm": 0.33099749886450425, "learning_rate": 1.985684210526316e-05, "loss": 0.2679, "mean_copy_accuracy": 0.996745765209198, "mean_gen_accuracy": 0.8787097930908203, "mean_token_accuracy": 0.9089789241552353, "num_tokens": 71626527.0, "sample_num_tokens": 9359.25, "step": 8766, "total_num_tokens": 71663964.0, "z_loss": 0.00048086117021739483 }, { "copy_logits_max": -3.971086025238037, "copy_logits_min": -687500032.0, "copy_num_tokens": 442.3125, "epoch": 1.790400816951749, "gen_logits_max": 2.9999990463256836, "gen_logits_mean": -17.34712791442871, "gen_logits_min": -29.578466415405273, "gen_logits_std": 3.3987088203430176, "gen_loss": 0.3263987898826599, "grad_norm": 0.3671713365246413, "learning_rate": 1.985557894736842e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9971572160720825, "mean_gen_accuracy": 0.8715258687734604, "mean_token_accuracy": 0.8997242003679276, "num_tokens": 71882556.0, "sample_num_tokens": 7464.0, "step": 8767, "total_num_tokens": 71912412.0, "z_loss": 0.0005465829744935036 }, { "copy_logits_max": -3.7709267139434814, "copy_logits_min": -750000000.0, "copy_num_tokens": 538.5625, "epoch": 1.7906050548889456, "gen_logits_max": 3.0303080081939697, "gen_logits_mean": -17.352983474731445, "gen_logits_min": -29.42947006225586, "gen_logits_std": 3.412796974182129, "gen_loss": 0.2197495400905609, "grad_norm": 0.30818625596537114, "learning_rate": 1.9854315789473686e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9970863461494446, "mean_gen_accuracy": 0.882177859544754, "mean_token_accuracy": 0.9104629307985306, "num_tokens": 72157820.0, "sample_num_tokens": 8704.5, "step": 8768, "total_num_tokens": 72192638.0, "z_loss": 0.00035532552283257246 }, { "copy_logits_max": -1.470093846321106, "copy_logits_min": -687500032.0, "copy_num_tokens": 390.3125, "epoch": 1.7908092928261423, "gen_logits_max": 4.305505752563477, "gen_logits_mean": -15.046674728393555, "gen_logits_min": -27.39832878112793, "gen_logits_std": 3.278547525405884, "gen_loss": 0.3064487874507904, "grad_norm": 0.3387588095222269, "learning_rate": 1.9853052631578947e-05, "loss": 0.281, "mean_copy_accuracy": 0.9977342933416367, "mean_gen_accuracy": 0.8731093853712082, "mean_token_accuracy": 0.9052680432796478, "num_tokens": 72440563.0, "sample_num_tokens": 7843.25, "step": 8769, "total_num_tokens": 72471936.0, "z_loss": 0.0004959213547408581 }, { "copy_logits_max": -4.772111892700195, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.375, "epoch": 1.7910135307633392, "gen_logits_max": 3.584495782852173, "gen_logits_mean": -17.406421661376953, "gen_logits_min": -29.66509246826172, "gen_logits_std": 3.410576820373535, "gen_loss": 0.31243884563446045, "grad_norm": 0.3501215211378105, "learning_rate": 1.985178947368421e-05, "loss": 0.2871, "mean_copy_accuracy": 0.997073158621788, "mean_gen_accuracy": 0.8724246472120285, "mean_token_accuracy": 0.9051921367645264, "num_tokens": 72738011.0, "sample_num_tokens": 9221.75, "step": 8770, "total_num_tokens": 72774898.0, "z_loss": 0.0005062692798674107 }, { "copy_logits_max": -3.5970239639282227, "copy_logits_min": -687500096.0, "copy_num_tokens": 484.9375, "epoch": 1.7912177687005362, "gen_logits_max": 3.8216052055358887, "gen_logits_mean": -16.62735366821289, "gen_logits_min": -28.885787963867188, "gen_logits_std": 3.3249666690826416, "gen_loss": 0.2782926559448242, "grad_norm": 0.38501104259929203, "learning_rate": 1.9850526315789472e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9969616234302521, "mean_gen_accuracy": 0.8780548423528671, "mean_token_accuracy": 0.9072452187538147, "num_tokens": 73035598.0, "sample_num_tokens": 9058.5, "step": 8771, "total_num_tokens": 73071832.0, "z_loss": 0.00045618327567353845 }, { "copy_logits_max": -2.012305736541748, "copy_logits_min": -750000064.0, "copy_num_tokens": 480.3125, "epoch": 1.7914220066377329, "gen_logits_max": 2.7994813919067383, "gen_logits_mean": -18.192758560180664, "gen_logits_min": -31.212926864624023, "gen_logits_std": 3.45725679397583, "gen_loss": 0.2720791697502136, "grad_norm": 0.3663173131414335, "learning_rate": 1.984926315789474e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9971377998590469, "mean_gen_accuracy": 0.8753269761800766, "mean_token_accuracy": 0.9068952947854996, "num_tokens": 73315597.0, "sample_num_tokens": 8240.75, "step": 8772, "total_num_tokens": 73348560.0, "z_loss": 0.00044582868576981127 }, { "copy_logits_max": -5.24249267578125, "copy_logits_min": -750000000.0, "copy_num_tokens": 201.75, "epoch": 1.7916262445749298, "gen_logits_max": 4.657371997833252, "gen_logits_mean": -16.878707885742188, "gen_logits_min": -29.110130310058594, "gen_logits_std": 3.364670991897583, "gen_loss": 0.3005113899707794, "grad_norm": 0.3826404862871399, "learning_rate": 1.9848e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9964393377304077, "mean_gen_accuracy": 0.8761608600616455, "mean_token_accuracy": 0.9036495536565781, "num_tokens": 73604112.0, "sample_num_tokens": 6705.5, "step": 8773, "total_num_tokens": 73630934.0, "z_loss": 0.00045442383270710707 }, { "copy_logits_max": -3.691488742828369, "copy_logits_min": -750000000.0, "copy_num_tokens": 431.75, "epoch": 1.7918304825121267, "gen_logits_max": 3.1128201484680176, "gen_logits_mean": -18.279010772705078, "gen_logits_min": -31.101465225219727, "gen_logits_std": 3.4731462001800537, "gen_loss": 0.2558891177177429, "grad_norm": 0.35921183743464286, "learning_rate": 1.9846736842105265e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9965555816888809, "mean_gen_accuracy": 0.8755717724561691, "mean_token_accuracy": 0.9023991227149963, "num_tokens": 73871936.0, "sample_num_tokens": 8297.5, "step": 8774, "total_num_tokens": 73905126.0, "z_loss": 0.00039276390452869236 }, { "copy_logits_max": -1.914841651916504, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.5625, "epoch": 1.7920347204493234, "gen_logits_max": 3.228025436401367, "gen_logits_mean": -16.908035278320312, "gen_logits_min": -29.373031616210938, "gen_logits_std": 3.436901807785034, "gen_loss": 0.25860729813575745, "grad_norm": 0.3648186264151752, "learning_rate": 1.9845473684210526e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9974749386310577, "mean_gen_accuracy": 0.880483478307724, "mean_token_accuracy": 0.9097402989864349, "num_tokens": 74130585.0, "sample_num_tokens": 9150.25, "step": 8775, "total_num_tokens": 74167186.0, "z_loss": 0.0004617595986928791 }, { "copy_logits_max": -4.873594284057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 555.875, "epoch": 1.7922389583865201, "gen_logits_max": 2.068171501159668, "gen_logits_mean": -19.042076110839844, "gen_logits_min": -31.457504272460938, "gen_logits_std": 3.5122618675231934, "gen_loss": 0.2548961043357849, "grad_norm": 0.36006272277824813, "learning_rate": 1.984421052631579e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9967333823442459, "mean_gen_accuracy": 0.8768497854471207, "mean_token_accuracy": 0.9085224121809006, "num_tokens": 74427738.0, "sample_num_tokens": 8742.0, "step": 8776, "total_num_tokens": 74462706.0, "z_loss": 0.00041847751708701253 }, { "copy_logits_max": -3.7471225261688232, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.75, "epoch": 1.7924431963237173, "gen_logits_max": 3.4505996704101562, "gen_logits_mean": -16.52611541748047, "gen_logits_min": -29.357311248779297, "gen_logits_std": 3.391895294189453, "gen_loss": 0.24830026924610138, "grad_norm": 0.3505141765528896, "learning_rate": 1.984294736842105e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9976783990859985, "mean_gen_accuracy": 0.883150264620781, "mean_token_accuracy": 0.9104428440332413, "num_tokens": 74711422.0, "sample_num_tokens": 9936.0, "step": 8777, "total_num_tokens": 74751166.0, "z_loss": 0.0003409316996112466 }, { "copy_logits_max": -4.196203231811523, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.8125, "epoch": 1.792647434260914, "gen_logits_max": 3.131988286972046, "gen_logits_mean": -17.60903549194336, "gen_logits_min": -30.266263961791992, "gen_logits_std": 3.438883066177368, "gen_loss": 0.24674390256404877, "grad_norm": 0.366851330197568, "learning_rate": 1.9841684210526315e-05, "loss": 0.2688, "mean_copy_accuracy": 0.9964727163314819, "mean_gen_accuracy": 0.8821747004985809, "mean_token_accuracy": 0.9084853976964951, "num_tokens": 74975633.0, "sample_num_tokens": 6947.75, "step": 8778, "total_num_tokens": 75003424.0, "z_loss": 0.00040985996020026505 }, { "copy_logits_max": -4.963620185852051, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.375, "epoch": 1.7928516721981107, "gen_logits_max": 2.92334246635437, "gen_logits_mean": -18.560279846191406, "gen_logits_min": -31.3482723236084, "gen_logits_std": 3.4699370861053467, "gen_loss": 0.28030288219451904, "grad_norm": 0.34281528645015774, "learning_rate": 1.984042105263158e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9968437701463699, "mean_gen_accuracy": 0.8778436779975891, "mean_token_accuracy": 0.9094340056180954, "num_tokens": 75257629.0, "sample_num_tokens": 8039.25, "step": 8779, "total_num_tokens": 75289786.0, "z_loss": 0.000441898126155138 }, { "copy_logits_max": -0.515514612197876, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.75, "epoch": 1.7930559101353076, "gen_logits_max": 3.6612324714660645, "gen_logits_mean": -16.505271911621094, "gen_logits_min": -28.84627914428711, "gen_logits_std": 3.389218330383301, "gen_loss": 0.2660645544528961, "grad_norm": 0.3444311231767362, "learning_rate": 1.9839157894736844e-05, "loss": 0.2561, "mean_copy_accuracy": 0.9970072358846664, "mean_gen_accuracy": 0.8810336291790009, "mean_token_accuracy": 0.9133686721324921, "num_tokens": 75541181.0, "sample_num_tokens": 8214.25, "step": 8780, "total_num_tokens": 75574038.0, "z_loss": 0.0004560817906167358 }, { "copy_logits_max": -2.069183826446533, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.875, "epoch": 1.7932601480725046, "gen_logits_max": 2.5603950023651123, "gen_logits_mean": -17.637958526611328, "gen_logits_min": -30.243282318115234, "gen_logits_std": 3.440446376800537, "gen_loss": 0.25435832142829895, "grad_norm": 0.3477274689391405, "learning_rate": 1.9837894736842108e-05, "loss": 0.2638, "mean_copy_accuracy": 0.997776210308075, "mean_gen_accuracy": 0.8766215145587921, "mean_token_accuracy": 0.9089406877756119, "num_tokens": 75795500.0, "sample_num_tokens": 9330.5, "step": 8781, "total_num_tokens": 75832822.0, "z_loss": 0.00039571610977873206 }, { "copy_logits_max": -4.323574066162109, "copy_logits_min": -750000064.0, "copy_num_tokens": 314.5625, "epoch": 1.7934643860097013, "gen_logits_max": 3.8624274730682373, "gen_logits_mean": -17.359294891357422, "gen_logits_min": -29.47449493408203, "gen_logits_std": 3.3972039222717285, "gen_loss": 0.2985861003398895, "grad_norm": 0.3789941497004613, "learning_rate": 1.983663157894737e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9976341724395752, "mean_gen_accuracy": 0.8741302788257599, "mean_token_accuracy": 0.909141942858696, "num_tokens": 76073319.0, "sample_num_tokens": 6646.75, "step": 8782, "total_num_tokens": 76099906.0, "z_loss": 0.00040975038427859545 }, { "copy_logits_max": -1.2088497877120972, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.9375, "epoch": 1.7936686239468982, "gen_logits_max": 4.65789794921875, "gen_logits_mean": -14.80119514465332, "gen_logits_min": -27.739521026611328, "gen_logits_std": 3.2967681884765625, "gen_loss": 0.30456316471099854, "grad_norm": 0.38711468488355366, "learning_rate": 1.9835368421052633e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9965504705905914, "mean_gen_accuracy": 0.8750685602426529, "mean_token_accuracy": 0.9038192182779312, "num_tokens": 76318805.0, "sample_num_tokens": 8060.75, "step": 8783, "total_num_tokens": 76351048.0, "z_loss": 0.0004918075865134597 }, { "copy_logits_max": -0.7605217695236206, "copy_logits_min": -687500032.0, "copy_num_tokens": 688.1875, "epoch": 1.7938728618840951, "gen_logits_max": 2.430962562561035, "gen_logits_mean": -16.973995208740234, "gen_logits_min": -29.912181854248047, "gen_logits_std": 3.434502363204956, "gen_loss": 0.2192879617214203, "grad_norm": 0.34082146333410845, "learning_rate": 1.9834105263157894e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9972606152296066, "mean_gen_accuracy": 0.8770117908716202, "mean_token_accuracy": 0.9093419462442398, "num_tokens": 76577857.0, "sample_num_tokens": 9085.75, "step": 8784, "total_num_tokens": 76614200.0, "z_loss": 0.00038906570989638567 }, { "copy_logits_max": -1.603834867477417, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.0, "epoch": 1.7940770998212918, "gen_logits_max": 4.224052906036377, "gen_logits_mean": -15.469783782958984, "gen_logits_min": -27.823528289794922, "gen_logits_std": 3.3420121669769287, "gen_loss": 0.2591469883918762, "grad_norm": 0.3624250010421482, "learning_rate": 1.983284210526316e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9975293725728989, "mean_gen_accuracy": 0.8776561468839645, "mean_token_accuracy": 0.9068926870822906, "num_tokens": 76835262.0, "sample_num_tokens": 8325.0, "step": 8785, "total_num_tokens": 76868562.0, "z_loss": 0.0003742396947927773 }, { "copy_logits_max": -4.337722301483154, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.9375, "epoch": 1.7942813377584885, "gen_logits_max": 2.6955697536468506, "gen_logits_mean": -18.35303497314453, "gen_logits_min": -30.65483856201172, "gen_logits_std": 3.4401333332061768, "gen_loss": 0.2524533271789551, "grad_norm": 0.34438301309939967, "learning_rate": 1.983157894736842e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9973420202732086, "mean_gen_accuracy": 0.8817407488822937, "mean_token_accuracy": 0.9115425944328308, "num_tokens": 77105931.0, "sample_num_tokens": 8399.25, "step": 8786, "total_num_tokens": 77139528.0, "z_loss": 0.00039433548226952553 }, { "copy_logits_max": 0.23653042316436768, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.5, "epoch": 1.7944855756956855, "gen_logits_max": 4.19218635559082, "gen_logits_mean": -16.271446228027344, "gen_logits_min": -27.96906089782715, "gen_logits_std": 3.3057668209075928, "gen_loss": 0.295650839805603, "grad_norm": 0.3576804064695546, "learning_rate": 1.9830315789473684e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9970786720514297, "mean_gen_accuracy": 0.8759512901306152, "mean_token_accuracy": 0.9035482406616211, "num_tokens": 77344845.0, "sample_num_tokens": 8263.25, "step": 8787, "total_num_tokens": 77377898.0, "z_loss": 0.00046301589463837445 }, { "copy_logits_max": -2.266880989074707, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.8125, "epoch": 1.7946898136328824, "gen_logits_max": 2.80812406539917, "gen_logits_mean": -17.916807174682617, "gen_logits_min": -29.923810958862305, "gen_logits_std": 3.387950897216797, "gen_loss": 0.30681872367858887, "grad_norm": 0.36501879940700926, "learning_rate": 1.9829052631578948e-05, "loss": 0.265, "mean_copy_accuracy": 0.9971064478158951, "mean_gen_accuracy": 0.8769414573907852, "mean_token_accuracy": 0.9103269279003143, "num_tokens": 77629031.0, "sample_num_tokens": 9502.25, "step": 8788, "total_num_tokens": 77667040.0, "z_loss": 0.0004777720896527171 }, { "copy_logits_max": -6.815559387207031, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.625, "epoch": 1.794894051570079, "gen_logits_max": 4.044792652130127, "gen_logits_mean": -17.610050201416016, "gen_logits_min": -29.466785430908203, "gen_logits_std": 3.401258707046509, "gen_loss": 0.24741263687610626, "grad_norm": 0.33522370610622626, "learning_rate": 1.9827789473684212e-05, "loss": 0.263, "mean_copy_accuracy": 0.9980130940675735, "mean_gen_accuracy": 0.8826935589313507, "mean_token_accuracy": 0.9092295318841934, "num_tokens": 77895883.0, "sample_num_tokens": 7093.25, "step": 8789, "total_num_tokens": 77924256.0, "z_loss": 0.0003995063598267734 }, { "copy_logits_max": -0.9489380121231079, "copy_logits_min": -750000064.0, "copy_num_tokens": 413.0, "epoch": 1.795098289507276, "gen_logits_max": 3.435447931289673, "gen_logits_mean": -16.491249084472656, "gen_logits_min": -28.51382827758789, "gen_logits_std": 3.309458017349243, "gen_loss": 0.29746779799461365, "grad_norm": 0.3676260462082994, "learning_rate": 1.9826526315789473e-05, "loss": 0.2903, "mean_copy_accuracy": 0.9974965453147888, "mean_gen_accuracy": 0.8680679053068161, "mean_token_accuracy": 0.9023966789245605, "num_tokens": 78150237.0, "sample_num_tokens": 7309.75, "step": 8790, "total_num_tokens": 78179476.0, "z_loss": 0.0005391084123402834 }, { "copy_logits_max": -5.597298622131348, "copy_logits_min": -750000000.0, "copy_num_tokens": 212.25, "epoch": 1.795302527444473, "gen_logits_max": 4.793804168701172, "gen_logits_mean": -16.445449829101562, "gen_logits_min": -27.968379974365234, "gen_logits_std": 3.277194023132324, "gen_loss": 0.3198612928390503, "grad_norm": 0.35450366499077546, "learning_rate": 1.9825263157894738e-05, "loss": 0.292, "mean_copy_accuracy": 0.9967593252658844, "mean_gen_accuracy": 0.8739960938692093, "mean_token_accuracy": 0.899332270026207, "num_tokens": 78412615.0, "sample_num_tokens": 7412.75, "step": 8791, "total_num_tokens": 78442266.0, "z_loss": 0.0004977229400537908 }, { "copy_logits_max": -2.4042797088623047, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.625, "epoch": 1.7955067653816696, "gen_logits_max": 3.3828177452087402, "gen_logits_mean": -15.86367416381836, "gen_logits_min": -27.745487213134766, "gen_logits_std": 3.237910032272339, "gen_loss": 0.2853023409843445, "grad_norm": 0.35368641867944123, "learning_rate": 1.9824000000000002e-05, "loss": 0.2568, "mean_copy_accuracy": 0.9965256750583649, "mean_gen_accuracy": 0.8819977790117264, "mean_token_accuracy": 0.9119357019662857, "num_tokens": 78682080.0, "sample_num_tokens": 7921.5, "step": 8792, "total_num_tokens": 78713766.0, "z_loss": 0.00047735372208990157 }, { "copy_logits_max": -5.928006172180176, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.0625, "epoch": 1.7957110033188664, "gen_logits_max": 3.011833667755127, "gen_logits_mean": -16.490589141845703, "gen_logits_min": -28.96649742126465, "gen_logits_std": 3.332801103591919, "gen_loss": 0.25192445516586304, "grad_norm": 0.3570947108327031, "learning_rate": 1.9822736842105263e-05, "loss": 0.287, "mean_copy_accuracy": 0.9964796006679535, "mean_gen_accuracy": 0.8738854229450226, "mean_token_accuracy": 0.9019816517829895, "num_tokens": 78953365.0, "sample_num_tokens": 9065.75, "step": 8793, "total_num_tokens": 78989628.0, "z_loss": 0.000409053114708513 }, { "copy_logits_max": -5.213689804077148, "copy_logits_min": -750000000.0, "copy_num_tokens": 396.8125, "epoch": 1.7959152412560633, "gen_logits_max": 3.132873058319092, "gen_logits_mean": -18.193870544433594, "gen_logits_min": -30.01883316040039, "gen_logits_std": 3.3796091079711914, "gen_loss": 0.2885584235191345, "grad_norm": 0.3412927569081796, "learning_rate": 1.9821473684210527e-05, "loss": 0.257, "mean_copy_accuracy": 0.996583953499794, "mean_gen_accuracy": 0.8803844004869461, "mean_token_accuracy": 0.9105750173330307, "num_tokens": 79225590.0, "sample_num_tokens": 8124.0, "step": 8794, "total_num_tokens": 79258086.0, "z_loss": 0.00043262908002361655 }, { "copy_logits_max": -6.118361473083496, "copy_logits_min": -687500032.0, "copy_num_tokens": 406.3125, "epoch": 1.7961194791932602, "gen_logits_max": 3.8527355194091797, "gen_logits_mean": -17.433794021606445, "gen_logits_min": -29.567134857177734, "gen_logits_std": 3.386885643005371, "gen_loss": 0.25655683875083923, "grad_norm": 0.3364276460068062, "learning_rate": 1.9820210526315788e-05, "loss": 0.266, "mean_copy_accuracy": 0.9968205094337463, "mean_gen_accuracy": 0.8827926963567734, "mean_token_accuracy": 0.9093623757362366, "num_tokens": 79500204.0, "sample_num_tokens": 8251.5, "step": 8795, "total_num_tokens": 79533210.0, "z_loss": 0.0003905578632839024 }, { "copy_logits_max": -1.9694465398788452, "copy_logits_min": -687500032.0, "copy_num_tokens": 682.875, "epoch": 1.796323717130457, "gen_logits_max": 2.640693426132202, "gen_logits_mean": -17.2696533203125, "gen_logits_min": -29.29568099975586, "gen_logits_std": 3.3796427249908447, "gen_loss": 0.23705294728279114, "grad_norm": 0.350132727158512, "learning_rate": 1.9818947368421056e-05, "loss": 0.25, "mean_copy_accuracy": 0.9971840977668762, "mean_gen_accuracy": 0.8829115182161331, "mean_token_accuracy": 0.9160712361335754, "num_tokens": 79775335.0, "sample_num_tokens": 8882.25, "step": 8796, "total_num_tokens": 79810864.0, "z_loss": 0.0004292410740163177 }, { "copy_logits_max": -3.524445056915283, "copy_logits_min": -750000064.0, "copy_num_tokens": 579.625, "epoch": 1.7965279550676538, "gen_logits_max": 3.5911948680877686, "gen_logits_mean": -15.420289993286133, "gen_logits_min": -27.450117111206055, "gen_logits_std": 3.305436849594116, "gen_loss": 0.2791633605957031, "grad_norm": 0.3539097342514143, "learning_rate": 1.9817684210526317e-05, "loss": 0.2823, "mean_copy_accuracy": 0.997079610824585, "mean_gen_accuracy": 0.8765440881252289, "mean_token_accuracy": 0.90493243932724, "num_tokens": 80024546.0, "sample_num_tokens": 8594.5, "step": 8797, "total_num_tokens": 80058924.0, "z_loss": 0.0004462254873942584 }, { "copy_logits_max": -6.004763603210449, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.8125, "epoch": 1.7967321930048508, "gen_logits_max": 4.283738136291504, "gen_logits_mean": -16.5994930267334, "gen_logits_min": -28.088403701782227, "gen_logits_std": 3.310635805130005, "gen_loss": 0.3056984543800354, "grad_norm": 0.35175043421942376, "learning_rate": 1.981642105263158e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9968306869268417, "mean_gen_accuracy": 0.8755415081977844, "mean_token_accuracy": 0.9082885682582855, "num_tokens": 80302452.0, "sample_num_tokens": 8463.0, "step": 8798, "total_num_tokens": 80336304.0, "z_loss": 0.00046394727542065084 }, { "copy_logits_max": -2.630275249481201, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.0, "epoch": 1.7969364309420475, "gen_logits_max": 3.0168983936309814, "gen_logits_mean": -17.29078483581543, "gen_logits_min": -29.252460479736328, "gen_logits_std": 3.4206862449645996, "gen_loss": 0.23536530137062073, "grad_norm": 0.3492010776482274, "learning_rate": 1.9815157894736842e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9961102455854416, "mean_gen_accuracy": 0.8774266988039017, "mean_token_accuracy": 0.9042274951934814, "num_tokens": 80560284.0, "sample_num_tokens": 9855.5, "step": 8799, "total_num_tokens": 80599706.0, "z_loss": 0.0003902305325027555 }, { "copy_logits_max": -2.9647750854492188, "copy_logits_min": -687500032.0, "copy_num_tokens": 637.1875, "epoch": 1.7971406688792442, "gen_logits_max": 2.121781349182129, "gen_logits_mean": -17.707481384277344, "gen_logits_min": -29.799081802368164, "gen_logits_std": 3.3976614475250244, "gen_loss": 0.21512335538864136, "grad_norm": 0.3270807744604917, "learning_rate": 1.9813894736842106e-05, "loss": 0.2543, "mean_copy_accuracy": 0.9979690611362457, "mean_gen_accuracy": 0.8823859840631485, "mean_token_accuracy": 0.9133002161979675, "num_tokens": 80830725.0, "sample_num_tokens": 8146.25, "step": 8800, "total_num_tokens": 80863310.0, "z_loss": 0.0003799714904744178 }, { "copy_logits_max": -5.526813507080078, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5625, "epoch": 1.797344906816441, "gen_logits_max": 3.157454490661621, "gen_logits_mean": -18.191118240356445, "gen_logits_min": -30.299013137817383, "gen_logits_std": 3.4343862533569336, "gen_loss": 0.24568983912467957, "grad_norm": 0.32622437190481435, "learning_rate": 1.9812631578947367e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9981847256422043, "mean_gen_accuracy": 0.8834904581308365, "mean_token_accuracy": 0.9108077883720398, "num_tokens": 81104640.0, "sample_num_tokens": 8783.5, "step": 8801, "total_num_tokens": 81139774.0, "z_loss": 0.0003557087038643658 }, { "copy_logits_max": -3.6864256858825684, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5, "epoch": 1.797549144753638, "gen_logits_max": 3.8579893112182617, "gen_logits_mean": -15.525063514709473, "gen_logits_min": -27.315914154052734, "gen_logits_std": 3.273346424102783, "gen_loss": 0.2698395550251007, "grad_norm": 0.3692197704304153, "learning_rate": 1.981136842105263e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9967968016862869, "mean_gen_accuracy": 0.8808174431324005, "mean_token_accuracy": 0.9084146022796631, "num_tokens": 81369356.0, "sample_num_tokens": 8548.0, "step": 8802, "total_num_tokens": 81403548.0, "z_loss": 0.0003801886341534555 }, { "copy_logits_max": -2.621438980102539, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.9375, "epoch": 1.7977533826908347, "gen_logits_max": 3.6644272804260254, "gen_logits_mean": -16.558412551879883, "gen_logits_min": -28.47955894470215, "gen_logits_std": 3.307734727859497, "gen_loss": 0.27620935440063477, "grad_norm": 0.37531745891751783, "learning_rate": 1.9810105263157892e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9966799318790436, "mean_gen_accuracy": 0.8796282857656479, "mean_token_accuracy": 0.9045475572347641, "num_tokens": 81629804.0, "sample_num_tokens": 8809.0, "step": 8803, "total_num_tokens": 81665040.0, "z_loss": 0.00038761633913964033 }, { "copy_logits_max": -2.2116119861602783, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.0, "epoch": 1.7979576206280317, "gen_logits_max": 3.9977669715881348, "gen_logits_mean": -17.22454071044922, "gen_logits_min": -29.208412170410156, "gen_logits_std": 3.3752241134643555, "gen_loss": 0.30417466163635254, "grad_norm": 0.37715890351424847, "learning_rate": 1.980884210526316e-05, "loss": 0.2904, "mean_copy_accuracy": 0.996736690402031, "mean_gen_accuracy": 0.8780513405799866, "mean_token_accuracy": 0.9032341986894608, "num_tokens": 81894905.0, "sample_num_tokens": 8845.25, "step": 8804, "total_num_tokens": 81930286.0, "z_loss": 0.0004263566224835813 }, { "copy_logits_max": -0.22286880016326904, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.75, "epoch": 1.7981618585652286, "gen_logits_max": 3.866044521331787, "gen_logits_mean": -16.182891845703125, "gen_logits_min": -28.28030014038086, "gen_logits_std": 3.334580183029175, "gen_loss": 0.2657148241996765, "grad_norm": 0.3789491844780978, "learning_rate": 1.980757894736842e-05, "loss": 0.2806, "mean_copy_accuracy": 0.997559443116188, "mean_gen_accuracy": 0.8720488101243973, "mean_token_accuracy": 0.9066243171691895, "num_tokens": 82165789.0, "sample_num_tokens": 7742.75, "step": 8805, "total_num_tokens": 82196760.0, "z_loss": 0.0003921202151104808 }, { "copy_logits_max": -0.8766778707504272, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.5, "epoch": 1.7983660965024253, "gen_logits_max": 3.0555038452148438, "gen_logits_mean": -17.227659225463867, "gen_logits_min": -29.499467849731445, "gen_logits_std": 3.409609317779541, "gen_loss": 0.23515695333480835, "grad_norm": 0.3208646438765267, "learning_rate": 1.9806315789473685e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9969616681337357, "mean_gen_accuracy": 0.8780185729265213, "mean_token_accuracy": 0.9075102508068085, "num_tokens": 82439306.0, "sample_num_tokens": 9201.5, "step": 8806, "total_num_tokens": 82476112.0, "z_loss": 0.00038421800127252936 }, { "copy_logits_max": -1.3622043132781982, "copy_logits_min": -750000000.0, "copy_num_tokens": 571.3125, "epoch": 1.798570334439622, "gen_logits_max": 3.6857552528381348, "gen_logits_mean": -16.705562591552734, "gen_logits_min": -29.144689559936523, "gen_logits_std": 3.3725056648254395, "gen_loss": 0.251327246427536, "grad_norm": 0.3533120324437764, "learning_rate": 1.980505263157895e-05, "loss": 0.2843, "mean_copy_accuracy": 0.997536301612854, "mean_gen_accuracy": 0.8710668683052063, "mean_token_accuracy": 0.9028414487838745, "num_tokens": 82716703.0, "sample_num_tokens": 9805.25, "step": 8807, "total_num_tokens": 82755924.0, "z_loss": 0.0004042559303343296 }, { "copy_logits_max": -2.646059274673462, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.9375, "epoch": 1.7987745723768191, "gen_logits_max": 3.115445137023926, "gen_logits_mean": -18.355255126953125, "gen_logits_min": -30.451488494873047, "gen_logits_std": 3.4574012756347656, "gen_loss": 0.24491292238235474, "grad_norm": 0.3740219464734892, "learning_rate": 1.980378947368421e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9972390234470367, "mean_gen_accuracy": 0.8782553970813751, "mean_token_accuracy": 0.9088546633720398, "num_tokens": 82976528.0, "sample_num_tokens": 7677.5, "step": 8808, "total_num_tokens": 83007238.0, "z_loss": 0.0004336509737186134 }, { "copy_logits_max": 0.06017649173736572, "copy_logits_min": -750000064.0, "copy_num_tokens": 477.5625, "epoch": 1.7989788103140159, "gen_logits_max": 4.629613876342773, "gen_logits_mean": -15.909309387207031, "gen_logits_min": -27.589685440063477, "gen_logits_std": 3.308643341064453, "gen_loss": 0.27050161361694336, "grad_norm": 0.3743281171029053, "learning_rate": 1.9802526315789475e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9968571066856384, "mean_gen_accuracy": 0.8804996013641357, "mean_token_accuracy": 0.907550185918808, "num_tokens": 83233308.0, "sample_num_tokens": 8816.5, "step": 8809, "total_num_tokens": 83268574.0, "z_loss": 0.0004249264020472765 }, { "copy_logits_max": -1.518162727355957, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.4375, "epoch": 1.7991830482512126, "gen_logits_max": 4.174036502838135, "gen_logits_mean": -15.999905586242676, "gen_logits_min": -27.969566345214844, "gen_logits_std": 3.3293585777282715, "gen_loss": 0.30556368827819824, "grad_norm": 0.353646835233528, "learning_rate": 1.9801263157894736e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9970738440752029, "mean_gen_accuracy": 0.8693225830793381, "mean_token_accuracy": 0.904725506901741, "num_tokens": 83519847.0, "sample_num_tokens": 8168.25, "step": 8810, "total_num_tokens": 83552520.0, "z_loss": 0.0004817426670342684 }, { "copy_logits_max": -2.3491933345794678, "copy_logits_min": -687500032.0, "copy_num_tokens": 320.75, "epoch": 1.7993872861884095, "gen_logits_max": 5.474982261657715, "gen_logits_mean": -15.06938648223877, "gen_logits_min": -26.79956817626953, "gen_logits_std": 3.285313129425049, "gen_loss": 0.3340646028518677, "grad_norm": 0.3758852637826032, "learning_rate": 1.98e-05, "loss": 0.3007, "mean_copy_accuracy": 0.9965441823005676, "mean_gen_accuracy": 0.8716534823179245, "mean_token_accuracy": 0.8994642943143845, "num_tokens": 83761232.0, "sample_num_tokens": 7724.0, "step": 8811, "total_num_tokens": 83792128.0, "z_loss": 0.0005426447023637593 }, { "copy_logits_max": -1.60776948928833, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.4375, "epoch": 1.7995915241256064, "gen_logits_max": 4.066192626953125, "gen_logits_mean": -16.066482543945312, "gen_logits_min": -28.219654083251953, "gen_logits_std": 3.331747055053711, "gen_loss": 0.269311785697937, "grad_norm": 0.362129362971808, "learning_rate": 1.9798736842105264e-05, "loss": 0.2857, "mean_copy_accuracy": 0.996252566576004, "mean_gen_accuracy": 0.8734541237354279, "mean_token_accuracy": 0.9023082107305527, "num_tokens": 84018944.0, "sample_num_tokens": 8223.5, "step": 8812, "total_num_tokens": 84051838.0, "z_loss": 0.0004381687904242426 }, { "copy_logits_max": -4.011402606964111, "copy_logits_min": -687500032.0, "copy_num_tokens": 438.0, "epoch": 1.7997957620628031, "gen_logits_max": 4.427090644836426, "gen_logits_mean": -16.203285217285156, "gen_logits_min": -28.304454803466797, "gen_logits_std": 3.3845102787017822, "gen_loss": 0.2673623561859131, "grad_norm": 0.36787606318104543, "learning_rate": 1.979747368421053e-05, "loss": 0.267, "mean_copy_accuracy": 0.9955413341522217, "mean_gen_accuracy": 0.8838100135326385, "mean_token_accuracy": 0.9077344536781311, "num_tokens": 84272104.0, "sample_num_tokens": 8012.5, "step": 8813, "total_num_tokens": 84304154.0, "z_loss": 0.00042693805880844593 }, { "copy_logits_max": -5.096817970275879, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.5625, "epoch": 1.8, "gen_logits_max": 3.339446783065796, "gen_logits_mean": -18.300329208374023, "gen_logits_min": -30.76995086669922, "gen_logits_std": 3.4257285594940186, "gen_loss": 0.28315380215644836, "grad_norm": 0.35169439485085807, "learning_rate": 1.979621052631579e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9968578815460205, "mean_gen_accuracy": 0.8763038814067841, "mean_token_accuracy": 0.9056408554315567, "num_tokens": 84538388.0, "sample_num_tokens": 8353.5, "step": 8814, "total_num_tokens": 84571802.0, "z_loss": 0.00044478982454165816 }, { "copy_logits_max": -2.7589728832244873, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.6875, "epoch": 1.800204237937197, "gen_logits_max": 4.686063766479492, "gen_logits_mean": -16.052459716796875, "gen_logits_min": -28.053735733032227, "gen_logits_std": 3.3239598274230957, "gen_loss": 0.33061403036117554, "grad_norm": 0.37912899398277033, "learning_rate": 1.9794947368421054e-05, "loss": 0.3028, "mean_copy_accuracy": 0.9971002489328384, "mean_gen_accuracy": 0.8678438365459442, "mean_token_accuracy": 0.8967870473861694, "num_tokens": 84784133.0, "sample_num_tokens": 8147.75, "step": 8815, "total_num_tokens": 84816724.0, "z_loss": 0.0005072243511676788 }, { "copy_logits_max": -4.1364216804504395, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.4375, "epoch": 1.8004084758743937, "gen_logits_max": 2.788350820541382, "gen_logits_mean": -17.982385635375977, "gen_logits_min": -30.094425201416016, "gen_logits_std": 3.4384288787841797, "gen_loss": 0.2658132314682007, "grad_norm": 0.36331593839326115, "learning_rate": 1.9793684210526315e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9972459971904755, "mean_gen_accuracy": 0.8739689737558365, "mean_token_accuracy": 0.9046328365802765, "num_tokens": 85046550.0, "sample_num_tokens": 9264.5, "step": 8816, "total_num_tokens": 85083608.0, "z_loss": 0.0003947039949707687 }, { "copy_logits_max": -5.168900966644287, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.5625, "epoch": 1.8006127138115904, "gen_logits_max": 2.4796347618103027, "gen_logits_mean": -18.656003952026367, "gen_logits_min": -30.95209503173828, "gen_logits_std": 3.4669225215911865, "gen_loss": 0.23396608233451843, "grad_norm": 0.34266496849029, "learning_rate": 1.979242105263158e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9976188838481903, "mean_gen_accuracy": 0.8776950091123581, "mean_token_accuracy": 0.9058683663606644, "num_tokens": 85321659.0, "sample_num_tokens": 8923.25, "step": 8817, "total_num_tokens": 85357352.0, "z_loss": 0.0003503516491036862 }, { "copy_logits_max": -2.5698070526123047, "copy_logits_min": -750000064.0, "copy_num_tokens": 443.1875, "epoch": 1.8008169517487873, "gen_logits_max": 3.240002155303955, "gen_logits_mean": -16.731185913085938, "gen_logits_min": -29.247791290283203, "gen_logits_std": 3.3629279136657715, "gen_loss": 0.26743656396865845, "grad_norm": 0.32244958719409617, "learning_rate": 1.979115789473684e-05, "loss": 0.2558, "mean_copy_accuracy": 0.996918186545372, "mean_gen_accuracy": 0.8840651512145996, "mean_token_accuracy": 0.9121329337358475, "num_tokens": 85617217.0, "sample_num_tokens": 7824.75, "step": 8818, "total_num_tokens": 85648516.0, "z_loss": 0.0004322900203987956 }, { "copy_logits_max": -7.0168375968933105, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.5625, "epoch": 1.8010211896859842, "gen_logits_max": 2.421020984649658, "gen_logits_mean": -18.86542510986328, "gen_logits_min": -31.09453582763672, "gen_logits_std": 3.468507766723633, "gen_loss": 0.2632831335067749, "grad_norm": 0.3539624010142168, "learning_rate": 1.9789894736842104e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9971489459276199, "mean_gen_accuracy": 0.8787263333797455, "mean_token_accuracy": 0.907727524638176, "num_tokens": 85875543.0, "sample_num_tokens": 7989.25, "step": 8819, "total_num_tokens": 85907500.0, "z_loss": 0.0003606376121751964 }, { "copy_logits_max": -4.401411533355713, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.9375, "epoch": 1.801225427623181, "gen_logits_max": 2.8292949199676514, "gen_logits_mean": -18.027816772460938, "gen_logits_min": -30.095062255859375, "gen_logits_std": 3.448363780975342, "gen_loss": 0.2753450274467468, "grad_norm": 0.3675400626369117, "learning_rate": 1.978863157894737e-05, "loss": 0.2543, "mean_copy_accuracy": 0.9979292750358582, "mean_gen_accuracy": 0.8812807947397232, "mean_token_accuracy": 0.9115456193685532, "num_tokens": 86144951.0, "sample_num_tokens": 7741.25, "step": 8820, "total_num_tokens": 86175916.0, "z_loss": 0.0003736908547580242 }, { "copy_logits_max": -3.1783270835876465, "copy_logits_min": -750000064.0, "copy_num_tokens": 485.8125, "epoch": 1.8014296655603779, "gen_logits_max": 3.8836841583251953, "gen_logits_mean": -16.186214447021484, "gen_logits_min": -28.526636123657227, "gen_logits_std": 3.3525304794311523, "gen_loss": 0.26356738805770874, "grad_norm": 0.3746231540805691, "learning_rate": 1.9787368421052633e-05, "loss": 0.2651, "mean_copy_accuracy": 0.9968962073326111, "mean_gen_accuracy": 0.884758859872818, "mean_token_accuracy": 0.9092893600463867, "num_tokens": 86408526.0, "sample_num_tokens": 9398.0, "step": 8821, "total_num_tokens": 86446118.0, "z_loss": 0.00038860252243466675 }, { "copy_logits_max": -5.389293670654297, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.5, "epoch": 1.8016339034975748, "gen_logits_max": 3.396484136581421, "gen_logits_mean": -18.254730224609375, "gen_logits_min": -30.538002014160156, "gen_logits_std": 3.4543843269348145, "gen_loss": 0.30875062942504883, "grad_norm": 0.3981484082907668, "learning_rate": 1.9786105263157897e-05, "loss": 0.3002, "mean_copy_accuracy": 0.9972517937421799, "mean_gen_accuracy": 0.869523212313652, "mean_token_accuracy": 0.8968213498592377, "num_tokens": 86667217.0, "sample_num_tokens": 7891.25, "step": 8822, "total_num_tokens": 86698782.0, "z_loss": 0.00047058521886356175 }, { "copy_logits_max": -3.3598368167877197, "copy_logits_min": -750000000.0, "copy_num_tokens": 715.25, "epoch": 1.8018381414347715, "gen_logits_max": 4.094786643981934, "gen_logits_mean": -15.9976224899292, "gen_logits_min": -28.520824432373047, "gen_logits_std": 3.398101329803467, "gen_loss": 0.25386252999305725, "grad_norm": 0.3727202260642308, "learning_rate": 1.978484210526316e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9967912286520004, "mean_gen_accuracy": 0.8762286305427551, "mean_token_accuracy": 0.9026339799165726, "num_tokens": 86910036.0, "sample_num_tokens": 9848.5, "step": 8823, "total_num_tokens": 86949430.0, "z_loss": 0.00037241095560602844 }, { "copy_logits_max": -6.2921037673950195, "copy_logits_min": -750000128.0, "copy_num_tokens": 374.25, "epoch": 1.8020423793719682, "gen_logits_max": 3.346231698989868, "gen_logits_mean": -18.23162269592285, "gen_logits_min": -30.205493927001953, "gen_logits_std": 3.446298122406006, "gen_loss": 0.284535676240921, "grad_norm": 0.3375472093152153, "learning_rate": 1.9783578947368423e-05, "loss": 0.2647, "mean_copy_accuracy": 0.9958456456661224, "mean_gen_accuracy": 0.8839265555143356, "mean_token_accuracy": 0.9097416549921036, "num_tokens": 87186700.0, "sample_num_tokens": 7640.5, "step": 8824, "total_num_tokens": 87217262.0, "z_loss": 0.00042637926526367664 }, { "copy_logits_max": -7.391852855682373, "copy_logits_min": -687500032.0, "copy_num_tokens": 318.25, "epoch": 1.8022466173091651, "gen_logits_max": 4.109145164489746, "gen_logits_mean": -17.618144989013672, "gen_logits_min": -29.499786376953125, "gen_logits_std": 3.391693592071533, "gen_loss": 0.27950093150138855, "grad_norm": 0.36168196566752503, "learning_rate": 1.9782315789473684e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9964542090892792, "mean_gen_accuracy": 0.8844925165176392, "mean_token_accuracy": 0.910170391201973, "num_tokens": 87457488.0, "sample_num_tokens": 7399.5, "step": 8825, "total_num_tokens": 87487086.0, "z_loss": 0.00040804105810821056 }, { "copy_logits_max": -2.8807272911071777, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.4375, "epoch": 1.802450855246362, "gen_logits_max": 3.416590690612793, "gen_logits_mean": -17.012731552124023, "gen_logits_min": -29.383949279785156, "gen_logits_std": 3.4106574058532715, "gen_loss": 0.2653568387031555, "grad_norm": 0.36195410802700834, "learning_rate": 1.9781052631578948e-05, "loss": 0.2462, "mean_copy_accuracy": 0.9975837469100952, "mean_gen_accuracy": 0.8839211910963058, "mean_token_accuracy": 0.9163389056921005, "num_tokens": 87734917.0, "sample_num_tokens": 9006.75, "step": 8826, "total_num_tokens": 87770944.0, "z_loss": 0.0004106643027625978 }, { "copy_logits_max": -4.023632049560547, "copy_logits_min": -687500032.0, "copy_num_tokens": 428.125, "epoch": 1.8026550931835588, "gen_logits_max": 3.8473737239837646, "gen_logits_mean": -17.34369659423828, "gen_logits_min": -29.737180709838867, "gen_logits_std": 3.425966739654541, "gen_loss": 0.30882298946380615, "grad_norm": 0.36956979217132196, "learning_rate": 1.977978947368421e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9978751987218857, "mean_gen_accuracy": 0.8772673159837723, "mean_token_accuracy": 0.9071658253669739, "num_tokens": 87999376.0, "sample_num_tokens": 8078.0, "step": 8827, "total_num_tokens": 88031688.0, "z_loss": 0.0005337685579434037 }, { "copy_logits_max": -3.0302467346191406, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.8125, "epoch": 1.8028593311207557, "gen_logits_max": 3.3312063217163086, "gen_logits_mean": -17.86971664428711, "gen_logits_min": -30.160600662231445, "gen_logits_std": 3.457566261291504, "gen_loss": 0.2501208186149597, "grad_norm": 0.33170140484893934, "learning_rate": 1.9778526315789473e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9972534626722336, "mean_gen_accuracy": 0.8761521875858307, "mean_token_accuracy": 0.9079630076885223, "num_tokens": 88264223.0, "sample_num_tokens": 8681.25, "step": 8828, "total_num_tokens": 88298948.0, "z_loss": 0.00043620276846922934 }, { "copy_logits_max": -3.2944252490997314, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.4375, "epoch": 1.8030635690579526, "gen_logits_max": 2.557520866394043, "gen_logits_mean": -18.17026138305664, "gen_logits_min": -30.369747161865234, "gen_logits_std": 3.4940524101257324, "gen_loss": 0.225373774766922, "grad_norm": 0.37197109744408724, "learning_rate": 1.9777263157894737e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9972013384103775, "mean_gen_accuracy": 0.8746680170297623, "mean_token_accuracy": 0.9062971025705338, "num_tokens": 88532902.0, "sample_num_tokens": 8739.5, "step": 8829, "total_num_tokens": 88567860.0, "z_loss": 0.0003956913133151829 }, { "copy_logits_max": -3.0753045082092285, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.25, "epoch": 1.8032678069951493, "gen_logits_max": 4.2800493240356445, "gen_logits_mean": -16.7392635345459, "gen_logits_min": -28.935054779052734, "gen_logits_std": 3.4091174602508545, "gen_loss": 0.27910149097442627, "grad_norm": 0.33210640034038735, "learning_rate": 1.9776000000000002e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9967963993549347, "mean_gen_accuracy": 0.8780039548873901, "mean_token_accuracy": 0.908241480588913, "num_tokens": 88825530.0, "sample_num_tokens": 8581.5, "step": 8830, "total_num_tokens": 88859856.0, "z_loss": 0.0004623135318979621 }, { "copy_logits_max": -5.345758438110352, "copy_logits_min": -750000000.0, "copy_num_tokens": 683.5625, "epoch": 1.803472044932346, "gen_logits_max": 2.0222487449645996, "gen_logits_mean": -18.77223014831543, "gen_logits_min": -31.02932357788086, "gen_logits_std": 3.5144457817077637, "gen_loss": 0.22695642709732056, "grad_norm": 0.35490194029605093, "learning_rate": 1.9774736842105263e-05, "loss": 0.2803, "mean_copy_accuracy": 0.996965765953064, "mean_gen_accuracy": 0.8751942962408066, "mean_token_accuracy": 0.90370874106884, "num_tokens": 89097113.0, "sample_num_tokens": 9493.25, "step": 8831, "total_num_tokens": 89135086.0, "z_loss": 0.000368406530469656 }, { "copy_logits_max": -5.5639543533325195, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.75, "epoch": 1.8036762828695432, "gen_logits_max": 3.2579917907714844, "gen_logits_mean": -18.166519165039062, "gen_logits_min": -30.49858283996582, "gen_logits_std": 3.446139335632324, "gen_loss": 0.2814180254936218, "grad_norm": 0.3650116630291561, "learning_rate": 1.9773473684210527e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9963855445384979, "mean_gen_accuracy": 0.874687522649765, "mean_token_accuracy": 0.9004562944173813, "num_tokens": 89369874.0, "sample_num_tokens": 8774.5, "step": 8832, "total_num_tokens": 89404972.0, "z_loss": 0.00040219721267931163 }, { "copy_logits_max": -4.2978925704956055, "copy_logits_min": -750000128.0, "copy_num_tokens": 351.375, "epoch": 1.8038805208067399, "gen_logits_max": 4.3558502197265625, "gen_logits_mean": -16.384153366088867, "gen_logits_min": -28.920310974121094, "gen_logits_std": 3.4003615379333496, "gen_loss": 0.2678242325782776, "grad_norm": 0.34818238240198335, "learning_rate": 1.977221052631579e-05, "loss": 0.2746, "mean_copy_accuracy": 0.9973815083503723, "mean_gen_accuracy": 0.8827317953109741, "mean_token_accuracy": 0.9067595452070236, "num_tokens": 89613268.0, "sample_num_tokens": 7218.0, "step": 8833, "total_num_tokens": 89642140.0, "z_loss": 0.0004404710198286921 }, { "copy_logits_max": -4.255975723266602, "copy_logits_min": -687500032.0, "copy_num_tokens": 480.3125, "epoch": 1.8040847587439366, "gen_logits_max": 2.222714900970459, "gen_logits_mean": -19.693649291992188, "gen_logits_min": -31.947193145751953, "gen_logits_std": 3.5236759185791016, "gen_loss": 0.28885507583618164, "grad_norm": 0.3651968455866606, "learning_rate": 1.9770947368421052e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9973205775022507, "mean_gen_accuracy": 0.875138908624649, "mean_token_accuracy": 0.9062550365924835, "num_tokens": 89878107.0, "sample_num_tokens": 8002.25, "step": 8834, "total_num_tokens": 89910116.0, "z_loss": 0.0004563122056424618 }, { "copy_logits_max": -4.130558490753174, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.8125, "epoch": 1.8042889966811335, "gen_logits_max": 3.8811492919921875, "gen_logits_mean": -17.333709716796875, "gen_logits_min": -29.500017166137695, "gen_logits_std": 3.4000072479248047, "gen_loss": 0.2851892113685608, "grad_norm": 0.38216902113384216, "learning_rate": 1.9769684210526317e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9977812021970749, "mean_gen_accuracy": 0.8758306056261063, "mean_token_accuracy": 0.9043111652135849, "num_tokens": 90129318.0, "sample_num_tokens": 8136.0, "step": 8835, "total_num_tokens": 90161862.0, "z_loss": 0.0004506528202909976 }, { "copy_logits_max": -4.362482070922852, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.125, "epoch": 1.8044932346183304, "gen_logits_max": 3.988863706588745, "gen_logits_mean": -16.16750717163086, "gen_logits_min": -28.17508316040039, "gen_logits_std": 3.3545169830322266, "gen_loss": 0.2735854983329773, "grad_norm": 0.35479201529371085, "learning_rate": 1.9768421052631577e-05, "loss": 0.2728, "mean_copy_accuracy": 0.9979627728462219, "mean_gen_accuracy": 0.8783837705850601, "mean_token_accuracy": 0.906895101070404, "num_tokens": 90391822.0, "sample_num_tokens": 8693.5, "step": 8836, "total_num_tokens": 90426596.0, "z_loss": 0.0003982004418503493 }, { "copy_logits_max": -6.960225582122803, "copy_logits_min": -750000000.0, "copy_num_tokens": 254.125, "epoch": 1.8046974725555271, "gen_logits_max": 4.28168249130249, "gen_logits_mean": -17.54705810546875, "gen_logits_min": -29.1657657623291, "gen_logits_std": 3.3741631507873535, "gen_loss": 0.31867387890815735, "grad_norm": 0.3707076195817471, "learning_rate": 1.9767157894736845e-05, "loss": 0.3, "mean_copy_accuracy": 0.9966253936290741, "mean_gen_accuracy": 0.8711753487586975, "mean_token_accuracy": 0.898895338177681, "num_tokens": 90656164.0, "sample_num_tokens": 7499.0, "step": 8837, "total_num_tokens": 90686160.0, "z_loss": 0.0004807820077985525 }, { "copy_logits_max": -2.606285572052002, "copy_logits_min": -750000000.0, "copy_num_tokens": 675.9375, "epoch": 1.804901710492724, "gen_logits_max": 4.2743706703186035, "gen_logits_mean": -16.394710540771484, "gen_logits_min": -28.76995086669922, "gen_logits_std": 3.3890693187713623, "gen_loss": 0.20617273449897766, "grad_norm": 0.32457509176222554, "learning_rate": 1.9765894736842106e-05, "loss": 0.2489, "mean_copy_accuracy": 0.9979524612426758, "mean_gen_accuracy": 0.8851388841867447, "mean_token_accuracy": 0.9156950563192368, "num_tokens": 90929239.0, "sample_num_tokens": 9781.25, "step": 8838, "total_num_tokens": 90968364.0, "z_loss": 0.00033449503825977445 }, { "copy_logits_max": -5.62352180480957, "copy_logits_min": -750000064.0, "copy_num_tokens": 605.5625, "epoch": 1.805105948429921, "gen_logits_max": 2.7795207500457764, "gen_logits_mean": -18.359153747558594, "gen_logits_min": -30.780303955078125, "gen_logits_std": 3.5007433891296387, "gen_loss": 0.24446742236614227, "grad_norm": 0.37315380277475524, "learning_rate": 1.976463157894737e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9980765730142593, "mean_gen_accuracy": 0.8742964416742325, "mean_token_accuracy": 0.9056483060121536, "num_tokens": 91220335.0, "sample_num_tokens": 8999.75, "step": 8839, "total_num_tokens": 91256334.0, "z_loss": 0.0003760721592698246 }, { "copy_logits_max": -5.577589988708496, "copy_logits_min": -750000000.0, "copy_num_tokens": 634.5, "epoch": 1.8053101863671177, "gen_logits_max": 3.5038373470306396, "gen_logits_mean": -17.999357223510742, "gen_logits_min": -30.31248664855957, "gen_logits_std": 3.4978151321411133, "gen_loss": 0.2520540654659271, "grad_norm": 0.39889102392789566, "learning_rate": 1.976336842105263e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9973666816949844, "mean_gen_accuracy": 0.8722249567508698, "mean_token_accuracy": 0.9025152027606964, "num_tokens": 91479790.0, "sample_num_tokens": 10062.0, "step": 8840, "total_num_tokens": 91520038.0, "z_loss": 0.00038566565490327775 }, { "copy_logits_max": -6.21793270111084, "copy_logits_min": -625000064.0, "copy_num_tokens": 496.5, "epoch": 1.8055144243043144, "gen_logits_max": 4.382165908813477, "gen_logits_mean": -18.325695037841797, "gen_logits_min": -30.21299171447754, "gen_logits_std": 3.4590373039245605, "gen_loss": 0.289009153842926, "grad_norm": 0.33349609456623613, "learning_rate": 1.9762105263157896e-05, "loss": 0.277, "mean_copy_accuracy": 0.9978645592927933, "mean_gen_accuracy": 0.8760143518447876, "mean_token_accuracy": 0.9056593030691147, "num_tokens": 91774309.0, "sample_num_tokens": 9033.75, "step": 8841, "total_num_tokens": 91810444.0, "z_loss": 0.00040203597745858133 }, { "copy_logits_max": -5.368897914886475, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.875, "epoch": 1.8057186622415113, "gen_logits_max": 4.944803237915039, "gen_logits_mean": -17.959781646728516, "gen_logits_min": -30.033924102783203, "gen_logits_std": 3.4152488708496094, "gen_loss": 0.30039408802986145, "grad_norm": 0.35011323293065544, "learning_rate": 1.9760842105263157e-05, "loss": 0.2661, "mean_copy_accuracy": 0.9973698258399963, "mean_gen_accuracy": 0.8807758986949921, "mean_token_accuracy": 0.9092738628387451, "num_tokens": 92042886.0, "sample_num_tokens": 7764.5, "step": 8842, "total_num_tokens": 92073944.0, "z_loss": 0.00045005453284829855 }, { "copy_logits_max": -7.0786967277526855, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.3125, "epoch": 1.8059229001787083, "gen_logits_max": 4.327877998352051, "gen_logits_mean": -17.711620330810547, "gen_logits_min": -29.7648868560791, "gen_logits_std": 3.44233775138855, "gen_loss": 0.29882699251174927, "grad_norm": 0.3564008518066354, "learning_rate": 1.975957894736842e-05, "loss": 0.281, "mean_copy_accuracy": 0.9964786469936371, "mean_gen_accuracy": 0.8742513209581375, "mean_token_accuracy": 0.9039449989795685, "num_tokens": 92310914.0, "sample_num_tokens": 8149.0, "step": 8843, "total_num_tokens": 92343510.0, "z_loss": 0.00043891716632060707 }, { "copy_logits_max": -6.478285789489746, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.0, "epoch": 1.806127138115905, "gen_logits_max": 3.9426991939544678, "gen_logits_mean": -17.8476505279541, "gen_logits_min": -30.1934814453125, "gen_logits_std": 3.415034532546997, "gen_loss": 0.3122316598892212, "grad_norm": 0.40578797316622517, "learning_rate": 1.9758315789473682e-05, "loss": 0.2868, "mean_copy_accuracy": 0.9969979971647263, "mean_gen_accuracy": 0.8716923147439957, "mean_token_accuracy": 0.901558980345726, "num_tokens": 92551428.0, "sample_num_tokens": 7768.0, "step": 8844, "total_num_tokens": 92582500.0, "z_loss": 0.0004786721838172525 }, { "copy_logits_max": -5.708510398864746, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.8125, "epoch": 1.806331376053102, "gen_logits_max": 4.354671001434326, "gen_logits_mean": -17.635269165039062, "gen_logits_min": -29.73589324951172, "gen_logits_std": 3.4391021728515625, "gen_loss": 0.2451545000076294, "grad_norm": 0.3830989644967633, "learning_rate": 1.975705263157895e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9971807599067688, "mean_gen_accuracy": 0.8775231838226318, "mean_token_accuracy": 0.9074919819831848, "num_tokens": 92819915.0, "sample_num_tokens": 8144.75, "step": 8845, "total_num_tokens": 92852494.0, "z_loss": 0.0003938997979275882 }, { "copy_logits_max": -5.110037803649902, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.8125, "epoch": 1.8065356139902988, "gen_logits_max": 5.9710469245910645, "gen_logits_mean": -16.2504825592041, "gen_logits_min": -28.734189987182617, "gen_logits_std": 3.339364528656006, "gen_loss": 0.2830008864402771, "grad_norm": 0.3576508277885282, "learning_rate": 1.9755789473684214e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9967533051967621, "mean_gen_accuracy": 0.8799168169498444, "mean_token_accuracy": 0.9069097638130188, "num_tokens": 93090061.0, "sample_num_tokens": 7403.75, "step": 8846, "total_num_tokens": 93119676.0, "z_loss": 0.00040725484723225236 }, { "copy_logits_max": -4.9443817138671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.625, "epoch": 1.8067398519274955, "gen_logits_max": 3.3441147804260254, "gen_logits_mean": -18.739604949951172, "gen_logits_min": -30.822818756103516, "gen_logits_std": 3.470658779144287, "gen_loss": 0.2754395008087158, "grad_norm": 0.35085148584572556, "learning_rate": 1.9754526315789475e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9969507306814194, "mean_gen_accuracy": 0.8752993643283844, "mean_token_accuracy": 0.9036934077739716, "num_tokens": 93344127.0, "sample_num_tokens": 7361.75, "step": 8847, "total_num_tokens": 93373574.0, "z_loss": 0.0004079569480381906 }, { "copy_logits_max": -8.235912322998047, "copy_logits_min": -750000000.0, "copy_num_tokens": 275.375, "epoch": 1.8069440898646922, "gen_logits_max": 4.7905988693237305, "gen_logits_mean": -17.2858943939209, "gen_logits_min": -29.376834869384766, "gen_logits_std": 3.39408540725708, "gen_loss": 0.2956729233264923, "grad_norm": 0.3940376138563762, "learning_rate": 1.975326315789474e-05, "loss": 0.2863, "mean_copy_accuracy": 0.9967262297868729, "mean_gen_accuracy": 0.8779166787862778, "mean_token_accuracy": 0.9032490253448486, "num_tokens": 93600980.0, "sample_num_tokens": 7148.0, "step": 8848, "total_num_tokens": 93629572.0, "z_loss": 0.0004406436055433005 }, { "copy_logits_max": -7.91276216506958, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.25, "epoch": 1.8071483278018892, "gen_logits_max": 4.767406463623047, "gen_logits_mean": -15.950947761535645, "gen_logits_min": -28.20062255859375, "gen_logits_std": 3.31634521484375, "gen_loss": 0.27750325202941895, "grad_norm": 0.3954444010551726, "learning_rate": 1.9752e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9959089308977127, "mean_gen_accuracy": 0.8777018487453461, "mean_token_accuracy": 0.9049911350011826, "num_tokens": 93859996.0, "sample_num_tokens": 8158.0, "step": 8849, "total_num_tokens": 93892628.0, "z_loss": 0.00040707929292693734 }, { "copy_logits_max": -5.960484027862549, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.5625, "epoch": 1.807352565739086, "gen_logits_max": 3.546264171600342, "gen_logits_mean": -18.127681732177734, "gen_logits_min": -30.51923370361328, "gen_logits_std": 3.434370756149292, "gen_loss": 0.30029553174972534, "grad_norm": 0.33680871811792373, "learning_rate": 1.9750736842105264e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9971518963575363, "mean_gen_accuracy": 0.8835120797157288, "mean_token_accuracy": 0.9099559634923935, "num_tokens": 94121216.0, "sample_num_tokens": 8333.0, "step": 8850, "total_num_tokens": 94154548.0, "z_loss": 0.00044889282435178757 }, { "copy_logits_max": -6.27877140045166, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.6875, "epoch": 1.8075568036762828, "gen_logits_max": 3.930328607559204, "gen_logits_mean": -16.105396270751953, "gen_logits_min": -28.79165267944336, "gen_logits_std": 3.3760368824005127, "gen_loss": 0.26622021198272705, "grad_norm": 0.3628276890635088, "learning_rate": 1.9749473684210525e-05, "loss": 0.2627, "mean_copy_accuracy": 0.998201534152031, "mean_gen_accuracy": 0.8750228434801102, "mean_token_accuracy": 0.9100963622331619, "num_tokens": 94387215.0, "sample_num_tokens": 7627.75, "step": 8851, "total_num_tokens": 94417726.0, "z_loss": 0.0004384070634841919 }, { "copy_logits_max": -2.1600444316864014, "copy_logits_min": -562500096.0, "copy_num_tokens": 770.375, "epoch": 1.8077610416134797, "gen_logits_max": 4.025332927703857, "gen_logits_mean": -15.991374969482422, "gen_logits_min": -28.104984283447266, "gen_logits_std": 3.3664932250976562, "gen_loss": 0.24665389955043793, "grad_norm": 0.37277550473269006, "learning_rate": 1.974821052631579e-05, "loss": 0.2785, "mean_copy_accuracy": 0.996317595243454, "mean_gen_accuracy": 0.8786183893680573, "mean_token_accuracy": 0.9068102538585663, "num_tokens": 94659659.0, "sample_num_tokens": 10106.25, "step": 8852, "total_num_tokens": 94700084.0, "z_loss": 0.000425131234806031 }, { "copy_logits_max": -6.261281490325928, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.25, "epoch": 1.8079652795506767, "gen_logits_max": 3.566427230834961, "gen_logits_mean": -17.36597442626953, "gen_logits_min": -29.733915328979492, "gen_logits_std": 3.3730883598327637, "gen_loss": 0.2996830940246582, "grad_norm": 0.3457480568108505, "learning_rate": 1.9746947368421054e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9971216917037964, "mean_gen_accuracy": 0.8785941749811172, "mean_token_accuracy": 0.9091407358646393, "num_tokens": 94946959.0, "sample_num_tokens": 9763.25, "step": 8853, "total_num_tokens": 94986012.0, "z_loss": 0.0005256237927824259 }, { "copy_logits_max": -7.797627925872803, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.375, "epoch": 1.8081695174878734, "gen_logits_max": 5.036606788635254, "gen_logits_mean": -16.100873947143555, "gen_logits_min": -28.411163330078125, "gen_logits_std": 3.3267064094543457, "gen_loss": 0.2750578820705414, "grad_norm": 0.33678271371105745, "learning_rate": 1.9745684210526318e-05, "loss": 0.2754, "mean_copy_accuracy": 0.9972794502973557, "mean_gen_accuracy": 0.877613827586174, "mean_token_accuracy": 0.9078676402568817, "num_tokens": 95251899.0, "sample_num_tokens": 8679.25, "step": 8854, "total_num_tokens": 95286616.0, "z_loss": 0.00044068327406421304 }, { "copy_logits_max": -3.8136606216430664, "copy_logits_min": -750000064.0, "copy_num_tokens": 582.9375, "epoch": 1.80837375542507, "gen_logits_max": 3.319739818572998, "gen_logits_mean": -17.759605407714844, "gen_logits_min": -30.308727264404297, "gen_logits_std": 3.4558310508728027, "gen_loss": 0.24654516577720642, "grad_norm": 0.3589054018176082, "learning_rate": 1.974442105263158e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9968197047710419, "mean_gen_accuracy": 0.87693752348423, "mean_token_accuracy": 0.9120223224163055, "num_tokens": 95547415.0, "sample_num_tokens": 8975.75, "step": 8855, "total_num_tokens": 95583318.0, "z_loss": 0.00042616334394551814 }, { "copy_logits_max": -7.701651573181152, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.875, "epoch": 1.808577993362267, "gen_logits_max": 3.447451114654541, "gen_logits_mean": -18.216899871826172, "gen_logits_min": -30.121726989746094, "gen_logits_std": 3.42596435546875, "gen_loss": 0.30367758870124817, "grad_norm": 0.3445399930187191, "learning_rate": 1.9743157894736843e-05, "loss": 0.288, "mean_copy_accuracy": 0.9968732297420502, "mean_gen_accuracy": 0.868676483631134, "mean_token_accuracy": 0.90113665163517, "num_tokens": 95833489.0, "sample_num_tokens": 8646.25, "step": 8856, "total_num_tokens": 95868074.0, "z_loss": 0.00046442457824014127 }, { "copy_logits_max": -6.342128753662109, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.3125, "epoch": 1.808782231299464, "gen_logits_max": 5.041474342346191, "gen_logits_mean": -15.605131149291992, "gen_logits_min": -27.9876708984375, "gen_logits_std": 3.3195858001708984, "gen_loss": 0.26544123888015747, "grad_norm": 0.36093633773599904, "learning_rate": 1.9741894736842104e-05, "loss": 0.2596, "mean_copy_accuracy": 0.9976044595241547, "mean_gen_accuracy": 0.8792193531990051, "mean_token_accuracy": 0.9129760712385178, "num_tokens": 96126076.0, "sample_num_tokens": 7549.5, "step": 8857, "total_num_tokens": 96156274.0, "z_loss": 0.0004125028499402106 }, { "copy_logits_max": -5.014848709106445, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.5625, "epoch": 1.8089864692366606, "gen_logits_max": 2.824434757232666, "gen_logits_mean": -16.834672927856445, "gen_logits_min": -29.488487243652344, "gen_logits_std": 3.4024243354797363, "gen_loss": 0.2496812641620636, "grad_norm": 0.34182531821091233, "learning_rate": 1.974063157894737e-05, "loss": 0.268, "mean_copy_accuracy": 0.9973288327455521, "mean_gen_accuracy": 0.8746791332960129, "mean_token_accuracy": 0.9080036580562592, "num_tokens": 96389017.0, "sample_num_tokens": 7403.75, "step": 8858, "total_num_tokens": 96418632.0, "z_loss": 0.0004176307120360434 }, { "copy_logits_max": -4.141575813293457, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.5625, "epoch": 1.8091907071738575, "gen_logits_max": 4.0914530754089355, "gen_logits_mean": -14.967451095581055, "gen_logits_min": -27.265342712402344, "gen_logits_std": 3.28470516204834, "gen_loss": 0.27882176637649536, "grad_norm": 0.3268318351442832, "learning_rate": 1.9739368421052633e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9970007687807083, "mean_gen_accuracy": 0.8780114650726318, "mean_token_accuracy": 0.9077812880277634, "num_tokens": 96670539.0, "sample_num_tokens": 8982.75, "step": 8859, "total_num_tokens": 96706470.0, "z_loss": 0.0004037570906803012 }, { "copy_logits_max": -7.243105888366699, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.75, "epoch": 1.8093949451110545, "gen_logits_max": 3.9207396507263184, "gen_logits_mean": -16.948986053466797, "gen_logits_min": -29.198871612548828, "gen_logits_std": 3.3888440132141113, "gen_loss": 0.30195170640945435, "grad_norm": 0.3561396834321341, "learning_rate": 1.9738105263157894e-05, "loss": 0.2828, "mean_copy_accuracy": 0.996672734618187, "mean_gen_accuracy": 0.8776650726795197, "mean_token_accuracy": 0.9043924361467361, "num_tokens": 96942759.0, "sample_num_tokens": 7020.25, "step": 8860, "total_num_tokens": 96970840.0, "z_loss": 0.00047050369903445244 }, { "copy_logits_max": -6.48942756652832, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.375, "epoch": 1.8095991830482512, "gen_logits_max": 3.7406318187713623, "gen_logits_mean": -16.296878814697266, "gen_logits_min": -28.578113555908203, "gen_logits_std": 3.371920585632324, "gen_loss": 0.24504075944423676, "grad_norm": 0.34629689848196177, "learning_rate": 1.9736842105263158e-05, "loss": 0.2994, "mean_copy_accuracy": 0.996847927570343, "mean_gen_accuracy": 0.8693703711032867, "mean_token_accuracy": 0.8974006474018097, "num_tokens": 97207049.0, "sample_num_tokens": 9112.75, "step": 8861, "total_num_tokens": 97243500.0, "z_loss": 0.00037172663724049926 }, { "copy_logits_max": -5.438575267791748, "copy_logits_min": -750000000.0, "copy_num_tokens": 694.625, "epoch": 1.8098034209854479, "gen_logits_max": 3.4235661029815674, "gen_logits_mean": -16.305530548095703, "gen_logits_min": -28.54925537109375, "gen_logits_std": 3.3895397186279297, "gen_loss": 0.25315946340560913, "grad_norm": 0.325663474462747, "learning_rate": 1.9735578947368422e-05, "loss": 0.254, "mean_copy_accuracy": 0.9977787584066391, "mean_gen_accuracy": 0.8773252069950104, "mean_token_accuracy": 0.9139719605445862, "num_tokens": 97511162.0, "sample_num_tokens": 9744.0, "step": 8862, "total_num_tokens": 97550138.0, "z_loss": 0.00039892573840916157 }, { "copy_logits_max": -6.4994354248046875, "copy_logits_min": -750000000.0, "copy_num_tokens": 281.4375, "epoch": 1.810007658922645, "gen_logits_max": 4.845334529876709, "gen_logits_mean": -16.169530868530273, "gen_logits_min": -28.387834548950195, "gen_logits_std": 3.3397912979125977, "gen_loss": 0.2749755382537842, "grad_norm": 0.35610637822515806, "learning_rate": 1.9734315789473687e-05, "loss": 0.276, "mean_copy_accuracy": 0.9967673867940903, "mean_gen_accuracy": 0.8781624734401703, "mean_token_accuracy": 0.9055110067129135, "num_tokens": 97774630.0, "sample_num_tokens": 7472.5, "step": 8863, "total_num_tokens": 97804520.0, "z_loss": 0.0004539946385193616 }, { "copy_logits_max": -6.459344863891602, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.25, "epoch": 1.8102118968598417, "gen_logits_max": 3.5731563568115234, "gen_logits_mean": -17.54571533203125, "gen_logits_min": -29.69894027709961, "gen_logits_std": 3.394773244857788, "gen_loss": 0.2844594120979309, "grad_norm": 0.3288126507582982, "learning_rate": 1.9733052631578948e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9965077489614487, "mean_gen_accuracy": 0.8850423842668533, "mean_token_accuracy": 0.9106007963418961, "num_tokens": 98046949.0, "sample_num_tokens": 7692.75, "step": 8864, "total_num_tokens": 98077720.0, "z_loss": 0.0004503721429500729 }, { "copy_logits_max": -6.027359962463379, "copy_logits_min": -687500032.0, "copy_num_tokens": 443.75, "epoch": 1.8104161347970384, "gen_logits_max": 3.3724918365478516, "gen_logits_mean": -16.91912841796875, "gen_logits_min": -29.266891479492188, "gen_logits_std": 3.368016004562378, "gen_loss": 0.30942416191101074, "grad_norm": 0.37090423555862345, "learning_rate": 1.9731789473684212e-05, "loss": 0.2936, "mean_copy_accuracy": 0.9967039674520493, "mean_gen_accuracy": 0.869392067193985, "mean_token_accuracy": 0.9011400192975998, "num_tokens": 98307978.0, "sample_num_tokens": 7785.5, "step": 8865, "total_num_tokens": 98339120.0, "z_loss": 0.0004810757818631828 }, { "copy_logits_max": -6.862093448638916, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.8125, "epoch": 1.8106203727342354, "gen_logits_max": 4.110299110412598, "gen_logits_mean": -16.32256507873535, "gen_logits_min": -28.450149536132812, "gen_logits_std": 3.3636045455932617, "gen_loss": 0.30337315797805786, "grad_norm": 0.33793517372915194, "learning_rate": 1.9730526315789473e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9963168203830719, "mean_gen_accuracy": 0.8775361329317093, "mean_token_accuracy": 0.906183734536171, "num_tokens": 98578808.0, "sample_num_tokens": 8224.0, "step": 8866, "total_num_tokens": 98611704.0, "z_loss": 0.00048182797036133707 }, { "copy_logits_max": -5.792189598083496, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.125, "epoch": 1.8108246106714323, "gen_logits_max": 5.293618202209473, "gen_logits_mean": -16.105918884277344, "gen_logits_min": -27.940834045410156, "gen_logits_std": 3.339155673980713, "gen_loss": 0.2800084352493286, "grad_norm": 0.34359713920893936, "learning_rate": 1.9729263157894737e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9974330514669418, "mean_gen_accuracy": 0.8810875862836838, "mean_token_accuracy": 0.9054529666900635, "num_tokens": 98843761.0, "sample_num_tokens": 8526.75, "step": 8867, "total_num_tokens": 98877868.0, "z_loss": 0.0004055019817315042 }, { "copy_logits_max": -4.484284400939941, "copy_logits_min": -687500032.0, "copy_num_tokens": 762.0, "epoch": 1.811028848608629, "gen_logits_max": 2.3939743041992188, "gen_logits_mean": -16.35976791381836, "gen_logits_min": -29.451351165771484, "gen_logits_std": 3.3925740718841553, "gen_loss": 0.23463912308216095, "grad_norm": 0.3024949311958038, "learning_rate": 1.9727999999999998e-05, "loss": 0.2538, "mean_copy_accuracy": 0.9979883432388306, "mean_gen_accuracy": 0.8782580345869064, "mean_token_accuracy": 0.9141513854265213, "num_tokens": 99119317.0, "sample_num_tokens": 9817.25, "step": 8868, "total_num_tokens": 99158586.0, "z_loss": 0.0003707156574819237 }, { "copy_logits_max": -6.664711952209473, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.125, "epoch": 1.811233086545826, "gen_logits_max": 3.0007359981536865, "gen_logits_mean": -17.6541748046875, "gen_logits_min": -29.655227661132812, "gen_logits_std": 3.4223220348358154, "gen_loss": 0.2679685652256012, "grad_norm": 0.34015862472257924, "learning_rate": 1.9726736842105262e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9971103519201279, "mean_gen_accuracy": 0.8769295960664749, "mean_token_accuracy": 0.907457247376442, "num_tokens": 99398610.0, "sample_num_tokens": 8774.0, "step": 8869, "total_num_tokens": 99433706.0, "z_loss": 0.0003921512980014086 }, { "copy_logits_max": -5.6196064949035645, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.8125, "epoch": 1.8114373244830229, "gen_logits_max": 4.072223663330078, "gen_logits_mean": -14.978920936584473, "gen_logits_min": -27.491485595703125, "gen_logits_std": 3.2985546588897705, "gen_loss": 0.2836933135986328, "grad_norm": 0.35380178930022155, "learning_rate": 1.9725473684210527e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9966654479503632, "mean_gen_accuracy": 0.8754217624664307, "mean_token_accuracy": 0.9045312255620956, "num_tokens": 99678006.0, "sample_num_tokens": 7742.5, "step": 8870, "total_num_tokens": 99708976.0, "z_loss": 0.00042462756391614676 }, { "copy_logits_max": -6.361425876617432, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.4375, "epoch": 1.8116415624202196, "gen_logits_max": 3.5532751083374023, "gen_logits_mean": -17.000207901000977, "gen_logits_min": -29.10710906982422, "gen_logits_std": 3.4115281105041504, "gen_loss": 0.28111571073532104, "grad_norm": 0.32474695630630024, "learning_rate": 1.972421052631579e-05, "loss": 0.2456, "mean_copy_accuracy": 0.9969250857830048, "mean_gen_accuracy": 0.8881490677595139, "mean_token_accuracy": 0.9153788834810257, "num_tokens": 99945062.0, "sample_num_tokens": 7663.5, "step": 8871, "total_num_tokens": 99975716.0, "z_loss": 0.00038272846722975373 }, { "copy_logits_max": -6.3213934898376465, "copy_logits_min": -687500032.0, "copy_num_tokens": 335.3125, "epoch": 1.8118458003574163, "gen_logits_max": 4.267982482910156, "gen_logits_mean": -15.557221412658691, "gen_logits_min": -28.298240661621094, "gen_logits_std": 3.3565452098846436, "gen_loss": 0.2635882496833801, "grad_norm": 0.3532216533967788, "learning_rate": 1.9722947368421052e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9969124644994736, "mean_gen_accuracy": 0.8794231116771698, "mean_token_accuracy": 0.9043254107236862, "num_tokens": 100202962.0, "sample_num_tokens": 7451.5, "step": 8872, "total_num_tokens": 100232768.0, "z_loss": 0.0004209596663713455 }, { "copy_logits_max": -5.968317985534668, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.75, "epoch": 1.8120500382946132, "gen_logits_max": 2.869051456451416, "gen_logits_mean": -17.808948516845703, "gen_logits_min": -30.150863647460938, "gen_logits_std": 3.443779230117798, "gen_loss": 0.25480198860168457, "grad_norm": 0.3465777839694716, "learning_rate": 1.9721684210526316e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9977934211492538, "mean_gen_accuracy": 0.8815543353557587, "mean_token_accuracy": 0.9101802855730057, "num_tokens": 100474168.0, "sample_num_tokens": 7439.5, "step": 8873, "total_num_tokens": 100503926.0, "z_loss": 0.00034042398328892887 }, { "copy_logits_max": -3.188835620880127, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.125, "epoch": 1.8122542762318101, "gen_logits_max": 2.097679376602173, "gen_logits_mean": -17.939517974853516, "gen_logits_min": -30.45076560974121, "gen_logits_std": 3.4803364276885986, "gen_loss": 0.21002812683582306, "grad_norm": 0.34656389799414095, "learning_rate": 1.972042105263158e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9971185624599457, "mean_gen_accuracy": 0.882488414645195, "mean_token_accuracy": 0.9113556444644928, "num_tokens": 100729554.0, "sample_num_tokens": 8289.0, "step": 8874, "total_num_tokens": 100762710.0, "z_loss": 0.0003935598651878536 }, { "copy_logits_max": -4.319411277770996, "copy_logits_min": -625000000.0, "copy_num_tokens": 604.375, "epoch": 1.8124585141690068, "gen_logits_max": 3.3250041007995605, "gen_logits_mean": -16.908912658691406, "gen_logits_min": -29.063617706298828, "gen_logits_std": 3.3886168003082275, "gen_loss": 0.2816464602947235, "grad_norm": 0.36629546713446365, "learning_rate": 1.971915789473684e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9966728389263153, "mean_gen_accuracy": 0.8781513124704361, "mean_token_accuracy": 0.9094190150499344, "num_tokens": 101004196.0, "sample_num_tokens": 9570.5, "step": 8875, "total_num_tokens": 101042478.0, "z_loss": 0.00039216087316162884 }, { "copy_logits_max": -4.587684154510498, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.875, "epoch": 1.8126627521062038, "gen_logits_max": 3.47395396232605, "gen_logits_mean": -17.2952880859375, "gen_logits_min": -29.13516616821289, "gen_logits_std": 3.385287284851074, "gen_loss": 0.29587453603744507, "grad_norm": 0.36210457626592457, "learning_rate": 1.9717894736842106e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9968453049659729, "mean_gen_accuracy": 0.8780915141105652, "mean_token_accuracy": 0.9087157696485519, "num_tokens": 101279092.0, "sample_num_tokens": 7066.0, "step": 8876, "total_num_tokens": 101307356.0, "z_loss": 0.0004430551780387759 }, { "copy_logits_max": -3.3084936141967773, "copy_logits_min": -750000000.0, "copy_num_tokens": 578.5, "epoch": 1.8128669900434007, "gen_logits_max": 3.273942708969116, "gen_logits_mean": -17.143630981445312, "gen_logits_min": -29.32311248779297, "gen_logits_std": 3.450627088546753, "gen_loss": 0.2617496848106384, "grad_norm": 0.3075115340969384, "learning_rate": 1.9716631578947367e-05, "loss": 0.2468, "mean_copy_accuracy": 0.9973529279232025, "mean_gen_accuracy": 0.8854393362998962, "mean_token_accuracy": 0.916519820690155, "num_tokens": 101562719.0, "sample_num_tokens": 9453.25, "step": 8877, "total_num_tokens": 101600532.0, "z_loss": 0.00035425787791609764 }, { "copy_logits_max": -2.9576683044433594, "copy_logits_min": -750000064.0, "copy_num_tokens": 281.0625, "epoch": 1.8130712279805974, "gen_logits_max": 3.7357826232910156, "gen_logits_mean": -17.083477020263672, "gen_logits_min": -28.93450927734375, "gen_logits_std": 3.3749613761901855, "gen_loss": 0.3101179897785187, "grad_norm": 0.3255270105743026, "learning_rate": 1.9715368421052634e-05, "loss": 0.2721, "mean_copy_accuracy": 0.99735127389431, "mean_gen_accuracy": 0.8803256154060364, "mean_token_accuracy": 0.9074202179908752, "num_tokens": 101835990.0, "sample_num_tokens": 7374.5, "step": 8878, "total_num_tokens": 101865488.0, "z_loss": 0.0004527393903117627 }, { "copy_logits_max": -5.323060989379883, "copy_logits_min": -750000000.0, "copy_num_tokens": 670.9375, "epoch": 1.813275465917794, "gen_logits_max": 2.2113547325134277, "gen_logits_mean": -18.822219848632812, "gen_logits_min": -30.704496383666992, "gen_logits_std": 3.4794952869415283, "gen_loss": 0.2981090247631073, "grad_norm": 0.37831110398829587, "learning_rate": 1.9714105263157895e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9972081482410431, "mean_gen_accuracy": 0.8806260973215103, "mean_token_accuracy": 0.9110704064369202, "num_tokens": 102121698.0, "sample_num_tokens": 10932.5, "step": 8879, "total_num_tokens": 102165428.0, "z_loss": 0.0004306799382902682 }, { "copy_logits_max": -4.095864772796631, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.9375, "epoch": 1.813479703854991, "gen_logits_max": 3.402498483657837, "gen_logits_mean": -17.18901824951172, "gen_logits_min": -29.130390167236328, "gen_logits_std": 3.3871209621429443, "gen_loss": 0.29227447509765625, "grad_norm": 0.36065361306824645, "learning_rate": 1.971284210526316e-05, "loss": 0.2975, "mean_copy_accuracy": 0.9969679415225983, "mean_gen_accuracy": 0.8701135367155075, "mean_token_accuracy": 0.899167075753212, "num_tokens": 102379892.0, "sample_num_tokens": 6888.0, "step": 8880, "total_num_tokens": 102407444.0, "z_loss": 0.0003967390803154558 }, { "copy_logits_max": -2.985422372817993, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.375, "epoch": 1.813683941792188, "gen_logits_max": 3.0807790756225586, "gen_logits_mean": -17.09575080871582, "gen_logits_min": -29.44669532775879, "gen_logits_std": 3.4134163856506348, "gen_loss": 0.2623441517353058, "grad_norm": 0.33870325112889427, "learning_rate": 1.971157894736842e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9967535734176636, "mean_gen_accuracy": 0.8744996637105942, "mean_token_accuracy": 0.9047567993402481, "num_tokens": 102654715.0, "sample_num_tokens": 8709.25, "step": 8881, "total_num_tokens": 102689552.0, "z_loss": 0.00038427498657256365 }, { "copy_logits_max": -5.436753273010254, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.0625, "epoch": 1.8138881797293847, "gen_logits_max": 3.7001471519470215, "gen_logits_mean": -15.689854621887207, "gen_logits_min": -27.830244064331055, "gen_logits_std": 3.362274646759033, "gen_loss": 0.2501562535762787, "grad_norm": 0.34550428329064076, "learning_rate": 1.9710315789473685e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9966446310281754, "mean_gen_accuracy": 0.8787073791027069, "mean_token_accuracy": 0.9092994928359985, "num_tokens": 102908393.0, "sample_num_tokens": 8631.25, "step": 8882, "total_num_tokens": 102942918.0, "z_loss": 0.0003787541063502431 }, { "copy_logits_max": -3.9855399131774902, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.625, "epoch": 1.8140924176665816, "gen_logits_max": 3.4572994709014893, "gen_logits_mean": -16.969894409179688, "gen_logits_min": -28.643808364868164, "gen_logits_std": 3.360506534576416, "gen_loss": 0.2891102731227875, "grad_norm": 0.3577641009134454, "learning_rate": 1.9709052631578946e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9973840415477753, "mean_gen_accuracy": 0.8760812431573868, "mean_token_accuracy": 0.9075505584478378, "num_tokens": 103194314.0, "sample_num_tokens": 7705.5, "step": 8883, "total_num_tokens": 103225136.0, "z_loss": 0.00038084073457866907 }, { "copy_logits_max": -4.9732346534729, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.75, "epoch": 1.8142966556037785, "gen_logits_max": 3.030210018157959, "gen_logits_mean": -17.845882415771484, "gen_logits_min": -29.9779052734375, "gen_logits_std": 3.4169564247131348, "gen_loss": 0.2401009351015091, "grad_norm": 0.34498308176222714, "learning_rate": 1.970778947368421e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9975688755512238, "mean_gen_accuracy": 0.8822035193443298, "mean_token_accuracy": 0.9076387733221054, "num_tokens": 103458054.0, "sample_num_tokens": 8941.0, "step": 8884, "total_num_tokens": 103493818.0, "z_loss": 0.0003401265712454915 }, { "copy_logits_max": -5.815443992614746, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.0, "epoch": 1.8145008935409752, "gen_logits_max": 2.9802958965301514, "gen_logits_mean": -17.866329193115234, "gen_logits_min": -30.04537010192871, "gen_logits_std": 3.4449448585510254, "gen_loss": 0.26922765374183655, "grad_norm": 0.3164784664412966, "learning_rate": 1.970652631578947e-05, "loss": 0.2516, "mean_copy_accuracy": 0.998169794678688, "mean_gen_accuracy": 0.8862302899360657, "mean_token_accuracy": 0.9141679108142853, "num_tokens": 103752366.0, "sample_num_tokens": 8531.0, "step": 8885, "total_num_tokens": 103786490.0, "z_loss": 0.0003775120130740106 }, { "copy_logits_max": -7.307704925537109, "copy_logits_min": -625000064.0, "copy_num_tokens": 514.25, "epoch": 1.814705131478172, "gen_logits_max": 3.123983144760132, "gen_logits_mean": -17.993431091308594, "gen_logits_min": -30.41747283935547, "gen_logits_std": 3.450439691543579, "gen_loss": 0.2380804717540741, "grad_norm": 0.317291666764188, "learning_rate": 1.970526315789474e-05, "loss": 0.2428, "mean_copy_accuracy": 0.9976958483457565, "mean_gen_accuracy": 0.8833177536725998, "mean_token_accuracy": 0.9171148091554642, "num_tokens": 104031253.0, "sample_num_tokens": 9064.25, "step": 8886, "total_num_tokens": 104067510.0, "z_loss": 0.0003659600333776325 }, { "copy_logits_max": -6.0118279457092285, "copy_logits_min": -687500032.0, "copy_num_tokens": 739.75, "epoch": 1.814909369415369, "gen_logits_max": 4.0116376876831055, "gen_logits_mean": -14.266278266906738, "gen_logits_min": -26.96212387084961, "gen_logits_std": 3.280381202697754, "gen_loss": 0.25902998447418213, "grad_norm": 0.37796393804096157, "learning_rate": 1.9704000000000003e-05, "loss": 0.2756, "mean_copy_accuracy": 0.9968244582414627, "mean_gen_accuracy": 0.878077045083046, "mean_token_accuracy": 0.9068682491779327, "num_tokens": 104301980.0, "sample_num_tokens": 10729.5, "step": 8887, "total_num_tokens": 104344898.0, "z_loss": 0.0004242033464834094 }, { "copy_logits_max": -5.156160354614258, "copy_logits_min": -687500032.0, "copy_num_tokens": 579.0, "epoch": 1.8151136073525658, "gen_logits_max": 2.8031036853790283, "gen_logits_mean": -16.931074142456055, "gen_logits_min": -29.317626953125, "gen_logits_std": 3.385974884033203, "gen_loss": 0.2752075493335724, "grad_norm": 0.34902375855998696, "learning_rate": 1.9702736842105264e-05, "loss": 0.2956, "mean_copy_accuracy": 0.9977267533540726, "mean_gen_accuracy": 0.8622331470251083, "mean_token_accuracy": 0.8996444344520569, "num_tokens": 104592612.0, "sample_num_tokens": 8570.0, "step": 8888, "total_num_tokens": 104626892.0, "z_loss": 0.000423076911829412 }, { "copy_logits_max": -4.123141765594482, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.375, "epoch": 1.8153178452897625, "gen_logits_max": 3.10290789604187, "gen_logits_mean": -17.047752380371094, "gen_logits_min": -29.43094825744629, "gen_logits_std": 3.3779759407043457, "gen_loss": 0.2974036931991577, "grad_norm": 0.3624578899435834, "learning_rate": 1.970147368421053e-05, "loss": 0.278, "mean_copy_accuracy": 0.9977597743272781, "mean_gen_accuracy": 0.8731216490268707, "mean_token_accuracy": 0.9058884829282761, "num_tokens": 104851269.0, "sample_num_tokens": 9156.75, "step": 8889, "total_num_tokens": 104887896.0, "z_loss": 0.0004958667559549212 }, { "copy_logits_max": -7.005031108856201, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.0, "epoch": 1.8155220832269594, "gen_logits_max": 3.19791841506958, "gen_logits_mean": -18.298072814941406, "gen_logits_min": -30.220502853393555, "gen_logits_std": 3.4230880737304688, "gen_loss": 0.3244589567184448, "grad_norm": 0.42809126882522325, "learning_rate": 1.970021052631579e-05, "loss": 0.2912, "mean_copy_accuracy": 0.9971636384725571, "mean_gen_accuracy": 0.8723250925540924, "mean_token_accuracy": 0.9033916592597961, "num_tokens": 105147275.0, "sample_num_tokens": 8719.75, "step": 8890, "total_num_tokens": 105182154.0, "z_loss": 0.00048032289487309754 }, { "copy_logits_max": -6.4020538330078125, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.0625, "epoch": 1.8157263211641563, "gen_logits_max": 4.726297378540039, "gen_logits_mean": -15.085075378417969, "gen_logits_min": -27.34339141845703, "gen_logits_std": 3.2720513343811035, "gen_loss": 0.31345540285110474, "grad_norm": 0.3581217673709735, "learning_rate": 1.9698947368421054e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9973489046096802, "mean_gen_accuracy": 0.8755239099264145, "mean_token_accuracy": 0.9047109335660934, "num_tokens": 105429257.0, "sample_num_tokens": 7636.75, "step": 8891, "total_num_tokens": 105459804.0, "z_loss": 0.0004757053393404931 }, { "copy_logits_max": -6.035083293914795, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.8125, "epoch": 1.815930559101353, "gen_logits_max": 2.6481075286865234, "gen_logits_mean": -17.851703643798828, "gen_logits_min": -29.935413360595703, "gen_logits_std": 3.429863452911377, "gen_loss": 0.25742462277412415, "grad_norm": 0.37924163478331163, "learning_rate": 1.9697684210526315e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9968138486146927, "mean_gen_accuracy": 0.8768405616283417, "mean_token_accuracy": 0.9028498083353043, "num_tokens": 105692313.0, "sample_num_tokens": 9047.75, "step": 8892, "total_num_tokens": 105728504.0, "z_loss": 0.0003887676866725087 }, { "copy_logits_max": -4.851032257080078, "copy_logits_min": -687500032.0, "copy_num_tokens": 479.25, "epoch": 1.81613479703855, "gen_logits_max": 1.966538906097412, "gen_logits_mean": -18.867900848388672, "gen_logits_min": -31.230167388916016, "gen_logits_std": 3.473864793777466, "gen_loss": 0.25884366035461426, "grad_norm": 0.7226263342427002, "learning_rate": 1.969642105263158e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9972302466630936, "mean_gen_accuracy": 0.8769799470901489, "mean_token_accuracy": 0.9088765233755112, "num_tokens": 105967305.0, "sample_num_tokens": 8272.25, "step": 8893, "total_num_tokens": 106000394.0, "z_loss": 0.0004122911486774683 }, { "copy_logits_max": -4.880281448364258, "copy_logits_min": -750000000.0, "copy_num_tokens": 652.6875, "epoch": 1.816339034975747, "gen_logits_max": 2.0334596633911133, "gen_logits_mean": -18.230579376220703, "gen_logits_min": -30.547761917114258, "gen_logits_std": 3.4614453315734863, "gen_loss": 0.24663765728473663, "grad_norm": 0.3703948738378695, "learning_rate": 1.9695157894736843e-05, "loss": 0.257, "mean_copy_accuracy": 0.9961525648832321, "mean_gen_accuracy": 0.8866233229637146, "mean_token_accuracy": 0.9123356640338898, "num_tokens": 106231205.0, "sample_num_tokens": 9855.25, "step": 8894, "total_num_tokens": 106270626.0, "z_loss": 0.00035652663791552186 }, { "copy_logits_max": -6.122107982635498, "copy_logits_min": -625000064.0, "copy_num_tokens": 394.5, "epoch": 1.8165432729129436, "gen_logits_max": 2.798811197280884, "gen_logits_mean": -18.006881713867188, "gen_logits_min": -30.451332092285156, "gen_logits_std": 3.4463868141174316, "gen_loss": 0.2309160828590393, "grad_norm": 0.35748334093507533, "learning_rate": 1.9693894736842107e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9965451210737228, "mean_gen_accuracy": 0.8728121072053909, "mean_token_accuracy": 0.9037747979164124, "num_tokens": 106515987.0, "sample_num_tokens": 7486.25, "step": 8895, "total_num_tokens": 106545932.0, "z_loss": 0.0003449635987635702 }, { "copy_logits_max": -3.349213123321533, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.8125, "epoch": 1.8167475108501403, "gen_logits_max": 3.5879647731781006, "gen_logits_mean": -16.865798950195312, "gen_logits_min": -29.07642936706543, "gen_logits_std": 3.376138210296631, "gen_loss": 0.2363482415676117, "grad_norm": 0.36053606578942987, "learning_rate": 1.969263157894737e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9974465519189835, "mean_gen_accuracy": 0.8729229420423508, "mean_token_accuracy": 0.9078520834445953, "num_tokens": 106802096.0, "sample_num_tokens": 8715.5, "step": 8896, "total_num_tokens": 106836958.0, "z_loss": 0.00039837846998125315 }, { "copy_logits_max": -4.283748626708984, "copy_logits_min": -687500032.0, "copy_num_tokens": 459.6875, "epoch": 1.8169517487873372, "gen_logits_max": 3.0272982120513916, "gen_logits_mean": -17.87610626220703, "gen_logits_min": -29.869226455688477, "gen_logits_std": 3.406156063079834, "gen_loss": 0.2799469530582428, "grad_norm": 0.36600525374453613, "learning_rate": 1.9691368421052633e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9971075654029846, "mean_gen_accuracy": 0.8739483505487442, "mean_token_accuracy": 0.9016736745834351, "num_tokens": 107071112.0, "sample_num_tokens": 7646.0, "step": 8897, "total_num_tokens": 107101696.0, "z_loss": 0.0004923325031995773 }, { "copy_logits_max": -5.796685218811035, "copy_logits_min": -687500032.0, "copy_num_tokens": 370.75, "epoch": 1.8171559867245342, "gen_logits_max": 4.084992408752441, "gen_logits_mean": -17.50977897644043, "gen_logits_min": -29.58031463623047, "gen_logits_std": 3.396064519882202, "gen_loss": 0.28914856910705566, "grad_norm": 0.37204078124606477, "learning_rate": 1.9690105263157894e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9977369755506516, "mean_gen_accuracy": 0.8813814073801041, "mean_token_accuracy": 0.9115414023399353, "num_tokens": 107335477.0, "sample_num_tokens": 7571.75, "step": 8898, "total_num_tokens": 107365764.0, "z_loss": 0.00048512930516153574 }, { "copy_logits_max": -3.3779704570770264, "copy_logits_min": -750000000.0, "copy_num_tokens": 743.0625, "epoch": 1.8173602246617309, "gen_logits_max": 2.5786428451538086, "gen_logits_mean": -16.89870262145996, "gen_logits_min": -29.463294982910156, "gen_logits_std": 3.3979315757751465, "gen_loss": 0.23942920565605164, "grad_norm": 0.36878603171405505, "learning_rate": 1.9688842105263158e-05, "loss": 0.2539, "mean_copy_accuracy": 0.9975958615541458, "mean_gen_accuracy": 0.8772163987159729, "mean_token_accuracy": 0.9134279489517212, "num_tokens": 107613987.0, "sample_num_tokens": 9530.25, "step": 8899, "total_num_tokens": 107652108.0, "z_loss": 0.0003615794703364372 }, { "copy_logits_max": -6.167337894439697, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.6875, "epoch": 1.8175644625989278, "gen_logits_max": 2.6184802055358887, "gen_logits_mean": -19.05084228515625, "gen_logits_min": -31.35917854309082, "gen_logits_std": 3.482713222503662, "gen_loss": 0.24139270186424255, "grad_norm": 0.35674669079677146, "learning_rate": 1.9687578947368422e-05, "loss": 0.2579, "mean_copy_accuracy": 0.9957459717988968, "mean_gen_accuracy": 0.8825757950544357, "mean_token_accuracy": 0.9120036363601685, "num_tokens": 107902383.0, "sample_num_tokens": 8097.75, "step": 8900, "total_num_tokens": 107934774.0, "z_loss": 0.0004173468623775989 }, { "copy_logits_max": -3.534105062484741, "copy_logits_min": -687500032.0, "copy_num_tokens": 744.625, "epoch": 1.8177687005361247, "gen_logits_max": 2.9271044731140137, "gen_logits_mean": -17.03221893310547, "gen_logits_min": -29.319198608398438, "gen_logits_std": 3.3814868927001953, "gen_loss": 0.24610507488250732, "grad_norm": 0.3307722400059385, "learning_rate": 1.9686315789473683e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9971376657485962, "mean_gen_accuracy": 0.8797638267278671, "mean_token_accuracy": 0.9075496345758438, "num_tokens": 108178709.0, "sample_num_tokens": 10588.75, "step": 8901, "total_num_tokens": 108221064.0, "z_loss": 0.00035318126901984215 }, { "copy_logits_max": -4.538421630859375, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.75, "epoch": 1.8179729384733214, "gen_logits_max": 1.7303422689437866, "gen_logits_mean": -19.790647506713867, "gen_logits_min": -32.00917053222656, "gen_logits_std": 3.5190892219543457, "gen_loss": 0.2623169422149658, "grad_norm": 0.4035050650429677, "learning_rate": 1.968505263157895e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9976504594087601, "mean_gen_accuracy": 0.8737654536962509, "mean_token_accuracy": 0.9041946083307266, "num_tokens": 108456597.0, "sample_num_tokens": 8440.25, "step": 8902, "total_num_tokens": 108490358.0, "z_loss": 0.00042117247357964516 }, { "copy_logits_max": -3.0857090950012207, "copy_logits_min": -687500032.0, "copy_num_tokens": 358.75, "epoch": 1.8181771764105181, "gen_logits_max": 4.002608776092529, "gen_logits_mean": -16.517702102661133, "gen_logits_min": -28.39249038696289, "gen_logits_std": 3.3884646892547607, "gen_loss": 0.2524937689304352, "grad_norm": 0.3620021458133372, "learning_rate": 1.9683789473684212e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9970016181468964, "mean_gen_accuracy": 0.8815892040729523, "mean_token_accuracy": 0.9074390977621078, "num_tokens": 108717055.0, "sample_num_tokens": 8199.75, "step": 8903, "total_num_tokens": 108749854.0, "z_loss": 0.00038193626096472144 }, { "copy_logits_max": -3.6768436431884766, "copy_logits_min": -625000064.0, "copy_num_tokens": 420.625, "epoch": 1.818381414347715, "gen_logits_max": 3.9959163665771484, "gen_logits_mean": -17.542926788330078, "gen_logits_min": -29.915163040161133, "gen_logits_std": 3.395631790161133, "gen_loss": 0.279447078704834, "grad_norm": 0.409011404994291, "learning_rate": 1.9682526315789476e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9968295246362686, "mean_gen_accuracy": 0.8781619071960449, "mean_token_accuracy": 0.9101391732692719, "num_tokens": 109004152.0, "sample_num_tokens": 8703.0, "step": 8904, "total_num_tokens": 109038964.0, "z_loss": 0.00045341718941926956 }, { "copy_logits_max": -5.825395584106445, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.9375, "epoch": 1.818585652284912, "gen_logits_max": 3.52842378616333, "gen_logits_mean": -17.221494674682617, "gen_logits_min": -29.318954467773438, "gen_logits_std": 3.412201404571533, "gen_loss": 0.29335707426071167, "grad_norm": 0.3889009181302986, "learning_rate": 1.9681263157894737e-05, "loss": 0.2867, "mean_copy_accuracy": 0.9965713918209076, "mean_gen_accuracy": 0.8755882382392883, "mean_token_accuracy": 0.9027806222438812, "num_tokens": 109261544.0, "sample_num_tokens": 8792.0, "step": 8905, "total_num_tokens": 109296712.0, "z_loss": 0.0004922434454783797 }, { "copy_logits_max": -4.929380893707275, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.1875, "epoch": 1.8187898902221087, "gen_logits_max": 2.4615838527679443, "gen_logits_mean": -18.442392349243164, "gen_logits_min": -30.459474563598633, "gen_logits_std": 3.4753527641296387, "gen_loss": 0.2462509274482727, "grad_norm": 0.34626589103551186, "learning_rate": 1.968e-05, "loss": 0.2563, "mean_copy_accuracy": 0.9971442520618439, "mean_gen_accuracy": 0.8800235390663147, "mean_token_accuracy": 0.9117162376642227, "num_tokens": 109535524.0, "sample_num_tokens": 8596.0, "step": 8906, "total_num_tokens": 109569908.0, "z_loss": 0.0003623862285166979 }, { "copy_logits_max": -6.244147300720215, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.0625, "epoch": 1.8189941281593056, "gen_logits_max": 2.2882137298583984, "gen_logits_mean": -19.427841186523438, "gen_logits_min": -31.535381317138672, "gen_logits_std": 3.527843952178955, "gen_loss": 0.26509374380111694, "grad_norm": 0.3697355152497132, "learning_rate": 1.9678736842105262e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9966527819633484, "mean_gen_accuracy": 0.8805551677942276, "mean_token_accuracy": 0.9053711444139481, "num_tokens": 109792681.0, "sample_num_tokens": 8901.25, "step": 8907, "total_num_tokens": 109828286.0, "z_loss": 0.0003718690713867545 }, { "copy_logits_max": -5.234494209289551, "copy_logits_min": -750000000.0, "copy_num_tokens": 637.0, "epoch": 1.8191983660965025, "gen_logits_max": 2.5988707542419434, "gen_logits_mean": -17.761167526245117, "gen_logits_min": -30.337360382080078, "gen_logits_std": 3.484828472137451, "gen_loss": 0.2585439682006836, "grad_norm": 0.3381277314074024, "learning_rate": 1.9677473684210527e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9978048503398895, "mean_gen_accuracy": 0.8773728907108307, "mean_token_accuracy": 0.9061806350946426, "num_tokens": 110053681.0, "sample_num_tokens": 9752.25, "step": 8908, "total_num_tokens": 110092690.0, "z_loss": 0.0003881205338984728 }, { "copy_logits_max": -5.361239910125732, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.0625, "epoch": 1.8194026040336992, "gen_logits_max": 3.715536117553711, "gen_logits_mean": -17.658443450927734, "gen_logits_min": -29.814712524414062, "gen_logits_std": 3.4688644409179688, "gen_loss": 0.27939194440841675, "grad_norm": 0.37004816395902945, "learning_rate": 1.9676210526315787e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9974128305912018, "mean_gen_accuracy": 0.8745593726634979, "mean_token_accuracy": 0.9033098667860031, "num_tokens": 110342459.0, "sample_num_tokens": 8674.75, "step": 8909, "total_num_tokens": 110377158.0, "z_loss": 0.00040486775105819106 }, { "copy_logits_max": -2.3958640098571777, "copy_logits_min": -750000000.0, "copy_num_tokens": 314.0625, "epoch": 1.819606841970896, "gen_logits_max": 5.661004066467285, "gen_logits_mean": -14.421735763549805, "gen_logits_min": -26.818347930908203, "gen_logits_std": 3.339120864868164, "gen_loss": 0.29898062348365784, "grad_norm": 0.3829622995131473, "learning_rate": 1.9674947368421052e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9966892004013062, "mean_gen_accuracy": 0.87333644926548, "mean_token_accuracy": 0.9013107866048813, "num_tokens": 110609240.0, "sample_num_tokens": 6792.0, "step": 8910, "total_num_tokens": 110636408.0, "z_loss": 0.0004545796546153724 }, { "copy_logits_max": -4.6170549392700195, "copy_logits_min": -625000064.0, "copy_num_tokens": 392.9375, "epoch": 1.8198110799080929, "gen_logits_max": 3.0904290676116943, "gen_logits_mean": -18.647808074951172, "gen_logits_min": -31.02898406982422, "gen_logits_std": 3.509124517440796, "gen_loss": 0.2724066972732544, "grad_norm": 0.3569928591645458, "learning_rate": 1.9673684210526316e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9971794486045837, "mean_gen_accuracy": 0.8780688494443893, "mean_token_accuracy": 0.908226415514946, "num_tokens": 110872089.0, "sample_num_tokens": 7927.75, "step": 8911, "total_num_tokens": 110903800.0, "z_loss": 0.000391421519452706 }, { "copy_logits_max": -4.1127800941467285, "copy_logits_min": -687500032.0, "copy_num_tokens": 588.125, "epoch": 1.8200153178452898, "gen_logits_max": 3.5723419189453125, "gen_logits_mean": -16.043292999267578, "gen_logits_min": -28.527355194091797, "gen_logits_std": 3.3989953994750977, "gen_loss": 0.28194499015808105, "grad_norm": 0.40326562995280796, "learning_rate": 1.967242105263158e-05, "loss": 0.25, "mean_copy_accuracy": 0.9974083006381989, "mean_gen_accuracy": 0.8799611628055573, "mean_token_accuracy": 0.9145406186580658, "num_tokens": 111149876.0, "sample_num_tokens": 8811.0, "step": 8912, "total_num_tokens": 111185120.0, "z_loss": 0.00044747069478034973 }, { "copy_logits_max": -6.301109313964844, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.0625, "epoch": 1.8202195557824865, "gen_logits_max": 4.996853351593018, "gen_logits_mean": -15.763593673706055, "gen_logits_min": -27.982778549194336, "gen_logits_std": 3.403397560119629, "gen_loss": 0.2660874128341675, "grad_norm": 0.3890913352935179, "learning_rate": 1.9671157894736845e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9966768473386765, "mean_gen_accuracy": 0.8741675317287445, "mean_token_accuracy": 0.8972287476062775, "num_tokens": 111398682.0, "sample_num_tokens": 7940.5, "step": 8913, "total_num_tokens": 111430444.0, "z_loss": 0.00037889782106503844 }, { "copy_logits_max": -4.074063301086426, "copy_logits_min": -750000064.0, "copy_num_tokens": 679.625, "epoch": 1.8204237937196834, "gen_logits_max": 3.1686747074127197, "gen_logits_mean": -16.984830856323242, "gen_logits_min": -29.383031845092773, "gen_logits_std": 3.439068555831909, "gen_loss": 0.2835565507411957, "grad_norm": 0.34167842088901657, "learning_rate": 1.9669894736842106e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9974109977483749, "mean_gen_accuracy": 0.8780867755413055, "mean_token_accuracy": 0.9058970957994461, "num_tokens": 111656558.0, "sample_num_tokens": 9664.0, "step": 8914, "total_num_tokens": 111695214.0, "z_loss": 0.0004173038760200143 }, { "copy_logits_max": -3.27472186088562, "copy_logits_min": -750000064.0, "copy_num_tokens": 378.6875, "epoch": 1.8206280316568804, "gen_logits_max": 4.466550350189209, "gen_logits_mean": -16.346145629882812, "gen_logits_min": -28.63085174560547, "gen_logits_std": 3.4420900344848633, "gen_loss": 0.2837161421775818, "grad_norm": 0.36396069204316867, "learning_rate": 1.966863157894737e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9969853758811951, "mean_gen_accuracy": 0.8772586733102798, "mean_token_accuracy": 0.9038630425930023, "num_tokens": 111931403.0, "sample_num_tokens": 7676.25, "step": 8915, "total_num_tokens": 111962108.0, "z_loss": 0.00041996940854005516 }, { "copy_logits_max": -2.646959066390991, "copy_logits_min": -687500032.0, "copy_num_tokens": 449.9375, "epoch": 1.820832269594077, "gen_logits_max": 4.190003395080566, "gen_logits_mean": -15.684513092041016, "gen_logits_min": -28.052593231201172, "gen_logits_std": 3.4052958488464355, "gen_loss": 0.2752612829208374, "grad_norm": 0.37120194640445847, "learning_rate": 1.966736842105263e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9969379007816315, "mean_gen_accuracy": 0.8837279081344604, "mean_token_accuracy": 0.9108660668134689, "num_tokens": 112201312.0, "sample_num_tokens": 8025.5, "step": 8916, "total_num_tokens": 112233414.0, "z_loss": 0.0004369692469481379 }, { "copy_logits_max": -3.9451894760131836, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.3125, "epoch": 1.8210365075312738, "gen_logits_max": 3.3116626739501953, "gen_logits_mean": -17.402217864990234, "gen_logits_min": -29.927242279052734, "gen_logits_std": 3.4526333808898926, "gen_loss": 0.2905154228210449, "grad_norm": 0.36739390427925117, "learning_rate": 1.9666105263157895e-05, "loss": 0.2819, "mean_copy_accuracy": 0.9976181834936142, "mean_gen_accuracy": 0.8766977787017822, "mean_token_accuracy": 0.907770499587059, "num_tokens": 112478648.0, "sample_num_tokens": 9067.5, "step": 8917, "total_num_tokens": 112514918.0, "z_loss": 0.00046019157161936164 }, { "copy_logits_max": -5.373038291931152, "copy_logits_min": -750000000.0, "copy_num_tokens": 224.0625, "epoch": 1.821240745468471, "gen_logits_max": 4.4777679443359375, "gen_logits_mean": -16.70431900024414, "gen_logits_min": -28.977964401245117, "gen_logits_std": 3.4043374061584473, "gen_loss": 0.2600504159927368, "grad_norm": 0.3543420673959292, "learning_rate": 1.9664842105263156e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9958782941102982, "mean_gen_accuracy": 0.8869025409221649, "mean_token_accuracy": 0.9085399955511093, "num_tokens": 112748785.0, "sample_num_tokens": 6811.25, "step": 8918, "total_num_tokens": 112776030.0, "z_loss": 0.00039220863254740834 }, { "copy_logits_max": -3.327406644821167, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.25, "epoch": 1.8214449834056676, "gen_logits_max": 4.334329605102539, "gen_logits_mean": -16.197067260742188, "gen_logits_min": -28.706707000732422, "gen_logits_std": 3.408836841583252, "gen_loss": 0.2845577895641327, "grad_norm": 0.3488948632571098, "learning_rate": 1.9663578947368424e-05, "loss": 0.2712, "mean_copy_accuracy": 0.9961697161197662, "mean_gen_accuracy": 0.8762059807777405, "mean_token_accuracy": 0.90883769094944, "num_tokens": 113017557.0, "sample_num_tokens": 7874.75, "step": 8919, "total_num_tokens": 113049056.0, "z_loss": 0.00044332994730211794 }, { "copy_logits_max": -4.823866844177246, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.25, "epoch": 1.8216492213428643, "gen_logits_max": 4.593797206878662, "gen_logits_mean": -16.402881622314453, "gen_logits_min": -28.450149536132812, "gen_logits_std": 3.406994342803955, "gen_loss": 0.26327604055404663, "grad_norm": 0.33593962456460535, "learning_rate": 1.9662315789473685e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9973861128091812, "mean_gen_accuracy": 0.8777185380458832, "mean_token_accuracy": 0.9104524254798889, "num_tokens": 113293205.0, "sample_num_tokens": 9288.25, "step": 8920, "total_num_tokens": 113330358.0, "z_loss": 0.00042758381459861994 }, { "copy_logits_max": -3.426959991455078, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.4375, "epoch": 1.8218534592800613, "gen_logits_max": 3.036508560180664, "gen_logits_mean": -18.01906967163086, "gen_logits_min": -30.003704071044922, "gen_logits_std": 3.460733652114868, "gen_loss": 0.28744107484817505, "grad_norm": 0.3499246529236822, "learning_rate": 1.966105263157895e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9968565702438354, "mean_gen_accuracy": 0.8773252218961716, "mean_token_accuracy": 0.9061710238456726, "num_tokens": 113567671.0, "sample_num_tokens": 8449.75, "step": 8921, "total_num_tokens": 113601470.0, "z_loss": 0.00048765327665023506 }, { "copy_logits_max": -2.881497383117676, "copy_logits_min": -687500032.0, "copy_num_tokens": 632.6875, "epoch": 1.8220576972172582, "gen_logits_max": 3.126969575881958, "gen_logits_mean": -17.270038604736328, "gen_logits_min": -29.517099380493164, "gen_logits_std": 3.434664726257324, "gen_loss": 0.2952474355697632, "grad_norm": 0.3591447659684935, "learning_rate": 1.965978947368421e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9974770843982697, "mean_gen_accuracy": 0.8721228688955307, "mean_token_accuracy": 0.9060031324625015, "num_tokens": 113843506.0, "sample_num_tokens": 9197.5, "step": 8922, "total_num_tokens": 113880296.0, "z_loss": 0.0004748185456264764 }, { "copy_logits_max": -6.57641077041626, "copy_logits_min": -750000000.0, "copy_num_tokens": 276.5625, "epoch": 1.822261935154455, "gen_logits_max": 3.8945505619049072, "gen_logits_mean": -18.037580490112305, "gen_logits_min": -29.652450561523438, "gen_logits_std": 3.4344851970672607, "gen_loss": 0.3379073739051819, "grad_norm": 0.37697525604104093, "learning_rate": 1.9658526315789474e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9961326122283936, "mean_gen_accuracy": 0.8761231303215027, "mean_token_accuracy": 0.9006360620260239, "num_tokens": 114107651.0, "sample_num_tokens": 7289.25, "step": 8923, "total_num_tokens": 114136808.0, "z_loss": 0.0004681232967413962 }, { "copy_logits_max": -7.933002948760986, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.4375, "epoch": 1.8224661730916518, "gen_logits_max": 3.307432174682617, "gen_logits_mean": -18.34691619873047, "gen_logits_min": -30.04751968383789, "gen_logits_std": 3.442702293395996, "gen_loss": 0.2804904580116272, "grad_norm": 0.3324521014303582, "learning_rate": 1.9657263157894735e-05, "loss": 0.256, "mean_copy_accuracy": 0.9971244782209396, "mean_gen_accuracy": 0.8786399662494659, "mean_token_accuracy": 0.911940410733223, "num_tokens": 114385903.0, "sample_num_tokens": 8652.75, "step": 8924, "total_num_tokens": 114420514.0, "z_loss": 0.0004257510881870985 }, { "copy_logits_max": -6.657503128051758, "copy_logits_min": -687500032.0, "copy_num_tokens": 521.125, "epoch": 1.8226704110288487, "gen_logits_max": 2.1781535148620605, "gen_logits_mean": -19.413484573364258, "gen_logits_min": -31.33649444580078, "gen_logits_std": 3.5284621715545654, "gen_loss": 0.24213093519210815, "grad_norm": 0.3783768097583365, "learning_rate": 1.9656e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9950742125511169, "mean_gen_accuracy": 0.8795085102319717, "mean_token_accuracy": 0.9054235070943832, "num_tokens": 114644410.0, "sample_num_tokens": 8442.0, "step": 8925, "total_num_tokens": 114678178.0, "z_loss": 0.00035641976865008473 }, { "copy_logits_max": -3.8529415130615234, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.1875, "epoch": 1.8228746489660455, "gen_logits_max": 1.6953833103179932, "gen_logits_mean": -19.134014129638672, "gen_logits_min": -31.348522186279297, "gen_logits_std": 3.53391695022583, "gen_loss": 0.23264580965042114, "grad_norm": 0.33445200867476615, "learning_rate": 1.965473684210526e-05, "loss": 0.2597, "mean_copy_accuracy": 0.9963835179805756, "mean_gen_accuracy": 0.8820865899324417, "mean_token_accuracy": 0.9109231680631638, "num_tokens": 114909907.0, "sample_num_tokens": 8593.25, "step": 8926, "total_num_tokens": 114944280.0, "z_loss": 0.0003718054504133761 }, { "copy_logits_max": -6.943301677703857, "copy_logits_min": -750000000.0, "copy_num_tokens": 338.6875, "epoch": 1.8230788869032422, "gen_logits_max": 3.340709686279297, "gen_logits_mean": -17.993858337402344, "gen_logits_min": -30.247127532958984, "gen_logits_std": 3.4549901485443115, "gen_loss": 0.3037964701652527, "grad_norm": 0.37975924604800315, "learning_rate": 1.9653473684210528e-05, "loss": 0.2884, "mean_copy_accuracy": 0.9964347183704376, "mean_gen_accuracy": 0.8767313808202744, "mean_token_accuracy": 0.9033096134662628, "num_tokens": 115162633.0, "sample_num_tokens": 6548.75, "step": 8927, "total_num_tokens": 115188828.0, "z_loss": 0.0003930299717467278 }, { "copy_logits_max": -5.5689849853515625, "copy_logits_min": -750000064.0, "copy_num_tokens": 571.625, "epoch": 1.823283124840439, "gen_logits_max": 2.5745062828063965, "gen_logits_mean": -17.917869567871094, "gen_logits_min": -30.86684226989746, "gen_logits_std": 3.4545772075653076, "gen_loss": 0.2891358733177185, "grad_norm": 0.36592413497595344, "learning_rate": 1.9652210526315792e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9975599348545074, "mean_gen_accuracy": 0.8750165402889252, "mean_token_accuracy": 0.9061379581689835, "num_tokens": 115400656.0, "sample_num_tokens": 8817.0, "step": 8928, "total_num_tokens": 115435924.0, "z_loss": 0.0004526657867245376 }, { "copy_logits_max": -5.692286968231201, "copy_logits_min": -750000128.0, "copy_num_tokens": 410.5625, "epoch": 1.823487362777636, "gen_logits_max": 4.225122451782227, "gen_logits_mean": -15.600120544433594, "gen_logits_min": -28.248369216918945, "gen_logits_std": 3.3825817108154297, "gen_loss": 0.24623329937458038, "grad_norm": 0.3445158006910888, "learning_rate": 1.9650947368421053e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9973985403776169, "mean_gen_accuracy": 0.882859081029892, "mean_token_accuracy": 0.9109390377998352, "num_tokens": 115683549.0, "sample_num_tokens": 7672.75, "step": 8929, "total_num_tokens": 115714240.0, "z_loss": 0.0004051715659443289 }, { "copy_logits_max": -7.274977207183838, "copy_logits_min": -750000000.0, "copy_num_tokens": 576.875, "epoch": 1.8236916007148327, "gen_logits_max": 3.582886219024658, "gen_logits_mean": -17.21924591064453, "gen_logits_min": -29.220338821411133, "gen_logits_std": 3.4407501220703125, "gen_loss": 0.2719743549823761, "grad_norm": 0.3563451242843696, "learning_rate": 1.9649684210526318e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9969047755002975, "mean_gen_accuracy": 0.8708862215280533, "mean_token_accuracy": 0.9023345708847046, "num_tokens": 115954683.0, "sample_num_tokens": 9207.25, "step": 8930, "total_num_tokens": 115991512.0, "z_loss": 0.00045999186113476753 }, { "copy_logits_max": -5.078874588012695, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.4375, "epoch": 1.8238958386520296, "gen_logits_max": 3.020961046218872, "gen_logits_mean": -17.56780242919922, "gen_logits_min": -29.63764190673828, "gen_logits_std": 3.449582576751709, "gen_loss": 0.2729736268520355, "grad_norm": 0.3780432548795348, "learning_rate": 1.964842105263158e-05, "loss": 0.282, "mean_copy_accuracy": 0.996403768658638, "mean_gen_accuracy": 0.8753582239151001, "mean_token_accuracy": 0.9057765454053879, "num_tokens": 116214479.0, "sample_num_tokens": 9833.75, "step": 8931, "total_num_tokens": 116253814.0, "z_loss": 0.0004354697884991765 }, { "copy_logits_max": -4.336080551147461, "copy_logits_min": -750000064.0, "copy_num_tokens": 577.1875, "epoch": 1.8241000765892266, "gen_logits_max": 3.5656208992004395, "gen_logits_mean": -16.211055755615234, "gen_logits_min": -28.4207706451416, "gen_logits_std": 3.4085171222686768, "gen_loss": 0.28553086519241333, "grad_norm": 0.3354826508976667, "learning_rate": 1.9647157894736843e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9977158457040787, "mean_gen_accuracy": 0.8741068243980408, "mean_token_accuracy": 0.9051648825407028, "num_tokens": 116482802.0, "sample_num_tokens": 8986.5, "step": 8932, "total_num_tokens": 116518748.0, "z_loss": 0.000509214645717293 }, { "copy_logits_max": -6.453293800354004, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.9375, "epoch": 1.8243043145264233, "gen_logits_max": 3.388446807861328, "gen_logits_mean": -18.98420524597168, "gen_logits_min": -30.794601440429688, "gen_logits_std": 3.4777987003326416, "gen_loss": 0.2862872779369354, "grad_norm": 0.6106241250384231, "learning_rate": 1.9645894736842104e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9966207295656204, "mean_gen_accuracy": 0.8774377256631851, "mean_token_accuracy": 0.9083513766527176, "num_tokens": 116771009.0, "sample_num_tokens": 7604.25, "step": 8933, "total_num_tokens": 116801426.0, "z_loss": 0.0005044363206252456 }, { "copy_logits_max": -5.98282527923584, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.25, "epoch": 1.82450855246362, "gen_logits_max": 4.671206474304199, "gen_logits_mean": -14.777402877807617, "gen_logits_min": -27.258525848388672, "gen_logits_std": 3.352644681930542, "gen_loss": 0.29293566942214966, "grad_norm": 0.3374322270409696, "learning_rate": 1.9644631578947368e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9974383264780045, "mean_gen_accuracy": 0.8848550021648407, "mean_token_accuracy": 0.910092368721962, "num_tokens": 117040675.0, "sample_num_tokens": 8331.25, "step": 8934, "total_num_tokens": 117074000.0, "z_loss": 0.00050587416626513 }, { "copy_logits_max": -6.1767425537109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.625, "epoch": 1.824712790400817, "gen_logits_max": 4.325905799865723, "gen_logits_mean": -17.10087776184082, "gen_logits_min": -29.436988830566406, "gen_logits_std": 3.4376440048217773, "gen_loss": 0.2985534071922302, "grad_norm": 0.36964201372196004, "learning_rate": 1.9643368421052632e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9963930249214172, "mean_gen_accuracy": 0.8776655793190002, "mean_token_accuracy": 0.9039345532655716, "num_tokens": 117299299.0, "sample_num_tokens": 7698.25, "step": 8935, "total_num_tokens": 117330092.0, "z_loss": 0.00046186294639483094 }, { "copy_logits_max": -6.4855475425720215, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.9375, "epoch": 1.8249170283380138, "gen_logits_max": 3.343295097351074, "gen_logits_mean": -17.65768814086914, "gen_logits_min": -30.035356521606445, "gen_logits_std": 3.477292537689209, "gen_loss": 0.2505435347557068, "grad_norm": 0.32886806963452436, "learning_rate": 1.9642105263157897e-05, "loss": 0.2724, "mean_copy_accuracy": 0.9971254169940948, "mean_gen_accuracy": 0.8808716535568237, "mean_token_accuracy": 0.9071268439292908, "num_tokens": 117565480.0, "sample_num_tokens": 9852.5, "step": 8936, "total_num_tokens": 117604890.0, "z_loss": 0.000429579900810495 }, { "copy_logits_max": -6.891930103302002, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.0, "epoch": 1.8251212662752105, "gen_logits_max": 2.854159355163574, "gen_logits_mean": -17.817855834960938, "gen_logits_min": -30.589298248291016, "gen_logits_std": 3.4894261360168457, "gen_loss": 0.24927805364131927, "grad_norm": 0.35386329602964367, "learning_rate": 1.9640842105263158e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9970817565917969, "mean_gen_accuracy": 0.8761194795370102, "mean_token_accuracy": 0.9062370806932449, "num_tokens": 117834387.0, "sample_num_tokens": 7534.75, "step": 8937, "total_num_tokens": 117864526.0, "z_loss": 0.0003994012367911637 }, { "copy_logits_max": -8.416298866271973, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.875, "epoch": 1.8253255042124075, "gen_logits_max": 4.225437641143799, "gen_logits_mean": -16.091890335083008, "gen_logits_min": -28.65252685546875, "gen_logits_std": 3.4090285301208496, "gen_loss": 0.2758359909057617, "grad_norm": 0.37965096438069146, "learning_rate": 1.9639578947368422e-05, "loss": 0.2938, "mean_copy_accuracy": 0.9961937814950943, "mean_gen_accuracy": 0.8723853975534439, "mean_token_accuracy": 0.9002548903226852, "num_tokens": 118089862.0, "sample_num_tokens": 8394.0, "step": 8938, "total_num_tokens": 118123438.0, "z_loss": 0.0003860604192595929 }, { "copy_logits_max": -5.913919448852539, "copy_logits_min": -750000064.0, "copy_num_tokens": 394.0625, "epoch": 1.8255297421496044, "gen_logits_max": 2.8353188037872314, "gen_logits_mean": -17.069190979003906, "gen_logits_min": -29.598800659179688, "gen_logits_std": 3.4591312408447266, "gen_loss": 0.273745596408844, "grad_norm": 0.354442502450797, "learning_rate": 1.9638315789473683e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9978179931640625, "mean_gen_accuracy": 0.8773180991411209, "mean_token_accuracy": 0.9076568633317947, "num_tokens": 118365096.0, "sample_num_tokens": 7248.0, "step": 8939, "total_num_tokens": 118394088.0, "z_loss": 0.00039553476381115615 }, { "copy_logits_max": -5.526894569396973, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.6875, "epoch": 1.825733980086801, "gen_logits_max": 3.3885209560394287, "gen_logits_mean": -17.060367584228516, "gen_logits_min": -29.439895629882812, "gen_logits_std": 3.4544153213500977, "gen_loss": 0.272855669260025, "grad_norm": 0.37495096018559576, "learning_rate": 1.9637052631578947e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9968019276857376, "mean_gen_accuracy": 0.8769871890544891, "mean_token_accuracy": 0.9009451866149902, "num_tokens": 118605971.0, "sample_num_tokens": 9191.75, "step": 8940, "total_num_tokens": 118642738.0, "z_loss": 0.0004182583070360124 }, { "copy_logits_max": -8.097460746765137, "copy_logits_min": -750000000.0, "copy_num_tokens": 695.4375, "epoch": 1.8259382180239978, "gen_logits_max": 2.8483407497406006, "gen_logits_mean": -17.143413543701172, "gen_logits_min": -29.36587142944336, "gen_logits_std": 3.4735195636749268, "gen_loss": 0.2379307597875595, "grad_norm": 0.36554027255464727, "learning_rate": 1.963578947368421e-05, "loss": 0.2488, "mean_copy_accuracy": 0.9968370199203491, "mean_gen_accuracy": 0.8855580985546112, "mean_token_accuracy": 0.9155871421098709, "num_tokens": 118883871.0, "sample_num_tokens": 10335.75, "step": 8941, "total_num_tokens": 118925214.0, "z_loss": 0.00034529739059507847 }, { "copy_logits_max": -7.37410831451416, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.1875, "epoch": 1.826142455961195, "gen_logits_max": 3.3581976890563965, "gen_logits_mean": -17.18042755126953, "gen_logits_min": -29.69652557373047, "gen_logits_std": 3.474832534790039, "gen_loss": 0.25904983282089233, "grad_norm": 0.3353353122379793, "learning_rate": 1.9634526315789472e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9971270263195038, "mean_gen_accuracy": 0.8809521198272705, "mean_token_accuracy": 0.9107448160648346, "num_tokens": 119171654.0, "sample_num_tokens": 9485.5, "step": 8942, "total_num_tokens": 119209596.0, "z_loss": 0.00037727560265921056 }, { "copy_logits_max": -5.726482391357422, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.4375, "epoch": 1.8263466938983917, "gen_logits_max": 2.3862905502319336, "gen_logits_mean": -17.944515228271484, "gen_logits_min": -30.5502986907959, "gen_logits_std": 3.4914989471435547, "gen_loss": 0.2690211534500122, "grad_norm": 0.3638496328382071, "learning_rate": 1.963326315789474e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9963007271289825, "mean_gen_accuracy": 0.8781571537256241, "mean_token_accuracy": 0.9085360914468765, "num_tokens": 119430345.0, "sample_num_tokens": 9315.25, "step": 8943, "total_num_tokens": 119467606.0, "z_loss": 0.00046400216524489224 }, { "copy_logits_max": -7.145230770111084, "copy_logits_min": -687500032.0, "copy_num_tokens": 453.8125, "epoch": 1.8265509318355884, "gen_logits_max": 3.462146759033203, "gen_logits_mean": -17.12649154663086, "gen_logits_min": -29.498939514160156, "gen_logits_std": 3.446829319000244, "gen_loss": 0.2732797861099243, "grad_norm": 0.3574361082417323, "learning_rate": 1.9632e-05, "loss": 0.281, "mean_copy_accuracy": 0.9967842847108841, "mean_gen_accuracy": 0.8735550493001938, "mean_token_accuracy": 0.9041868597269058, "num_tokens": 119710982.0, "sample_num_tokens": 8161.5, "step": 8944, "total_num_tokens": 119743628.0, "z_loss": 0.0004379404126666486 }, { "copy_logits_max": -8.396647453308105, "copy_logits_min": -750000064.0, "copy_num_tokens": 632.125, "epoch": 1.8267551697727853, "gen_logits_max": 4.0395002365112305, "gen_logits_mean": -16.116758346557617, "gen_logits_min": -28.758384704589844, "gen_logits_std": 3.402215003967285, "gen_loss": 0.2698863744735718, "grad_norm": 0.3820179586508661, "learning_rate": 1.9630736842105265e-05, "loss": 0.2945, "mean_copy_accuracy": 0.9964753091335297, "mean_gen_accuracy": 0.8744735568761826, "mean_token_accuracy": 0.8995377868413925, "num_tokens": 119971326.0, "sample_num_tokens": 9253.5, "step": 8945, "total_num_tokens": 120008340.0, "z_loss": 0.00045140631846152246 }, { "copy_logits_max": -7.356743335723877, "copy_logits_min": -750000064.0, "copy_num_tokens": 575.25, "epoch": 1.8269594077099822, "gen_logits_max": 3.7707266807556152, "gen_logits_mean": -18.088115692138672, "gen_logits_min": -30.28636932373047, "gen_logits_std": 3.493114471435547, "gen_loss": 0.257510781288147, "grad_norm": 0.34113388171878783, "learning_rate": 1.9629473684210526e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9974434226751328, "mean_gen_accuracy": 0.8791036009788513, "mean_token_accuracy": 0.9101209789514542, "num_tokens": 120263547.0, "sample_num_tokens": 10314.75, "step": 8946, "total_num_tokens": 120304806.0, "z_loss": 0.0004783944459632039 }, { "copy_logits_max": -6.798872947692871, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.75, "epoch": 1.827163645647179, "gen_logits_max": 4.396885395050049, "gen_logits_mean": -16.054401397705078, "gen_logits_min": -28.591453552246094, "gen_logits_std": 3.427967071533203, "gen_loss": 0.2649309039115906, "grad_norm": 0.3660372748739437, "learning_rate": 1.962821052631579e-05, "loss": 0.274, "mean_copy_accuracy": 0.9960983544588089, "mean_gen_accuracy": 0.8794519603252411, "mean_token_accuracy": 0.9054723829030991, "num_tokens": 120539658.0, "sample_num_tokens": 9273.0, "step": 8947, "total_num_tokens": 120576750.0, "z_loss": 0.0004411934351082891 }, { "copy_logits_max": -7.561285018920898, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.5625, "epoch": 1.8273678835843759, "gen_logits_max": 3.8910415172576904, "gen_logits_mean": -17.663854598999023, "gen_logits_min": -30.184600830078125, "gen_logits_std": 3.474912643432617, "gen_loss": 0.2860233187675476, "grad_norm": 0.36218907926267435, "learning_rate": 1.962694736842105e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9972217082977295, "mean_gen_accuracy": 0.8800740838050842, "mean_token_accuracy": 0.9064726531505585, "num_tokens": 120792669.0, "sample_num_tokens": 8441.75, "step": 8948, "total_num_tokens": 120826436.0, "z_loss": 0.0004476670583244413 }, { "copy_logits_max": -8.80876636505127, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.25, "epoch": 1.8275721215215728, "gen_logits_max": 4.773099899291992, "gen_logits_mean": -16.13722801208496, "gen_logits_min": -28.320375442504883, "gen_logits_std": 3.4101314544677734, "gen_loss": 0.2946416139602661, "grad_norm": 0.36806587062157115, "learning_rate": 1.9625684210526316e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9973222464323044, "mean_gen_accuracy": 0.8752159476280212, "mean_token_accuracy": 0.9027220457792282, "num_tokens": 121047060.0, "sample_num_tokens": 9258.5, "step": 8949, "total_num_tokens": 121084094.0, "z_loss": 0.00042269332334399223 }, { "copy_logits_max": -8.121387481689453, "copy_logits_min": -687500032.0, "copy_num_tokens": 450.375, "epoch": 1.8277763594587695, "gen_logits_max": 5.6643476486206055, "gen_logits_mean": -14.160154342651367, "gen_logits_min": -26.830177307128906, "gen_logits_std": 3.3588621616363525, "gen_loss": 0.31020206212997437, "grad_norm": 0.3600762902158254, "learning_rate": 1.9624421052631577e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9969412237405777, "mean_gen_accuracy": 0.8717138767242432, "mean_token_accuracy": 0.9045986384153366, "num_tokens": 121331613.0, "sample_num_tokens": 8035.75, "step": 8950, "total_num_tokens": 121363756.0, "z_loss": 0.0005033440538682044 }, { "copy_logits_max": -9.47018051147461, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.9375, "epoch": 1.8279805973959662, "gen_logits_max": 5.262409210205078, "gen_logits_mean": -16.087139129638672, "gen_logits_min": -28.36395263671875, "gen_logits_std": 3.43253493309021, "gen_loss": 0.22729361057281494, "grad_norm": 0.34143791310840377, "learning_rate": 1.9623157894736844e-05, "loss": 0.2553, "mean_copy_accuracy": 0.9970997720956802, "mean_gen_accuracy": 0.8878071755170822, "mean_token_accuracy": 0.9117505103349686, "num_tokens": 121615029.0, "sample_num_tokens": 8946.25, "step": 8951, "total_num_tokens": 121650814.0, "z_loss": 0.00034348166082054377 }, { "copy_logits_max": -7.330721855163574, "copy_logits_min": -750000064.0, "copy_num_tokens": 417.4375, "epoch": 1.8281848353331631, "gen_logits_max": 4.656147480010986, "gen_logits_mean": -15.76041316986084, "gen_logits_min": -28.245140075683594, "gen_logits_std": 3.4144253730773926, "gen_loss": 0.2845207452774048, "grad_norm": 0.3613728221730745, "learning_rate": 1.9621894736842105e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9974116384983063, "mean_gen_accuracy": 0.8745357096195221, "mean_token_accuracy": 0.9042910486459732, "num_tokens": 121865823.0, "sample_num_tokens": 7942.75, "step": 8952, "total_num_tokens": 121897594.0, "z_loss": 0.00046041104360483587 }, { "copy_logits_max": -7.61538028717041, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.5, "epoch": 1.82838907327036, "gen_logits_max": 4.612878322601318, "gen_logits_mean": -16.135892868041992, "gen_logits_min": -28.885766983032227, "gen_logits_std": 3.4118285179138184, "gen_loss": 0.32202938199043274, "grad_norm": 0.35926763354018576, "learning_rate": 1.962063157894737e-05, "loss": 0.2761, "mean_copy_accuracy": 0.99674092233181, "mean_gen_accuracy": 0.8777453899383545, "mean_token_accuracy": 0.9059614539146423, "num_tokens": 122136232.0, "sample_num_tokens": 7521.5, "step": 8953, "total_num_tokens": 122166318.0, "z_loss": 0.0005441012908704579 }, { "copy_logits_max": -7.549826622009277, "copy_logits_min": -750000064.0, "copy_num_tokens": 420.5625, "epoch": 1.8285933112075567, "gen_logits_max": 2.9408013820648193, "gen_logits_mean": -18.541454315185547, "gen_logits_min": -30.8609619140625, "gen_logits_std": 3.5387625694274902, "gen_loss": 0.25891953706741333, "grad_norm": 0.3588575514404227, "learning_rate": 1.9619368421052634e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9967246651649475, "mean_gen_accuracy": 0.8753098249435425, "mean_token_accuracy": 0.9074860960245132, "num_tokens": 122421489.0, "sample_num_tokens": 7656.25, "step": 8954, "total_num_tokens": 122452114.0, "z_loss": 0.0003821837017312646 }, { "copy_logits_max": -8.945850372314453, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.125, "epoch": 1.8287975491447537, "gen_logits_max": 3.7948966026306152, "gen_logits_mean": -17.20128059387207, "gen_logits_min": -29.6242733001709, "gen_logits_std": 3.4857709407806396, "gen_loss": 0.2525486648082733, "grad_norm": 0.3336032324985826, "learning_rate": 1.9618105263157895e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9975394308567047, "mean_gen_accuracy": 0.8802777528762817, "mean_token_accuracy": 0.908761128783226, "num_tokens": 122693825.0, "sample_num_tokens": 7718.25, "step": 8955, "total_num_tokens": 122724698.0, "z_loss": 0.0003786128363572061 }, { "copy_logits_max": -6.957368850708008, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.75, "epoch": 1.8290017870819506, "gen_logits_max": 4.263169288635254, "gen_logits_mean": -16.599458694458008, "gen_logits_min": -28.742807388305664, "gen_logits_std": 3.414525032043457, "gen_loss": 0.2978500723838806, "grad_norm": 0.3612817709635677, "learning_rate": 1.961684210526316e-05, "loss": 0.3013, "mean_copy_accuracy": 0.9961660653352737, "mean_gen_accuracy": 0.8671531677246094, "mean_token_accuracy": 0.8984051048755646, "num_tokens": 122958012.0, "sample_num_tokens": 8561.0, "step": 8956, "total_num_tokens": 122992256.0, "z_loss": 0.00046727590961381793 }, { "copy_logits_max": -7.737809181213379, "copy_logits_min": -687500032.0, "copy_num_tokens": 380.4375, "epoch": 1.8292060250191473, "gen_logits_max": 4.173365116119385, "gen_logits_mean": -17.571022033691406, "gen_logits_min": -29.642356872558594, "gen_logits_std": 3.4762179851531982, "gen_loss": 0.274251788854599, "grad_norm": 0.35334427299689747, "learning_rate": 1.961557894736842e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9969119131565094, "mean_gen_accuracy": 0.8783739507198334, "mean_token_accuracy": 0.9047289043664932, "num_tokens": 123223121.0, "sample_num_tokens": 8558.25, "step": 8957, "total_num_tokens": 123257354.0, "z_loss": 0.0003771561896428466 }, { "copy_logits_max": -6.15911865234375, "copy_logits_min": -750000064.0, "copy_num_tokens": 636.25, "epoch": 1.829410262956344, "gen_logits_max": 3.298149824142456, "gen_logits_mean": -16.737937927246094, "gen_logits_min": -29.48114776611328, "gen_logits_std": 3.472238302230835, "gen_loss": 0.26739180088043213, "grad_norm": 0.34036137700356583, "learning_rate": 1.9614315789473685e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9967282712459564, "mean_gen_accuracy": 0.8821103423833847, "mean_token_accuracy": 0.9100199192762375, "num_tokens": 123500365.0, "sample_num_tokens": 9785.75, "step": 8958, "total_num_tokens": 123539508.0, "z_loss": 0.0004009865515399724 }, { "copy_logits_max": -6.590984344482422, "copy_logits_min": -750000000.0, "copy_num_tokens": 509.625, "epoch": 1.829614500893541, "gen_logits_max": 3.170083999633789, "gen_logits_mean": -17.784517288208008, "gen_logits_min": -30.208694458007812, "gen_logits_std": 3.5138232707977295, "gen_loss": 0.2651866376399994, "grad_norm": 0.35250014873332985, "learning_rate": 1.9613052631578945e-05, "loss": 0.2486, "mean_copy_accuracy": 0.9962106198072433, "mean_gen_accuracy": 0.8861059993505478, "mean_token_accuracy": 0.9164864420890808, "num_tokens": 123793289.0, "sample_num_tokens": 8550.25, "step": 8959, "total_num_tokens": 123827490.0, "z_loss": 0.000404288643039763 }, { "copy_logits_max": -7.015472412109375, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.625, "epoch": 1.8298187388307379, "gen_logits_max": 4.974088191986084, "gen_logits_mean": -15.591712951660156, "gen_logits_min": -27.73231315612793, "gen_logits_std": 3.4076356887817383, "gen_loss": 0.27718833088874817, "grad_norm": 0.351951134328679, "learning_rate": 1.9611789473684213e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9977199137210846, "mean_gen_accuracy": 0.8770439922809601, "mean_token_accuracy": 0.90787373483181, "num_tokens": 124055023.0, "sample_num_tokens": 8189.25, "step": 8960, "total_num_tokens": 124087780.0, "z_loss": 0.00042702394421212375 }, { "copy_logits_max": -5.252846717834473, "copy_logits_min": -750000128.0, "copy_num_tokens": 498.875, "epoch": 1.8300229767679346, "gen_logits_max": 4.980255126953125, "gen_logits_mean": -14.58952522277832, "gen_logits_min": -27.44622802734375, "gen_logits_std": 3.369089126586914, "gen_loss": 0.25250762701034546, "grad_norm": 0.34830689259440156, "learning_rate": 1.9610526315789474e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9969071298837662, "mean_gen_accuracy": 0.8783466070890427, "mean_token_accuracy": 0.9072563350200653, "num_tokens": 124315920.0, "sample_num_tokens": 8485.5, "step": 8961, "total_num_tokens": 124349862.0, "z_loss": 0.0004041612264700234 }, { "copy_logits_max": -3.4276368618011475, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.1875, "epoch": 1.8302272147051315, "gen_logits_max": 5.671084403991699, "gen_logits_mean": -14.230916976928711, "gen_logits_min": -26.880599975585938, "gen_logits_std": 3.3582992553710938, "gen_loss": 0.28876614570617676, "grad_norm": 0.3592968300345218, "learning_rate": 1.960926315789474e-05, "loss": 0.2764, "mean_copy_accuracy": 0.9965142905712128, "mean_gen_accuracy": 0.8788815140724182, "mean_token_accuracy": 0.9050464779138565, "num_tokens": 124577074.0, "sample_num_tokens": 9284.0, "step": 8962, "total_num_tokens": 124614210.0, "z_loss": 0.0005264043575152755 }, { "copy_logits_max": -5.231392860412598, "copy_logits_min": -750000064.0, "copy_num_tokens": 539.375, "epoch": 1.8304314526423284, "gen_logits_max": 3.660871744155884, "gen_logits_mean": -16.61870002746582, "gen_logits_min": -29.052963256835938, "gen_logits_std": 3.4542672634124756, "gen_loss": 0.265829861164093, "grad_norm": 0.3427250220265965, "learning_rate": 1.9608e-05, "loss": 0.2629, "mean_copy_accuracy": 0.9976241886615753, "mean_gen_accuracy": 0.8766151666641235, "mean_token_accuracy": 0.9089127331972122, "num_tokens": 124851940.0, "sample_num_tokens": 7810.5, "step": 8963, "total_num_tokens": 124883182.0, "z_loss": 0.00043101701885461807 }, { "copy_logits_max": -6.772551536560059, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.9375, "epoch": 1.8306356905795251, "gen_logits_max": 4.738956451416016, "gen_logits_mean": -15.493483543395996, "gen_logits_min": -28.063518524169922, "gen_logits_std": 3.383309841156006, "gen_loss": 0.26129037141799927, "grad_norm": 0.3641046366044161, "learning_rate": 1.9606736842105264e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9964405596256256, "mean_gen_accuracy": 0.8763319402933121, "mean_token_accuracy": 0.9055531173944473, "num_tokens": 125112433.0, "sample_num_tokens": 8276.75, "step": 8964, "total_num_tokens": 125145540.0, "z_loss": 0.0004469262494239956 }, { "copy_logits_max": -5.992318630218506, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.6875, "epoch": 1.8308399285167218, "gen_logits_max": 4.937658309936523, "gen_logits_mean": -15.845991134643555, "gen_logits_min": -27.940166473388672, "gen_logits_std": 3.373427391052246, "gen_loss": 0.2793838381767273, "grad_norm": 0.35125685617141894, "learning_rate": 1.9605473684210525e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9961191266775131, "mean_gen_accuracy": 0.8818314075469971, "mean_token_accuracy": 0.9072137326002121, "num_tokens": 125388350.0, "sample_num_tokens": 7113.5, "step": 8965, "total_num_tokens": 125416804.0, "z_loss": 0.00038518087239935994 }, { "copy_logits_max": -5.539348602294922, "copy_logits_min": -750000000.0, "copy_num_tokens": 294.0625, "epoch": 1.8310441664539188, "gen_logits_max": 5.4131178855896, "gen_logits_mean": -15.187372207641602, "gen_logits_min": -28.143775939941406, "gen_logits_std": 3.402660608291626, "gen_loss": 0.2903367280960083, "grad_norm": 0.45925828766811144, "learning_rate": 1.960421052631579e-05, "loss": 0.3018, "mean_copy_accuracy": 0.9966868162155151, "mean_gen_accuracy": 0.8700233995914459, "mean_token_accuracy": 0.8986842185258865, "num_tokens": 125644850.0, "sample_num_tokens": 7159.5, "step": 8966, "total_num_tokens": 125673488.0, "z_loss": 0.0004801309551112354 }, { "copy_logits_max": -5.065035343170166, "copy_logits_min": -750000128.0, "copy_num_tokens": 500.5625, "epoch": 1.8312484043911157, "gen_logits_max": 3.734691619873047, "gen_logits_mean": -16.537242889404297, "gen_logits_min": -29.205490112304688, "gen_logits_std": 3.4281489849090576, "gen_loss": 0.28392329812049866, "grad_norm": 0.3296558522149641, "learning_rate": 1.9602947368421053e-05, "loss": 0.2609, "mean_copy_accuracy": 0.997676745057106, "mean_gen_accuracy": 0.8834547698497772, "mean_token_accuracy": 0.9114785343408585, "num_tokens": 125896039.0, "sample_num_tokens": 9113.25, "step": 8967, "total_num_tokens": 125932492.0, "z_loss": 0.0004201137344352901 }, { "copy_logits_max": -4.9153056144714355, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.3125, "epoch": 1.8314526423283124, "gen_logits_max": 4.682275772094727, "gen_logits_mean": -16.254810333251953, "gen_logits_min": -28.59685516357422, "gen_logits_std": 3.413113594055176, "gen_loss": 0.2768334150314331, "grad_norm": 0.3408833597272597, "learning_rate": 1.9601684210526317e-05, "loss": 0.266, "mean_copy_accuracy": 0.9976236969232559, "mean_gen_accuracy": 0.8751399666070938, "mean_token_accuracy": 0.9116333872079849, "num_tokens": 126212421.0, "sample_num_tokens": 8561.25, "step": 8968, "total_num_tokens": 126246666.0, "z_loss": 0.0004090830043423921 }, { "copy_logits_max": -5.04741096496582, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.0625, "epoch": 1.8316568802655093, "gen_logits_max": 3.0859196186065674, "gen_logits_mean": -17.269920349121094, "gen_logits_min": -29.828767776489258, "gen_logits_std": 3.464967727661133, "gen_loss": 0.30331557989120483, "grad_norm": 0.3518717147075908, "learning_rate": 1.9600421052631582e-05, "loss": 0.2808, "mean_copy_accuracy": 0.9974022060632706, "mean_gen_accuracy": 0.8719492703676224, "mean_token_accuracy": 0.9042354673147202, "num_tokens": 126479720.0, "sample_num_tokens": 9492.0, "step": 8969, "total_num_tokens": 126517688.0, "z_loss": 0.0004185986763332039 }, { "copy_logits_max": -6.342800140380859, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.5, "epoch": 1.8318611182027063, "gen_logits_max": 4.481284141540527, "gen_logits_mean": -14.721404075622559, "gen_logits_min": -27.360023498535156, "gen_logits_std": 3.3188960552215576, "gen_loss": 0.24081388115882874, "grad_norm": 0.34776662650866125, "learning_rate": 1.9599157894736843e-05, "loss": 0.2585, "mean_copy_accuracy": 0.9969494491815567, "mean_gen_accuracy": 0.8813064992427826, "mean_token_accuracy": 0.9107706099748611, "num_tokens": 126747313.0, "sample_num_tokens": 8902.75, "step": 8970, "total_num_tokens": 126782924.0, "z_loss": 0.00029833216103725135 }, { "copy_logits_max": -7.307638168334961, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.625, "epoch": 1.832065356139903, "gen_logits_max": 3.083850860595703, "gen_logits_mean": -18.931982040405273, "gen_logits_min": -31.27230453491211, "gen_logits_std": 3.5407562255859375, "gen_loss": 0.25807464122772217, "grad_norm": 0.3394322604557749, "learning_rate": 1.9597894736842107e-05, "loss": 0.2616, "mean_copy_accuracy": 0.9976456612348557, "mean_gen_accuracy": 0.8825362324714661, "mean_token_accuracy": 0.9121736437082291, "num_tokens": 127024942.0, "sample_num_tokens": 7162.0, "step": 8971, "total_num_tokens": 127053590.0, "z_loss": 0.000324457127135247 }, { "copy_logits_max": -4.949998378753662, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.5, "epoch": 1.8322695940770997, "gen_logits_max": 3.98519229888916, "gen_logits_mean": -15.853837013244629, "gen_logits_min": -28.68185043334961, "gen_logits_std": 3.414818286895752, "gen_loss": 0.2976973056793213, "grad_norm": 0.33487578938180596, "learning_rate": 1.9596631578947368e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9972972571849823, "mean_gen_accuracy": 0.8727502673864365, "mean_token_accuracy": 0.9032803773880005, "num_tokens": 127297024.0, "sample_num_tokens": 8173.0, "step": 8972, "total_num_tokens": 127329716.0, "z_loss": 0.0004118282231502235 }, { "copy_logits_max": -2.26625394821167, "copy_logits_min": -750000064.0, "copy_num_tokens": 534.1875, "epoch": 1.8324738320142968, "gen_logits_max": 2.8836660385131836, "gen_logits_mean": -17.457061767578125, "gen_logits_min": -30.134193420410156, "gen_logits_std": 3.487863540649414, "gen_loss": 0.26765960454940796, "grad_norm": 0.3368627904770099, "learning_rate": 1.9595368421052632e-05, "loss": 0.271, "mean_copy_accuracy": 0.9971171617507935, "mean_gen_accuracy": 0.8806093633174896, "mean_token_accuracy": 0.9066246598958969, "num_tokens": 127571491.0, "sample_num_tokens": 8604.25, "step": 8973, "total_num_tokens": 127605908.0, "z_loss": 0.00039067253237590194 }, { "copy_logits_max": -3.32304310798645, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.0625, "epoch": 1.8326780699514935, "gen_logits_max": 3.869169235229492, "gen_logits_mean": -16.924606323242188, "gen_logits_min": -29.277673721313477, "gen_logits_std": 3.4720966815948486, "gen_loss": 0.27668508887290955, "grad_norm": 0.34624445465121934, "learning_rate": 1.9594105263157893e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9971785992383957, "mean_gen_accuracy": 0.8691027462482452, "mean_token_accuracy": 0.9028177410364151, "num_tokens": 127845058.0, "sample_num_tokens": 8859.0, "step": 8974, "total_num_tokens": 127880494.0, "z_loss": 0.00040896146674640477 }, { "copy_logits_max": -4.515068054199219, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.5, "epoch": 1.8328823078886902, "gen_logits_max": 4.358882904052734, "gen_logits_mean": -16.05772590637207, "gen_logits_min": -28.20635414123535, "gen_logits_std": 3.4249393939971924, "gen_loss": 0.2777317464351654, "grad_norm": 0.34628345897003726, "learning_rate": 1.9592842105263157e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9977900236845016, "mean_gen_accuracy": 0.87777279317379, "mean_token_accuracy": 0.9063284397125244, "num_tokens": 128113360.0, "sample_num_tokens": 7664.0, "step": 8975, "total_num_tokens": 128144016.0, "z_loss": 0.00041223526932299137 }, { "copy_logits_max": -6.378641128540039, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.25, "epoch": 1.8330865458258871, "gen_logits_max": 3.661074638366699, "gen_logits_mean": -17.083961486816406, "gen_logits_min": -29.471588134765625, "gen_logits_std": 3.4564735889434814, "gen_loss": 0.28020986914634705, "grad_norm": 0.37195585471728154, "learning_rate": 1.9591578947368422e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9963431358337402, "mean_gen_accuracy": 0.8804317116737366, "mean_token_accuracy": 0.9059568047523499, "num_tokens": 128363563.0, "sample_num_tokens": 7813.25, "step": 8976, "total_num_tokens": 128394816.0, "z_loss": 0.0004340045852586627 }, { "copy_logits_max": -6.058919429779053, "copy_logits_min": -750000000.0, "copy_num_tokens": 594.9375, "epoch": 1.833290783763084, "gen_logits_max": 2.8065590858459473, "gen_logits_mean": -17.721378326416016, "gen_logits_min": -30.344486236572266, "gen_logits_std": 3.5173277854919434, "gen_loss": 0.26777976751327515, "grad_norm": 0.3396775330380636, "learning_rate": 1.9590315789473686e-05, "loss": 0.2562, "mean_copy_accuracy": 0.9973482340574265, "mean_gen_accuracy": 0.8809910267591476, "mean_token_accuracy": 0.9126812666654587, "num_tokens": 128644445.0, "sample_num_tokens": 9335.25, "step": 8977, "total_num_tokens": 128681786.0, "z_loss": 0.00041739066364243627 }, { "copy_logits_max": -4.297080993652344, "copy_logits_min": -750000000.0, "copy_num_tokens": 656.6875, "epoch": 1.8334950217002808, "gen_logits_max": 3.401975154876709, "gen_logits_mean": -17.098785400390625, "gen_logits_min": -29.691221237182617, "gen_logits_std": 3.475386381149292, "gen_loss": 0.24343663454055786, "grad_norm": 0.3274451242031619, "learning_rate": 1.9589052631578947e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9981750100851059, "mean_gen_accuracy": 0.8770638853311539, "mean_token_accuracy": 0.9126127511262894, "num_tokens": 128942730.0, "sample_num_tokens": 10170.5, "step": 8978, "total_num_tokens": 128983412.0, "z_loss": 0.0003802632272709161 }, { "copy_logits_max": -7.310869216918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.0, "epoch": 1.8336992596374777, "gen_logits_max": 3.57541561126709, "gen_logits_mean": -17.910297393798828, "gen_logits_min": -29.944353103637695, "gen_logits_std": 3.496894359588623, "gen_loss": 0.2535029649734497, "grad_norm": 0.3670177206607666, "learning_rate": 1.958778947368421e-05, "loss": 0.2893, "mean_copy_accuracy": 0.9974284321069717, "mean_gen_accuracy": 0.8723207861185074, "mean_token_accuracy": 0.9025453478097916, "num_tokens": 129211534.0, "sample_num_tokens": 9007.0, "step": 8979, "total_num_tokens": 129247562.0, "z_loss": 0.0003613771987147629 }, { "copy_logits_max": -6.768063545227051, "copy_logits_min": -750000000.0, "copy_num_tokens": 323.375, "epoch": 1.8339034975746746, "gen_logits_max": 3.4787514209747314, "gen_logits_mean": -18.752887725830078, "gen_logits_min": -30.59034538269043, "gen_logits_std": 3.518857717514038, "gen_loss": 0.2907768785953522, "grad_norm": 0.3650541387120552, "learning_rate": 1.9586526315789476e-05, "loss": 0.2965, "mean_copy_accuracy": 0.9966742843389511, "mean_gen_accuracy": 0.870873898267746, "mean_token_accuracy": 0.8987506628036499, "num_tokens": 129466036.0, "sample_num_tokens": 7985.0, "step": 8980, "total_num_tokens": 129497976.0, "z_loss": 0.00039806548738852143 }, { "copy_logits_max": -3.8022797107696533, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.4375, "epoch": 1.8341077355118713, "gen_logits_max": 3.0243077278137207, "gen_logits_mean": -17.531509399414062, "gen_logits_min": -29.923969268798828, "gen_logits_std": 3.4583396911621094, "gen_loss": 0.28555721044540405, "grad_norm": 0.33516804379743886, "learning_rate": 1.9585263157894737e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9972823411226273, "mean_gen_accuracy": 0.8737304210662842, "mean_token_accuracy": 0.9034678637981415, "num_tokens": 129723392.0, "sample_num_tokens": 8917.5, "step": 8981, "total_num_tokens": 129759062.0, "z_loss": 0.00041337547008879483 }, { "copy_logits_max": -4.450904846191406, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.25, "epoch": 1.834311973449068, "gen_logits_max": 3.6037139892578125, "gen_logits_mean": -17.042314529418945, "gen_logits_min": -29.40993881225586, "gen_logits_std": 3.4932942390441895, "gen_loss": 0.2672373056411743, "grad_norm": 0.34527041905684996, "learning_rate": 1.9584e-05, "loss": 0.2812, "mean_copy_accuracy": 0.997630774974823, "mean_gen_accuracy": 0.868480309844017, "mean_token_accuracy": 0.9039954394102097, "num_tokens": 130009016.0, "sample_num_tokens": 8736.5, "step": 8982, "total_num_tokens": 130043962.0, "z_loss": 0.0003971825644839555 }, { "copy_logits_max": -6.499040126800537, "copy_logits_min": -687500032.0, "copy_num_tokens": 409.5625, "epoch": 1.834516211386265, "gen_logits_max": 4.504055976867676, "gen_logits_mean": -16.575565338134766, "gen_logits_min": -28.630264282226562, "gen_logits_std": 3.430114507675171, "gen_loss": 0.2691972851753235, "grad_norm": 0.3780395319382667, "learning_rate": 1.9582736842105262e-05, "loss": 0.2957, "mean_copy_accuracy": 0.9964911490678787, "mean_gen_accuracy": 0.8741313070058823, "mean_token_accuracy": 0.9006687849760056, "num_tokens": 130273671.0, "sample_num_tokens": 8975.75, "step": 8983, "total_num_tokens": 130309574.0, "z_loss": 0.0003620211500674486 }, { "copy_logits_max": -2.364804744720459, "copy_logits_min": -625000064.0, "copy_num_tokens": 558.0625, "epoch": 1.834720449323462, "gen_logits_max": 4.873544692993164, "gen_logits_mean": -14.625864028930664, "gen_logits_min": -27.058670043945312, "gen_logits_std": 3.408902168273926, "gen_loss": 0.24928812682628632, "grad_norm": 0.3536700110139923, "learning_rate": 1.958147368421053e-05, "loss": 0.2612, "mean_copy_accuracy": 0.997267097234726, "mean_gen_accuracy": 0.8809064775705338, "mean_token_accuracy": 0.9119111150503159, "num_tokens": 130535460.0, "sample_num_tokens": 8932.5, "step": 8984, "total_num_tokens": 130571190.0, "z_loss": 0.0003794546937569976 }, { "copy_logits_max": -2.2813310623168945, "copy_logits_min": -750000000.0, "copy_num_tokens": 827.625, "epoch": 1.8349246872606586, "gen_logits_max": 3.064120292663574, "gen_logits_mean": -17.346248626708984, "gen_logits_min": -29.827648162841797, "gen_logits_std": 3.502957344055176, "gen_loss": 0.27203136682510376, "grad_norm": 0.3181796939611226, "learning_rate": 1.958021052631579e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9974941313266754, "mean_gen_accuracy": 0.8777658194303513, "mean_token_accuracy": 0.913493424654007, "num_tokens": 130829525.0, "sample_num_tokens": 11069.75, "step": 8985, "total_num_tokens": 130873804.0, "z_loss": 0.0004130138549953699 }, { "copy_logits_max": -1.67878258228302, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.6875, "epoch": 1.8351289251978555, "gen_logits_max": 5.3107805252075195, "gen_logits_mean": -15.305695533752441, "gen_logits_min": -28.24864959716797, "gen_logits_std": 3.413041591644287, "gen_loss": 0.28357285261154175, "grad_norm": 0.35899396761153357, "learning_rate": 1.9578947368421055e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9977778047323227, "mean_gen_accuracy": 0.8796544522047043, "mean_token_accuracy": 0.9084870666265488, "num_tokens": 131107451.0, "sample_num_tokens": 8346.75, "step": 8986, "total_num_tokens": 131140838.0, "z_loss": 0.00043298257514834404 }, { "copy_logits_max": -2.1412038803100586, "copy_logits_min": -750000064.0, "copy_num_tokens": 433.625, "epoch": 1.8353331631350525, "gen_logits_max": 4.777325630187988, "gen_logits_mean": -15.280057907104492, "gen_logits_min": -28.25959014892578, "gen_logits_std": 3.387113571166992, "gen_loss": 0.28630807995796204, "grad_norm": 0.32791116165081985, "learning_rate": 1.9577684210526316e-05, "loss": 0.267, "mean_copy_accuracy": 0.9974612891674042, "mean_gen_accuracy": 0.8817462772130966, "mean_token_accuracy": 0.9111037105321884, "num_tokens": 131390114.0, "sample_num_tokens": 8850.5, "step": 8987, "total_num_tokens": 131425516.0, "z_loss": 0.0004778754082508385 }, { "copy_logits_max": -2.8204712867736816, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.8125, "epoch": 1.8355374010722492, "gen_logits_max": 4.009165287017822, "gen_logits_mean": -16.551654815673828, "gen_logits_min": -29.27903175354004, "gen_logits_std": 3.4577572345733643, "gen_loss": 0.2775226831436157, "grad_norm": 0.3481336650406496, "learning_rate": 1.957642105263158e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9977283924818039, "mean_gen_accuracy": 0.8769485801458359, "mean_token_accuracy": 0.9059343338012695, "num_tokens": 131672357.0, "sample_num_tokens": 8162.25, "step": 8988, "total_num_tokens": 131705006.0, "z_loss": 0.00043594848830252886 }, { "copy_logits_max": -1.3788461685180664, "copy_logits_min": -687500032.0, "copy_num_tokens": 400.375, "epoch": 1.8357416390094459, "gen_logits_max": 4.833907604217529, "gen_logits_mean": -15.881202697753906, "gen_logits_min": -28.189815521240234, "gen_logits_std": 3.424704074859619, "gen_loss": 0.2977783679962158, "grad_norm": 0.35741493826331716, "learning_rate": 1.957515789473684e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9966330528259277, "mean_gen_accuracy": 0.8762780725955963, "mean_token_accuracy": 0.9063507616519928, "num_tokens": 131942465.0, "sample_num_tokens": 7756.75, "step": 8989, "total_num_tokens": 131973492.0, "z_loss": 0.0004412229754962027 }, { "copy_logits_max": -4.149096488952637, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.1875, "epoch": 1.8359458769466428, "gen_logits_max": 4.102127552032471, "gen_logits_mean": -18.258468627929688, "gen_logits_min": -30.054841995239258, "gen_logits_std": 3.4699089527130127, "gen_loss": 0.31626760959625244, "grad_norm": 0.38239507794229677, "learning_rate": 1.9573894736842105e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9964495599269867, "mean_gen_accuracy": 0.8766463845968246, "mean_token_accuracy": 0.9035630077123642, "num_tokens": 132207908.0, "sample_num_tokens": 7949.5, "step": 8990, "total_num_tokens": 132239706.0, "z_loss": 0.0004311713273636997 }, { "copy_logits_max": -3.3356881141662598, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.4375, "epoch": 1.8361501148838397, "gen_logits_max": 4.941730499267578, "gen_logits_mean": -16.321321487426758, "gen_logits_min": -28.528478622436523, "gen_logits_std": 3.454805612564087, "gen_loss": 0.2614993453025818, "grad_norm": 0.3589562203889689, "learning_rate": 1.9572631578947366e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9968628436326981, "mean_gen_accuracy": 0.8772484958171844, "mean_token_accuracy": 0.9063337743282318, "num_tokens": 132455396.0, "sample_num_tokens": 9192.0, "step": 8991, "total_num_tokens": 132492164.0, "z_loss": 0.0003602202341426164 }, { "copy_logits_max": -0.6604810357093811, "copy_logits_min": -687500032.0, "copy_num_tokens": 564.5, "epoch": 1.8363543528210364, "gen_logits_max": 4.604752063751221, "gen_logits_mean": -15.03006649017334, "gen_logits_min": -27.139930725097656, "gen_logits_std": 3.3591043949127197, "gen_loss": 0.3013705015182495, "grad_norm": 0.34972641119436265, "learning_rate": 1.9571368421052634e-05, "loss": 0.2783, "mean_copy_accuracy": 0.996493324637413, "mean_gen_accuracy": 0.8771796971559525, "mean_token_accuracy": 0.9045829623937607, "num_tokens": 132731447.0, "sample_num_tokens": 8661.25, "step": 8992, "total_num_tokens": 132766092.0, "z_loss": 0.0004779878945555538 }, { "copy_logits_max": -4.188608646392822, "copy_logits_min": -687500032.0, "copy_num_tokens": 259.8125, "epoch": 1.8365585907582334, "gen_logits_max": 4.5998430252075195, "gen_logits_mean": -17.092151641845703, "gen_logits_min": -29.006805419921875, "gen_logits_std": 3.4375972747802734, "gen_loss": 0.2602478265762329, "grad_norm": 0.32009898508416185, "learning_rate": 1.9570105263157895e-05, "loss": 0.2612, "mean_copy_accuracy": 0.9973700940608978, "mean_gen_accuracy": 0.8812669813632965, "mean_token_accuracy": 0.9107342064380646, "num_tokens": 132999665.0, "sample_num_tokens": 7160.25, "step": 8993, "total_num_tokens": 133028306.0, "z_loss": 0.0003640407230705023 }, { "copy_logits_max": -1.5660769939422607, "copy_logits_min": -750000000.0, "copy_num_tokens": 386.125, "epoch": 1.8367628286954303, "gen_logits_max": 4.68772554397583, "gen_logits_mean": -16.149057388305664, "gen_logits_min": -28.575485229492188, "gen_logits_std": 3.420344352722168, "gen_loss": 0.28493157029151917, "grad_norm": 0.3424930675922651, "learning_rate": 1.956884210526316e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9973147809505463, "mean_gen_accuracy": 0.8745469152927399, "mean_token_accuracy": 0.9064065217971802, "num_tokens": 133274816.0, "sample_num_tokens": 8124.5, "step": 8994, "total_num_tokens": 133307314.0, "z_loss": 0.0003951139806304127 }, { "copy_logits_max": 1.910851240158081, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.0, "epoch": 1.836967066632627, "gen_logits_max": 6.112597942352295, "gen_logits_mean": -14.0411958694458, "gen_logits_min": -26.49129867553711, "gen_logits_std": 3.351757764816284, "gen_loss": 0.29729655385017395, "grad_norm": 0.3629561103388319, "learning_rate": 1.9567578947368423e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9973716884851456, "mean_gen_accuracy": 0.8717635869979858, "mean_token_accuracy": 0.903992548584938, "num_tokens": 133548009.0, "sample_num_tokens": 8140.25, "step": 8995, "total_num_tokens": 133580570.0, "z_loss": 0.0004357703437563032 }, { "copy_logits_max": -2.1167783737182617, "copy_logits_min": -750000064.0, "copy_num_tokens": 420.0, "epoch": 1.8371713045698237, "gen_logits_max": 4.575146675109863, "gen_logits_mean": -16.390453338623047, "gen_logits_min": -28.468379974365234, "gen_logits_std": 3.397573471069336, "gen_loss": 0.29270440340042114, "grad_norm": 0.3693002516738016, "learning_rate": 1.9566315789473684e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9969189018011093, "mean_gen_accuracy": 0.8745134472846985, "mean_token_accuracy": 0.905789315700531, "num_tokens": 133815689.0, "sample_num_tokens": 9228.25, "step": 8996, "total_num_tokens": 133852602.0, "z_loss": 0.00042769667925313115 }, { "copy_logits_max": -1.965389609336853, "copy_logits_min": -750000064.0, "copy_num_tokens": 483.625, "epoch": 1.8373755425070208, "gen_logits_max": 5.305061340332031, "gen_logits_mean": -16.171106338500977, "gen_logits_min": -28.389020919799805, "gen_logits_std": 3.427903652191162, "gen_loss": 0.24870403110980988, "grad_norm": 0.3788616375390705, "learning_rate": 1.956505263157895e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9965922683477402, "mean_gen_accuracy": 0.875387042760849, "mean_token_accuracy": 0.9038182944059372, "num_tokens": 134076060.0, "sample_num_tokens": 8703.0, "step": 8997, "total_num_tokens": 134110872.0, "z_loss": 0.00038008118281140924 }, { "copy_logits_max": -3.1000852584838867, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.5625, "epoch": 1.8375797804442175, "gen_logits_max": 3.8169898986816406, "gen_logits_mean": -18.344438552856445, "gen_logits_min": -30.757797241210938, "gen_logits_std": 3.503591775894165, "gen_loss": 0.2974433898925781, "grad_norm": 0.3804664544449858, "learning_rate": 1.956378947368421e-05, "loss": 0.2884, "mean_copy_accuracy": 0.99661023914814, "mean_gen_accuracy": 0.8734346181154251, "mean_token_accuracy": 0.9037933498620987, "num_tokens": 134350898.0, "sample_num_tokens": 9827.5, "step": 8998, "total_num_tokens": 134390208.0, "z_loss": 0.000515423365868628 }, { "copy_logits_max": -1.8305708169937134, "copy_logits_min": -750000064.0, "copy_num_tokens": 350.125, "epoch": 1.8377840183814143, "gen_logits_max": 4.719144344329834, "gen_logits_mean": -16.743806838989258, "gen_logits_min": -29.00320816040039, "gen_logits_std": 3.4263548851013184, "gen_loss": 0.29351943731307983, "grad_norm": 0.35828156218551477, "learning_rate": 1.9562526315789474e-05, "loss": 0.2879, "mean_copy_accuracy": 0.9954899102449417, "mean_gen_accuracy": 0.8750161379575729, "mean_token_accuracy": 0.9005652368068695, "num_tokens": 134627686.0, "sample_num_tokens": 8119.0, "step": 8999, "total_num_tokens": 134660162.0, "z_loss": 0.0004733623645734042 }, { "epoch": 1.8379882563186112, "grad_norm": 0.33438994076748, "learning_rate": 1.9561263157894738e-05, "loss": 0.2647, "step": 9000 }, { "epoch": 1.8379882563186112, "eval_copy_logits_max": -8.316831588745117, "eval_copy_logits_min": -85.9754638671875, "eval_gen_logits_max": 2.2642018795013428, "eval_gen_logits_mean": -22.61113929748535, "eval_gen_logits_min": -33.93251419067383, "eval_gen_logits_std": 3.5931525230407715, "eval_gen_loss": 0.30778831243515015, "eval_loss": 0.27370426058769226, "eval_mean_copy_accuracy": 0.9902215301990509, "eval_mean_gen_accuracy": 0.8877348601818085, "eval_mean_token_accuracy": 0.9011447131633759, "eval_num_tokens": 134939796.0, "eval_runtime": 0.7856, "eval_samples_per_second": 10.184, "eval_steps_per_second": 2.546, "eval_total_num_tokens": 134939796.0, "eval_z_loss": 0.0004330329829826951, "step": 9000 }, { "copy_logits_max": -1.0647921562194824, "copy_logits_min": -687500032.0, "copy_num_tokens": 522.625, "epoch": 1.838192494255808, "gen_logits_max": 5.689455986022949, "gen_logits_mean": -14.115550994873047, "gen_logits_min": -27.221052169799805, "gen_logits_std": 3.3547110557556152, "gen_loss": 0.2593356966972351, "grad_norm": 0.3387104417751532, "learning_rate": 1.9560000000000002e-05, "loss": 0.2791, "mean_copy_accuracy": 0.9971092715859413, "mean_gen_accuracy": 0.8784816563129425, "mean_token_accuracy": 0.907987654209137, "num_tokens": 135171102.0, "sample_num_tokens": 8740.0, "step": 9001, "total_num_tokens": 135206062.0, "z_loss": 0.0004395396390464157 }, { "copy_logits_max": 0.0680963397026062, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.25, "epoch": 1.8383967321930048, "gen_logits_max": 5.534369468688965, "gen_logits_mean": -14.681156158447266, "gen_logits_min": -27.305355072021484, "gen_logits_std": 3.392010450363159, "gen_loss": 0.25862830877304077, "grad_norm": 0.3264167678300539, "learning_rate": 1.9558736842105263e-05, "loss": 0.2614, "mean_copy_accuracy": 0.9975680708885193, "mean_gen_accuracy": 0.8782186955213547, "mean_token_accuracy": 0.9114937633275986, "num_tokens": 135472984.0, "sample_num_tokens": 8722.0, "step": 9002, "total_num_tokens": 135507872.0, "z_loss": 0.00044201186392456293 }, { "copy_logits_max": -2.1387693881988525, "copy_logits_min": -687500032.0, "copy_num_tokens": 487.25, "epoch": 1.8386009701302017, "gen_logits_max": 4.199073791503906, "gen_logits_mean": -16.822280883789062, "gen_logits_min": -29.575939178466797, "gen_logits_std": 3.4815268516540527, "gen_loss": 0.25004842877388, "grad_norm": 0.3297015949641728, "learning_rate": 1.9557473684210528e-05, "loss": 0.2548, "mean_copy_accuracy": 0.9969732910394669, "mean_gen_accuracy": 0.8855919390916824, "mean_token_accuracy": 0.9130119681358337, "num_tokens": 135740516.0, "sample_num_tokens": 8129.0, "step": 9003, "total_num_tokens": 135773032.0, "z_loss": 0.0004264064191374928 }, { "copy_logits_max": -0.26089659333229065, "copy_logits_min": -750000000.0, "copy_num_tokens": 711.3125, "epoch": 1.8388052080673987, "gen_logits_max": 4.463630676269531, "gen_logits_mean": -15.48089599609375, "gen_logits_min": -28.370220184326172, "gen_logits_std": 3.416292190551758, "gen_loss": 0.23352406919002533, "grad_norm": 0.35711150817624376, "learning_rate": 1.955621052631579e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9972881972789764, "mean_gen_accuracy": 0.8797340989112854, "mean_token_accuracy": 0.9115564972162247, "num_tokens": 136008747.0, "sample_num_tokens": 10510.25, "step": 9004, "total_num_tokens": 136050788.0, "z_loss": 0.0004650480404961854 }, { "copy_logits_max": -0.038884758949279785, "copy_logits_min": -687500032.0, "copy_num_tokens": 592.1875, "epoch": 1.8390094460045954, "gen_logits_max": 4.755252361297607, "gen_logits_mean": -14.981992721557617, "gen_logits_min": -27.878921508789062, "gen_logits_std": 3.4237523078918457, "gen_loss": 0.26532456278800964, "grad_norm": 0.32430889116637146, "learning_rate": 1.9554947368421053e-05, "loss": 0.2551, "mean_copy_accuracy": 0.9974308162927628, "mean_gen_accuracy": 0.8815484791994095, "mean_token_accuracy": 0.9124254584312439, "num_tokens": 136287285.0, "sample_num_tokens": 9498.25, "step": 9005, "total_num_tokens": 136325278.0, "z_loss": 0.0004309324431233108 }, { "copy_logits_max": -3.653014659881592, "copy_logits_min": -750000000.0, "copy_num_tokens": 334.5625, "epoch": 1.839213683941792, "gen_logits_max": 6.004395484924316, "gen_logits_mean": -14.548965454101562, "gen_logits_min": -27.128904342651367, "gen_logits_std": 3.332828998565674, "gen_loss": 0.26236748695373535, "grad_norm": 0.33824341310038747, "learning_rate": 1.9553684210526314e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9965595453977585, "mean_gen_accuracy": 0.8846558332443237, "mean_token_accuracy": 0.9088586419820786, "num_tokens": 136565164.0, "sample_num_tokens": 8252.5, "step": 9006, "total_num_tokens": 136598174.0, "z_loss": 0.00040964054642245173 }, { "copy_logits_max": -3.729688882827759, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.875, "epoch": 1.839417921878989, "gen_logits_max": 5.305419921875, "gen_logits_mean": -15.977802276611328, "gen_logits_min": -28.355926513671875, "gen_logits_std": 3.438018321990967, "gen_loss": 0.2723519206047058, "grad_norm": 0.3381799413400418, "learning_rate": 1.9552421052631578e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9971581399440765, "mean_gen_accuracy": 0.8812072426080704, "mean_token_accuracy": 0.9092655330896378, "num_tokens": 136824673.0, "sample_num_tokens": 8175.75, "step": 9007, "total_num_tokens": 136857376.0, "z_loss": 0.0004532751045189798 }, { "copy_logits_max": 1.435240387916565, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.125, "epoch": 1.839622159816186, "gen_logits_max": 6.029984474182129, "gen_logits_mean": -14.182196617126465, "gen_logits_min": -27.062374114990234, "gen_logits_std": 3.37495756149292, "gen_loss": 0.27723443508148193, "grad_norm": 0.326043362821738, "learning_rate": 1.9551157894736842e-05, "loss": 0.2496, "mean_copy_accuracy": 0.9970614165067673, "mean_gen_accuracy": 0.8842144161462784, "mean_token_accuracy": 0.9141454547643661, "num_tokens": 137110873.0, "sample_num_tokens": 8932.25, "step": 9008, "total_num_tokens": 137146602.0, "z_loss": 0.00043116818415001035 }, { "copy_logits_max": -7.193532943725586, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.375, "epoch": 1.8398263977533826, "gen_logits_max": 4.411776542663574, "gen_logits_mean": -17.837642669677734, "gen_logits_min": -29.906606674194336, "gen_logits_std": 3.4764723777770996, "gen_loss": 0.27947527170181274, "grad_norm": 0.38255930920420667, "learning_rate": 1.9549894736842107e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9962943494319916, "mean_gen_accuracy": 0.8779766410589218, "mean_token_accuracy": 0.9017942398786545, "num_tokens": 137355665.0, "sample_num_tokens": 7145.75, "step": 9009, "total_num_tokens": 137384248.0, "z_loss": 0.00046867362107150257 }, { "copy_logits_max": 1.4262902736663818, "copy_logits_min": -750000064.0, "copy_num_tokens": 637.125, "epoch": 1.8400306356905796, "gen_logits_max": 5.949504852294922, "gen_logits_mean": -13.850448608398438, "gen_logits_min": -27.17597007751465, "gen_logits_std": 3.408419609069824, "gen_loss": 0.21299311518669128, "grad_norm": 0.3281065262915774, "learning_rate": 1.954863157894737e-05, "loss": 0.2571, "mean_copy_accuracy": 0.9979914277791977, "mean_gen_accuracy": 0.8777292817831039, "mean_token_accuracy": 0.9116346389055252, "num_tokens": 137637692.0, "sample_num_tokens": 9853.0, "step": 9010, "total_num_tokens": 137677104.0, "z_loss": 0.00037363136652857065 }, { "copy_logits_max": -4.826462745666504, "copy_logits_min": -750000000.0, "copy_num_tokens": 625.9375, "epoch": 1.8402348736277765, "gen_logits_max": 4.104055881500244, "gen_logits_mean": -15.946582794189453, "gen_logits_min": -28.576492309570312, "gen_logits_std": 3.4780092239379883, "gen_loss": 0.24622073769569397, "grad_norm": 0.35198940004824514, "learning_rate": 1.9547368421052632e-05, "loss": 0.2758, "mean_copy_accuracy": 0.9976827204227448, "mean_gen_accuracy": 0.877156063914299, "mean_token_accuracy": 0.9053292125463486, "num_tokens": 137901000.0, "sample_num_tokens": 9259.0, "step": 9011, "total_num_tokens": 137938036.0, "z_loss": 0.00042210129322484136 }, { "copy_logits_max": -2.2518091201782227, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.6875, "epoch": 1.8404391115649732, "gen_logits_max": 3.975058078765869, "gen_logits_mean": -16.582181930541992, "gen_logits_min": -29.066043853759766, "gen_logits_std": 3.455277919769287, "gen_loss": 0.2974078357219696, "grad_norm": 0.3476683934158161, "learning_rate": 1.9546105263157896e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9964576959609985, "mean_gen_accuracy": 0.8734050840139389, "mean_token_accuracy": 0.9045957177877426, "num_tokens": 138170369.0, "sample_num_tokens": 8406.25, "step": 9012, "total_num_tokens": 138203994.0, "z_loss": 0.0005108023760840297 }, { "copy_logits_max": -1.071842908859253, "copy_logits_min": -687500032.0, "copy_num_tokens": 621.625, "epoch": 1.84064334950217, "gen_logits_max": 5.254031181335449, "gen_logits_mean": -12.99870491027832, "gen_logits_min": -26.06724739074707, "gen_logits_std": 3.3668789863586426, "gen_loss": 0.2512904703617096, "grad_norm": 0.36351125912483295, "learning_rate": 1.9544842105263157e-05, "loss": 0.27, "mean_copy_accuracy": 0.9966401755809784, "mean_gen_accuracy": 0.8802505135536194, "mean_token_accuracy": 0.9099073261022568, "num_tokens": 138434468.0, "sample_num_tokens": 8410.5, "step": 9013, "total_num_tokens": 138468110.0, "z_loss": 0.00041691737715154886 }, { "copy_logits_max": -5.628401756286621, "copy_logits_min": -750000064.0, "copy_num_tokens": 393.9375, "epoch": 1.8408475874393668, "gen_logits_max": 5.396744728088379, "gen_logits_mean": -15.653874397277832, "gen_logits_min": -28.08875274658203, "gen_logits_std": 3.3945093154907227, "gen_loss": 0.29374849796295166, "grad_norm": 0.36304496170841266, "learning_rate": 1.954357894736842e-05, "loss": 0.2809, "mean_copy_accuracy": 0.9964274913072586, "mean_gen_accuracy": 0.8779335767030716, "mean_token_accuracy": 0.9039388447999954, "num_tokens": 138696279.0, "sample_num_tokens": 7850.75, "step": 9014, "total_num_tokens": 138727682.0, "z_loss": 0.0004962229286320508 }, { "copy_logits_max": -5.231967926025391, "copy_logits_min": -687500032.0, "copy_num_tokens": 706.625, "epoch": 1.8410518253765638, "gen_logits_max": 3.2312021255493164, "gen_logits_mean": -17.501026153564453, "gen_logits_min": -30.32415008544922, "gen_logits_std": 3.5097837448120117, "gen_loss": 0.2692638635635376, "grad_norm": 0.34082361055368754, "learning_rate": 1.9542315789473682e-05, "loss": 0.2803, "mean_copy_accuracy": 0.9972241222858429, "mean_gen_accuracy": 0.8735691159963608, "mean_token_accuracy": 0.9062415957450867, "num_tokens": 138974188.0, "sample_num_tokens": 10285.5, "step": 9015, "total_num_tokens": 139015330.0, "z_loss": 0.00043661327799782157 }, { "copy_logits_max": -3.1078176498413086, "copy_logits_min": -750000064.0, "copy_num_tokens": 456.375, "epoch": 1.8412560633137605, "gen_logits_max": 4.139901161193848, "gen_logits_mean": -16.27979278564453, "gen_logits_min": -28.994686126708984, "gen_logits_std": 3.47285795211792, "gen_loss": 0.27057433128356934, "grad_norm": 0.3598670026128086, "learning_rate": 1.9541052631578947e-05, "loss": 0.2744, "mean_copy_accuracy": 0.997035801410675, "mean_gen_accuracy": 0.8785791248083115, "mean_token_accuracy": 0.9073686450719833, "num_tokens": 139249805.0, "sample_num_tokens": 8205.25, "step": 9016, "total_num_tokens": 139282626.0, "z_loss": 0.00042971380753442645 }, { "copy_logits_max": -1.4817242622375488, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.1875, "epoch": 1.8414603012509574, "gen_logits_max": 3.971987247467041, "gen_logits_mean": -16.379535675048828, "gen_logits_min": -28.558170318603516, "gen_logits_std": 3.4729137420654297, "gen_loss": 0.2635824680328369, "grad_norm": 0.3478852332965277, "learning_rate": 1.953978947368421e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9970077574253082, "mean_gen_accuracy": 0.8771650195121765, "mean_token_accuracy": 0.9040576070547104, "num_tokens": 139522601.0, "sample_num_tokens": 8086.25, "step": 9017, "total_num_tokens": 139554946.0, "z_loss": 0.00041588643216528 }, { "copy_logits_max": -4.937314987182617, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.0, "epoch": 1.8416645391881543, "gen_logits_max": 3.7771377563476562, "gen_logits_mean": -17.259458541870117, "gen_logits_min": -29.69045066833496, "gen_logits_std": 3.501046895980835, "gen_loss": 0.29255062341690063, "grad_norm": 0.3619237161583064, "learning_rate": 1.9538526315789475e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9947926849126816, "mean_gen_accuracy": 0.8760816603899002, "mean_token_accuracy": 0.9005240797996521, "num_tokens": 139774721.0, "sample_num_tokens": 8153.25, "step": 9018, "total_num_tokens": 139807334.0, "z_loss": 0.000472600368084386 }, { "copy_logits_max": -3.8423519134521484, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.4375, "epoch": 1.841868777125351, "gen_logits_max": 4.881227493286133, "gen_logits_mean": -16.710060119628906, "gen_logits_min": -28.64570426940918, "gen_logits_std": 3.507368564605713, "gen_loss": 0.25601398944854736, "grad_norm": 0.3274687970229905, "learning_rate": 1.9537263157894736e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9968182593584061, "mean_gen_accuracy": 0.8780430257320404, "mean_token_accuracy": 0.9064561128616333, "num_tokens": 140063246.0, "sample_num_tokens": 9161.5, "step": 9019, "total_num_tokens": 140099892.0, "z_loss": 0.0003767950111068785 }, { "copy_logits_max": -3.942354202270508, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.25, "epoch": 1.8420730150625477, "gen_logits_max": 3.666858673095703, "gen_logits_mean": -18.147581100463867, "gen_logits_min": -30.312240600585938, "gen_logits_std": 3.5585086345672607, "gen_loss": 0.26175177097320557, "grad_norm": 0.3502820185202179, "learning_rate": 1.9536e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9965672791004181, "mean_gen_accuracy": 0.8749389052391052, "mean_token_accuracy": 0.9054630100727081, "num_tokens": 140343819.0, "sample_num_tokens": 7352.75, "step": 9020, "total_num_tokens": 140373230.0, "z_loss": 0.0003681264352053404 }, { "copy_logits_max": 0.5659313797950745, "copy_logits_min": -687500032.0, "copy_num_tokens": 525.8125, "epoch": 1.8422772529997447, "gen_logits_max": 5.506143093109131, "gen_logits_mean": -14.782052993774414, "gen_logits_min": -27.33544158935547, "gen_logits_std": 3.4450035095214844, "gen_loss": 0.2727507948875427, "grad_norm": 0.30207186718885043, "learning_rate": 1.9534736842105265e-05, "loss": 0.2509, "mean_copy_accuracy": 0.9980522245168686, "mean_gen_accuracy": 0.8816723376512527, "mean_token_accuracy": 0.9153958708047867, "num_tokens": 140636697.0, "sample_num_tokens": 8869.25, "step": 9021, "total_num_tokens": 140672174.0, "z_loss": 0.0004209612379781902 }, { "copy_logits_max": -0.0036885440349578857, "copy_logits_min": -687500032.0, "copy_num_tokens": 581.5, "epoch": 1.8424814909369416, "gen_logits_max": 4.295558929443359, "gen_logits_mean": -15.756844520568848, "gen_logits_min": -28.281211853027344, "gen_logits_std": 3.491647481918335, "gen_loss": 0.27072739601135254, "grad_norm": 0.33693360759614677, "learning_rate": 1.9533473684210526e-05, "loss": 0.2634, "mean_copy_accuracy": 0.9977234750986099, "mean_gen_accuracy": 0.8769198656082153, "mean_token_accuracy": 0.9096469879150391, "num_tokens": 140906621.0, "sample_num_tokens": 8915.25, "step": 9022, "total_num_tokens": 140942282.0, "z_loss": 0.00043184927199035883 }, { "copy_logits_max": -3.798673629760742, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.5, "epoch": 1.8426857288741383, "gen_logits_max": 5.26878547668457, "gen_logits_mean": -14.942998886108398, "gen_logits_min": -27.967281341552734, "gen_logits_std": 3.474774122238159, "gen_loss": 0.227088063955307, "grad_norm": 0.3326500846765486, "learning_rate": 1.953221052631579e-05, "loss": 0.2641, "mean_copy_accuracy": 0.996643528342247, "mean_gen_accuracy": 0.8859583288431168, "mean_token_accuracy": 0.9113850146532059, "num_tokens": 141193172.0, "sample_num_tokens": 9113.5, "step": 9023, "total_num_tokens": 141229626.0, "z_loss": 0.0003146198287140578 }, { "copy_logits_max": -1.3702579736709595, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.125, "epoch": 1.8428899668113352, "gen_logits_max": 7.188754558563232, "gen_logits_mean": -13.430276870727539, "gen_logits_min": -26.72553253173828, "gen_logits_std": 3.403282403945923, "gen_loss": 0.26641643047332764, "grad_norm": 0.3247922872215169, "learning_rate": 1.953094736842105e-05, "loss": 0.268, "mean_copy_accuracy": 0.9968982487916946, "mean_gen_accuracy": 0.880389854311943, "mean_token_accuracy": 0.9092332869768143, "num_tokens": 141470536.0, "sample_num_tokens": 8820.0, "step": 9024, "total_num_tokens": 141505816.0, "z_loss": 0.00038652215152978897 }, { "copy_logits_max": -2.3215527534484863, "copy_logits_min": -750000064.0, "copy_num_tokens": 431.5625, "epoch": 1.8430942047485321, "gen_logits_max": 5.117489814758301, "gen_logits_mean": -15.201175689697266, "gen_logits_min": -28.26946258544922, "gen_logits_std": 3.4646406173706055, "gen_loss": 0.23560062050819397, "grad_norm": 0.36279494741972085, "learning_rate": 1.952968421052632e-05, "loss": 0.2646, "mean_copy_accuracy": 0.9973844885826111, "mean_gen_accuracy": 0.883495882153511, "mean_token_accuracy": 0.9099719971418381, "num_tokens": 141741873.0, "sample_num_tokens": 7963.25, "step": 9025, "total_num_tokens": 141773726.0, "z_loss": 0.0003517496807035059 }, { "copy_logits_max": -1.7294085025787354, "copy_logits_min": -750000064.0, "copy_num_tokens": 310.3125, "epoch": 1.8432984426857288, "gen_logits_max": 5.926203727722168, "gen_logits_mean": -15.153287887573242, "gen_logits_min": -27.892738342285156, "gen_logits_std": 3.4503493309020996, "gen_loss": 0.29098355770111084, "grad_norm": 0.36678315819177826, "learning_rate": 1.952842105263158e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9962366968393326, "mean_gen_accuracy": 0.8841709941625595, "mean_token_accuracy": 0.9083302468061447, "num_tokens": 141990351.0, "sample_num_tokens": 6379.25, "step": 9026, "total_num_tokens": 142015868.0, "z_loss": 0.00042427307926118374 }, { "copy_logits_max": -1.4960533380508423, "copy_logits_min": -750000000.0, "copy_num_tokens": 762.875, "epoch": 1.8435026806229255, "gen_logits_max": 4.1325883865356445, "gen_logits_mean": -16.043407440185547, "gen_logits_min": -29.426706314086914, "gen_logits_std": 3.5134124755859375, "gen_loss": 0.2495584487915039, "grad_norm": 0.3362378926895186, "learning_rate": 1.9527157894736844e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9980423748493195, "mean_gen_accuracy": 0.8734813779592514, "mean_token_accuracy": 0.907172828912735, "num_tokens": 142247184.0, "sample_num_tokens": 9591.5, "step": 9027, "total_num_tokens": 142285550.0, "z_loss": 0.00038813884020783007 }, { "copy_logits_max": -3.0295562744140625, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.3125, "epoch": 1.8437069185601227, "gen_logits_max": 5.069235801696777, "gen_logits_mean": -16.05086898803711, "gen_logits_min": -28.526935577392578, "gen_logits_std": 3.484811544418335, "gen_loss": 0.2607371211051941, "grad_norm": 0.3481787946930216, "learning_rate": 1.9525894736842105e-05, "loss": 0.2701, "mean_copy_accuracy": 0.997131422162056, "mean_gen_accuracy": 0.8824677169322968, "mean_token_accuracy": 0.9074497520923615, "num_tokens": 142511097.0, "sample_num_tokens": 8194.75, "step": 9028, "total_num_tokens": 142543876.0, "z_loss": 0.00040389932109974325 }, { "copy_logits_max": -2.1286706924438477, "copy_logits_min": -687500032.0, "copy_num_tokens": 590.125, "epoch": 1.8439111564973194, "gen_logits_max": 3.135040283203125, "gen_logits_mean": -18.145549774169922, "gen_logits_min": -30.44240379333496, "gen_logits_std": 3.542273998260498, "gen_loss": 0.27770185470581055, "grad_norm": 0.3654662920947553, "learning_rate": 1.952463157894737e-05, "loss": 0.284, "mean_copy_accuracy": 0.99732606112957, "mean_gen_accuracy": 0.8743591159582138, "mean_token_accuracy": 0.9030667394399643, "num_tokens": 142778316.0, "sample_num_tokens": 9505.5, "step": 9029, "total_num_tokens": 142816338.0, "z_loss": 0.0004392396949697286 }, { "copy_logits_max": -1.9752627611160278, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.4375, "epoch": 1.844115394434516, "gen_logits_max": 3.096968173980713, "gen_logits_mean": -17.743783950805664, "gen_logits_min": -30.386428833007812, "gen_logits_std": 3.5547115802764893, "gen_loss": 0.26689696311950684, "grad_norm": 0.3511126977684219, "learning_rate": 1.952336842105263e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9981638640165329, "mean_gen_accuracy": 0.8748942017555237, "mean_token_accuracy": 0.908719927072525, "num_tokens": 143064216.0, "sample_num_tokens": 9595.0, "step": 9030, "total_num_tokens": 143102596.0, "z_loss": 0.0003949296078644693 }, { "copy_logits_max": -3.385730743408203, "copy_logits_min": -750000000.0, "copy_num_tokens": 240.4375, "epoch": 1.844319632371713, "gen_logits_max": 6.283461570739746, "gen_logits_mean": -14.912460327148438, "gen_logits_min": -27.836566925048828, "gen_logits_std": 3.3955299854278564, "gen_loss": 0.2903308570384979, "grad_norm": 0.3720526679698178, "learning_rate": 1.9522105263157895e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9959532469511032, "mean_gen_accuracy": 0.8779664784669876, "mean_token_accuracy": 0.9017459452152252, "num_tokens": 143311116.0, "sample_num_tokens": 6623.5, "step": 9031, "total_num_tokens": 143337610.0, "z_loss": 0.00039705721428617835 }, { "copy_logits_max": -3.3804030418395996, "copy_logits_min": -750000000.0, "copy_num_tokens": 727.0625, "epoch": 1.84452387030891, "gen_logits_max": 4.118227005004883, "gen_logits_mean": -16.116313934326172, "gen_logits_min": -28.567129135131836, "gen_logits_std": 3.4829516410827637, "gen_loss": 0.23345059156417847, "grad_norm": 0.36564380039862443, "learning_rate": 1.9520842105263155e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9961278289556503, "mean_gen_accuracy": 0.8808663934469223, "mean_token_accuracy": 0.9101069271564484, "num_tokens": 143572949.0, "sample_num_tokens": 10234.25, "step": 9032, "total_num_tokens": 143613886.0, "z_loss": 0.00037363084265962243 }, { "copy_logits_max": -1.0851714611053467, "copy_logits_min": -687500032.0, "copy_num_tokens": 660.0, "epoch": 1.8447281082461067, "gen_logits_max": 3.8531572818756104, "gen_logits_mean": -16.68326759338379, "gen_logits_min": -29.819948196411133, "gen_logits_std": 3.5116260051727295, "gen_loss": 0.22306296229362488, "grad_norm": 0.3247122647203842, "learning_rate": 1.9519578947368423e-05, "loss": 0.2737, "mean_copy_accuracy": 0.997759073972702, "mean_gen_accuracy": 0.8743209689855576, "mean_token_accuracy": 0.9069866985082626, "num_tokens": 143875182.0, "sample_num_tokens": 10056.5, "step": 9033, "total_num_tokens": 143915408.0, "z_loss": 0.0003517011646181345 }, { "copy_logits_max": -1.6345127820968628, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.75, "epoch": 1.8449323461833036, "gen_logits_max": 3.25302791595459, "gen_logits_mean": -18.422306060791016, "gen_logits_min": -31.093271255493164, "gen_logits_std": 3.5876951217651367, "gen_loss": 0.22700054943561554, "grad_norm": 0.3300957490007973, "learning_rate": 1.9518315789473687e-05, "loss": 0.2622, "mean_copy_accuracy": 0.9962850660085678, "mean_gen_accuracy": 0.8835443556308746, "mean_token_accuracy": 0.9099390059709549, "num_tokens": 144133389.0, "sample_num_tokens": 7603.75, "step": 9034, "total_num_tokens": 144163804.0, "z_loss": 0.0003528497472871095 }, { "copy_logits_max": 2.229856252670288, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.125, "epoch": 1.8451365841205005, "gen_logits_max": 5.326464653015137, "gen_logits_mean": -14.119400024414062, "gen_logits_min": -26.999324798583984, "gen_logits_std": 3.463489055633545, "gen_loss": 0.1974753886461258, "grad_norm": 0.35298091402424064, "learning_rate": 1.951705263157895e-05, "loss": 0.277, "mean_copy_accuracy": 0.9971472769975662, "mean_gen_accuracy": 0.8729725033044815, "mean_token_accuracy": 0.9038116186857224, "num_tokens": 144401714.0, "sample_num_tokens": 9083.5, "step": 9035, "total_num_tokens": 144438048.0, "z_loss": 0.00032458570785820484 }, { "copy_logits_max": -1.7704695463180542, "copy_logits_min": -750000000.0, "copy_num_tokens": 268.0, "epoch": 1.8453408220576972, "gen_logits_max": 6.719954967498779, "gen_logits_mean": -12.804983139038086, "gen_logits_min": -25.218170166015625, "gen_logits_std": 3.317985773086548, "gen_loss": 0.3097905218601227, "grad_norm": 0.3620603149610525, "learning_rate": 1.9515789473684213e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9966005086898804, "mean_gen_accuracy": 0.8786113411188126, "mean_token_accuracy": 0.9084065705537796, "num_tokens": 144682824.0, "sample_num_tokens": 6417.5, "step": 9036, "total_num_tokens": 144708494.0, "z_loss": 0.0004977461067028344 }, { "copy_logits_max": -0.8754104375839233, "copy_logits_min": -750000000.0, "copy_num_tokens": 630.4375, "epoch": 1.845545059994894, "gen_logits_max": 4.96005392074585, "gen_logits_mean": -15.570404052734375, "gen_logits_min": -28.16618537902832, "gen_logits_std": 3.507317543029785, "gen_loss": 0.2235868275165558, "grad_norm": 0.35763773712740327, "learning_rate": 1.9514526315789474e-05, "loss": 0.2579, "mean_copy_accuracy": 0.9978211671113968, "mean_gen_accuracy": 0.8809806406497955, "mean_token_accuracy": 0.9140467047691345, "num_tokens": 144940090.0, "sample_num_tokens": 8805.0, "step": 9037, "total_num_tokens": 144975310.0, "z_loss": 0.00036036723759025335 }, { "copy_logits_max": -1.2197595834732056, "copy_logits_min": -687500032.0, "copy_num_tokens": 383.375, "epoch": 1.8457492979320909, "gen_logits_max": 4.774909496307373, "gen_logits_mean": -16.18805694580078, "gen_logits_min": -28.509883880615234, "gen_logits_std": 3.4155588150024414, "gen_loss": 0.3097343444824219, "grad_norm": 0.35901048880973746, "learning_rate": 1.9513263157894738e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9975096583366394, "mean_gen_accuracy": 0.875823900103569, "mean_token_accuracy": 0.9078792482614517, "num_tokens": 145221571.0, "sample_num_tokens": 7545.75, "step": 9038, "total_num_tokens": 145251754.0, "z_loss": 0.0004962493549101055 }, { "copy_logits_max": 1.4190642833709717, "copy_logits_min": -687500032.0, "copy_num_tokens": 414.875, "epoch": 1.8459535358692878, "gen_logits_max": 5.143380641937256, "gen_logits_mean": -15.381824493408203, "gen_logits_min": -27.852991104125977, "gen_logits_std": 3.4349727630615234, "gen_loss": 0.2415778636932373, "grad_norm": 0.35548393613230506, "learning_rate": 1.9512e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9971367418766022, "mean_gen_accuracy": 0.8786772787570953, "mean_token_accuracy": 0.9059707671403885, "num_tokens": 145500767.0, "sample_num_tokens": 7674.25, "step": 9039, "total_num_tokens": 145531464.0, "z_loss": 0.0004204516008030623 }, { "copy_logits_max": -0.6416553854942322, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.4375, "epoch": 1.8461577738064845, "gen_logits_max": 4.287023544311523, "gen_logits_mean": -16.958406448364258, "gen_logits_min": -29.104320526123047, "gen_logits_std": 3.474561929702759, "gen_loss": 0.267721951007843, "grad_norm": 0.33454367868577295, "learning_rate": 1.9510736842105263e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9975266307592392, "mean_gen_accuracy": 0.8832037150859833, "mean_token_accuracy": 0.9102791845798492, "num_tokens": 145768187.0, "sample_num_tokens": 9228.75, "step": 9040, "total_num_tokens": 145805102.0, "z_loss": 0.0004163557314313948 }, { "copy_logits_max": -1.016603946685791, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.0, "epoch": 1.8463620117436814, "gen_logits_max": 5.974066734313965, "gen_logits_mean": -13.835190773010254, "gen_logits_min": -26.089305877685547, "gen_logits_std": 3.3586485385894775, "gen_loss": 0.2952690124511719, "grad_norm": 0.37245377727324774, "learning_rate": 1.9509473684210527e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9972928613424301, "mean_gen_accuracy": 0.8743542581796646, "mean_token_accuracy": 0.9043433368206024, "num_tokens": 146035076.0, "sample_num_tokens": 8200.0, "step": 9041, "total_num_tokens": 146067876.0, "z_loss": 0.0004437601601239294 }, { "copy_logits_max": 0.5193402171134949, "copy_logits_min": -750000000.0, "copy_num_tokens": 430.25, "epoch": 1.8465662496808783, "gen_logits_max": 4.496208190917969, "gen_logits_mean": -15.876133918762207, "gen_logits_min": -28.209314346313477, "gen_logits_std": 3.3868322372436523, "gen_loss": 0.29012107849121094, "grad_norm": 0.3283027695074211, "learning_rate": 1.9508210526315792e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9974882006645203, "mean_gen_accuracy": 0.8782852590084076, "mean_token_accuracy": 0.9063348472118378, "num_tokens": 146307401.0, "sample_num_tokens": 8466.75, "step": 9042, "total_num_tokens": 146341268.0, "z_loss": 0.00045205457718111575 }, { "copy_logits_max": -3.4501261711120605, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.875, "epoch": 1.846770487618075, "gen_logits_max": 3.652329683303833, "gen_logits_mean": -17.29367446899414, "gen_logits_min": -29.724567413330078, "gen_logits_std": 3.514327049255371, "gen_loss": 0.21743983030319214, "grad_norm": 0.33222959245199285, "learning_rate": 1.9506947368421053e-05, "loss": 0.2698, "mean_copy_accuracy": 0.9972606599330902, "mean_gen_accuracy": 0.8779632300138474, "mean_token_accuracy": 0.9078210145235062, "num_tokens": 146576494.0, "sample_num_tokens": 7890.5, "step": 9043, "total_num_tokens": 146608056.0, "z_loss": 0.00034644227707758546 }, { "copy_logits_max": 1.0076569318771362, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.875, "epoch": 1.8469747255552718, "gen_logits_max": 4.654943466186523, "gen_logits_mean": -15.312556266784668, "gen_logits_min": -27.657730102539062, "gen_logits_std": 3.4125254154205322, "gen_loss": 0.2578347325325012, "grad_norm": 0.36280745366879713, "learning_rate": 1.9505684210526317e-05, "loss": 0.279, "mean_copy_accuracy": 0.9974057674407959, "mean_gen_accuracy": 0.8743570297956467, "mean_token_accuracy": 0.9043185859918594, "num_tokens": 146833295.0, "sample_num_tokens": 9318.75, "step": 9044, "total_num_tokens": 146870570.0, "z_loss": 0.0003540765610523522 }, { "copy_logits_max": 0.10615246742963791, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.6875, "epoch": 1.8471789634924687, "gen_logits_max": 4.87712287902832, "gen_logits_mean": -15.859935760498047, "gen_logits_min": -27.96898078918457, "gen_logits_std": 3.4409842491149902, "gen_loss": 0.2651273310184479, "grad_norm": 0.29972890344127556, "learning_rate": 1.9504421052631578e-05, "loss": 0.2526, "mean_copy_accuracy": 0.9978949874639511, "mean_gen_accuracy": 0.8823188990354538, "mean_token_accuracy": 0.9148203581571579, "num_tokens": 147135805.0, "sample_num_tokens": 9541.75, "step": 9045, "total_num_tokens": 147173972.0, "z_loss": 0.0003832986403722316 }, { "copy_logits_max": -1.0449292659759521, "copy_logits_min": -750000064.0, "copy_num_tokens": 320.625, "epoch": 1.8473832014296656, "gen_logits_max": 5.698525428771973, "gen_logits_mean": -15.05008602142334, "gen_logits_min": -26.533226013183594, "gen_logits_std": 3.3741538524627686, "gen_loss": 0.2855958640575409, "grad_norm": 0.34462269889720243, "learning_rate": 1.9503157894736842e-05, "loss": 0.276, "mean_copy_accuracy": 0.9967664033174515, "mean_gen_accuracy": 0.8801236152648926, "mean_token_accuracy": 0.9039266854524612, "num_tokens": 147384351.0, "sample_num_tokens": 7214.25, "step": 9046, "total_num_tokens": 147413208.0, "z_loss": 0.00044965214328840375 }, { "copy_logits_max": -0.5898615121841431, "copy_logits_min": -687500032.0, "copy_num_tokens": 424.75, "epoch": 1.8475874393668623, "gen_logits_max": 4.377943992614746, "gen_logits_mean": -16.71784210205078, "gen_logits_min": -29.004669189453125, "gen_logits_std": 3.485851764678955, "gen_loss": 0.2846665680408478, "grad_norm": 0.3478686948555796, "learning_rate": 1.9501894736842103e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9982581287622452, "mean_gen_accuracy": 0.8810499608516693, "mean_token_accuracy": 0.9108857661485672, "num_tokens": 147665777.0, "sample_num_tokens": 7796.25, "step": 9047, "total_num_tokens": 147696962.0, "z_loss": 0.00045236237929202616 }, { "copy_logits_max": 2.413398265838623, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.375, "epoch": 1.8477916773040592, "gen_logits_max": 6.278901100158691, "gen_logits_mean": -13.375107765197754, "gen_logits_min": -26.301546096801758, "gen_logits_std": 3.3736572265625, "gen_loss": 0.3115794360637665, "grad_norm": 0.41066885420650956, "learning_rate": 1.9500631578947367e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9976762235164642, "mean_gen_accuracy": 0.8733475506305695, "mean_token_accuracy": 0.9071274399757385, "num_tokens": 147938184.0, "sample_num_tokens": 7907.0, "step": 9048, "total_num_tokens": 147969812.0, "z_loss": 0.0005245268112048507 }, { "copy_logits_max": -1.1202906370162964, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.125, "epoch": 1.8479959152412562, "gen_logits_max": 4.002750396728516, "gen_logits_mean": -16.968650817871094, "gen_logits_min": -28.875282287597656, "gen_logits_std": 3.4679698944091797, "gen_loss": 0.2875153422355652, "grad_norm": 0.35362893043401505, "learning_rate": 1.9499368421052635e-05, "loss": 0.277, "mean_copy_accuracy": 0.9964806437492371, "mean_gen_accuracy": 0.8749223202466965, "mean_token_accuracy": 0.906531423330307, "num_tokens": 148219311.0, "sample_num_tokens": 8211.75, "step": 9049, "total_num_tokens": 148252158.0, "z_loss": 0.00042998825665563345 }, { "copy_logits_max": 0.4604613184928894, "copy_logits_min": -750000064.0, "copy_num_tokens": 642.75, "epoch": 1.8482001531784529, "gen_logits_max": 4.897701740264893, "gen_logits_mean": -14.960573196411133, "gen_logits_min": -27.561237335205078, "gen_logits_std": 3.4291539192199707, "gen_loss": 0.2584591209888458, "grad_norm": 0.3843115024453193, "learning_rate": 1.9498105263157896e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9971545189619064, "mean_gen_accuracy": 0.8778387904167175, "mean_token_accuracy": 0.9092254787683487, "num_tokens": 148490867.0, "sample_num_tokens": 10165.75, "step": 9050, "total_num_tokens": 148531530.0, "z_loss": 0.0003746904549188912 }, { "copy_logits_max": -0.7690848112106323, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.6875, "epoch": 1.8484043911156496, "gen_logits_max": 4.83757209777832, "gen_logits_mean": -15.456640243530273, "gen_logits_min": -28.05408477783203, "gen_logits_std": 3.406893491744995, "gen_loss": 0.2910158634185791, "grad_norm": 0.3742035343997815, "learning_rate": 1.949684210526316e-05, "loss": 0.3038, "mean_copy_accuracy": 0.9969670474529266, "mean_gen_accuracy": 0.8688906729221344, "mean_token_accuracy": 0.8965386003255844, "num_tokens": 148763296.0, "sample_num_tokens": 7598.0, "step": 9051, "total_num_tokens": 148793688.0, "z_loss": 0.0004901797510683537 }, { "copy_logits_max": -1.4870736598968506, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.9375, "epoch": 1.8486086290528467, "gen_logits_max": 3.04697322845459, "gen_logits_mean": -17.43063735961914, "gen_logits_min": -29.985525131225586, "gen_logits_std": 3.495248794555664, "gen_loss": 0.25317203998565674, "grad_norm": 0.38744233292828545, "learning_rate": 1.949557894736842e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9977235645055771, "mean_gen_accuracy": 0.882302314043045, "mean_token_accuracy": 0.9109346717596054, "num_tokens": 149055571.0, "sample_num_tokens": 7941.75, "step": 9052, "total_num_tokens": 149087338.0, "z_loss": 0.00037487337249331176 }, { "copy_logits_max": 0.34965139627456665, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.625, "epoch": 1.8488128669900434, "gen_logits_max": 5.485909461975098, "gen_logits_mean": -14.054604530334473, "gen_logits_min": -26.232912063598633, "gen_logits_std": 3.3657565116882324, "gen_loss": 0.2723776698112488, "grad_norm": 0.34126495895353304, "learning_rate": 1.9494315789473686e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9975281208753586, "mean_gen_accuracy": 0.8839090913534164, "mean_token_accuracy": 0.911719411611557, "num_tokens": 149325001.0, "sample_num_tokens": 7901.25, "step": 9053, "total_num_tokens": 149356606.0, "z_loss": 0.0004057313781231642 }, { "copy_logits_max": 0.07924988865852356, "copy_logits_min": -750000064.0, "copy_num_tokens": 340.8125, "epoch": 1.8490171049272401, "gen_logits_max": 6.165409088134766, "gen_logits_mean": -13.52824592590332, "gen_logits_min": -25.354217529296875, "gen_logits_std": 3.345191717147827, "gen_loss": 0.2983745336532593, "grad_norm": 0.35994789007817773, "learning_rate": 1.9493052631578947e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9969555139541626, "mean_gen_accuracy": 0.8758476227521896, "mean_token_accuracy": 0.9037470072507858, "num_tokens": 149597090.0, "sample_num_tokens": 7817.5, "step": 9054, "total_num_tokens": 149628360.0, "z_loss": 0.0004559408116620034 }, { "copy_logits_max": -3.7570648193359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.3125, "epoch": 1.849221342864437, "gen_logits_max": 5.289566993713379, "gen_logits_mean": -15.409716606140137, "gen_logits_min": -27.709381103515625, "gen_logits_std": 3.4228529930114746, "gen_loss": 0.24608859419822693, "grad_norm": 0.3424330473241876, "learning_rate": 1.949178947368421e-05, "loss": 0.2672, "mean_copy_accuracy": 0.9973392933607101, "mean_gen_accuracy": 0.8840083628892899, "mean_token_accuracy": 0.9091654717922211, "num_tokens": 149860860.0, "sample_num_tokens": 9254.5, "step": 9055, "total_num_tokens": 149897878.0, "z_loss": 0.00041615235386416316 }, { "copy_logits_max": -3.142576217651367, "copy_logits_min": -625000064.0, "copy_num_tokens": 521.1875, "epoch": 1.849425580801634, "gen_logits_max": 3.7723848819732666, "gen_logits_mean": -17.540388107299805, "gen_logits_min": -29.706096649169922, "gen_logits_std": 3.500833034515381, "gen_loss": 0.28571733832359314, "grad_norm": 0.44932279116233154, "learning_rate": 1.9490526315789472e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9961686134338379, "mean_gen_accuracy": 0.8754794150590897, "mean_token_accuracy": 0.9049182236194611, "num_tokens": 150134314.0, "sample_num_tokens": 9130.5, "step": 9056, "total_num_tokens": 150170836.0, "z_loss": 0.00045418483205139637 }, { "copy_logits_max": -1.9875283241271973, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.75, "epoch": 1.8496298187388307, "gen_logits_max": 5.285907745361328, "gen_logits_mean": -14.851873397827148, "gen_logits_min": -26.936351776123047, "gen_logits_std": 3.3719279766082764, "gen_loss": 0.28344839811325073, "grad_norm": 0.34890860002819774, "learning_rate": 1.9489263157894736e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9971827864646912, "mean_gen_accuracy": 0.8753034025430679, "mean_token_accuracy": 0.9059742540121078, "num_tokens": 150399366.0, "sample_num_tokens": 8066.0, "step": 9057, "total_num_tokens": 150431630.0, "z_loss": 0.00044862276990897954 }, { "copy_logits_max": -0.13610881567001343, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.8125, "epoch": 1.8498340566760276, "gen_logits_max": 5.588115692138672, "gen_logits_mean": -15.182167053222656, "gen_logits_min": -27.587158203125, "gen_logits_std": 3.3986425399780273, "gen_loss": 0.249114990234375, "grad_norm": 0.33957950043387986, "learning_rate": 1.9488e-05, "loss": 0.2495, "mean_copy_accuracy": 0.9971981197595596, "mean_gen_accuracy": 0.887684240937233, "mean_token_accuracy": 0.9145495593547821, "num_tokens": 150685922.0, "sample_num_tokens": 7968.0, "step": 9058, "total_num_tokens": 150717794.0, "z_loss": 0.0004201158881187439 }, { "copy_logits_max": -2.411754608154297, "copy_logits_min": -687500032.0, "copy_num_tokens": 359.125, "epoch": 1.8500382946132246, "gen_logits_max": 3.944333076477051, "gen_logits_mean": -17.49843978881836, "gen_logits_min": -29.64931297302246, "gen_logits_std": 3.4524545669555664, "gen_loss": 0.3124363124370575, "grad_norm": 0.4050723267357293, "learning_rate": 1.9486736842105265e-05, "loss": 0.2876, "mean_copy_accuracy": 0.9956476241350174, "mean_gen_accuracy": 0.874676987528801, "mean_token_accuracy": 0.9015043824911118, "num_tokens": 150937846.0, "sample_num_tokens": 7598.5, "step": 9059, "total_num_tokens": 150968240.0, "z_loss": 0.0004834402643609792 }, { "copy_logits_max": -1.5710630416870117, "copy_logits_min": -687500032.0, "copy_num_tokens": 337.375, "epoch": 1.8502425325504213, "gen_logits_max": 5.387911319732666, "gen_logits_mean": -14.798990249633789, "gen_logits_min": -27.029329299926758, "gen_logits_std": 3.3533384799957275, "gen_loss": 0.31017810106277466, "grad_norm": 0.34600487518676715, "learning_rate": 1.9485473684210526e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9978573471307755, "mean_gen_accuracy": 0.8700515180826187, "mean_token_accuracy": 0.9009797126054764, "num_tokens": 151216047.0, "sample_num_tokens": 7469.25, "step": 9060, "total_num_tokens": 151245924.0, "z_loss": 0.00047565216664224863 }, { "copy_logits_max": -3.9826767444610596, "copy_logits_min": -687500032.0, "copy_num_tokens": 334.5, "epoch": 1.850446770487618, "gen_logits_max": 3.8282318115234375, "gen_logits_mean": -17.363513946533203, "gen_logits_min": -29.713476181030273, "gen_logits_std": 3.481210470199585, "gen_loss": 0.276938796043396, "grad_norm": 0.386420007185157, "learning_rate": 1.948421052631579e-05, "loss": 0.2846, "mean_copy_accuracy": 0.9959917962551117, "mean_gen_accuracy": 0.8760413080453873, "mean_token_accuracy": 0.9032005965709686, "num_tokens": 151486804.0, "sample_num_tokens": 6882.5, "step": 9061, "total_num_tokens": 151514334.0, "z_loss": 0.0004205218283459544 }, { "copy_logits_max": -4.410992622375488, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.375, "epoch": 1.850651008424815, "gen_logits_max": 5.072914123535156, "gen_logits_mean": -15.856035232543945, "gen_logits_min": -27.967552185058594, "gen_logits_std": 3.413595199584961, "gen_loss": 0.24934859573841095, "grad_norm": 0.3313600342799987, "learning_rate": 1.9482947368421054e-05, "loss": 0.2421, "mean_copy_accuracy": 0.9964855760335922, "mean_gen_accuracy": 0.8904758244752884, "mean_token_accuracy": 0.9175619781017303, "num_tokens": 151767520.0, "sample_num_tokens": 7629.0, "step": 9062, "total_num_tokens": 151798036.0, "z_loss": 0.0003806970489677042 }, { "copy_logits_max": -2.7177748680114746, "copy_logits_min": -687500032.0, "copy_num_tokens": 603.875, "epoch": 1.8508552463620118, "gen_logits_max": 4.076344013214111, "gen_logits_mean": -16.28963851928711, "gen_logits_min": -28.827999114990234, "gen_logits_std": 3.4599087238311768, "gen_loss": 0.2464299201965332, "grad_norm": 0.35494562629899973, "learning_rate": 1.9481684210526315e-05, "loss": 0.2761, "mean_copy_accuracy": 0.997771218419075, "mean_gen_accuracy": 0.8749824166297913, "mean_token_accuracy": 0.906512513756752, "num_tokens": 152031585.0, "sample_num_tokens": 9203.25, "step": 9063, "total_num_tokens": 152068398.0, "z_loss": 0.0003973369603045285 }, { "copy_logits_max": -2.9364213943481445, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.75, "epoch": 1.8510594842992085, "gen_logits_max": 3.9774837493896484, "gen_logits_mean": -16.828140258789062, "gen_logits_min": -29.150665283203125, "gen_logits_std": 3.470754861831665, "gen_loss": 0.2662133574485779, "grad_norm": 0.35167199193617094, "learning_rate": 1.948042105263158e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9970597326755524, "mean_gen_accuracy": 0.8826683759689331, "mean_token_accuracy": 0.9115674942731857, "num_tokens": 152299613.0, "sample_num_tokens": 8006.75, "step": 9064, "total_num_tokens": 152331640.0, "z_loss": 0.00044913444435223937 }, { "copy_logits_max": 1.3405369520187378, "copy_logits_min": -750000000.0, "copy_num_tokens": 562.0, "epoch": 1.8512637222364055, "gen_logits_max": 3.2993931770324707, "gen_logits_mean": -16.76047706604004, "gen_logits_min": -30.150970458984375, "gen_logits_std": 3.466474771499634, "gen_loss": 0.27220839262008667, "grad_norm": 0.34953638296228745, "learning_rate": 1.947915789473684e-05, "loss": 0.2579, "mean_copy_accuracy": 0.9967611134052277, "mean_gen_accuracy": 0.8817775249481201, "mean_token_accuracy": 0.9142698347568512, "num_tokens": 152574206.0, "sample_num_tokens": 8508.5, "step": 9065, "total_num_tokens": 152608240.0, "z_loss": 0.0004872347926720977 }, { "copy_logits_max": -2.740434169769287, "copy_logits_min": -750000128.0, "copy_num_tokens": 389.125, "epoch": 1.8514679601736024, "gen_logits_max": 3.332906484603882, "gen_logits_mean": -18.041507720947266, "gen_logits_min": -30.62641716003418, "gen_logits_std": 3.51664137840271, "gen_loss": 0.29508665204048157, "grad_norm": 0.40015844853826926, "learning_rate": 1.9477894736842108e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9967392086982727, "mean_gen_accuracy": 0.8790313750505447, "mean_token_accuracy": 0.9098632484674454, "num_tokens": 152857101.0, "sample_num_tokens": 7436.75, "step": 9066, "total_num_tokens": 152886848.0, "z_loss": 0.0004798143054358661 }, { "copy_logits_max": -2.8078720569610596, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.0625, "epoch": 1.851672198110799, "gen_logits_max": 4.126191139221191, "gen_logits_mean": -17.212833404541016, "gen_logits_min": -29.965438842773438, "gen_logits_std": 3.462167501449585, "gen_loss": 0.3197672963142395, "grad_norm": 0.3678703302991977, "learning_rate": 1.947663157894737e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9961617141962051, "mean_gen_accuracy": 0.8775599002838135, "mean_token_accuracy": 0.906475305557251, "num_tokens": 153121964.0, "sample_num_tokens": 8354.0, "step": 9067, "total_num_tokens": 153155380.0, "z_loss": 0.00046488133375532925 }, { "copy_logits_max": -2.3313684463500977, "copy_logits_min": -625000064.0, "copy_num_tokens": 713.75, "epoch": 1.8518764360479958, "gen_logits_max": 3.548616409301758, "gen_logits_mean": -16.93853187561035, "gen_logits_min": -29.41333770751953, "gen_logits_std": 3.459644317626953, "gen_loss": 0.28472214937210083, "grad_norm": 0.33954023298654834, "learning_rate": 1.9475368421052633e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9969277828931808, "mean_gen_accuracy": 0.8760924488306046, "mean_token_accuracy": 0.9078712314367294, "num_tokens": 153387578.0, "sample_num_tokens": 10834.0, "step": 9068, "total_num_tokens": 153430914.0, "z_loss": 0.0004412400012370199 }, { "copy_logits_max": -5.073395729064941, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.875, "epoch": 1.8520806739851927, "gen_logits_max": 3.655886650085449, "gen_logits_mean": -17.951087951660156, "gen_logits_min": -30.325786590576172, "gen_logits_std": 3.501032829284668, "gen_loss": 0.2973392903804779, "grad_norm": 0.3839525931280005, "learning_rate": 1.9474105263157894e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9962459951639175, "mean_gen_accuracy": 0.8756714761257172, "mean_token_accuracy": 0.9053440988063812, "num_tokens": 153671188.0, "sample_num_tokens": 6705.0, "step": 9069, "total_num_tokens": 153698008.0, "z_loss": 0.00046164626837708056 }, { "copy_logits_max": -6.136905670166016, "copy_logits_min": -687500032.0, "copy_num_tokens": 323.75, "epoch": 1.8522849119223896, "gen_logits_max": 4.014960765838623, "gen_logits_mean": -17.56035614013672, "gen_logits_min": -30.62441062927246, "gen_logits_std": 3.4819796085357666, "gen_loss": 0.28115811944007874, "grad_norm": 0.3721743412410465, "learning_rate": 1.947284210526316e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9967531263828278, "mean_gen_accuracy": 0.8845729678869247, "mean_token_accuracy": 0.9117496013641357, "num_tokens": 153920173.0, "sample_num_tokens": 7132.75, "step": 9070, "total_num_tokens": 153948704.0, "z_loss": 0.00041511119343340397 }, { "copy_logits_max": -5.3003668785095215, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.6875, "epoch": 1.8524891498595863, "gen_logits_max": 4.7497076988220215, "gen_logits_mean": -15.467315673828125, "gen_logits_min": -28.07975959777832, "gen_logits_std": 3.4284443855285645, "gen_loss": 0.25175604224205017, "grad_norm": 0.35296992991490705, "learning_rate": 1.947157894736842e-05, "loss": 0.257, "mean_copy_accuracy": 0.9960975497961044, "mean_gen_accuracy": 0.8837633728981018, "mean_token_accuracy": 0.9128606468439102, "num_tokens": 154186266.0, "sample_num_tokens": 8273.5, "step": 9071, "total_num_tokens": 154219360.0, "z_loss": 0.00037016283022239804 }, { "copy_logits_max": -7.1767168045043945, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.0, "epoch": 1.8526933877967833, "gen_logits_max": 3.872957706451416, "gen_logits_mean": -17.54296112060547, "gen_logits_min": -30.108356475830078, "gen_logits_std": 3.5070462226867676, "gen_loss": 0.2711523473262787, "grad_norm": 0.3782083105187619, "learning_rate": 1.9470315789473684e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9963158071041107, "mean_gen_accuracy": 0.8766096532344818, "mean_token_accuracy": 0.9064287543296814, "num_tokens": 154453878.0, "sample_num_tokens": 7715.5, "step": 9072, "total_num_tokens": 154484740.0, "z_loss": 0.00036088016349822283 }, { "copy_logits_max": -2.8458163738250732, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.5625, "epoch": 1.8528976257339802, "gen_logits_max": 6.2782721519470215, "gen_logits_mean": -13.20846939086914, "gen_logits_min": -25.99814796447754, "gen_logits_std": 3.32849383354187, "gen_loss": 0.27412617206573486, "grad_norm": 0.3432714595041647, "learning_rate": 1.9469052631578945e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9966020286083221, "mean_gen_accuracy": 0.8754419386386871, "mean_token_accuracy": 0.9049128592014313, "num_tokens": 154729850.0, "sample_num_tokens": 8582.5, "step": 9073, "total_num_tokens": 154764180.0, "z_loss": 0.00043284898856654763 }, { "copy_logits_max": -8.382269859313965, "copy_logits_min": -750000000.0, "copy_num_tokens": 266.5, "epoch": 1.853101863671177, "gen_logits_max": 3.9438745975494385, "gen_logits_mean": -18.908573150634766, "gen_logits_min": -31.270309448242188, "gen_logits_std": 3.5458712577819824, "gen_loss": 0.2718069553375244, "grad_norm": 0.38735380947192727, "learning_rate": 1.9467789473684212e-05, "loss": 0.2922, "mean_copy_accuracy": 0.9964057952165604, "mean_gen_accuracy": 0.8722244948148727, "mean_token_accuracy": 0.8991865366697311, "num_tokens": 154988498.0, "sample_num_tokens": 6963.5, "step": 9074, "total_num_tokens": 155016352.0, "z_loss": 0.0004230787744745612 }, { "copy_logits_max": -4.97835636138916, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.75, "epoch": 1.8533061016083736, "gen_logits_max": 4.162746429443359, "gen_logits_mean": -16.646486282348633, "gen_logits_min": -29.126237869262695, "gen_logits_std": 3.472548484802246, "gen_loss": 0.26422184705734253, "grad_norm": 0.3354782781633012, "learning_rate": 1.9466526315789477e-05, "loss": 0.2712, "mean_copy_accuracy": 0.997120127081871, "mean_gen_accuracy": 0.8778563141822815, "mean_token_accuracy": 0.9074862599372864, "num_tokens": 155262986.0, "sample_num_tokens": 8596.5, "step": 9075, "total_num_tokens": 155297372.0, "z_loss": 0.0004610922478605062 }, { "copy_logits_max": -4.950553894042969, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.875, "epoch": 1.8535103395455705, "gen_logits_max": 3.202237606048584, "gen_logits_mean": -17.986000061035156, "gen_logits_min": -30.373851776123047, "gen_logits_std": 3.5208139419555664, "gen_loss": 0.2797275185585022, "grad_norm": 0.3524938924544036, "learning_rate": 1.9465263157894738e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9974911063909531, "mean_gen_accuracy": 0.8720417320728302, "mean_token_accuracy": 0.9081350266933441, "num_tokens": 155550126.0, "sample_num_tokens": 8786.0, "step": 9076, "total_num_tokens": 155585270.0, "z_loss": 0.000482027797261253 }, { "copy_logits_max": -5.432937145233154, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.6875, "epoch": 1.8537145774827675, "gen_logits_max": 4.273261070251465, "gen_logits_mean": -16.479488372802734, "gen_logits_min": -28.913799285888672, "gen_logits_std": 3.4588916301727295, "gen_loss": 0.2688599228858948, "grad_norm": 0.3408190214594489, "learning_rate": 1.9464000000000002e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9969455599784851, "mean_gen_accuracy": 0.8769928365945816, "mean_token_accuracy": 0.9079141914844513, "num_tokens": 155822775.0, "sample_num_tokens": 8038.75, "step": 9077, "total_num_tokens": 155854930.0, "z_loss": 0.0004447398241609335 }, { "copy_logits_max": -7.511954307556152, "copy_logits_min": -750000064.0, "copy_num_tokens": 347.8125, "epoch": 1.8539188154199642, "gen_logits_max": 4.844494819641113, "gen_logits_mean": -16.922945022583008, "gen_logits_min": -29.257198333740234, "gen_logits_std": 3.449690818786621, "gen_loss": 0.304751992225647, "grad_norm": 0.3605268035349593, "learning_rate": 1.9462736842105263e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9971708357334137, "mean_gen_accuracy": 0.8781248778104782, "mean_token_accuracy": 0.9086583703756332, "num_tokens": 156088036.0, "sample_num_tokens": 7721.0, "step": 9078, "total_num_tokens": 156118920.0, "z_loss": 0.0004481331561692059 }, { "copy_logits_max": -5.641713619232178, "copy_logits_min": -750000064.0, "copy_num_tokens": 527.6875, "epoch": 1.854123053357161, "gen_logits_max": 4.020443916320801, "gen_logits_mean": -17.181255340576172, "gen_logits_min": -29.546478271484375, "gen_logits_std": 3.4827957153320312, "gen_loss": 0.27944478392601013, "grad_norm": 0.34577621015853355, "learning_rate": 1.9461473684210527e-05, "loss": 0.264, "mean_copy_accuracy": 0.9965585768222809, "mean_gen_accuracy": 0.8819939941167831, "mean_token_accuracy": 0.9094849973917007, "num_tokens": 156346202.0, "sample_num_tokens": 9671.5, "step": 9079, "total_num_tokens": 156384888.0, "z_loss": 0.00042727895197458565 }, { "copy_logits_max": -6.6763482093811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.375, "epoch": 1.854327291294358, "gen_logits_max": 4.8788371086120605, "gen_logits_mean": -15.806605339050293, "gen_logits_min": -28.8247127532959, "gen_logits_std": 3.4608144760131836, "gen_loss": 0.26157742738723755, "grad_norm": 0.3326184413848583, "learning_rate": 1.9460210526315788e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9975059032440186, "mean_gen_accuracy": 0.8806413263082504, "mean_token_accuracy": 0.9111586660146713, "num_tokens": 156618037.0, "sample_num_tokens": 8516.75, "step": 9080, "total_num_tokens": 156652104.0, "z_loss": 0.000400097924284637 }, { "copy_logits_max": -5.899138927459717, "copy_logits_min": -687500032.0, "copy_num_tokens": 671.875, "epoch": 1.8545315292315547, "gen_logits_max": 5.211125373840332, "gen_logits_mean": -14.846542358398438, "gen_logits_min": -28.14537811279297, "gen_logits_std": 3.427246332168579, "gen_loss": 0.2634434401988983, "grad_norm": 0.34172914427882933, "learning_rate": 1.9458947368421052e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9967686533927917, "mean_gen_accuracy": 0.8793420642614365, "mean_token_accuracy": 0.9095611423254013, "num_tokens": 156901355.0, "sample_num_tokens": 10184.75, "step": 9081, "total_num_tokens": 156942094.0, "z_loss": 0.0004638929385691881 }, { "copy_logits_max": -7.824932098388672, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.625, "epoch": 1.8547357671687514, "gen_logits_max": 3.5532431602478027, "gen_logits_mean": -18.288379669189453, "gen_logits_min": -30.781614303588867, "gen_logits_std": 3.527019500732422, "gen_loss": 0.2706376910209656, "grad_norm": 0.3489015190235377, "learning_rate": 1.9457684210526317e-05, "loss": 0.286, "mean_copy_accuracy": 0.996807187795639, "mean_gen_accuracy": 0.8741902709007263, "mean_token_accuracy": 0.9029525369405746, "num_tokens": 157183895.0, "sample_num_tokens": 8387.25, "step": 9082, "total_num_tokens": 157217444.0, "z_loss": 0.0004247405449859798 }, { "copy_logits_max": -9.129535675048828, "copy_logits_min": -750000064.0, "copy_num_tokens": 370.1875, "epoch": 1.8549400051059486, "gen_logits_max": 3.413215398788452, "gen_logits_mean": -18.66640853881836, "gen_logits_min": -30.8908634185791, "gen_logits_std": 3.5101935863494873, "gen_loss": 0.27518218755722046, "grad_norm": 0.34385702564790455, "learning_rate": 1.945642105263158e-05, "loss": 0.267, "mean_copy_accuracy": 0.9962232112884521, "mean_gen_accuracy": 0.8811328113079071, "mean_token_accuracy": 0.9077764749526978, "num_tokens": 157451977.0, "sample_num_tokens": 8384.25, "step": 9083, "total_num_tokens": 157485514.0, "z_loss": 0.0004306842456571758 }, { "copy_logits_max": -8.560558319091797, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.8125, "epoch": 1.8551442430431453, "gen_logits_max": 5.483059883117676, "gen_logits_mean": -15.228752136230469, "gen_logits_min": -28.03240203857422, "gen_logits_std": 3.4298787117004395, "gen_loss": 0.2252316027879715, "grad_norm": 0.3530259850421349, "learning_rate": 1.9455157894736842e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9972393959760666, "mean_gen_accuracy": 0.8834667354822159, "mean_token_accuracy": 0.9087727963924408, "num_tokens": 157696338.0, "sample_num_tokens": 7834.5, "step": 9084, "total_num_tokens": 157727676.0, "z_loss": 0.0003603731165640056 }, { "copy_logits_max": -6.380855560302734, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.75, "epoch": 1.855348480980342, "gen_logits_max": 5.403326034545898, "gen_logits_mean": -15.737844467163086, "gen_logits_min": -28.34815216064453, "gen_logits_std": 3.4507601261138916, "gen_loss": 0.25202834606170654, "grad_norm": 0.335466519503814, "learning_rate": 1.9453894736842106e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9968148320913315, "mean_gen_accuracy": 0.882369339466095, "mean_token_accuracy": 0.9087002128362656, "num_tokens": 157958246.0, "sample_num_tokens": 8279.5, "step": 9085, "total_num_tokens": 157991364.0, "z_loss": 0.0004198919632472098 }, { "copy_logits_max": -8.191638946533203, "copy_logits_min": -750000128.0, "copy_num_tokens": 468.6875, "epoch": 1.855552718917539, "gen_logits_max": 6.372685432434082, "gen_logits_mean": -13.619955062866211, "gen_logits_min": -26.434654235839844, "gen_logits_std": 3.3516783714294434, "gen_loss": 0.2719000577926636, "grad_norm": 0.351523113157782, "learning_rate": 1.9452631578947367e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9960478842258453, "mean_gen_accuracy": 0.8834932446479797, "mean_token_accuracy": 0.9111412465572357, "num_tokens": 158226547.0, "sample_num_tokens": 7911.25, "step": 9086, "total_num_tokens": 158258192.0, "z_loss": 0.0003757878439500928 }, { "copy_logits_max": -6.358307838439941, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.9375, "epoch": 1.8557569568547359, "gen_logits_max": 4.3358154296875, "gen_logits_mean": -16.872634887695312, "gen_logits_min": -29.17251968383789, "gen_logits_std": 3.4868502616882324, "gen_loss": 0.281791090965271, "grad_norm": 0.37420875434899364, "learning_rate": 1.945136842105263e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9946313351392746, "mean_gen_accuracy": 0.8763200044631958, "mean_token_accuracy": 0.9019759446382523, "num_tokens": 158493699.0, "sample_num_tokens": 8903.25, "step": 9087, "total_num_tokens": 158529312.0, "z_loss": 0.00038023816887289286 }, { "copy_logits_max": -7.264177322387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.4375, "epoch": 1.8559611947919326, "gen_logits_max": 3.3333683013916016, "gen_logits_mean": -18.073644638061523, "gen_logits_min": -30.441558837890625, "gen_logits_std": 3.530087471008301, "gen_loss": 0.24244508147239685, "grad_norm": 0.3624077592882687, "learning_rate": 1.9450105263157896e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9973079115152359, "mean_gen_accuracy": 0.886023998260498, "mean_token_accuracy": 0.912516787648201, "num_tokens": 158765269.0, "sample_num_tokens": 7795.75, "step": 9088, "total_num_tokens": 158796452.0, "z_loss": 0.0003189664566889405 }, { "copy_logits_max": -7.976017951965332, "copy_logits_min": -750000000.0, "copy_num_tokens": 285.5625, "epoch": 1.8561654327291295, "gen_logits_max": 3.7470345497131348, "gen_logits_mean": -18.20115852355957, "gen_logits_min": -30.803375244140625, "gen_logits_std": 3.5164566040039062, "gen_loss": 0.26843011379241943, "grad_norm": 0.3623045530530097, "learning_rate": 1.9448842105263157e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9969062358140945, "mean_gen_accuracy": 0.8777261674404144, "mean_token_accuracy": 0.9049278646707535, "num_tokens": 159009581.0, "sample_num_tokens": 6529.25, "step": 9089, "total_num_tokens": 159035698.0, "z_loss": 0.00040569467819295824 }, { "copy_logits_max": -7.046452522277832, "copy_logits_min": -687500032.0, "copy_num_tokens": 467.0625, "epoch": 1.8563696706663264, "gen_logits_max": 4.106842041015625, "gen_logits_mean": -17.209775924682617, "gen_logits_min": -29.85024642944336, "gen_logits_std": 3.5078346729278564, "gen_loss": 0.2518473267555237, "grad_norm": 0.3734769829478557, "learning_rate": 1.9447578947368425e-05, "loss": 0.2914, "mean_copy_accuracy": 0.9969961792230606, "mean_gen_accuracy": 0.8705507814884186, "mean_token_accuracy": 0.899113342165947, "num_tokens": 159253591.0, "sample_num_tokens": 8667.75, "step": 9090, "total_num_tokens": 159288262.0, "z_loss": 0.0003631526487879455 }, { "copy_logits_max": -7.524139404296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.375, "epoch": 1.8565739086035231, "gen_logits_max": 5.321063041687012, "gen_logits_mean": -15.205224990844727, "gen_logits_min": -27.814796447753906, "gen_logits_std": 3.411471366882324, "gen_loss": 0.25346192717552185, "grad_norm": 0.3656087723470026, "learning_rate": 1.9446315789473685e-05, "loss": 0.2784, "mean_copy_accuracy": 0.995895192027092, "mean_gen_accuracy": 0.874996080994606, "mean_token_accuracy": 0.906482607126236, "num_tokens": 159547726.0, "sample_num_tokens": 8908.5, "step": 9091, "total_num_tokens": 159583360.0, "z_loss": 0.0003693123289849609 }, { "copy_logits_max": -7.456305503845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.3125, "epoch": 1.8567781465407198, "gen_logits_max": 4.892277717590332, "gen_logits_mean": -16.524524688720703, "gen_logits_min": -29.23831558227539, "gen_logits_std": 3.4774129390716553, "gen_loss": 0.2402319610118866, "grad_norm": 0.33769998072904905, "learning_rate": 1.944505263157895e-05, "loss": 0.2515, "mean_copy_accuracy": 0.9978194236755371, "mean_gen_accuracy": 0.8830514401197433, "mean_token_accuracy": 0.9148095995187759, "num_tokens": 159816426.0, "sample_num_tokens": 7668.0, "step": 9092, "total_num_tokens": 159847098.0, "z_loss": 0.00035518172080628574 }, { "copy_logits_max": -6.292348384857178, "copy_logits_min": -750000064.0, "copy_num_tokens": 797.5625, "epoch": 1.8569823844779167, "gen_logits_max": 5.547356605529785, "gen_logits_mean": -14.497720718383789, "gen_logits_min": -27.772350311279297, "gen_logits_std": 3.4297919273376465, "gen_loss": 0.23944267630577087, "grad_norm": 0.36360229036280334, "learning_rate": 1.944378947368421e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9961466938257217, "mean_gen_accuracy": 0.8749917596578598, "mean_token_accuracy": 0.9109265208244324, "num_tokens": 160085026.0, "sample_num_tokens": 9970.5, "step": 9093, "total_num_tokens": 160124908.0, "z_loss": 0.00040372327202931046 }, { "copy_logits_max": -6.907101631164551, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.625, "epoch": 1.8571866224151137, "gen_logits_max": 4.933024883270264, "gen_logits_mean": -16.825668334960938, "gen_logits_min": -28.959970474243164, "gen_logits_std": 3.4748806953430176, "gen_loss": 0.25744855403900146, "grad_norm": 0.3661763614601812, "learning_rate": 1.9442526315789475e-05, "loss": 0.2604, "mean_copy_accuracy": 0.9970857203006744, "mean_gen_accuracy": 0.8844011723995209, "mean_token_accuracy": 0.913033738732338, "num_tokens": 160365012.0, "sample_num_tokens": 8710.0, "step": 9094, "total_num_tokens": 160399852.0, "z_loss": 0.0003999922482762486 }, { "copy_logits_max": -5.146271705627441, "copy_logits_min": -750000000.0, "copy_num_tokens": 620.0625, "epoch": 1.8573908603523104, "gen_logits_max": 4.183106899261475, "gen_logits_mean": -17.416339874267578, "gen_logits_min": -30.329612731933594, "gen_logits_std": 3.490318536758423, "gen_loss": 0.2715623080730438, "grad_norm": 0.3197692093076553, "learning_rate": 1.9441263157894736e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9967555701732635, "mean_gen_accuracy": 0.8776775449514389, "mean_token_accuracy": 0.9087672084569931, "num_tokens": 160648793.0, "sample_num_tokens": 9038.25, "step": 9095, "total_num_tokens": 160684946.0, "z_loss": 0.0004890973796136677 }, { "copy_logits_max": -7.234094619750977, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.25, "epoch": 1.8575950982895073, "gen_logits_max": 6.032078266143799, "gen_logits_mean": -15.47654914855957, "gen_logits_min": -28.276859283447266, "gen_logits_std": 3.394735813140869, "gen_loss": 0.29445505142211914, "grad_norm": 0.34510205395378984, "learning_rate": 1.944e-05, "loss": 0.2745, "mean_copy_accuracy": 0.9969034940004349, "mean_gen_accuracy": 0.8762028515338898, "mean_token_accuracy": 0.9061426967382431, "num_tokens": 160921127.0, "sample_num_tokens": 8330.75, "step": 9096, "total_num_tokens": 160954450.0, "z_loss": 0.0004905512323603034 }, { "copy_logits_max": -5.845778465270996, "copy_logits_min": -625000064.0, "copy_num_tokens": 455.125, "epoch": 1.8577993362267042, "gen_logits_max": 5.116971969604492, "gen_logits_mean": -16.967330932617188, "gen_logits_min": -29.427352905273438, "gen_logits_std": 3.4532241821289062, "gen_loss": 0.28247225284576416, "grad_norm": 0.3689576180586325, "learning_rate": 1.943873684210526e-05, "loss": 0.2726, "mean_copy_accuracy": 0.997870460152626, "mean_gen_accuracy": 0.8773886263370514, "mean_token_accuracy": 0.9073881357908249, "num_tokens": 161201573.0, "sample_num_tokens": 8537.25, "step": 9097, "total_num_tokens": 161235722.0, "z_loss": 0.00048766419058665633 }, { "copy_logits_max": -8.747437477111816, "copy_logits_min": -687500032.0, "copy_num_tokens": 423.75, "epoch": 1.858003574163901, "gen_logits_max": 4.522613048553467, "gen_logits_mean": -17.779170989990234, "gen_logits_min": -30.584983825683594, "gen_logits_std": 3.4814932346343994, "gen_loss": 0.25864821672439575, "grad_norm": 0.3582635961140376, "learning_rate": 1.943747368421053e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9969737529754639, "mean_gen_accuracy": 0.8756542503833771, "mean_token_accuracy": 0.9050771296024323, "num_tokens": 161468213.0, "sample_num_tokens": 8445.25, "step": 9098, "total_num_tokens": 161501994.0, "z_loss": 0.00042603680049069226 }, { "copy_logits_max": -4.570346832275391, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.5625, "epoch": 1.8582078121010976, "gen_logits_max": 4.109136581420898, "gen_logits_mean": -16.890140533447266, "gen_logits_min": -29.463401794433594, "gen_logits_std": 3.4516687393188477, "gen_loss": 0.28319084644317627, "grad_norm": 0.32614768087797624, "learning_rate": 1.943621052631579e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9971221685409546, "mean_gen_accuracy": 0.8820680230855942, "mean_token_accuracy": 0.9093612134456635, "num_tokens": 161740042.0, "sample_num_tokens": 7902.5, "step": 9099, "total_num_tokens": 161771652.0, "z_loss": 0.0005082348361611366 }, { "copy_logits_max": -0.9438905715942383, "copy_logits_min": -687500032.0, "copy_num_tokens": 477.0625, "epoch": 1.8584120500382946, "gen_logits_max": 4.398667812347412, "gen_logits_mean": -15.844040870666504, "gen_logits_min": -29.034608840942383, "gen_logits_std": 3.4179186820983887, "gen_loss": 0.2698637545108795, "grad_norm": 0.3497832021069343, "learning_rate": 1.9434947368421054e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9970860779285431, "mean_gen_accuracy": 0.8773870766162872, "mean_token_accuracy": 0.9041614681482315, "num_tokens": 161991650.0, "sample_num_tokens": 8111.5, "step": 9100, "total_num_tokens": 162024096.0, "z_loss": 0.0004214815271552652 }, { "copy_logits_max": -5.589521884918213, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.75, "epoch": 1.8586162879754915, "gen_logits_max": 4.443158149719238, "gen_logits_mean": -16.21477699279785, "gen_logits_min": -29.245153427124023, "gen_logits_std": 3.428537607192993, "gen_loss": 0.2729596495628357, "grad_norm": 0.31966848554877175, "learning_rate": 1.943368421052632e-05, "loss": 0.258, "mean_copy_accuracy": 0.9979155510663986, "mean_gen_accuracy": 0.8783617913722992, "mean_token_accuracy": 0.9110772013664246, "num_tokens": 162261608.0, "sample_num_tokens": 9705.0, "step": 9101, "total_num_tokens": 162300428.0, "z_loss": 0.000476584245916456 }, { "copy_logits_max": -6.121087074279785, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.3125, "epoch": 1.8588205259126882, "gen_logits_max": 4.502956390380859, "gen_logits_mean": -16.589576721191406, "gen_logits_min": -29.209848403930664, "gen_logits_std": 3.4487788677215576, "gen_loss": 0.2755153477191925, "grad_norm": 0.3587901393067076, "learning_rate": 1.943242105263158e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9970868676900864, "mean_gen_accuracy": 0.8757852464914322, "mean_token_accuracy": 0.9058336019515991, "num_tokens": 162537836.0, "sample_num_tokens": 8111.5, "step": 9102, "total_num_tokens": 162570282.0, "z_loss": 0.00043382978765293956 }, { "copy_logits_max": -6.83738374710083, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.5625, "epoch": 1.8590247638498851, "gen_logits_max": 4.257390022277832, "gen_logits_mean": -17.32463836669922, "gen_logits_min": -29.530288696289062, "gen_logits_std": 3.465756893157959, "gen_loss": 0.28584936261177063, "grad_norm": 0.3625744091970341, "learning_rate": 1.9431157894736844e-05, "loss": 0.2828, "mean_copy_accuracy": 0.9957680553197861, "mean_gen_accuracy": 0.8776388168334961, "mean_token_accuracy": 0.9036126881837845, "num_tokens": 162801633.0, "sample_num_tokens": 8062.75, "step": 9103, "total_num_tokens": 162833884.0, "z_loss": 0.000444278703071177 }, { "copy_logits_max": -5.662104606628418, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.6875, "epoch": 1.859229001787082, "gen_logits_max": 5.518869400024414, "gen_logits_mean": -14.73218822479248, "gen_logits_min": -27.059961318969727, "gen_logits_std": 3.390432596206665, "gen_loss": 0.2882690727710724, "grad_norm": 0.3644852748693374, "learning_rate": 1.9429894736842105e-05, "loss": 0.2746, "mean_copy_accuracy": 0.9959341436624527, "mean_gen_accuracy": 0.8775868713855743, "mean_token_accuracy": 0.9081651866436005, "num_tokens": 163060596.0, "sample_num_tokens": 7975.0, "step": 9104, "total_num_tokens": 163092496.0, "z_loss": 0.00045976456021890044 }, { "copy_logits_max": -4.228904724121094, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.3125, "epoch": 1.8594332397242788, "gen_logits_max": 6.421749114990234, "gen_logits_mean": -14.767948150634766, "gen_logits_min": -27.69233512878418, "gen_logits_std": 3.457735300064087, "gen_loss": 0.26960721611976624, "grad_norm": 0.31355022581594943, "learning_rate": 1.942863157894737e-05, "loss": 0.2524, "mean_copy_accuracy": 0.9981643110513687, "mean_gen_accuracy": 0.8874883055686951, "mean_token_accuracy": 0.915309876203537, "num_tokens": 163352204.0, "sample_num_tokens": 7559.0, "step": 9105, "total_num_tokens": 163382440.0, "z_loss": 0.00046544510405510664 }, { "copy_logits_max": -1.8054134845733643, "copy_logits_min": -750000000.0, "copy_num_tokens": 395.875, "epoch": 1.8596374776614755, "gen_logits_max": 5.9672746658325195, "gen_logits_mean": -14.857866287231445, "gen_logits_min": -27.903488159179688, "gen_logits_std": 3.4217920303344727, "gen_loss": 0.30832183361053467, "grad_norm": 0.3574057910163698, "learning_rate": 1.942736842105263e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9965604394674301, "mean_gen_accuracy": 0.8790241479873657, "mean_token_accuracy": 0.907481774687767, "num_tokens": 163620448.0, "sample_num_tokens": 7665.0, "step": 9106, "total_num_tokens": 163651108.0, "z_loss": 0.0004926076508127153 }, { "copy_logits_max": -5.347866058349609, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.0625, "epoch": 1.8598417155986726, "gen_logits_max": 5.854736328125, "gen_logits_mean": -16.10653305053711, "gen_logits_min": -28.27747917175293, "gen_logits_std": 3.4314680099487305, "gen_loss": 0.28278297185897827, "grad_norm": 0.36301023819927336, "learning_rate": 1.9426105263157897e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9971906244754791, "mean_gen_accuracy": 0.8765646368265152, "mean_token_accuracy": 0.9063640385866165, "num_tokens": 163891168.0, "sample_num_tokens": 8308.0, "step": 9107, "total_num_tokens": 163924400.0, "z_loss": 0.00043786014430224895 }, { "copy_logits_max": -2.2581496238708496, "copy_logits_min": -750000000.0, "copy_num_tokens": 522.5625, "epoch": 1.8600459535358693, "gen_logits_max": 4.990002155303955, "gen_logits_mean": -15.922274589538574, "gen_logits_min": -28.34005355834961, "gen_logits_std": 3.4692134857177734, "gen_loss": 0.24604928493499756, "grad_norm": 0.3330109474760837, "learning_rate": 1.942484210526316e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9970863610506058, "mean_gen_accuracy": 0.8839240074157715, "mean_token_accuracy": 0.9093492478132248, "num_tokens": 164173882.0, "sample_num_tokens": 8656.5, "step": 9108, "total_num_tokens": 164208508.0, "z_loss": 0.0003808084293268621 }, { "copy_logits_max": -4.423883438110352, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.125, "epoch": 1.860250191473066, "gen_logits_max": 5.615840435028076, "gen_logits_mean": -16.36284637451172, "gen_logits_min": -29.46652603149414, "gen_logits_std": 3.519432783126831, "gen_loss": 0.21569281816482544, "grad_norm": 0.35466347740419996, "learning_rate": 1.9423578947368423e-05, "loss": 0.2555, "mean_copy_accuracy": 0.9975952655076981, "mean_gen_accuracy": 0.8843303173780441, "mean_token_accuracy": 0.9129043370485306, "num_tokens": 164440213.0, "sample_num_tokens": 9637.25, "step": 9109, "total_num_tokens": 164478762.0, "z_loss": 0.00031018152367323637 }, { "copy_logits_max": -0.6642234921455383, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.4375, "epoch": 1.860454429410263, "gen_logits_max": 4.804222583770752, "gen_logits_mean": -16.143455505371094, "gen_logits_min": -28.845773696899414, "gen_logits_std": 3.4917333126068115, "gen_loss": 0.27028563618659973, "grad_norm": 0.3658085194008191, "learning_rate": 1.9422315789473684e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9978046268224716, "mean_gen_accuracy": 0.8724646717309952, "mean_token_accuracy": 0.9058617502450943, "num_tokens": 164723265.0, "sample_num_tokens": 7534.25, "step": 9110, "total_num_tokens": 164753402.0, "z_loss": 0.0003981308836955577 }, { "copy_logits_max": 1.2213656902313232, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.5625, "epoch": 1.8606586673474599, "gen_logits_max": 5.853142261505127, "gen_logits_mean": -14.554229736328125, "gen_logits_min": -27.570629119873047, "gen_logits_std": 3.4234631061553955, "gen_loss": 0.24870307743549347, "grad_norm": 0.3576389545901254, "learning_rate": 1.9421052631578948e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9971047192811966, "mean_gen_accuracy": 0.8731145560741425, "mean_token_accuracy": 0.9050072729587555, "num_tokens": 164996575.0, "sample_num_tokens": 8749.25, "step": 9111, "total_num_tokens": 165031572.0, "z_loss": 0.0004063011147081852 }, { "copy_logits_max": -2.1899476051330566, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.5, "epoch": 1.8608629052846566, "gen_logits_max": 5.530679702758789, "gen_logits_mean": -16.16283416748047, "gen_logits_min": -28.947437286376953, "gen_logits_std": 3.4669289588928223, "gen_loss": 0.2951706051826477, "grad_norm": 0.36594153293336595, "learning_rate": 1.941978947368421e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9984452426433563, "mean_gen_accuracy": 0.8720290064811707, "mean_token_accuracy": 0.9032972902059555, "num_tokens": 165271979.0, "sample_num_tokens": 9380.25, "step": 9112, "total_num_tokens": 165309500.0, "z_loss": 0.00046906020725145936 }, { "copy_logits_max": -3.46345853805542, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.3125, "epoch": 1.8610671432218533, "gen_logits_max": 4.630962371826172, "gen_logits_mean": -17.017826080322266, "gen_logits_min": -29.561527252197266, "gen_logits_std": 3.522845506668091, "gen_loss": 0.262884259223938, "grad_norm": 0.33732744821363597, "learning_rate": 1.9418526315789473e-05, "loss": 0.276, "mean_copy_accuracy": 0.9971278756856918, "mean_gen_accuracy": 0.8750053197145462, "mean_token_accuracy": 0.9056609123945236, "num_tokens": 165561298.0, "sample_num_tokens": 8330.0, "step": 9113, "total_num_tokens": 165594618.0, "z_loss": 0.00039766516420058906 }, { "copy_logits_max": 0.0013426542282104492, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.0, "epoch": 1.8612713811590504, "gen_logits_max": 4.7993316650390625, "gen_logits_mean": -15.889081954956055, "gen_logits_min": -28.240795135498047, "gen_logits_std": 3.459963321685791, "gen_loss": 0.31397712230682373, "grad_norm": 0.36014204969434543, "learning_rate": 1.9417263157894734e-05, "loss": 0.277, "mean_copy_accuracy": 0.9972271770238876, "mean_gen_accuracy": 0.8788151741027832, "mean_token_accuracy": 0.9058535695075989, "num_tokens": 165852003.0, "sample_num_tokens": 8984.25, "step": 9114, "total_num_tokens": 165887940.0, "z_loss": 0.000508657016325742 }, { "copy_logits_max": -1.7568105459213257, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.25, "epoch": 1.8614756190962471, "gen_logits_max": 3.867337226867676, "gen_logits_mean": -17.314498901367188, "gen_logits_min": -29.716983795166016, "gen_logits_std": 3.486715078353882, "gen_loss": 0.319693922996521, "grad_norm": 0.3502537318640276, "learning_rate": 1.9416000000000002e-05, "loss": 0.2897, "mean_copy_accuracy": 0.9972948580980301, "mean_gen_accuracy": 0.8649233281612396, "mean_token_accuracy": 0.9009950757026672, "num_tokens": 166137464.0, "sample_num_tokens": 9063.0, "step": 9115, "total_num_tokens": 166173716.0, "z_loss": 0.000579755927901715 }, { "copy_logits_max": 0.6090895533561707, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.9375, "epoch": 1.8616798570334439, "gen_logits_max": 4.9275407791137695, "gen_logits_mean": -16.03240966796875, "gen_logits_min": -28.956506729125977, "gen_logits_std": 3.485417366027832, "gen_loss": 0.29170823097229004, "grad_norm": 0.3667733706443924, "learning_rate": 1.9414736842105266e-05, "loss": 0.2976, "mean_copy_accuracy": 0.9965904206037521, "mean_gen_accuracy": 0.8701515644788742, "mean_token_accuracy": 0.9004940092563629, "num_tokens": 166410900.0, "sample_num_tokens": 7445.5, "step": 9116, "total_num_tokens": 166440682.0, "z_loss": 0.0005197850987315178 }, { "copy_logits_max": -0.40566539764404297, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.6875, "epoch": 1.8618840949706408, "gen_logits_max": 4.524549961090088, "gen_logits_mean": -16.110239028930664, "gen_logits_min": -28.435379028320312, "gen_logits_std": 3.469346284866333, "gen_loss": 0.2502706050872803, "grad_norm": 0.3539585951370305, "learning_rate": 1.9413473684210527e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9974857270717621, "mean_gen_accuracy": 0.8762755393981934, "mean_token_accuracy": 0.9072991162538528, "num_tokens": 166669448.0, "sample_num_tokens": 8446.0, "step": 9117, "total_num_tokens": 166703232.0, "z_loss": 0.00037521315971389413 }, { "copy_logits_max": -0.5851470828056335, "copy_logits_min": -750000000.0, "copy_num_tokens": 746.0, "epoch": 1.8620883329078377, "gen_logits_max": 4.540078163146973, "gen_logits_mean": -15.765863418579102, "gen_logits_min": -28.528491973876953, "gen_logits_std": 3.499455451965332, "gen_loss": 0.2400892972946167, "grad_norm": 0.31906856439994835, "learning_rate": 1.941221052631579e-05, "loss": 0.2508, "mean_copy_accuracy": 0.9978030920028687, "mean_gen_accuracy": 0.8830638825893402, "mean_token_accuracy": 0.9163328856229782, "num_tokens": 166976350.0, "sample_num_tokens": 9812.5, "step": 9118, "total_num_tokens": 167015600.0, "z_loss": 0.0003660616057459265 }, { "copy_logits_max": -3.406980037689209, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.6875, "epoch": 1.8622925708450344, "gen_logits_max": 5.068646430969238, "gen_logits_mean": -16.196434020996094, "gen_logits_min": -28.40135955810547, "gen_logits_std": 3.5128355026245117, "gen_loss": 0.25771844387054443, "grad_norm": 0.3603807203770078, "learning_rate": 1.9410947368421052e-05, "loss": 0.2717, "mean_copy_accuracy": 0.9968261420726776, "mean_gen_accuracy": 0.8818663507699966, "mean_token_accuracy": 0.9093241095542908, "num_tokens": 167255562.0, "sample_num_tokens": 10053.0, "step": 9119, "total_num_tokens": 167295774.0, "z_loss": 0.00036725509562529624 }, { "copy_logits_max": -2.1710591316223145, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 1.8624968087822313, "gen_logits_max": 4.838838577270508, "gen_logits_mean": -16.346710205078125, "gen_logits_min": -28.76142120361328, "gen_logits_std": 3.4728903770446777, "gen_loss": 0.27307847142219543, "grad_norm": 0.37154187758736196, "learning_rate": 1.9409684210526317e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9971454441547394, "mean_gen_accuracy": 0.8809413313865662, "mean_token_accuracy": 0.9079484194517136, "num_tokens": 167530262.0, "sample_num_tokens": 7725.0, "step": 9120, "total_num_tokens": 167561162.0, "z_loss": 0.0003990617406088859 }, { "copy_logits_max": -1.8318376541137695, "copy_logits_min": -750000128.0, "copy_num_tokens": 372.875, "epoch": 1.8627010467194283, "gen_logits_max": 5.072093963623047, "gen_logits_mean": -16.324954986572266, "gen_logits_min": -28.384862899780273, "gen_logits_std": 3.5009422302246094, "gen_loss": 0.25100234150886536, "grad_norm": 0.3703321604264087, "learning_rate": 1.9408421052631578e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9971209615468979, "mean_gen_accuracy": 0.8832264840602875, "mean_token_accuracy": 0.9100210070610046, "num_tokens": 167806189.0, "sample_num_tokens": 8569.25, "step": 9121, "total_num_tokens": 167840466.0, "z_loss": 0.0003403186856303364 }, { "copy_logits_max": -2.5885095596313477, "copy_logits_min": -750000064.0, "copy_num_tokens": 565.6875, "epoch": 1.862905284656625, "gen_logits_max": 3.2170424461364746, "gen_logits_mean": -18.39272689819336, "gen_logits_min": -30.80669403076172, "gen_logits_std": 3.5656356811523438, "gen_loss": 0.25433725118637085, "grad_norm": 0.3450919160432196, "learning_rate": 1.9407157894736842e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9972959160804749, "mean_gen_accuracy": 0.8768342733383179, "mean_token_accuracy": 0.9046862423419952, "num_tokens": 168071611.0, "sample_num_tokens": 9489.25, "step": 9122, "total_num_tokens": 168109568.0, "z_loss": 0.0003656813642010093 }, { "copy_logits_max": -1.5099048614501953, "copy_logits_min": -750000128.0, "copy_num_tokens": 336.0625, "epoch": 1.8631095225938217, "gen_logits_max": 5.610726356506348, "gen_logits_mean": -16.29079246520996, "gen_logits_min": -28.40848159790039, "gen_logits_std": 3.4519729614257812, "gen_loss": 0.2998722493648529, "grad_norm": 0.6159120505768284, "learning_rate": 1.9405894736842106e-05, "loss": 0.294, "mean_copy_accuracy": 0.9959369897842407, "mean_gen_accuracy": 0.872758612036705, "mean_token_accuracy": 0.8982962369918823, "num_tokens": 168337290.0, "sample_num_tokens": 7996.5, "step": 9123, "total_num_tokens": 168369276.0, "z_loss": 0.00042683351784944534 }, { "copy_logits_max": 4.288492202758789, "copy_logits_min": -750000000.0, "copy_num_tokens": 770.625, "epoch": 1.8633137605310186, "gen_logits_max": 5.790323257446289, "gen_logits_mean": -13.807598114013672, "gen_logits_min": -27.169837951660156, "gen_logits_std": 3.3944270610809326, "gen_loss": 0.223486065864563, "grad_norm": 0.3328134062408906, "learning_rate": 1.940463157894737e-05, "loss": 0.267, "mean_copy_accuracy": 0.9968392252922058, "mean_gen_accuracy": 0.8808417469263077, "mean_token_accuracy": 0.9100586771965027, "num_tokens": 168601892.0, "sample_num_tokens": 10452.0, "step": 9124, "total_num_tokens": 168643700.0, "z_loss": 0.00032421702053397894 }, { "copy_logits_max": -4.514186859130859, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.625, "epoch": 1.8635179984682155, "gen_logits_max": 3.8265085220336914, "gen_logits_mean": -17.52721405029297, "gen_logits_min": -29.367156982421875, "gen_logits_std": 3.4685611724853516, "gen_loss": 0.2756090462207794, "grad_norm": 0.36045850783716965, "learning_rate": 1.940336842105263e-05, "loss": 0.278, "mean_copy_accuracy": 0.9977721571922302, "mean_gen_accuracy": 0.8804823011159897, "mean_token_accuracy": 0.9070824533700943, "num_tokens": 168888502.0, "sample_num_tokens": 9014.0, "step": 9125, "total_num_tokens": 168924558.0, "z_loss": 0.0004158079100307077 }, { "copy_logits_max": -1.378050684928894, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.3125, "epoch": 1.8637222364054122, "gen_logits_max": 4.149903774261475, "gen_logits_mean": -16.73588752746582, "gen_logits_min": -29.194355010986328, "gen_logits_std": 3.448970079421997, "gen_loss": 0.2815784811973572, "grad_norm": 0.341494917727559, "learning_rate": 1.9402105263157896e-05, "loss": 0.2696, "mean_copy_accuracy": 0.997024193406105, "mean_gen_accuracy": 0.8784731030464172, "mean_token_accuracy": 0.9070571660995483, "num_tokens": 169160969.0, "sample_num_tokens": 8655.75, "step": 9126, "total_num_tokens": 169195592.0, "z_loss": 0.00045606354251503944 }, { "copy_logits_max": 1.35478675365448, "copy_logits_min": -750000064.0, "copy_num_tokens": 485.1875, "epoch": 1.8639264743426092, "gen_logits_max": 6.023650646209717, "gen_logits_mean": -13.61734390258789, "gen_logits_min": -26.163503646850586, "gen_logits_std": 3.3047103881835938, "gen_loss": 0.28820207715034485, "grad_norm": 0.3376327937662694, "learning_rate": 1.9400842105263157e-05, "loss": 0.276, "mean_copy_accuracy": 0.9966007471084595, "mean_gen_accuracy": 0.8793117851018906, "mean_token_accuracy": 0.905835285782814, "num_tokens": 169427766.0, "sample_num_tokens": 8913.5, "step": 9127, "total_num_tokens": 169463420.0, "z_loss": 0.0004226116870995611 }, { "copy_logits_max": 0.30034875869750977, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.4375, "epoch": 1.864130712279806, "gen_logits_max": 5.033237934112549, "gen_logits_mean": -15.415267944335938, "gen_logits_min": -28.120765686035156, "gen_logits_std": 3.395550489425659, "gen_loss": 0.26311439275741577, "grad_norm": 0.3489742872331934, "learning_rate": 1.939957894736842e-05, "loss": 0.2878, "mean_copy_accuracy": 0.9966096878051758, "mean_gen_accuracy": 0.8720360398292542, "mean_token_accuracy": 0.902307540178299, "num_tokens": 169694045.0, "sample_num_tokens": 8514.25, "step": 9128, "total_num_tokens": 169728102.0, "z_loss": 0.00041431363206356764 }, { "copy_logits_max": -4.016361236572266, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.9375, "epoch": 1.8643349502170028, "gen_logits_max": 3.450364589691162, "gen_logits_mean": -17.114295959472656, "gen_logits_min": -29.800785064697266, "gen_logits_std": 3.477715015411377, "gen_loss": 0.2743332087993622, "grad_norm": 0.33945224525562206, "learning_rate": 1.9398315789473685e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9973443895578384, "mean_gen_accuracy": 0.8736826777458191, "mean_token_accuracy": 0.9070264995098114, "num_tokens": 169970223.0, "sample_num_tokens": 9195.75, "step": 9129, "total_num_tokens": 170007006.0, "z_loss": 0.00045777446939609945 }, { "copy_logits_max": 2.720026731491089, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.75, "epoch": 1.8645391881541995, "gen_logits_max": 5.100373268127441, "gen_logits_mean": -14.447487831115723, "gen_logits_min": -27.11931610107422, "gen_logits_std": 3.3410909175872803, "gen_loss": 0.2835341691970825, "grad_norm": 0.32637096492985845, "learning_rate": 1.9397052631578946e-05, "loss": 0.2623, "mean_copy_accuracy": 0.9979971200227737, "mean_gen_accuracy": 0.877965047955513, "mean_token_accuracy": 0.9090477377176285, "num_tokens": 170261101.0, "sample_num_tokens": 8701.25, "step": 9130, "total_num_tokens": 170295906.0, "z_loss": 0.0004058284976053983 }, { "copy_logits_max": -2.7996902465820312, "copy_logits_min": -750000000.0, "copy_num_tokens": 511.125, "epoch": 1.8647434260913964, "gen_logits_max": 2.6994452476501465, "gen_logits_mean": -18.57138442993164, "gen_logits_min": -30.89505958557129, "gen_logits_std": 3.5288989543914795, "gen_loss": 0.2374430000782013, "grad_norm": 0.34557384160836974, "learning_rate": 1.9395789473684214e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9967847764492035, "mean_gen_accuracy": 0.8826486468315125, "mean_token_accuracy": 0.9089665114879608, "num_tokens": 170545937.0, "sample_num_tokens": 9419.75, "step": 9131, "total_num_tokens": 170583616.0, "z_loss": 0.00038063974352553487 }, { "copy_logits_max": -3.883011817932129, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.875, "epoch": 1.8649476640285934, "gen_logits_max": 3.4442479610443115, "gen_logits_mean": -17.795001983642578, "gen_logits_min": -29.84268569946289, "gen_logits_std": 3.476825475692749, "gen_loss": 0.25155285000801086, "grad_norm": 0.3276214832238909, "learning_rate": 1.9394526315789475e-05, "loss": 0.2622, "mean_copy_accuracy": 0.9973656088113785, "mean_gen_accuracy": 0.8837438225746155, "mean_token_accuracy": 0.9105041772127151, "num_tokens": 170824257.0, "sample_num_tokens": 8502.25, "step": 9132, "total_num_tokens": 170858266.0, "z_loss": 0.0003927025245502591 }, { "copy_logits_max": 1.5920276641845703, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.5, "epoch": 1.86515190196579, "gen_logits_max": 4.322154521942139, "gen_logits_mean": -15.260885238647461, "gen_logits_min": -28.00524139404297, "gen_logits_std": 3.387406349182129, "gen_loss": 0.2485445886850357, "grad_norm": 0.3441207755364016, "learning_rate": 1.939326315789474e-05, "loss": 0.2657, "mean_copy_accuracy": 0.996754840016365, "mean_gen_accuracy": 0.8820628374814987, "mean_token_accuracy": 0.909570038318634, "num_tokens": 171085560.0, "sample_num_tokens": 8251.0, "step": 9133, "total_num_tokens": 171118564.0, "z_loss": 0.00043445974006317556 }, { "copy_logits_max": -4.261554718017578, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.0, "epoch": 1.865356139902987, "gen_logits_max": 3.008803606033325, "gen_logits_mean": -18.394895553588867, "gen_logits_min": -30.66564178466797, "gen_logits_std": 3.515716552734375, "gen_loss": 0.2740662693977356, "grad_norm": 0.36354352055140593, "learning_rate": 1.9392e-05, "loss": 0.2691, "mean_copy_accuracy": 0.9969512671232224, "mean_gen_accuracy": 0.87738998234272, "mean_token_accuracy": 0.9075326025485992, "num_tokens": 171360074.0, "sample_num_tokens": 9495.5, "step": 9134, "total_num_tokens": 171398056.0, "z_loss": 0.0004149593587499112 }, { "copy_logits_max": -2.773244857788086, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.6875, "epoch": 1.865560377840184, "gen_logits_max": 2.861525297164917, "gen_logits_mean": -18.588237762451172, "gen_logits_min": -30.93769073486328, "gen_logits_std": 3.509927272796631, "gen_loss": 0.2785077691078186, "grad_norm": 0.32691646065050184, "learning_rate": 1.9390736842105264e-05, "loss": 0.2587, "mean_copy_accuracy": 0.9971051216125488, "mean_gen_accuracy": 0.8801090866327286, "mean_token_accuracy": 0.911150723695755, "num_tokens": 171649387.0, "sample_num_tokens": 8162.75, "step": 9135, "total_num_tokens": 171682038.0, "z_loss": 0.00043176207691431046 }, { "copy_logits_max": -0.3062894940376282, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.3125, "epoch": 1.8657646157773806, "gen_logits_max": 3.3790106773376465, "gen_logits_mean": -17.53341293334961, "gen_logits_min": -29.802154541015625, "gen_logits_std": 3.4215736389160156, "gen_loss": 0.27612748742103577, "grad_norm": 0.33425868162793565, "learning_rate": 1.9389473684210525e-05, "loss": 0.2704, "mean_copy_accuracy": 0.996633380651474, "mean_gen_accuracy": 0.8781170845031738, "mean_token_accuracy": 0.9066951870918274, "num_tokens": 171926023.0, "sample_num_tokens": 7910.25, "step": 9136, "total_num_tokens": 171957664.0, "z_loss": 0.0004071115981787443 }, { "copy_logits_max": 0.5507537126541138, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.3125, "epoch": 1.8659688537145773, "gen_logits_max": 3.1228694915771484, "gen_logits_mean": -17.200817108154297, "gen_logits_min": -29.87375831604004, "gen_logits_std": 3.430593967437744, "gen_loss": 0.2635604441165924, "grad_norm": 0.36506728631202645, "learning_rate": 1.938821052631579e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9975143074989319, "mean_gen_accuracy": 0.8778126090764999, "mean_token_accuracy": 0.9120103418827057, "num_tokens": 172196068.0, "sample_num_tokens": 8315.0, "step": 9137, "total_num_tokens": 172229328.0, "z_loss": 0.0004059056518599391 }, { "copy_logits_max": -2.053835391998291, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.0625, "epoch": 1.8661730916517745, "gen_logits_max": 4.363109588623047, "gen_logits_mean": -16.83203125, "gen_logits_min": -28.9289493560791, "gen_logits_std": 3.386051893234253, "gen_loss": 0.32999175786972046, "grad_norm": 0.3455604989671086, "learning_rate": 1.938694736842105e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9974389970302582, "mean_gen_accuracy": 0.8779969811439514, "mean_token_accuracy": 0.9086479395627975, "num_tokens": 172489838.0, "sample_num_tokens": 8877.0, "step": 9138, "total_num_tokens": 172525346.0, "z_loss": 0.00047460250789299607 }, { "copy_logits_max": -3.0306150913238525, "copy_logits_min": -750000000.0, "copy_num_tokens": 325.3125, "epoch": 1.8663773295889712, "gen_logits_max": 4.53697395324707, "gen_logits_mean": -16.271106719970703, "gen_logits_min": -29.085012435913086, "gen_logits_std": 3.411937713623047, "gen_loss": 0.26442039012908936, "grad_norm": 0.40649167324071994, "learning_rate": 1.9385684210526318e-05, "loss": 0.2734, "mean_copy_accuracy": 0.996371865272522, "mean_gen_accuracy": 0.8804610669612885, "mean_token_accuracy": 0.9067494422197342, "num_tokens": 172743000.0, "sample_num_tokens": 6975.0, "step": 9139, "total_num_tokens": 172770900.0, "z_loss": 0.000414250825997442 }, { "copy_logits_max": -2.714817523956299, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.0625, "epoch": 1.8665815675261679, "gen_logits_max": 3.223707437515259, "gen_logits_mean": -17.944299697875977, "gen_logits_min": -30.23195457458496, "gen_logits_std": 3.447418689727783, "gen_loss": 0.2805784344673157, "grad_norm": 0.3635853021150091, "learning_rate": 1.938442105263158e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9967391341924667, "mean_gen_accuracy": 0.876534178853035, "mean_token_accuracy": 0.904559001326561, "num_tokens": 173012822.0, "sample_num_tokens": 9230.0, "step": 9140, "total_num_tokens": 173049742.0, "z_loss": 0.00042071405914612114 }, { "copy_logits_max": -2.667612075805664, "copy_logits_min": -687500032.0, "copy_num_tokens": 335.4375, "epoch": 1.8667858054633648, "gen_logits_max": 4.3329291343688965, "gen_logits_mean": -17.30243492126465, "gen_logits_min": -29.522701263427734, "gen_logits_std": 3.438674211502075, "gen_loss": 0.2942432761192322, "grad_norm": 0.34806207142033047, "learning_rate": 1.9383157894736843e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9968540668487549, "mean_gen_accuracy": 0.8795410841703415, "mean_token_accuracy": 0.9061680734157562, "num_tokens": 173300019.0, "sample_num_tokens": 7861.75, "step": 9141, "total_num_tokens": 173331466.0, "z_loss": 0.000435207795817405 }, { "copy_logits_max": 1.3538644313812256, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.0625, "epoch": 1.8669900434005617, "gen_logits_max": 4.322815418243408, "gen_logits_mean": -16.232810974121094, "gen_logits_min": -28.643653869628906, "gen_logits_std": 3.3553194999694824, "gen_loss": 0.31472963094711304, "grad_norm": 0.3376626554294821, "learning_rate": 1.9381894736842108e-05, "loss": 0.2739, "mean_copy_accuracy": 0.996496245265007, "mean_gen_accuracy": 0.8764575570821762, "mean_token_accuracy": 0.9066971391439438, "num_tokens": 173570120.0, "sample_num_tokens": 7815.0, "step": 9142, "total_num_tokens": 173601380.0, "z_loss": 0.0005114773521199822 }, { "copy_logits_max": -4.208310604095459, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.9375, "epoch": 1.8671942813377584, "gen_logits_max": 3.387817621231079, "gen_logits_mean": -18.26784324645996, "gen_logits_min": -29.97773551940918, "gen_logits_std": 3.4618396759033203, "gen_loss": 0.2915416359901428, "grad_norm": 0.34158931529251463, "learning_rate": 1.938063157894737e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9975356608629227, "mean_gen_accuracy": 0.8800003081560135, "mean_token_accuracy": 0.9074234068393707, "num_tokens": 173832326.0, "sample_num_tokens": 7724.0, "step": 9143, "total_num_tokens": 173863222.0, "z_loss": 0.0004689658817369491 }, { "copy_logits_max": -1.4563853740692139, "copy_logits_min": -750000064.0, "copy_num_tokens": 482.9375, "epoch": 1.8673985192749554, "gen_logits_max": 3.046764373779297, "gen_logits_mean": -18.1485652923584, "gen_logits_min": -30.480911254882812, "gen_logits_std": 3.464669942855835, "gen_loss": 0.3069348633289337, "grad_norm": 0.35481109266029104, "learning_rate": 1.9379368421052633e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9977525919675827, "mean_gen_accuracy": 0.8782601207494736, "mean_token_accuracy": 0.9141056686639786, "num_tokens": 174125499.0, "sample_num_tokens": 8270.25, "step": 9144, "total_num_tokens": 174158580.0, "z_loss": 0.00048116667312569916 }, { "copy_logits_max": -3.441373109817505, "copy_logits_min": -687500032.0, "copy_num_tokens": 365.0, "epoch": 1.8676027572121523, "gen_logits_max": 3.6752378940582275, "gen_logits_mean": -17.696582794189453, "gen_logits_min": -29.842180252075195, "gen_logits_std": 3.4209096431732178, "gen_loss": 0.3329896926879883, "grad_norm": 0.38102994336141877, "learning_rate": 1.9378105263157894e-05, "loss": 0.2967, "mean_copy_accuracy": 0.9965934604406357, "mean_gen_accuracy": 0.869800329208374, "mean_token_accuracy": 0.8984844386577606, "num_tokens": 174402755.0, "sample_num_tokens": 7767.75, "step": 9145, "total_num_tokens": 174433826.0, "z_loss": 0.00046854946413077414 }, { "copy_logits_max": -2.6116130352020264, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.8125, "epoch": 1.867806995149349, "gen_logits_max": 4.28973913192749, "gen_logits_mean": -16.302448272705078, "gen_logits_min": -28.925432205200195, "gen_logits_std": 3.4018301963806152, "gen_loss": 0.28176388144493103, "grad_norm": 0.3553424615955189, "learning_rate": 1.9376842105263158e-05, "loss": 0.2602, "mean_copy_accuracy": 0.997089073061943, "mean_gen_accuracy": 0.8820637762546539, "mean_token_accuracy": 0.9110884368419647, "num_tokens": 174668705.0, "sample_num_tokens": 8843.75, "step": 9146, "total_num_tokens": 174704080.0, "z_loss": 0.0004018308245576918 }, { "copy_logits_max": -4.449471473693848, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.4375, "epoch": 1.8680112330865457, "gen_logits_max": 2.9836413860321045, "gen_logits_mean": -17.637298583984375, "gen_logits_min": -30.481897354125977, "gen_logits_std": 3.4722280502319336, "gen_loss": 0.27363523840904236, "grad_norm": 0.37382476200089376, "learning_rate": 1.9375578947368422e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9960081726312637, "mean_gen_accuracy": 0.8725814819335938, "mean_token_accuracy": 0.9062622487545013, "num_tokens": 174943053.0, "sample_num_tokens": 8545.75, "step": 9147, "total_num_tokens": 174977236.0, "z_loss": 0.0004455252201296389 }, { "copy_logits_max": -4.156911849975586, "copy_logits_min": -750000064.0, "copy_num_tokens": 548.5, "epoch": 1.8682154710237426, "gen_logits_max": 2.348428249359131, "gen_logits_mean": -17.670867919921875, "gen_logits_min": -29.846019744873047, "gen_logits_std": 3.4100143909454346, "gen_loss": 0.23588857054710388, "grad_norm": 0.3543208766861407, "learning_rate": 1.9374315789473687e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9960106462240219, "mean_gen_accuracy": 0.8788031786680222, "mean_token_accuracy": 0.9066881537437439, "num_tokens": 175191317.0, "sample_num_tokens": 8505.25, "step": 9148, "total_num_tokens": 175225338.0, "z_loss": 0.0003591815766412765 }, { "copy_logits_max": -2.5084176063537598, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.6875, "epoch": 1.8684197089609396, "gen_logits_max": 3.467236280441284, "gen_logits_mean": -17.059030532836914, "gen_logits_min": -29.36477279663086, "gen_logits_std": 3.4525489807128906, "gen_loss": 0.23545370995998383, "grad_norm": 0.3636532276986032, "learning_rate": 1.9373052631578948e-05, "loss": 0.2546, "mean_copy_accuracy": 0.9979875385761261, "mean_gen_accuracy": 0.8788871616125107, "mean_token_accuracy": 0.9105154871940613, "num_tokens": 175473878.0, "sample_num_tokens": 8710.5, "step": 9149, "total_num_tokens": 175508720.0, "z_loss": 0.00036861453554593027 }, { "copy_logits_max": -5.642854690551758, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.875, "epoch": 1.8686239468981363, "gen_logits_max": 3.6268954277038574, "gen_logits_mean": -17.478713989257812, "gen_logits_min": -29.656511306762695, "gen_logits_std": 3.442265510559082, "gen_loss": 0.2879713177680969, "grad_norm": 0.35804085386015716, "learning_rate": 1.9371789473684212e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9961067736148834, "mean_gen_accuracy": 0.8792400658130646, "mean_token_accuracy": 0.9074824303388596, "num_tokens": 175748461.0, "sample_num_tokens": 8471.75, "step": 9150, "total_num_tokens": 175782348.0, "z_loss": 0.0005007090512663126 }, { "copy_logits_max": -7.538000583648682, "copy_logits_min": -750000128.0, "copy_num_tokens": 365.8125, "epoch": 1.8688281848353332, "gen_logits_max": 3.9207897186279297, "gen_logits_mean": -17.690898895263672, "gen_logits_min": -29.38861083984375, "gen_logits_std": 3.4156265258789062, "gen_loss": 0.31150299310684204, "grad_norm": 0.3171158087780421, "learning_rate": 1.9370526315789473e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9963539391756058, "mean_gen_accuracy": 0.8782559484243393, "mean_token_accuracy": 0.9063486903905869, "num_tokens": 176040728.0, "sample_num_tokens": 8012.0, "step": 9151, "total_num_tokens": 176072776.0, "z_loss": 0.0004824302450288087 }, { "copy_logits_max": -4.447796821594238, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.875, "epoch": 1.8690324227725301, "gen_logits_max": 3.4144911766052246, "gen_logits_mean": -18.225252151489258, "gen_logits_min": -30.26004981994629, "gen_logits_std": 3.4990901947021484, "gen_loss": 0.2563973665237427, "grad_norm": 0.32553572540586323, "learning_rate": 1.9369263157894737e-05, "loss": 0.2571, "mean_copy_accuracy": 0.9962736219167709, "mean_gen_accuracy": 0.8828717321157455, "mean_token_accuracy": 0.9127406924962997, "num_tokens": 176310376.0, "sample_num_tokens": 8699.5, "step": 9152, "total_num_tokens": 176345174.0, "z_loss": 0.00040564857772551477 }, { "copy_logits_max": -5.792325973510742, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.25, "epoch": 1.8692366607097268, "gen_logits_max": 3.725409507751465, "gen_logits_mean": -17.62087631225586, "gen_logits_min": -30.183244705200195, "gen_logits_std": 3.4836654663085938, "gen_loss": 0.262224406003952, "grad_norm": 0.34240472005821626, "learning_rate": 1.9367999999999998e-05, "loss": 0.2698, "mean_copy_accuracy": 0.997299998998642, "mean_gen_accuracy": 0.8786700218915939, "mean_token_accuracy": 0.9075387269258499, "num_tokens": 176582763.0, "sample_num_tokens": 8020.25, "step": 9153, "total_num_tokens": 176614844.0, "z_loss": 0.00042710843263193965 }, { "copy_logits_max": -4.096806526184082, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.5, "epoch": 1.8694408986469235, "gen_logits_max": 4.818706035614014, "gen_logits_mean": -14.589719772338867, "gen_logits_min": -26.993013381958008, "gen_logits_std": 3.377427816390991, "gen_loss": 0.2374946027994156, "grad_norm": 0.347847839872818, "learning_rate": 1.9366736842105263e-05, "loss": 0.278, "mean_copy_accuracy": 0.9978045076131821, "mean_gen_accuracy": 0.876557394862175, "mean_token_accuracy": 0.9061510413885117, "num_tokens": 176867792.0, "sample_num_tokens": 8983.0, "step": 9154, "total_num_tokens": 176903724.0, "z_loss": 0.0003735917853191495 }, { "copy_logits_max": -6.085479736328125, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.5, "epoch": 1.8696451365841205, "gen_logits_max": 4.637063026428223, "gen_logits_mean": -16.183990478515625, "gen_logits_min": -28.04093360900879, "gen_logits_std": 3.36867094039917, "gen_loss": 0.3201243281364441, "grad_norm": 0.33980625720081337, "learning_rate": 1.9365473684210527e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9974830448627472, "mean_gen_accuracy": 0.8765434771776199, "mean_token_accuracy": 0.9067430049180984, "num_tokens": 177157073.0, "sample_num_tokens": 9601.25, "step": 9155, "total_num_tokens": 177195478.0, "z_loss": 0.0005002209800295532 }, { "copy_logits_max": -2.353727340698242, "copy_logits_min": -625000064.0, "copy_num_tokens": 705.3125, "epoch": 1.8698493745213174, "gen_logits_max": 4.956460952758789, "gen_logits_mean": -14.319352149963379, "gen_logits_min": -26.782644271850586, "gen_logits_std": 3.353583812713623, "gen_loss": 0.28124645352363586, "grad_norm": 0.3205489139118828, "learning_rate": 1.936421052631579e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9975855350494385, "mean_gen_accuracy": 0.8698437064886093, "mean_token_accuracy": 0.9017444998025894, "num_tokens": 177448539.0, "sample_num_tokens": 11007.25, "step": 9156, "total_num_tokens": 177492568.0, "z_loss": 0.0004857726162299514 }, { "copy_logits_max": -6.739945411682129, "copy_logits_min": -687500032.0, "copy_num_tokens": 402.4375, "epoch": 1.870053612458514, "gen_logits_max": 3.474776268005371, "gen_logits_mean": -18.292739868164062, "gen_logits_min": -30.261877059936523, "gen_logits_std": 3.523094654083252, "gen_loss": 0.29572945833206177, "grad_norm": 0.3275065996907709, "learning_rate": 1.9362947368421055e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9974421858787537, "mean_gen_accuracy": 0.8752159923315048, "mean_token_accuracy": 0.9037186950445175, "num_tokens": 177720930.0, "sample_num_tokens": 8193.5, "step": 9157, "total_num_tokens": 177753704.0, "z_loss": 0.00044283020542934537 }, { "copy_logits_max": -3.8926875591278076, "copy_logits_min": -750000000.0, "copy_num_tokens": 645.625, "epoch": 1.870257850395711, "gen_logits_max": 4.021717071533203, "gen_logits_mean": -15.512070655822754, "gen_logits_min": -28.00762367248535, "gen_logits_std": 3.4190917015075684, "gen_loss": 0.2603992819786072, "grad_norm": 0.34549375198288895, "learning_rate": 1.9361684210526316e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9973369091749191, "mean_gen_accuracy": 0.8750206381082535, "mean_token_accuracy": 0.9068126380443573, "num_tokens": 178011627.0, "sample_num_tokens": 10057.25, "step": 9158, "total_num_tokens": 178051856.0, "z_loss": 0.0003728115698322654 }, { "copy_logits_max": -3.9882662296295166, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.875, "epoch": 1.870462088332908, "gen_logits_max": 3.1817712783813477, "gen_logits_mean": -17.846805572509766, "gen_logits_min": -30.132545471191406, "gen_logits_std": 3.523566961288452, "gen_loss": 0.28650155663490295, "grad_norm": 0.40333973639624654, "learning_rate": 1.936042105263158e-05, "loss": 0.3143, "mean_copy_accuracy": 0.9964056015014648, "mean_gen_accuracy": 0.8642767667770386, "mean_token_accuracy": 0.8930651843547821, "num_tokens": 178258830.0, "sample_num_tokens": 7526.0, "step": 9159, "total_num_tokens": 178288934.0, "z_loss": 0.0004480489296838641 }, { "copy_logits_max": -5.243084907531738, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.5625, "epoch": 1.8706663262701047, "gen_logits_max": 2.9589147567749023, "gen_logits_mean": -18.267906188964844, "gen_logits_min": -30.690351486206055, "gen_logits_std": 3.5256810188293457, "gen_loss": 0.2833552956581116, "grad_norm": 0.3477236401664823, "learning_rate": 1.935915789473684e-05, "loss": 0.279, "mean_copy_accuracy": 0.9979699403047562, "mean_gen_accuracy": 0.8738674521446228, "mean_token_accuracy": 0.9041126072406769, "num_tokens": 178527004.0, "sample_num_tokens": 9309.0, "step": 9160, "total_num_tokens": 178564240.0, "z_loss": 0.0004413016722537577 }, { "copy_logits_max": -5.391899585723877, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.0625, "epoch": 1.8708705642073014, "gen_logits_max": 3.6180617809295654, "gen_logits_mean": -16.867034912109375, "gen_logits_min": -29.642181396484375, "gen_logits_std": 3.4681272506713867, "gen_loss": 0.2885512709617615, "grad_norm": 0.359182537629042, "learning_rate": 1.9357894736842106e-05, "loss": 0.282, "mean_copy_accuracy": 0.9973326474428177, "mean_gen_accuracy": 0.8738119900226593, "mean_token_accuracy": 0.9036491513252258, "num_tokens": 178824243.0, "sample_num_tokens": 8426.25, "step": 9161, "total_num_tokens": 178857948.0, "z_loss": 0.0004858382744714618 }, { "copy_logits_max": -6.05092716217041, "copy_logits_min": -750000000.0, "copy_num_tokens": 254.6875, "epoch": 1.8710748021444985, "gen_logits_max": 4.415031433105469, "gen_logits_mean": -17.254165649414062, "gen_logits_min": -29.231185913085938, "gen_logits_std": 3.4634203910827637, "gen_loss": 0.32183802127838135, "grad_norm": 0.37907130354155877, "learning_rate": 1.9356631578947367e-05, "loss": 0.3015, "mean_copy_accuracy": 0.9965279847383499, "mean_gen_accuracy": 0.8707824796438217, "mean_token_accuracy": 0.8963524997234344, "num_tokens": 179073326.0, "sample_num_tokens": 7539.5, "step": 9162, "total_num_tokens": 179103484.0, "z_loss": 0.0004652847710531205 }, { "copy_logits_max": -6.1630964279174805, "copy_logits_min": -750000000.0, "copy_num_tokens": 622.6875, "epoch": 1.8712790400816952, "gen_logits_max": 4.412911891937256, "gen_logits_mean": -15.403193473815918, "gen_logits_min": -27.716873168945312, "gen_logits_std": 3.4439783096313477, "gen_loss": 0.23853865265846252, "grad_norm": 0.34687703914082313, "learning_rate": 1.935536842105263e-05, "loss": 0.2559, "mean_copy_accuracy": 0.9973222315311432, "mean_gen_accuracy": 0.8843561559915543, "mean_token_accuracy": 0.9131105542182922, "num_tokens": 179328571.0, "sample_num_tokens": 8707.25, "step": 9163, "total_num_tokens": 179363400.0, "z_loss": 0.00037938286550343037 }, { "copy_logits_max": -7.510507583618164, "copy_logits_min": -750000000.0, "copy_num_tokens": 326.3125, "epoch": 1.871483278018892, "gen_logits_max": 4.476117134094238, "gen_logits_mean": -17.008895874023438, "gen_logits_min": -28.939632415771484, "gen_logits_std": 3.426100969314575, "gen_loss": 0.33087822794914246, "grad_norm": 0.36020932089100854, "learning_rate": 1.9354105263157895e-05, "loss": 0.2934, "mean_copy_accuracy": 0.9970435202121735, "mean_gen_accuracy": 0.8670701533555984, "mean_token_accuracy": 0.900351345539093, "num_tokens": 179602979.0, "sample_num_tokens": 7229.75, "step": 9164, "total_num_tokens": 179631898.0, "z_loss": 0.0004858957545366138 }, { "copy_logits_max": -5.871117115020752, "copy_logits_min": -625000064.0, "copy_num_tokens": 385.8125, "epoch": 1.8716875159560888, "gen_logits_max": 5.054741382598877, "gen_logits_mean": -15.175335884094238, "gen_logits_min": -27.946744918823242, "gen_logits_std": 3.3824985027313232, "gen_loss": 0.24623292684555054, "grad_norm": 0.3551803681317582, "learning_rate": 1.935284210526316e-05, "loss": 0.269, "mean_copy_accuracy": 0.9965647459030151, "mean_gen_accuracy": 0.8810900002717972, "mean_token_accuracy": 0.9076849520206451, "num_tokens": 179841145.0, "sample_num_tokens": 7403.75, "step": 9165, "total_num_tokens": 179870760.0, "z_loss": 0.00034422415774315596 }, { "copy_logits_max": -5.350900650024414, "copy_logits_min": -625000000.0, "copy_num_tokens": 391.25, "epoch": 1.8718917538932858, "gen_logits_max": 4.34577751159668, "gen_logits_mean": -16.36048126220703, "gen_logits_min": -28.381080627441406, "gen_logits_std": 3.4318642616271973, "gen_loss": 0.3281407356262207, "grad_norm": 0.34571353752907436, "learning_rate": 1.935157894736842e-05, "loss": 0.29, "mean_copy_accuracy": 0.9979677349328995, "mean_gen_accuracy": 0.8719765692949295, "mean_token_accuracy": 0.9045639783143997, "num_tokens": 180119479.0, "sample_num_tokens": 7565.75, "step": 9166, "total_num_tokens": 180149742.0, "z_loss": 0.0004338564758654684 }, { "copy_logits_max": -7.380156517028809, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.9375, "epoch": 1.8720959918304825, "gen_logits_max": 4.471573829650879, "gen_logits_mean": -15.357966423034668, "gen_logits_min": -27.99969482421875, "gen_logits_std": 3.4248480796813965, "gen_loss": 0.24731196463108063, "grad_norm": 0.35808694919572887, "learning_rate": 1.9350315789473685e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9968403577804565, "mean_gen_accuracy": 0.874087393283844, "mean_token_accuracy": 0.9088468551635742, "num_tokens": 180388544.0, "sample_num_tokens": 8037.0, "step": 9167, "total_num_tokens": 180420692.0, "z_loss": 0.0003546075022313744 }, { "copy_logits_max": -8.748514175415039, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5625, "epoch": 1.8723002297676792, "gen_logits_max": 4.174469470977783, "gen_logits_mean": -16.873699188232422, "gen_logits_min": -29.097858428955078, "gen_logits_std": 3.510735034942627, "gen_loss": 0.2470286786556244, "grad_norm": 0.35095110405694535, "learning_rate": 1.9349052631578946e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9972436130046844, "mean_gen_accuracy": 0.8805817663669586, "mean_token_accuracy": 0.9084697365760803, "num_tokens": 180666202.0, "sample_num_tokens": 9582.0, "step": 9168, "total_num_tokens": 180704530.0, "z_loss": 0.00033778772922232747 }, { "copy_logits_max": -6.224810600280762, "copy_logits_min": -687500032.0, "copy_num_tokens": 448.875, "epoch": 1.8725044677048763, "gen_logits_max": 4.416292190551758, "gen_logits_mean": -16.143138885498047, "gen_logits_min": -28.12753677368164, "gen_logits_std": 3.422312021255493, "gen_loss": 0.2763599753379822, "grad_norm": 0.3747180110107002, "learning_rate": 1.934778947368421e-05, "loss": 0.2663, "mean_copy_accuracy": 0.9975837171077728, "mean_gen_accuracy": 0.8798340111970901, "mean_token_accuracy": 0.9098367989063263, "num_tokens": 180915193.0, "sample_num_tokens": 8220.25, "step": 9169, "total_num_tokens": 180948074.0, "z_loss": 0.00035568061866797507 }, { "copy_logits_max": -7.591553211212158, "copy_logits_min": -750000064.0, "copy_num_tokens": 566.25, "epoch": 1.872708705642073, "gen_logits_max": 4.0345258712768555, "gen_logits_mean": -16.314857482910156, "gen_logits_min": -28.55380630493164, "gen_logits_std": 3.4618358612060547, "gen_loss": 0.2803478240966797, "grad_norm": 0.349727360805112, "learning_rate": 1.9346526315789475e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9977677017450333, "mean_gen_accuracy": 0.8745790719985962, "mean_token_accuracy": 0.9061791300773621, "num_tokens": 181197730.0, "sample_num_tokens": 9252.5, "step": 9170, "total_num_tokens": 181234740.0, "z_loss": 0.00041672063525766134 }, { "copy_logits_max": -7.367694854736328, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.75, "epoch": 1.8729129435792697, "gen_logits_max": 3.658750057220459, "gen_logits_mean": -17.245887756347656, "gen_logits_min": -29.423410415649414, "gen_logits_std": 3.517003059387207, "gen_loss": 0.23523356020450592, "grad_norm": 0.35677906310346097, "learning_rate": 1.9345263157894735e-05, "loss": 0.2686, "mean_copy_accuracy": 0.9975807517766953, "mean_gen_accuracy": 0.871860146522522, "mean_token_accuracy": 0.9089072793722153, "num_tokens": 181484867.0, "sample_num_tokens": 8444.75, "step": 9171, "total_num_tokens": 181518646.0, "z_loss": 0.0003511017421260476 }, { "copy_logits_max": -7.198281288146973, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.75, "epoch": 1.8731171815164667, "gen_logits_max": 5.0530853271484375, "gen_logits_mean": -15.194186210632324, "gen_logits_min": -27.52606201171875, "gen_logits_std": 3.3879308700561523, "gen_loss": 0.26295119524002075, "grad_norm": 0.3899673383603947, "learning_rate": 1.9344000000000003e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9971873164176941, "mean_gen_accuracy": 0.8806668370962143, "mean_token_accuracy": 0.9017223566770554, "num_tokens": 181721237.0, "sample_num_tokens": 7733.75, "step": 9172, "total_num_tokens": 181752172.0, "z_loss": 0.00040618557250127196 }, { "copy_logits_max": -7.806276798248291, "copy_logits_min": -750000000.0, "copy_num_tokens": 425.1875, "epoch": 1.8733214194536636, "gen_logits_max": 2.9295897483825684, "gen_logits_mean": -18.90172004699707, "gen_logits_min": -30.675491333007812, "gen_logits_std": 3.5622336864471436, "gen_loss": 0.24790683388710022, "grad_norm": 0.36299015197410783, "learning_rate": 1.9342736842105264e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9970169067382812, "mean_gen_accuracy": 0.873711422085762, "mean_token_accuracy": 0.9017651081085205, "num_tokens": 181986190.0, "sample_num_tokens": 8321.0, "step": 9173, "total_num_tokens": 182019474.0, "z_loss": 0.000349780370015651 }, { "copy_logits_max": -7.533810138702393, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.0625, "epoch": 1.8735256573908603, "gen_logits_max": 3.7404417991638184, "gen_logits_mean": -16.643966674804688, "gen_logits_min": -29.02471923828125, "gen_logits_std": 3.4374313354492188, "gen_loss": 0.28838178515434265, "grad_norm": 0.3443146205039492, "learning_rate": 1.934147368421053e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9969537705183029, "mean_gen_accuracy": 0.8765816539525986, "mean_token_accuracy": 0.9084126204252243, "num_tokens": 182263408.0, "sample_num_tokens": 8355.0, "step": 9174, "total_num_tokens": 182296828.0, "z_loss": 0.00044588997843675315 }, { "copy_logits_max": -8.007190704345703, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.0625, "epoch": 1.8737298953280572, "gen_logits_max": 3.5790252685546875, "gen_logits_mean": -17.116296768188477, "gen_logits_min": -29.096742630004883, "gen_logits_std": 3.4594106674194336, "gen_loss": 0.2626362442970276, "grad_norm": 0.35186939975220627, "learning_rate": 1.934021052631579e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9970262199640274, "mean_gen_accuracy": 0.8784698396921158, "mean_token_accuracy": 0.9053663164377213, "num_tokens": 182523616.0, "sample_num_tokens": 8610.0, "step": 9175, "total_num_tokens": 182558056.0, "z_loss": 0.0003668760764412582 }, { "copy_logits_max": -6.816418170928955, "copy_logits_min": -750000000.0, "copy_num_tokens": 574.125, "epoch": 1.8739341332652542, "gen_logits_max": 2.9280238151550293, "gen_logits_mean": -17.78465461730957, "gen_logits_min": -30.037853240966797, "gen_logits_std": 3.5131192207336426, "gen_loss": 0.26912927627563477, "grad_norm": 0.31915057203889713, "learning_rate": 1.9338947368421054e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9971721321344376, "mean_gen_accuracy": 0.8818351477384567, "mean_token_accuracy": 0.9109113663434982, "num_tokens": 182799583.0, "sample_num_tokens": 8776.75, "step": 9176, "total_num_tokens": 182834690.0, "z_loss": 0.0004119241493754089 }, { "copy_logits_max": -6.698623180389404, "copy_logits_min": -750000000.0, "copy_num_tokens": 705.25, "epoch": 1.8741383712024509, "gen_logits_max": 4.278661251068115, "gen_logits_mean": -14.734950065612793, "gen_logits_min": -27.024620056152344, "gen_logits_std": 3.4044876098632812, "gen_loss": 0.2122369408607483, "grad_norm": 0.3348916205023073, "learning_rate": 1.9337684210526315e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9971389919519424, "mean_gen_accuracy": 0.8742875158786774, "mean_token_accuracy": 0.905667170882225, "num_tokens": 183085391.0, "sample_num_tokens": 9627.75, "step": 9177, "total_num_tokens": 183123902.0, "z_loss": 0.0003532581904437393 }, { "copy_logits_max": -6.213836193084717, "copy_logits_min": -625000064.0, "copy_num_tokens": 704.25, "epoch": 1.8743426091396476, "gen_logits_max": 4.445316791534424, "gen_logits_mean": -14.475790023803711, "gen_logits_min": -27.20608139038086, "gen_logits_std": 3.3435330390930176, "gen_loss": 0.22119012475013733, "grad_norm": 0.32506059058800035, "learning_rate": 1.933642105263158e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9967435151338577, "mean_gen_accuracy": 0.8781137615442276, "mean_token_accuracy": 0.9065297991037369, "num_tokens": 183365149.0, "sample_num_tokens": 9398.25, "step": 9178, "total_num_tokens": 183402742.0, "z_loss": 0.0003575650916900486 }, { "copy_logits_max": -7.905227184295654, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.5, "epoch": 1.8745468470768445, "gen_logits_max": 4.1235127449035645, "gen_logits_mean": -17.091724395751953, "gen_logits_min": -29.14178466796875, "gen_logits_std": 3.4048333168029785, "gen_loss": 0.2932969331741333, "grad_norm": 0.39447425640202144, "learning_rate": 1.933515789473684e-05, "loss": 0.2958, "mean_copy_accuracy": 0.995958223938942, "mean_gen_accuracy": 0.87454454600811, "mean_token_accuracy": 0.8989201784133911, "num_tokens": 183619123.0, "sample_num_tokens": 8772.75, "step": 9179, "total_num_tokens": 183654214.0, "z_loss": 0.00046288830344565213 }, { "copy_logits_max": -6.097080707550049, "copy_logits_min": -750000000.0, "copy_num_tokens": 362.25, "epoch": 1.8747510850140414, "gen_logits_max": 3.6206977367401123, "gen_logits_mean": -17.813127517700195, "gen_logits_min": -30.01592254638672, "gen_logits_std": 3.4514143466949463, "gen_loss": 0.29962024092674255, "grad_norm": 0.35910025916948063, "learning_rate": 1.9333894736842107e-05, "loss": 0.2657, "mean_copy_accuracy": 0.9970717132091522, "mean_gen_accuracy": 0.8812079727649689, "mean_token_accuracy": 0.910665899515152, "num_tokens": 183892324.0, "sample_num_tokens": 7398.0, "step": 9180, "total_num_tokens": 183921916.0, "z_loss": 0.00048185145715251565 }, { "copy_logits_max": -5.3768463134765625, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.9375, "epoch": 1.8749553229512381, "gen_logits_max": 3.527360439300537, "gen_logits_mean": -17.414499282836914, "gen_logits_min": -29.360239028930664, "gen_logits_std": 3.436446189880371, "gen_loss": 0.29349464178085327, "grad_norm": 0.392045383985199, "learning_rate": 1.933263157894737e-05, "loss": 0.2831, "mean_copy_accuracy": 0.9965381324291229, "mean_gen_accuracy": 0.875673308968544, "mean_token_accuracy": 0.9020616412162781, "num_tokens": 184145042.0, "sample_num_tokens": 8800.5, "step": 9181, "total_num_tokens": 184180244.0, "z_loss": 0.0005076350062154233 }, { "copy_logits_max": -6.309256553649902, "copy_logits_min": -750000128.0, "copy_num_tokens": 521.0625, "epoch": 1.875159560888435, "gen_logits_max": 3.5119762420654297, "gen_logits_mean": -16.917675018310547, "gen_logits_min": -29.115562438964844, "gen_logits_std": 3.4265623092651367, "gen_loss": 0.2717133164405823, "grad_norm": 0.3423962146572057, "learning_rate": 1.9331368421052633e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9976119250059128, "mean_gen_accuracy": 0.8799324035644531, "mean_token_accuracy": 0.9094153046607971, "num_tokens": 184409916.0, "sample_num_tokens": 8494.5, "step": 9182, "total_num_tokens": 184443894.0, "z_loss": 0.0004751895903609693 }, { "copy_logits_max": -7.113399505615234, "copy_logits_min": -625000000.0, "copy_num_tokens": 392.25, "epoch": 1.875363798825632, "gen_logits_max": 4.402746200561523, "gen_logits_mean": -16.36713409423828, "gen_logits_min": -28.154170989990234, "gen_logits_std": 3.3875460624694824, "gen_loss": 0.29108524322509766, "grad_norm": 0.3409286991312076, "learning_rate": 1.9330105263157897e-05, "loss": 0.2848, "mean_copy_accuracy": 0.996404305100441, "mean_gen_accuracy": 0.8759061098098755, "mean_token_accuracy": 0.9040239751338959, "num_tokens": 184668684.0, "sample_num_tokens": 8684.5, "step": 9183, "total_num_tokens": 184703422.0, "z_loss": 0.0005302579957060516 }, { "copy_logits_max": -6.346371650695801, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.75, "epoch": 1.8755680367628287, "gen_logits_max": 2.8442001342773438, "gen_logits_mean": -18.51500701904297, "gen_logits_min": -30.7546329498291, "gen_logits_std": 3.5073893070220947, "gen_loss": 0.25967344641685486, "grad_norm": 0.36550927184628934, "learning_rate": 1.9328842105263158e-05, "loss": 0.27, "mean_copy_accuracy": 0.9968637228012085, "mean_gen_accuracy": 0.8780550807714462, "mean_token_accuracy": 0.9065438657999039, "num_tokens": 184933463.0, "sample_num_tokens": 8214.25, "step": 9184, "total_num_tokens": 184966320.0, "z_loss": 0.0004298679414205253 }, { "copy_logits_max": -5.794692039489746, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.0, "epoch": 1.8757722747000254, "gen_logits_max": 4.079907417297363, "gen_logits_mean": -16.53695297241211, "gen_logits_min": -28.64539337158203, "gen_logits_std": 3.421415328979492, "gen_loss": 0.29812681674957275, "grad_norm": 0.3746987180761362, "learning_rate": 1.9327578947368422e-05, "loss": 0.267, "mean_copy_accuracy": 0.9970114380121231, "mean_gen_accuracy": 0.8800894469022751, "mean_token_accuracy": 0.908630445599556, "num_tokens": 185204323.0, "sample_num_tokens": 8328.25, "step": 9185, "total_num_tokens": 185237636.0, "z_loss": 0.0004483271040953696 }, { "copy_logits_max": -4.246044158935547, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.3125, "epoch": 1.8759765126372223, "gen_logits_max": 4.231902122497559, "gen_logits_mean": -16.52672004699707, "gen_logits_min": -28.891132354736328, "gen_logits_std": 3.429893970489502, "gen_loss": 0.254153311252594, "grad_norm": 0.3596985530080808, "learning_rate": 1.9326315789473683e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9974035918712616, "mean_gen_accuracy": 0.8757431954145432, "mean_token_accuracy": 0.90973861515522, "num_tokens": 185486928.0, "sample_num_tokens": 8594.5, "step": 9186, "total_num_tokens": 185521306.0, "z_loss": 0.0004526411066763103 }, { "copy_logits_max": -4.533926010131836, "copy_logits_min": -625000064.0, "copy_num_tokens": 746.375, "epoch": 1.8761807505744192, "gen_logits_max": 3.242239475250244, "gen_logits_mean": -16.499122619628906, "gen_logits_min": -28.897228240966797, "gen_logits_std": 3.4253621101379395, "gen_loss": 0.25734129548072815, "grad_norm": 0.3285896566100799, "learning_rate": 1.9325052631578948e-05, "loss": 0.2564, "mean_copy_accuracy": 0.9973606020212173, "mean_gen_accuracy": 0.8771516978740692, "mean_token_accuracy": 0.9132725745439529, "num_tokens": 185796233.0, "sample_num_tokens": 9464.25, "step": 9187, "total_num_tokens": 185834090.0, "z_loss": 0.0004874315927736461 }, { "copy_logits_max": -9.211508750915527, "copy_logits_min": -750000000.0, "copy_num_tokens": 232.8125, "epoch": 1.876384988511616, "gen_logits_max": 5.04569673538208, "gen_logits_mean": -16.94666862487793, "gen_logits_min": -28.232933044433594, "gen_logits_std": 3.3607869148254395, "gen_loss": 0.2894878089427948, "grad_norm": 0.35138161209831453, "learning_rate": 1.9323789473684212e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9965809732675552, "mean_gen_accuracy": 0.8793149888515472, "mean_token_accuracy": 0.9048973470926285, "num_tokens": 186059016.0, "sample_num_tokens": 7610.5, "step": 9188, "total_num_tokens": 186089458.0, "z_loss": 0.00042646523797884583 }, { "copy_logits_max": -5.999576091766357, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.125, "epoch": 1.8765892264488129, "gen_logits_max": 3.61187744140625, "gen_logits_mean": -17.49636459350586, "gen_logits_min": -29.593868255615234, "gen_logits_std": 3.431765079498291, "gen_loss": 0.29377537965774536, "grad_norm": 0.35389014655848583, "learning_rate": 1.9322526315789476e-05, "loss": 0.2725, "mean_copy_accuracy": 0.9965005666017532, "mean_gen_accuracy": 0.8782376348972321, "mean_token_accuracy": 0.9077602475881577, "num_tokens": 186326862.0, "sample_num_tokens": 8337.0, "step": 9189, "total_num_tokens": 186360210.0, "z_loss": 0.0004418272292241454 }, { "copy_logits_max": -5.401058197021484, "copy_logits_min": -750000000.0, "copy_num_tokens": 743.4375, "epoch": 1.8767934643860098, "gen_logits_max": 2.9561078548431396, "gen_logits_mean": -17.385210037231445, "gen_logits_min": -29.839107513427734, "gen_logits_std": 3.474412441253662, "gen_loss": 0.2265353798866272, "grad_norm": 0.3469350542716126, "learning_rate": 1.9321263157894737e-05, "loss": 0.2573, "mean_copy_accuracy": 0.9978528320789337, "mean_gen_accuracy": 0.878605455160141, "mean_token_accuracy": 0.9137251377105713, "num_tokens": 186620240.0, "sample_num_tokens": 9183.0, "step": 9190, "total_num_tokens": 186656972.0, "z_loss": 0.00036652397830039263 }, { "copy_logits_max": -4.077391147613525, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.4375, "epoch": 1.8769977023232065, "gen_logits_max": 4.505008220672607, "gen_logits_mean": -16.171268463134766, "gen_logits_min": -28.330350875854492, "gen_logits_std": 3.4020228385925293, "gen_loss": 0.2816035747528076, "grad_norm": 0.37438825145314214, "learning_rate": 1.932e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9976705312728882, "mean_gen_accuracy": 0.8707350641489029, "mean_token_accuracy": 0.9024212509393692, "num_tokens": 186899216.0, "sample_num_tokens": 7688.5, "step": 9191, "total_num_tokens": 186929970.0, "z_loss": 0.00041096180211752653 }, { "copy_logits_max": -5.766416072845459, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.75, "epoch": 1.8772019402604032, "gen_logits_max": 4.113639831542969, "gen_logits_mean": -17.1030216217041, "gen_logits_min": -29.169206619262695, "gen_logits_std": 3.4303481578826904, "gen_loss": 0.26425260305404663, "grad_norm": 0.33808727849793385, "learning_rate": 1.9318736842105262e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9974197298288345, "mean_gen_accuracy": 0.8786692768335342, "mean_token_accuracy": 0.9069329351186752, "num_tokens": 187158417.0, "sample_num_tokens": 7248.25, "step": 9192, "total_num_tokens": 187187410.0, "z_loss": 0.00044798722956329584 }, { "copy_logits_max": -2.58040714263916, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.5625, "epoch": 1.8774061781976004, "gen_logits_max": 4.224492073059082, "gen_logits_mean": -15.444770812988281, "gen_logits_min": -27.924663543701172, "gen_logits_std": 3.357963800430298, "gen_loss": 0.2748052179813385, "grad_norm": 0.38978704414422716, "learning_rate": 1.9317473684210527e-05, "loss": 0.2798, "mean_copy_accuracy": 0.996924564242363, "mean_gen_accuracy": 0.8719203174114227, "mean_token_accuracy": 0.9043395817279816, "num_tokens": 187410911.0, "sample_num_tokens": 8088.75, "step": 9193, "total_num_tokens": 187443266.0, "z_loss": 0.00047559363883920014 }, { "copy_logits_max": -6.492626190185547, "copy_logits_min": -750000000.0, "copy_num_tokens": 383.375, "epoch": 1.877610416134797, "gen_logits_max": 4.6049885749816895, "gen_logits_mean": -16.48236083984375, "gen_logits_min": -28.525070190429688, "gen_logits_std": 3.3932626247406006, "gen_loss": 0.2534308433532715, "grad_norm": 0.34497045177896457, "learning_rate": 1.9316210526315788e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9972557127475739, "mean_gen_accuracy": 0.8788334131240845, "mean_token_accuracy": 0.9050456136465073, "num_tokens": 187684058.0, "sample_num_tokens": 8589.0, "step": 9194, "total_num_tokens": 187718414.0, "z_loss": 0.000432213069871068 }, { "copy_logits_max": -2.0204100608825684, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.0, "epoch": 1.8778146540719938, "gen_logits_max": 4.261194705963135, "gen_logits_mean": -15.680622100830078, "gen_logits_min": -28.130535125732422, "gen_logits_std": 3.3910627365112305, "gen_loss": 0.24284712970256805, "grad_norm": 0.35853037496702417, "learning_rate": 1.9314947368421052e-05, "loss": 0.2825, "mean_copy_accuracy": 0.997264102101326, "mean_gen_accuracy": 0.8783668875694275, "mean_token_accuracy": 0.9052974134683609, "num_tokens": 187959413.0, "sample_num_tokens": 9149.75, "step": 9195, "total_num_tokens": 187996012.0, "z_loss": 0.000424264813773334 }, { "copy_logits_max": -5.66103458404541, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.5, "epoch": 1.8780188920091907, "gen_logits_max": 4.483456611633301, "gen_logits_mean": -16.072162628173828, "gen_logits_min": -28.281797409057617, "gen_logits_std": 3.3877832889556885, "gen_loss": 0.2929632365703583, "grad_norm": 0.3739402182638063, "learning_rate": 1.9313684210526316e-05, "loss": 0.2814, "mean_copy_accuracy": 0.995785653591156, "mean_gen_accuracy": 0.8755752146244049, "mean_token_accuracy": 0.9042000770568848, "num_tokens": 188223616.0, "sample_num_tokens": 8867.0, "step": 9196, "total_num_tokens": 188259084.0, "z_loss": 0.0004913108423352242 }, { "copy_logits_max": -5.528947830200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.6875, "epoch": 1.8782231299463876, "gen_logits_max": 2.5163965225219727, "gen_logits_mean": -18.26272201538086, "gen_logits_min": -30.53234100341797, "gen_logits_std": 3.4962964057922363, "gen_loss": 0.2590678334236145, "grad_norm": 0.36564231363555794, "learning_rate": 1.931242105263158e-05, "loss": 0.2862, "mean_copy_accuracy": 0.9960049688816071, "mean_gen_accuracy": 0.8752385824918747, "mean_token_accuracy": 0.9017589688301086, "num_tokens": 188481677.0, "sample_num_tokens": 8434.75, "step": 9197, "total_num_tokens": 188515416.0, "z_loss": 0.0004198867827653885 }, { "copy_logits_max": -4.680991172790527, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.25, "epoch": 1.8784273678835843, "gen_logits_max": 4.045633792877197, "gen_logits_mean": -17.03582000732422, "gen_logits_min": -28.950035095214844, "gen_logits_std": 3.4412693977355957, "gen_loss": 0.26271867752075195, "grad_norm": 0.34820806539507115, "learning_rate": 1.9311157894736845e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9970304518938065, "mean_gen_accuracy": 0.8798300623893738, "mean_token_accuracy": 0.9035923629999161, "num_tokens": 188732837.0, "sample_num_tokens": 7496.25, "step": 9198, "total_num_tokens": 188762822.0, "z_loss": 0.0004599220701493323 }, { "copy_logits_max": -1.7703925371170044, "copy_logits_min": -750000064.0, "copy_num_tokens": 503.0625, "epoch": 1.8786316058207813, "gen_logits_max": 4.696508407592773, "gen_logits_mean": -14.802562713623047, "gen_logits_min": -26.88064193725586, "gen_logits_std": 3.3602590560913086, "gen_loss": 0.2539278268814087, "grad_norm": 0.339953030357134, "learning_rate": 1.9309894736842106e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9969029575586319, "mean_gen_accuracy": 0.8780465573072433, "mean_token_accuracy": 0.9065321087837219, "num_tokens": 189015450.0, "sample_num_tokens": 9796.0, "step": 9199, "total_num_tokens": 189054634.0, "z_loss": 0.0003897684218827635 }, { "copy_logits_max": -3.068103790283203, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.25, "epoch": 1.8788358437579782, "gen_logits_max": 2.8791654109954834, "gen_logits_mean": -17.801788330078125, "gen_logits_min": -29.98027992248535, "gen_logits_std": 3.4891839027404785, "gen_loss": 0.3011840581893921, "grad_norm": 0.3442730780984737, "learning_rate": 1.930863157894737e-05, "loss": 0.2727, "mean_copy_accuracy": 0.996965765953064, "mean_gen_accuracy": 0.8773903399705887, "mean_token_accuracy": 0.9074892699718475, "num_tokens": 189285723.0, "sample_num_tokens": 7758.75, "step": 9200, "total_num_tokens": 189316758.0, "z_loss": 0.00045285889063961804 }, { "copy_logits_max": -5.285926818847656, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.25, "epoch": 1.879040081695175, "gen_logits_max": 4.932973861694336, "gen_logits_mean": -15.384995460510254, "gen_logits_min": -27.625778198242188, "gen_logits_std": 3.4159345626831055, "gen_loss": 0.28867286443710327, "grad_norm": 0.3434878098424033, "learning_rate": 1.930736842105263e-05, "loss": 0.2699, "mean_copy_accuracy": 0.996629387140274, "mean_gen_accuracy": 0.8808914721012115, "mean_token_accuracy": 0.9074028432369232, "num_tokens": 189552754.0, "sample_num_tokens": 8801.0, "step": 9201, "total_num_tokens": 189587958.0, "z_loss": 0.0004703194717876613 }, { "copy_logits_max": -3.4891421794891357, "copy_logits_min": -625000064.0, "copy_num_tokens": 516.875, "epoch": 1.8792443196323716, "gen_logits_max": 3.9621219635009766, "gen_logits_mean": -16.05662727355957, "gen_logits_min": -28.032447814941406, "gen_logits_std": 3.437898635864258, "gen_loss": 0.27787792682647705, "grad_norm": 0.33337131465080977, "learning_rate": 1.9306105263157895e-05, "loss": 0.2676, "mean_copy_accuracy": 0.997897282242775, "mean_gen_accuracy": 0.8776295036077499, "mean_token_accuracy": 0.9102809280157089, "num_tokens": 189838439.0, "sample_num_tokens": 8639.75, "step": 9202, "total_num_tokens": 189872998.0, "z_loss": 0.0003792527422774583 }, { "copy_logits_max": -4.777325630187988, "copy_logits_min": -687500032.0, "copy_num_tokens": 455.0625, "epoch": 1.8794485575695685, "gen_logits_max": 3.594069242477417, "gen_logits_mean": -17.640167236328125, "gen_logits_min": -29.932384490966797, "gen_logits_std": 3.4775044918060303, "gen_loss": 0.30903518199920654, "grad_norm": 0.32704467766973705, "learning_rate": 1.9304842105263156e-05, "loss": 0.2769, "mean_copy_accuracy": 0.9980021417140961, "mean_gen_accuracy": 0.8712344616651535, "mean_token_accuracy": 0.9055193513631821, "num_tokens": 190130943.0, "sample_num_tokens": 9170.25, "step": 9203, "total_num_tokens": 190167624.0, "z_loss": 0.00048597174463793635 }, { "copy_logits_max": -5.404130458831787, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.3125, "epoch": 1.8796527955067655, "gen_logits_max": 3.7750394344329834, "gen_logits_mean": -17.878158569335938, "gen_logits_min": -29.919466018676758, "gen_logits_std": 3.527742624282837, "gen_loss": 0.2368137538433075, "grad_norm": 0.33678234885017483, "learning_rate": 1.930357894736842e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9969748854637146, "mean_gen_accuracy": 0.8814045339822769, "mean_token_accuracy": 0.9072766005992889, "num_tokens": 190415973.0, "sample_num_tokens": 7873.25, "step": 9204, "total_num_tokens": 190447466.0, "z_loss": 0.00032927587744779885 }, { "copy_logits_max": -4.31900691986084, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.25, "epoch": 1.8798570334439622, "gen_logits_max": 3.848092555999756, "gen_logits_mean": -17.16033935546875, "gen_logits_min": -29.310461044311523, "gen_logits_std": 3.4762163162231445, "gen_loss": 0.2907155454158783, "grad_norm": 0.35618996274384834, "learning_rate": 1.9302315789473685e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9973461329936981, "mean_gen_accuracy": 0.8846641182899475, "mean_token_accuracy": 0.9113308489322662, "num_tokens": 190690064.0, "sample_num_tokens": 7583.0, "step": 9205, "total_num_tokens": 190720396.0, "z_loss": 0.000393581431126222 }, { "copy_logits_max": -2.6585853099823, "copy_logits_min": -750000000.0, "copy_num_tokens": 393.25, "epoch": 1.880061271381159, "gen_logits_max": 4.073980331420898, "gen_logits_mean": -16.120197296142578, "gen_logits_min": -28.11560821533203, "gen_logits_std": 3.431981086730957, "gen_loss": 0.2500309348106384, "grad_norm": 0.3340115047696013, "learning_rate": 1.930105263157895e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9982710629701614, "mean_gen_accuracy": 0.8793629854917526, "mean_token_accuracy": 0.9083689749240875, "num_tokens": 190954474.0, "sample_num_tokens": 7574.5, "step": 9206, "total_num_tokens": 190984772.0, "z_loss": 0.0003465469926595688 }, { "copy_logits_max": -2.58579158782959, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.875, "epoch": 1.880265509318356, "gen_logits_max": 5.1618428230285645, "gen_logits_mean": -16.133079528808594, "gen_logits_min": -28.42351531982422, "gen_logits_std": 3.458535671234131, "gen_loss": 0.28760239481925964, "grad_norm": 0.3727447447586275, "learning_rate": 1.929978947368421e-05, "loss": 0.287, "mean_copy_accuracy": 0.9962704032659531, "mean_gen_accuracy": 0.8805789202451706, "mean_token_accuracy": 0.9036066979169846, "num_tokens": 191207322.0, "sample_num_tokens": 8176.0, "step": 9207, "total_num_tokens": 191240026.0, "z_loss": 0.0003953907289542258 }, { "copy_logits_max": -0.9417889714241028, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.3125, "epoch": 1.8804697472555527, "gen_logits_max": 4.010431289672852, "gen_logits_mean": -16.42355728149414, "gen_logits_min": -28.791065216064453, "gen_logits_std": 3.4387972354888916, "gen_loss": 0.2777237892150879, "grad_norm": 0.35963965382628793, "learning_rate": 1.9298526315789474e-05, "loss": 0.267, "mean_copy_accuracy": 0.9969325363636017, "mean_gen_accuracy": 0.880663588643074, "mean_token_accuracy": 0.9080616235733032, "num_tokens": 191458768.0, "sample_num_tokens": 7274.5, "step": 9208, "total_num_tokens": 191487866.0, "z_loss": 0.0003851085784845054 }, { "copy_logits_max": -3.1146974563598633, "copy_logits_min": -750000064.0, "copy_num_tokens": 393.875, "epoch": 1.8806739851927494, "gen_logits_max": 4.552840709686279, "gen_logits_mean": -17.19603729248047, "gen_logits_min": -29.12923812866211, "gen_logits_std": 3.475381851196289, "gen_loss": 0.27896827459335327, "grad_norm": 0.33896672253433213, "learning_rate": 1.929726315789474e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9967643618583679, "mean_gen_accuracy": 0.8774599730968475, "mean_token_accuracy": 0.9055778384208679, "num_tokens": 191712889.0, "sample_num_tokens": 8997.25, "step": 9209, "total_num_tokens": 191748878.0, "z_loss": 0.00037607253761962056 }, { "copy_logits_max": -3.261005401611328, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.375, "epoch": 1.8808782231299463, "gen_logits_max": 3.2019786834716797, "gen_logits_mean": -17.235063552856445, "gen_logits_min": -29.654220581054688, "gen_logits_std": 3.5112807750701904, "gen_loss": 0.2564283013343811, "grad_norm": 0.3424239466172985, "learning_rate": 1.9296e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9980417490005493, "mean_gen_accuracy": 0.8791187554597855, "mean_token_accuracy": 0.9091488420963287, "num_tokens": 191980009.0, "sample_num_tokens": 8498.75, "step": 9210, "total_num_tokens": 192014004.0, "z_loss": 0.00038732029497623444 }, { "copy_logits_max": -2.739145278930664, "copy_logits_min": -750000000.0, "copy_num_tokens": 272.0625, "epoch": 1.8810824610671433, "gen_logits_max": 5.994261741638184, "gen_logits_mean": -15.25872802734375, "gen_logits_min": -27.465518951416016, "gen_logits_std": 3.3916611671447754, "gen_loss": 0.30899566411972046, "grad_norm": 0.34661175888847995, "learning_rate": 1.9294736842105264e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9975978583097458, "mean_gen_accuracy": 0.8781143724918365, "mean_token_accuracy": 0.9050125330686569, "num_tokens": 192261551.0, "sample_num_tokens": 6734.75, "step": 9211, "total_num_tokens": 192288490.0, "z_loss": 0.00044519046787172556 }, { "copy_logits_max": -2.510699987411499, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.375, "epoch": 1.88128669900434, "gen_logits_max": 3.2801761627197266, "gen_logits_mean": -17.802492141723633, "gen_logits_min": -30.15078353881836, "gen_logits_std": 3.514453411102295, "gen_loss": 0.2837430536746979, "grad_norm": 0.3863229983216162, "learning_rate": 1.9293473684210525e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9975649863481522, "mean_gen_accuracy": 0.8719232678413391, "mean_token_accuracy": 0.9017224162817001, "num_tokens": 192527901.0, "sample_num_tokens": 8018.75, "step": 9212, "total_num_tokens": 192559976.0, "z_loss": 0.0004127799184061587 }, { "copy_logits_max": 1.8173655271530151, "copy_logits_min": -750000000.0, "copy_num_tokens": 614.1875, "epoch": 1.881490936941537, "gen_logits_max": 4.451828956604004, "gen_logits_mean": -15.111834526062012, "gen_logits_min": -27.642892837524414, "gen_logits_std": 3.411376953125, "gen_loss": 0.26247844099998474, "grad_norm": 0.34691795115158974, "learning_rate": 1.9292210526315793e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9974979609251022, "mean_gen_accuracy": 0.8758001625537872, "mean_token_accuracy": 0.9065974652767181, "num_tokens": 192797446.0, "sample_num_tokens": 9110.5, "step": 9213, "total_num_tokens": 192833888.0, "z_loss": 0.00043588789412751794 }, { "copy_logits_max": 0.06835350394248962, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.25, "epoch": 1.8816951748787338, "gen_logits_max": 3.451838970184326, "gen_logits_mean": -17.888736724853516, "gen_logits_min": -30.313329696655273, "gen_logits_std": 3.528395652770996, "gen_loss": 0.24699564278125763, "grad_norm": 0.36264492041210833, "learning_rate": 1.9290947368421053e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9977757930755615, "mean_gen_accuracy": 0.8770290166139603, "mean_token_accuracy": 0.9056032747030258, "num_tokens": 193057401.0, "sample_num_tokens": 8659.25, "step": 9214, "total_num_tokens": 193092038.0, "z_loss": 0.00038161210250109434 }, { "copy_logits_max": -1.4137595891952515, "copy_logits_min": -687500032.0, "copy_num_tokens": 381.5, "epoch": 1.8818994128159305, "gen_logits_max": 3.8916072845458984, "gen_logits_mean": -16.501291275024414, "gen_logits_min": -28.835779190063477, "gen_logits_std": 3.438246726989746, "gen_loss": 0.2512679100036621, "grad_norm": 0.3662000326583126, "learning_rate": 1.9289684210526318e-05, "loss": 0.2555, "mean_copy_accuracy": 0.9964775890111923, "mean_gen_accuracy": 0.8856023102998734, "mean_token_accuracy": 0.9119793623685837, "num_tokens": 193326244.0, "sample_num_tokens": 7087.5, "step": 9215, "total_num_tokens": 193354594.0, "z_loss": 0.00041583291022107005 }, { "copy_logits_max": -3.437208890914917, "copy_logits_min": -750000064.0, "copy_num_tokens": 263.4375, "epoch": 1.8821036507531272, "gen_logits_max": 3.7597193717956543, "gen_logits_mean": -18.407562255859375, "gen_logits_min": -30.50473403930664, "gen_logits_std": 3.4880032539367676, "gen_loss": 0.28321051597595215, "grad_norm": 0.3852568855693604, "learning_rate": 1.928842105263158e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9976690113544464, "mean_gen_accuracy": 0.8774848580360413, "mean_token_accuracy": 0.9043768495321274, "num_tokens": 193571346.0, "sample_num_tokens": 6683.5, "step": 9216, "total_num_tokens": 193598080.0, "z_loss": 0.0004451539716683328 }, { "copy_logits_max": 0.3930603265762329, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.5625, "epoch": 1.8823078886903244, "gen_logits_max": 4.076929569244385, "gen_logits_mean": -16.902748107910156, "gen_logits_min": -29.71320343017578, "gen_logits_std": 3.4379498958587646, "gen_loss": 0.2635127902030945, "grad_norm": 0.35080923899824534, "learning_rate": 1.9287157894736843e-05, "loss": 0.272, "mean_copy_accuracy": 0.9977802336215973, "mean_gen_accuracy": 0.8757749199867249, "mean_token_accuracy": 0.9100822657346725, "num_tokens": 193851619.0, "sample_num_tokens": 9316.25, "step": 9217, "total_num_tokens": 193888884.0, "z_loss": 0.0003995811566710472 }, { "copy_logits_max": 1.8393802642822266, "copy_logits_min": -750000000.0, "copy_num_tokens": 744.25, "epoch": 1.882512126627521, "gen_logits_max": 3.194756269454956, "gen_logits_mean": -16.42386817932129, "gen_logits_min": -29.031757354736328, "gen_logits_std": 3.435312032699585, "gen_loss": 0.2778809666633606, "grad_norm": 0.30658113663791287, "learning_rate": 1.9285894736842104e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9974496364593506, "mean_gen_accuracy": 0.873796135187149, "mean_token_accuracy": 0.9092002958059311, "num_tokens": 194155081.0, "sample_num_tokens": 10689.25, "step": 9218, "total_num_tokens": 194197838.0, "z_loss": 0.0004562927642837167 }, { "copy_logits_max": 1.2048940658569336, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.0625, "epoch": 1.8827163645647178, "gen_logits_max": 4.816152095794678, "gen_logits_mean": -15.947781562805176, "gen_logits_min": -28.357101440429688, "gen_logits_std": 3.3846120834350586, "gen_loss": 0.2689797282218933, "grad_norm": 0.3732683279777786, "learning_rate": 1.9284631578947368e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9978603422641754, "mean_gen_accuracy": 0.8756002634763718, "mean_token_accuracy": 0.906535193324089, "num_tokens": 194422774.0, "sample_num_tokens": 7858.0, "step": 9219, "total_num_tokens": 194454206.0, "z_loss": 0.00042852299520745873 }, { "copy_logits_max": -2.488744020462036, "copy_logits_min": -750000128.0, "copy_num_tokens": 425.0, "epoch": 1.8829206025019147, "gen_logits_max": 4.19399356842041, "gen_logits_mean": -17.521282196044922, "gen_logits_min": -30.065555572509766, "gen_logits_std": 3.4460206031799316, "gen_loss": 0.28428536653518677, "grad_norm": 0.3321350231728413, "learning_rate": 1.928336842105263e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9973331987857819, "mean_gen_accuracy": 0.8782708942890167, "mean_token_accuracy": 0.9105149507522583, "num_tokens": 194695880.0, "sample_num_tokens": 8437.0, "step": 9220, "total_num_tokens": 194729628.0, "z_loss": 0.00046020623994991183 }, { "copy_logits_max": 0.35536569356918335, "copy_logits_min": -750000000.0, "copy_num_tokens": 577.0625, "epoch": 1.8831248404391117, "gen_logits_max": 4.040050983428955, "gen_logits_mean": -16.648681640625, "gen_logits_min": -28.72872543334961, "gen_logits_std": 3.4201414585113525, "gen_loss": 0.2561783790588379, "grad_norm": 0.3535257815736649, "learning_rate": 1.9282105263157897e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9972852617502213, "mean_gen_accuracy": 0.8771171420812607, "mean_token_accuracy": 0.9085270911455154, "num_tokens": 194965609.0, "sample_num_tokens": 9803.25, "step": 9221, "total_num_tokens": 195004822.0, "z_loss": 0.000419619376771152 }, { "copy_logits_max": 0.4780919551849365, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.125, "epoch": 1.8833290783763084, "gen_logits_max": 3.8121886253356934, "gen_logits_mean": -16.539501190185547, "gen_logits_min": -28.601789474487305, "gen_logits_std": 3.437319278717041, "gen_loss": 0.2658468186855316, "grad_norm": 0.32558313665304656, "learning_rate": 1.928084210526316e-05, "loss": 0.2576, "mean_copy_accuracy": 0.9981361776590347, "mean_gen_accuracy": 0.8804102092981339, "mean_token_accuracy": 0.9118445217609406, "num_tokens": 195245563.0, "sample_num_tokens": 7880.25, "step": 9222, "total_num_tokens": 195277084.0, "z_loss": 0.0004152190522290766 }, { "copy_logits_max": 0.7932466268539429, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.3125, "epoch": 1.883533316313505, "gen_logits_max": 4.269959449768066, "gen_logits_mean": -16.065475463867188, "gen_logits_min": -28.45941162109375, "gen_logits_std": 3.433610200881958, "gen_loss": 0.21333687007427216, "grad_norm": 0.3469301230173306, "learning_rate": 1.9279578947368422e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9975946992635727, "mean_gen_accuracy": 0.8772754818201065, "mean_token_accuracy": 0.905640184879303, "num_tokens": 195534826.0, "sample_num_tokens": 9474.0, "step": 9223, "total_num_tokens": 195572722.0, "z_loss": 0.00034953426802530885 }, { "copy_logits_max": -1.7279725074768066, "copy_logits_min": -750000000.0, "copy_num_tokens": 259.5, "epoch": 1.8837375542507022, "gen_logits_max": 5.274462699890137, "gen_logits_mean": -16.234975814819336, "gen_logits_min": -28.785667419433594, "gen_logits_std": 3.3744473457336426, "gen_loss": 0.316893070936203, "grad_norm": 0.3215937538343833, "learning_rate": 1.9278315789473686e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9969728142023087, "mean_gen_accuracy": 0.8793373703956604, "mean_token_accuracy": 0.9048423320055008, "num_tokens": 195827126.0, "sample_num_tokens": 7541.0, "step": 9224, "total_num_tokens": 195857290.0, "z_loss": 0.0004634518700186163 }, { "copy_logits_max": -2.79384708404541, "copy_logits_min": -750000000.0, "copy_num_tokens": 361.25, "epoch": 1.883941792187899, "gen_logits_max": 3.6746363639831543, "gen_logits_mean": -17.82503890991211, "gen_logits_min": -30.33865737915039, "gen_logits_std": 3.477881669998169, "gen_loss": 0.27433323860168457, "grad_norm": 0.35628135216680107, "learning_rate": 1.9277052631578947e-05, "loss": 0.2752, "mean_copy_accuracy": 0.9969289153814316, "mean_gen_accuracy": 0.8792225867509842, "mean_token_accuracy": 0.9047749787569046, "num_tokens": 196077875.0, "sample_num_tokens": 7361.25, "step": 9225, "total_num_tokens": 196107320.0, "z_loss": 0.00045312417205423117 }, { "copy_logits_max": 0.44227486848831177, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.4375, "epoch": 1.8841460301250956, "gen_logits_max": 3.503305673599243, "gen_logits_mean": -16.730674743652344, "gen_logits_min": -29.305591583251953, "gen_logits_std": 3.4265384674072266, "gen_loss": 0.29945892095565796, "grad_norm": 0.3403278643776227, "learning_rate": 1.927578947368421e-05, "loss": 0.2933, "mean_copy_accuracy": 0.9970025718212128, "mean_gen_accuracy": 0.8688472360372543, "mean_token_accuracy": 0.9011178314685822, "num_tokens": 196358761.0, "sample_num_tokens": 8668.75, "step": 9226, "total_num_tokens": 196393436.0, "z_loss": 0.0004270288918633014 }, { "copy_logits_max": -0.1661551594734192, "copy_logits_min": -750000064.0, "copy_num_tokens": 418.375, "epoch": 1.8843502680622926, "gen_logits_max": 4.188837051391602, "gen_logits_mean": -16.967458724975586, "gen_logits_min": -29.677894592285156, "gen_logits_std": 3.455977439880371, "gen_loss": 0.27035272121429443, "grad_norm": 0.35812594999483277, "learning_rate": 1.9274526315789473e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9975080341100693, "mean_gen_accuracy": 0.8748246878385544, "mean_token_accuracy": 0.9035854637622833, "num_tokens": 196613277.0, "sample_num_tokens": 8053.75, "step": 9227, "total_num_tokens": 196645492.0, "z_loss": 0.0003991558332927525 }, { "copy_logits_max": -1.6024612188339233, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.75, "epoch": 1.8845545059994895, "gen_logits_max": 4.2217817306518555, "gen_logits_mean": -17.0819149017334, "gen_logits_min": -29.466691970825195, "gen_logits_std": 3.4589009284973145, "gen_loss": 0.279960036277771, "grad_norm": 0.3378502391613349, "learning_rate": 1.9273263157894737e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9979778230190277, "mean_gen_accuracy": 0.8808195739984512, "mean_token_accuracy": 0.9109909385442734, "num_tokens": 196884018.0, "sample_num_tokens": 7272.0, "step": 9228, "total_num_tokens": 196913106.0, "z_loss": 0.0004124206898268312 }, { "copy_logits_max": -2.2118754386901855, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.6875, "epoch": 1.8847587439366862, "gen_logits_max": 4.955307960510254, "gen_logits_mean": -16.631080627441406, "gen_logits_min": -28.621997833251953, "gen_logits_std": 3.4368419647216797, "gen_loss": 0.2727905809879303, "grad_norm": 0.3350186103303221, "learning_rate": 1.9272e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9977256804704666, "mean_gen_accuracy": 0.8764466047286987, "mean_token_accuracy": 0.9055463373661041, "num_tokens": 197168495.0, "sample_num_tokens": 8557.75, "step": 9229, "total_num_tokens": 197202726.0, "z_loss": 0.0004097547207493335 }, { "copy_logits_max": -0.7381612062454224, "copy_logits_min": -687500032.0, "copy_num_tokens": 462.6875, "epoch": 1.8849629818738831, "gen_logits_max": 4.072228908538818, "gen_logits_mean": -17.178850173950195, "gen_logits_min": -29.95233726501465, "gen_logits_std": 3.475642681121826, "gen_loss": 0.27324914932250977, "grad_norm": 0.36463177276042524, "learning_rate": 1.9270736842105265e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9975047260522842, "mean_gen_accuracy": 0.8764934837818146, "mean_token_accuracy": 0.9099444448947906, "num_tokens": 197430823.0, "sample_num_tokens": 7924.75, "step": 9230, "total_num_tokens": 197462522.0, "z_loss": 0.00042052441858686507 }, { "copy_logits_max": -0.26601219177246094, "copy_logits_min": -750000064.0, "copy_num_tokens": 476.8125, "epoch": 1.88516721981108, "gen_logits_max": 4.508264064788818, "gen_logits_mean": -15.474098205566406, "gen_logits_min": -27.623199462890625, "gen_logits_std": 3.398940086364746, "gen_loss": 0.2714529037475586, "grad_norm": 0.35757822572974723, "learning_rate": 1.9269473684210526e-05, "loss": 0.2563, "mean_copy_accuracy": 0.9975108951330185, "mean_gen_accuracy": 0.8806374967098236, "mean_token_accuracy": 0.9143295288085938, "num_tokens": 197694613.0, "sample_num_tokens": 7804.75, "step": 9231, "total_num_tokens": 197725832.0, "z_loss": 0.0004195196961518377 }, { "copy_logits_max": 2.284608840942383, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.1875, "epoch": 1.8853714577482767, "gen_logits_max": 5.479799270629883, "gen_logits_mean": -14.668753623962402, "gen_logits_min": -27.886964797973633, "gen_logits_std": 3.392287015914917, "gen_loss": 0.24322441220283508, "grad_norm": 0.3303786572668094, "learning_rate": 1.926821052631579e-05, "loss": 0.2689, "mean_copy_accuracy": 0.996984601020813, "mean_gen_accuracy": 0.8772694617509842, "mean_token_accuracy": 0.9073159694671631, "num_tokens": 197964191.0, "sample_num_tokens": 8853.75, "step": 9232, "total_num_tokens": 197999606.0, "z_loss": 0.00033077067928388715 }, { "copy_logits_max": 0.29515141248703003, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.125, "epoch": 1.8855756956854735, "gen_logits_max": 4.980410575866699, "gen_logits_mean": -15.747395515441895, "gen_logits_min": -28.30484962463379, "gen_logits_std": 3.4163155555725098, "gen_loss": 0.28521883487701416, "grad_norm": 0.35979144453231293, "learning_rate": 1.926694736842105e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9971368759870529, "mean_gen_accuracy": 0.8788914680480957, "mean_token_accuracy": 0.9075659513473511, "num_tokens": 198241212.0, "sample_num_tokens": 9400.5, "step": 9233, "total_num_tokens": 198278814.0, "z_loss": 0.0004081547958776355 }, { "copy_logits_max": -2.4208552837371826, "copy_logits_min": -750000000.0, "copy_num_tokens": 593.6875, "epoch": 1.8857799336226704, "gen_logits_max": 3.577810049057007, "gen_logits_mean": -17.577407836914062, "gen_logits_min": -30.465045928955078, "gen_logits_std": 3.5142054557800293, "gen_loss": 0.26627427339553833, "grad_norm": 0.3455504210253921, "learning_rate": 1.9265684210526316e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9971278160810471, "mean_gen_accuracy": 0.8730457127094269, "mean_token_accuracy": 0.9058766812086105, "num_tokens": 198525569.0, "sample_num_tokens": 9705.25, "step": 9234, "total_num_tokens": 198564390.0, "z_loss": 0.0004018954059574753 }, { "copy_logits_max": -1.7626274824142456, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5625, "epoch": 1.8859841715598673, "gen_logits_max": 4.498739242553711, "gen_logits_mean": -16.974374771118164, "gen_logits_min": -29.027938842773438, "gen_logits_std": 3.4751663208007812, "gen_loss": 0.2691040635108948, "grad_norm": 0.34583887294435467, "learning_rate": 1.9264421052631577e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9966716468334198, "mean_gen_accuracy": 0.879161924123764, "mean_token_accuracy": 0.9041185081005096, "num_tokens": 198799023.0, "sample_num_tokens": 9106.75, "step": 9235, "total_num_tokens": 198835450.0, "z_loss": 0.00038092664908617735 }, { "copy_logits_max": -1.5645872354507446, "copy_logits_min": -625000064.0, "copy_num_tokens": 511.625, "epoch": 1.886188409497064, "gen_logits_max": 3.7963833808898926, "gen_logits_mean": -17.204193115234375, "gen_logits_min": -29.720306396484375, "gen_logits_std": 3.4938502311706543, "gen_loss": 0.2671663165092468, "grad_norm": 0.353769665533451, "learning_rate": 1.926315789473684e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9966205358505249, "mean_gen_accuracy": 0.8780331760644913, "mean_token_accuracy": 0.9103619605302811, "num_tokens": 199078892.0, "sample_num_tokens": 8268.5, "step": 9236, "total_num_tokens": 199111966.0, "z_loss": 0.00038115173811092973 }, { "copy_logits_max": -3.4149179458618164, "copy_logits_min": -750000000.0, "copy_num_tokens": 487.9375, "epoch": 1.886392647434261, "gen_logits_max": 4.80926513671875, "gen_logits_mean": -14.974498748779297, "gen_logits_min": -27.280338287353516, "gen_logits_std": 3.370478630065918, "gen_loss": 0.3262850046157837, "grad_norm": 0.3635223194930782, "learning_rate": 1.926189473684211e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9958723187446594, "mean_gen_accuracy": 0.8772225230932236, "mean_token_accuracy": 0.905686542391777, "num_tokens": 199338263.0, "sample_num_tokens": 9293.25, "step": 9237, "total_num_tokens": 199375436.0, "z_loss": 0.0004654020885936916 }, { "copy_logits_max": 1.4605424404144287, "copy_logits_min": -750000000.0, "copy_num_tokens": 512.0625, "epoch": 1.8865968853714579, "gen_logits_max": 4.580829620361328, "gen_logits_mean": -15.05366325378418, "gen_logits_min": -27.78594207763672, "gen_logits_std": 3.423304557800293, "gen_loss": 0.2274617850780487, "grad_norm": 0.33575031337471845, "learning_rate": 1.926063157894737e-05, "loss": 0.2575, "mean_copy_accuracy": 0.9975664466619492, "mean_gen_accuracy": 0.881307378411293, "mean_token_accuracy": 0.912095919251442, "num_tokens": 199629556.0, "sample_num_tokens": 9667.0, "step": 9238, "total_num_tokens": 199668224.0, "z_loss": 0.00032724591437727213 }, { "copy_logits_max": -0.8199834227561951, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.875, "epoch": 1.8868011233086546, "gen_logits_max": 4.239627361297607, "gen_logits_mean": -16.222707748413086, "gen_logits_min": -28.188514709472656, "gen_logits_std": 3.4392402172088623, "gen_loss": 0.268790602684021, "grad_norm": 0.3780316233945533, "learning_rate": 1.9259368421052634e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9969552755355835, "mean_gen_accuracy": 0.877400353550911, "mean_token_accuracy": 0.9052736908197403, "num_tokens": 199899617.0, "sample_num_tokens": 8137.25, "step": 9239, "total_num_tokens": 199932166.0, "z_loss": 0.0003884733305312693 }, { "copy_logits_max": -0.008778691291809082, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.6875, "epoch": 1.8870053612458513, "gen_logits_max": 3.8321304321289062, "gen_logits_mean": -16.676076889038086, "gen_logits_min": -28.888072967529297, "gen_logits_std": 3.4523251056671143, "gen_loss": 0.31282082200050354, "grad_norm": 0.3411752864644114, "learning_rate": 1.9258105263157895e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9972091466188431, "mean_gen_accuracy": 0.872722253203392, "mean_token_accuracy": 0.9092491567134857, "num_tokens": 200204876.0, "sample_num_tokens": 9437.5, "step": 9240, "total_num_tokens": 200242626.0, "z_loss": 0.0004452866851352155 }, { "copy_logits_max": -0.09801545739173889, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.5, "epoch": 1.8872095991830482, "gen_logits_max": 3.8203558921813965, "gen_logits_mean": -16.228015899658203, "gen_logits_min": -28.81927490234375, "gen_logits_std": 3.4678707122802734, "gen_loss": 0.2870151996612549, "grad_norm": 0.34099109873241584, "learning_rate": 1.925684210526316e-05, "loss": 0.268, "mean_copy_accuracy": 0.9960083514451981, "mean_gen_accuracy": 0.8768399357795715, "mean_token_accuracy": 0.9086660891771317, "num_tokens": 200486287.0, "sample_num_tokens": 8365.25, "step": 9241, "total_num_tokens": 200519748.0, "z_loss": 0.00041318239527754486 }, { "copy_logits_max": -1.5871248245239258, "copy_logits_min": -687500032.0, "copy_num_tokens": 347.25, "epoch": 1.8874138371202451, "gen_logits_max": 4.3703203201293945, "gen_logits_mean": -16.43178939819336, "gen_logits_min": -28.554052352905273, "gen_logits_std": 3.4619548320770264, "gen_loss": 0.24967722594738007, "grad_norm": 0.33720072035991644, "learning_rate": 1.925557894736842e-05, "loss": 0.2474, "mean_copy_accuracy": 0.9967817217111588, "mean_gen_accuracy": 0.8842923194169998, "mean_token_accuracy": 0.916521817445755, "num_tokens": 200776076.0, "sample_num_tokens": 7469.0, "step": 9242, "total_num_tokens": 200805952.0, "z_loss": 0.0003534230636432767 }, { "copy_logits_max": -0.48154014348983765, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.0, "epoch": 1.8876180750574418, "gen_logits_max": 5.000067234039307, "gen_logits_mean": -15.530733108520508, "gen_logits_min": -28.22382354736328, "gen_logits_std": 3.40714430809021, "gen_loss": 0.3032565712928772, "grad_norm": 0.3815989678384377, "learning_rate": 1.9254315789473685e-05, "loss": 0.3059, "mean_copy_accuracy": 0.9970388263463974, "mean_gen_accuracy": 0.8709174245595932, "mean_token_accuracy": 0.8979759812355042, "num_tokens": 201039293.0, "sample_num_tokens": 8232.75, "step": 9243, "total_num_tokens": 201072224.0, "z_loss": 0.00043776119127869606 }, { "copy_logits_max": -2.9626951217651367, "copy_logits_min": -750000064.0, "copy_num_tokens": 355.25, "epoch": 1.8878223129946388, "gen_logits_max": 4.1122636795043945, "gen_logits_mean": -16.748504638671875, "gen_logits_min": -28.858999252319336, "gen_logits_std": 3.455169200897217, "gen_loss": 0.2786436676979065, "grad_norm": 0.34910492753589506, "learning_rate": 1.9253052631578945e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9967550486326218, "mean_gen_accuracy": 0.8812611997127533, "mean_token_accuracy": 0.9070489257574081, "num_tokens": 201310239.0, "sample_num_tokens": 7695.75, "step": 9244, "total_num_tokens": 201341022.0, "z_loss": 0.0004518191854003817 }, { "copy_logits_max": 2.0488879680633545, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.625, "epoch": 1.8880265509318357, "gen_logits_max": 4.775938987731934, "gen_logits_mean": -14.360368728637695, "gen_logits_min": -27.327072143554688, "gen_logits_std": 3.3551688194274902, "gen_loss": 0.2625158429145813, "grad_norm": 0.36152549548797314, "learning_rate": 1.925178947368421e-05, "loss": 0.2695, "mean_copy_accuracy": 0.997097909450531, "mean_gen_accuracy": 0.8766681849956512, "mean_token_accuracy": 0.9082707613706589, "num_tokens": 201565914.0, "sample_num_tokens": 8400.5, "step": 9245, "total_num_tokens": 201599516.0, "z_loss": 0.0004034404410049319 }, { "copy_logits_max": -4.629533767700195, "copy_logits_min": -750000000.0, "copy_num_tokens": 265.1875, "epoch": 1.8882307888690324, "gen_logits_max": 4.948251247406006, "gen_logits_mean": -16.46269416809082, "gen_logits_min": -28.31922149658203, "gen_logits_std": 3.434075355529785, "gen_loss": 0.2919021248817444, "grad_norm": 0.3697026528570736, "learning_rate": 1.9250526315789474e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9974915534257889, "mean_gen_accuracy": 0.885760635137558, "mean_token_accuracy": 0.9124844074249268, "num_tokens": 201820290.0, "sample_num_tokens": 6833.0, "step": 9246, "total_num_tokens": 201847622.0, "z_loss": 0.00041886878898367286 }, { "copy_logits_max": -1.5817430019378662, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.3125, "epoch": 1.888435026806229, "gen_logits_max": 4.563704013824463, "gen_logits_mean": -16.388153076171875, "gen_logits_min": -28.7003116607666, "gen_logits_std": 3.4484472274780273, "gen_loss": 0.2961313724517822, "grad_norm": 0.35590751628789025, "learning_rate": 1.924926315789474e-05, "loss": 0.2855, "mean_copy_accuracy": 0.997437447309494, "mean_gen_accuracy": 0.8768205046653748, "mean_token_accuracy": 0.9029065072536469, "num_tokens": 202074515.0, "sample_num_tokens": 8108.75, "step": 9247, "total_num_tokens": 202106950.0, "z_loss": 0.00047311687376350164 }, { "copy_logits_max": -2.1740529537200928, "copy_logits_min": -750000000.0, "copy_num_tokens": 519.4375, "epoch": 1.8886392647434262, "gen_logits_max": 3.7540314197540283, "gen_logits_mean": -17.17666244506836, "gen_logits_min": -29.806684494018555, "gen_logits_std": 3.4945905208587646, "gen_loss": 0.24120870232582092, "grad_norm": 0.3520039745648586, "learning_rate": 1.9248e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9948228746652603, "mean_gen_accuracy": 0.8826645612716675, "mean_token_accuracy": 0.9104732573032379, "num_tokens": 202343337.0, "sample_num_tokens": 9118.25, "step": 9248, "total_num_tokens": 202379810.0, "z_loss": 0.0003534517309162766 }, { "copy_logits_max": -2.2710471153259277, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.5625, "epoch": 1.888843502680623, "gen_logits_max": 3.825087547302246, "gen_logits_mean": -17.71210479736328, "gen_logits_min": -30.006778717041016, "gen_logits_std": 3.491392135620117, "gen_loss": 0.2809843122959137, "grad_norm": 0.33172290169503554, "learning_rate": 1.9246736842105264e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9973725974559784, "mean_gen_accuracy": 0.881247878074646, "mean_token_accuracy": 0.9051739871501923, "num_tokens": 202617506.0, "sample_num_tokens": 8353.0, "step": 9249, "total_num_tokens": 202650918.0, "z_loss": 0.0004523092648014426 }, { "copy_logits_max": -1.415397047996521, "copy_logits_min": -687500032.0, "copy_num_tokens": 404.125, "epoch": 1.8890477406178197, "gen_logits_max": 3.6794376373291016, "gen_logits_mean": -17.155546188354492, "gen_logits_min": -29.508148193359375, "gen_logits_std": 3.468608856201172, "gen_loss": 0.26011937856674194, "grad_norm": 0.36127519248427775, "learning_rate": 1.9245473684210528e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9975762814283371, "mean_gen_accuracy": 0.8754416853189468, "mean_token_accuracy": 0.9033962488174438, "num_tokens": 202897692.0, "sample_num_tokens": 7451.5, "step": 9250, "total_num_tokens": 202927498.0, "z_loss": 0.00043604831444099545 }, { "copy_logits_max": -2.062891721725464, "copy_logits_min": -750000000.0, "copy_num_tokens": 283.25, "epoch": 1.8892519785550166, "gen_logits_max": 4.8090009689331055, "gen_logits_mean": -16.65325927734375, "gen_logits_min": -28.771705627441406, "gen_logits_std": 3.440685749053955, "gen_loss": 0.31056785583496094, "grad_norm": 0.3786282578399914, "learning_rate": 1.924421052631579e-05, "loss": 0.2792, "mean_copy_accuracy": 0.9952946603298187, "mean_gen_accuracy": 0.8780871331691742, "mean_token_accuracy": 0.905891627073288, "num_tokens": 203165756.0, "sample_num_tokens": 7624.0, "step": 9251, "total_num_tokens": 203196252.0, "z_loss": 0.0004900447675026953 }, { "copy_logits_max": -2.253511428833008, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.375, "epoch": 1.8894562164922135, "gen_logits_max": 4.095841407775879, "gen_logits_mean": -16.86044692993164, "gen_logits_min": -29.027095794677734, "gen_logits_std": 3.4581573009490967, "gen_loss": 0.24353501200675964, "grad_norm": 0.3178689926997577, "learning_rate": 1.9242947368421053e-05, "loss": 0.2488, "mean_copy_accuracy": 0.9974392056465149, "mean_gen_accuracy": 0.8914286196231842, "mean_token_accuracy": 0.9179002791643143, "num_tokens": 203447812.0, "sample_num_tokens": 7883.5, "step": 9252, "total_num_tokens": 203479346.0, "z_loss": 0.0003973012208007276 }, { "copy_logits_max": -2.3713090419769287, "copy_logits_min": -750000064.0, "copy_num_tokens": 395.5625, "epoch": 1.8896604544294102, "gen_logits_max": 3.896531581878662, "gen_logits_mean": -17.202058792114258, "gen_logits_min": -29.596817016601562, "gen_logits_std": 3.473745107650757, "gen_loss": 0.25782155990600586, "grad_norm": 0.3320862137837021, "learning_rate": 1.9241684210526314e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9970176964998245, "mean_gen_accuracy": 0.8781186193227768, "mean_token_accuracy": 0.9064270406961441, "num_tokens": 203710640.0, "sample_num_tokens": 8049.5, "step": 9253, "total_num_tokens": 203742838.0, "z_loss": 0.00037216523196548223 }, { "copy_logits_max": -2.9252429008483887, "copy_logits_min": -687500032.0, "copy_num_tokens": 377.1875, "epoch": 1.8898646923666071, "gen_logits_max": 4.110751152038574, "gen_logits_mean": -17.114904403686523, "gen_logits_min": -29.46758270263672, "gen_logits_std": 3.4429450035095215, "gen_loss": 0.3043837547302246, "grad_norm": 0.33863846658625857, "learning_rate": 1.9240421052631582e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9963685870170593, "mean_gen_accuracy": 0.8761540502309799, "mean_token_accuracy": 0.9059434980154037, "num_tokens": 203992326.0, "sample_num_tokens": 8006.0, "step": 9254, "total_num_tokens": 204024350.0, "z_loss": 0.0004777517751790583 }, { "copy_logits_max": -5.016038417816162, "copy_logits_min": -687500032.0, "copy_num_tokens": 583.75, "epoch": 1.890068930303804, "gen_logits_max": 2.8215508460998535, "gen_logits_mean": -18.076553344726562, "gen_logits_min": -30.109468460083008, "gen_logits_std": 3.482403039932251, "gen_loss": 0.25724053382873535, "grad_norm": 0.30841475768732507, "learning_rate": 1.9239157894736843e-05, "loss": 0.2578, "mean_copy_accuracy": 0.997876912355423, "mean_gen_accuracy": 0.8786797672510147, "mean_token_accuracy": 0.9129343181848526, "num_tokens": 204285524.0, "sample_num_tokens": 10191.0, "step": 9255, "total_num_tokens": 204326288.0, "z_loss": 0.0003817089891526848 }, { "copy_logits_max": -2.863792896270752, "copy_logits_min": -750000000.0, "copy_num_tokens": 380.375, "epoch": 1.8902731682410008, "gen_logits_max": 2.8308074474334717, "gen_logits_mean": -18.9074649810791, "gen_logits_min": -31.19542694091797, "gen_logits_std": 3.5412521362304688, "gen_loss": 0.26513245701789856, "grad_norm": 0.3458124012347628, "learning_rate": 1.9237894736842107e-05, "loss": 0.2746, "mean_copy_accuracy": 0.997453361749649, "mean_gen_accuracy": 0.8808389902114868, "mean_token_accuracy": 0.9070855677127838, "num_tokens": 204567245.0, "sample_num_tokens": 7409.75, "step": 9256, "total_num_tokens": 204596884.0, "z_loss": 0.00037748206523247063 }, { "copy_logits_max": -2.663762092590332, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.1875, "epoch": 1.8904774061781975, "gen_logits_max": 3.058100700378418, "gen_logits_mean": -18.14535140991211, "gen_logits_min": -30.413127899169922, "gen_logits_std": 3.5152742862701416, "gen_loss": 0.2461429387331009, "grad_norm": 0.3624046472232974, "learning_rate": 1.9236631578947368e-05, "loss": 0.2741, "mean_copy_accuracy": 0.9968500137329102, "mean_gen_accuracy": 0.8802559673786163, "mean_token_accuracy": 0.9076464772224426, "num_tokens": 204828116.0, "sample_num_tokens": 7740.5, "step": 9257, "total_num_tokens": 204859078.0, "z_loss": 0.0004169509338680655 }, { "copy_logits_max": -5.737715721130371, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.1875, "epoch": 1.8906816441153944, "gen_logits_max": 3.2850289344787598, "gen_logits_mean": -17.491836547851562, "gen_logits_min": -29.689701080322266, "gen_logits_std": 3.452075958251953, "gen_loss": 0.30359935760498047, "grad_norm": 0.3686708767916181, "learning_rate": 1.9235368421052632e-05, "loss": 0.2817, "mean_copy_accuracy": 0.9960003197193146, "mean_gen_accuracy": 0.8754499107599258, "mean_token_accuracy": 0.9035938829183578, "num_tokens": 205094665.0, "sample_num_tokens": 8963.25, "step": 9258, "total_num_tokens": 205130518.0, "z_loss": 0.0004680995480157435 }, { "copy_logits_max": -5.587133884429932, "copy_logits_min": -625000064.0, "copy_num_tokens": 325.9375, "epoch": 1.8908858820525913, "gen_logits_max": 3.2490806579589844, "gen_logits_mean": -18.6828670501709, "gen_logits_min": -30.51829719543457, "gen_logits_std": 3.497483253479004, "gen_loss": 0.3061108887195587, "grad_norm": 0.4110430949667, "learning_rate": 1.9234105263157893e-05, "loss": 0.27, "mean_copy_accuracy": 0.9976737797260284, "mean_gen_accuracy": 0.8778371214866638, "mean_token_accuracy": 0.9076421558856964, "num_tokens": 205353599.0, "sample_num_tokens": 7298.25, "step": 9259, "total_num_tokens": 205382792.0, "z_loss": 0.0004475100722629577 }, { "copy_logits_max": -5.881989479064941, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.5625, "epoch": 1.891090119989788, "gen_logits_max": 2.584930419921875, "gen_logits_mean": -19.378082275390625, "gen_logits_min": -31.513469696044922, "gen_logits_std": 3.5155510902404785, "gen_loss": 0.2817373275756836, "grad_norm": 0.3642359512536101, "learning_rate": 1.9232842105263158e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9974197447299957, "mean_gen_accuracy": 0.8746660202741623, "mean_token_accuracy": 0.9058789908885956, "num_tokens": 205624042.0, "sample_num_tokens": 8970.0, "step": 9260, "total_num_tokens": 205659922.0, "z_loss": 0.00042876595398411155 }, { "copy_logits_max": -4.432600021362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 249.125, "epoch": 1.891294357926985, "gen_logits_max": 4.231879234313965, "gen_logits_mean": -16.12044334411621, "gen_logits_min": -28.09454345703125, "gen_logits_std": 3.319714069366455, "gen_loss": 0.3150280714035034, "grad_norm": 0.3468842653975222, "learning_rate": 1.923157894736842e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9970460534095764, "mean_gen_accuracy": 0.8778383582830429, "mean_token_accuracy": 0.9075150638818741, "num_tokens": 205901288.0, "sample_num_tokens": 6387.0, "step": 9261, "total_num_tokens": 205926836.0, "z_loss": 0.00043331366032361984 }, { "copy_logits_max": -6.855127811431885, "copy_logits_min": -750000128.0, "copy_num_tokens": 466.0, "epoch": 1.891498595864182, "gen_logits_max": 2.5209078788757324, "gen_logits_mean": -19.363990783691406, "gen_logits_min": -31.330928802490234, "gen_logits_std": 3.562765598297119, "gen_loss": 0.24205031991004944, "grad_norm": 0.34353643831608693, "learning_rate": 1.9230315789473686e-05, "loss": 0.2486, "mean_copy_accuracy": 0.9971537441015244, "mean_gen_accuracy": 0.8870351612567902, "mean_token_accuracy": 0.9149141311645508, "num_tokens": 206170273.0, "sample_num_tokens": 8694.75, "step": 9262, "total_num_tokens": 206205052.0, "z_loss": 0.00033627578523010015 }, { "copy_logits_max": -2.881887435913086, "copy_logits_min": -750000064.0, "copy_num_tokens": 680.3125, "epoch": 1.8917028338013786, "gen_logits_max": 2.843618869781494, "gen_logits_mean": -16.49907112121582, "gen_logits_min": -28.507659912109375, "gen_logits_std": 3.36496639251709, "gen_loss": 0.2540295720100403, "grad_norm": 0.35039768699797713, "learning_rate": 1.922905263157895e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9968935400247574, "mean_gen_accuracy": 0.8798782825469971, "mean_token_accuracy": 0.908673420548439, "num_tokens": 206443154.0, "sample_num_tokens": 9622.5, "step": 9263, "total_num_tokens": 206481644.0, "z_loss": 0.00040842272574082017 }, { "copy_logits_max": -3.1318840980529785, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.1875, "epoch": 1.8919070717385753, "gen_logits_max": 2.7020254135131836, "gen_logits_mean": -18.084457397460938, "gen_logits_min": -30.3801212310791, "gen_logits_std": 3.48323655128479, "gen_loss": 0.2615336775779724, "grad_norm": 0.36363686035102355, "learning_rate": 1.922778947368421e-05, "loss": 0.2584, "mean_copy_accuracy": 0.9970142692327499, "mean_gen_accuracy": 0.8819354772567749, "mean_token_accuracy": 0.9113985598087311, "num_tokens": 206740963.0, "sample_num_tokens": 8142.25, "step": 9264, "total_num_tokens": 206773532.0, "z_loss": 0.00042193502304144204 }, { "copy_logits_max": -5.007498741149902, "copy_logits_min": -750000000.0, "copy_num_tokens": 501.0625, "epoch": 1.8921113096757722, "gen_logits_max": 4.336183547973633, "gen_logits_mean": -15.653430938720703, "gen_logits_min": -28.313724517822266, "gen_logits_std": 3.392991065979004, "gen_loss": 0.26996368169784546, "grad_norm": 0.346609621842039, "learning_rate": 1.9226526315789476e-05, "loss": 0.2905, "mean_copy_accuracy": 0.9969865828752518, "mean_gen_accuracy": 0.8726329505443573, "mean_token_accuracy": 0.9013689309358597, "num_tokens": 207023090.0, "sample_num_tokens": 8711.0, "step": 9265, "total_num_tokens": 207057934.0, "z_loss": 0.00039765617111697793 }, { "copy_logits_max": -4.465095043182373, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.6875, "epoch": 1.8923155476129692, "gen_logits_max": 3.2940778732299805, "gen_logits_mean": -16.28390121459961, "gen_logits_min": -28.132715225219727, "gen_logits_std": 3.3520617485046387, "gen_loss": 0.27447181940078735, "grad_norm": 0.3695065113519265, "learning_rate": 1.9225263157894737e-05, "loss": 0.2732, "mean_copy_accuracy": 0.9970220923423767, "mean_gen_accuracy": 0.8783373683691025, "mean_token_accuracy": 0.908415898680687, "num_tokens": 207293089.0, "sample_num_tokens": 8059.75, "step": 9266, "total_num_tokens": 207325328.0, "z_loss": 0.00038138983654789627 }, { "copy_logits_max": -3.701172351837158, "copy_logits_min": -687500032.0, "copy_num_tokens": 588.875, "epoch": 1.8925197855501659, "gen_logits_max": 2.5871152877807617, "gen_logits_mean": -16.74534034729004, "gen_logits_min": -29.191999435424805, "gen_logits_std": 3.4434025287628174, "gen_loss": 0.2506280839443207, "grad_norm": 0.36796971463312805, "learning_rate": 1.9224e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9968733787536621, "mean_gen_accuracy": 0.872166246175766, "mean_token_accuracy": 0.9042310565710068, "num_tokens": 207550733.0, "sample_num_tokens": 8486.75, "step": 9267, "total_num_tokens": 207584680.0, "z_loss": 0.00038668897468596697 }, { "copy_logits_max": -4.822994709014893, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.375, "epoch": 1.8927240234873628, "gen_logits_max": 4.627314567565918, "gen_logits_mean": -15.516439437866211, "gen_logits_min": -27.665382385253906, "gen_logits_std": 3.409358024597168, "gen_loss": 0.2703699767589569, "grad_norm": 0.3747932701650013, "learning_rate": 1.9222736842105262e-05, "loss": 0.3024, "mean_copy_accuracy": 0.9968970119953156, "mean_gen_accuracy": 0.8724824339151382, "mean_token_accuracy": 0.8972264379262924, "num_tokens": 207808510.0, "sample_num_tokens": 9414.0, "step": 9268, "total_num_tokens": 207846166.0, "z_loss": 0.00038649741327390075 }, { "copy_logits_max": -3.52571964263916, "copy_logits_min": -750000128.0, "copy_num_tokens": 385.1875, "epoch": 1.8929282614245597, "gen_logits_max": 3.8447279930114746, "gen_logits_mean": -16.708831787109375, "gen_logits_min": -28.928340911865234, "gen_logits_std": 3.4389429092407227, "gen_loss": 0.2875309884548187, "grad_norm": 0.7299645590623094, "learning_rate": 1.9221473684210526e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9969513714313507, "mean_gen_accuracy": 0.8765605539083481, "mean_token_accuracy": 0.9060440808534622, "num_tokens": 208083478.0, "sample_num_tokens": 7063.5, "step": 9269, "total_num_tokens": 208111732.0, "z_loss": 0.00042343130917288363 }, { "copy_logits_max": -0.6231819987297058, "copy_logits_min": -750000000.0, "copy_num_tokens": 541.9375, "epoch": 1.8931324993617564, "gen_logits_max": 4.315647602081299, "gen_logits_mean": -15.737812995910645, "gen_logits_min": -27.95645523071289, "gen_logits_std": 3.393644332885742, "gen_loss": 0.25049692392349243, "grad_norm": 0.3786318595545143, "learning_rate": 1.922021052631579e-05, "loss": 0.2866, "mean_copy_accuracy": 0.9971373379230499, "mean_gen_accuracy": 0.872959315776825, "mean_token_accuracy": 0.9005045890808105, "num_tokens": 208331132.0, "sample_num_tokens": 9376.5, "step": 9270, "total_num_tokens": 208368638.0, "z_loss": 0.00041204120498150587 }, { "copy_logits_max": -4.735568046569824, "copy_logits_min": -750000064.0, "copy_num_tokens": 516.6875, "epoch": 1.8933367372989531, "gen_logits_max": 3.046114921569824, "gen_logits_mean": -18.061845779418945, "gen_logits_min": -30.101943969726562, "gen_logits_std": 3.503458023071289, "gen_loss": 0.2554090619087219, "grad_norm": 0.3664952137157176, "learning_rate": 1.9218947368421055e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9967603385448456, "mean_gen_accuracy": 0.8769153356552124, "mean_token_accuracy": 0.9060745239257812, "num_tokens": 208599897.0, "sample_num_tokens": 8798.75, "step": 9271, "total_num_tokens": 208635092.0, "z_loss": 0.0004193961212877184 }, { "copy_logits_max": -6.30582332611084, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.0, "epoch": 1.8935409752361503, "gen_logits_max": 3.406371831893921, "gen_logits_mean": -18.884033203125, "gen_logits_min": -30.54800033569336, "gen_logits_std": 3.4556427001953125, "gen_loss": 0.29126453399658203, "grad_norm": 0.4266019139648494, "learning_rate": 1.9217684210526316e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9955520331859589, "mean_gen_accuracy": 0.8784376978874207, "mean_token_accuracy": 0.904509037733078, "num_tokens": 208889177.0, "sample_num_tokens": 7823.75, "step": 9272, "total_num_tokens": 208920472.0, "z_loss": 0.0004872427962254733 }, { "copy_logits_max": -6.50592565536499, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.6875, "epoch": 1.893745213173347, "gen_logits_max": 3.4329771995544434, "gen_logits_mean": -15.86283016204834, "gen_logits_min": -26.98722267150879, "gen_logits_std": 3.1959166526794434, "gen_loss": 0.2599407732486725, "grad_norm": 0.35075769873540197, "learning_rate": 1.921642105263158e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9968922138214111, "mean_gen_accuracy": 0.8762915879487991, "mean_token_accuracy": 0.9061846733093262, "num_tokens": 209173962.0, "sample_num_tokens": 8668.5, "step": 9273, "total_num_tokens": 209208636.0, "z_loss": 0.00040385028114542365 }, { "copy_logits_max": -5.161976337432861, "copy_logits_min": -750000000.0, "copy_num_tokens": 438.5, "epoch": 1.8939494511105437, "gen_logits_max": 3.553149700164795, "gen_logits_mean": -15.726824760437012, "gen_logits_min": -27.220592498779297, "gen_logits_std": 3.212280750274658, "gen_loss": 0.29221245646476746, "grad_norm": 0.37390085942056556, "learning_rate": 1.921515789473684e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9970119595527649, "mean_gen_accuracy": 0.8737129867076874, "mean_token_accuracy": 0.902469664812088, "num_tokens": 209438413.0, "sample_num_tokens": 7425.25, "step": 9274, "total_num_tokens": 209468114.0, "z_loss": 0.0004628519818652421 }, { "copy_logits_max": -4.238234996795654, "copy_logits_min": -687500032.0, "copy_num_tokens": 635.25, "epoch": 1.8941536890477406, "gen_logits_max": 2.0499982833862305, "gen_logits_mean": -17.328035354614258, "gen_logits_min": -28.770816802978516, "gen_logits_std": 3.29121470451355, "gen_loss": 0.2530249059200287, "grad_norm": 0.35217237319580225, "learning_rate": 1.9213894736842105e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9970273375511169, "mean_gen_accuracy": 0.8806203901767731, "mean_token_accuracy": 0.9091057628393173, "num_tokens": 209698222.0, "sample_num_tokens": 9067.0, "step": 9275, "total_num_tokens": 209734490.0, "z_loss": 0.0004135635681450367 }, { "copy_logits_max": -5.01597785949707, "copy_logits_min": -750000000.0, "copy_num_tokens": 556.375, "epoch": 1.8943579269849375, "gen_logits_max": 1.8091537952423096, "gen_logits_mean": -18.134723663330078, "gen_logits_min": -29.344629287719727, "gen_logits_std": 3.2807092666625977, "gen_loss": 0.2789488434791565, "grad_norm": 0.4167127581836729, "learning_rate": 1.921263157894737e-05, "loss": 0.2512, "mean_copy_accuracy": 0.995704710483551, "mean_gen_accuracy": 0.8823797404766083, "mean_token_accuracy": 0.9140656292438507, "num_tokens": 209979278.0, "sample_num_tokens": 8700.5, "step": 9276, "total_num_tokens": 210014080.0, "z_loss": 0.00047451021964661777 }, { "copy_logits_max": -4.024868011474609, "copy_logits_min": -750000000.0, "copy_num_tokens": 387.5, "epoch": 1.8945621649221343, "gen_logits_max": 3.6099636554718018, "gen_logits_mean": -16.235828399658203, "gen_logits_min": -27.451400756835938, "gen_logits_std": 3.2153613567352295, "gen_loss": 0.33022788166999817, "grad_norm": 0.4793999643029419, "learning_rate": 1.921136842105263e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9962833076715469, "mean_gen_accuracy": 0.8697594851255417, "mean_token_accuracy": 0.9036102294921875, "num_tokens": 210251689.0, "sample_num_tokens": 7872.25, "step": 9277, "total_num_tokens": 210283178.0, "z_loss": 0.0005842407699674368 }, { "copy_logits_max": -3.334029197692871, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.6875, "epoch": 1.894766402859331, "gen_logits_max": 3.402930736541748, "gen_logits_mean": -15.865610122680664, "gen_logits_min": -26.75287437438965, "gen_logits_std": 3.1145615577697754, "gen_loss": 0.2814421057701111, "grad_norm": 0.3442957271004241, "learning_rate": 1.9210105263157898e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9965918064117432, "mean_gen_accuracy": 0.8771499246358871, "mean_token_accuracy": 0.9071170091629028, "num_tokens": 210530772.0, "sample_num_tokens": 8545.5, "step": 9278, "total_num_tokens": 210564954.0, "z_loss": 0.0005339033668860793 }, { "copy_logits_max": -4.376509666442871, "copy_logits_min": -750000000.0, "copy_num_tokens": 303.875, "epoch": 1.894970640796528, "gen_logits_max": 3.5386147499084473, "gen_logits_mean": -16.871925354003906, "gen_logits_min": -27.978641510009766, "gen_logits_std": 3.1912450790405273, "gen_loss": 0.29647982120513916, "grad_norm": 0.4815323798447255, "learning_rate": 1.920884210526316e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9963216632604599, "mean_gen_accuracy": 0.8804353624582291, "mean_token_accuracy": 0.9093048423528671, "num_tokens": 210774901.0, "sample_num_tokens": 6723.25, "step": 9279, "total_num_tokens": 210801794.0, "z_loss": 0.000590362586081028 }, { "copy_logits_max": -1.6965241432189941, "copy_logits_min": -750000000.0, "copy_num_tokens": 561.3125, "epoch": 1.8951748787337248, "gen_logits_max": 2.4396424293518066, "gen_logits_mean": -16.866504669189453, "gen_logits_min": -28.486770629882812, "gen_logits_std": 3.263157844543457, "gen_loss": 0.28524675965309143, "grad_norm": 0.3820854317796855, "learning_rate": 1.9207578947368423e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9964788109064102, "mean_gen_accuracy": 0.8733065873384476, "mean_token_accuracy": 0.9057151228189468, "num_tokens": 211044570.0, "sample_num_tokens": 8721.0, "step": 9280, "total_num_tokens": 211079454.0, "z_loss": 0.0006508524529635906 }, { "copy_logits_max": -2.991075038909912, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.375, "epoch": 1.8953791166709215, "gen_logits_max": 3.481982707977295, "gen_logits_mean": -17.159894943237305, "gen_logits_min": -28.439342498779297, "gen_logits_std": 3.3034582138061523, "gen_loss": 0.2599543333053589, "grad_norm": 0.360141215999148, "learning_rate": 1.9206315789473684e-05, "loss": 0.2759, "mean_copy_accuracy": 0.9961467236280441, "mean_gen_accuracy": 0.8790429979562759, "mean_token_accuracy": 0.9043035954236984, "num_tokens": 211296881.0, "sample_num_tokens": 8365.75, "step": 9281, "total_num_tokens": 211330344.0, "z_loss": 0.0004967094282619655 }, { "copy_logits_max": -3.5171473026275635, "copy_logits_min": -750000000.0, "copy_num_tokens": 371.375, "epoch": 1.8955833546081184, "gen_logits_max": 3.4122416973114014, "gen_logits_mean": -15.437141418457031, "gen_logits_min": -25.972633361816406, "gen_logits_std": 3.008925437927246, "gen_loss": 0.2849104404449463, "grad_norm": 0.3721863580393216, "learning_rate": 1.920505263157895e-05, "loss": 0.2574, "mean_copy_accuracy": 0.9964951872825623, "mean_gen_accuracy": 0.8816658109426498, "mean_token_accuracy": 0.9127125144004822, "num_tokens": 211565644.0, "sample_num_tokens": 7626.5, "step": 9282, "total_num_tokens": 211596150.0, "z_loss": 0.0005003178957849741 }, { "copy_logits_max": -4.562134742736816, "copy_logits_min": -687500032.0, "copy_num_tokens": 324.25, "epoch": 1.8957875925453154, "gen_logits_max": 3.6763033866882324, "gen_logits_mean": -17.45626449584961, "gen_logits_min": -28.565181732177734, "gen_logits_std": 3.284804582595825, "gen_loss": 0.3015166223049164, "grad_norm": 0.3692076641595173, "learning_rate": 1.920378947368421e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9954804033041, "mean_gen_accuracy": 0.8776068389415741, "mean_token_accuracy": 0.9056436419487, "num_tokens": 211821260.0, "sample_num_tokens": 8461.5, "step": 9283, "total_num_tokens": 211855106.0, "z_loss": 0.000535127124749124 }, { "copy_logits_max": -2.686762571334839, "copy_logits_min": -750000000.0, "copy_num_tokens": 607.6875, "epoch": 1.895991830482512, "gen_logits_max": 3.498443126678467, "gen_logits_mean": -15.381001472473145, "gen_logits_min": -27.153438568115234, "gen_logits_std": 3.2848219871520996, "gen_loss": 0.2877635359764099, "grad_norm": 0.3624025687840369, "learning_rate": 1.9202526315789474e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9971121698617935, "mean_gen_accuracy": 0.869469478726387, "mean_token_accuracy": 0.904680922627449, "num_tokens": 212098382.0, "sample_num_tokens": 10780.0, "step": 9284, "total_num_tokens": 212141502.0, "z_loss": 0.000557991792447865 }, { "copy_logits_max": -1.965639591217041, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.125, "epoch": 1.896196068419709, "gen_logits_max": 2.3702588081359863, "gen_logits_mean": -17.097402572631836, "gen_logits_min": -29.395278930664062, "gen_logits_std": 3.4017417430877686, "gen_loss": 0.2746421694755554, "grad_norm": 0.3688835773403009, "learning_rate": 1.9201263157894735e-05, "loss": 0.2856, "mean_copy_accuracy": 0.9969700425863266, "mean_gen_accuracy": 0.8681208938360214, "mean_token_accuracy": 0.9031256586313248, "num_tokens": 212372863.0, "sample_num_tokens": 8435.25, "step": 9285, "total_num_tokens": 212406604.0, "z_loss": 0.0005577936535701156 }, { "copy_logits_max": -4.998713493347168, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.375, "epoch": 1.896400306356906, "gen_logits_max": 2.0651135444641113, "gen_logits_mean": -18.499399185180664, "gen_logits_min": -30.349605560302734, "gen_logits_std": 3.5005037784576416, "gen_loss": 0.2441655993461609, "grad_norm": 0.34671088769058883, "learning_rate": 1.9200000000000003e-05, "loss": 0.2587, "mean_copy_accuracy": 0.9972962439060211, "mean_gen_accuracy": 0.8845196515321732, "mean_token_accuracy": 0.9123093038797379, "num_tokens": 212649294.0, "sample_num_tokens": 9573.0, "step": 9286, "total_num_tokens": 212687586.0, "z_loss": 0.00043611868750303984 }, { "copy_logits_max": -4.579208850860596, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.5625, "epoch": 1.8966045442941026, "gen_logits_max": 3.528764486312866, "gen_logits_mean": -17.275922775268555, "gen_logits_min": -29.050552368164062, "gen_logits_std": 3.4262189865112305, "gen_loss": 0.283593088388443, "grad_norm": 0.3439500508038325, "learning_rate": 1.9198736842105263e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9970791935920715, "mean_gen_accuracy": 0.8786712884902954, "mean_token_accuracy": 0.9077460616827011, "num_tokens": 212932924.0, "sample_num_tokens": 8246.0, "step": 9287, "total_num_tokens": 212965908.0, "z_loss": 0.0004210551851429045 }, { "copy_logits_max": -6.271383762359619, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.9375, "epoch": 1.8968087822312993, "gen_logits_max": 2.7143259048461914, "gen_logits_mean": -19.100059509277344, "gen_logits_min": -31.210773468017578, "gen_logits_std": 3.5052132606506348, "gen_loss": 0.28690510988235474, "grad_norm": 0.39467677291000547, "learning_rate": 1.9197473684210528e-05, "loss": 0.295, "mean_copy_accuracy": 0.9956802725791931, "mean_gen_accuracy": 0.872857004404068, "mean_token_accuracy": 0.897850751876831, "num_tokens": 213184719.0, "sample_num_tokens": 8109.75, "step": 9288, "total_num_tokens": 213217158.0, "z_loss": 0.000404551305109635 }, { "copy_logits_max": -6.116604804992676, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.375, "epoch": 1.8970130201684963, "gen_logits_max": 3.8972654342651367, "gen_logits_mean": -16.17838478088379, "gen_logits_min": -28.418773651123047, "gen_logits_std": 3.385854721069336, "gen_loss": 0.26403868198394775, "grad_norm": 0.3737161632253088, "learning_rate": 1.919621052631579e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9958415478467941, "mean_gen_accuracy": 0.8799774646759033, "mean_token_accuracy": 0.9069115221500397, "num_tokens": 213437627.0, "sample_num_tokens": 7925.75, "step": 9289, "total_num_tokens": 213469330.0, "z_loss": 0.0003493789117783308 }, { "copy_logits_max": -7.967643737792969, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.875, "epoch": 1.8972172581056932, "gen_logits_max": 4.332755088806152, "gen_logits_mean": -16.787803649902344, "gen_logits_min": -28.455154418945312, "gen_logits_std": 3.3604049682617188, "gen_loss": 0.3298141658306122, "grad_norm": 0.33685523531252126, "learning_rate": 1.9194947368421053e-05, "loss": 0.273, "mean_copy_accuracy": 0.9971046894788742, "mean_gen_accuracy": 0.8803706020116806, "mean_token_accuracy": 0.9071951061487198, "num_tokens": 213714083.0, "sample_num_tokens": 8789.25, "step": 9290, "total_num_tokens": 213749240.0, "z_loss": 0.000451412022812292 }, { "copy_logits_max": -7.5560622215271, "copy_logits_min": -687500032.0, "copy_num_tokens": 418.4375, "epoch": 1.89742149604289, "gen_logits_max": 3.813652992248535, "gen_logits_mean": -17.41357421875, "gen_logits_min": -29.26024627685547, "gen_logits_std": 3.4104180335998535, "gen_loss": 0.2895902991294861, "grad_norm": 0.3464723373713716, "learning_rate": 1.9193684210526317e-05, "loss": 0.2818, "mean_copy_accuracy": 0.9964500963687897, "mean_gen_accuracy": 0.8781678378582001, "mean_token_accuracy": 0.9045273214578629, "num_tokens": 213973633.0, "sample_num_tokens": 8446.75, "step": 9291, "total_num_tokens": 214007420.0, "z_loss": 0.00043249555164948106 }, { "copy_logits_max": -4.084596633911133, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.3125, "epoch": 1.8976257339800868, "gen_logits_max": 2.2630107402801514, "gen_logits_mean": -17.363256454467773, "gen_logits_min": -29.22196388244629, "gen_logits_std": 3.3323745727539062, "gen_loss": 0.27148422598838806, "grad_norm": 0.35092048513862883, "learning_rate": 1.9192421052631578e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9967310726642609, "mean_gen_accuracy": 0.8780571520328522, "mean_token_accuracy": 0.9087109267711639, "num_tokens": 214252434.0, "sample_num_tokens": 9188.0, "step": 9292, "total_num_tokens": 214289186.0, "z_loss": 0.0004336889542173594 }, { "copy_logits_max": -4.349977493286133, "copy_logits_min": -750000000.0, "copy_num_tokens": 523.375, "epoch": 1.8978299719172838, "gen_logits_max": 2.7830820083618164, "gen_logits_mean": -18.187210083007812, "gen_logits_min": -30.521848678588867, "gen_logits_std": 3.4562182426452637, "gen_loss": 0.2943823039531708, "grad_norm": 0.32075436241727673, "learning_rate": 1.9191157894736843e-05, "loss": 0.2579, "mean_copy_accuracy": 0.9970642924308777, "mean_gen_accuracy": 0.8838802576065063, "mean_token_accuracy": 0.911373496055603, "num_tokens": 214530130.0, "sample_num_tokens": 8911.5, "step": 9293, "total_num_tokens": 214565776.0, "z_loss": 0.00045235149445943534 }, { "copy_logits_max": -4.595008373260498, "copy_logits_min": -687500032.0, "copy_num_tokens": 420.0625, "epoch": 1.8980342098544805, "gen_logits_max": 4.436343193054199, "gen_logits_mean": -16.706680297851562, "gen_logits_min": -30.16243553161621, "gen_logits_std": 3.4271347522735596, "gen_loss": 0.25304368138313293, "grad_norm": 0.38948613659802533, "learning_rate": 1.9189894736842107e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9954283088445663, "mean_gen_accuracy": 0.8857549428939819, "mean_token_accuracy": 0.9093708842992783, "num_tokens": 214791952.0, "sample_num_tokens": 8163.5, "step": 9294, "total_num_tokens": 214824606.0, "z_loss": 0.00037436699494719505 }, { "copy_logits_max": -4.552491664886475, "copy_logits_min": -687500032.0, "copy_num_tokens": 528.25, "epoch": 1.8982384477916772, "gen_logits_max": 3.1546263694763184, "gen_logits_mean": -17.938926696777344, "gen_logits_min": -30.004413604736328, "gen_logits_std": 3.446535348892212, "gen_loss": 0.2765679359436035, "grad_norm": 0.3658321025073759, "learning_rate": 1.918863157894737e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9959774613380432, "mean_gen_accuracy": 0.876210168004036, "mean_token_accuracy": 0.90910404920578, "num_tokens": 215080036.0, "sample_num_tokens": 9066.5, "step": 9295, "total_num_tokens": 215116302.0, "z_loss": 0.00041327549843117595 }, { "copy_logits_max": -7.574819087982178, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.0625, "epoch": 1.898442685728874, "gen_logits_max": 3.364983558654785, "gen_logits_mean": -16.859479904174805, "gen_logits_min": -29.162059783935547, "gen_logits_std": 3.4217538833618164, "gen_loss": 0.26152753829956055, "grad_norm": 0.3399481767763998, "learning_rate": 1.9187368421052632e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9974597990512848, "mean_gen_accuracy": 0.8783718645572662, "mean_token_accuracy": 0.9073909223079681, "num_tokens": 215381135.0, "sample_num_tokens": 9241.25, "step": 9296, "total_num_tokens": 215418100.0, "z_loss": 0.000348440749803558 }, { "copy_logits_max": -6.243991851806641, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.875, "epoch": 1.898646923666071, "gen_logits_max": 3.0945022106170654, "gen_logits_mean": -17.317609786987305, "gen_logits_min": -29.199161529541016, "gen_logits_std": 3.3972878456115723, "gen_loss": 0.2772029638290405, "grad_norm": 0.3519023804275013, "learning_rate": 1.9186105263157896e-05, "loss": 0.273, "mean_copy_accuracy": 0.9964272528886795, "mean_gen_accuracy": 0.8796523064374924, "mean_token_accuracy": 0.9083574265241623, "num_tokens": 215646122.0, "sample_num_tokens": 8413.5, "step": 9297, "total_num_tokens": 215679776.0, "z_loss": 0.0004225285956636071 }, { "copy_logits_max": -2.7632060050964355, "copy_logits_min": -750000000.0, "copy_num_tokens": 772.1875, "epoch": 1.8988511616032677, "gen_logits_max": 2.276477813720703, "gen_logits_mean": -17.369619369506836, "gen_logits_min": -29.962142944335938, "gen_logits_std": 3.4345834255218506, "gen_loss": 0.24837735295295715, "grad_norm": 0.3527406375425291, "learning_rate": 1.9184842105263157e-05, "loss": 0.2787, "mean_copy_accuracy": 0.9968401044607162, "mean_gen_accuracy": 0.872960165143013, "mean_token_accuracy": 0.9055687785148621, "num_tokens": 215914659.0, "sample_num_tokens": 9397.25, "step": 9298, "total_num_tokens": 215952248.0, "z_loss": 0.0004204603610560298 }, { "copy_logits_max": -6.113541603088379, "copy_logits_min": -750000128.0, "copy_num_tokens": 470.3125, "epoch": 1.8990553995404647, "gen_logits_max": 2.7933127880096436, "gen_logits_mean": -17.696781158447266, "gen_logits_min": -29.738712310791016, "gen_logits_std": 3.4131507873535156, "gen_loss": 0.25277256965637207, "grad_norm": 0.3678961107968252, "learning_rate": 1.918357894736842e-05, "loss": 0.2535, "mean_copy_accuracy": 0.9973550140857697, "mean_gen_accuracy": 0.8824749737977982, "mean_token_accuracy": 0.912583664059639, "num_tokens": 216189183.0, "sample_num_tokens": 7710.75, "step": 9299, "total_num_tokens": 216220026.0, "z_loss": 0.0003755340585485101 }, { "copy_logits_max": -8.202749252319336, "copy_logits_min": -750000128.0, "copy_num_tokens": 321.3125, "epoch": 1.8992596374776616, "gen_logits_max": 3.7310123443603516, "gen_logits_mean": -18.419218063354492, "gen_logits_min": -30.336082458496094, "gen_logits_std": 3.4457859992980957, "gen_loss": 0.26049959659576416, "grad_norm": 0.365757597611542, "learning_rate": 1.9182315789473683e-05, "loss": 0.2887, "mean_copy_accuracy": 0.9966419488191605, "mean_gen_accuracy": 0.8728073537349701, "mean_token_accuracy": 0.9035226702690125, "num_tokens": 216452768.0, "sample_num_tokens": 7225.0, "step": 9300, "total_num_tokens": 216481668.0, "z_loss": 0.00039956229738891125 }, { "copy_logits_max": -6.71537446975708, "copy_logits_min": -687500032.0, "copy_num_tokens": 517.25, "epoch": 1.8994638754148583, "gen_logits_max": 3.9767568111419678, "gen_logits_mean": -15.732538223266602, "gen_logits_min": -28.029651641845703, "gen_logits_std": 3.3278145790100098, "gen_loss": 0.2892213463783264, "grad_norm": 0.35735090968364513, "learning_rate": 1.9181052631578947e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9964520633220673, "mean_gen_accuracy": 0.8812450766563416, "mean_token_accuracy": 0.9089505225419998, "num_tokens": 216719920.0, "sample_num_tokens": 9148.5, "step": 9301, "total_num_tokens": 216756514.0, "z_loss": 0.00045593129470944405 }, { "copy_logits_max": -5.404330730438232, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.5625, "epoch": 1.899668113352055, "gen_logits_max": 3.834012031555176, "gen_logits_mean": -15.633806228637695, "gen_logits_min": -27.077913284301758, "gen_logits_std": 3.2392983436584473, "gen_loss": 0.2981477975845337, "grad_norm": 0.36093699522439593, "learning_rate": 1.9179789473684208e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9969398677349091, "mean_gen_accuracy": 0.8722420781850815, "mean_token_accuracy": 0.9066876620054245, "num_tokens": 217010499.0, "sample_num_tokens": 8963.75, "step": 9302, "total_num_tokens": 217046354.0, "z_loss": 0.0004808235971722752 }, { "copy_logits_max": -5.8353071212768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.5, "epoch": 1.8998723512892521, "gen_logits_max": 3.3034298419952393, "gen_logits_mean": -17.086286544799805, "gen_logits_min": -28.469562530517578, "gen_logits_std": 3.326779365539551, "gen_loss": 0.26692280173301697, "grad_norm": 0.33441227378946176, "learning_rate": 1.9178526315789475e-05, "loss": 0.2628, "mean_copy_accuracy": 0.9966766983270645, "mean_gen_accuracy": 0.8821526169776917, "mean_token_accuracy": 0.911435678601265, "num_tokens": 217286918.0, "sample_num_tokens": 8842.5, "step": 9303, "total_num_tokens": 217322288.0, "z_loss": 0.00042613246478140354 }, { "copy_logits_max": -5.227484703063965, "copy_logits_min": -750000064.0, "copy_num_tokens": 437.0625, "epoch": 1.9000765892264488, "gen_logits_max": 3.2887935638427734, "gen_logits_mean": -17.797409057617188, "gen_logits_min": -29.89629554748535, "gen_logits_std": 3.427219867706299, "gen_loss": 0.2631138861179352, "grad_norm": 0.3308023240740604, "learning_rate": 1.917726315789474e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9965819418430328, "mean_gen_accuracy": 0.8749983757734299, "mean_token_accuracy": 0.9034251868724823, "num_tokens": 217554868.0, "sample_num_tokens": 8340.0, "step": 9304, "total_num_tokens": 217588228.0, "z_loss": 0.00044207798782736063 }, { "copy_logits_max": -5.185790061950684, "copy_logits_min": -750000000.0, "copy_num_tokens": 413.875, "epoch": 1.9002808271636455, "gen_logits_max": 3.561903953552246, "gen_logits_mean": -17.297054290771484, "gen_logits_min": -29.273683547973633, "gen_logits_std": 3.406095027923584, "gen_loss": 0.3027057647705078, "grad_norm": 0.34781729030593067, "learning_rate": 1.9176e-05, "loss": 0.2929, "mean_copy_accuracy": 0.9972033351659775, "mean_gen_accuracy": 0.8725858628749847, "mean_token_accuracy": 0.9004064351320267, "num_tokens": 217844077.0, "sample_num_tokens": 8440.75, "step": 9305, "total_num_tokens": 217877840.0, "z_loss": 0.00047435908345505595 }, { "copy_logits_max": -8.431414604187012, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.75, "epoch": 1.9004850651008425, "gen_logits_max": 3.871370315551758, "gen_logits_mean": -17.27943229675293, "gen_logits_min": -28.887420654296875, "gen_logits_std": 3.3570876121520996, "gen_loss": 0.2850847542285919, "grad_norm": 0.34184833044608814, "learning_rate": 1.9174736842105265e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9970007985830307, "mean_gen_accuracy": 0.8777611404657364, "mean_token_accuracy": 0.9051552712917328, "num_tokens": 218114545.0, "sample_num_tokens": 7308.25, "step": 9306, "total_num_tokens": 218143778.0, "z_loss": 0.0004071114235557616 }, { "copy_logits_max": -6.8885087966918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.5, "epoch": 1.9006893030380394, "gen_logits_max": 3.327040195465088, "gen_logits_mean": -18.30685043334961, "gen_logits_min": -29.954980850219727, "gen_logits_std": 3.4280362129211426, "gen_loss": 0.31187689304351807, "grad_norm": 0.3601814050622995, "learning_rate": 1.9173473684210526e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9960519224405289, "mean_gen_accuracy": 0.8811215609312057, "mean_token_accuracy": 0.9059793502092361, "num_tokens": 218386435.0, "sample_num_tokens": 7286.75, "step": 9307, "total_num_tokens": 218415582.0, "z_loss": 0.00043359468691051006 }, { "copy_logits_max": -5.818906784057617, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.375, "epoch": 1.900893540975236, "gen_logits_max": 3.2587597370147705, "gen_logits_mean": -16.482528686523438, "gen_logits_min": -28.384681701660156, "gen_logits_std": 3.3370776176452637, "gen_loss": 0.26123538613319397, "grad_norm": 0.38298373157533383, "learning_rate": 1.917221052631579e-05, "loss": 0.2854, "mean_copy_accuracy": 0.9962802827358246, "mean_gen_accuracy": 0.8749646693468094, "mean_token_accuracy": 0.9036933928728104, "num_tokens": 218649644.0, "sample_num_tokens": 8973.0, "step": 9308, "total_num_tokens": 218685536.0, "z_loss": 0.0003668757854029536 }, { "copy_logits_max": -4.631907939910889, "copy_logits_min": -750000064.0, "copy_num_tokens": 617.25, "epoch": 1.901097778912433, "gen_logits_max": 4.225034713745117, "gen_logits_mean": -15.864063262939453, "gen_logits_min": -28.05341339111328, "gen_logits_std": 3.3819222450256348, "gen_loss": 0.28258341550827026, "grad_norm": 0.37102961712934135, "learning_rate": 1.917094736842105e-05, "loss": 0.2848, "mean_copy_accuracy": 0.996425673365593, "mean_gen_accuracy": 0.8723290860652924, "mean_token_accuracy": 0.9010936915874481, "num_tokens": 218916312.0, "sample_num_tokens": 9973.5, "step": 9309, "total_num_tokens": 218956206.0, "z_loss": 0.00042019301326945424 }, { "copy_logits_max": -5.337061882019043, "copy_logits_min": -625000000.0, "copy_num_tokens": 511.875, "epoch": 1.90130201684963, "gen_logits_max": 2.7398135662078857, "gen_logits_mean": -17.316246032714844, "gen_logits_min": -29.341957092285156, "gen_logits_std": 3.411755084991455, "gen_loss": 0.2508958578109741, "grad_norm": 0.35114075130748584, "learning_rate": 1.9169684210526315e-05, "loss": 0.2596, "mean_copy_accuracy": 0.9966411292552948, "mean_gen_accuracy": 0.8826406300067902, "mean_token_accuracy": 0.9126439839601517, "num_tokens": 219199182.0, "sample_num_tokens": 8061.0, "step": 9310, "total_num_tokens": 219231426.0, "z_loss": 0.00035658484557643533 }, { "copy_logits_max": -5.066037654876709, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.25, "epoch": 1.9015062547868267, "gen_logits_max": 4.449257850646973, "gen_logits_mean": -15.198254585266113, "gen_logits_min": -27.023223876953125, "gen_logits_std": 3.252100944519043, "gen_loss": 0.2590022087097168, "grad_norm": 0.3375975851078577, "learning_rate": 1.916842105263158e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9969087541103363, "mean_gen_accuracy": 0.8819138556718826, "mean_token_accuracy": 0.9091940671205521, "num_tokens": 219476199.0, "sample_num_tokens": 9296.25, "step": 9311, "total_num_tokens": 219513384.0, "z_loss": 0.00038982019759714603 }, { "copy_logits_max": -5.9832682609558105, "copy_logits_min": -750000000.0, "copy_num_tokens": 377.75, "epoch": 1.9017104927240234, "gen_logits_max": 3.73128080368042, "gen_logits_mean": -16.896984100341797, "gen_logits_min": -28.654911041259766, "gen_logits_std": 3.379436731338501, "gen_loss": 0.26750993728637695, "grad_norm": 0.3296201845460887, "learning_rate": 1.9167157894736844e-05, "loss": 0.2544, "mean_copy_accuracy": 0.9977252185344696, "mean_gen_accuracy": 0.8838485479354858, "mean_token_accuracy": 0.9123260974884033, "num_tokens": 219760315.0, "sample_num_tokens": 7632.25, "step": 9312, "total_num_tokens": 219790844.0, "z_loss": 0.0003985100775025785 }, { "copy_logits_max": -5.067138671875, "copy_logits_min": -750000064.0, "copy_num_tokens": 511.875, "epoch": 1.9019147306612203, "gen_logits_max": 4.030113220214844, "gen_logits_mean": -15.237607955932617, "gen_logits_min": -27.47153663635254, "gen_logits_std": 3.3063907623291016, "gen_loss": 0.285408616065979, "grad_norm": 0.3587528833335229, "learning_rate": 1.9165894736842105e-05, "loss": 0.2764, "mean_copy_accuracy": 0.996887132525444, "mean_gen_accuracy": 0.8780981153249741, "mean_token_accuracy": 0.9069980531930923, "num_tokens": 220009326.0, "sample_num_tokens": 7518.0, "step": 9313, "total_num_tokens": 220039398.0, "z_loss": 0.0004461958887986839 }, { "copy_logits_max": -5.360568046569824, "copy_logits_min": -687500032.0, "copy_num_tokens": 481.9375, "epoch": 1.9021189685984172, "gen_logits_max": 2.0474236011505127, "gen_logits_mean": -19.64858627319336, "gen_logits_min": -31.88550567626953, "gen_logits_std": 3.5869994163513184, "gen_loss": 0.22401195764541626, "grad_norm": 0.36180108675546624, "learning_rate": 1.916463157894737e-05, "loss": 0.2602, "mean_copy_accuracy": 0.9970034807920456, "mean_gen_accuracy": 0.8819569200277328, "mean_token_accuracy": 0.910030797123909, "num_tokens": 220265035.0, "sample_num_tokens": 8579.75, "step": 9314, "total_num_tokens": 220299354.0, "z_loss": 0.000325535424053669 }, { "copy_logits_max": -7.543779373168945, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.625, "epoch": 1.902323206535614, "gen_logits_max": 2.7089293003082275, "gen_logits_mean": -18.72509002685547, "gen_logits_min": -30.761640548706055, "gen_logits_std": 3.5199027061462402, "gen_loss": 0.26826632022857666, "grad_norm": 0.3626562140496913, "learning_rate": 1.916336842105263e-05, "loss": 0.271, "mean_copy_accuracy": 0.9962247461080551, "mean_gen_accuracy": 0.8805635869503021, "mean_token_accuracy": 0.9051802158355713, "num_tokens": 220511548.0, "sample_num_tokens": 8377.5, "step": 9315, "total_num_tokens": 220545058.0, "z_loss": 0.00038311665412038565 }, { "copy_logits_max": -5.694398403167725, "copy_logits_min": -625000064.0, "copy_num_tokens": 763.375, "epoch": 1.9025274444728109, "gen_logits_max": 3.892030715942383, "gen_logits_mean": -15.0380277633667, "gen_logits_min": -27.14263153076172, "gen_logits_std": 3.3277010917663574, "gen_loss": 0.2666507363319397, "grad_norm": 0.336722421875194, "learning_rate": 1.9162105263157895e-05, "loss": 0.2533, "mean_copy_accuracy": 0.9971669316291809, "mean_gen_accuracy": 0.8830932080745697, "mean_token_accuracy": 0.9159519374370575, "num_tokens": 220790269.0, "sample_num_tokens": 9382.25, "step": 9316, "total_num_tokens": 220827798.0, "z_loss": 0.00034764083102345467 }, { "copy_logits_max": -5.005395412445068, "copy_logits_min": -625000000.0, "copy_num_tokens": 555.125, "epoch": 1.9027316824100078, "gen_logits_max": 4.449580192565918, "gen_logits_mean": -14.85352897644043, "gen_logits_min": -26.851871490478516, "gen_logits_std": 3.340614080429077, "gen_loss": 0.289088636636734, "grad_norm": 0.3408513865368535, "learning_rate": 1.916084210526316e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9970500320196152, "mean_gen_accuracy": 0.8741825520992279, "mean_token_accuracy": 0.9017076641321182, "num_tokens": 221058758.0, "sample_num_tokens": 9550.5, "step": 9317, "total_num_tokens": 221096960.0, "z_loss": 0.00040238857036456466 }, { "copy_logits_max": -5.929548740386963, "copy_logits_min": -750000064.0, "copy_num_tokens": 616.625, "epoch": 1.9029359203472045, "gen_logits_max": 2.5394301414489746, "gen_logits_mean": -17.986425399780273, "gen_logits_min": -30.08799934387207, "gen_logits_std": 3.5062360763549805, "gen_loss": 0.267123281955719, "grad_norm": 0.37763957310030066, "learning_rate": 1.915957894736842e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9974533468484879, "mean_gen_accuracy": 0.8678656667470932, "mean_token_accuracy": 0.9063976258039474, "num_tokens": 221343861.0, "sample_num_tokens": 9183.25, "step": 9318, "total_num_tokens": 221380594.0, "z_loss": 0.00034214137122035027 }, { "copy_logits_max": -6.964710712432861, "copy_logits_min": -687500032.0, "copy_num_tokens": 476.25, "epoch": 1.9031401582844012, "gen_logits_max": 2.2458181381225586, "gen_logits_mean": -19.111003875732422, "gen_logits_min": -31.140289306640625, "gen_logits_std": 3.532329559326172, "gen_loss": 0.2669558525085449, "grad_norm": 0.3588014490184864, "learning_rate": 1.9158315789473688e-05, "loss": 0.2722, "mean_copy_accuracy": 0.996971994638443, "mean_gen_accuracy": 0.8817596882581711, "mean_token_accuracy": 0.9070626497268677, "num_tokens": 221600532.0, "sample_num_tokens": 8295.5, "step": 9319, "total_num_tokens": 221633714.0, "z_loss": 0.000374320283299312 }, { "copy_logits_max": -5.528841972351074, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.0625, "epoch": 1.9033443962215981, "gen_logits_max": 5.167280673980713, "gen_logits_mean": -14.990325927734375, "gen_logits_min": -26.764404296875, "gen_logits_std": 3.312499761581421, "gen_loss": 0.25721973180770874, "grad_norm": 0.33219240346778595, "learning_rate": 1.915705263157895e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9972161948680878, "mean_gen_accuracy": 0.8830129653215408, "mean_token_accuracy": 0.908611848950386, "num_tokens": 221871095.0, "sample_num_tokens": 8618.75, "step": 9320, "total_num_tokens": 221905570.0, "z_loss": 0.00036164792254567146 }, { "copy_logits_max": -5.443356513977051, "copy_logits_min": -750000064.0, "copy_num_tokens": 482.1875, "epoch": 1.903548634158795, "gen_logits_max": 2.5192716121673584, "gen_logits_mean": -18.34671401977539, "gen_logits_min": -30.8507080078125, "gen_logits_std": 3.46536922454834, "gen_loss": 0.2670016884803772, "grad_norm": 0.35178229891901586, "learning_rate": 1.9155789473684213e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9966892600059509, "mean_gen_accuracy": 0.8743832558393478, "mean_token_accuracy": 0.9085233062505722, "num_tokens": 222138985.0, "sample_num_tokens": 8009.75, "step": 9321, "total_num_tokens": 222171024.0, "z_loss": 0.0003775786026380956 }, { "copy_logits_max": -3.2249114513397217, "copy_logits_min": -750000000.0, "copy_num_tokens": 655.5, "epoch": 1.9037528720959918, "gen_logits_max": 3.166379928588867, "gen_logits_mean": -16.832515716552734, "gen_logits_min": -29.13555145263672, "gen_logits_std": 3.3959784507751465, "gen_loss": 0.2804776132106781, "grad_norm": 0.3373395416569748, "learning_rate": 1.9154526315789474e-05, "loss": 0.2656, "mean_copy_accuracy": 0.9960023164749146, "mean_gen_accuracy": 0.8806039094924927, "mean_token_accuracy": 0.9103353470563889, "num_tokens": 222407849.0, "sample_num_tokens": 9858.25, "step": 9322, "total_num_tokens": 222447282.0, "z_loss": 0.00042792625026777387 }, { "copy_logits_max": -3.251204013824463, "copy_logits_min": -750000064.0, "copy_num_tokens": 608.6875, "epoch": 1.9039571100331887, "gen_logits_max": 2.665721893310547, "gen_logits_mean": -17.216232299804688, "gen_logits_min": -29.294797897338867, "gen_logits_std": 3.422680616378784, "gen_loss": 0.28208333253860474, "grad_norm": 0.34568934601697965, "learning_rate": 1.9153263157894738e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9970323741436005, "mean_gen_accuracy": 0.8761290013790131, "mean_token_accuracy": 0.9062528312206268, "num_tokens": 222672191.0, "sample_num_tokens": 9221.75, "step": 9323, "total_num_tokens": 222709078.0, "z_loss": 0.0004631370829883963 }, { "copy_logits_max": -4.977374076843262, "copy_logits_min": -687500032.0, "copy_num_tokens": 376.0, "epoch": 1.9041613479703856, "gen_logits_max": 3.5899362564086914, "gen_logits_mean": -18.17116928100586, "gen_logits_min": -30.165281295776367, "gen_logits_std": 3.4558844566345215, "gen_loss": 0.29821041226387024, "grad_norm": 0.37387137503636547, "learning_rate": 1.9152e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9966917932033539, "mean_gen_accuracy": 0.8749800026416779, "mean_token_accuracy": 0.9027981758117676, "num_tokens": 222957100.0, "sample_num_tokens": 8210.5, "step": 9324, "total_num_tokens": 222989942.0, "z_loss": 0.0005564292659983039 }, { "copy_logits_max": -3.666121244430542, "copy_logits_min": -687500032.0, "copy_num_tokens": 454.5625, "epoch": 1.9043655859075823, "gen_logits_max": 2.4159929752349854, "gen_logits_mean": -19.0424861907959, "gen_logits_min": -31.023038864135742, "gen_logits_std": 3.4901185035705566, "gen_loss": 0.26173728704452515, "grad_norm": 0.35803816383785836, "learning_rate": 1.9150736842105263e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9976104348897934, "mean_gen_accuracy": 0.8780514597892761, "mean_token_accuracy": 0.9045340418815613, "num_tokens": 223234564.0, "sample_num_tokens": 7959.0, "step": 9325, "total_num_tokens": 223266400.0, "z_loss": 0.0005060263792984188 }, { "copy_logits_max": -3.8974556922912598, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.375, "epoch": 1.904569823844779, "gen_logits_max": 3.4908695220947266, "gen_logits_mean": -17.938798904418945, "gen_logits_min": -29.868709564208984, "gen_logits_std": 3.4487485885620117, "gen_loss": 0.3030143082141876, "grad_norm": 0.34680699954244243, "learning_rate": 1.9149473684210524e-05, "loss": 0.2874, "mean_copy_accuracy": 0.9964365363121033, "mean_gen_accuracy": 0.8749079704284668, "mean_token_accuracy": 0.903264120221138, "num_tokens": 223498121.0, "sample_num_tokens": 8500.75, "step": 9326, "total_num_tokens": 223532124.0, "z_loss": 0.000557495397515595 }, { "copy_logits_max": -4.747308731079102, "copy_logits_min": -750000000.0, "copy_num_tokens": 615.0625, "epoch": 1.904774061781976, "gen_logits_max": 3.738255739212036, "gen_logits_mean": -16.6754150390625, "gen_logits_min": -28.665122985839844, "gen_logits_std": 3.407197952270508, "gen_loss": 0.2525632977485657, "grad_norm": 0.33051054392275936, "learning_rate": 1.9148210526315792e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9968718588352203, "mean_gen_accuracy": 0.8808692395687103, "mean_token_accuracy": 0.9081513732671738, "num_tokens": 223766866.0, "sample_num_tokens": 9887.5, "step": 9327, "total_num_tokens": 223806416.0, "z_loss": 0.00042733398731797934 }, { "copy_logits_max": -7.919189929962158, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.5625, "epoch": 1.9049782997191729, "gen_logits_max": 4.95452356338501, "gen_logits_mean": -17.188753128051758, "gen_logits_min": -28.555469512939453, "gen_logits_std": 3.374508857727051, "gen_loss": 0.26934510469436646, "grad_norm": 0.34393872027498135, "learning_rate": 1.9146947368421053e-05, "loss": 0.2944, "mean_copy_accuracy": 0.9977325052022934, "mean_gen_accuracy": 0.8712395876646042, "mean_token_accuracy": 0.899753600358963, "num_tokens": 224034646.0, "sample_num_tokens": 7708.5, "step": 9328, "total_num_tokens": 224065480.0, "z_loss": 0.00040481420001015067 }, { "copy_logits_max": -4.491854190826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.375, "epoch": 1.9051825376563696, "gen_logits_max": 3.4602222442626953, "gen_logits_mean": -17.017009735107422, "gen_logits_min": -29.25201416015625, "gen_logits_std": 3.448239326477051, "gen_loss": 0.2727903723716736, "grad_norm": 0.3368035038971042, "learning_rate": 1.9145684210526317e-05, "loss": 0.288, "mean_copy_accuracy": 0.996864065527916, "mean_gen_accuracy": 0.8722815066576004, "mean_token_accuracy": 0.9012911468744278, "num_tokens": 224310139.0, "sample_num_tokens": 8305.25, "step": 9329, "total_num_tokens": 224343360.0, "z_loss": 0.0004202346899546683 }, { "copy_logits_max": -6.4681854248046875, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.0, "epoch": 1.9053867755935665, "gen_logits_max": 3.962791681289673, "gen_logits_mean": -17.66986083984375, "gen_logits_min": -29.536457061767578, "gen_logits_std": 3.4339466094970703, "gen_loss": 0.31904473900794983, "grad_norm": 0.4174996716339416, "learning_rate": 1.914442105263158e-05, "loss": 0.29, "mean_copy_accuracy": 0.9955938458442688, "mean_gen_accuracy": 0.8727445155382156, "mean_token_accuracy": 0.9009167104959488, "num_tokens": 224571305.0, "sample_num_tokens": 8367.75, "step": 9330, "total_num_tokens": 224604776.0, "z_loss": 0.00046501608449034393 }, { "copy_logits_max": -4.738698959350586, "copy_logits_min": -750000000.0, "copy_num_tokens": 650.875, "epoch": 1.9055910135307634, "gen_logits_max": 4.206439018249512, "gen_logits_mean": -17.114063262939453, "gen_logits_min": -29.04673194885254, "gen_logits_std": 3.4490854740142822, "gen_loss": 0.25127893686294556, "grad_norm": 0.3556487843345798, "learning_rate": 1.9143157894736842e-05, "loss": 0.2865, "mean_copy_accuracy": 0.9975408464670181, "mean_gen_accuracy": 0.8739197701215744, "mean_token_accuracy": 0.9040600955486298, "num_tokens": 224832235.0, "sample_num_tokens": 9826.75, "step": 9331, "total_num_tokens": 224871542.0, "z_loss": 0.00037093847640790045 }, { "copy_logits_max": -6.8685102462768555, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.1875, "epoch": 1.9057952514679601, "gen_logits_max": 4.072457313537598, "gen_logits_mean": -18.15029525756836, "gen_logits_min": -29.65536880493164, "gen_logits_std": 3.449092149734497, "gen_loss": 0.33146238327026367, "grad_norm": 0.37083893031535914, "learning_rate": 1.9141894736842107e-05, "loss": 0.2963, "mean_copy_accuracy": 0.9973891377449036, "mean_gen_accuracy": 0.8683430105447769, "mean_token_accuracy": 0.8987764716148376, "num_tokens": 225102155.0, "sample_num_tokens": 8724.25, "step": 9332, "total_num_tokens": 225137052.0, "z_loss": 0.000489088473841548 }, { "copy_logits_max": -5.208327770233154, "copy_logits_min": -750000000.0, "copy_num_tokens": 467.1875, "epoch": 1.9059994894051568, "gen_logits_max": 4.110136985778809, "gen_logits_mean": -16.793943405151367, "gen_logits_min": -28.665489196777344, "gen_logits_std": 3.4374051094055176, "gen_loss": 0.29515784978866577, "grad_norm": 0.37166552014302406, "learning_rate": 1.9140631578947368e-05, "loss": 0.2894, "mean_copy_accuracy": 0.9969623535871506, "mean_gen_accuracy": 0.8686578273773193, "mean_token_accuracy": 0.9014827460050583, "num_tokens": 225368721.0, "sample_num_tokens": 8178.25, "step": 9333, "total_num_tokens": 225401434.0, "z_loss": 0.00042837345972657204 }, { "copy_logits_max": -4.092196941375732, "copy_logits_min": -687500032.0, "copy_num_tokens": 351.25, "epoch": 1.906203727342354, "gen_logits_max": 5.335680961608887, "gen_logits_mean": -15.746349334716797, "gen_logits_min": -27.91690444946289, "gen_logits_std": 3.3971779346466064, "gen_loss": 0.2784740924835205, "grad_norm": 0.3638003405174612, "learning_rate": 1.9139368421052632e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9974995106458664, "mean_gen_accuracy": 0.8776921927928925, "mean_token_accuracy": 0.9090655744075775, "num_tokens": 225647458.0, "sample_num_tokens": 8054.0, "step": 9334, "total_num_tokens": 225679674.0, "z_loss": 0.000486024102428928 }, { "copy_logits_max": -6.321242809295654, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.3125, "epoch": 1.9064079652795507, "gen_logits_max": 4.4185919761657715, "gen_logits_mean": -17.047229766845703, "gen_logits_min": -29.1998348236084, "gen_logits_std": 3.4921977519989014, "gen_loss": 0.27081429958343506, "grad_norm": 0.559393289622572, "learning_rate": 1.9138105263157896e-05, "loss": 0.2871, "mean_copy_accuracy": 0.9967149496078491, "mean_gen_accuracy": 0.8769357651472092, "mean_token_accuracy": 0.9031270295381546, "num_tokens": 225913067.0, "sample_num_tokens": 8896.25, "step": 9335, "total_num_tokens": 225948652.0, "z_loss": 0.0003943295741919428 }, { "copy_logits_max": -7.757434844970703, "copy_logits_min": -750000000.0, "copy_num_tokens": 209.5, "epoch": 1.9066122032167474, "gen_logits_max": 5.918673038482666, "gen_logits_mean": -15.623147964477539, "gen_logits_min": -27.603591918945312, "gen_logits_std": 3.383582592010498, "gen_loss": 0.2819082736968994, "grad_norm": 0.3504779459149374, "learning_rate": 1.913684210526316e-05, "loss": 0.2674, "mean_copy_accuracy": 0.9955922067165375, "mean_gen_accuracy": 0.8832117766141891, "mean_token_accuracy": 0.9085821956396103, "num_tokens": 226173104.0, "sample_num_tokens": 5711.5, "step": 9336, "total_num_tokens": 226195950.0, "z_loss": 0.00042693247087299824 }, { "copy_logits_max": -6.910490036010742, "copy_logits_min": -750000000.0, "copy_num_tokens": 250.6875, "epoch": 1.9068164411539443, "gen_logits_max": 3.4918558597564697, "gen_logits_mean": -18.91372299194336, "gen_logits_min": -30.615493774414062, "gen_logits_std": 3.5198981761932373, "gen_loss": 0.27138927578926086, "grad_norm": 0.33110872451966333, "learning_rate": 1.913557894736842e-05, "loss": 0.2695, "mean_copy_accuracy": 0.9958917051553726, "mean_gen_accuracy": 0.8858088999986649, "mean_token_accuracy": 0.9082238674163818, "num_tokens": 226432655.0, "sample_num_tokens": 6707.25, "step": 9337, "total_num_tokens": 226459484.0, "z_loss": 0.00045795878395438194 }, { "copy_logits_max": -6.685386657714844, "copy_logits_min": -687500032.0, "copy_num_tokens": 294.5625, "epoch": 1.9070206790911413, "gen_logits_max": 5.349501132965088, "gen_logits_mean": -16.131406784057617, "gen_logits_min": -28.037565231323242, "gen_logits_std": 3.4061903953552246, "gen_loss": 0.32614120841026306, "grad_norm": 0.3588321392308311, "learning_rate": 1.9134315789473686e-05, "loss": 0.2834, "mean_copy_accuracy": 0.99613818526268, "mean_gen_accuracy": 0.8793968856334686, "mean_token_accuracy": 0.9020711183547974, "num_tokens": 226679453.0, "sample_num_tokens": 8126.25, "step": 9338, "total_num_tokens": 226711958.0, "z_loss": 0.0005472980556078255 }, { "copy_logits_max": -6.698430061340332, "copy_logits_min": -750000000.0, "copy_num_tokens": 686.75, "epoch": 1.907224917028338, "gen_logits_max": 3.876272439956665, "gen_logits_mean": -15.926275253295898, "gen_logits_min": -28.258960723876953, "gen_logits_std": 3.4612245559692383, "gen_loss": 0.2196129411458969, "grad_norm": 0.33616644771937004, "learning_rate": 1.9133052631578947e-05, "loss": 0.2554, "mean_copy_accuracy": 0.9975082129240036, "mean_gen_accuracy": 0.8788125962018967, "mean_token_accuracy": 0.9119095206260681, "num_tokens": 226951935.0, "sample_num_tokens": 8812.25, "step": 9339, "total_num_tokens": 226987184.0, "z_loss": 0.0003463714965619147 }, { "copy_logits_max": -7.179204940795898, "copy_logits_min": -750000000.0, "copy_num_tokens": 653.375, "epoch": 1.907429154965535, "gen_logits_max": 3.740488052368164, "gen_logits_mean": -15.553910255432129, "gen_logits_min": -28.214984893798828, "gen_logits_std": 3.439652681350708, "gen_loss": 0.24135951697826385, "grad_norm": 0.3359282340056816, "learning_rate": 1.913178947368421e-05, "loss": 0.2618, "mean_copy_accuracy": 0.9968342334032059, "mean_gen_accuracy": 0.8793525397777557, "mean_token_accuracy": 0.9119329303503036, "num_tokens": 227223523.0, "sample_num_tokens": 8504.25, "step": 9340, "total_num_tokens": 227257540.0, "z_loss": 0.000393677648389712 }, { "copy_logits_max": -7.952460289001465, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.375, "epoch": 1.9076333929027318, "gen_logits_max": 4.175426959991455, "gen_logits_mean": -16.278610229492188, "gen_logits_min": -28.740753173828125, "gen_logits_std": 3.4339451789855957, "gen_loss": 0.23279157280921936, "grad_norm": 0.32972415001263516, "learning_rate": 1.9130526315789472e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9970463514328003, "mean_gen_accuracy": 0.885968953371048, "mean_token_accuracy": 0.9126574695110321, "num_tokens": 227495462.0, "sample_num_tokens": 9742.0, "step": 9341, "total_num_tokens": 227534430.0, "z_loss": 0.0003476723504718393 }, { "copy_logits_max": -7.917882919311523, "copy_logits_min": -687500032.0, "copy_num_tokens": 275.25, "epoch": 1.9078376308399285, "gen_logits_max": 4.639451026916504, "gen_logits_mean": -16.893905639648438, "gen_logits_min": -28.697742462158203, "gen_logits_std": 3.433004856109619, "gen_loss": 0.28351467847824097, "grad_norm": 0.34555236319706156, "learning_rate": 1.9129263157894736e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9962594956159592, "mean_gen_accuracy": 0.8835647255182266, "mean_token_accuracy": 0.9123780429363251, "num_tokens": 227766071.0, "sample_num_tokens": 7079.75, "step": 9342, "total_num_tokens": 227794390.0, "z_loss": 0.0004020709893666208 }, { "copy_logits_max": -7.06979513168335, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.5625, "epoch": 1.9080418687771252, "gen_logits_max": 3.1857569217681885, "gen_logits_mean": -19.272693634033203, "gen_logits_min": -31.26474380493164, "gen_logits_std": 3.555896043777466, "gen_loss": 0.2817046046257019, "grad_norm": 0.3588042215026898, "learning_rate": 1.9128e-05, "loss": 0.2915, "mean_copy_accuracy": 0.9969021677970886, "mean_gen_accuracy": 0.8761319369077682, "mean_token_accuracy": 0.9010597169399261, "num_tokens": 228027851.0, "sample_num_tokens": 8227.75, "step": 9343, "total_num_tokens": 228060762.0, "z_loss": 0.00040650920709595084 }, { "copy_logits_max": -5.494545936584473, "copy_logits_min": -687500032.0, "copy_num_tokens": 629.25, "epoch": 1.9082461067143222, "gen_logits_max": 1.771364688873291, "gen_logits_mean": -19.335575103759766, "gen_logits_min": -31.74645233154297, "gen_logits_std": 3.5688014030456543, "gen_loss": 0.2705359160900116, "grad_norm": 0.3209227920467599, "learning_rate": 1.9126736842105265e-05, "loss": 0.261, "mean_copy_accuracy": 0.9974411129951477, "mean_gen_accuracy": 0.8800272345542908, "mean_token_accuracy": 0.912594273686409, "num_tokens": 228315531.0, "sample_num_tokens": 9593.75, "step": 9344, "total_num_tokens": 228353906.0, "z_loss": 0.0004600081010721624 }, { "copy_logits_max": -4.905098915100098, "copy_logits_min": -687500032.0, "copy_num_tokens": 493.3125, "epoch": 1.908450344651519, "gen_logits_max": 3.8859572410583496, "gen_logits_mean": -16.306324005126953, "gen_logits_min": -28.35116195678711, "gen_logits_std": 3.4292898178100586, "gen_loss": 0.2826228737831116, "grad_norm": 0.37408344982391406, "learning_rate": 1.912547368421053e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9965579211711884, "mean_gen_accuracy": 0.8777205049991608, "mean_token_accuracy": 0.9041880816221237, "num_tokens": 228572457.0, "sample_num_tokens": 8803.25, "step": 9345, "total_num_tokens": 228607670.0, "z_loss": 0.00043908145744353533 }, { "copy_logits_max": -6.509123802185059, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.0625, "epoch": 1.9086545825887158, "gen_logits_max": 2.9915692806243896, "gen_logits_mean": -18.086589813232422, "gen_logits_min": -29.958789825439453, "gen_logits_std": 3.4950385093688965, "gen_loss": 0.24254122376441956, "grad_norm": 0.3532651941013891, "learning_rate": 1.912421052631579e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9963845014572144, "mean_gen_accuracy": 0.8861669450998306, "mean_token_accuracy": 0.9073223769664764, "num_tokens": 228826168.0, "sample_num_tokens": 9548.0, "step": 9346, "total_num_tokens": 228864360.0, "z_loss": 0.000380960846086964 }, { "copy_logits_max": -7.6130781173706055, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.125, "epoch": 1.9088588205259127, "gen_logits_max": 2.7169876098632812, "gen_logits_mean": -19.779293060302734, "gen_logits_min": -31.731098175048828, "gen_logits_std": 3.5703353881835938, "gen_loss": 0.2945496439933777, "grad_norm": 0.3642522810951262, "learning_rate": 1.9122947368421054e-05, "loss": 0.2878, "mean_copy_accuracy": 0.99605593085289, "mean_gen_accuracy": 0.874174177646637, "mean_token_accuracy": 0.9020773321390152, "num_tokens": 229095080.0, "sample_num_tokens": 7566.0, "step": 9347, "total_num_tokens": 229125344.0, "z_loss": 0.00043790851486846805 }, { "copy_logits_max": -7.912386417388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 357.875, "epoch": 1.9090630584631096, "gen_logits_max": 3.7629499435424805, "gen_logits_mean": -18.329452514648438, "gen_logits_min": -30.526180267333984, "gen_logits_std": 3.4952573776245117, "gen_loss": 0.3095143735408783, "grad_norm": 0.3809383368140888, "learning_rate": 1.9121684210526315e-05, "loss": 0.2918, "mean_copy_accuracy": 0.9951819777488708, "mean_gen_accuracy": 0.8739574551582336, "mean_token_accuracy": 0.9001065492630005, "num_tokens": 229350259.0, "sample_num_tokens": 7764.75, "step": 9348, "total_num_tokens": 229381318.0, "z_loss": 0.00044204387813806534 }, { "copy_logits_max": -1.6935412883758545, "copy_logits_min": -750000000.0, "copy_num_tokens": 626.375, "epoch": 1.9092672964003063, "gen_logits_max": 3.3155887126922607, "gen_logits_mean": -17.572978973388672, "gen_logits_min": -29.71483039855957, "gen_logits_std": 3.4848785400390625, "gen_loss": 0.26161903142929077, "grad_norm": 0.31503402333433106, "learning_rate": 1.912042105263158e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9976595044136047, "mean_gen_accuracy": 0.8715613931417465, "mean_token_accuracy": 0.9040080457925797, "num_tokens": 229632686.0, "sample_num_tokens": 9946.0, "step": 9349, "total_num_tokens": 229672470.0, "z_loss": 0.0004222735296934843 }, { "copy_logits_max": -4.963117599487305, "copy_logits_min": -750000000.0, "copy_num_tokens": 545.25, "epoch": 1.909471534337503, "gen_logits_max": 3.568902015686035, "gen_logits_mean": -17.31999969482422, "gen_logits_min": -29.396333694458008, "gen_logits_std": 3.4604406356811523, "gen_loss": 0.3068985939025879, "grad_norm": 0.32616362506794794, "learning_rate": 1.911915789473684e-05, "loss": 0.2573, "mean_copy_accuracy": 0.997776061296463, "mean_gen_accuracy": 0.8778864294290543, "mean_token_accuracy": 0.9119922071695328, "num_tokens": 229932802.0, "sample_num_tokens": 9209.5, "step": 9350, "total_num_tokens": 229969640.0, "z_loss": 0.0004373946867417544 }, { "copy_logits_max": -2.380430221557617, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.3125, "epoch": 1.9096757722747, "gen_logits_max": 3.888775110244751, "gen_logits_mean": -17.410823822021484, "gen_logits_min": -30.070659637451172, "gen_logits_std": 3.47774600982666, "gen_loss": 0.2503971457481384, "grad_norm": 0.3521232571823122, "learning_rate": 1.9117894736842105e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9978784769773483, "mean_gen_accuracy": 0.8748875707387924, "mean_token_accuracy": 0.906078889966011, "num_tokens": 230199593.0, "sample_num_tokens": 8463.75, "step": 9351, "total_num_tokens": 230233448.0, "z_loss": 0.0003964472562074661 }, { "copy_logits_max": -1.7134817838668823, "copy_logits_min": -750000064.0, "copy_num_tokens": 642.0, "epoch": 1.909880010211897, "gen_logits_max": 4.4870195388793945, "gen_logits_mean": -16.377059936523438, "gen_logits_min": -28.953231811523438, "gen_logits_std": 3.460505723953247, "gen_loss": 0.2677473723888397, "grad_norm": 0.3688943511047786, "learning_rate": 1.911663157894737e-05, "loss": 0.2727, "mean_copy_accuracy": 0.9967137575149536, "mean_gen_accuracy": 0.8783003985881805, "mean_token_accuracy": 0.9074196368455887, "num_tokens": 230459171.0, "sample_num_tokens": 9735.25, "step": 9352, "total_num_tokens": 230498112.0, "z_loss": 0.0004572602338157594 }, { "copy_logits_max": -4.421049118041992, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.3125, "epoch": 1.9100842481490936, "gen_logits_max": 4.606781959533691, "gen_logits_mean": -17.352025985717773, "gen_logits_min": -29.459131240844727, "gen_logits_std": 3.4410104751586914, "gen_loss": 0.3186189532279968, "grad_norm": 0.3610617227513878, "learning_rate": 1.9115368421052633e-05, "loss": 0.283, "mean_copy_accuracy": 0.9965603351593018, "mean_gen_accuracy": 0.8797346353530884, "mean_token_accuracy": 0.9039848893880844, "num_tokens": 230717751.0, "sample_num_tokens": 7697.75, "step": 9353, "total_num_tokens": 230748542.0, "z_loss": 0.0004379941092338413 }, { "copy_logits_max": -4.297059059143066, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.125, "epoch": 1.9102884860862905, "gen_logits_max": 3.6959638595581055, "gen_logits_mean": -17.710186004638672, "gen_logits_min": -29.98188018798828, "gen_logits_std": 3.48224139213562, "gen_loss": 0.2833130955696106, "grad_norm": 0.34708684670091006, "learning_rate": 1.9114105263157894e-05, "loss": 0.2594, "mean_copy_accuracy": 0.9962416291236877, "mean_gen_accuracy": 0.8845850974321365, "mean_token_accuracy": 0.9119779169559479, "num_tokens": 231009566.0, "sample_num_tokens": 6925.0, "step": 9354, "total_num_tokens": 231037266.0, "z_loss": 0.000425842241384089 }, { "copy_logits_max": -0.8713923692703247, "copy_logits_min": -750000000.0, "copy_num_tokens": 360.625, "epoch": 1.9104927240234875, "gen_logits_max": 5.0069403648376465, "gen_logits_mean": -15.344860076904297, "gen_logits_min": -27.948850631713867, "gen_logits_std": 3.385502815246582, "gen_loss": 0.31463822722435, "grad_norm": 0.39999264585862965, "learning_rate": 1.911284210526316e-05, "loss": 0.2948, "mean_copy_accuracy": 0.9969454407691956, "mean_gen_accuracy": 0.8693344444036484, "mean_token_accuracy": 0.8994632661342621, "num_tokens": 231266455.0, "sample_num_tokens": 7692.25, "step": 9355, "total_num_tokens": 231297224.0, "z_loss": 0.0004560496308840811 }, { "copy_logits_max": -3.9743494987487793, "copy_logits_min": -687500032.0, "copy_num_tokens": 556.0, "epoch": 1.9106969619606842, "gen_logits_max": 2.7981672286987305, "gen_logits_mean": -18.539276123046875, "gen_logits_min": -31.051651000976562, "gen_logits_std": 3.560805320739746, "gen_loss": 0.26702260971069336, "grad_norm": 0.3499572582642522, "learning_rate": 1.911157894736842e-05, "loss": 0.2538, "mean_copy_accuracy": 0.9969573765993118, "mean_gen_accuracy": 0.8818740695714951, "mean_token_accuracy": 0.9144773632287979, "num_tokens": 231523150.0, "sample_num_tokens": 8127.5, "step": 9356, "total_num_tokens": 231555660.0, "z_loss": 0.00042505725286900997 }, { "copy_logits_max": -1.6050937175750732, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.875, "epoch": 1.9109011998978809, "gen_logits_max": 4.592419624328613, "gen_logits_mean": -16.239410400390625, "gen_logits_min": -29.418514251708984, "gen_logits_std": 3.453507900238037, "gen_loss": 0.2787103056907654, "grad_norm": 0.3703448022311434, "learning_rate": 1.9110315789473684e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9971387982368469, "mean_gen_accuracy": 0.8790499716997147, "mean_token_accuracy": 0.9050517082214355, "num_tokens": 231788999.0, "sample_num_tokens": 8581.25, "step": 9357, "total_num_tokens": 231823324.0, "z_loss": 0.0004400289326440543 }, { "copy_logits_max": -4.870447635650635, "copy_logits_min": -687500032.0, "copy_num_tokens": 344.125, "epoch": 1.911105437835078, "gen_logits_max": 3.7245078086853027, "gen_logits_mean": -17.544687271118164, "gen_logits_min": -29.701597213745117, "gen_logits_std": 3.4470207691192627, "gen_loss": 0.30266258120536804, "grad_norm": 0.361613058855158, "learning_rate": 1.9109052631578948e-05, "loss": 0.2985, "mean_copy_accuracy": 0.9957378953695297, "mean_gen_accuracy": 0.872244119644165, "mean_token_accuracy": 0.8984479159116745, "num_tokens": 232037132.0, "sample_num_tokens": 7914.5, "step": 9358, "total_num_tokens": 232068790.0, "z_loss": 0.0004530521109700203 }, { "copy_logits_max": -5.473947525024414, "copy_logits_min": -687500032.0, "copy_num_tokens": 340.5625, "epoch": 1.9113096757722747, "gen_logits_max": 3.974240779876709, "gen_logits_mean": -17.134313583374023, "gen_logits_min": -29.94060516357422, "gen_logits_std": 3.4566433429718018, "gen_loss": 0.2673903703689575, "grad_norm": 0.36068914157244375, "learning_rate": 1.910778947368421e-05, "loss": 0.263, "mean_copy_accuracy": 0.9963824898004532, "mean_gen_accuracy": 0.8825279921293259, "mean_token_accuracy": 0.9095593392848969, "num_tokens": 232289689.0, "sample_num_tokens": 7207.75, "step": 9359, "total_num_tokens": 232318520.0, "z_loss": 0.0003894292749464512 }, { "copy_logits_max": -5.3892412185668945, "copy_logits_min": -750000000.0, "copy_num_tokens": 399.375, "epoch": 1.9115139137094714, "gen_logits_max": 3.9040048122406006, "gen_logits_mean": -18.14360809326172, "gen_logits_min": -30.47020149230957, "gen_logits_std": 3.493818521499634, "gen_loss": 0.3116384446620941, "grad_norm": 0.36623657050875347, "learning_rate": 1.9106526315789477e-05, "loss": 0.2913, "mean_copy_accuracy": 0.9964367747306824, "mean_gen_accuracy": 0.8749140202999115, "mean_token_accuracy": 0.9009857475757599, "num_tokens": 232544210.0, "sample_num_tokens": 8681.5, "step": 9360, "total_num_tokens": 232578936.0, "z_loss": 0.00047582032857462764 }, { "copy_logits_max": -3.0440688133239746, "copy_logits_min": -750000064.0, "copy_num_tokens": 548.5, "epoch": 1.9117181516466684, "gen_logits_max": 4.020605087280273, "gen_logits_mean": -17.11056900024414, "gen_logits_min": -29.76373291015625, "gen_logits_std": 3.4610376358032227, "gen_loss": 0.28899410367012024, "grad_norm": 0.34305080155995366, "learning_rate": 1.9105263157894738e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9977698028087616, "mean_gen_accuracy": 0.8742619901895523, "mean_token_accuracy": 0.9068034142255783, "num_tokens": 232823524.0, "sample_num_tokens": 9766.0, "step": 9361, "total_num_tokens": 232862588.0, "z_loss": 0.0004248647019267082 }, { "copy_logits_max": -4.887916564941406, "copy_logits_min": -750000000.0, "copy_num_tokens": 762.3125, "epoch": 1.9119223895838653, "gen_logits_max": 2.582245349884033, "gen_logits_mean": -18.761581420898438, "gen_logits_min": -31.193592071533203, "gen_logits_std": 3.5586202144622803, "gen_loss": 0.24987822771072388, "grad_norm": 0.3469752204573439, "learning_rate": 1.9104000000000002e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9968155026435852, "mean_gen_accuracy": 0.876882404088974, "mean_token_accuracy": 0.9043085873126984, "num_tokens": 233079488.0, "sample_num_tokens": 10753.0, "step": 9362, "total_num_tokens": 233122500.0, "z_loss": 0.0003657803754322231 }, { "copy_logits_max": -3.715681791305542, "copy_logits_min": -750000000.0, "copy_num_tokens": 697.0625, "epoch": 1.912126627521062, "gen_logits_max": 3.3425345420837402, "gen_logits_mean": -17.821849822998047, "gen_logits_min": -30.46431541442871, "gen_logits_std": 3.531036615371704, "gen_loss": 0.24428841471672058, "grad_norm": 0.34871494219239346, "learning_rate": 1.9102736842105263e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9974788129329681, "mean_gen_accuracy": 0.8680534511804581, "mean_token_accuracy": 0.9013881236314774, "num_tokens": 233372761.0, "sample_num_tokens": 10315.75, "step": 9363, "total_num_tokens": 233414024.0, "z_loss": 0.0003700712404679507 }, { "copy_logits_max": -4.089504241943359, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.125, "epoch": 1.912330865458259, "gen_logits_max": 5.239041328430176, "gen_logits_mean": -16.365083694458008, "gen_logits_min": -28.90985870361328, "gen_logits_std": 3.4411213397979736, "gen_loss": 0.27191486954689026, "grad_norm": 0.3546548397634645, "learning_rate": 1.9101473684210527e-05, "loss": 0.2805, "mean_copy_accuracy": 0.9968952834606171, "mean_gen_accuracy": 0.8756198585033417, "mean_token_accuracy": 0.9059339314699173, "num_tokens": 233652218.0, "sample_num_tokens": 8203.0, "step": 9364, "total_num_tokens": 233685030.0, "z_loss": 0.000408653897466138 }, { "copy_logits_max": -2.922797203063965, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.375, "epoch": 1.9125351033954558, "gen_logits_max": 4.145910739898682, "gen_logits_mean": -16.52159309387207, "gen_logits_min": -29.46380615234375, "gen_logits_std": 3.4426426887512207, "gen_loss": 0.26729893684387207, "grad_norm": 0.37755444864614623, "learning_rate": 1.9100210526315788e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9962425082921982, "mean_gen_accuracy": 0.8828768879175186, "mean_token_accuracy": 0.9107752740383148, "num_tokens": 233913137.0, "sample_num_tokens": 9073.25, "step": 9365, "total_num_tokens": 233949430.0, "z_loss": 0.0004355207202024758 }, { "copy_logits_max": -5.559333801269531, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.4375, "epoch": 1.9127393413326526, "gen_logits_max": 2.0865554809570312, "gen_logits_mean": -19.8558349609375, "gen_logits_min": -32.17292022705078, "gen_logits_std": 3.6001033782958984, "gen_loss": 0.27424904704093933, "grad_norm": 0.3262716217089943, "learning_rate": 1.9098947368421053e-05, "loss": 0.2438, "mean_copy_accuracy": 0.9967322796583176, "mean_gen_accuracy": 0.8844264596700668, "mean_token_accuracy": 0.9160684645175934, "num_tokens": 234195256.0, "sample_num_tokens": 8433.5, "step": 9366, "total_num_tokens": 234228990.0, "z_loss": 0.00038222724106162786 }, { "copy_logits_max": -3.930997848510742, "copy_logits_min": -750000128.0, "copy_num_tokens": 595.6875, "epoch": 1.9129435792698493, "gen_logits_max": 3.878575325012207, "gen_logits_mean": -17.263229370117188, "gen_logits_min": -29.73066520690918, "gen_logits_std": 3.477724075317383, "gen_loss": 0.25225088000297546, "grad_norm": 0.3617264653817308, "learning_rate": 1.9097684210526313e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9971408694982529, "mean_gen_accuracy": 0.8705523014068604, "mean_token_accuracy": 0.9041760563850403, "num_tokens": 234470122.0, "sample_num_tokens": 8921.0, "step": 9367, "total_num_tokens": 234505806.0, "z_loss": 0.0004048410919494927 }, { "copy_logits_max": -4.741968154907227, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.375, "epoch": 1.9131478172070462, "gen_logits_max": 3.693284273147583, "gen_logits_mean": -17.779422760009766, "gen_logits_min": -29.990835189819336, "gen_logits_std": 3.4815220832824707, "gen_loss": 0.2691017985343933, "grad_norm": 0.35026334375027074, "learning_rate": 1.909642105263158e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9963904023170471, "mean_gen_accuracy": 0.8794360309839249, "mean_token_accuracy": 0.9081143885850906, "num_tokens": 234739902.0, "sample_num_tokens": 8789.5, "step": 9368, "total_num_tokens": 234775060.0, "z_loss": 0.00038567627780139446 }, { "copy_logits_max": -5.1588029861450195, "copy_logits_min": -687500032.0, "copy_num_tokens": 389.125, "epoch": 1.9133520551442431, "gen_logits_max": 3.7174792289733887, "gen_logits_mean": -17.71135711669922, "gen_logits_min": -29.998571395874023, "gen_logits_std": 3.4612600803375244, "gen_loss": 0.2822473645210266, "grad_norm": 0.35469363948017524, "learning_rate": 1.9095157894736842e-05, "loss": 0.2794, "mean_copy_accuracy": 0.9964696168899536, "mean_gen_accuracy": 0.8766183406114578, "mean_token_accuracy": 0.9034249186515808, "num_tokens": 235004648.0, "sample_num_tokens": 7388.0, "step": 9369, "total_num_tokens": 235034200.0, "z_loss": 0.0004238103865645826 }, { "copy_logits_max": -5.93727445602417, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.625, "epoch": 1.9135562930814398, "gen_logits_max": 3.5942437648773193, "gen_logits_mean": -17.785585403442383, "gen_logits_min": -29.815399169921875, "gen_logits_std": 3.4696707725524902, "gen_loss": 0.2820543646812439, "grad_norm": 0.3479018031863463, "learning_rate": 1.9093894736842106e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9971756190061569, "mean_gen_accuracy": 0.8784413486719131, "mean_token_accuracy": 0.9049483835697174, "num_tokens": 235261855.0, "sample_num_tokens": 8342.75, "step": 9370, "total_num_tokens": 235295226.0, "z_loss": 0.00040553772123530507 }, { "copy_logits_max": -6.195192337036133, "copy_logits_min": -687500032.0, "copy_num_tokens": 597.5, "epoch": 1.9137605310186367, "gen_logits_max": 1.5504775047302246, "gen_logits_mean": -20.27291488647461, "gen_logits_min": -32.47344970703125, "gen_logits_std": 3.6345767974853516, "gen_loss": 0.2291317880153656, "grad_norm": 0.34961698337061614, "learning_rate": 1.909263157894737e-05, "loss": 0.263, "mean_copy_accuracy": 0.9966956377029419, "mean_gen_accuracy": 0.8797540962696075, "mean_token_accuracy": 0.909483939409256, "num_tokens": 235545468.0, "sample_num_tokens": 9862.5, "step": 9371, "total_num_tokens": 235584918.0, "z_loss": 0.0003168540133628994 }, { "copy_logits_max": -5.596678733825684, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.625, "epoch": 1.9139647689558337, "gen_logits_max": 2.619955539703369, "gen_logits_mean": -18.475831985473633, "gen_logits_min": -30.554590225219727, "gen_logits_std": 3.5384206771850586, "gen_loss": 0.23821108043193817, "grad_norm": 0.41404796640477354, "learning_rate": 1.909136842105263e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9973675310611725, "mean_gen_accuracy": 0.8776360601186752, "mean_token_accuracy": 0.9096933901309967, "num_tokens": 235825983.0, "sample_num_tokens": 8097.25, "step": 9372, "total_num_tokens": 235858372.0, "z_loss": 0.0003299187810625881 }, { "copy_logits_max": -6.1704277992248535, "copy_logits_min": -687500032.0, "copy_num_tokens": 471.1875, "epoch": 1.9141690068930304, "gen_logits_max": 3.9254794120788574, "gen_logits_mean": -17.144004821777344, "gen_logits_min": -29.38567543029785, "gen_logits_std": 3.425230026245117, "gen_loss": 0.30907613039016724, "grad_norm": 0.339148016626628, "learning_rate": 1.9090105263157896e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9970039874315262, "mean_gen_accuracy": 0.8825740814208984, "mean_token_accuracy": 0.9096166640520096, "num_tokens": 236102838.0, "sample_num_tokens": 8804.0, "step": 9373, "total_num_tokens": 236138054.0, "z_loss": 0.00046823685988783836 }, { "copy_logits_max": -6.610989093780518, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.25, "epoch": 1.914373244830227, "gen_logits_max": 3.5342488288879395, "gen_logits_mean": -18.156627655029297, "gen_logits_min": -30.277721405029297, "gen_logits_std": 3.4813194274902344, "gen_loss": 0.28103411197662354, "grad_norm": 0.35627199581088675, "learning_rate": 1.9088842105263157e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9964746683835983, "mean_gen_accuracy": 0.8763545006513596, "mean_token_accuracy": 0.9043304771184921, "num_tokens": 236371129.0, "sample_num_tokens": 8239.75, "step": 9374, "total_num_tokens": 236404088.0, "z_loss": 0.00042539843707345426 }, { "copy_logits_max": -5.613348007202148, "copy_logits_min": -687500032.0, "copy_num_tokens": 429.9375, "epoch": 1.914577482767424, "gen_logits_max": 2.089749813079834, "gen_logits_mean": -19.735523223876953, "gen_logits_min": -31.725982666015625, "gen_logits_std": 3.5990824699401855, "gen_loss": 0.24354156851768494, "grad_norm": 0.36127909561744803, "learning_rate": 1.908757894736842e-05, "loss": 0.271, "mean_copy_accuracy": 0.9975166767835617, "mean_gen_accuracy": 0.8770043700933456, "mean_token_accuracy": 0.9071447849273682, "num_tokens": 236644803.0, "sample_num_tokens": 7806.25, "step": 9375, "total_num_tokens": 236676028.0, "z_loss": 0.0003395232488401234 }, { "copy_logits_max": -6.0028157234191895, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.6875, "epoch": 1.914781720704621, "gen_logits_max": 3.4491443634033203, "gen_logits_mean": -18.299583435058594, "gen_logits_min": -30.16021156311035, "gen_logits_std": 3.4738388061523438, "gen_loss": 0.3306228220462799, "grad_norm": 0.38057862606062515, "learning_rate": 1.9086315789473686e-05, "loss": 0.3092, "mean_copy_accuracy": 0.996075227856636, "mean_gen_accuracy": 0.8651310950517654, "mean_token_accuracy": 0.8959652185440063, "num_tokens": 236900858.0, "sample_num_tokens": 8132.0, "step": 9376, "total_num_tokens": 236933386.0, "z_loss": 0.0004451961431186646 }, { "copy_logits_max": -6.114272117614746, "copy_logits_min": -687500032.0, "copy_num_tokens": 335.4375, "epoch": 1.9149859586418176, "gen_logits_max": 3.372467041015625, "gen_logits_mean": -18.090003967285156, "gen_logits_min": -30.17765235900879, "gen_logits_std": 3.5016989707946777, "gen_loss": 0.27168047428131104, "grad_norm": 0.37781301460593686, "learning_rate": 1.908505263157895e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9966226667165756, "mean_gen_accuracy": 0.8769522905349731, "mean_token_accuracy": 0.9080084711313248, "num_tokens": 237160376.0, "sample_num_tokens": 7271.5, "step": 9377, "total_num_tokens": 237189462.0, "z_loss": 0.0004088411224074662 }, { "copy_logits_max": -5.587376117706299, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.1875, "epoch": 1.9151901965790146, "gen_logits_max": 3.1078386306762695, "gen_logits_mean": -18.08502960205078, "gen_logits_min": -30.094615936279297, "gen_logits_std": 3.4936070442199707, "gen_loss": 0.27497339248657227, "grad_norm": 0.3253896297499414, "learning_rate": 1.908378947368421e-05, "loss": 0.2756, "mean_copy_accuracy": 0.996890977025032, "mean_gen_accuracy": 0.8755673468112946, "mean_token_accuracy": 0.9059111326932907, "num_tokens": 237431425.0, "sample_num_tokens": 7156.25, "step": 9378, "total_num_tokens": 237460050.0, "z_loss": 0.0003759502142202109 }, { "copy_logits_max": -3.684083938598633, "copy_logits_min": -750000000.0, "copy_num_tokens": 518.125, "epoch": 1.9153944345162115, "gen_logits_max": 3.473330020904541, "gen_logits_mean": -17.018747329711914, "gen_logits_min": -29.618732452392578, "gen_logits_std": 3.4595861434936523, "gen_loss": 0.2790713608264923, "grad_norm": 0.3494703509113822, "learning_rate": 1.9082526315789475e-05, "loss": 0.2719, "mean_copy_accuracy": 0.9969297498464584, "mean_gen_accuracy": 0.8826365023851395, "mean_token_accuracy": 0.9075829535722733, "num_tokens": 237688233.0, "sample_num_tokens": 8829.25, "step": 9379, "total_num_tokens": 237723550.0, "z_loss": 0.0004997748183086514 }, { "copy_logits_max": -4.573411464691162, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.9375, "epoch": 1.9155986724534082, "gen_logits_max": 3.6427979469299316, "gen_logits_mean": -16.261320114135742, "gen_logits_min": -28.545673370361328, "gen_logits_std": 3.4172654151916504, "gen_loss": 0.28918322920799255, "grad_norm": 0.4432304177264898, "learning_rate": 1.9081263157894736e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9971344470977783, "mean_gen_accuracy": 0.8750124275684357, "mean_token_accuracy": 0.9032824486494064, "num_tokens": 237954147.0, "sample_num_tokens": 7803.75, "step": 9380, "total_num_tokens": 237985362.0, "z_loss": 0.000446740014012903 }, { "copy_logits_max": -3.2295141220092773, "copy_logits_min": -750000000.0, "copy_num_tokens": 370.0625, "epoch": 1.915802910390605, "gen_logits_max": 4.303646564483643, "gen_logits_mean": -16.20327377319336, "gen_logits_min": -28.199661254882812, "gen_logits_std": 3.4158480167388916, "gen_loss": 0.2800862789154053, "grad_norm": 0.38766722604187487, "learning_rate": 1.908e-05, "loss": 0.2817, "mean_copy_accuracy": 0.99737448990345, "mean_gen_accuracy": 0.8742330521345139, "mean_token_accuracy": 0.9049917906522751, "num_tokens": 238236251.0, "sample_num_tokens": 7781.75, "step": 9381, "total_num_tokens": 238267378.0, "z_loss": 0.0004785913915839046 }, { "copy_logits_max": -1.9913920164108276, "copy_logits_min": -687500096.0, "copy_num_tokens": 750.0625, "epoch": 1.9160071483278018, "gen_logits_max": 3.3633837699890137, "gen_logits_mean": -16.427400588989258, "gen_logits_min": -28.591777801513672, "gen_logits_std": 3.44431471824646, "gen_loss": 0.24722281098365784, "grad_norm": 0.3631779077059088, "learning_rate": 1.907873684210526e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9959742575883865, "mean_gen_accuracy": 0.8813086748123169, "mean_token_accuracy": 0.911566287279129, "num_tokens": 238503787.0, "sample_num_tokens": 9826.25, "step": 9382, "total_num_tokens": 238543092.0, "z_loss": 0.00039621361065655947 }, { "copy_logits_max": -2.663930892944336, "copy_logits_min": -750000064.0, "copy_num_tokens": 447.4375, "epoch": 1.9162113862649988, "gen_logits_max": 2.964054822921753, "gen_logits_mean": -16.144466400146484, "gen_logits_min": -28.43734359741211, "gen_logits_std": 3.422003746032715, "gen_loss": 0.22669091820716858, "grad_norm": 0.3259144707933101, "learning_rate": 1.9077473684210526e-05, "loss": 0.2398, "mean_copy_accuracy": 0.997254952788353, "mean_gen_accuracy": 0.8892852663993835, "mean_token_accuracy": 0.9155663251876831, "num_tokens": 238781698.0, "sample_num_tokens": 6731.0, "step": 9383, "total_num_tokens": 238808622.0, "z_loss": 0.00036731266300193965 }, { "copy_logits_max": -3.6320855617523193, "copy_logits_min": -750000000.0, "copy_num_tokens": 678.5, "epoch": 1.9164156242021955, "gen_logits_max": 3.2695822715759277, "gen_logits_mean": -16.552135467529297, "gen_logits_min": -28.791240692138672, "gen_logits_std": 3.4502739906311035, "gen_loss": 0.25528788566589355, "grad_norm": 0.32801371790164047, "learning_rate": 1.9076210526315793e-05, "loss": 0.262, "mean_copy_accuracy": 0.996744379401207, "mean_gen_accuracy": 0.8778578788042068, "mean_token_accuracy": 0.909202516078949, "num_tokens": 239058140.0, "sample_num_tokens": 9739.0, "step": 9384, "total_num_tokens": 239097096.0, "z_loss": 0.00039851671317592263 }, { "copy_logits_max": -4.423368453979492, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.4375, "epoch": 1.9166198621393924, "gen_logits_max": 3.495696544647217, "gen_logits_mean": -18.158065795898438, "gen_logits_min": -30.186338424682617, "gen_logits_std": 3.498722553253174, "gen_loss": 0.3016965985298157, "grad_norm": 0.3954098355304098, "learning_rate": 1.9074947368421054e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9965064972639084, "mean_gen_accuracy": 0.8744013458490372, "mean_token_accuracy": 0.9025347381830215, "num_tokens": 239327686.0, "sample_num_tokens": 9024.0, "step": 9385, "total_num_tokens": 239363782.0, "z_loss": 0.0004863038193434477 }, { "copy_logits_max": -2.3483834266662598, "copy_logits_min": -750000064.0, "copy_num_tokens": 508.6875, "epoch": 1.9168241000765893, "gen_logits_max": 3.1481504440307617, "gen_logits_mean": -17.395301818847656, "gen_logits_min": -29.716815948486328, "gen_logits_std": 3.468369960784912, "gen_loss": 0.2674073576927185, "grad_norm": 0.3646101171349397, "learning_rate": 1.907368421052632e-05, "loss": 0.2757, "mean_copy_accuracy": 0.9976534396409988, "mean_gen_accuracy": 0.8743064403533936, "mean_token_accuracy": 0.9063086360692978, "num_tokens": 239605289.0, "sample_num_tokens": 8508.75, "step": 9386, "total_num_tokens": 239639324.0, "z_loss": 0.0004147935542277992 }, { "copy_logits_max": -5.585214614868164, "copy_logits_min": -750000064.0, "copy_num_tokens": 523.125, "epoch": 1.917028338013786, "gen_logits_max": 3.7389395236968994, "gen_logits_mean": -17.59671401977539, "gen_logits_min": -29.305124282836914, "gen_logits_std": 3.4641575813293457, "gen_loss": 0.3003925681114197, "grad_norm": 0.3654693796186688, "learning_rate": 1.907242105263158e-05, "loss": 0.2789, "mean_copy_accuracy": 0.9972300231456757, "mean_gen_accuracy": 0.874374508857727, "mean_token_accuracy": 0.9046248197555542, "num_tokens": 239871629.0, "sample_num_tokens": 10138.25, "step": 9387, "total_num_tokens": 239912182.0, "z_loss": 0.0004368757945485413 }, { "copy_logits_max": -4.756808280944824, "copy_logits_min": -750000064.0, "copy_num_tokens": 395.75, "epoch": 1.9172325759509827, "gen_logits_max": 3.290828227996826, "gen_logits_mean": -17.594112396240234, "gen_logits_min": -29.908174514770508, "gen_logits_std": 3.4703855514526367, "gen_loss": 0.2659430205821991, "grad_norm": 0.3962919061917075, "learning_rate": 1.9071157894736844e-05, "loss": 0.29, "mean_copy_accuracy": 0.9966198205947876, "mean_gen_accuracy": 0.8718679696321487, "mean_token_accuracy": 0.9017887711524963, "num_tokens": 240139980.0, "sample_num_tokens": 7655.5, "step": 9388, "total_num_tokens": 240170602.0, "z_loss": 0.0004105315892957151 }, { "copy_logits_max": -2.3370957374572754, "copy_logits_min": -750000000.0, "copy_num_tokens": 540.5, "epoch": 1.9174368138881799, "gen_logits_max": 3.6342639923095703, "gen_logits_mean": -15.642119407653809, "gen_logits_min": -27.953210830688477, "gen_logits_std": 3.4038357734680176, "gen_loss": 0.27779215574264526, "grad_norm": 0.35329774224753746, "learning_rate": 1.9069894736842105e-05, "loss": 0.274, "mean_copy_accuracy": 0.9966607093811035, "mean_gen_accuracy": 0.8792898654937744, "mean_token_accuracy": 0.9057401120662689, "num_tokens": 240401246.0, "sample_num_tokens": 8163.5, "step": 9389, "total_num_tokens": 240433900.0, "z_loss": 0.00042165617924183607 }, { "copy_logits_max": -0.3640100061893463, "copy_logits_min": -687500032.0, "copy_num_tokens": 613.375, "epoch": 1.9176410518253766, "gen_logits_max": 3.880190849304199, "gen_logits_mean": -15.741182327270508, "gen_logits_min": -28.28061294555664, "gen_logits_std": 3.4179811477661133, "gen_loss": 0.23472702503204346, "grad_norm": 0.3215145774946518, "learning_rate": 1.906863157894737e-05, "loss": 0.2503, "mean_copy_accuracy": 0.9976300597190857, "mean_gen_accuracy": 0.8856127858161926, "mean_token_accuracy": 0.9157605320215225, "num_tokens": 240690176.0, "sample_num_tokens": 9654.0, "step": 9390, "total_num_tokens": 240728792.0, "z_loss": 0.0003939856542274356 }, { "copy_logits_max": -2.203608512878418, "copy_logits_min": -750000000.0, "copy_num_tokens": 420.125, "epoch": 1.9178452897625733, "gen_logits_max": 3.7430520057678223, "gen_logits_mean": -16.787824630737305, "gen_logits_min": -29.154565811157227, "gen_logits_std": 3.459986925125122, "gen_loss": 0.23292429745197296, "grad_norm": 0.33198721343747495, "learning_rate": 1.906736842105263e-05, "loss": 0.2335, "mean_copy_accuracy": 0.9977407455444336, "mean_gen_accuracy": 0.8898055255413055, "mean_token_accuracy": 0.9204668253660202, "num_tokens": 240987311.0, "sample_num_tokens": 7717.25, "step": 9391, "total_num_tokens": 241018180.0, "z_loss": 0.00036625683424063027 }, { "copy_logits_max": -2.954577684402466, "copy_logits_min": -750000000.0, "copy_num_tokens": 398.75, "epoch": 1.9180495276997702, "gen_logits_max": 3.579254627227783, "gen_logits_mean": -17.499534606933594, "gen_logits_min": -29.609636306762695, "gen_logits_std": 3.4738306999206543, "gen_loss": 0.27634361386299133, "grad_norm": 0.33786668340140136, "learning_rate": 1.9066105263157894e-05, "loss": 0.2671, "mean_copy_accuracy": 0.996888741850853, "mean_gen_accuracy": 0.8797519356012344, "mean_token_accuracy": 0.9103293269872665, "num_tokens": 241253094.0, "sample_num_tokens": 8010.0, "step": 9392, "total_num_tokens": 241285134.0, "z_loss": 0.00042394374031573534 }, { "copy_logits_max": -4.730531215667725, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.125, "epoch": 1.9182537656369671, "gen_logits_max": 3.662670612335205, "gen_logits_mean": -18.01861572265625, "gen_logits_min": -29.897613525390625, "gen_logits_std": 3.4869234561920166, "gen_loss": 0.24639257788658142, "grad_norm": 0.3210110382527359, "learning_rate": 1.906484210526316e-05, "loss": 0.2402, "mean_copy_accuracy": 0.9971393048763275, "mean_gen_accuracy": 0.8879853785037994, "mean_token_accuracy": 0.916658952832222, "num_tokens": 241526225.0, "sample_num_tokens": 7921.25, "step": 9393, "total_num_tokens": 241557910.0, "z_loss": 0.0004001818015240133 }, { "copy_logits_max": -1.9551550149917603, "copy_logits_min": -750000000.0, "copy_num_tokens": 683.9375, "epoch": 1.9184580035741639, "gen_logits_max": 3.638317346572876, "gen_logits_mean": -16.154529571533203, "gen_logits_min": -28.447364807128906, "gen_logits_std": 3.419212818145752, "gen_loss": 0.2680100202560425, "grad_norm": 0.3156093909260527, "learning_rate": 1.9063578947368423e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9977164417505264, "mean_gen_accuracy": 0.8749033659696579, "mean_token_accuracy": 0.9100720286369324, "num_tokens": 241823862.0, "sample_num_tokens": 10327.0, "step": 9394, "total_num_tokens": 241865170.0, "z_loss": 0.00043889242806471884 }, { "copy_logits_max": -2.187394142150879, "copy_logits_min": -750000128.0, "copy_num_tokens": 484.6875, "epoch": 1.9186622415113608, "gen_logits_max": 3.8512346744537354, "gen_logits_mean": -16.76636505126953, "gen_logits_min": -28.921649932861328, "gen_logits_std": 3.419093132019043, "gen_loss": 0.2572785019874573, "grad_norm": 0.33904310493005674, "learning_rate": 1.9062315789473684e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9954181015491486, "mean_gen_accuracy": 0.8872661292552948, "mean_token_accuracy": 0.9082077592611313, "num_tokens": 242089862.0, "sample_num_tokens": 8552.5, "step": 9395, "total_num_tokens": 242124072.0, "z_loss": 0.0003990654950030148 }, { "copy_logits_max": -0.7593238353729248, "copy_logits_min": -750000000.0, "copy_num_tokens": 575.4375, "epoch": 1.9188664794485577, "gen_logits_max": 4.18714714050293, "gen_logits_mean": -15.740119934082031, "gen_logits_min": -28.02308464050293, "gen_logits_std": 3.423818588256836, "gen_loss": 0.2285444438457489, "grad_norm": 0.3289801191839008, "learning_rate": 1.9061052631578948e-05, "loss": 0.2498, "mean_copy_accuracy": 0.997110903263092, "mean_gen_accuracy": 0.8854076266288757, "mean_token_accuracy": 0.9151417315006256, "num_tokens": 242351369.0, "sample_num_tokens": 8634.25, "step": 9396, "total_num_tokens": 242385906.0, "z_loss": 0.00035706680500879884 }, { "copy_logits_max": -2.4956071376800537, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.125, "epoch": 1.9190707173857544, "gen_logits_max": 4.015700817108154, "gen_logits_mean": -16.839492797851562, "gen_logits_min": -28.754112243652344, "gen_logits_std": 3.439610481262207, "gen_loss": 0.26887279748916626, "grad_norm": 0.35746399474595675, "learning_rate": 1.9059789473684212e-05, "loss": 0.287, "mean_copy_accuracy": 0.9969219267368317, "mean_gen_accuracy": 0.871658980846405, "mean_token_accuracy": 0.901298314332962, "num_tokens": 242613987.0, "sample_num_tokens": 7813.25, "step": 9397, "total_num_tokens": 242645240.0, "z_loss": 0.0004322953463997692 }, { "copy_logits_max": -0.8914238214492798, "copy_logits_min": -687500032.0, "copy_num_tokens": 604.1875, "epoch": 1.9192749553229511, "gen_logits_max": 3.4080142974853516, "gen_logits_mean": -17.05382537841797, "gen_logits_min": -29.232864379882812, "gen_logits_std": 3.4898409843444824, "gen_loss": 0.24709755182266235, "grad_norm": 0.34729290717536737, "learning_rate": 1.9058526315789473e-05, "loss": 0.2566, "mean_copy_accuracy": 0.9979400038719177, "mean_gen_accuracy": 0.8850369900465012, "mean_token_accuracy": 0.9158209711313248, "num_tokens": 242882099.0, "sample_num_tokens": 9601.25, "step": 9398, "total_num_tokens": 242920504.0, "z_loss": 0.00039184678462333977 }, { "copy_logits_max": -5.751193046569824, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.75, "epoch": 1.919479193260148, "gen_logits_max": 3.0209240913391113, "gen_logits_mean": -18.78420639038086, "gen_logits_min": -30.753257751464844, "gen_logits_std": 3.5121545791625977, "gen_loss": 0.2823876738548279, "grad_norm": 0.3690775913030065, "learning_rate": 1.9057263157894738e-05, "loss": 0.2747, "mean_copy_accuracy": 0.9974357783794403, "mean_gen_accuracy": 0.8777805715799332, "mean_token_accuracy": 0.9054654091596603, "num_tokens": 243131717.0, "sample_num_tokens": 8093.25, "step": 9399, "total_num_tokens": 243164090.0, "z_loss": 0.0004284429596737027 }, { "copy_logits_max": -3.0903563499450684, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.0, "epoch": 1.919683431197345, "gen_logits_max": 5.036471366882324, "gen_logits_mean": -15.031623840332031, "gen_logits_min": -27.038833618164062, "gen_logits_std": 3.339364528656006, "gen_loss": 0.2967265546321869, "grad_norm": 0.34021468651853276, "learning_rate": 1.9056e-05, "loss": 0.2793, "mean_copy_accuracy": 0.995643824338913, "mean_gen_accuracy": 0.8788227438926697, "mean_token_accuracy": 0.9059243351221085, "num_tokens": 243409696.0, "sample_num_tokens": 8807.0, "step": 9400, "total_num_tokens": 243444924.0, "z_loss": 0.000487400742713362 }, { "copy_logits_max": -1.5858592987060547, "copy_logits_min": -750000000.0, "copy_num_tokens": 543.0, "epoch": 1.9198876691345417, "gen_logits_max": 3.75911021232605, "gen_logits_mean": -16.765411376953125, "gen_logits_min": -28.47870635986328, "gen_logits_std": 3.429227113723755, "gen_loss": 0.2698577344417572, "grad_norm": 0.32786476084106586, "learning_rate": 1.9054736842105266e-05, "loss": 0.2639, "mean_copy_accuracy": 0.9979802072048187, "mean_gen_accuracy": 0.8729823529720306, "mean_token_accuracy": 0.9088643193244934, "num_tokens": 243701749.0, "sample_num_tokens": 9300.75, "step": 9401, "total_num_tokens": 243738952.0, "z_loss": 0.0003831009962595999 }, { "copy_logits_max": -2.5018012523651123, "copy_logits_min": -750000000.0, "copy_num_tokens": 753.6875, "epoch": 1.9200919070717386, "gen_logits_max": 2.990826368331909, "gen_logits_mean": -16.589933395385742, "gen_logits_min": -28.599674224853516, "gen_logits_std": 3.456017017364502, "gen_loss": 0.2484292834997177, "grad_norm": 0.30030642241670547, "learning_rate": 1.9053473684210527e-05, "loss": 0.2503, "mean_copy_accuracy": 0.9977705031633377, "mean_gen_accuracy": 0.8786146193742752, "mean_token_accuracy": 0.9133054912090302, "num_tokens": 243994666.0, "sample_num_tokens": 9966.5, "step": 9402, "total_num_tokens": 244034532.0, "z_loss": 0.00037051542312838137 }, { "copy_logits_max": -0.31572508811950684, "copy_logits_min": -750000000.0, "copy_num_tokens": 661.4375, "epoch": 1.9202961450089355, "gen_logits_max": 3.6791491508483887, "gen_logits_mean": -15.548290252685547, "gen_logits_min": -27.902395248413086, "gen_logits_std": 3.4082541465759277, "gen_loss": 0.2605109214782715, "grad_norm": 0.3542143629221971, "learning_rate": 1.905221052631579e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9964231997728348, "mean_gen_accuracy": 0.8796216249465942, "mean_token_accuracy": 0.908010795712471, "num_tokens": 244264816.0, "sample_num_tokens": 9924.5, "step": 9403, "total_num_tokens": 244304514.0, "z_loss": 0.0003818366676568985 }, { "copy_logits_max": -2.661518096923828, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.5625, "epoch": 1.9205003829461322, "gen_logits_max": 4.100714206695557, "gen_logits_mean": -15.33905029296875, "gen_logits_min": -28.067485809326172, "gen_logits_std": 3.392327070236206, "gen_loss": 0.27256909012794495, "grad_norm": 0.3614963462847791, "learning_rate": 1.9050947368421052e-05, "loss": 0.2837, "mean_copy_accuracy": 0.996559202671051, "mean_gen_accuracy": 0.8727925419807434, "mean_token_accuracy": 0.9033903181552887, "num_tokens": 244542933.0, "sample_num_tokens": 7815.75, "step": 9404, "total_num_tokens": 244574196.0, "z_loss": 0.0004149405285716057 }, { "copy_logits_max": -2.818948745727539, "copy_logits_min": -750000000.0, "copy_num_tokens": 247.5, "epoch": 1.920704620883329, "gen_logits_max": 4.881838798522949, "gen_logits_mean": -16.872421264648438, "gen_logits_min": -28.899303436279297, "gen_logits_std": 3.457616090774536, "gen_loss": 0.2659001350402832, "grad_norm": 0.3758954767889213, "learning_rate": 1.9049684210526317e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9960566908121109, "mean_gen_accuracy": 0.8846865445375443, "mean_token_accuracy": 0.9037114828824997, "num_tokens": 244784502.0, "sample_num_tokens": 6823.5, "step": 9405, "total_num_tokens": 244811796.0, "z_loss": 0.0003801684360951185 }, { "copy_logits_max": -1.2829697132110596, "copy_logits_min": -750000000.0, "copy_num_tokens": 732.6875, "epoch": 1.9209088588205259, "gen_logits_max": 2.045342445373535, "gen_logits_mean": -18.23000717163086, "gen_logits_min": -30.988054275512695, "gen_logits_std": 3.5694096088409424, "gen_loss": 0.2371470034122467, "grad_norm": 0.32249965566940675, "learning_rate": 1.9048421052631578e-05, "loss": 0.2566, "mean_copy_accuracy": 0.9974348694086075, "mean_gen_accuracy": 0.8846250474452972, "mean_token_accuracy": 0.913191556930542, "num_tokens": 245058993.0, "sample_num_tokens": 10601.75, "step": 9406, "total_num_tokens": 245101400.0, "z_loss": 0.0003729186428245157 }, { "copy_logits_max": -4.530425071716309, "copy_logits_min": -750000128.0, "copy_num_tokens": 527.0, "epoch": 1.9211130967577228, "gen_logits_max": 2.1671571731567383, "gen_logits_mean": -18.71533203125, "gen_logits_min": -30.912921905517578, "gen_logits_std": 3.56888484954834, "gen_loss": 0.24189099669456482, "grad_norm": 0.3560217125129157, "learning_rate": 1.9047157894736842e-05, "loss": 0.2667, "mean_copy_accuracy": 0.9967359453439713, "mean_gen_accuracy": 0.8774912059307098, "mean_token_accuracy": 0.9082684516906738, "num_tokens": 245331878.0, "sample_num_tokens": 8541.0, "step": 9407, "total_num_tokens": 245366042.0, "z_loss": 0.00033315911423414946 }, { "copy_logits_max": -0.17454612255096436, "copy_logits_min": -750000000.0, "copy_num_tokens": 617.875, "epoch": 1.9213173346949195, "gen_logits_max": 3.478715419769287, "gen_logits_mean": -15.681840896606445, "gen_logits_min": -28.122661590576172, "gen_logits_std": 3.443852424621582, "gen_loss": 0.24016320705413818, "grad_norm": 0.36065373080398755, "learning_rate": 1.9045894736842103e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9970716685056686, "mean_gen_accuracy": 0.8759625107049942, "mean_token_accuracy": 0.9053275436162949, "num_tokens": 245587361.0, "sample_num_tokens": 9035.25, "step": 9408, "total_num_tokens": 245623502.0, "z_loss": 0.0003724960843101144 }, { "copy_logits_max": -3.497653007507324, "copy_logits_min": -750000000.0, "copy_num_tokens": 589.5625, "epoch": 1.9215215726321164, "gen_logits_max": 2.6098415851593018, "gen_logits_mean": -18.220417022705078, "gen_logits_min": -30.396108627319336, "gen_logits_std": 3.530897378921509, "gen_loss": 0.29618215560913086, "grad_norm": 0.34271634959128155, "learning_rate": 1.904463157894737e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9973562806844711, "mean_gen_accuracy": 0.865739181637764, "mean_token_accuracy": 0.9006721824407578, "num_tokens": 245869780.0, "sample_num_tokens": 9523.0, "step": 9409, "total_num_tokens": 245907872.0, "z_loss": 0.00040410124347545207 }, { "copy_logits_max": -2.9435532093048096, "copy_logits_min": -750000064.0, "copy_num_tokens": 441.5, "epoch": 1.9217258105693134, "gen_logits_max": 4.046725749969482, "gen_logits_mean": -16.668411254882812, "gen_logits_min": -29.47808074951172, "gen_logits_std": 3.439739227294922, "gen_loss": 0.30940595269203186, "grad_norm": 0.36988388129081456, "learning_rate": 1.904336842105263e-05, "loss": 0.3085, "mean_copy_accuracy": 0.9964413046836853, "mean_gen_accuracy": 0.8692125231027603, "mean_token_accuracy": 0.8951861411333084, "num_tokens": 246129961.0, "sample_num_tokens": 9215.25, "step": 9410, "total_num_tokens": 246166822.0, "z_loss": 0.00041175264050252736 }, { "copy_logits_max": -3.875011920928955, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.4375, "epoch": 1.92193004850651, "gen_logits_max": 3.429985523223877, "gen_logits_mean": -17.854671478271484, "gen_logits_min": -30.098703384399414, "gen_logits_std": 3.489532947540283, "gen_loss": 0.30114269256591797, "grad_norm": 0.3521900318534935, "learning_rate": 1.9042105263157896e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9962757378816605, "mean_gen_accuracy": 0.8775103688240051, "mean_token_accuracy": 0.9053248316049576, "num_tokens": 246387103.0, "sample_num_tokens": 7875.25, "step": 9411, "total_num_tokens": 246418604.0, "z_loss": 0.0004297658451832831 }, { "copy_logits_max": -3.6911942958831787, "copy_logits_min": -687500032.0, "copy_num_tokens": 665.4375, "epoch": 1.9221342864437068, "gen_logits_max": 3.9554543495178223, "gen_logits_mean": -15.512964248657227, "gen_logits_min": -28.099262237548828, "gen_logits_std": 3.4107656478881836, "gen_loss": 0.26253053545951843, "grad_norm": 0.3474912066856172, "learning_rate": 1.904084210526316e-05, "loss": 0.2711, "mean_copy_accuracy": 0.9975807517766953, "mean_gen_accuracy": 0.87579345703125, "mean_token_accuracy": 0.9087589830160141, "num_tokens": 246656891.0, "sample_num_tokens": 9860.75, "step": 9412, "total_num_tokens": 246696334.0, "z_loss": 0.0003731622127816081 }, { "copy_logits_max": 1.114618182182312, "copy_logits_min": -750000000.0, "copy_num_tokens": 498.1875, "epoch": 1.922338524380904, "gen_logits_max": 5.033423900604248, "gen_logits_mean": -13.85472297668457, "gen_logits_min": -26.26991844177246, "gen_logits_std": 3.325073719024658, "gen_loss": 0.2620331645011902, "grad_norm": 0.33045673471321035, "learning_rate": 1.903957894736842e-05, "loss": 0.2662, "mean_copy_accuracy": 0.9977481514215469, "mean_gen_accuracy": 0.8804532289505005, "mean_token_accuracy": 0.9112839102745056, "num_tokens": 246955654.0, "sample_num_tokens": 8956.5, "step": 9413, "total_num_tokens": 246991480.0, "z_loss": 0.00042808515718206763 }, { "copy_logits_max": -4.018783092498779, "copy_logits_min": -750000000.0, "copy_num_tokens": 269.0, "epoch": 1.9225427623181006, "gen_logits_max": 4.481449127197266, "gen_logits_mean": -16.344257354736328, "gen_logits_min": -28.588062286376953, "gen_logits_std": 3.4246182441711426, "gen_loss": 0.2837802767753601, "grad_norm": 0.3632032226882955, "learning_rate": 1.9038315789473685e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9969220757484436, "mean_gen_accuracy": 0.8756021559238434, "mean_token_accuracy": 0.9022265523672104, "num_tokens": 247215050.0, "sample_num_tokens": 6903.5, "step": 9414, "total_num_tokens": 247242664.0, "z_loss": 0.000411029439419508 }, { "copy_logits_max": -3.5105690956115723, "copy_logits_min": -750000000.0, "copy_num_tokens": 508.9375, "epoch": 1.9227470002552973, "gen_logits_max": 3.644768714904785, "gen_logits_mean": -16.41238021850586, "gen_logits_min": -28.948944091796875, "gen_logits_std": 3.4423577785491943, "gen_loss": 0.25688236951828003, "grad_norm": 0.3509640434534129, "learning_rate": 1.9037052631578946e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9976752549409866, "mean_gen_accuracy": 0.874506026506424, "mean_token_accuracy": 0.906261682510376, "num_tokens": 247478896.0, "sample_num_tokens": 8872.0, "step": 9415, "total_num_tokens": 247514384.0, "z_loss": 0.0003482086176518351 }, { "copy_logits_max": -4.146884918212891, "copy_logits_min": -750000000.0, "copy_num_tokens": 500.1875, "epoch": 1.9229512381924942, "gen_logits_max": 3.242722511291504, "gen_logits_mean": -17.300884246826172, "gen_logits_min": -29.612585067749023, "gen_logits_std": 3.462437391281128, "gen_loss": 0.26676642894744873, "grad_norm": 0.36376960959555377, "learning_rate": 1.903578947368421e-05, "loss": 0.2826, "mean_copy_accuracy": 0.9975896030664444, "mean_gen_accuracy": 0.8723704516887665, "mean_token_accuracy": 0.9050770848989487, "num_tokens": 247736218.0, "sample_num_tokens": 8888.0, "step": 9416, "total_num_tokens": 247771770.0, "z_loss": 0.00044015655294060707 }, { "copy_logits_max": -2.280149221420288, "copy_logits_min": -750000064.0, "copy_num_tokens": 451.625, "epoch": 1.9231554761296912, "gen_logits_max": 3.082752227783203, "gen_logits_mean": -17.369850158691406, "gen_logits_min": -30.07965850830078, "gen_logits_std": 3.4663500785827637, "gen_loss": 0.2780638635158539, "grad_norm": 0.35444825945398334, "learning_rate": 1.9034526315789475e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9978214800357819, "mean_gen_accuracy": 0.8708878755569458, "mean_token_accuracy": 0.9050642549991608, "num_tokens": 248019497.0, "sample_num_tokens": 7438.25, "step": 9417, "total_num_tokens": 248049250.0, "z_loss": 0.00044179780525155365 }, { "copy_logits_max": -0.19673430919647217, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.4375, "epoch": 1.9233597140668879, "gen_logits_max": 4.710850238800049, "gen_logits_mean": -14.53597640991211, "gen_logits_min": -27.046886444091797, "gen_logits_std": 3.3497071266174316, "gen_loss": 0.25880947709083557, "grad_norm": 0.3560126105022881, "learning_rate": 1.903326315789474e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9971889108419418, "mean_gen_accuracy": 0.8815988749265671, "mean_token_accuracy": 0.9106486439704895, "num_tokens": 248285813.0, "sample_num_tokens": 9310.75, "step": 9418, "total_num_tokens": 248323056.0, "z_loss": 0.0004415407311171293 }, { "copy_logits_max": -0.16010090708732605, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.0, "epoch": 1.9235639520040848, "gen_logits_max": 4.042342185974121, "gen_logits_mean": -14.619266510009766, "gen_logits_min": -27.294475555419922, "gen_logits_std": 3.3800363540649414, "gen_loss": 0.22719863057136536, "grad_norm": 0.3363808668838397, "learning_rate": 1.9032e-05, "loss": 0.2549, "mean_copy_accuracy": 0.997832253575325, "mean_gen_accuracy": 0.8810998946428299, "mean_token_accuracy": 0.9147424250841141, "num_tokens": 248559068.0, "sample_num_tokens": 8121.0, "step": 9419, "total_num_tokens": 248591552.0, "z_loss": 0.00038418505573645234 }, { "copy_logits_max": -1.4311355352401733, "copy_logits_min": -750000064.0, "copy_num_tokens": 515.0625, "epoch": 1.9237681899412817, "gen_logits_max": 4.42061185836792, "gen_logits_mean": -15.92945671081543, "gen_logits_min": -28.122787475585938, "gen_logits_std": 3.421257972717285, "gen_loss": 0.2631622552871704, "grad_norm": 0.31485491720981645, "learning_rate": 1.9030736842105264e-05, "loss": 0.2515, "mean_copy_accuracy": 0.9978366643190384, "mean_gen_accuracy": 0.8816449791193008, "mean_token_accuracy": 0.9146618098020554, "num_tokens": 248854391.0, "sample_num_tokens": 8607.75, "step": 9420, "total_num_tokens": 248888822.0, "z_loss": 0.0004318556748330593 }, { "copy_logits_max": -3.3997576236724854, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.1875, "epoch": 1.9239724278784784, "gen_logits_max": 4.2514166831970215, "gen_logits_mean": -16.11361312866211, "gen_logits_min": -28.100963592529297, "gen_logits_std": 3.389362096786499, "gen_loss": 0.31561440229415894, "grad_norm": 0.3546695984271382, "learning_rate": 1.9029473684210525e-05, "loss": 0.287, "mean_copy_accuracy": 0.9967331886291504, "mean_gen_accuracy": 0.8758350610733032, "mean_token_accuracy": 0.9016320407390594, "num_tokens": 249119854.0, "sample_num_tokens": 7952.5, "step": 9421, "total_num_tokens": 249151664.0, "z_loss": 0.000512847094796598 }, { "copy_logits_max": -4.993377208709717, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.125, "epoch": 1.9241766658156751, "gen_logits_max": 3.2996177673339844, "gen_logits_mean": -17.708465576171875, "gen_logits_min": -30.025020599365234, "gen_logits_std": 3.488661766052246, "gen_loss": 0.27203214168548584, "grad_norm": 0.3569667982340327, "learning_rate": 1.902821052631579e-05, "loss": 0.2877, "mean_copy_accuracy": 0.9973131269216537, "mean_gen_accuracy": 0.8739795386791229, "mean_token_accuracy": 0.9036007970571518, "num_tokens": 249388274.0, "sample_num_tokens": 8017.0, "step": 9422, "total_num_tokens": 249420342.0, "z_loss": 0.00043275958159938455 }, { "copy_logits_max": -4.326221466064453, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.6875, "epoch": 1.924380903752872, "gen_logits_max": 2.4981250762939453, "gen_logits_mean": -19.432527542114258, "gen_logits_min": -31.545429229736328, "gen_logits_std": 3.560868501663208, "gen_loss": 0.26429691910743713, "grad_norm": 0.3129615344483255, "learning_rate": 1.902694736842105e-05, "loss": 0.276, "mean_copy_accuracy": 0.9968598932027817, "mean_gen_accuracy": 0.8782386034727097, "mean_token_accuracy": 0.9062052220106125, "num_tokens": 249681721.0, "sample_num_tokens": 7173.25, "step": 9423, "total_num_tokens": 249710414.0, "z_loss": 0.000406871666200459 }, { "copy_logits_max": -4.548050403594971, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.6875, "epoch": 1.924585141690069, "gen_logits_max": 2.3257298469543457, "gen_logits_mean": -18.859874725341797, "gen_logits_min": -31.052600860595703, "gen_logits_std": 3.543154716491699, "gen_loss": 0.27131858468055725, "grad_norm": 0.34617298206058505, "learning_rate": 1.9025684210526315e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9968495666980743, "mean_gen_accuracy": 0.8773152083158493, "mean_token_accuracy": 0.9065363109111786, "num_tokens": 249947973.0, "sample_num_tokens": 9756.75, "step": 9424, "total_num_tokens": 249987000.0, "z_loss": 0.00040832115337252617 }, { "copy_logits_max": -4.101109504699707, "copy_logits_min": -750000000.0, "copy_num_tokens": 598.9375, "epoch": 1.9247893796272657, "gen_logits_max": 3.7334823608398438, "gen_logits_mean": -15.492921829223633, "gen_logits_min": -28.21923828125, "gen_logits_std": 3.414224147796631, "gen_loss": 0.24793116748332977, "grad_norm": 0.3453197056298754, "learning_rate": 1.9024421052631583e-05, "loss": 0.2715, "mean_copy_accuracy": 0.9975132644176483, "mean_gen_accuracy": 0.8792680501937866, "mean_token_accuracy": 0.908311128616333, "num_tokens": 250219391.0, "sample_num_tokens": 8986.75, "step": 9425, "total_num_tokens": 250255338.0, "z_loss": 0.00040643251850269735 }, { "copy_logits_max": -3.81614351272583, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.6875, "epoch": 1.9249936175644626, "gen_logits_max": 3.018716812133789, "gen_logits_mean": -17.854785919189453, "gen_logits_min": -29.935932159423828, "gen_logits_std": 3.5056300163269043, "gen_loss": 0.2611886262893677, "grad_norm": 0.37241916539376696, "learning_rate": 1.9023157894736843e-05, "loss": 0.2687, "mean_copy_accuracy": 0.9969967603683472, "mean_gen_accuracy": 0.8814454525709152, "mean_token_accuracy": 0.9073794633150101, "num_tokens": 250495010.0, "sample_num_tokens": 7681.5, "step": 9426, "total_num_tokens": 250525736.0, "z_loss": 0.00042198001756332815 }, { "copy_logits_max": -5.1093363761901855, "copy_logits_min": -750000064.0, "copy_num_tokens": 816.5625, "epoch": 1.9251978555016596, "gen_logits_max": 2.3739607334136963, "gen_logits_mean": -17.46176528930664, "gen_logits_min": -30.575424194335938, "gen_logits_std": 3.503793239593506, "gen_loss": 0.25093525648117065, "grad_norm": 0.3475760020569022, "learning_rate": 1.9021894736842108e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9971473217010498, "mean_gen_accuracy": 0.8710954934358597, "mean_token_accuracy": 0.9078531116247177, "num_tokens": 250770768.0, "sample_num_tokens": 10953.0, "step": 9427, "total_num_tokens": 250814580.0, "z_loss": 0.0003968581440858543 }, { "copy_logits_max": -5.76278829574585, "copy_logits_min": -750000000.0, "copy_num_tokens": 513.5625, "epoch": 1.9254020934388563, "gen_logits_max": 2.928300142288208, "gen_logits_mean": -17.67141342163086, "gen_logits_min": -30.200408935546875, "gen_logits_std": 3.4959444999694824, "gen_loss": 0.25620609521865845, "grad_norm": 0.375154307905889, "learning_rate": 1.902063157894737e-05, "loss": 0.286, "mean_copy_accuracy": 0.9970199465751648, "mean_gen_accuracy": 0.8750952780246735, "mean_token_accuracy": 0.9018058329820633, "num_tokens": 251034226.0, "sample_num_tokens": 8267.5, "step": 9428, "total_num_tokens": 251067296.0, "z_loss": 0.00037243860424496233 }, { "copy_logits_max": -6.822164058685303, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.5, "epoch": 1.925606331376053, "gen_logits_max": 4.145581245422363, "gen_logits_mean": -16.73482322692871, "gen_logits_min": -28.93459701538086, "gen_logits_std": 3.431307792663574, "gen_loss": 0.2961744964122772, "grad_norm": 0.3619891984441905, "learning_rate": 1.9019368421052633e-05, "loss": 0.2878, "mean_copy_accuracy": 0.996843084692955, "mean_gen_accuracy": 0.8768829852342606, "mean_token_accuracy": 0.9033340215682983, "num_tokens": 251288601.0, "sample_num_tokens": 7958.25, "step": 9429, "total_num_tokens": 251320434.0, "z_loss": 0.00047231969074346125 }, { "copy_logits_max": -7.957723617553711, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.8125, "epoch": 1.92581056931325, "gen_logits_max": 4.314424991607666, "gen_logits_mean": -17.441495895385742, "gen_logits_min": -29.84716033935547, "gen_logits_std": 3.4674549102783203, "gen_loss": 0.29035258293151855, "grad_norm": 0.3536917787946268, "learning_rate": 1.9018105263157894e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9979915916919708, "mean_gen_accuracy": 0.8784752488136292, "mean_token_accuracy": 0.9076555967330933, "num_tokens": 251557307.0, "sample_num_tokens": 7456.75, "step": 9430, "total_num_tokens": 251587134.0, "z_loss": 0.0004173860652372241 }, { "copy_logits_max": -6.15254020690918, "copy_logits_min": -750000000.0, "copy_num_tokens": 516.125, "epoch": 1.9260148072504468, "gen_logits_max": 3.119001626968384, "gen_logits_mean": -17.281818389892578, "gen_logits_min": -29.553932189941406, "gen_logits_std": 3.4505677223205566, "gen_loss": 0.27974358201026917, "grad_norm": 0.35986720592271654, "learning_rate": 1.9016842105263158e-05, "loss": 0.2802, "mean_copy_accuracy": 0.9969922304153442, "mean_gen_accuracy": 0.875382587313652, "mean_token_accuracy": 0.9049068540334702, "num_tokens": 251810952.0, "sample_num_tokens": 8519.5, "step": 9431, "total_num_tokens": 251845030.0, "z_loss": 0.00043660475057549775 }, { "copy_logits_max": -7.155120372772217, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.8125, "epoch": 1.9262190451876435, "gen_logits_max": 2.5896668434143066, "gen_logits_mean": -18.60932159423828, "gen_logits_min": -30.918962478637695, "gen_logits_std": 3.5395166873931885, "gen_loss": 0.2649582028388977, "grad_norm": 0.729123969799685, "learning_rate": 1.901557894736842e-05, "loss": 0.284, "mean_copy_accuracy": 0.9969126582145691, "mean_gen_accuracy": 0.8752396702766418, "mean_token_accuracy": 0.9022538363933563, "num_tokens": 252063272.0, "sample_num_tokens": 9182.0, "step": 9432, "total_num_tokens": 252100000.0, "z_loss": 0.0004012316348962486 }, { "copy_logits_max": -6.7780442237854, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.3125, "epoch": 1.9264232831248405, "gen_logits_max": 4.836745262145996, "gen_logits_mean": -16.020627975463867, "gen_logits_min": -28.133750915527344, "gen_logits_std": 3.4028096199035645, "gen_loss": 0.2530151605606079, "grad_norm": 0.3676360100482618, "learning_rate": 1.9014315789473687e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9971765428781509, "mean_gen_accuracy": 0.8831047415733337, "mean_token_accuracy": 0.9108554720878601, "num_tokens": 252338016.0, "sample_num_tokens": 7464.5, "step": 9433, "total_num_tokens": 252367874.0, "z_loss": 0.0003691418678499758 }, { "copy_logits_max": -5.911637783050537, "copy_logits_min": -750000000.0, "copy_num_tokens": 661.375, "epoch": 1.9266275210620374, "gen_logits_max": 3.2100610733032227, "gen_logits_mean": -16.922697067260742, "gen_logits_min": -29.34506607055664, "gen_logits_std": 3.4687490463256836, "gen_loss": 0.25418683886528015, "grad_norm": 0.37803712689934277, "learning_rate": 1.9013052631578948e-05, "loss": 0.2917, "mean_copy_accuracy": 0.9957222491502762, "mean_gen_accuracy": 0.8715070188045502, "mean_token_accuracy": 0.9003204554319382, "num_tokens": 252595961.0, "sample_num_tokens": 9924.75, "step": 9434, "total_num_tokens": 252635660.0, "z_loss": 0.0004166641738265753 }, { "copy_logits_max": -4.252573013305664, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.375, "epoch": 1.926831758999234, "gen_logits_max": 4.583034515380859, "gen_logits_mean": -15.218331336975098, "gen_logits_min": -27.072341918945312, "gen_logits_std": 3.3489933013916016, "gen_loss": 0.31498339772224426, "grad_norm": 0.3453670589900941, "learning_rate": 1.9011789473684212e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9965873509645462, "mean_gen_accuracy": 0.8737928420305252, "mean_token_accuracy": 0.9048077762126923, "num_tokens": 252885326.0, "sample_num_tokens": 7807.0, "step": 9435, "total_num_tokens": 252916554.0, "z_loss": 0.00044989344314672053 }, { "copy_logits_max": -4.5093841552734375, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.125, "epoch": 1.9270359969364308, "gen_logits_max": 3.4236183166503906, "gen_logits_mean": -16.6911563873291, "gen_logits_min": -28.99715805053711, "gen_logits_std": 3.433640480041504, "gen_loss": 0.2962888479232788, "grad_norm": 0.3582375780244525, "learning_rate": 1.9010526315789473e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9965379387140274, "mean_gen_accuracy": 0.8768052607774734, "mean_token_accuracy": 0.9043672382831573, "num_tokens": 253151184.0, "sample_num_tokens": 7294.5, "step": 9436, "total_num_tokens": 253180362.0, "z_loss": 0.0004646604065783322 }, { "copy_logits_max": -4.548102378845215, "copy_logits_min": -750000000.0, "copy_num_tokens": 470.1875, "epoch": 1.9272402348736277, "gen_logits_max": 4.584558486938477, "gen_logits_mean": -15.441776275634766, "gen_logits_min": -27.55687713623047, "gen_logits_std": 3.379695415496826, "gen_loss": 0.2809056043624878, "grad_norm": 0.33348488137385884, "learning_rate": 1.9009263157894737e-05, "loss": 0.261, "mean_copy_accuracy": 0.9978495538234711, "mean_gen_accuracy": 0.8815672695636749, "mean_token_accuracy": 0.911362037062645, "num_tokens": 253416615.0, "sample_num_tokens": 8790.25, "step": 9437, "total_num_tokens": 253451776.0, "z_loss": 0.0004393622511997819 }, { "copy_logits_max": -7.0251288414001465, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.625, "epoch": 1.9274444728108246, "gen_logits_max": 3.8075263500213623, "gen_logits_mean": -17.369333267211914, "gen_logits_min": -29.28091049194336, "gen_logits_std": 3.4743478298187256, "gen_loss": 0.25088661909103394, "grad_norm": 0.35953822416810693, "learning_rate": 1.9008e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9974315464496613, "mean_gen_accuracy": 0.874263271689415, "mean_token_accuracy": 0.9051831066608429, "num_tokens": 253678888.0, "sample_num_tokens": 7549.5, "step": 9438, "total_num_tokens": 253709086.0, "z_loss": 0.00034356475225649774 }, { "copy_logits_max": -6.611905097961426, "copy_logits_min": -750000064.0, "copy_num_tokens": 368.9375, "epoch": 1.9276487107480214, "gen_logits_max": 3.4590630531311035, "gen_logits_mean": -17.69247817993164, "gen_logits_min": -29.494342803955078, "gen_logits_std": 3.4784162044525146, "gen_loss": 0.29047220945358276, "grad_norm": 0.36852514346194987, "learning_rate": 1.9006736842105263e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9962569326162338, "mean_gen_accuracy": 0.8750780820846558, "mean_token_accuracy": 0.9069358706474304, "num_tokens": 253971967.0, "sample_num_tokens": 7819.75, "step": 9439, "total_num_tokens": 254003246.0, "z_loss": 0.0004190671897958964 }, { "copy_logits_max": -4.071578025817871, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.125, "epoch": 1.9278529486852183, "gen_logits_max": 1.958644151687622, "gen_logits_mean": -19.151683807373047, "gen_logits_min": -31.464027404785156, "gen_logits_std": 3.565401554107666, "gen_loss": 0.2903759479522705, "grad_norm": 0.3459014394877184, "learning_rate": 1.9005473684210527e-05, "loss": 0.2838, "mean_copy_accuracy": 0.9965616762638092, "mean_gen_accuracy": 0.8712708055973053, "mean_token_accuracy": 0.9027242660522461, "num_tokens": 254242229.0, "sample_num_tokens": 8770.75, "step": 9440, "total_num_tokens": 254277312.0, "z_loss": 0.00041976000647991896 }, { "copy_logits_max": -5.576797962188721, "copy_logits_min": -750000000.0, "copy_num_tokens": 295.6875, "epoch": 1.9280571866224152, "gen_logits_max": 3.641016960144043, "gen_logits_mean": -18.57718276977539, "gen_logits_min": -30.419368743896484, "gen_logits_std": 3.5060741901397705, "gen_loss": 0.32029247283935547, "grad_norm": 0.3386709248644374, "learning_rate": 1.9004210526315788e-05, "loss": 0.2855, "mean_copy_accuracy": 0.9977890849113464, "mean_gen_accuracy": 0.8725269585847855, "mean_token_accuracy": 0.9030538648366928, "num_tokens": 254536162.0, "sample_num_tokens": 7048.0, "step": 9441, "total_num_tokens": 254564354.0, "z_loss": 0.0004836168955080211 }, { "copy_logits_max": -3.913848400115967, "copy_logits_min": -750000000.0, "copy_num_tokens": 660.0625, "epoch": 1.928261424559612, "gen_logits_max": 3.299619436264038, "gen_logits_mean": -17.494773864746094, "gen_logits_min": -30.286333084106445, "gen_logits_std": 3.490236759185791, "gen_loss": 0.272421270608902, "grad_norm": 0.3492468148334828, "learning_rate": 1.9002947368421056e-05, "loss": 0.2718, "mean_copy_accuracy": 0.9986480623483658, "mean_gen_accuracy": 0.873563751578331, "mean_token_accuracy": 0.9074895232915878, "num_tokens": 254797136.0, "sample_num_tokens": 10175.0, "step": 9442, "total_num_tokens": 254837836.0, "z_loss": 0.00039161351742222905 }, { "copy_logits_max": -5.124513626098633, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.5625, "epoch": 1.9284656624968086, "gen_logits_max": 3.108485221862793, "gen_logits_mean": -18.371200561523438, "gen_logits_min": -30.790504455566406, "gen_logits_std": 3.5442352294921875, "gen_loss": 0.25087547302246094, "grad_norm": 0.3692794258120747, "learning_rate": 1.9001684210526316e-05, "loss": 0.26, "mean_copy_accuracy": 0.9970490485429764, "mean_gen_accuracy": 0.8800463229417801, "mean_token_accuracy": 0.9119758605957031, "num_tokens": 255055843.0, "sample_num_tokens": 7797.25, "step": 9443, "total_num_tokens": 255087032.0, "z_loss": 0.00041037023765966296 }, { "copy_logits_max": -5.396186351776123, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.375, "epoch": 1.9286699004340058, "gen_logits_max": 3.1526408195495605, "gen_logits_mean": -18.06342124938965, "gen_logits_min": -30.721750259399414, "gen_logits_std": 3.494607925415039, "gen_loss": 0.26079171895980835, "grad_norm": 0.33518440123406223, "learning_rate": 1.900042105263158e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9972196072340012, "mean_gen_accuracy": 0.8765623718500137, "mean_token_accuracy": 0.9072152376174927, "num_tokens": 255327638.0, "sample_num_tokens": 8936.0, "step": 9444, "total_num_tokens": 255363382.0, "z_loss": 0.0004024095833301544 }, { "copy_logits_max": -4.692398548126221, "copy_logits_min": -750000000.0, "copy_num_tokens": 532.9375, "epoch": 1.9288741383712025, "gen_logits_max": 2.5869455337524414, "gen_logits_mean": -18.170854568481445, "gen_logits_min": -30.364086151123047, "gen_logits_std": 3.503274440765381, "gen_loss": 0.25947967171669006, "grad_norm": 0.31073082763813115, "learning_rate": 1.899915789473684e-05, "loss": 0.2648, "mean_copy_accuracy": 0.9966843277215958, "mean_gen_accuracy": 0.8767598271369934, "mean_token_accuracy": 0.9079826027154922, "num_tokens": 255608480.0, "sample_num_tokens": 8583.0, "step": 9445, "total_num_tokens": 255642812.0, "z_loss": 0.0003816183307208121 }, { "copy_logits_max": -4.920722961425781, "copy_logits_min": -750000064.0, "copy_num_tokens": 528.9375, "epoch": 1.9290783763083992, "gen_logits_max": 3.6770758628845215, "gen_logits_mean": -16.783226013183594, "gen_logits_min": -28.922489166259766, "gen_logits_std": 3.4120848178863525, "gen_loss": 0.2899479269981384, "grad_norm": 0.32550728585772554, "learning_rate": 1.8997894736842106e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9971379190683365, "mean_gen_accuracy": 0.8830253630876541, "mean_token_accuracy": 0.9088391214609146, "num_tokens": 255884155.0, "sample_num_tokens": 9554.75, "step": 9446, "total_num_tokens": 255922374.0, "z_loss": 0.00044035856262780726 }, { "copy_logits_max": -4.441376686096191, "copy_logits_min": -687500032.0, "copy_num_tokens": 342.4375, "epoch": 1.929282614245596, "gen_logits_max": 3.130858898162842, "gen_logits_mean": -17.86327362060547, "gen_logits_min": -30.28778839111328, "gen_logits_std": 3.4624886512756348, "gen_loss": 0.29496777057647705, "grad_norm": 0.38601800456339275, "learning_rate": 1.8996631578947367e-05, "loss": 0.3034, "mean_copy_accuracy": 0.9959007650613785, "mean_gen_accuracy": 0.8701202869415283, "mean_token_accuracy": 0.896494060754776, "num_tokens": 256149781.0, "sample_num_tokens": 6949.75, "step": 9447, "total_num_tokens": 256177580.0, "z_loss": 0.0004906746908091009 }, { "copy_logits_max": -6.072372913360596, "copy_logits_min": -687500032.0, "copy_num_tokens": 342.4375, "epoch": 1.929486852182793, "gen_logits_max": 3.487222671508789, "gen_logits_mean": -19.509662628173828, "gen_logits_min": -31.67700958251953, "gen_logits_std": 3.5559000968933105, "gen_loss": 0.2766820788383484, "grad_norm": 0.38059070970045533, "learning_rate": 1.899536842105263e-05, "loss": 0.2793, "mean_copy_accuracy": 0.996708407998085, "mean_gen_accuracy": 0.8794014006853104, "mean_token_accuracy": 0.9050389528274536, "num_tokens": 256397658.0, "sample_num_tokens": 6531.5, "step": 9448, "total_num_tokens": 256423784.0, "z_loss": 0.0004420003679115325 }, { "copy_logits_max": -5.022880554199219, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.9375, "epoch": 1.9296910901199897, "gen_logits_max": 3.7745330333709717, "gen_logits_mean": -17.510292053222656, "gen_logits_min": -29.531856536865234, "gen_logits_std": 3.4526076316833496, "gen_loss": 0.2658587694168091, "grad_norm": 0.33529497250718393, "learning_rate": 1.8994105263157892e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9976281225681305, "mean_gen_accuracy": 0.8816258609294891, "mean_token_accuracy": 0.9095646739006042, "num_tokens": 256676235.0, "sample_num_tokens": 8470.75, "step": 9449, "total_num_tokens": 256710118.0, "z_loss": 0.0004152504843659699 }, { "copy_logits_max": -6.519075393676758, "copy_logits_min": -750000000.0, "copy_num_tokens": 421.0625, "epoch": 1.9298953280571867, "gen_logits_max": 3.722254514694214, "gen_logits_mean": -16.65383529663086, "gen_logits_min": -29.166372299194336, "gen_logits_std": 3.4432923793792725, "gen_loss": 0.25820663571357727, "grad_norm": 0.3610374983479077, "learning_rate": 1.899284210526316e-05, "loss": 0.2649, "mean_copy_accuracy": 0.9972307831048965, "mean_gen_accuracy": 0.883471667766571, "mean_token_accuracy": 0.9112299531698227, "num_tokens": 256945211.0, "sample_num_tokens": 7536.25, "step": 9450, "total_num_tokens": 256975356.0, "z_loss": 0.0004236076201777905 }, { "copy_logits_max": -5.867961883544922, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.625, "epoch": 1.9300995659943836, "gen_logits_max": 3.3112144470214844, "gen_logits_mean": -17.204463958740234, "gen_logits_min": -29.430908203125, "gen_logits_std": 3.438830852508545, "gen_loss": 0.2572981119155884, "grad_norm": 0.3887138752363107, "learning_rate": 1.8991578947368424e-05, "loss": 0.2851, "mean_copy_accuracy": 0.9969264268875122, "mean_gen_accuracy": 0.8795303404331207, "mean_token_accuracy": 0.9040350615978241, "num_tokens": 257182074.0, "sample_num_tokens": 8526.5, "step": 9451, "total_num_tokens": 257216180.0, "z_loss": 0.0003931052051484585 }, { "copy_logits_max": -8.543842315673828, "copy_logits_min": -750000064.0, "copy_num_tokens": 283.625, "epoch": 1.9303038039315803, "gen_logits_max": 2.725356101989746, "gen_logits_mean": -19.835113525390625, "gen_logits_min": -31.853084564208984, "gen_logits_std": 3.561828136444092, "gen_loss": 0.26910167932510376, "grad_norm": 0.353774191984452, "learning_rate": 1.8990315789473685e-05, "loss": 0.2758, "mean_copy_accuracy": 0.997436136007309, "mean_gen_accuracy": 0.8770383149385452, "mean_token_accuracy": 0.9077768176794052, "num_tokens": 257470988.0, "sample_num_tokens": 6970.5, "step": 9452, "total_num_tokens": 257498870.0, "z_loss": 0.0003891396336257458 }, { "copy_logits_max": -6.142447471618652, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.375, "epoch": 1.930508041868777, "gen_logits_max": 3.3253653049468994, "gen_logits_mean": -15.838260650634766, "gen_logits_min": -28.47441291809082, "gen_logits_std": 3.4120781421661377, "gen_loss": 0.2455958127975464, "grad_norm": 0.3256127514629211, "learning_rate": 1.898905263157895e-05, "loss": 0.2613, "mean_copy_accuracy": 0.9975325763225555, "mean_gen_accuracy": 0.8797615468502045, "mean_token_accuracy": 0.9119679182767868, "num_tokens": 257734762.0, "sample_num_tokens": 9204.5, "step": 9453, "total_num_tokens": 257771580.0, "z_loss": 0.0003791917988564819 }, { "copy_logits_max": -4.257093906402588, "copy_logits_min": -687500032.0, "copy_num_tokens": 690.875, "epoch": 1.930712279805974, "gen_logits_max": 2.407109022140503, "gen_logits_mean": -17.865829467773438, "gen_logits_min": -30.49384117126465, "gen_logits_std": 3.5046825408935547, "gen_loss": 0.2685251832008362, "grad_norm": 0.30986547232008654, "learning_rate": 1.898778947368421e-05, "loss": 0.2497, "mean_copy_accuracy": 0.9986634403467178, "mean_gen_accuracy": 0.8768434226512909, "mean_token_accuracy": 0.9165715724229813, "num_tokens": 258044892.0, "sample_num_tokens": 9736.5, "step": 9454, "total_num_tokens": 258083838.0, "z_loss": 0.0003932711260858923 }, { "copy_logits_max": -5.907386779785156, "copy_logits_min": -687500032.0, "copy_num_tokens": 496.5625, "epoch": 1.9309165177431709, "gen_logits_max": 2.1647496223449707, "gen_logits_mean": -19.09819793701172, "gen_logits_min": -31.462236404418945, "gen_logits_std": 3.571364402770996, "gen_loss": 0.26891058683395386, "grad_norm": 0.365902950153545, "learning_rate": 1.8986526315789475e-05, "loss": 0.2615, "mean_copy_accuracy": 0.9958776831626892, "mean_gen_accuracy": 0.8848237544298172, "mean_token_accuracy": 0.9101767539978027, "num_tokens": 258314386.0, "sample_num_tokens": 8616.5, "step": 9455, "total_num_tokens": 258348852.0, "z_loss": 0.0004173151100985706 }, { "copy_logits_max": -7.874927043914795, "copy_logits_min": -750000000.0, "copy_num_tokens": 366.625, "epoch": 1.9311207556803676, "gen_logits_max": 3.1122405529022217, "gen_logits_mean": -18.129371643066406, "gen_logits_min": -30.155576705932617, "gen_logits_std": 3.4938747882843018, "gen_loss": 0.30075734853744507, "grad_norm": 0.3499606573316545, "learning_rate": 1.8985263157894736e-05, "loss": 0.3008, "mean_copy_accuracy": 0.9967340230941772, "mean_gen_accuracy": 0.8711834102869034, "mean_token_accuracy": 0.897835910320282, "num_tokens": 258584727.0, "sample_num_tokens": 7814.75, "step": 9456, "total_num_tokens": 258615986.0, "z_loss": 0.00040264526614919305 }, { "copy_logits_max": -8.119918823242188, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.375, "epoch": 1.9313249936175645, "gen_logits_max": 2.48732328414917, "gen_logits_mean": -18.695720672607422, "gen_logits_min": -30.76254653930664, "gen_logits_std": 3.559000253677368, "gen_loss": 0.2517938017845154, "grad_norm": 0.34547004801921616, "learning_rate": 1.8984e-05, "loss": 0.2683, "mean_copy_accuracy": 0.9975873827934265, "mean_gen_accuracy": 0.8802724033594131, "mean_token_accuracy": 0.9079351872205734, "num_tokens": 258870500.0, "sample_num_tokens": 8134.5, "step": 9457, "total_num_tokens": 258903038.0, "z_loss": 0.0003359936526976526 }, { "copy_logits_max": -5.809488296508789, "copy_logits_min": -750000000.0, "copy_num_tokens": 554.5, "epoch": 1.9315292315547614, "gen_logits_max": 2.9138314723968506, "gen_logits_mean": -17.586511611938477, "gen_logits_min": -29.985042572021484, "gen_logits_std": 3.47540283203125, "gen_loss": 0.27731022238731384, "grad_norm": 0.40108238431989995, "learning_rate": 1.8982736842105264e-05, "loss": 0.281, "mean_copy_accuracy": 0.9975758790969849, "mean_gen_accuracy": 0.8728310018777847, "mean_token_accuracy": 0.9039065092802048, "num_tokens": 259121366.0, "sample_num_tokens": 9101.5, "step": 9458, "total_num_tokens": 259157772.0, "z_loss": 0.0003646036493591964 }, { "copy_logits_max": -6.316230297088623, "copy_logits_min": -625000000.0, "copy_num_tokens": 482.125, "epoch": 1.9317334694919581, "gen_logits_max": 2.737278699874878, "gen_logits_mean": -17.810039520263672, "gen_logits_min": -30.25763702392578, "gen_logits_std": 3.500140905380249, "gen_loss": 0.262856125831604, "grad_norm": 0.3627809495725222, "learning_rate": 1.898147368421053e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9968262612819672, "mean_gen_accuracy": 0.88282810151577, "mean_token_accuracy": 0.9068839401006699, "num_tokens": 259390128.0, "sample_num_tokens": 8385.5, "step": 9459, "total_num_tokens": 259423670.0, "z_loss": 0.00037176141631789505 }, { "copy_logits_max": -6.991515159606934, "copy_logits_min": -750000000.0, "copy_num_tokens": 343.6875, "epoch": 1.9319377074291548, "gen_logits_max": 3.612807512283325, "gen_logits_mean": -17.659290313720703, "gen_logits_min": -29.66176986694336, "gen_logits_std": 3.428173780441284, "gen_loss": 0.2980693578720093, "grad_norm": 0.3394286311849921, "learning_rate": 1.898021052631579e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9974017292261124, "mean_gen_accuracy": 0.8743111193180084, "mean_token_accuracy": 0.9035183936357498, "num_tokens": 259661854.0, "sample_num_tokens": 7895.5, "step": 9460, "total_num_tokens": 259693436.0, "z_loss": 0.00038191353087313473 }, { "copy_logits_max": -7.4103217124938965, "copy_logits_min": -750000064.0, "copy_num_tokens": 543.5625, "epoch": 1.9321419453663518, "gen_logits_max": 2.9221460819244385, "gen_logits_mean": -16.982343673706055, "gen_logits_min": -29.28023910522461, "gen_logits_std": 3.4495151042938232, "gen_loss": 0.24003177881240845, "grad_norm": 0.3296499166578165, "learning_rate": 1.8978947368421054e-05, "loss": 0.2555, "mean_copy_accuracy": 0.9975514113903046, "mean_gen_accuracy": 0.8834691792726517, "mean_token_accuracy": 0.913757711648941, "num_tokens": 259954863.0, "sample_num_tokens": 9088.75, "step": 9461, "total_num_tokens": 259991218.0, "z_loss": 0.00034371542278677225 }, { "copy_logits_max": -7.571971893310547, "copy_logits_min": -750000064.0, "copy_num_tokens": 278.375, "epoch": 1.9323461833035487, "gen_logits_max": 3.3332924842834473, "gen_logits_mean": -18.82188606262207, "gen_logits_min": -30.590957641601562, "gen_logits_std": 3.505539894104004, "gen_loss": 0.3134200870990753, "grad_norm": 0.3459818241012347, "learning_rate": 1.8977684210526315e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9973510056734085, "mean_gen_accuracy": 0.8779037147760391, "mean_token_accuracy": 0.9079354405403137, "num_tokens": 260238330.0, "sample_num_tokens": 7817.0, "step": 9462, "total_num_tokens": 260269598.0, "z_loss": 0.00040888317744247615 }, { "copy_logits_max": -7.1367645263671875, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.1875, "epoch": 1.9325504212407454, "gen_logits_max": 3.85795259475708, "gen_logits_mean": -16.772653579711914, "gen_logits_min": -28.580562591552734, "gen_logits_std": 3.3956077098846436, "gen_loss": 0.280918151140213, "grad_norm": 0.3339719039844148, "learning_rate": 1.897642105263158e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9972122013568878, "mean_gen_accuracy": 0.8833296149969101, "mean_token_accuracy": 0.9087640047073364, "num_tokens": 260517982.0, "sample_num_tokens": 8351.0, "step": 9463, "total_num_tokens": 260551386.0, "z_loss": 0.00035523291444405913 }, { "copy_logits_max": -5.1308698654174805, "copy_logits_min": -687500032.0, "copy_num_tokens": 379.0625, "epoch": 1.9327546591779423, "gen_logits_max": 3.86667537689209, "gen_logits_mean": -16.535770416259766, "gen_logits_min": -28.76500701904297, "gen_logits_std": 3.421492576599121, "gen_loss": 0.2906528115272522, "grad_norm": 0.38615050369059994, "learning_rate": 1.897515789473684e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9972749054431915, "mean_gen_accuracy": 0.8721397519111633, "mean_token_accuracy": 0.9026953130960464, "num_tokens": 260801227.0, "sample_num_tokens": 8018.25, "step": 9464, "total_num_tokens": 260833300.0, "z_loss": 0.00041573020280338824 }, { "copy_logits_max": -6.247798919677734, "copy_logits_min": -687500032.0, "copy_num_tokens": 499.375, "epoch": 1.9329588971151392, "gen_logits_max": 4.228529930114746, "gen_logits_mean": -16.518938064575195, "gen_logits_min": -28.506122589111328, "gen_logits_std": 3.4016146659851074, "gen_loss": 0.26676249504089355, "grad_norm": 0.34869567652863825, "learning_rate": 1.8973894736842104e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9958845525979996, "mean_gen_accuracy": 0.8818129897117615, "mean_token_accuracy": 0.9097041338682175, "num_tokens": 261071303.0, "sample_num_tokens": 10371.25, "step": 9465, "total_num_tokens": 261112788.0, "z_loss": 0.00040145067032426596 }, { "copy_logits_max": -6.379823684692383, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.0, "epoch": 1.933163135052336, "gen_logits_max": 3.1221842765808105, "gen_logits_mean": -18.138168334960938, "gen_logits_min": -30.412961959838867, "gen_logits_std": 3.474365234375, "gen_loss": 0.24209201335906982, "grad_norm": 0.35437574367286995, "learning_rate": 1.8972631578947372e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9975483417510986, "mean_gen_accuracy": 0.8759762793779373, "mean_token_accuracy": 0.9048994183540344, "num_tokens": 261340596.0, "sample_num_tokens": 7432.0, "step": 9466, "total_num_tokens": 261370324.0, "z_loss": 0.0003492599935270846 }, { "copy_logits_max": -5.983839988708496, "copy_logits_min": -750000000.0, "copy_num_tokens": 302.5, "epoch": 1.9333673729895327, "gen_logits_max": 3.1814069747924805, "gen_logits_mean": -17.706966400146484, "gen_logits_min": -29.6661376953125, "gen_logits_std": 3.462296724319458, "gen_loss": 0.24695417284965515, "grad_norm": 0.34878657136499613, "learning_rate": 1.8971368421052633e-05, "loss": 0.2738, "mean_copy_accuracy": 0.9974774718284607, "mean_gen_accuracy": 0.8782389163970947, "mean_token_accuracy": 0.9070356488227844, "num_tokens": 261607864.0, "sample_num_tokens": 6919.5, "step": 9467, "total_num_tokens": 261635542.0, "z_loss": 0.0003623377997428179 }, { "copy_logits_max": -4.002171993255615, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.375, "epoch": 1.9335716109267298, "gen_logits_max": 3.1740849018096924, "gen_logits_mean": -16.457157135009766, "gen_logits_min": -28.490764617919922, "gen_logits_std": 3.3518826961517334, "gen_loss": 0.27486175298690796, "grad_norm": 0.3474851824989199, "learning_rate": 1.8970105263157897e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9972836524248123, "mean_gen_accuracy": 0.8799909353256226, "mean_token_accuracy": 0.9125958532094955, "num_tokens": 261895154.0, "sample_num_tokens": 8318.0, "step": 9468, "total_num_tokens": 261928426.0, "z_loss": 0.0004127126594539732 }, { "copy_logits_max": -4.978758811950684, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.1875, "epoch": 1.9337758488639265, "gen_logits_max": 3.0988433361053467, "gen_logits_mean": -17.615468978881836, "gen_logits_min": -29.425334930419922, "gen_logits_std": 3.4289779663085938, "gen_loss": 0.26672035455703735, "grad_norm": 0.3364755538490864, "learning_rate": 1.8968842105263158e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9974296987056732, "mean_gen_accuracy": 0.8785593509674072, "mean_token_accuracy": 0.9054998308420181, "num_tokens": 262153789.0, "sample_num_tokens": 8404.25, "step": 9469, "total_num_tokens": 262187406.0, "z_loss": 0.00041529565351083875 }, { "copy_logits_max": -5.482647895812988, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.6875, "epoch": 1.9339800868011232, "gen_logits_max": 4.71065092086792, "gen_logits_mean": -16.36394500732422, "gen_logits_min": -27.953048706054688, "gen_logits_std": 3.3347465991973877, "gen_loss": 0.3161844313144684, "grad_norm": 0.3516620232946815, "learning_rate": 1.8967578947368422e-05, "loss": 0.2875, "mean_copy_accuracy": 0.9967615604400635, "mean_gen_accuracy": 0.8734337985515594, "mean_token_accuracy": 0.9011140614748001, "num_tokens": 262408090.0, "sample_num_tokens": 8010.5, "step": 9470, "total_num_tokens": 262440132.0, "z_loss": 0.0005367493140511215 }, { "copy_logits_max": -6.066568851470947, "copy_logits_min": -687500032.0, "copy_num_tokens": 316.8125, "epoch": 1.9341843247383201, "gen_logits_max": 3.570171594619751, "gen_logits_mean": -17.48061752319336, "gen_logits_min": -29.105649948120117, "gen_logits_std": 3.42828369140625, "gen_loss": 0.264321506023407, "grad_norm": 0.36231938526525653, "learning_rate": 1.8966315789473683e-05, "loss": 0.2833, "mean_copy_accuracy": 0.9962242543697357, "mean_gen_accuracy": 0.8773483783006668, "mean_token_accuracy": 0.9031566232442856, "num_tokens": 262673744.0, "sample_num_tokens": 6732.5, "step": 9471, "total_num_tokens": 262700674.0, "z_loss": 0.00041564623825252056 }, { "copy_logits_max": -3.21146297454834, "copy_logits_min": -687500096.0, "copy_num_tokens": 421.9375, "epoch": 1.934388562675517, "gen_logits_max": 4.11653470993042, "gen_logits_mean": -16.09186553955078, "gen_logits_min": -28.24709701538086, "gen_logits_std": 3.379086494445801, "gen_loss": 0.27037087082862854, "grad_norm": 0.3421424329252115, "learning_rate": 1.8965052631578948e-05, "loss": 0.2652, "mean_copy_accuracy": 0.9967479109764099, "mean_gen_accuracy": 0.880253940820694, "mean_token_accuracy": 0.9099782109260559, "num_tokens": 262943559.0, "sample_num_tokens": 7260.25, "step": 9472, "total_num_tokens": 262972600.0, "z_loss": 0.00045491891796700656 }, { "copy_logits_max": -4.423856258392334, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.875, "epoch": 1.9345928006127138, "gen_logits_max": 3.2855417728424072, "gen_logits_mean": -15.481496810913086, "gen_logits_min": -27.846736907958984, "gen_logits_std": 3.3151330947875977, "gen_loss": 0.22556793689727783, "grad_norm": 0.3306827108134175, "learning_rate": 1.896378947368421e-05, "loss": 0.2416, "mean_copy_accuracy": 0.9976582527160645, "mean_gen_accuracy": 0.8873953074216843, "mean_token_accuracy": 0.9175330996513367, "num_tokens": 263224169.0, "sample_num_tokens": 9969.75, "step": 9473, "total_num_tokens": 263264048.0, "z_loss": 0.0003763986751437187 }, { "copy_logits_max": -2.110591173171997, "copy_logits_min": -750000000.0, "copy_num_tokens": 488.5625, "epoch": 1.9347970385499107, "gen_logits_max": 3.3036084175109863, "gen_logits_mean": -16.849693298339844, "gen_logits_min": -28.665287017822266, "gen_logits_std": 3.3815531730651855, "gen_loss": 0.2839396297931671, "grad_norm": 0.342676123889445, "learning_rate": 1.8962526315789476e-05, "loss": 0.265, "mean_copy_accuracy": 0.9969864040613174, "mean_gen_accuracy": 0.877281904220581, "mean_token_accuracy": 0.9083954393863678, "num_tokens": 263487604.0, "sample_num_tokens": 8329.5, "step": 9474, "total_num_tokens": 263520922.0, "z_loss": 0.00047650441410951316 }, { "copy_logits_max": -2.245957136154175, "copy_logits_min": -687500032.0, "copy_num_tokens": 582.1875, "epoch": 1.9350012764871076, "gen_logits_max": 2.90891170501709, "gen_logits_mean": -17.661022186279297, "gen_logits_min": -30.105770111083984, "gen_logits_std": 3.4389114379882812, "gen_loss": 0.2683069705963135, "grad_norm": 0.3687614153833684, "learning_rate": 1.8961263157894737e-05, "loss": 0.2749, "mean_copy_accuracy": 0.9963122606277466, "mean_gen_accuracy": 0.8743158131837845, "mean_token_accuracy": 0.9062035977840424, "num_tokens": 263759467.0, "sample_num_tokens": 8895.75, "step": 9475, "total_num_tokens": 263795050.0, "z_loss": 0.0004672557406593114 }, { "copy_logits_max": -1.088728666305542, "copy_logits_min": -687500032.0, "copy_num_tokens": 459.625, "epoch": 1.9352055144243043, "gen_logits_max": 3.9560012817382812, "gen_logits_mean": -16.914772033691406, "gen_logits_min": -28.95137596130371, "gen_logits_std": 3.4004626274108887, "gen_loss": 0.2727574110031128, "grad_norm": 0.34378674198612347, "learning_rate": 1.896e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9979819357395172, "mean_gen_accuracy": 0.8802259415388107, "mean_token_accuracy": 0.9080696254968643, "num_tokens": 264023772.0, "sample_num_tokens": 8202.5, "step": 9476, "total_num_tokens": 264056582.0, "z_loss": 0.0004983153194189072 }, { "copy_logits_max": -5.0860137939453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.9375, "epoch": 1.935409752361501, "gen_logits_max": 3.5731213092803955, "gen_logits_mean": -15.396562576293945, "gen_logits_min": -27.729347229003906, "gen_logits_std": 3.294297218322754, "gen_loss": 0.2618115544319153, "grad_norm": 0.3394101945552667, "learning_rate": 1.8958736842105262e-05, "loss": 0.2595, "mean_copy_accuracy": 0.9976729154586792, "mean_gen_accuracy": 0.8808777630329132, "mean_token_accuracy": 0.9120118916034698, "num_tokens": 264313208.0, "sample_num_tokens": 8631.0, "step": 9477, "total_num_tokens": 264347732.0, "z_loss": 0.0004334196273703128 }, { "copy_logits_max": -3.5061614513397217, "copy_logits_min": -750000000.0, "copy_num_tokens": 466.25, "epoch": 1.935613990298698, "gen_logits_max": 4.176122665405273, "gen_logits_mean": -15.501798629760742, "gen_logits_min": -27.437366485595703, "gen_logits_std": 3.330981731414795, "gen_loss": 0.2575901746749878, "grad_norm": 0.33457153984050125, "learning_rate": 1.8957473684210527e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9976682364940643, "mean_gen_accuracy": 0.8809588104486465, "mean_token_accuracy": 0.9113057553768158, "num_tokens": 264587431.0, "sample_num_tokens": 8730.75, "step": 9478, "total_num_tokens": 264622354.0, "z_loss": 0.0004194249922875315 }, { "copy_logits_max": -3.3830068111419678, "copy_logits_min": -687500032.0, "copy_num_tokens": 617.8125, "epoch": 1.9358182282358949, "gen_logits_max": 2.611034631729126, "gen_logits_mean": -17.984527587890625, "gen_logits_min": -30.22064208984375, "gen_logits_std": 3.474386692047119, "gen_loss": 0.2467355728149414, "grad_norm": 0.34370707601285866, "learning_rate": 1.895621052631579e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9974090605974197, "mean_gen_accuracy": 0.8778612166643143, "mean_token_accuracy": 0.9113656580448151, "num_tokens": 264859756.0, "sample_num_tokens": 9375.0, "step": 9479, "total_num_tokens": 264897256.0, "z_loss": 0.00038999979733489454 }, { "copy_logits_max": -0.4244064688682556, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.5, "epoch": 1.9360224661730916, "gen_logits_max": 3.693189859390259, "gen_logits_mean": -15.01492691040039, "gen_logits_min": -27.20574188232422, "gen_logits_std": 3.2733840942382812, "gen_loss": 0.26773950457572937, "grad_norm": 0.3073491609788895, "learning_rate": 1.8954947368421052e-05, "loss": 0.2557, "mean_copy_accuracy": 0.9980756789445877, "mean_gen_accuracy": 0.8795097321271896, "mean_token_accuracy": 0.9136178493499756, "num_tokens": 265159515.0, "sample_num_tokens": 9463.25, "step": 9480, "total_num_tokens": 265197368.0, "z_loss": 0.0004202399286441505 }, { "copy_logits_max": -3.8838443756103516, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.8125, "epoch": 1.9362267041102885, "gen_logits_max": 3.6750125885009766, "gen_logits_mean": -16.687183380126953, "gen_logits_min": -28.748146057128906, "gen_logits_std": 3.3915743827819824, "gen_loss": 0.25501537322998047, "grad_norm": 0.33218812637868844, "learning_rate": 1.8953684210526316e-05, "loss": 0.2625, "mean_copy_accuracy": 0.9981400817632675, "mean_gen_accuracy": 0.8745496571063995, "mean_token_accuracy": 0.9111076593399048, "num_tokens": 265441823.0, "sample_num_tokens": 7746.75, "step": 9481, "total_num_tokens": 265472810.0, "z_loss": 0.00038801401387900114 }, { "copy_logits_max": -1.9109818935394287, "copy_logits_min": -687500032.0, "copy_num_tokens": 679.0, "epoch": 1.9364309420474854, "gen_logits_max": 1.9737952947616577, "gen_logits_mean": -18.526670455932617, "gen_logits_min": -31.0795841217041, "gen_logits_std": 3.472288131713867, "gen_loss": 0.2328612208366394, "grad_norm": 0.30940724115767004, "learning_rate": 1.895242105263158e-05, "loss": 0.2462, "mean_copy_accuracy": 0.99726602435112, "mean_gen_accuracy": 0.8877067714929581, "mean_token_accuracy": 0.9152936339378357, "num_tokens": 265714114.0, "sample_num_tokens": 10177.5, "step": 9482, "total_num_tokens": 265754824.0, "z_loss": 0.0003664942632894963 }, { "copy_logits_max": -2.935494899749756, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.6875, "epoch": 1.9366351799846822, "gen_logits_max": 2.9807233810424805, "gen_logits_mean": -16.878433227539062, "gen_logits_min": -29.21849822998047, "gen_logits_std": 3.407505512237549, "gen_loss": 0.2772781252861023, "grad_norm": 0.356914628039412, "learning_rate": 1.8951157894736845e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9970887899398804, "mean_gen_accuracy": 0.8765182495117188, "mean_token_accuracy": 0.907877579331398, "num_tokens": 266009150.0, "sample_num_tokens": 7596.5, "step": 9483, "total_num_tokens": 266039536.0, "z_loss": 0.0004366447392385453 }, { "copy_logits_max": -3.2856006622314453, "copy_logits_min": -750000064.0, "copy_num_tokens": 448.8125, "epoch": 1.9368394179218789, "gen_logits_max": 2.870401382446289, "gen_logits_mean": -18.819015502929688, "gen_logits_min": -30.843276977539062, "gen_logits_std": 3.5046186447143555, "gen_loss": 0.249103844165802, "grad_norm": 0.34260244563994297, "learning_rate": 1.8949894736842106e-05, "loss": 0.2793, "mean_copy_accuracy": 0.9972529709339142, "mean_gen_accuracy": 0.8769378513097763, "mean_token_accuracy": 0.9061750620603561, "num_tokens": 266275117.0, "sample_num_tokens": 8374.75, "step": 9484, "total_num_tokens": 266308616.0, "z_loss": 0.00037503623752854764 }, { "copy_logits_max": -0.18453189730644226, "copy_logits_min": -750000064.0, "copy_num_tokens": 572.25, "epoch": 1.9370436558590758, "gen_logits_max": 3.7062668800354004, "gen_logits_mean": -15.595473289489746, "gen_logits_min": -28.10413360595703, "gen_logits_std": 3.315284013748169, "gen_loss": 0.2514837384223938, "grad_norm": 0.3570304653371993, "learning_rate": 1.894863157894737e-05, "loss": 0.2601, "mean_copy_accuracy": 0.9971204251050949, "mean_gen_accuracy": 0.8802835941314697, "mean_token_accuracy": 0.9109616428613663, "num_tokens": 266543757.0, "sample_num_tokens": 8889.75, "step": 9485, "total_num_tokens": 266579316.0, "z_loss": 0.00037990062264725566 }, { "copy_logits_max": -1.9488811492919922, "copy_logits_min": -750000000.0, "copy_num_tokens": 536.625, "epoch": 1.9372478937962727, "gen_logits_max": 3.595337390899658, "gen_logits_mean": -16.248641967773438, "gen_logits_min": -28.515731811523438, "gen_logits_std": 3.3837711811065674, "gen_loss": 0.2781447172164917, "grad_norm": 0.34063729172404894, "learning_rate": 1.894736842105263e-05, "loss": 0.2559, "mean_copy_accuracy": 0.9971071183681488, "mean_gen_accuracy": 0.8805443793535233, "mean_token_accuracy": 0.911568284034729, "num_tokens": 266829895.0, "sample_num_tokens": 9750.75, "step": 9486, "total_num_tokens": 266868898.0, "z_loss": 0.00039946482866071165 }, { "copy_logits_max": -2.199248790740967, "copy_logits_min": -750000064.0, "copy_num_tokens": 323.6875, "epoch": 1.9374521317334694, "gen_logits_max": 2.8049051761627197, "gen_logits_mean": -18.371410369873047, "gen_logits_min": -30.49512481689453, "gen_logits_std": 3.5084068775177, "gen_loss": 0.2955131530761719, "grad_norm": 0.38648706126812943, "learning_rate": 1.8946105263157895e-05, "loss": 0.278, "mean_copy_accuracy": 0.996337279677391, "mean_gen_accuracy": 0.8781326115131378, "mean_token_accuracy": 0.9041243940591812, "num_tokens": 267083832.0, "sample_num_tokens": 6591.5, "step": 9487, "total_num_tokens": 267110198.0, "z_loss": 0.00040733578498475254 }, { "copy_logits_max": -2.388899087905884, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.625, "epoch": 1.9376563696706663, "gen_logits_max": 2.367184638977051, "gen_logits_mean": -18.129364013671875, "gen_logits_min": -30.272722244262695, "gen_logits_std": 3.5110692977905273, "gen_loss": 0.2828315198421478, "grad_norm": 0.39419884399109706, "learning_rate": 1.8944842105263156e-05, "loss": 0.302, "mean_copy_accuracy": 0.9977291077375412, "mean_gen_accuracy": 0.8665307462215424, "mean_token_accuracy": 0.8990801870822906, "num_tokens": 267340539.0, "sample_num_tokens": 8525.25, "step": 9488, "total_num_tokens": 267374640.0, "z_loss": 0.0004074383177794516 }, { "copy_logits_max": -3.239572763442993, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.1875, "epoch": 1.9378606076078633, "gen_logits_max": 3.042057514190674, "gen_logits_mean": -17.617042541503906, "gen_logits_min": -29.386369705200195, "gen_logits_std": 3.463292121887207, "gen_loss": 0.2830124795436859, "grad_norm": 0.35240548099990193, "learning_rate": 1.894357894736842e-05, "loss": 0.272, "mean_copy_accuracy": 0.9978572726249695, "mean_gen_accuracy": 0.8751167953014374, "mean_token_accuracy": 0.9061270505189896, "num_tokens": 267615200.0, "sample_num_tokens": 8996.5, "step": 9489, "total_num_tokens": 267651186.0, "z_loss": 0.0003690794110298157 }, { "copy_logits_max": -2.821927070617676, "copy_logits_min": -750000000.0, "copy_num_tokens": 235.5625, "epoch": 1.93806484554506, "gen_logits_max": 4.5311784744262695, "gen_logits_mean": -15.953831672668457, "gen_logits_min": -27.994218826293945, "gen_logits_std": 3.3697614669799805, "gen_loss": 0.3232443034648895, "grad_norm": 0.3823172443125355, "learning_rate": 1.894231578947368e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9971456080675125, "mean_gen_accuracy": 0.8764007389545441, "mean_token_accuracy": 0.901084303855896, "num_tokens": 267872919.0, "sample_num_tokens": 6702.25, "step": 9490, "total_num_tokens": 267899728.0, "z_loss": 0.00038613073411397636 }, { "copy_logits_max": -3.638916015625, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.0, "epoch": 1.9382690834822567, "gen_logits_max": 2.313793659210205, "gen_logits_mean": -18.8443603515625, "gen_logits_min": -30.833026885986328, "gen_logits_std": 3.5167622566223145, "gen_loss": 0.25676074624061584, "grad_norm": 0.3496453684113518, "learning_rate": 1.894105263157895e-05, "loss": 0.2578, "mean_copy_accuracy": 0.9981411099433899, "mean_gen_accuracy": 0.8788679242134094, "mean_token_accuracy": 0.9108761847019196, "num_tokens": 268153085.0, "sample_num_tokens": 8547.75, "step": 9491, "total_num_tokens": 268187276.0, "z_loss": 0.000357775017619133 }, { "copy_logits_max": -4.104011058807373, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.375, "epoch": 1.9384733214194536, "gen_logits_max": 2.8686978816986084, "gen_logits_mean": -17.92919158935547, "gen_logits_min": -30.2968692779541, "gen_logits_std": 3.488114356994629, "gen_loss": 0.2533467411994934, "grad_norm": 0.4081733236960097, "learning_rate": 1.8939789473684213e-05, "loss": 0.2832, "mean_copy_accuracy": 0.9965797513723373, "mean_gen_accuracy": 0.8735323548316956, "mean_token_accuracy": 0.9048597365617752, "num_tokens": 268425402.0, "sample_num_tokens": 7669.5, "step": 9492, "total_num_tokens": 268456080.0, "z_loss": 0.0003927569487132132 }, { "copy_logits_max": -5.834206581115723, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.375, "epoch": 1.9386775593566505, "gen_logits_max": 3.070249557495117, "gen_logits_mean": -18.310508728027344, "gen_logits_min": -30.136367797851562, "gen_logits_std": 3.5203351974487305, "gen_loss": 0.2704756259918213, "grad_norm": 0.33194672853265794, "learning_rate": 1.8938526315789474e-05, "loss": 0.2569, "mean_copy_accuracy": 0.9973843991756439, "mean_gen_accuracy": 0.8820009827613831, "mean_token_accuracy": 0.9123746156692505, "num_tokens": 268697755.0, "sample_num_tokens": 8829.75, "step": 9493, "total_num_tokens": 268733074.0, "z_loss": 0.00036098581040278077 }, { "copy_logits_max": -3.7655491828918457, "copy_logits_min": -750000064.0, "copy_num_tokens": 398.4375, "epoch": 1.9388817972938472, "gen_logits_max": 3.409607410430908, "gen_logits_mean": -17.27167510986328, "gen_logits_min": -29.354888916015625, "gen_logits_std": 3.4682905673980713, "gen_loss": 0.24942518770694733, "grad_norm": 0.38169045379798006, "learning_rate": 1.893726315789474e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9970775097608566, "mean_gen_accuracy": 0.8807318806648254, "mean_token_accuracy": 0.9082400798797607, "num_tokens": 268955447.0, "sample_num_tokens": 7706.75, "step": 9494, "total_num_tokens": 268986274.0, "z_loss": 0.00036591599928215146 }, { "copy_logits_max": -3.0637290477752686, "copy_logits_min": -750000000.0, "copy_num_tokens": 450.9375, "epoch": 1.9390860352310442, "gen_logits_max": 3.1136786937713623, "gen_logits_mean": -18.481307983398438, "gen_logits_min": -30.79749870300293, "gen_logits_std": 3.534899950027466, "gen_loss": 0.2694209814071655, "grad_norm": 0.37200016118291546, "learning_rate": 1.8936e-05, "loss": 0.293, "mean_copy_accuracy": 0.9967976063489914, "mean_gen_accuracy": 0.8701003044843674, "mean_token_accuracy": 0.9001584649085999, "num_tokens": 269213100.0, "sample_num_tokens": 7873.5, "step": 9495, "total_num_tokens": 269244594.0, "z_loss": 0.000407917657867074 }, { "copy_logits_max": -4.057802200317383, "copy_logits_min": -750000000.0, "copy_num_tokens": 239.0, "epoch": 1.939290273168241, "gen_logits_max": 4.858163833618164, "gen_logits_mean": -16.38338279724121, "gen_logits_min": -28.464984893798828, "gen_logits_std": 3.4097208976745605, "gen_loss": 0.26547983288764954, "grad_norm": 0.37662896550263175, "learning_rate": 1.8934736842105264e-05, "loss": 0.3064, "mean_copy_accuracy": 0.9970867931842804, "mean_gen_accuracy": 0.8727513253688812, "mean_token_accuracy": 0.8949279487133026, "num_tokens": 269465126.0, "sample_num_tokens": 7165.0, "step": 9496, "total_num_tokens": 269493786.0, "z_loss": 0.0003927767975255847 }, { "copy_logits_max": -5.086360931396484, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.875, "epoch": 1.9394945111054378, "gen_logits_max": 3.4512085914611816, "gen_logits_mean": -16.412342071533203, "gen_logits_min": -28.779144287109375, "gen_logits_std": 3.4160096645355225, "gen_loss": 0.2812309265136719, "grad_norm": 0.3548248786480864, "learning_rate": 1.8933473684210525e-05, "loss": 0.262, "mean_copy_accuracy": 0.9966122210025787, "mean_gen_accuracy": 0.8774465024471283, "mean_token_accuracy": 0.9108948111534119, "num_tokens": 269732660.0, "sample_num_tokens": 8519.5, "step": 9497, "total_num_tokens": 269766738.0, "z_loss": 0.00046766677405685186 }, { "copy_logits_max": -3.0871663093566895, "copy_logits_min": -750000000.0, "copy_num_tokens": 559.1875, "epoch": 1.9396987490426345, "gen_logits_max": 4.656246185302734, "gen_logits_mean": -15.114605903625488, "gen_logits_min": -27.61901092529297, "gen_logits_std": 3.3756792545318604, "gen_loss": 0.2521147131919861, "grad_norm": 0.32857649957159685, "learning_rate": 1.893221052631579e-05, "loss": 0.2609, "mean_copy_accuracy": 0.9977836906909943, "mean_gen_accuracy": 0.8816096186637878, "mean_token_accuracy": 0.9132843017578125, "num_tokens": 270017035.0, "sample_num_tokens": 9297.75, "step": 9498, "total_num_tokens": 270054226.0, "z_loss": 0.00040689940215088427 }, { "copy_logits_max": -4.734648704528809, "copy_logits_min": -750000000.0, "copy_num_tokens": 662.125, "epoch": 1.9399029869798317, "gen_logits_max": 3.1994333267211914, "gen_logits_mean": -17.22153091430664, "gen_logits_min": -29.562053680419922, "gen_logits_std": 3.475188970565796, "gen_loss": 0.2412211000919342, "grad_norm": 0.33204485663175554, "learning_rate": 1.8930947368421053e-05, "loss": 0.2509, "mean_copy_accuracy": 0.9974680244922638, "mean_gen_accuracy": 0.8826641589403152, "mean_token_accuracy": 0.9145642966032028, "num_tokens": 270304698.0, "sample_num_tokens": 9238.0, "step": 9499, "total_num_tokens": 270341650.0, "z_loss": 0.0004169286403339356 }, { "epoch": 1.9401072249170284, "grad_norm": 0.35455086351360643, "learning_rate": 1.8929684210526318e-05, "loss": 0.2861, "step": 9500 }, { "epoch": 1.9401072249170284, "eval_copy_logits_max": -8.597827911376953, "eval_copy_logits_min": -87.18696594238281, "eval_gen_logits_max": 2.173104763031006, "eval_gen_logits_mean": -22.854175567626953, "eval_gen_logits_min": -34.16182327270508, "eval_gen_logits_std": 3.6685314178466797, "eval_gen_loss": 0.2865749001502991, "eval_loss": 0.2655186951160431, "eval_mean_copy_accuracy": 0.9941351413726807, "eval_mean_gen_accuracy": 0.891503244638443, "eval_mean_token_accuracy": 0.9046418964862823, "eval_num_tokens": 270598966.0, "eval_runtime": 0.7914, "eval_samples_per_second": 10.109, "eval_steps_per_second": 2.527, "eval_total_num_tokens": 270598966.0, "eval_z_loss": 0.00042626686627045274, "step": 9500 }, { "copy_logits_max": -3.0636773109436035, "copy_logits_min": -687500032.0, "copy_num_tokens": 557.0625, "epoch": 1.940311462854225, "gen_logits_max": 3.3013007640838623, "gen_logits_mean": -16.809898376464844, "gen_logits_min": -29.411773681640625, "gen_logits_std": 3.4504196643829346, "gen_loss": 0.23855926096439362, "grad_norm": 0.32839764222778983, "learning_rate": 1.892842105263158e-05, "loss": 0.2735, "mean_copy_accuracy": 0.9971216470003128, "mean_gen_accuracy": 0.8765499144792557, "mean_token_accuracy": 0.9061908274888992, "num_tokens": 270852961.0, "sample_num_tokens": 9158.75, "step": 9501, "total_num_tokens": 270889596.0, "z_loss": 0.0003751358308363706 }, { "copy_logits_max": -4.658737659454346, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.75, "epoch": 1.940515700791422, "gen_logits_max": 4.181585311889648, "gen_logits_mean": -15.755476951599121, "gen_logits_min": -28.02722930908203, "gen_logits_std": 3.3844265937805176, "gen_loss": 0.31292325258255005, "grad_norm": 0.34360742329314453, "learning_rate": 1.8927157894736843e-05, "loss": 0.2627, "mean_copy_accuracy": 0.9970463216304779, "mean_gen_accuracy": 0.8762076795101166, "mean_token_accuracy": 0.9104826152324677, "num_tokens": 271150216.0, "sample_num_tokens": 7765.5, "step": 9502, "total_num_tokens": 271181278.0, "z_loss": 0.0005226065404713154 }, { "copy_logits_max": -6.338113307952881, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.4375, "epoch": 1.940719938728619, "gen_logits_max": 3.667661666870117, "gen_logits_mean": -17.02219009399414, "gen_logits_min": -28.854965209960938, "gen_logits_std": 3.414031744003296, "gen_loss": 0.27914920449256897, "grad_norm": 0.3672520850215811, "learning_rate": 1.8925894736842104e-05, "loss": 0.282, "mean_copy_accuracy": 0.9965894818305969, "mean_gen_accuracy": 0.8741137981414795, "mean_token_accuracy": 0.9041360765695572, "num_tokens": 271407740.0, "sample_num_tokens": 9240.0, "step": 9503, "total_num_tokens": 271444700.0, "z_loss": 0.00042097343248315156 }, { "copy_logits_max": -4.713050842285156, "copy_logits_min": -687500032.0, "copy_num_tokens": 447.5, "epoch": 1.9409241766658156, "gen_logits_max": 4.038780689239502, "gen_logits_mean": -16.980316162109375, "gen_logits_min": -28.77381134033203, "gen_logits_std": 3.4355106353759766, "gen_loss": 0.31106752157211304, "grad_norm": 0.35334394491486015, "learning_rate": 1.8924631578947368e-05, "loss": 0.2814, "mean_copy_accuracy": 0.996708333492279, "mean_gen_accuracy": 0.8770941644906998, "mean_token_accuracy": 0.9040123522281647, "num_tokens": 271664997.0, "sample_num_tokens": 8651.25, "step": 9504, "total_num_tokens": 271699602.0, "z_loss": 0.000496349239256233 }, { "copy_logits_max": -5.986110687255859, "copy_logits_min": -687500032.0, "copy_num_tokens": 497.125, "epoch": 1.9411284146030126, "gen_logits_max": 4.120077133178711, "gen_logits_mean": -15.643880844116211, "gen_logits_min": -28.44805145263672, "gen_logits_std": 3.402144432067871, "gen_loss": 0.2387240082025528, "grad_norm": 0.3604868792073107, "learning_rate": 1.8923368421052633e-05, "loss": 0.2795, "mean_copy_accuracy": 0.9976662844419479, "mean_gen_accuracy": 0.8736017197370529, "mean_token_accuracy": 0.9038469642400742, "num_tokens": 271937639.0, "sample_num_tokens": 8485.25, "step": 9505, "total_num_tokens": 271971580.0, "z_loss": 0.0004258647677488625 }, { "copy_logits_max": -5.730287551879883, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.75, "epoch": 1.9413326525402095, "gen_logits_max": 3.2575531005859375, "gen_logits_mean": -17.441593170166016, "gen_logits_min": -29.40315055847168, "gen_logits_std": 3.4924156665802, "gen_loss": 0.26242974400520325, "grad_norm": 0.34912620423813173, "learning_rate": 1.8922105263157893e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9968844950199127, "mean_gen_accuracy": 0.8766984045505524, "mean_token_accuracy": 0.9027231931686401, "num_tokens": 272206855.0, "sample_num_tokens": 9863.25, "step": 9506, "total_num_tokens": 272246308.0, "z_loss": 0.00041712785605341196 }, { "copy_logits_max": -6.099231243133545, "copy_logits_min": -750000064.0, "copy_num_tokens": 584.3125, "epoch": 1.9415368904774062, "gen_logits_max": 4.360645771026611, "gen_logits_mean": -15.081664085388184, "gen_logits_min": -27.140583038330078, "gen_logits_std": 3.3472139835357666, "gen_loss": 0.2740482985973358, "grad_norm": 0.363692341553381, "learning_rate": 1.892084210526316e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9972222298383713, "mean_gen_accuracy": 0.8755100518465042, "mean_token_accuracy": 0.9046586006879807, "num_tokens": 272479876.0, "sample_num_tokens": 9746.0, "step": 9507, "total_num_tokens": 272518860.0, "z_loss": 0.0004590275639202446 }, { "copy_logits_max": -6.2519941329956055, "copy_logits_min": -750000000.0, "copy_num_tokens": 318.8125, "epoch": 1.941741128414603, "gen_logits_max": 4.190394401550293, "gen_logits_mean": -16.68334197998047, "gen_logits_min": -28.686962127685547, "gen_logits_std": 3.4274067878723145, "gen_loss": 0.2931320369243622, "grad_norm": 0.3571644110966672, "learning_rate": 1.8919578947368422e-05, "loss": 0.2783, "mean_copy_accuracy": 0.9963319599628448, "mean_gen_accuracy": 0.8795678466558456, "mean_token_accuracy": 0.9040887653827667, "num_tokens": 272742149.0, "sample_num_tokens": 7559.75, "step": 9508, "total_num_tokens": 272772388.0, "z_loss": 0.00042523688171058893 }, { "copy_logits_max": -6.694467067718506, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.1875, "epoch": 1.9419453663517998, "gen_logits_max": 5.071599960327148, "gen_logits_mean": -15.04974365234375, "gen_logits_min": -27.389453887939453, "gen_logits_std": 3.375244617462158, "gen_loss": 0.3044603168964386, "grad_norm": 0.3610014692579377, "learning_rate": 1.8918315789473686e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9966671913862228, "mean_gen_accuracy": 0.8825863897800446, "mean_token_accuracy": 0.9084697514772415, "num_tokens": 273000858.0, "sample_num_tokens": 7484.0, "step": 9509, "total_num_tokens": 273030794.0, "z_loss": 0.00045508003677241504 }, { "copy_logits_max": -2.3240911960601807, "copy_logits_min": -687500032.0, "copy_num_tokens": 439.75, "epoch": 1.9421496042889967, "gen_logits_max": 3.319802761077881, "gen_logits_mean": -17.51750373840332, "gen_logits_min": -29.967973709106445, "gen_logits_std": 3.5241432189941406, "gen_loss": 0.25200819969177246, "grad_norm": 0.38887264355080026, "learning_rate": 1.8917052631578947e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9968448579311371, "mean_gen_accuracy": 0.8717009276151657, "mean_token_accuracy": 0.9015356302261353, "num_tokens": 273260749.0, "sample_num_tokens": 7662.75, "step": 9510, "total_num_tokens": 273291400.0, "z_loss": 0.00039976503467187285 }, { "copy_logits_max": -6.106796741485596, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.375, "epoch": 1.9423538422261934, "gen_logits_max": 4.861412048339844, "gen_logits_mean": -16.527067184448242, "gen_logits_min": -28.540157318115234, "gen_logits_std": 3.438453197479248, "gen_loss": 0.28373923897743225, "grad_norm": 0.3584792765965554, "learning_rate": 1.891578947368421e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9973014891147614, "mean_gen_accuracy": 0.878604531288147, "mean_token_accuracy": 0.907333254814148, "num_tokens": 273546610.0, "sample_num_tokens": 7698.5, "step": 9511, "total_num_tokens": 273577404.0, "z_loss": 0.00046405705506913364 }, { "copy_logits_max": -5.033483505249023, "copy_logits_min": -750000000.0, "copy_num_tokens": 388.9375, "epoch": 1.9425580801633904, "gen_logits_max": 4.711657524108887, "gen_logits_mean": -16.913692474365234, "gen_logits_min": -28.656978607177734, "gen_logits_std": 3.429229259490967, "gen_loss": 0.36683183908462524, "grad_norm": 0.3289953060005385, "learning_rate": 1.8914526315789473e-05, "loss": 0.2781, "mean_copy_accuracy": 0.9976424425840378, "mean_gen_accuracy": 0.8794434815645218, "mean_token_accuracy": 0.9057785719633102, "num_tokens": 273797600.0, "sample_num_tokens": 9115.0, "step": 9512, "total_num_tokens": 273834060.0, "z_loss": 0.0005106175085529685 }, { "copy_logits_max": -5.2012434005737305, "copy_logits_min": -625000000.0, "copy_num_tokens": 341.1875, "epoch": 1.9427623181005873, "gen_logits_max": 4.7292609214782715, "gen_logits_mean": -16.286571502685547, "gen_logits_min": -28.391103744506836, "gen_logits_std": 3.4130430221557617, "gen_loss": 0.348701536655426, "grad_norm": 0.38250509867623966, "learning_rate": 1.8913263157894737e-05, "loss": 0.296, "mean_copy_accuracy": 0.9975608736276627, "mean_gen_accuracy": 0.864336684346199, "mean_token_accuracy": 0.8990875482559204, "num_tokens": 274072374.0, "sample_num_tokens": 8299.5, "step": 9513, "total_num_tokens": 274105572.0, "z_loss": 0.000513103324919939 }, { "copy_logits_max": -5.766358375549316, "copy_logits_min": -750000000.0, "copy_num_tokens": 468.625, "epoch": 1.942966556037784, "gen_logits_max": 4.448626518249512, "gen_logits_mean": -15.832151412963867, "gen_logits_min": -28.018901824951172, "gen_logits_std": 3.424313545227051, "gen_loss": 0.3015255928039551, "grad_norm": 0.3507159399465599, "learning_rate": 1.8911999999999998e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9972134232521057, "mean_gen_accuracy": 0.8785885274410248, "mean_token_accuracy": 0.9049414992332458, "num_tokens": 274333833.0, "sample_num_tokens": 8695.25, "step": 9514, "total_num_tokens": 274368614.0, "z_loss": 0.00042950708302669227 }, { "copy_logits_max": -6.726490020751953, "copy_logits_min": -750000000.0, "copy_num_tokens": 586.75, "epoch": 1.9431707939749807, "gen_logits_max": 4.001955032348633, "gen_logits_mean": -15.738212585449219, "gen_logits_min": -27.973445892333984, "gen_logits_std": 3.4201793670654297, "gen_loss": 0.27448129653930664, "grad_norm": 0.3574043982400481, "learning_rate": 1.8910736842105266e-05, "loss": 0.2654, "mean_copy_accuracy": 0.9974711835384369, "mean_gen_accuracy": 0.8762066066265106, "mean_token_accuracy": 0.9099363833665848, "num_tokens": 274626849.0, "sample_num_tokens": 9977.75, "step": 9515, "total_num_tokens": 274666760.0, "z_loss": 0.00041871360735967755 }, { "copy_logits_max": -7.864779949188232, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.25, "epoch": 1.9433750319121776, "gen_logits_max": 3.458674669265747, "gen_logits_mean": -17.746227264404297, "gen_logits_min": -30.332019805908203, "gen_logits_std": 3.517087459564209, "gen_loss": 0.26458820700645447, "grad_norm": 0.3483414237947522, "learning_rate": 1.8909473684210526e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9978356808423996, "mean_gen_accuracy": 0.8801000565290451, "mean_token_accuracy": 0.910355731844902, "num_tokens": 274906785.0, "sample_num_tokens": 7982.25, "step": 9516, "total_num_tokens": 274938714.0, "z_loss": 0.0004209335893392563 }, { "copy_logits_max": -5.947966575622559, "copy_logits_min": -750000000.0, "copy_num_tokens": 566.9375, "epoch": 1.9435792698493746, "gen_logits_max": 2.746469259262085, "gen_logits_mean": -17.35193634033203, "gen_logits_min": -29.651437759399414, "gen_logits_std": 3.492367744445801, "gen_loss": 0.31000375747680664, "grad_norm": 0.36303957556805494, "learning_rate": 1.890821052631579e-05, "loss": 0.2763, "mean_copy_accuracy": 0.9975920468568802, "mean_gen_accuracy": 0.8711168020963669, "mean_token_accuracy": 0.9039301127195358, "num_tokens": 275159845.0, "sample_num_tokens": 8522.25, "step": 9517, "total_num_tokens": 275193934.0, "z_loss": 0.0004364974738564342 }, { "copy_logits_max": -6.0680646896362305, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.875, "epoch": 1.9437835077865713, "gen_logits_max": 5.3652849197387695, "gen_logits_mean": -15.548463821411133, "gen_logits_min": -27.993391036987305, "gen_logits_std": 3.4529547691345215, "gen_loss": 0.24496491253376007, "grad_norm": 0.3866363825291079, "learning_rate": 1.8906947368421055e-05, "loss": 0.2778, "mean_copy_accuracy": 0.9970522522926331, "mean_gen_accuracy": 0.8792928904294968, "mean_token_accuracy": 0.9059295207262039, "num_tokens": 275420988.0, "sample_num_tokens": 6790.0, "step": 9518, "total_num_tokens": 275448148.0, "z_loss": 0.00035527109866961837 }, { "copy_logits_max": -5.318382263183594, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.5625, "epoch": 1.9439877457237682, "gen_logits_max": 4.277179718017578, "gen_logits_mean": -15.867589950561523, "gen_logits_min": -28.317344665527344, "gen_logits_std": 3.4142940044403076, "gen_loss": 0.30727434158325195, "grad_norm": 0.3631300341560526, "learning_rate": 1.8905684210526316e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9971665441989899, "mean_gen_accuracy": 0.8740233331918716, "mean_token_accuracy": 0.9040942192077637, "num_tokens": 275709619.0, "sample_num_tokens": 7965.25, "step": 9519, "total_num_tokens": 275741480.0, "z_loss": 0.0004802104376722127 }, { "copy_logits_max": -4.378137111663818, "copy_logits_min": -750000000.0, "copy_num_tokens": 584.875, "epoch": 1.9441919836609651, "gen_logits_max": 3.682668447494507, "gen_logits_mean": -16.20256233215332, "gen_logits_min": -28.524885177612305, "gen_logits_std": 3.452021360397339, "gen_loss": 0.2879161834716797, "grad_norm": 0.35258610889794195, "learning_rate": 1.890442105263158e-05, "loss": 0.2788, "mean_copy_accuracy": 0.9971665292978287, "mean_gen_accuracy": 0.8715812861919403, "mean_token_accuracy": 0.9062584340572357, "num_tokens": 275992838.0, "sample_num_tokens": 9243.5, "step": 9520, "total_num_tokens": 276029812.0, "z_loss": 0.0004042913787998259 }, { "copy_logits_max": -2.89044189453125, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.125, "epoch": 1.9443962215981618, "gen_logits_max": 3.9964981079101562, "gen_logits_mean": -16.16730499267578, "gen_logits_min": -28.511043548583984, "gen_logits_std": 3.461714744567871, "gen_loss": 0.24035674333572388, "grad_norm": 0.3428810195321638, "learning_rate": 1.890315789473684e-05, "loss": 0.2491, "mean_copy_accuracy": 0.9972422122955322, "mean_gen_accuracy": 0.8846537470817566, "mean_token_accuracy": 0.9173981249332428, "num_tokens": 276271573.0, "sample_num_tokens": 9030.75, "step": 9521, "total_num_tokens": 276307696.0, "z_loss": 0.0003598985495045781 }, { "copy_logits_max": -5.39730167388916, "copy_logits_min": -750000000.0, "copy_num_tokens": 288.125, "epoch": 1.9446004595353585, "gen_logits_max": 4.6996169090271, "gen_logits_mean": -16.14193344116211, "gen_logits_min": -28.17677879333496, "gen_logits_std": 3.433103084564209, "gen_loss": 0.2776210904121399, "grad_norm": 0.31306691082232485, "learning_rate": 1.8901894736842106e-05, "loss": 0.2472, "mean_copy_accuracy": 0.9978403151035309, "mean_gen_accuracy": 0.8873706310987473, "mean_token_accuracy": 0.9149658381938934, "num_tokens": 276551899.0, "sample_num_tokens": 8151.25, "step": 9522, "total_num_tokens": 276584504.0, "z_loss": 0.0003910321684088558 }, { "copy_logits_max": -4.623985767364502, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.9375, "epoch": 1.9448046974725557, "gen_logits_max": 3.975064277648926, "gen_logits_mean": -15.176308631896973, "gen_logits_min": -27.67601776123047, "gen_logits_std": 3.4120655059814453, "gen_loss": 0.2586906850337982, "grad_norm": 0.37654243752409267, "learning_rate": 1.890063157894737e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9972069412469864, "mean_gen_accuracy": 0.8785920888185501, "mean_token_accuracy": 0.9058535099029541, "num_tokens": 276818237.0, "sample_num_tokens": 9053.25, "step": 9523, "total_num_tokens": 276854450.0, "z_loss": 0.00041474553290754557 }, { "copy_logits_max": -5.345402717590332, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.25, "epoch": 1.9450089354097524, "gen_logits_max": 3.2045974731445312, "gen_logits_mean": -17.775697708129883, "gen_logits_min": -29.947032928466797, "gen_logits_std": 3.5077266693115234, "gen_loss": 0.3059706687927246, "grad_norm": 0.3164313326460134, "learning_rate": 1.8899368421052634e-05, "loss": 0.2739, "mean_copy_accuracy": 0.9983056485652924, "mean_gen_accuracy": 0.8740970343351364, "mean_token_accuracy": 0.906408429145813, "num_tokens": 277126876.0, "sample_num_tokens": 9088.0, "step": 9524, "total_num_tokens": 277163228.0, "z_loss": 0.00047263814485631883 }, { "copy_logits_max": -6.807033061981201, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.125, "epoch": 1.945213173346949, "gen_logits_max": 3.3386168479919434, "gen_logits_mean": -17.710790634155273, "gen_logits_min": -29.89543914794922, "gen_logits_std": 3.5209109783172607, "gen_loss": 0.2677795886993408, "grad_norm": 0.3387556831696132, "learning_rate": 1.8898105263157895e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9970574826002121, "mean_gen_accuracy": 0.8780679255723953, "mean_token_accuracy": 0.9064495265483856, "num_tokens": 277407437.0, "sample_num_tokens": 9174.75, "step": 9525, "total_num_tokens": 277444136.0, "z_loss": 0.00040335365338250995 }, { "copy_logits_max": -3.7787086963653564, "copy_logits_min": -687500032.0, "copy_num_tokens": 585.25, "epoch": 1.945417411284146, "gen_logits_max": 3.8423516750335693, "gen_logits_mean": -15.966754913330078, "gen_logits_min": -28.550121307373047, "gen_logits_std": 3.4641857147216797, "gen_loss": 0.2899576723575592, "grad_norm": 0.3787513321382188, "learning_rate": 1.889684210526316e-05, "loss": 0.2766, "mean_copy_accuracy": 0.9972110241651535, "mean_gen_accuracy": 0.8741650432348251, "mean_token_accuracy": 0.9042396694421768, "num_tokens": 277679823.0, "sample_num_tokens": 9634.25, "step": 9526, "total_num_tokens": 277718360.0, "z_loss": 0.0004120668745599687 }, { "copy_logits_max": -5.102931976318359, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.25, "epoch": 1.945621649221343, "gen_logits_max": 3.8375515937805176, "gen_logits_mean": -16.67233657836914, "gen_logits_min": -28.82745361328125, "gen_logits_std": 3.4689486026763916, "gen_loss": 0.29384735226631165, "grad_norm": 0.3405815827124115, "learning_rate": 1.889557894736842e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9963240474462509, "mean_gen_accuracy": 0.8830011039972305, "mean_token_accuracy": 0.9073422402143478, "num_tokens": 277956490.0, "sample_num_tokens": 8143.5, "step": 9527, "total_num_tokens": 277989064.0, "z_loss": 0.00039996454142965376 }, { "copy_logits_max": -3.7530925273895264, "copy_logits_min": -750000000.0, "copy_num_tokens": 567.1875, "epoch": 1.9458258871585397, "gen_logits_max": 3.784684181213379, "gen_logits_mean": -16.71841812133789, "gen_logits_min": -29.149656295776367, "gen_logits_std": 3.4869322776794434, "gen_loss": 0.27127349376678467, "grad_norm": 0.35679854897486796, "learning_rate": 1.8894315789473685e-05, "loss": 0.2762, "mean_copy_accuracy": 0.9969512820243835, "mean_gen_accuracy": 0.8784199059009552, "mean_token_accuracy": 0.9070478677749634, "num_tokens": 278234469.0, "sample_num_tokens": 8866.25, "step": 9528, "total_num_tokens": 278269934.0, "z_loss": 0.00041066049016080797 }, { "copy_logits_max": -5.633495807647705, "copy_logits_min": -750000000.0, "copy_num_tokens": 623.4375, "epoch": 1.9460301250957366, "gen_logits_max": 3.571866989135742, "gen_logits_mean": -16.518285751342773, "gen_logits_min": -28.868873596191406, "gen_logits_std": 3.4617702960968018, "gen_loss": 0.254336416721344, "grad_norm": 0.343887450904772, "learning_rate": 1.8893052631578946e-05, "loss": 0.2561, "mean_copy_accuracy": 0.9974908977746964, "mean_gen_accuracy": 0.8816078454256058, "mean_token_accuracy": 0.9122833907604218, "num_tokens": 278510548.0, "sample_num_tokens": 9302.0, "step": 9529, "total_num_tokens": 278547756.0, "z_loss": 0.0003306693397462368 }, { "copy_logits_max": -6.095549583435059, "copy_logits_min": -687500032.0, "copy_num_tokens": 452.375, "epoch": 1.9462343630329335, "gen_logits_max": 3.051576614379883, "gen_logits_mean": -17.3948974609375, "gen_logits_min": -29.799365997314453, "gen_logits_std": 3.5021090507507324, "gen_loss": 0.2545056641101837, "grad_norm": 0.32112482411291465, "learning_rate": 1.889178947368421e-05, "loss": 0.2622, "mean_copy_accuracy": 0.9973649233579636, "mean_gen_accuracy": 0.8847345411777496, "mean_token_accuracy": 0.9106111228466034, "num_tokens": 278786503.0, "sample_num_tokens": 8254.75, "step": 9530, "total_num_tokens": 278819522.0, "z_loss": 0.00034521869383752346 }, { "copy_logits_max": -5.437082290649414, "copy_logits_min": -625000064.0, "copy_num_tokens": 605.1875, "epoch": 1.9464386009701302, "gen_logits_max": 2.648040294647217, "gen_logits_mean": -17.740447998046875, "gen_logits_min": -29.80185890197754, "gen_logits_std": 3.497587203979492, "gen_loss": 0.24335341155529022, "grad_norm": 0.3487585786693406, "learning_rate": 1.8890526315789474e-05, "loss": 0.2784, "mean_copy_accuracy": 0.9973720461130142, "mean_gen_accuracy": 0.8718628138303757, "mean_token_accuracy": 0.904908299446106, "num_tokens": 279060089.0, "sample_num_tokens": 9701.75, "step": 9531, "total_num_tokens": 279098896.0, "z_loss": 0.000341803883202374 }, { "copy_logits_max": -7.2748613357543945, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.0625, "epoch": 1.946642838907327, "gen_logits_max": 4.4195356369018555, "gen_logits_mean": -16.09813690185547, "gen_logits_min": -28.006101608276367, "gen_logits_std": 3.4100375175476074, "gen_loss": 0.2616896629333496, "grad_norm": 0.3498828794342507, "learning_rate": 1.888926315789474e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9969544857740402, "mean_gen_accuracy": 0.8765044659376144, "mean_token_accuracy": 0.9036387950181961, "num_tokens": 279323700.0, "sample_num_tokens": 8230.5, "step": 9532, "total_num_tokens": 279356622.0, "z_loss": 0.00037760293344035745 }, { "copy_logits_max": -2.337298631668091, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.8125, "epoch": 1.9468470768445238, "gen_logits_max": 4.813057899475098, "gen_logits_mean": -14.777143478393555, "gen_logits_min": -26.912893295288086, "gen_logits_std": 3.3635191917419434, "gen_loss": 0.31440412998199463, "grad_norm": 0.34817421461647025, "learning_rate": 1.8888000000000003e-05, "loss": 0.2907, "mean_copy_accuracy": 0.9972431361675262, "mean_gen_accuracy": 0.8724125027656555, "mean_token_accuracy": 0.9019908308982849, "num_tokens": 279612543.0, "sample_num_tokens": 8803.25, "step": 9533, "total_num_tokens": 279647756.0, "z_loss": 0.0005586115876212716 }, { "copy_logits_max": -6.888701915740967, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.9375, "epoch": 1.9470513147817208, "gen_logits_max": 3.3540687561035156, "gen_logits_mean": -18.031644821166992, "gen_logits_min": -30.321697235107422, "gen_logits_std": 3.497089385986328, "gen_loss": 0.2902020812034607, "grad_norm": 0.331611779496739, "learning_rate": 1.8886736842105264e-05, "loss": 0.2721, "mean_copy_accuracy": 0.9978143572807312, "mean_gen_accuracy": 0.8780558109283447, "mean_token_accuracy": 0.9081173241138458, "num_tokens": 279897582.0, "sample_num_tokens": 8389.5, "step": 9534, "total_num_tokens": 279931140.0, "z_loss": 0.00045050724293105304 }, { "copy_logits_max": -4.922690391540527, "copy_logits_min": -750000000.0, "copy_num_tokens": 365.9375, "epoch": 1.9472555527189175, "gen_logits_max": 4.087063312530518, "gen_logits_mean": -16.483604431152344, "gen_logits_min": -28.854747772216797, "gen_logits_std": 3.4195058345794678, "gen_loss": 0.28917980194091797, "grad_norm": 0.331377225040759, "learning_rate": 1.8885473684210528e-05, "loss": 0.2777, "mean_copy_accuracy": 0.9972023069858551, "mean_gen_accuracy": 0.8754993677139282, "mean_token_accuracy": 0.9068013280630112, "num_tokens": 280183254.0, "sample_num_tokens": 7840.0, "step": 9535, "total_num_tokens": 280214614.0, "z_loss": 0.0004887179820798337 }, { "copy_logits_max": -9.286443710327148, "copy_logits_min": -750000000.0, "copy_num_tokens": 240.875, "epoch": 1.9474597906561144, "gen_logits_max": 4.113054275512695, "gen_logits_mean": -18.855300903320312, "gen_logits_min": -30.502967834472656, "gen_logits_std": 3.5268070697784424, "gen_loss": 0.27812322974205017, "grad_norm": 0.3530483089755537, "learning_rate": 1.888421052631579e-05, "loss": 0.2701, "mean_copy_accuracy": 0.9972137063741684, "mean_gen_accuracy": 0.8838307857513428, "mean_token_accuracy": 0.9082019776105881, "num_tokens": 280450823.0, "sample_num_tokens": 7597.25, "step": 9536, "total_num_tokens": 280481212.0, "z_loss": 0.00041431363206356764 }, { "copy_logits_max": -9.086729049682617, "copy_logits_min": -750000000.0, "copy_num_tokens": 347.875, "epoch": 1.9476640285933113, "gen_logits_max": 3.488184928894043, "gen_logits_mean": -18.565797805786133, "gen_logits_min": -30.11079216003418, "gen_logits_std": 3.5093226432800293, "gen_loss": 0.25263890624046326, "grad_norm": 0.3586143626372877, "learning_rate": 1.8882947368421053e-05, "loss": 0.2552, "mean_copy_accuracy": 0.9976495057344437, "mean_gen_accuracy": 0.8865319192409515, "mean_token_accuracy": 0.9130288660526276, "num_tokens": 280714711.0, "sample_num_tokens": 8181.25, "step": 9537, "total_num_tokens": 280747436.0, "z_loss": 0.0003653883177321404 }, { "copy_logits_max": -7.614456653594971, "copy_logits_min": -750000000.0, "copy_num_tokens": 480.8125, "epoch": 1.947868266530508, "gen_logits_max": 1.9639779329299927, "gen_logits_mean": -19.259706497192383, "gen_logits_min": -31.020862579345703, "gen_logits_std": 3.5277962684631348, "gen_loss": 0.24998779594898224, "grad_norm": 0.34513407189497436, "learning_rate": 1.8881684210526314e-05, "loss": 0.2905, "mean_copy_accuracy": 0.997551366686821, "mean_gen_accuracy": 0.8753210455179214, "mean_token_accuracy": 0.9014983177185059, "num_tokens": 280983946.0, "sample_num_tokens": 8407.5, "step": 9538, "total_num_tokens": 281017576.0, "z_loss": 0.0003657614579424262 }, { "copy_logits_max": -6.75400972366333, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.8125, "epoch": 1.9480725044677047, "gen_logits_max": 3.8368120193481445, "gen_logits_mean": -17.501216888427734, "gen_logits_min": -29.393505096435547, "gen_logits_std": 3.4396963119506836, "gen_loss": 0.33919423818588257, "grad_norm": 0.34985560780912, "learning_rate": 1.888042105263158e-05, "loss": 0.2642, "mean_copy_accuracy": 0.9972182065248489, "mean_gen_accuracy": 0.8811003565788269, "mean_token_accuracy": 0.910752072930336, "num_tokens": 281261970.0, "sample_num_tokens": 8315.0, "step": 9539, "total_num_tokens": 281295230.0, "z_loss": 0.0005005085840821266 }, { "copy_logits_max": -5.879397392272949, "copy_logits_min": -750000000.0, "copy_num_tokens": 437.75, "epoch": 1.9482767424049017, "gen_logits_max": 3.112006425857544, "gen_logits_mean": -17.23648452758789, "gen_logits_min": -29.179258346557617, "gen_logits_std": 3.4745190143585205, "gen_loss": 0.2573978006839752, "grad_norm": 0.38147224702629773, "learning_rate": 1.8879157894736843e-05, "loss": 0.2835, "mean_copy_accuracy": 0.9973414987325668, "mean_gen_accuracy": 0.8752953261137009, "mean_token_accuracy": 0.9048239141702652, "num_tokens": 281510974.0, "sample_num_tokens": 7783.5, "step": 9540, "total_num_tokens": 281542108.0, "z_loss": 0.00037006480852141976 }, { "copy_logits_max": -7.597243785858154, "copy_logits_min": -687500032.0, "copy_num_tokens": 385.25, "epoch": 1.9484809803420986, "gen_logits_max": 4.368701934814453, "gen_logits_mean": -16.111677169799805, "gen_logits_min": -27.91812515258789, "gen_logits_std": 3.403862953186035, "gen_loss": 0.27269041538238525, "grad_norm": 0.3548438778384571, "learning_rate": 1.8877894736842107e-05, "loss": 0.2673, "mean_copy_accuracy": 0.9977418929338455, "mean_gen_accuracy": 0.8801963329315186, "mean_token_accuracy": 0.909436360001564, "num_tokens": 281778885.0, "sample_num_tokens": 7712.25, "step": 9541, "total_num_tokens": 281809734.0, "z_loss": 0.00038645468885079026 }, { "copy_logits_max": -6.935818672180176, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.3125, "epoch": 1.9486852182792953, "gen_logits_max": 2.6316239833831787, "gen_logits_mean": -17.032785415649414, "gen_logits_min": -29.066547393798828, "gen_logits_std": 3.3869833946228027, "gen_loss": 0.2782320976257324, "grad_norm": 0.3506764405081353, "learning_rate": 1.8876631578947368e-05, "loss": 0.2671, "mean_copy_accuracy": 0.9975386261940002, "mean_gen_accuracy": 0.8807232677936554, "mean_token_accuracy": 0.9077098667621613, "num_tokens": 282060521.0, "sample_num_tokens": 7797.75, "step": 9542, "total_num_tokens": 282091712.0, "z_loss": 0.0003670394071377814 }, { "copy_logits_max": -8.18695068359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.0, "epoch": 1.9488894562164922, "gen_logits_max": 3.1976490020751953, "gen_logits_mean": -18.142688751220703, "gen_logits_min": -29.933738708496094, "gen_logits_std": 3.4783811569213867, "gen_loss": 0.28893500566482544, "grad_norm": 0.3536629011552951, "learning_rate": 1.8875368421052632e-05, "loss": 0.2585, "mean_copy_accuracy": 0.9965638518333435, "mean_gen_accuracy": 0.8786000460386276, "mean_token_accuracy": 0.9116320312023163, "num_tokens": 282349314.0, "sample_num_tokens": 7864.0, "step": 9543, "total_num_tokens": 282380770.0, "z_loss": 0.00035913405008614063 }, { "copy_logits_max": -5.763451099395752, "copy_logits_min": -687500032.0, "copy_num_tokens": 412.25, "epoch": 1.9490936941536892, "gen_logits_max": 3.2931389808654785, "gen_logits_mean": -16.485946655273438, "gen_logits_min": -28.585548400878906, "gen_logits_std": 3.426316261291504, "gen_loss": 0.2827715277671814, "grad_norm": 0.3770208260032878, "learning_rate": 1.8874105263157893e-05, "loss": 0.2845, "mean_copy_accuracy": 0.9967972636222839, "mean_gen_accuracy": 0.8727499544620514, "mean_token_accuracy": 0.9030321389436722, "num_tokens": 282617286.0, "sample_num_tokens": 7620.5, "step": 9544, "total_num_tokens": 282647768.0, "z_loss": 0.00036971468944102526 }, { "copy_logits_max": -7.403352737426758, "copy_logits_min": -750000000.0, "copy_num_tokens": 544.8125, "epoch": 1.9492979320908859, "gen_logits_max": 3.362499475479126, "gen_logits_mean": -17.5457820892334, "gen_logits_min": -29.254817962646484, "gen_logits_std": 3.4614553451538086, "gen_loss": 0.29142192006111145, "grad_norm": 0.36466417993510164, "learning_rate": 1.8872842105263158e-05, "loss": 0.2991, "mean_copy_accuracy": 0.9974182397127151, "mean_gen_accuracy": 0.8676282316446304, "mean_token_accuracy": 0.8989962935447693, "num_tokens": 282878769.0, "sample_num_tokens": 10432.25, "step": 9545, "total_num_tokens": 282920498.0, "z_loss": 0.00040008011274039745 }, { "copy_logits_max": -7.803421974182129, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.0625, "epoch": 1.9495021700280826, "gen_logits_max": 2.94404673576355, "gen_logits_mean": -19.02350616455078, "gen_logits_min": -30.81206512451172, "gen_logits_std": 3.4861721992492676, "gen_loss": 0.3145415186882019, "grad_norm": 0.3870583167537139, "learning_rate": 1.8871578947368422e-05, "loss": 0.2929, "mean_copy_accuracy": 0.997484415769577, "mean_gen_accuracy": 0.8693543374538422, "mean_token_accuracy": 0.8996445089578629, "num_tokens": 283154094.0, "sample_num_tokens": 8211.5, "step": 9546, "total_num_tokens": 283186940.0, "z_loss": 0.00042232486885041 }, { "copy_logits_max": -5.154139041900635, "copy_logits_min": -750000000.0, "copy_num_tokens": 525.125, "epoch": 1.9497064079652795, "gen_logits_max": 3.584404230117798, "gen_logits_mean": -15.827922821044922, "gen_logits_min": -28.14910888671875, "gen_logits_std": 3.398317575454712, "gen_loss": 0.2828727662563324, "grad_norm": 0.34464963485006195, "learning_rate": 1.8870315789473683e-05, "loss": 0.2748, "mean_copy_accuracy": 0.9974103420972824, "mean_gen_accuracy": 0.8752137720584869, "mean_token_accuracy": 0.905164048075676, "num_tokens": 283424979.0, "sample_num_tokens": 8631.75, "step": 9547, "total_num_tokens": 283459506.0, "z_loss": 0.00041953581967391074 }, { "copy_logits_max": -5.042232036590576, "copy_logits_min": -750000064.0, "copy_num_tokens": 726.1875, "epoch": 1.9499106459024764, "gen_logits_max": 2.569878101348877, "gen_logits_mean": -16.5604248046875, "gen_logits_min": -29.501880645751953, "gen_logits_std": 3.410151720046997, "gen_loss": 0.25930148363113403, "grad_norm": 0.35904661363695245, "learning_rate": 1.886905263157895e-05, "loss": 0.2696, "mean_copy_accuracy": 0.997728243470192, "mean_gen_accuracy": 0.8738935142755508, "mean_token_accuracy": 0.9096080660820007, "num_tokens": 283687882.0, "sample_num_tokens": 9897.5, "step": 9548, "total_num_tokens": 283727472.0, "z_loss": 0.00041073333704844117 }, { "copy_logits_max": -5.187478065490723, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.875, "epoch": 1.9501148838396731, "gen_logits_max": 2.531982421875, "gen_logits_mean": -17.560779571533203, "gen_logits_min": -29.96245765686035, "gen_logits_std": 3.47774076461792, "gen_loss": 0.2713949680328369, "grad_norm": 0.3486630656025055, "learning_rate": 1.886778947368421e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9979829788208008, "mean_gen_accuracy": 0.8754814267158508, "mean_token_accuracy": 0.9107352942228317, "num_tokens": 283990858.0, "sample_num_tokens": 8466.0, "step": 9549, "total_num_tokens": 284024722.0, "z_loss": 0.00044935091864317656 }, { "copy_logits_max": -5.195061683654785, "copy_logits_min": -750000000.0, "copy_num_tokens": 482.125, "epoch": 1.95031912177687, "gen_logits_max": 4.002476692199707, "gen_logits_mean": -15.820253372192383, "gen_logits_min": -28.139427185058594, "gen_logits_std": 3.400075912475586, "gen_loss": 0.2565004229545593, "grad_norm": 0.3532450761409695, "learning_rate": 1.8866526315789476e-05, "loss": 0.2635, "mean_copy_accuracy": 0.9971872568130493, "mean_gen_accuracy": 0.8759889453649521, "mean_token_accuracy": 0.9109794944524765, "num_tokens": 284284610.0, "sample_num_tokens": 7907.5, "step": 9550, "total_num_tokens": 284316240.0, "z_loss": 0.0004005558439530432 }, { "copy_logits_max": -5.806757926940918, "copy_logits_min": -687500032.0, "copy_num_tokens": 363.125, "epoch": 1.950523359714067, "gen_logits_max": 3.308622360229492, "gen_logits_mean": -16.73441505432129, "gen_logits_min": -28.79538345336914, "gen_logits_std": 3.4083657264709473, "gen_loss": 0.28994354605674744, "grad_norm": 0.36486296542934904, "learning_rate": 1.8865263157894737e-05, "loss": 0.2836, "mean_copy_accuracy": 0.9973873794078827, "mean_gen_accuracy": 0.8733821213245392, "mean_token_accuracy": 0.9039495587348938, "num_tokens": 284548846.0, "sample_num_tokens": 7182.0, "step": 9551, "total_num_tokens": 284577574.0, "z_loss": 0.00043187278788536787 }, { "copy_logits_max": -3.2922937870025635, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.6875, "epoch": 1.9507275976512637, "gen_logits_max": 3.605269432067871, "gen_logits_mean": -15.668009757995605, "gen_logits_min": -28.014978408813477, "gen_logits_std": 3.3609726428985596, "gen_loss": 0.25453057885169983, "grad_norm": 0.3416765723202142, "learning_rate": 1.8864e-05, "loss": 0.2748, "mean_copy_accuracy": 0.997493103146553, "mean_gen_accuracy": 0.876315250992775, "mean_token_accuracy": 0.9075263738632202, "num_tokens": 284805770.0, "sample_num_tokens": 7449.0, "step": 9552, "total_num_tokens": 284835566.0, "z_loss": 0.000405732513172552 }, { "copy_logits_max": -6.547890663146973, "copy_logits_min": -750000000.0, "copy_num_tokens": 563.9375, "epoch": 1.9509318355884604, "gen_logits_max": 2.5243420600891113, "gen_logits_mean": -18.489702224731445, "gen_logits_min": -30.48200225830078, "gen_logits_std": 3.52823543548584, "gen_loss": 0.23682734370231628, "grad_norm": 0.3478716633792574, "learning_rate": 1.8862736842105262e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9972957521677017, "mean_gen_accuracy": 0.8857689946889877, "mean_token_accuracy": 0.9089907854795456, "num_tokens": 285068866.0, "sample_num_tokens": 9997.0, "step": 9553, "total_num_tokens": 285108854.0, "z_loss": 0.0003524178173393011 }, { "copy_logits_max": -6.403915882110596, "copy_logits_min": -687500032.0, "copy_num_tokens": 553.0625, "epoch": 1.9511360735256575, "gen_logits_max": 3.4837307929992676, "gen_logits_mean": -16.465946197509766, "gen_logits_min": -29.196239471435547, "gen_logits_std": 3.414158582687378, "gen_loss": 0.2539820373058319, "grad_norm": 0.3314838111786099, "learning_rate": 1.8861473684210526e-05, "loss": 0.2462, "mean_copy_accuracy": 0.9976040422916412, "mean_gen_accuracy": 0.8777459412813187, "mean_token_accuracy": 0.9160578101873398, "num_tokens": 285361036.0, "sample_num_tokens": 8808.5, "step": 9554, "total_num_tokens": 285396270.0, "z_loss": 0.000439014082076028 }, { "copy_logits_max": -3.0308761596679688, "copy_logits_min": -750000000.0, "copy_num_tokens": 471.8125, "epoch": 1.9513403114628542, "gen_logits_max": 4.011621952056885, "gen_logits_mean": -16.186954498291016, "gen_logits_min": -28.30459976196289, "gen_logits_std": 3.4295458793640137, "gen_loss": 0.28891420364379883, "grad_norm": 0.3638358168621438, "learning_rate": 1.8860210526315787e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9964757561683655, "mean_gen_accuracy": 0.8836638331413269, "mean_token_accuracy": 0.9123770594596863, "num_tokens": 285620630.0, "sample_num_tokens": 7845.0, "step": 9555, "total_num_tokens": 285652010.0, "z_loss": 0.0004477985203266144 }, { "copy_logits_max": -3.6048622131347656, "copy_logits_min": -750000000.0, "copy_num_tokens": 394.375, "epoch": 1.951544549400051, "gen_logits_max": 2.772965431213379, "gen_logits_mean": -17.171241760253906, "gen_logits_min": -29.246280670166016, "gen_logits_std": 3.4036054611206055, "gen_loss": 0.2689190208911896, "grad_norm": 0.36622788961811625, "learning_rate": 1.8858947368421055e-05, "loss": 0.2834, "mean_copy_accuracy": 0.9969348311424255, "mean_gen_accuracy": 0.876455545425415, "mean_token_accuracy": 0.9043255150318146, "num_tokens": 285878109.0, "sample_num_tokens": 7357.25, "step": 9556, "total_num_tokens": 285907538.0, "z_loss": 0.0004319940344430506 }, { "copy_logits_max": -5.003026962280273, "copy_logits_min": -750000000.0, "copy_num_tokens": 389.875, "epoch": 1.9517487873372479, "gen_logits_max": 3.6275711059570312, "gen_logits_mean": -15.932212829589844, "gen_logits_min": -28.135786056518555, "gen_logits_std": 3.281559467315674, "gen_loss": 0.266868531703949, "grad_norm": 0.3296739461396255, "learning_rate": 1.8857684210526316e-05, "loss": 0.2597, "mean_copy_accuracy": 0.9986430555582047, "mean_gen_accuracy": 0.8832935243844986, "mean_token_accuracy": 0.911238506436348, "num_tokens": 286133006.0, "sample_num_tokens": 8481.0, "step": 9557, "total_num_tokens": 286166930.0, "z_loss": 0.00039658971945755184 }, { "copy_logits_max": -2.9885377883911133, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.4375, "epoch": 1.9519530252744448, "gen_logits_max": 2.8466062545776367, "gen_logits_mean": -17.18903350830078, "gen_logits_min": -29.094257354736328, "gen_logits_std": 3.3950343132019043, "gen_loss": 0.3035196363925934, "grad_norm": 0.35373974314729156, "learning_rate": 1.885642105263158e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9971781820058823, "mean_gen_accuracy": 0.8736078590154648, "mean_token_accuracy": 0.9026636034250259, "num_tokens": 286411838.0, "sample_num_tokens": 8118.0, "step": 9558, "total_num_tokens": 286444310.0, "z_loss": 0.0004942055093124509 }, { "copy_logits_max": -4.2281575202941895, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.875, "epoch": 1.9521572632116415, "gen_logits_max": 3.5609865188598633, "gen_logits_mean": -16.720617294311523, "gen_logits_min": -29.05946922302246, "gen_logits_std": 3.3986222743988037, "gen_loss": 0.3088139295578003, "grad_norm": 0.3580242270445902, "learning_rate": 1.8855157894736844e-05, "loss": 0.2668, "mean_copy_accuracy": 0.9970695674419403, "mean_gen_accuracy": 0.8833548277616501, "mean_token_accuracy": 0.9099276065826416, "num_tokens": 286688827.0, "sample_num_tokens": 7981.75, "step": 9559, "total_num_tokens": 286720754.0, "z_loss": 0.0005379720823839307 }, { "copy_logits_max": -4.559402942657471, "copy_logits_min": -687500096.0, "copy_num_tokens": 569.0625, "epoch": 1.9523615011488384, "gen_logits_max": 3.2744996547698975, "gen_logits_mean": -17.043045043945312, "gen_logits_min": -29.415836334228516, "gen_logits_std": 3.448301315307617, "gen_loss": 0.24762526154518127, "grad_norm": 0.37852346679035576, "learning_rate": 1.8853894736842105e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9965942949056625, "mean_gen_accuracy": 0.8803713321685791, "mean_token_accuracy": 0.9082380086183548, "num_tokens": 286958765.0, "sample_num_tokens": 9322.25, "step": 9560, "total_num_tokens": 286996054.0, "z_loss": 0.0004263549344614148 }, { "copy_logits_max": -5.366358757019043, "copy_logits_min": -687500032.0, "copy_num_tokens": 498.25, "epoch": 1.9525657390860354, "gen_logits_max": 2.8227880001068115, "gen_logits_mean": -18.293060302734375, "gen_logits_min": -30.51064109802246, "gen_logits_std": 3.4989383220672607, "gen_loss": 0.2794263958930969, "grad_norm": 0.3360483363422506, "learning_rate": 1.885263157894737e-05, "loss": 0.2852, "mean_copy_accuracy": 0.9979134202003479, "mean_gen_accuracy": 0.8772858381271362, "mean_token_accuracy": 0.9038045257329941, "num_tokens": 287234808.0, "sample_num_tokens": 9618.5, "step": 9561, "total_num_tokens": 287273282.0, "z_loss": 0.00040749539039097726 }, { "copy_logits_max": -6.077946662902832, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.5625, "epoch": 1.952769977023232, "gen_logits_max": 3.26416277885437, "gen_logits_mean": -17.186614990234375, "gen_logits_min": -29.335800170898438, "gen_logits_std": 3.410339832305908, "gen_loss": 0.30052995681762695, "grad_norm": 0.3749766593744989, "learning_rate": 1.885136842105263e-05, "loss": 0.3026, "mean_copy_accuracy": 0.9980279803276062, "mean_gen_accuracy": 0.8696990460157394, "mean_token_accuracy": 0.8982074111700058, "num_tokens": 287505190.0, "sample_num_tokens": 7551.0, "step": 9562, "total_num_tokens": 287535394.0, "z_loss": 0.00042773885070346296 }, { "copy_logits_max": -2.2946014404296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 379.375, "epoch": 1.9529742149604288, "gen_logits_max": 3.406967878341675, "gen_logits_mean": -16.68840789794922, "gen_logits_min": -28.975263595581055, "gen_logits_std": 3.4592084884643555, "gen_loss": 0.27750757336616516, "grad_norm": 0.3492453203026166, "learning_rate": 1.8850105263157895e-05, "loss": 0.281, "mean_copy_accuracy": 0.997589647769928, "mean_gen_accuracy": 0.8759437203407288, "mean_token_accuracy": 0.9051616042852402, "num_tokens": 287778987.0, "sample_num_tokens": 7232.25, "step": 9563, "total_num_tokens": 287807916.0, "z_loss": 0.0004065512912347913 }, { "copy_logits_max": -3.854531764984131, "copy_logits_min": -687500032.0, "copy_num_tokens": 464.9375, "epoch": 1.9531784528976257, "gen_logits_max": 4.681800365447998, "gen_logits_mean": -14.99760627746582, "gen_logits_min": -27.231103897094727, "gen_logits_std": 3.3362841606140137, "gen_loss": 0.2928119897842407, "grad_norm": 0.33665373488429645, "learning_rate": 1.884884210526316e-05, "loss": 0.2729, "mean_copy_accuracy": 0.997534304857254, "mean_gen_accuracy": 0.8764373958110809, "mean_token_accuracy": 0.9060420095920563, "num_tokens": 288057103.0, "sample_num_tokens": 9334.75, "step": 9564, "total_num_tokens": 288094442.0, "z_loss": 0.000433171633630991 }, { "copy_logits_max": -3.6521658897399902, "copy_logits_min": -750000000.0, "copy_num_tokens": 530.8125, "epoch": 1.9533826908348226, "gen_logits_max": 2.1222567558288574, "gen_logits_mean": -18.900684356689453, "gen_logits_min": -31.24724769592285, "gen_logits_std": 3.5971081256866455, "gen_loss": 0.23530903458595276, "grad_norm": 0.35800925750123025, "learning_rate": 1.8847578947368423e-05, "loss": 0.2668, "mean_copy_accuracy": 0.996771901845932, "mean_gen_accuracy": 0.8799858689308167, "mean_token_accuracy": 0.9082668274641037, "num_tokens": 288317323.0, "sample_num_tokens": 8482.75, "step": 9565, "total_num_tokens": 288351254.0, "z_loss": 0.0003560120821930468 }, { "copy_logits_max": -3.699671506881714, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.75, "epoch": 1.9535869287720193, "gen_logits_max": 2.789154291152954, "gen_logits_mean": -18.169784545898438, "gen_logits_min": -30.592628479003906, "gen_logits_std": 3.5434138774871826, "gen_loss": 0.2791004776954651, "grad_norm": 0.3632905498883546, "learning_rate": 1.8846315789473684e-05, "loss": 0.28, "mean_copy_accuracy": 0.9969947040081024, "mean_gen_accuracy": 0.878260150551796, "mean_token_accuracy": 0.9061298817396164, "num_tokens": 288600231.0, "sample_num_tokens": 8267.75, "step": 9566, "total_num_tokens": 288633302.0, "z_loss": 0.00037586147664114833 }, { "copy_logits_max": -4.6705780029296875, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.5625, "epoch": 1.9537911667092163, "gen_logits_max": 3.7798309326171875, "gen_logits_mean": -16.560794830322266, "gen_logits_min": -28.921722412109375, "gen_logits_std": 3.457406997680664, "gen_loss": 0.3103673458099365, "grad_norm": 0.3594831672341586, "learning_rate": 1.884505263157895e-05, "loss": 0.2941, "mean_copy_accuracy": 0.9973577708005905, "mean_gen_accuracy": 0.8718142509460449, "mean_token_accuracy": 0.9011581689119339, "num_tokens": 288869609.0, "sample_num_tokens": 8746.75, "step": 9567, "total_num_tokens": 288904596.0, "z_loss": 0.0004235817468725145 }, { "copy_logits_max": -2.9255833625793457, "copy_logits_min": -687500032.0, "copy_num_tokens": 572.0, "epoch": 1.9539954046464132, "gen_logits_max": 3.024960994720459, "gen_logits_mean": -17.584505081176758, "gen_logits_min": -30.04106903076172, "gen_logits_std": 3.5425286293029785, "gen_loss": 0.2535439133644104, "grad_norm": 0.3655533938229597, "learning_rate": 1.884378947368421e-05, "loss": 0.2527, "mean_copy_accuracy": 0.9968664795160294, "mean_gen_accuracy": 0.8871426582336426, "mean_token_accuracy": 0.9138184189796448, "num_tokens": 289134374.0, "sample_num_tokens": 9562.5, "step": 9568, "total_num_tokens": 289172624.0, "z_loss": 0.0003530476242303848 }, { "copy_logits_max": -1.2409420013427734, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.0, "epoch": 1.95419964258361, "gen_logits_max": 2.9083447456359863, "gen_logits_mean": -17.421432495117188, "gen_logits_min": -29.958969116210938, "gen_logits_std": 3.5158584117889404, "gen_loss": 0.2594185173511505, "grad_norm": 0.3733824070996829, "learning_rate": 1.8842526315789474e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9978808611631393, "mean_gen_accuracy": 0.8759655803442001, "mean_token_accuracy": 0.9113022983074188, "num_tokens": 289431105.0, "sample_num_tokens": 10182.25, "step": 9569, "total_num_tokens": 289471834.0, "z_loss": 0.00037604436511173844 }, { "copy_logits_max": -3.844106435775757, "copy_logits_min": -750000000.0, "copy_num_tokens": 510.75, "epoch": 1.9544038805208066, "gen_logits_max": 3.5486865043640137, "gen_logits_mean": -16.70912742614746, "gen_logits_min": -28.304954528808594, "gen_logits_std": 3.40444278717041, "gen_loss": 0.24631303548812866, "grad_norm": 0.33589175721245385, "learning_rate": 1.8841263157894735e-05, "loss": 0.2605, "mean_copy_accuracy": 0.9976758807897568, "mean_gen_accuracy": 0.8806247413158417, "mean_token_accuracy": 0.9120721071958542, "num_tokens": 289705788.0, "sample_num_tokens": 8608.0, "step": 9570, "total_num_tokens": 289740220.0, "z_loss": 0.00032699829898774624 }, { "copy_logits_max": -0.45388758182525635, "copy_logits_min": -750000064.0, "copy_num_tokens": 365.4375, "epoch": 1.9546081184580035, "gen_logits_max": 6.113526344299316, "gen_logits_mean": -14.648786544799805, "gen_logits_min": -26.944293975830078, "gen_logits_std": 3.3423609733581543, "gen_loss": 0.2883385419845581, "grad_norm": 0.35640828823522325, "learning_rate": 1.884e-05, "loss": 0.264, "mean_copy_accuracy": 0.9972513169050217, "mean_gen_accuracy": 0.8804522305727005, "mean_token_accuracy": 0.9103009104728699, "num_tokens": 289993499.0, "sample_num_tokens": 7259.75, "step": 9571, "total_num_tokens": 290022538.0, "z_loss": 0.0004230911727063358 }, { "copy_logits_max": -0.6644433736801147, "copy_logits_min": -687500032.0, "copy_num_tokens": 608.5, "epoch": 1.9548123563952005, "gen_logits_max": 1.7622789144515991, "gen_logits_mean": -19.12723159790039, "gen_logits_min": -32.11549377441406, "gen_logits_std": 3.6077721118927, "gen_loss": 0.2746712565422058, "grad_norm": 0.3382606889373131, "learning_rate": 1.8838736842105267e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9978577345609665, "mean_gen_accuracy": 0.8778876215219498, "mean_token_accuracy": 0.9106835126876831, "num_tokens": 290269789.0, "sample_num_tokens": 9125.75, "step": 9572, "total_num_tokens": 290306292.0, "z_loss": 0.00043053465196862817 }, { "copy_logits_max": 2.024672508239746, "copy_logits_min": -750000000.0, "copy_num_tokens": 335.5625, "epoch": 1.9550165943323972, "gen_logits_max": 5.501327037811279, "gen_logits_mean": -14.922857284545898, "gen_logits_min": -27.28907012939453, "gen_logits_std": 3.3920137882232666, "gen_loss": 0.31719571352005005, "grad_norm": 0.40731749161695824, "learning_rate": 1.8837473684210528e-05, "loss": 0.2626, "mean_copy_accuracy": 0.997240886092186, "mean_gen_accuracy": 0.8823089152574539, "mean_token_accuracy": 0.9099578261375427, "num_tokens": 290545464.0, "sample_num_tokens": 7848.5, "step": 9573, "total_num_tokens": 290576858.0, "z_loss": 0.0004706248000729829 }, { "copy_logits_max": 2.218946695327759, "copy_logits_min": -750000000.0, "copy_num_tokens": 529.75, "epoch": 1.955220832269594, "gen_logits_max": 5.014466285705566, "gen_logits_mean": -14.510435104370117, "gen_logits_min": -27.37801742553711, "gen_logits_std": 3.407457113265991, "gen_loss": 0.2552330493927002, "grad_norm": 0.38004952030204253, "learning_rate": 1.8836210526315792e-05, "loss": 0.2902, "mean_copy_accuracy": 0.9957726895809174, "mean_gen_accuracy": 0.8756680339574814, "mean_token_accuracy": 0.9005624651908875, "num_tokens": 290808052.0, "sample_num_tokens": 8772.0, "step": 9574, "total_num_tokens": 290843140.0, "z_loss": 0.00040943361818790436 }, { "copy_logits_max": 0.5190812349319458, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.0, "epoch": 1.955425070206791, "gen_logits_max": 4.779733657836914, "gen_logits_mean": -15.997596740722656, "gen_logits_min": -28.618011474609375, "gen_logits_std": 3.4664466381073, "gen_loss": 0.2978917956352234, "grad_norm": 0.3335049883155493, "learning_rate": 1.8834947368421053e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9974851310253143, "mean_gen_accuracy": 0.8722975999116898, "mean_token_accuracy": 0.9052934050559998, "num_tokens": 291084560.0, "sample_num_tokens": 9638.5, "step": 9575, "total_num_tokens": 291123114.0, "z_loss": 0.0004385349457152188 }, { "copy_logits_max": 1.1492221355438232, "copy_logits_min": -750000000.0, "copy_num_tokens": 602.5625, "epoch": 1.9556293081439877, "gen_logits_max": 4.908553123474121, "gen_logits_mean": -15.36673355102539, "gen_logits_min": -28.310081481933594, "gen_logits_std": 3.4598820209503174, "gen_loss": 0.26221638917922974, "grad_norm": 0.34952295442405273, "learning_rate": 1.8833684210526317e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9962496757507324, "mean_gen_accuracy": 0.8840192407369614, "mean_token_accuracy": 0.9101332426071167, "num_tokens": 291361850.0, "sample_num_tokens": 8797.0, "step": 9576, "total_num_tokens": 291397038.0, "z_loss": 0.0003916764399036765 }, { "copy_logits_max": 2.27585768699646, "copy_logits_min": -750000064.0, "copy_num_tokens": 395.9375, "epoch": 1.9558335460811844, "gen_logits_max": 5.948390960693359, "gen_logits_mean": -14.479914665222168, "gen_logits_min": -27.22291374206543, "gen_logits_std": 3.4200870990753174, "gen_loss": 0.26867368817329407, "grad_norm": 0.32887504131601863, "learning_rate": 1.8832421052631578e-05, "loss": 0.2608, "mean_copy_accuracy": 0.9965372830629349, "mean_gen_accuracy": 0.8799115717411041, "mean_token_accuracy": 0.9100642502307892, "num_tokens": 291638036.0, "sample_num_tokens": 7844.5, "step": 9577, "total_num_tokens": 291669414.0, "z_loss": 0.00043868349166586995 }, { "copy_logits_max": -3.5471272468566895, "copy_logits_min": -687500032.0, "copy_num_tokens": 318.4375, "epoch": 1.9560377840183816, "gen_logits_max": 4.434752941131592, "gen_logits_mean": -17.893207550048828, "gen_logits_min": -29.967872619628906, "gen_logits_std": 3.5222887992858887, "gen_loss": 0.3026297390460968, "grad_norm": 0.35803219149184917, "learning_rate": 1.8831157894736843e-05, "loss": 0.2882, "mean_copy_accuracy": 0.9971048980951309, "mean_gen_accuracy": 0.8756326884031296, "mean_token_accuracy": 0.9029947072267532, "num_tokens": 291889104.0, "sample_num_tokens": 7468.0, "step": 9578, "total_num_tokens": 291918976.0, "z_loss": 0.0004591307952068746 }, { "copy_logits_max": -2.0965421199798584, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.6875, "epoch": 1.9562420219555783, "gen_logits_max": 3.7288076877593994, "gen_logits_mean": -17.491680145263672, "gen_logits_min": -30.09076690673828, "gen_logits_std": 3.5278820991516113, "gen_loss": 0.288424015045166, "grad_norm": 0.34228486470858216, "learning_rate": 1.8829894736842104e-05, "loss": 0.2644, "mean_copy_accuracy": 0.9982027262449265, "mean_gen_accuracy": 0.8749217838048935, "mean_token_accuracy": 0.9092880189418793, "num_tokens": 292167974.0, "sample_num_tokens": 8605.0, "step": 9579, "total_num_tokens": 292202394.0, "z_loss": 0.0004690120695158839 }, { "copy_logits_max": -3.375765562057495, "copy_logits_min": -750000000.0, "copy_num_tokens": 445.875, "epoch": 1.956446259892775, "gen_logits_max": 3.8439698219299316, "gen_logits_mean": -17.438865661621094, "gen_logits_min": -29.668312072753906, "gen_logits_std": 3.5167415142059326, "gen_loss": 0.2991067171096802, "grad_norm": 0.3196085454530806, "learning_rate": 1.882863157894737e-05, "loss": 0.2636, "mean_copy_accuracy": 0.9977307468652725, "mean_gen_accuracy": 0.8787932991981506, "mean_token_accuracy": 0.9099133163690567, "num_tokens": 292451521.0, "sample_num_tokens": 8962.75, "step": 9580, "total_num_tokens": 292487372.0, "z_loss": 0.0004888004623353481 }, { "copy_logits_max": -2.6421892642974854, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.6875, "epoch": 1.956650497829972, "gen_logits_max": 5.0917840003967285, "gen_logits_mean": -15.749736785888672, "gen_logits_min": -27.958911895751953, "gen_logits_std": 3.445042610168457, "gen_loss": 0.26897984743118286, "grad_norm": 0.33910416292982926, "learning_rate": 1.8827368421052632e-05, "loss": 0.2761, "mean_copy_accuracy": 0.9976759254932404, "mean_gen_accuracy": 0.8810023218393326, "mean_token_accuracy": 0.9045413583517075, "num_tokens": 292711957.0, "sample_num_tokens": 8912.75, "step": 9581, "total_num_tokens": 292747608.0, "z_loss": 0.000395116803701967 }, { "copy_logits_max": -3.8451638221740723, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.75, "epoch": 1.9568547357671688, "gen_logits_max": 4.3254241943359375, "gen_logits_mean": -17.269729614257812, "gen_logits_min": -29.722694396972656, "gen_logits_std": 3.5202600955963135, "gen_loss": 0.27747511863708496, "grad_norm": 0.35341125302466286, "learning_rate": 1.8826105263157896e-05, "loss": 0.2737, "mean_copy_accuracy": 0.9974552243947983, "mean_gen_accuracy": 0.8781614899635315, "mean_token_accuracy": 0.9091831147670746, "num_tokens": 292986842.0, "sample_num_tokens": 9111.0, "step": 9582, "total_num_tokens": 293023286.0, "z_loss": 0.00041839579353109 }, { "copy_logits_max": -0.6499789953231812, "copy_logits_min": -687500032.0, "copy_num_tokens": 318.6875, "epoch": 1.9570589737043655, "gen_logits_max": 6.248047828674316, "gen_logits_mean": -14.369214057922363, "gen_logits_min": -26.574058532714844, "gen_logits_std": 3.4040002822875977, "gen_loss": 0.28973522782325745, "grad_norm": 0.33680790653518755, "learning_rate": 1.8824842105263157e-05, "loss": 0.2583, "mean_copy_accuracy": 0.9977879375219345, "mean_gen_accuracy": 0.8814268559217453, "mean_token_accuracy": 0.9114568084478378, "num_tokens": 293261479.0, "sample_num_tokens": 7376.75, "step": 9583, "total_num_tokens": 293290986.0, "z_loss": 0.00043027239735238254 }, { "copy_logits_max": -5.137090682983398, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.5, "epoch": 1.9572632116415625, "gen_logits_max": 4.025187969207764, "gen_logits_mean": -18.109283447265625, "gen_logits_min": -30.174266815185547, "gen_logits_std": 3.552792549133301, "gen_loss": 0.2249564677476883, "grad_norm": 0.3513915914101833, "learning_rate": 1.882357894736842e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9969166666269302, "mean_gen_accuracy": 0.8841470777988434, "mean_token_accuracy": 0.907129317522049, "num_tokens": 293513713.0, "sample_num_tokens": 9572.75, "step": 9584, "total_num_tokens": 293552004.0, "z_loss": 0.0003366188320796937 }, { "copy_logits_max": -1.6532312631607056, "copy_logits_min": -750000000.0, "copy_num_tokens": 424.125, "epoch": 1.9574674495787594, "gen_logits_max": 5.596887588500977, "gen_logits_mean": -15.232816696166992, "gen_logits_min": -27.339372634887695, "gen_logits_std": 3.454624652862549, "gen_loss": 0.2614350914955139, "grad_norm": 0.3179566648183861, "learning_rate": 1.8822315789473683e-05, "loss": 0.2557, "mean_copy_accuracy": 0.9973087012767792, "mean_gen_accuracy": 0.8831926882266998, "mean_token_accuracy": 0.9123476892709732, "num_tokens": 293801401.0, "sample_num_tokens": 8456.25, "step": 9585, "total_num_tokens": 293835226.0, "z_loss": 0.0003903471806552261 }, { "copy_logits_max": -2.007722854614258, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.5625, "epoch": 1.957671687515956, "gen_logits_max": 4.248978137969971, "gen_logits_mean": -17.06739044189453, "gen_logits_min": -29.260238647460938, "gen_logits_std": 3.504410982131958, "gen_loss": 0.2732114791870117, "grad_norm": 0.3471152542240764, "learning_rate": 1.8821052631578947e-05, "loss": 0.2743, "mean_copy_accuracy": 0.9964777678251266, "mean_gen_accuracy": 0.8801684528589249, "mean_token_accuracy": 0.9071463048458099, "num_tokens": 294062883.0, "sample_num_tokens": 8526.75, "step": 9586, "total_num_tokens": 294096990.0, "z_loss": 0.0004298973362892866 }, { "copy_logits_max": 2.087557315826416, "copy_logits_min": -750000000.0, "copy_num_tokens": 727.6875, "epoch": 1.9578759254531528, "gen_logits_max": 4.772796630859375, "gen_logits_mean": -13.808082580566406, "gen_logits_min": -26.407318115234375, "gen_logits_std": 3.4013614654541016, "gen_loss": 0.25504565238952637, "grad_norm": 0.35212497955819705, "learning_rate": 1.881978947368421e-05, "loss": 0.2815, "mean_copy_accuracy": 0.997497484087944, "mean_gen_accuracy": 0.8701343089342117, "mean_token_accuracy": 0.902710810303688, "num_tokens": 294340073.0, "sample_num_tokens": 9871.25, "step": 9587, "total_num_tokens": 294379558.0, "z_loss": 0.00038792239502072334 }, { "copy_logits_max": 0.4555888772010803, "copy_logits_min": -687500032.0, "copy_num_tokens": 599.3125, "epoch": 1.9580801633903497, "gen_logits_max": 3.9475605487823486, "gen_logits_mean": -15.999404907226562, "gen_logits_min": -28.329410552978516, "gen_logits_std": 3.494650363922119, "gen_loss": 0.2221742868423462, "grad_norm": 0.31367372525221654, "learning_rate": 1.8818526315789472e-05, "loss": 0.256, "mean_copy_accuracy": 0.9974191188812256, "mean_gen_accuracy": 0.8844600915908813, "mean_token_accuracy": 0.9117514342069626, "num_tokens": 294629820.0, "sample_num_tokens": 8737.0, "step": 9588, "total_num_tokens": 294664768.0, "z_loss": 0.00033808936132118106 }, { "copy_logits_max": 0.2623758316040039, "copy_logits_min": -750000128.0, "copy_num_tokens": 533.6875, "epoch": 1.9582844013275467, "gen_logits_max": 4.895725250244141, "gen_logits_mean": -14.813394546508789, "gen_logits_min": -26.968921661376953, "gen_logits_std": 3.432225227355957, "gen_loss": 0.2586895823478699, "grad_norm": 0.34955893331586374, "learning_rate": 1.881726315789474e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9981448799371719, "mean_gen_accuracy": 0.8732208758592606, "mean_token_accuracy": 0.905196025967598, "num_tokens": 294898280.0, "sample_num_tokens": 8372.5, "step": 9589, "total_num_tokens": 294931770.0, "z_loss": 0.0004183128185104579 }, { "copy_logits_max": 1.6392626762390137, "copy_logits_min": -750000064.0, "copy_num_tokens": 502.3125, "epoch": 1.9584886392647434, "gen_logits_max": 4.924281120300293, "gen_logits_mean": -14.583629608154297, "gen_logits_min": -27.098751068115234, "gen_logits_std": 3.424412965774536, "gen_loss": 0.25610941648483276, "grad_norm": 0.34027346555641885, "learning_rate": 1.8816e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9967913925647736, "mean_gen_accuracy": 0.8786298334598541, "mean_token_accuracy": 0.9061427563428879, "num_tokens": 295186227.0, "sample_num_tokens": 9665.25, "step": 9590, "total_num_tokens": 295224888.0, "z_loss": 0.00037486758083105087 }, { "copy_logits_max": 0.869162380695343, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.25, "epoch": 1.9586928772019403, "gen_logits_max": 6.0118489265441895, "gen_logits_mean": -14.327674865722656, "gen_logits_min": -26.717899322509766, "gen_logits_std": 3.390428066253662, "gen_loss": 0.319804310798645, "grad_norm": 0.3544692008133026, "learning_rate": 1.8814736842105265e-05, "loss": 0.2888, "mean_copy_accuracy": 0.9962728023529053, "mean_gen_accuracy": 0.8783497512340546, "mean_token_accuracy": 0.9020514637231827, "num_tokens": 295464159.0, "sample_num_tokens": 7741.75, "step": 9591, "total_num_tokens": 295495126.0, "z_loss": 0.0004785708151757717 }, { "copy_logits_max": -1.7211947441101074, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.0, "epoch": 1.9588971151391372, "gen_logits_max": 4.537554740905762, "gen_logits_mean": -15.679983139038086, "gen_logits_min": -28.037534713745117, "gen_logits_std": 3.4507784843444824, "gen_loss": 0.2885400354862213, "grad_norm": 0.347110833944079, "learning_rate": 1.8813473684210526e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9968582987785339, "mean_gen_accuracy": 0.8757695108652115, "mean_token_accuracy": 0.9040075838565826, "num_tokens": 295715826.0, "sample_num_tokens": 8921.0, "step": 9592, "total_num_tokens": 295751510.0, "z_loss": 0.00041314843110740185 }, { "copy_logits_max": -0.9607512950897217, "copy_logits_min": -750000128.0, "copy_num_tokens": 442.1875, "epoch": 1.959101353076334, "gen_logits_max": 3.5342416763305664, "gen_logits_mean": -17.24349594116211, "gen_logits_min": -29.664352416992188, "gen_logits_std": 3.550295352935791, "gen_loss": 0.2508351802825928, "grad_norm": 0.34371872145243165, "learning_rate": 1.881221052631579e-05, "loss": 0.2772, "mean_copy_accuracy": 0.9969186186790466, "mean_gen_accuracy": 0.8781006783246994, "mean_token_accuracy": 0.9078015685081482, "num_tokens": 295983798.0, "sample_num_tokens": 7557.0, "step": 9593, "total_num_tokens": 296014026.0, "z_loss": 0.00037347222678363323 }, { "copy_logits_max": -3.1783738136291504, "copy_logits_min": -750000064.0, "copy_num_tokens": 289.25, "epoch": 1.9593055910135306, "gen_logits_max": 3.9526548385620117, "gen_logits_mean": -17.45937156677246, "gen_logits_min": -29.766353607177734, "gen_logits_std": 3.526158332824707, "gen_loss": 0.2723442316055298, "grad_norm": 0.3315223514953627, "learning_rate": 1.881094736842105e-05, "loss": 0.2586, "mean_copy_accuracy": 0.9972736835479736, "mean_gen_accuracy": 0.8844795078039169, "mean_token_accuracy": 0.9121177941560745, "num_tokens": 296253818.0, "sample_num_tokens": 6757.5, "step": 9594, "total_num_tokens": 296280848.0, "z_loss": 0.0004091192677151412 }, { "copy_logits_max": -0.26866084337234497, "copy_logits_min": -687500032.0, "copy_num_tokens": 334.125, "epoch": 1.9595098289507276, "gen_logits_max": 4.756755828857422, "gen_logits_mean": -15.985296249389648, "gen_logits_min": -28.347196578979492, "gen_logits_std": 3.460547924041748, "gen_loss": 0.3003619909286499, "grad_norm": 0.35049213756183545, "learning_rate": 1.8809684210526316e-05, "loss": 0.2614, "mean_copy_accuracy": 0.9977234303951263, "mean_gen_accuracy": 0.8792207688093185, "mean_token_accuracy": 0.9097170233726501, "num_tokens": 296509510.0, "sample_num_tokens": 7334.5, "step": 9595, "total_num_tokens": 296538848.0, "z_loss": 0.0004423942300491035 }, { "copy_logits_max": -0.2159743309020996, "copy_logits_min": -687500032.0, "copy_num_tokens": 626.875, "epoch": 1.9597140668879245, "gen_logits_max": 3.716398239135742, "gen_logits_mean": -16.766525268554688, "gen_logits_min": -29.57955551147461, "gen_logits_std": 3.5454959869384766, "gen_loss": 0.23974603414535522, "grad_norm": 0.34668783305168704, "learning_rate": 1.8808421052631576e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9973371773958206, "mean_gen_accuracy": 0.8756864368915558, "mean_token_accuracy": 0.9074601233005524, "num_tokens": 296779128.0, "sample_num_tokens": 9535.0, "step": 9596, "total_num_tokens": 296817268.0, "z_loss": 0.0003742998815141618 }, { "copy_logits_max": -5.0549092292785645, "copy_logits_min": -687500096.0, "copy_num_tokens": 541.5625, "epoch": 1.9599183048251212, "gen_logits_max": 2.0748066902160645, "gen_logits_mean": -20.16851043701172, "gen_logits_min": -32.411956787109375, "gen_logits_std": 3.6561241149902344, "gen_loss": 0.2634598910808563, "grad_norm": 0.3502278141348468, "learning_rate": 1.8807157894736844e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9980243891477585, "mean_gen_accuracy": 0.8718635737895966, "mean_token_accuracy": 0.9054803401231766, "num_tokens": 297059338.0, "sample_num_tokens": 9142.5, "step": 9597, "total_num_tokens": 297095908.0, "z_loss": 0.00038637773832306266 }, { "copy_logits_max": -0.7976536154747009, "copy_logits_min": -750000000.0, "copy_num_tokens": 514.3125, "epoch": 1.9601225427623181, "gen_logits_max": 4.154314994812012, "gen_logits_mean": -17.12970542907715, "gen_logits_min": -29.575626373291016, "gen_logits_std": 3.5856540203094482, "gen_loss": 0.23269438743591309, "grad_norm": 0.34203863470003926, "learning_rate": 1.8805894736842105e-05, "loss": 0.2471, "mean_copy_accuracy": 0.9968084543943405, "mean_gen_accuracy": 0.8866623640060425, "mean_token_accuracy": 0.9145335257053375, "num_tokens": 297340506.0, "sample_num_tokens": 8864.5, "step": 9598, "total_num_tokens": 297375964.0, "z_loss": 0.0003304996935185045 }, { "copy_logits_max": -2.2378487586975098, "copy_logits_min": -750000128.0, "copy_num_tokens": 704.3125, "epoch": 1.960326780699515, "gen_logits_max": 2.184699058532715, "gen_logits_mean": -18.078433990478516, "gen_logits_min": -30.73695182800293, "gen_logits_std": 3.592992067337036, "gen_loss": 0.23960968852043152, "grad_norm": 0.3666543015889635, "learning_rate": 1.880463157894737e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9969460964202881, "mean_gen_accuracy": 0.8754147291183472, "mean_token_accuracy": 0.9046676456928253, "num_tokens": 297590538.0, "sample_num_tokens": 9162.0, "step": 9599, "total_num_tokens": 297627186.0, "z_loss": 0.000365754880476743 }, { "copy_logits_max": -1.6425952911376953, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.3125, "epoch": 1.9605310186367118, "gen_logits_max": 4.809847831726074, "gen_logits_mean": -16.707233428955078, "gen_logits_min": -28.53823471069336, "gen_logits_std": 3.5006918907165527, "gen_loss": 0.29048314690589905, "grad_norm": 0.33440388842092317, "learning_rate": 1.8803368421052634e-05, "loss": 0.2596, "mean_copy_accuracy": 0.996917188167572, "mean_gen_accuracy": 0.8811673372983932, "mean_token_accuracy": 0.9094693660736084, "num_tokens": 297874962.0, "sample_num_tokens": 7575.0, "step": 9600, "total_num_tokens": 297905262.0, "z_loss": 0.0004164565762039274 }, { "copy_logits_max": -0.04485058784484863, "copy_logits_min": -750000000.0, "copy_num_tokens": 659.625, "epoch": 1.9607352565739085, "gen_logits_max": 4.076048851013184, "gen_logits_mean": -16.162105560302734, "gen_logits_min": -28.95552635192871, "gen_logits_std": 3.4815688133239746, "gen_loss": 0.24883735179901123, "grad_norm": 0.32215567091683595, "learning_rate": 1.8802105263157895e-05, "loss": 0.2603, "mean_copy_accuracy": 0.9973657876253128, "mean_gen_accuracy": 0.8853841572999954, "mean_token_accuracy": 0.9112477004528046, "num_tokens": 298138926.0, "sample_num_tokens": 10218.0, "step": 9601, "total_num_tokens": 298179798.0, "z_loss": 0.00036522909067571163 }, { "copy_logits_max": -1.8903827667236328, "copy_logits_min": -750000064.0, "copy_num_tokens": 490.125, "epoch": 1.9609394945111054, "gen_logits_max": 2.572878360748291, "gen_logits_mean": -19.454662322998047, "gen_logits_min": -31.34501838684082, "gen_logits_std": 3.6180319786071777, "gen_loss": 0.2637102007865906, "grad_norm": 0.3359156331814447, "learning_rate": 1.880084210526316e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9973457455635071, "mean_gen_accuracy": 0.873381108045578, "mean_token_accuracy": 0.9037521332502365, "num_tokens": 298417907.0, "sample_num_tokens": 8819.75, "step": 9602, "total_num_tokens": 298453186.0, "z_loss": 0.0004093764000572264 }, { "copy_logits_max": -0.47164803743362427, "copy_logits_min": -750000064.0, "copy_num_tokens": 345.75, "epoch": 1.9611437324483023, "gen_logits_max": 4.699815273284912, "gen_logits_mean": -16.940065383911133, "gen_logits_min": -29.234081268310547, "gen_logits_std": 3.5351738929748535, "gen_loss": 0.2871701717376709, "grad_norm": 0.3365290844074918, "learning_rate": 1.879957894736842e-05, "loss": 0.2659, "mean_copy_accuracy": 0.9976319819688797, "mean_gen_accuracy": 0.8817197382450104, "mean_token_accuracy": 0.9066967815160751, "num_tokens": 298691430.0, "sample_num_tokens": 7618.0, "step": 9603, "total_num_tokens": 298721902.0, "z_loss": 0.0004462058423087001 }, { "copy_logits_max": 0.9410295486450195, "copy_logits_min": -750000000.0, "copy_num_tokens": 330.0625, "epoch": 1.961347970385499, "gen_logits_max": 4.943007469177246, "gen_logits_mean": -16.521102905273438, "gen_logits_min": -28.44011688232422, "gen_logits_std": 3.5030224323272705, "gen_loss": 0.304531991481781, "grad_norm": 0.3339202817868534, "learning_rate": 1.8798315789473684e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9966203570365906, "mean_gen_accuracy": 0.8796013742685318, "mean_token_accuracy": 0.9070314019918442, "num_tokens": 298956549.0, "sample_num_tokens": 7080.25, "step": 9604, "total_num_tokens": 298984870.0, "z_loss": 0.0004483303055167198 }, { "copy_logits_max": 5.121779918670654, "copy_logits_min": -750000000.0, "copy_num_tokens": 611.0625, "epoch": 1.961552208322696, "gen_logits_max": 5.660468101501465, "gen_logits_mean": -13.779359817504883, "gen_logits_min": -25.997241973876953, "gen_logits_std": 3.435044050216675, "gen_loss": 0.23629382252693176, "grad_norm": 0.3249964835661565, "learning_rate": 1.879705263157895e-05, "loss": 0.2566, "mean_copy_accuracy": 0.997616708278656, "mean_gen_accuracy": 0.8836795091629028, "mean_token_accuracy": 0.9120734483003616, "num_tokens": 299222873.0, "sample_num_tokens": 9264.75, "step": 9605, "total_num_tokens": 299259932.0, "z_loss": 0.00033653172431513667 }, { "copy_logits_max": 2.408461332321167, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.75, "epoch": 1.9617564462598929, "gen_logits_max": 5.567340850830078, "gen_logits_mean": -14.969690322875977, "gen_logits_min": -26.751007080078125, "gen_logits_std": 3.431065559387207, "gen_loss": 0.3081325590610504, "grad_norm": 0.3297068750339448, "learning_rate": 1.8795789473684213e-05, "loss": 0.282, "mean_copy_accuracy": 0.9961719065904617, "mean_gen_accuracy": 0.8786214143037796, "mean_token_accuracy": 0.9044277966022491, "num_tokens": 299491675.0, "sample_num_tokens": 8787.25, "step": 9606, "total_num_tokens": 299526824.0, "z_loss": 0.0004600786487571895 }, { "copy_logits_max": -0.30654215812683105, "copy_logits_min": -625000064.0, "copy_num_tokens": 402.125, "epoch": 1.9619606841970896, "gen_logits_max": 4.67597770690918, "gen_logits_mean": -16.643882751464844, "gen_logits_min": -28.836214065551758, "gen_logits_std": 3.504413604736328, "gen_loss": 0.2799062728881836, "grad_norm": 0.352357631554241, "learning_rate": 1.8794526315789474e-05, "loss": 0.2643, "mean_copy_accuracy": 0.9973292648792267, "mean_gen_accuracy": 0.8784155249595642, "mean_token_accuracy": 0.9095659404993057, "num_tokens": 299785276.0, "sample_num_tokens": 7480.0, "step": 9607, "total_num_tokens": 299815196.0, "z_loss": 0.00042442468111403286 }, { "copy_logits_max": 0.646070659160614, "copy_logits_min": -750000000.0, "copy_num_tokens": 552.0, "epoch": 1.9621649221342863, "gen_logits_max": 5.385310649871826, "gen_logits_mean": -15.155939102172852, "gen_logits_min": -27.92131996154785, "gen_logits_std": 3.463042736053467, "gen_loss": 0.24875497817993164, "grad_norm": 0.30913872979304324, "learning_rate": 1.8793263157894738e-05, "loss": 0.2601, "mean_copy_accuracy": 0.9985590577125549, "mean_gen_accuracy": 0.8763611912727356, "mean_token_accuracy": 0.9117397964000702, "num_tokens": 300062724.0, "sample_num_tokens": 8633.5, "step": 9608, "total_num_tokens": 300097258.0, "z_loss": 0.0003720174718182534 }, { "copy_logits_max": 4.949911594390869, "copy_logits_min": -750000000.0, "copy_num_tokens": 535.75, "epoch": 1.9623691600714834, "gen_logits_max": 5.560026168823242, "gen_logits_mean": -14.059734344482422, "gen_logits_min": -26.361373901367188, "gen_logits_std": 3.449678659439087, "gen_loss": 0.2373426854610443, "grad_norm": 0.32220478808013225, "learning_rate": 1.8792e-05, "loss": 0.2515, "mean_copy_accuracy": 0.9976875633001328, "mean_gen_accuracy": 0.8847658634185791, "mean_token_accuracy": 0.9142958670854568, "num_tokens": 300339805.0, "sample_num_tokens": 8281.75, "step": 9609, "total_num_tokens": 300372932.0, "z_loss": 0.0003855165559798479 }, { "copy_logits_max": -3.7712035179138184, "copy_logits_min": -750000000.0, "copy_num_tokens": 327.4375, "epoch": 1.9625733980086801, "gen_logits_max": 3.716496706008911, "gen_logits_mean": -18.854915618896484, "gen_logits_min": -30.735614776611328, "gen_logits_std": 3.5505871772766113, "gen_loss": 0.2735773026943207, "grad_norm": 0.34505864608673337, "learning_rate": 1.8790736842105263e-05, "loss": 0.2798, "mean_copy_accuracy": 0.9962741434574127, "mean_gen_accuracy": 0.876397505402565, "mean_token_accuracy": 0.9039857536554337, "num_tokens": 300608400.0, "sample_num_tokens": 7347.5, "step": 9610, "total_num_tokens": 300637790.0, "z_loss": 0.0004051094001624733 }, { "copy_logits_max": 0.41422343254089355, "copy_logits_min": -750000000.0, "copy_num_tokens": 539.6875, "epoch": 1.9627776359458768, "gen_logits_max": 4.520966529846191, "gen_logits_mean": -16.38394546508789, "gen_logits_min": -28.765005111694336, "gen_logits_std": 3.4864187240600586, "gen_loss": 0.2458225041627884, "grad_norm": 0.321009027590206, "learning_rate": 1.8789473684210524e-05, "loss": 0.2598, "mean_copy_accuracy": 0.9978886991739273, "mean_gen_accuracy": 0.8829759359359741, "mean_token_accuracy": 0.912780687212944, "num_tokens": 300884563.0, "sample_num_tokens": 9420.75, "step": 9611, "total_num_tokens": 300922246.0, "z_loss": 0.0003607543767429888 }, { "copy_logits_max": -1.3369431495666504, "copy_logits_min": -750000000.0, "copy_num_tokens": 494.625, "epoch": 1.9629818738830738, "gen_logits_max": 3.992652654647827, "gen_logits_mean": -16.749187469482422, "gen_logits_min": -29.184738159179688, "gen_logits_std": 3.531068801879883, "gen_loss": 0.2599419355392456, "grad_norm": 0.36392641607915815, "learning_rate": 1.878821052631579e-05, "loss": 0.2685, "mean_copy_accuracy": 0.9963187277317047, "mean_gen_accuracy": 0.8806722313165665, "mean_token_accuracy": 0.9084291607141495, "num_tokens": 301146664.0, "sample_num_tokens": 8660.5, "step": 9612, "total_num_tokens": 301181306.0, "z_loss": 0.0004053384473081678 }, { "copy_logits_max": -0.06757551431655884, "copy_logits_min": -750000000.0, "copy_num_tokens": 619.5, "epoch": 1.9631861118202707, "gen_logits_max": 3.616528034210205, "gen_logits_mean": -17.004121780395508, "gen_logits_min": -29.3623104095459, "gen_logits_std": 3.514296054840088, "gen_loss": 0.2578797936439514, "grad_norm": 0.3653642218758959, "learning_rate": 1.8786947368421056e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9972302913665771, "mean_gen_accuracy": 0.8689136952161789, "mean_token_accuracy": 0.9011116772890091, "num_tokens": 301433813.0, "sample_num_tokens": 9746.75, "step": 9613, "total_num_tokens": 301472800.0, "z_loss": 0.00039373699110001326 }, { "copy_logits_max": 1.632309079170227, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.5625, "epoch": 1.9633903497574674, "gen_logits_max": 5.142227649688721, "gen_logits_mean": -15.475610733032227, "gen_logits_min": -28.39984893798828, "gen_logits_std": 3.4789481163024902, "gen_loss": 0.26394569873809814, "grad_norm": 0.3332951188104044, "learning_rate": 1.8785684210526317e-05, "loss": 0.2807, "mean_copy_accuracy": 0.9977804273366928, "mean_gen_accuracy": 0.8738107234239578, "mean_token_accuracy": 0.9038596004247665, "num_tokens": 301729021.0, "sample_num_tokens": 8270.25, "step": 9614, "total_num_tokens": 301762102.0, "z_loss": 0.0003802127030212432 }, { "copy_logits_max": -1.7376680374145508, "copy_logits_min": -687500032.0, "copy_num_tokens": 427.0625, "epoch": 1.9635945876946643, "gen_logits_max": 4.237196922302246, "gen_logits_mean": -17.06451988220215, "gen_logits_min": -29.026535034179688, "gen_logits_std": 3.5006775856018066, "gen_loss": 0.27958130836486816, "grad_norm": 0.33881184743743475, "learning_rate": 1.878442105263158e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9975107461214066, "mean_gen_accuracy": 0.8742655515670776, "mean_token_accuracy": 0.9102896898984909, "num_tokens": 302016900.0, "sample_num_tokens": 8411.0, "step": 9615, "total_num_tokens": 302050544.0, "z_loss": 0.0004297194245737046 }, { "copy_logits_max": -1.465395212173462, "copy_logits_min": -750000000.0, "copy_num_tokens": 369.125, "epoch": 1.9637988256318613, "gen_logits_max": 4.078252792358398, "gen_logits_mean": -16.188383102416992, "gen_logits_min": -27.780597686767578, "gen_logits_std": 3.3842854499816895, "gen_loss": 0.27057814598083496, "grad_norm": 0.37363348709855215, "learning_rate": 1.8783157894736842e-05, "loss": 0.2709, "mean_copy_accuracy": 0.997417077422142, "mean_gen_accuracy": 0.8788447082042694, "mean_token_accuracy": 0.9068478345870972, "num_tokens": 302267213.0, "sample_num_tokens": 7391.25, "step": 9616, "total_num_tokens": 302296778.0, "z_loss": 0.0004177719238214195 }, { "copy_logits_max": -0.5932557582855225, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.0, "epoch": 1.964003063569058, "gen_logits_max": 4.154098033905029, "gen_logits_mean": -16.540847778320312, "gen_logits_min": -28.725616455078125, "gen_logits_std": 3.449470043182373, "gen_loss": 0.25904378294944763, "grad_norm": 0.34025684015186025, "learning_rate": 1.8781894736842107e-05, "loss": 0.2641, "mean_copy_accuracy": 0.9975510686635971, "mean_gen_accuracy": 0.8810338526964188, "mean_token_accuracy": 0.9092163741588593, "num_tokens": 302512021.0, "sample_num_tokens": 8360.25, "step": 9617, "total_num_tokens": 302545462.0, "z_loss": 0.00039057634421624243 }, { "copy_logits_max": 2.0671942234039307, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.75, "epoch": 1.9642073015062547, "gen_logits_max": 4.234658241271973, "gen_logits_mean": -14.797523498535156, "gen_logits_min": -26.792455673217773, "gen_logits_std": 3.3586795330047607, "gen_loss": 0.30931025743484497, "grad_norm": 0.35084643479016503, "learning_rate": 1.8780631578947368e-05, "loss": 0.2713, "mean_copy_accuracy": 0.9974691867828369, "mean_gen_accuracy": 0.87734255194664, "mean_token_accuracy": 0.9097055494785309, "num_tokens": 302785736.0, "sample_num_tokens": 9180.0, "step": 9618, "total_num_tokens": 302822456.0, "z_loss": 0.0005069643957540393 }, { "copy_logits_max": -0.669303297996521, "copy_logits_min": -687500032.0, "copy_num_tokens": 547.25, "epoch": 1.9644115394434516, "gen_logits_max": 3.9132609367370605, "gen_logits_mean": -16.747081756591797, "gen_logits_min": -28.875171661376953, "gen_logits_std": 3.4568276405334473, "gen_loss": 0.25300800800323486, "grad_norm": 0.33958497838326346, "learning_rate": 1.8779368421052632e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9969076961278915, "mean_gen_accuracy": 0.8809234201908112, "mean_token_accuracy": 0.9091885834932327, "num_tokens": 303070177.0, "sample_num_tokens": 8583.25, "step": 9619, "total_num_tokens": 303104510.0, "z_loss": 0.0003531815600581467 }, { "copy_logits_max": -0.7223756909370422, "copy_logits_min": -687500032.0, "copy_num_tokens": 353.0, "epoch": 1.9646157773806485, "gen_logits_max": 5.089846611022949, "gen_logits_mean": -16.4420108795166, "gen_logits_min": -28.436161041259766, "gen_logits_std": 3.412891149520874, "gen_loss": 0.33274996280670166, "grad_norm": 0.3547766626629689, "learning_rate": 1.8778105263157893e-05, "loss": 0.2923, "mean_copy_accuracy": 0.9967031329870224, "mean_gen_accuracy": 0.8744074106216431, "mean_token_accuracy": 0.9012560844421387, "num_tokens": 303329153.0, "sample_num_tokens": 8446.75, "step": 9620, "total_num_tokens": 303362940.0, "z_loss": 0.0005066498415544629 }, { "copy_logits_max": -1.3983219861984253, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.125, "epoch": 1.9648200153178452, "gen_logits_max": 3.9034008979797363, "gen_logits_mean": -16.236616134643555, "gen_logits_min": -28.098276138305664, "gen_logits_std": 3.3660619258880615, "gen_loss": 0.2851271629333496, "grad_norm": 0.3274032549901915, "learning_rate": 1.877684210526316e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9977996051311493, "mean_gen_accuracy": 0.8759263902902603, "mean_token_accuracy": 0.9051208794116974, "num_tokens": 303598616.0, "sample_num_tokens": 7941.0, "step": 9621, "total_num_tokens": 303630380.0, "z_loss": 0.00040121408528648317 }, { "copy_logits_max": -0.6368099451065063, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.5, "epoch": 1.9650242532550422, "gen_logits_max": 3.8916609287261963, "gen_logits_mean": -16.3355712890625, "gen_logits_min": -28.55701446533203, "gen_logits_std": 3.4471864700317383, "gen_loss": 0.24079450964927673, "grad_norm": 0.32577100118290797, "learning_rate": 1.877557894736842e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9971330761909485, "mean_gen_accuracy": 0.8797626048326492, "mean_token_accuracy": 0.9067071676254272, "num_tokens": 303874470.0, "sample_num_tokens": 9511.5, "step": 9622, "total_num_tokens": 303912516.0, "z_loss": 0.00034039339516311884 }, { "copy_logits_max": -4.624917030334473, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.125, "epoch": 1.965228491192239, "gen_logits_max": 3.8044590950012207, "gen_logits_mean": -18.046520233154297, "gen_logits_min": -30.017593383789062, "gen_logits_std": 3.5009047985076904, "gen_loss": 0.2748900055885315, "grad_norm": 0.33440072408318355, "learning_rate": 1.8774315789473686e-05, "loss": 0.27, "mean_copy_accuracy": 0.9968710988759995, "mean_gen_accuracy": 0.8808888494968414, "mean_token_accuracy": 0.9075865298509598, "num_tokens": 304152245.0, "sample_num_tokens": 8094.75, "step": 9623, "total_num_tokens": 304184624.0, "z_loss": 0.0003897913557011634 }, { "copy_logits_max": -2.927288055419922, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.375, "epoch": 1.9654327291294358, "gen_logits_max": 2.3410017490386963, "gen_logits_mean": -19.818798065185547, "gen_logits_min": -31.795955657958984, "gen_logits_std": 3.604590654373169, "gen_loss": 0.2272508442401886, "grad_norm": 0.30940499076491557, "learning_rate": 1.8773052631578947e-05, "loss": 0.2452, "mean_copy_accuracy": 0.9971523731946945, "mean_gen_accuracy": 0.8904297947883606, "mean_token_accuracy": 0.9171018749475479, "num_tokens": 304440110.0, "sample_num_tokens": 7761.5, "step": 9624, "total_num_tokens": 304471156.0, "z_loss": 0.00034303037682548165 }, { "copy_logits_max": -1.8534600734710693, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.8125, "epoch": 1.9656369670666325, "gen_logits_max": 5.378101348876953, "gen_logits_mean": -15.002816200256348, "gen_logits_min": -27.617950439453125, "gen_logits_std": 3.427797317504883, "gen_loss": 0.2848367691040039, "grad_norm": 0.3559990973295993, "learning_rate": 1.877178947368421e-05, "loss": 0.2786, "mean_copy_accuracy": 0.997213676571846, "mean_gen_accuracy": 0.8753517121076584, "mean_token_accuracy": 0.9056556671857834, "num_tokens": 304701335.0, "sample_num_tokens": 9021.75, "step": 9625, "total_num_tokens": 304737422.0, "z_loss": 0.00043113695573993027 }, { "copy_logits_max": -3.7618584632873535, "copy_logits_min": -687500096.0, "copy_num_tokens": 305.0625, "epoch": 1.9658412050038294, "gen_logits_max": 3.890259265899658, "gen_logits_mean": -17.964248657226562, "gen_logits_min": -30.22317886352539, "gen_logits_std": 3.4942235946655273, "gen_loss": 0.3286667466163635, "grad_norm": 0.35011617685533664, "learning_rate": 1.8770526315789475e-05, "loss": 0.2981, "mean_copy_accuracy": 0.9972532987594604, "mean_gen_accuracy": 0.8721310794353485, "mean_token_accuracy": 0.8994500190019608, "num_tokens": 304977570.0, "sample_num_tokens": 7603.0, "step": 9626, "total_num_tokens": 305007982.0, "z_loss": 0.0005208419752307236 }, { "copy_logits_max": -4.55632209777832, "copy_logits_min": -750000000.0, "copy_num_tokens": 457.25, "epoch": 1.9660454429410263, "gen_logits_max": 2.2887213230133057, "gen_logits_mean": -18.71135139465332, "gen_logits_min": -30.524869918823242, "gen_logits_std": 3.512881278991699, "gen_loss": 0.25918474793434143, "grad_norm": 0.3360766649756659, "learning_rate": 1.8769263157894736e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9968082308769226, "mean_gen_accuracy": 0.8832024186849594, "mean_token_accuracy": 0.9107507914304733, "num_tokens": 305236909.0, "sample_num_tokens": 8381.25, "step": 9627, "total_num_tokens": 305270434.0, "z_loss": 0.0003625044773798436 }, { "copy_logits_max": -2.8083691596984863, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.6875, "epoch": 1.966249680878223, "gen_logits_max": 3.0734970569610596, "gen_logits_mean": -17.912267684936523, "gen_logits_min": -29.528493881225586, "gen_logits_std": 3.4679603576660156, "gen_loss": 0.28782808780670166, "grad_norm": 0.33430006981530586, "learning_rate": 1.8768e-05, "loss": 0.2742, "mean_copy_accuracy": 0.9973392635583878, "mean_gen_accuracy": 0.8740379810333252, "mean_token_accuracy": 0.9053047299385071, "num_tokens": 305515429.0, "sample_num_tokens": 8552.25, "step": 9628, "total_num_tokens": 305549638.0, "z_loss": 0.0003969239769503474 }, { "copy_logits_max": 0.020901590585708618, "copy_logits_min": -687500032.0, "copy_num_tokens": 522.75, "epoch": 1.96645391881542, "gen_logits_max": 3.8006324768066406, "gen_logits_mean": -16.142255783081055, "gen_logits_min": -28.020811080932617, "gen_logits_std": 3.4045026302337646, "gen_loss": 0.23712411522865295, "grad_norm": 0.3201816355775891, "learning_rate": 1.8766736842105265e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9967469125986099, "mean_gen_accuracy": 0.8823827356100082, "mean_token_accuracy": 0.9071344286203384, "num_tokens": 305791022.0, "sample_num_tokens": 8774.5, "step": 9629, "total_num_tokens": 305826120.0, "z_loss": 0.00035976627259515226 }, { "copy_logits_max": 0.39634236693382263, "copy_logits_min": -625000000.0, "copy_num_tokens": 521.0, "epoch": 1.966658156752617, "gen_logits_max": 3.881061553955078, "gen_logits_mean": -15.424232482910156, "gen_logits_min": -27.647642135620117, "gen_logits_std": 3.339989423751831, "gen_loss": 0.30145254731178284, "grad_norm": 0.3386206012662911, "learning_rate": 1.876547368421053e-05, "loss": 0.2767, "mean_copy_accuracy": 0.9982117414474487, "mean_gen_accuracy": 0.8784069865942001, "mean_token_accuracy": 0.9075965285301208, "num_tokens": 306066762.0, "sample_num_tokens": 8591.0, "step": 9630, "total_num_tokens": 306101126.0, "z_loss": 0.0004375097923912108 }, { "copy_logits_max": -4.042144298553467, "copy_logits_min": -750000064.0, "copy_num_tokens": 419.3125, "epoch": 1.9668623946898136, "gen_logits_max": 2.245327949523926, "gen_logits_mean": -18.700822830200195, "gen_logits_min": -30.611282348632812, "gen_logits_std": 3.499605417251587, "gen_loss": 0.2704106569290161, "grad_norm": 0.3453735789758502, "learning_rate": 1.876421052631579e-05, "loss": 0.2704, "mean_copy_accuracy": 0.9979125410318375, "mean_gen_accuracy": 0.8802791386842728, "mean_token_accuracy": 0.9090422093868256, "num_tokens": 306322426.0, "sample_num_tokens": 8100.0, "step": 9631, "total_num_tokens": 306354826.0, "z_loss": 0.0003859797725453973 }, { "copy_logits_max": -2.998694896697998, "copy_logits_min": -750000000.0, "copy_num_tokens": 674.625, "epoch": 1.9670666326270103, "gen_logits_max": 2.1373353004455566, "gen_logits_mean": -17.557079315185547, "gen_logits_min": -29.48062515258789, "gen_logits_std": 3.4242377281188965, "gen_loss": 0.22946998476982117, "grad_norm": 0.35345952402889874, "learning_rate": 1.8762947368421054e-05, "loss": 0.2492, "mean_copy_accuracy": 0.9976381659507751, "mean_gen_accuracy": 0.8848214894533157, "mean_token_accuracy": 0.914521649479866, "num_tokens": 306606562.0, "sample_num_tokens": 10291.5, "step": 9632, "total_num_tokens": 306647728.0, "z_loss": 0.00033336092019453645 }, { "copy_logits_max": -6.624017715454102, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.0625, "epoch": 1.9672708705642075, "gen_logits_max": 3.3068909645080566, "gen_logits_mean": -17.42841339111328, "gen_logits_min": -29.524633407592773, "gen_logits_std": 3.4422476291656494, "gen_loss": 0.27354955673217773, "grad_norm": 0.3673228719854886, "learning_rate": 1.8761684210526315e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9959812164306641, "mean_gen_accuracy": 0.880460187792778, "mean_token_accuracy": 0.9050539880990982, "num_tokens": 306873010.0, "sample_num_tokens": 7608.0, "step": 9633, "total_num_tokens": 306903442.0, "z_loss": 0.00040447473293170333 }, { "copy_logits_max": -4.864518165588379, "copy_logits_min": -750000000.0, "copy_num_tokens": 472.25, "epoch": 1.9674751085014042, "gen_logits_max": 2.788217544555664, "gen_logits_mean": -17.61305809020996, "gen_logits_min": -29.62554931640625, "gen_logits_std": 3.430126190185547, "gen_loss": 0.2896731495857239, "grad_norm": 0.3367748191431853, "learning_rate": 1.876042105263158e-05, "loss": 0.281, "mean_copy_accuracy": 0.9968028515577316, "mean_gen_accuracy": 0.8768487721681595, "mean_token_accuracy": 0.9035720229148865, "num_tokens": 307124650.0, "sample_num_tokens": 8485.5, "step": 9634, "total_num_tokens": 307158592.0, "z_loss": 0.0004470848070923239 }, { "copy_logits_max": -3.2011752128601074, "copy_logits_min": -687500032.0, "copy_num_tokens": 421.4375, "epoch": 1.9676793464386009, "gen_logits_max": 2.947779655456543, "gen_logits_mean": -17.15502166748047, "gen_logits_min": -29.308372497558594, "gen_logits_std": 3.446929931640625, "gen_loss": 0.30079376697540283, "grad_norm": 0.37958241017850297, "learning_rate": 1.875915789473684e-05, "loss": 0.2898, "mean_copy_accuracy": 0.9971287995576859, "mean_gen_accuracy": 0.8728093504905701, "mean_token_accuracy": 0.9028273224830627, "num_tokens": 307380353.0, "sample_num_tokens": 7514.25, "step": 9635, "total_num_tokens": 307410410.0, "z_loss": 0.0005216749850660563 }, { "copy_logits_max": -5.456049919128418, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.875, "epoch": 1.9678835843757978, "gen_logits_max": 2.3754639625549316, "gen_logits_mean": -18.016315460205078, "gen_logits_min": -30.23241424560547, "gen_logits_std": 3.4939541816711426, "gen_loss": 0.23757699131965637, "grad_norm": 0.31499903544796315, "learning_rate": 1.8757894736842105e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9974798858165741, "mean_gen_accuracy": 0.8811237066984177, "mean_token_accuracy": 0.9110742807388306, "num_tokens": 307654526.0, "sample_num_tokens": 8328.5, "step": 9636, "total_num_tokens": 307687840.0, "z_loss": 0.00035692015080712736 }, { "copy_logits_max": -5.631799697875977, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.0, "epoch": 1.9680878223129947, "gen_logits_max": 3.2248904705047607, "gen_logits_mean": -16.189952850341797, "gen_logits_min": -28.058748245239258, "gen_logits_std": 3.338294506072998, "gen_loss": 0.2760225832462311, "grad_norm": 0.3587100423997261, "learning_rate": 1.8756631578947366e-05, "loss": 0.2943, "mean_copy_accuracy": 0.9971050024032593, "mean_gen_accuracy": 0.8750165104866028, "mean_token_accuracy": 0.901781290769577, "num_tokens": 307917788.0, "sample_num_tokens": 9266.0, "step": 9637, "total_num_tokens": 307954852.0, "z_loss": 0.0004431141132954508 }, { "copy_logits_max": -7.058834552764893, "copy_logits_min": -750000000.0, "copy_num_tokens": 526.3125, "epoch": 1.9682920602501914, "gen_logits_max": 3.074735164642334, "gen_logits_mean": -18.02558135986328, "gen_logits_min": -29.989151000976562, "gen_logits_std": 3.4818692207336426, "gen_loss": 0.2606765031814575, "grad_norm": 0.34197055443439167, "learning_rate": 1.8755368421052634e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9969673603773117, "mean_gen_accuracy": 0.8801854252815247, "mean_token_accuracy": 0.9127231389284134, "num_tokens": 308198052.0, "sample_num_tokens": 9735.0, "step": 9638, "total_num_tokens": 308236992.0, "z_loss": 0.0003879186697304249 }, { "copy_logits_max": -7.384934902191162, "copy_logits_min": -625000064.0, "copy_num_tokens": 452.25, "epoch": 1.9684962981873884, "gen_logits_max": 3.171947717666626, "gen_logits_mean": -17.89913558959961, "gen_logits_min": -30.274229049682617, "gen_logits_std": 3.5180416107177734, "gen_loss": 0.2616657018661499, "grad_norm": 0.36000281530400857, "learning_rate": 1.8754105263157898e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9967990666627884, "mean_gen_accuracy": 0.8836708813905716, "mean_token_accuracy": 0.9072729498147964, "num_tokens": 308474859.0, "sample_num_tokens": 8980.25, "step": 9639, "total_num_tokens": 308510780.0, "z_loss": 0.0003972347476519644 }, { "copy_logits_max": -7.084921836853027, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.0625, "epoch": 1.9687005361245853, "gen_logits_max": 2.6845333576202393, "gen_logits_mean": -19.003950119018555, "gen_logits_min": -31.53546714782715, "gen_logits_std": 3.606785297393799, "gen_loss": 0.26903030276298523, "grad_norm": 0.31076707401367465, "learning_rate": 1.875284210526316e-05, "loss": 0.2561, "mean_copy_accuracy": 0.997490718960762, "mean_gen_accuracy": 0.8809421211481094, "mean_token_accuracy": 0.9125284105539322, "num_tokens": 308756128.0, "sample_num_tokens": 8518.5, "step": 9640, "total_num_tokens": 308790202.0, "z_loss": 0.000423634541220963 }, { "copy_logits_max": -7.904799938201904, "copy_logits_min": -750000128.0, "copy_num_tokens": 390.5, "epoch": 1.968904774061782, "gen_logits_max": 3.0611746311187744, "gen_logits_mean": -18.75656509399414, "gen_logits_min": -30.91781997680664, "gen_logits_std": 3.578178882598877, "gen_loss": 0.2644612193107605, "grad_norm": 0.29675992548771357, "learning_rate": 1.8751578947368423e-05, "loss": 0.2509, "mean_copy_accuracy": 0.9971285462379456, "mean_gen_accuracy": 0.8859283477067947, "mean_token_accuracy": 0.9122224748134613, "num_tokens": 309044989.0, "sample_num_tokens": 8127.75, "step": 9641, "total_num_tokens": 309077500.0, "z_loss": 0.0003809972549788654 }, { "copy_logits_max": -6.6558756828308105, "copy_logits_min": -750000000.0, "copy_num_tokens": 441.5, "epoch": 1.9691090119989787, "gen_logits_max": 3.5018959045410156, "gen_logits_mean": -17.18068504333496, "gen_logits_min": -30.073558807373047, "gen_logits_std": 3.4991064071655273, "gen_loss": 0.2632318139076233, "grad_norm": 0.33614411057720023, "learning_rate": 1.8750315789473684e-05, "loss": 0.2828, "mean_copy_accuracy": 0.99798683822155, "mean_gen_accuracy": 0.8699515461921692, "mean_token_accuracy": 0.902835801243782, "num_tokens": 309318229.0, "sample_num_tokens": 7737.75, "step": 9642, "total_num_tokens": 309349180.0, "z_loss": 0.00042079141712747514 }, { "copy_logits_max": -8.076794624328613, "copy_logits_min": -750000000.0, "copy_num_tokens": 339.75, "epoch": 1.9693132499361756, "gen_logits_max": 3.8118138313293457, "gen_logits_mean": -17.51723289489746, "gen_logits_min": -29.98548126220703, "gen_logits_std": 3.517364501953125, "gen_loss": 0.2984292209148407, "grad_norm": 0.3226831318834176, "learning_rate": 1.8749052631578948e-05, "loss": 0.2621, "mean_copy_accuracy": 0.9966096132993698, "mean_gen_accuracy": 0.8802032172679901, "mean_token_accuracy": 0.9088001996278763, "num_tokens": 309604842.0, "sample_num_tokens": 8289.0, "step": 9643, "total_num_tokens": 309637998.0, "z_loss": 0.00045747729018330574 }, { "copy_logits_max": -5.587704181671143, "copy_logits_min": -750000064.0, "copy_num_tokens": 645.125, "epoch": 1.9695174878733726, "gen_logits_max": 3.113603353500366, "gen_logits_mean": -17.19189453125, "gen_logits_min": -30.45532989501953, "gen_logits_std": 3.527651786804199, "gen_loss": 0.2433280497789383, "grad_norm": 0.2997393127938228, "learning_rate": 1.874778947368421e-05, "loss": 0.2356, "mean_copy_accuracy": 0.9984816610813141, "mean_gen_accuracy": 0.8818628787994385, "mean_token_accuracy": 0.9195445030927658, "num_tokens": 309906912.0, "sample_num_tokens": 9856.0, "step": 9644, "total_num_tokens": 309946336.0, "z_loss": 0.0003846710314974189 }, { "copy_logits_max": -6.522641181945801, "copy_logits_min": -750000000.0, "copy_num_tokens": 367.6875, "epoch": 1.9697217258105693, "gen_logits_max": 4.02154541015625, "gen_logits_mean": -16.59569549560547, "gen_logits_min": -29.91508674621582, "gen_logits_std": 3.4923348426818848, "gen_loss": 0.2563175857067108, "grad_norm": 0.33135297263027075, "learning_rate": 1.8746526315789474e-05, "loss": 0.2693, "mean_copy_accuracy": 0.9975254684686661, "mean_gen_accuracy": 0.8772241473197937, "mean_token_accuracy": 0.9071307629346848, "num_tokens": 310187398.0, "sample_num_tokens": 7962.5, "step": 9645, "total_num_tokens": 310219248.0, "z_loss": 0.0004112132010050118 }, { "copy_logits_max": -4.454495906829834, "copy_logits_min": -625000064.0, "copy_num_tokens": 624.5625, "epoch": 1.9699259637477662, "gen_logits_max": 3.5690927505493164, "gen_logits_mean": -16.387920379638672, "gen_logits_min": -29.55196762084961, "gen_logits_std": 3.493201971054077, "gen_loss": 0.25702691078186035, "grad_norm": 0.3433537112366095, "learning_rate": 1.8745263157894738e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9976507127285004, "mean_gen_accuracy": 0.8788657188415527, "mean_token_accuracy": 0.9088994115591049, "num_tokens": 310460348.0, "sample_num_tokens": 8728.5, "step": 9646, "total_num_tokens": 310495262.0, "z_loss": 0.0004035700112581253 }, { "copy_logits_max": -8.718379974365234, "copy_logits_min": -750000000.0, "copy_num_tokens": 220.0, "epoch": 1.9701302016849631, "gen_logits_max": 4.5975518226623535, "gen_logits_mean": -17.941919326782227, "gen_logits_min": -29.748329162597656, "gen_logits_std": 3.523099422454834, "gen_loss": 0.27937984466552734, "grad_norm": 0.35065340927369687, "learning_rate": 1.8744000000000002e-05, "loss": 0.2895, "mean_copy_accuracy": 0.9979482591152191, "mean_gen_accuracy": 0.872547060251236, "mean_token_accuracy": 0.9008886516094208, "num_tokens": 310737271.0, "sample_num_tokens": 7053.25, "step": 9647, "total_num_tokens": 310765484.0, "z_loss": 0.0004275770334061235 }, { "copy_logits_max": -3.7779955863952637, "copy_logits_min": -750000000.0, "copy_num_tokens": 451.5625, "epoch": 1.9703344396221598, "gen_logits_max": 4.653458595275879, "gen_logits_mean": -16.259063720703125, "gen_logits_min": -28.67884635925293, "gen_logits_std": 3.467481851577759, "gen_loss": 0.28394150733947754, "grad_norm": 0.38162168135101726, "learning_rate": 1.8742736842105263e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9974344968795776, "mean_gen_accuracy": 0.8731468021869659, "mean_token_accuracy": 0.9027436673641205, "num_tokens": 310999088.0, "sample_num_tokens": 7884.5, "step": 9648, "total_num_tokens": 311030626.0, "z_loss": 0.0004495710018090904 }, { "copy_logits_max": -6.829648494720459, "copy_logits_min": -750000000.0, "copy_num_tokens": 400.5, "epoch": 1.9705386775593565, "gen_logits_max": 4.564467430114746, "gen_logits_mean": -16.429813385009766, "gen_logits_min": -28.634811401367188, "gen_logits_std": 3.479979991912842, "gen_loss": 0.26882120966911316, "grad_norm": 0.34149218013326366, "learning_rate": 1.8741473684210527e-05, "loss": 0.2618, "mean_copy_accuracy": 0.997338742017746, "mean_gen_accuracy": 0.8835068047046661, "mean_token_accuracy": 0.9109911024570465, "num_tokens": 311269034.0, "sample_num_tokens": 8456.5, "step": 9649, "total_num_tokens": 311302860.0, "z_loss": 0.0004321867600083351 }, { "copy_logits_max": -4.065245628356934, "copy_logits_min": -750000000.0, "copy_num_tokens": 378.9375, "epoch": 1.9707429154965534, "gen_logits_max": 4.659063339233398, "gen_logits_mean": -16.795921325683594, "gen_logits_min": -29.148054122924805, "gen_logits_std": 3.468975067138672, "gen_loss": 0.29842424392700195, "grad_norm": 0.3356819915675018, "learning_rate": 1.8740210526315788e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9967721253633499, "mean_gen_accuracy": 0.8746546059846878, "mean_token_accuracy": 0.9033842384815216, "num_tokens": 311545321.0, "sample_num_tokens": 7415.75, "step": 9650, "total_num_tokens": 311574984.0, "z_loss": 0.00043548489338718355 }, { "copy_logits_max": -4.018972396850586, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.3125, "epoch": 1.9709471534337504, "gen_logits_max": 3.5335512161254883, "gen_logits_mean": -17.515758514404297, "gen_logits_min": -30.091442108154297, "gen_logits_std": 3.510868549346924, "gen_loss": 0.2640015184879303, "grad_norm": 0.3381183332470449, "learning_rate": 1.8738947368421053e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9975921660661697, "mean_gen_accuracy": 0.8806684166193008, "mean_token_accuracy": 0.9098202139139175, "num_tokens": 311798041.0, "sample_num_tokens": 9120.25, "step": 9651, "total_num_tokens": 311834522.0, "z_loss": 0.0004030626732856035 }, { "copy_logits_max": -3.3710713386535645, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0, "epoch": 1.971151391370947, "gen_logits_max": 4.124274253845215, "gen_logits_mean": -17.316326141357422, "gen_logits_min": -29.700672149658203, "gen_logits_std": 3.4807465076446533, "gen_loss": 0.2894914150238037, "grad_norm": 0.3287452116802464, "learning_rate": 1.8737684210526314e-05, "loss": 0.2751, "mean_copy_accuracy": 0.9975089430809021, "mean_gen_accuracy": 0.8800628781318665, "mean_token_accuracy": 0.9105656147003174, "num_tokens": 312087535.0, "sample_num_tokens": 7966.25, "step": 9652, "total_num_tokens": 312119400.0, "z_loss": 0.000449362734798342 }, { "copy_logits_max": -6.493247985839844, "copy_logits_min": -750000000.0, "copy_num_tokens": 490.4375, "epoch": 1.971355629308144, "gen_logits_max": 1.9246258735656738, "gen_logits_mean": -19.262290954589844, "gen_logits_min": -31.825992584228516, "gen_logits_std": 3.63071346282959, "gen_loss": 0.24153167009353638, "grad_norm": 0.342553401317616, "learning_rate": 1.8736421052631578e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9979954808950424, "mean_gen_accuracy": 0.8856998682022095, "mean_token_accuracy": 0.9119927734136581, "num_tokens": 312351304.0, "sample_num_tokens": 8048.5, "step": 9653, "total_num_tokens": 312383498.0, "z_loss": 0.0003482149913907051 }, { "copy_logits_max": -5.113605976104736, "copy_logits_min": -750000000.0, "copy_num_tokens": 462.75, "epoch": 1.971559867245341, "gen_logits_max": 3.443330764770508, "gen_logits_mean": -17.319374084472656, "gen_logits_min": -29.901397705078125, "gen_logits_std": 3.50028657913208, "gen_loss": 0.2818225622177124, "grad_norm": 0.37144811920258286, "learning_rate": 1.8735157894736846e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9971002489328384, "mean_gen_accuracy": 0.8784157633781433, "mean_token_accuracy": 0.9105290174484253, "num_tokens": 312634316.0, "sample_num_tokens": 8320.0, "step": 9654, "total_num_tokens": 312667596.0, "z_loss": 0.0004153824411332607 }, { "copy_logits_max": -6.597441673278809, "copy_logits_min": -750000000.0, "copy_num_tokens": 495.125, "epoch": 1.9717641051825376, "gen_logits_max": 3.5283398628234863, "gen_logits_mean": -16.88064956665039, "gen_logits_min": -29.191884994506836, "gen_logits_std": 3.4741435050964355, "gen_loss": 0.27764761447906494, "grad_norm": 0.3396409537152654, "learning_rate": 1.8733894736842106e-05, "loss": 0.2645, "mean_copy_accuracy": 0.9977262914180756, "mean_gen_accuracy": 0.8799702972173691, "mean_token_accuracy": 0.9115419834852219, "num_tokens": 312910743.0, "sample_num_tokens": 8709.25, "step": 9655, "total_num_tokens": 312945580.0, "z_loss": 0.00041270244400948286 }, { "copy_logits_max": -5.169459342956543, "copy_logits_min": -750000000.0, "copy_num_tokens": 469.3125, "epoch": 1.9719683431197343, "gen_logits_max": 3.050997257232666, "gen_logits_mean": -17.13956069946289, "gen_logits_min": -29.71959686279297, "gen_logits_std": 3.5057084560394287, "gen_loss": 0.26404809951782227, "grad_norm": 0.3760472528715449, "learning_rate": 1.873263157894737e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9981531500816345, "mean_gen_accuracy": 0.8745127469301224, "mean_token_accuracy": 0.9059183746576309, "num_tokens": 313173410.0, "sample_num_tokens": 8498.5, "step": 9656, "total_num_tokens": 313207404.0, "z_loss": 0.0003817493561655283 }, { "copy_logits_max": -5.0599517822265625, "copy_logits_min": -750000000.0, "copy_num_tokens": 478.75, "epoch": 1.9721725810569313, "gen_logits_max": 3.8812875747680664, "gen_logits_mean": -16.087026596069336, "gen_logits_min": -28.322795867919922, "gen_logits_std": 3.439795970916748, "gen_loss": 0.28124430775642395, "grad_norm": 0.3679855845857409, "learning_rate": 1.8731368421052632e-05, "loss": 0.2899, "mean_copy_accuracy": 0.9969058185815811, "mean_gen_accuracy": 0.8714544326066971, "mean_token_accuracy": 0.9011052399873734, "num_tokens": 313445476.0, "sample_num_tokens": 7905.5, "step": 9657, "total_num_tokens": 313477098.0, "z_loss": 0.0004297413397580385 }, { "copy_logits_max": -7.6484785079956055, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.5, "epoch": 1.9723768189941282, "gen_logits_max": 2.898237943649292, "gen_logits_mean": -18.84088897705078, "gen_logits_min": -30.83892822265625, "gen_logits_std": 3.5670199394226074, "gen_loss": 0.2893463373184204, "grad_norm": 0.33609033004835126, "learning_rate": 1.8730105263157896e-05, "loss": 0.2699, "mean_copy_accuracy": 0.9975461214780807, "mean_gen_accuracy": 0.8777303844690323, "mean_token_accuracy": 0.9070746302604675, "num_tokens": 313709606.0, "sample_num_tokens": 8244.0, "step": 9658, "total_num_tokens": 313742582.0, "z_loss": 0.00039666728116571903 }, { "copy_logits_max": -5.634617805480957, "copy_logits_min": -687500032.0, "copy_num_tokens": 569.0, "epoch": 1.972581056931325, "gen_logits_max": 3.977447032928467, "gen_logits_mean": -16.221105575561523, "gen_logits_min": -28.414260864257812, "gen_logits_std": 3.4452898502349854, "gen_loss": 0.26484912633895874, "grad_norm": 0.359524633360143, "learning_rate": 1.8728842105263157e-05, "loss": 0.2753, "mean_copy_accuracy": 0.9972878992557526, "mean_gen_accuracy": 0.8775194734334946, "mean_token_accuracy": 0.9072224050760269, "num_tokens": 313973404.0, "sample_num_tokens": 9375.5, "step": 9659, "total_num_tokens": 314010906.0, "z_loss": 0.000332817027810961 }, { "copy_logits_max": -6.94764518737793, "copy_logits_min": -750000000.0, "copy_num_tokens": 414.1875, "epoch": 1.9727852948685218, "gen_logits_max": 4.069559097290039, "gen_logits_mean": -18.399658203125, "gen_logits_min": -30.454355239868164, "gen_logits_std": 3.5533313751220703, "gen_loss": 0.2715608477592468, "grad_norm": 0.3583935631660307, "learning_rate": 1.872757894736842e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9963470995426178, "mean_gen_accuracy": 0.8759690970182419, "mean_token_accuracy": 0.9032182544469833, "num_tokens": 314232580.0, "sample_num_tokens": 8527.0, "step": 9660, "total_num_tokens": 314266688.0, "z_loss": 0.00037762679858133197 }, { "copy_logits_max": -6.5556511878967285, "copy_logits_min": -750000000.0, "copy_num_tokens": 376.875, "epoch": 1.9729895328057188, "gen_logits_max": 5.013422966003418, "gen_logits_mean": -16.467618942260742, "gen_logits_min": -29.047054290771484, "gen_logits_std": 3.4756762981414795, "gen_loss": 0.29314109683036804, "grad_norm": 0.35549176415750977, "learning_rate": 1.8726315789473682e-05, "loss": 0.2697, "mean_copy_accuracy": 0.9980365931987762, "mean_gen_accuracy": 0.8756963014602661, "mean_token_accuracy": 0.9089279621839523, "num_tokens": 314509394.0, "sample_num_tokens": 7013.0, "step": 9661, "total_num_tokens": 314537446.0, "z_loss": 0.00042023631976917386 }, { "copy_logits_max": -5.371095180511475, "copy_logits_min": -687500032.0, "copy_num_tokens": 656.4375, "epoch": 1.9731937707429155, "gen_logits_max": 1.590674638748169, "gen_logits_mean": -19.64916229248047, "gen_logits_min": -32.07938766479492, "gen_logits_std": 3.6414880752563477, "gen_loss": 0.23149968683719635, "grad_norm": 0.37841117932428386, "learning_rate": 1.872505263157895e-05, "loss": 0.2815, "mean_copy_accuracy": 0.9975729286670685, "mean_gen_accuracy": 0.8777770400047302, "mean_token_accuracy": 0.9038650393486023, "num_tokens": 314751175.0, "sample_num_tokens": 9659.25, "step": 9662, "total_num_tokens": 314789812.0, "z_loss": 0.0003485897905193269 }, { "copy_logits_max": -4.159100532531738, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.75, "epoch": 1.9733980086801122, "gen_logits_max": 3.8713932037353516, "gen_logits_mean": -17.315460205078125, "gen_logits_min": -29.76024627685547, "gen_logits_std": 3.526704788208008, "gen_loss": 0.28462928533554077, "grad_norm": 0.3732209782531046, "learning_rate": 1.872378947368421e-05, "loss": 0.2922, "mean_copy_accuracy": 0.996756374835968, "mean_gen_accuracy": 0.8722691833972931, "mean_token_accuracy": 0.9024194031953812, "num_tokens": 315023159.0, "sample_num_tokens": 8189.25, "step": 9663, "total_num_tokens": 315055916.0, "z_loss": 0.0003643326635938138 }, { "copy_logits_max": -7.079828262329102, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.9375, "epoch": 1.9736022466173093, "gen_logits_max": 3.4026505947113037, "gen_logits_mean": -16.960433959960938, "gen_logits_min": -29.253501892089844, "gen_logits_std": 3.4808027744293213, "gen_loss": 0.28073805570602417, "grad_norm": 0.3613584457222578, "learning_rate": 1.8722526315789475e-05, "loss": 0.2559, "mean_copy_accuracy": 0.9976020008325577, "mean_gen_accuracy": 0.884106770157814, "mean_token_accuracy": 0.9135826379060745, "num_tokens": 315282605.0, "sample_num_tokens": 9857.75, "step": 9664, "total_num_tokens": 315322036.0, "z_loss": 0.0003783975262194872 }, { "copy_logits_max": -7.349903583526611, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.4375, "epoch": 1.973806484554506, "gen_logits_max": 3.5112783908843994, "gen_logits_mean": -16.520221710205078, "gen_logits_min": -28.807884216308594, "gen_logits_std": 3.480288505554199, "gen_loss": 0.2608458995819092, "grad_norm": 0.356376025866506, "learning_rate": 1.8721263157894736e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9979454129934311, "mean_gen_accuracy": 0.8784842193126678, "mean_token_accuracy": 0.9083249419927597, "num_tokens": 315568580.0, "sample_num_tokens": 8160.5, "step": 9665, "total_num_tokens": 315601222.0, "z_loss": 0.0003783579159062356 }, { "copy_logits_max": -6.327400207519531, "copy_logits_min": -687500032.0, "copy_num_tokens": 444.625, "epoch": 1.9740107224917027, "gen_logits_max": 3.984762668609619, "gen_logits_mean": -16.866283416748047, "gen_logits_min": -29.098773956298828, "gen_logits_std": 3.4975950717926025, "gen_loss": 0.28709107637405396, "grad_norm": 0.37317030796025846, "learning_rate": 1.872e-05, "loss": 0.2853, "mean_copy_accuracy": 0.9975033551454544, "mean_gen_accuracy": 0.8753816783428192, "mean_token_accuracy": 0.9044395089149475, "num_tokens": 315826230.0, "sample_num_tokens": 8254.0, "step": 9666, "total_num_tokens": 315859246.0, "z_loss": 0.0003380933776497841 }, { "copy_logits_max": -5.971069812774658, "copy_logits_min": -687500032.0, "copy_num_tokens": 716.0, "epoch": 1.9742149604288997, "gen_logits_max": 2.2265684604644775, "gen_logits_mean": -17.70166778564453, "gen_logits_min": -30.143661499023438, "gen_logits_std": 3.545060634613037, "gen_loss": 0.2727677822113037, "grad_norm": 0.32961819777296675, "learning_rate": 1.8718736842105265e-05, "loss": 0.275, "mean_copy_accuracy": 0.9976565539836884, "mean_gen_accuracy": 0.8732281774282455, "mean_token_accuracy": 0.9068610668182373, "num_tokens": 316112918.0, "sample_num_tokens": 10015.0, "step": 9667, "total_num_tokens": 316152978.0, "z_loss": 0.0004273223748896271 }, { "copy_logits_max": -8.716903686523438, "copy_logits_min": -750000000.0, "copy_num_tokens": 331.5625, "epoch": 1.9744191983660966, "gen_logits_max": 3.4600093364715576, "gen_logits_mean": -18.067211151123047, "gen_logits_min": -29.67317008972168, "gen_logits_std": 3.498258590698242, "gen_loss": 0.26191022992134094, "grad_norm": 0.33548947945435453, "learning_rate": 1.8717473684210526e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9966320842504501, "mean_gen_accuracy": 0.8793140053749084, "mean_token_accuracy": 0.905244305729866, "num_tokens": 316394859.0, "sample_num_tokens": 8080.25, "step": 9668, "total_num_tokens": 316427180.0, "z_loss": 0.0003626485413406044 }, { "copy_logits_max": -6.426668167114258, "copy_logits_min": -750000000.0, "copy_num_tokens": 306.8125, "epoch": 1.9746234363032933, "gen_logits_max": 4.460387229919434, "gen_logits_mean": -17.51920509338379, "gen_logits_min": -29.307090759277344, "gen_logits_std": 3.504171133041382, "gen_loss": 0.2862146496772766, "grad_norm": 0.34903465152683844, "learning_rate": 1.871621052631579e-05, "loss": 0.253, "mean_copy_accuracy": 0.9977752566337585, "mean_gen_accuracy": 0.8858449906110764, "mean_token_accuracy": 0.9133618772029877, "num_tokens": 316667810.0, "sample_num_tokens": 7086.0, "step": 9669, "total_num_tokens": 316696154.0, "z_loss": 0.0003916797577403486 }, { "copy_logits_max": -6.043765544891357, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.125, "epoch": 1.9748276742404902, "gen_logits_max": 4.39185905456543, "gen_logits_mean": -16.90924072265625, "gen_logits_min": -28.968509674072266, "gen_logits_std": 3.4724936485290527, "gen_loss": 0.3099687397480011, "grad_norm": 0.37408462286531086, "learning_rate": 1.8714947368421054e-05, "loss": 0.2822, "mean_copy_accuracy": 0.9966792017221451, "mean_gen_accuracy": 0.8775638192892075, "mean_token_accuracy": 0.9041737467050552, "num_tokens": 316932087.0, "sample_num_tokens": 7894.25, "step": 9670, "total_num_tokens": 316963664.0, "z_loss": 0.0004683142760768533 }, { "copy_logits_max": -3.687345027923584, "copy_logits_min": -625000064.0, "copy_num_tokens": 384.1875, "epoch": 1.9750319121776871, "gen_logits_max": 3.7634143829345703, "gen_logits_mean": -17.517539978027344, "gen_logits_min": -29.818134307861328, "gen_logits_std": 3.552177906036377, "gen_loss": 0.2716108560562134, "grad_norm": 0.36659270759334295, "learning_rate": 1.871368421052632e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9971687346696854, "mean_gen_accuracy": 0.8707053810358047, "mean_token_accuracy": 0.9015045464038849, "num_tokens": 317203717.0, "sample_num_tokens": 7620.25, "step": 9671, "total_num_tokens": 317234198.0, "z_loss": 0.00036236661253497005 }, { "copy_logits_max": -4.370661735534668, "copy_logits_min": -687500032.0, "copy_num_tokens": 374.5, "epoch": 1.9752361501148838, "gen_logits_max": 4.337077617645264, "gen_logits_mean": -16.695829391479492, "gen_logits_min": -28.985498428344727, "gen_logits_std": 3.5092334747314453, "gen_loss": 0.30017322301864624, "grad_norm": 0.3786069407641611, "learning_rate": 1.871242105263158e-05, "loss": 0.2734, "mean_copy_accuracy": 0.9979503303766251, "mean_gen_accuracy": 0.878346636891365, "mean_token_accuracy": 0.907829537987709, "num_tokens": 317457916.0, "sample_num_tokens": 7646.5, "step": 9672, "total_num_tokens": 317488502.0, "z_loss": 0.00038306531496345997 }, { "copy_logits_max": -3.908421516418457, "copy_logits_min": -625000064.0, "copy_num_tokens": 611.3125, "epoch": 1.9754403880520806, "gen_logits_max": 3.838435411453247, "gen_logits_mean": -16.08397674560547, "gen_logits_min": -28.883451461791992, "gen_logits_std": 3.4679322242736816, "gen_loss": 0.2585108280181885, "grad_norm": 0.3504079997274732, "learning_rate": 1.8711157894736844e-05, "loss": 0.2487, "mean_copy_accuracy": 0.9972952455282211, "mean_gen_accuracy": 0.8852790892124176, "mean_token_accuracy": 0.9142320603132248, "num_tokens": 317716189.0, "sample_num_tokens": 9431.25, "step": 9673, "total_num_tokens": 317753914.0, "z_loss": 0.0003739733365364373 }, { "copy_logits_max": -4.863718509674072, "copy_logits_min": -750000000.0, "copy_num_tokens": 549.6875, "epoch": 1.9756446259892775, "gen_logits_max": 2.6558823585510254, "gen_logits_mean": -17.670028686523438, "gen_logits_min": -30.196210861206055, "gen_logits_std": 3.5419628620147705, "gen_loss": 0.25903618335723877, "grad_norm": 0.3492826118225658, "learning_rate": 1.8709894736842105e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9971053153276443, "mean_gen_accuracy": 0.8820325136184692, "mean_token_accuracy": 0.9114575237035751, "num_tokens": 317989588.0, "sample_num_tokens": 8572.5, "step": 9674, "total_num_tokens": 318023878.0, "z_loss": 0.0003573073772713542 }, { "copy_logits_max": -7.81485652923584, "copy_logits_min": -750000000.0, "copy_num_tokens": 223.4375, "epoch": 1.9758488639264744, "gen_logits_max": 4.581254005432129, "gen_logits_mean": -18.22980308532715, "gen_logits_min": -30.18813705444336, "gen_logits_std": 3.5288023948669434, "gen_loss": 0.30079764127731323, "grad_norm": 0.3682830129770534, "learning_rate": 1.870863157894737e-05, "loss": 0.2619, "mean_copy_accuracy": 0.9970942735671997, "mean_gen_accuracy": 0.8859177082777023, "mean_token_accuracy": 0.9109803289175034, "num_tokens": 318254124.0, "sample_num_tokens": 6922.5, "step": 9675, "total_num_tokens": 318281814.0, "z_loss": 0.00039998177089728415 }, { "copy_logits_max": -1.997032642364502, "copy_logits_min": -625000064.0, "copy_num_tokens": 533.875, "epoch": 1.9760531018636711, "gen_logits_max": 3.6293158531188965, "gen_logits_mean": -16.641918182373047, "gen_logits_min": -29.097108840942383, "gen_logits_std": 3.4999778270721436, "gen_loss": 0.26545023918151855, "grad_norm": 0.3563188566573946, "learning_rate": 1.870736842105263e-05, "loss": 0.2581, "mean_copy_accuracy": 0.9974248707294464, "mean_gen_accuracy": 0.8826044946908951, "mean_token_accuracy": 0.9126942604780197, "num_tokens": 318525909.0, "sample_num_tokens": 8200.75, "step": 9676, "total_num_tokens": 318558712.0, "z_loss": 0.00039118470158427954 }, { "copy_logits_max": -6.766610145568848, "copy_logits_min": -750000000.0, "copy_num_tokens": 351.375, "epoch": 1.976257339800868, "gen_logits_max": 3.737762212753296, "gen_logits_mean": -17.75748062133789, "gen_logits_min": -29.99456787109375, "gen_logits_std": 3.5299363136291504, "gen_loss": 0.26661211252212524, "grad_norm": 0.33764042777106, "learning_rate": 1.8706105263157894e-05, "loss": 0.2494, "mean_copy_accuracy": 0.9970792979001999, "mean_gen_accuracy": 0.8834730237722397, "mean_token_accuracy": 0.9150580614805222, "num_tokens": 318824205.0, "sample_num_tokens": 8100.75, "step": 9677, "total_num_tokens": 318856608.0, "z_loss": 0.0004050502320751548 }, { "copy_logits_max": -4.72675895690918, "copy_logits_min": -750000064.0, "copy_num_tokens": 524.375, "epoch": 1.976461577738065, "gen_logits_max": 3.7647202014923096, "gen_logits_mean": -16.855998992919922, "gen_logits_min": -29.223209381103516, "gen_logits_std": 3.482797384262085, "gen_loss": 0.32203003764152527, "grad_norm": 0.38211026068363096, "learning_rate": 1.870484210526316e-05, "loss": 0.297, "mean_copy_accuracy": 0.9957439601421356, "mean_gen_accuracy": 0.8681295961141586, "mean_token_accuracy": 0.8989301770925522, "num_tokens": 319075126.0, "sample_num_tokens": 9232.0, "step": 9678, "total_num_tokens": 319112054.0, "z_loss": 0.00048215696006082 }, { "copy_logits_max": -5.378693103790283, "copy_logits_min": -750000064.0, "copy_num_tokens": 541.5625, "epoch": 1.9766658156752617, "gen_logits_max": 4.25456428527832, "gen_logits_mean": -16.387096405029297, "gen_logits_min": -28.83148193359375, "gen_logits_std": 3.498194694519043, "gen_loss": 0.27898550033569336, "grad_norm": 0.34113023074439447, "learning_rate": 1.8703578947368423e-05, "loss": 0.2846, "mean_copy_accuracy": 0.996892437338829, "mean_gen_accuracy": 0.8718124032020569, "mean_token_accuracy": 0.9043253660202026, "num_tokens": 319364396.0, "sample_num_tokens": 10311.0, "step": 9679, "total_num_tokens": 319405640.0, "z_loss": 0.0004137601936236024 }, { "copy_logits_max": -1.688481092453003, "copy_logits_min": -750000000.0, "copy_num_tokens": 573.625, "epoch": 1.9768700536124584, "gen_logits_max": 4.528242588043213, "gen_logits_mean": -14.925633430480957, "gen_logits_min": -27.474445343017578, "gen_logits_std": 3.4432458877563477, "gen_loss": 0.2554277181625366, "grad_norm": 0.36141628293166633, "learning_rate": 1.8702315789473687e-05, "loss": 0.2726, "mean_copy_accuracy": 0.9959552884101868, "mean_gen_accuracy": 0.8780047595500946, "mean_token_accuracy": 0.9094446301460266, "num_tokens": 319636492.0, "sample_num_tokens": 8439.0, "step": 9680, "total_num_tokens": 319670248.0, "z_loss": 0.0004134706687182188 }, { "copy_logits_max": -6.511466979980469, "copy_logits_min": -750000000.0, "copy_num_tokens": 263.5625, "epoch": 1.9770742915496553, "gen_logits_max": 4.867384910583496, "gen_logits_mean": -16.68887710571289, "gen_logits_min": -28.724960327148438, "gen_logits_std": 3.484128475189209, "gen_loss": 0.27392274141311646, "grad_norm": 0.38335524417361116, "learning_rate": 1.8701052631578948e-05, "loss": 0.2843, "mean_copy_accuracy": 0.9971936196088791, "mean_gen_accuracy": 0.8773084133863449, "mean_token_accuracy": 0.9046431183815002, "num_tokens": 319897007.0, "sample_num_tokens": 7499.25, "step": 9681, "total_num_tokens": 319927004.0, "z_loss": 0.00043571143760345876 }, { "copy_logits_max": -5.350521087646484, "copy_logits_min": -750000000.0, "copy_num_tokens": 548.4375, "epoch": 1.9772785294868522, "gen_logits_max": 3.8938496112823486, "gen_logits_mean": -16.729923248291016, "gen_logits_min": -29.045997619628906, "gen_logits_std": 3.5215704441070557, "gen_loss": 0.266679048538208, "grad_norm": 0.35104778860842156, "learning_rate": 1.8699789473684212e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9977999329566956, "mean_gen_accuracy": 0.8776791542768478, "mean_token_accuracy": 0.9079536944627762, "num_tokens": 320183493.0, "sample_num_tokens": 9012.25, "step": 9682, "total_num_tokens": 320219542.0, "z_loss": 0.00040750103653408587 }, { "copy_logits_max": -4.218570232391357, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.0, "epoch": 1.977482767424049, "gen_logits_max": 3.703216791152954, "gen_logits_mean": -16.97970962524414, "gen_logits_min": -29.306915283203125, "gen_logits_std": 3.5037848949432373, "gen_loss": 0.2901817858219147, "grad_norm": 0.32920830549731744, "learning_rate": 1.8698526315789473e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9975309073925018, "mean_gen_accuracy": 0.8834521472454071, "mean_token_accuracy": 0.9114278256893158, "num_tokens": 320455838.0, "sample_num_tokens": 8022.0, "step": 9683, "total_num_tokens": 320487926.0, "z_loss": 0.0003914578119292855 }, { "copy_logits_max": -4.687963008880615, "copy_logits_min": -687500032.0, "copy_num_tokens": 626.9375, "epoch": 1.9776870053612459, "gen_logits_max": 1.834503173828125, "gen_logits_mean": -19.02906036376953, "gen_logits_min": -31.497400283813477, "gen_logits_std": 3.6218924522399902, "gen_loss": 0.23288661241531372, "grad_norm": 0.3433381341749136, "learning_rate": 1.8697263157894738e-05, "loss": 0.2666, "mean_copy_accuracy": 0.9967086762189865, "mean_gen_accuracy": 0.8763946741819382, "mean_token_accuracy": 0.9088229686021805, "num_tokens": 320735530.0, "sample_num_tokens": 9576.0, "step": 9684, "total_num_tokens": 320773834.0, "z_loss": 0.00033105252077803016 }, { "copy_logits_max": -4.7828369140625, "copy_logits_min": -750000064.0, "copy_num_tokens": 394.8125, "epoch": 1.9778912432984428, "gen_logits_max": 4.690279960632324, "gen_logits_mean": -15.959233283996582, "gen_logits_min": -28.678485870361328, "gen_logits_std": 3.461728572845459, "gen_loss": 0.28247809410095215, "grad_norm": 0.37324426273502953, "learning_rate": 1.8696e-05, "loss": 0.2768, "mean_copy_accuracy": 0.9967922419309616, "mean_gen_accuracy": 0.8725594729185104, "mean_token_accuracy": 0.9061390310525894, "num_tokens": 321022354.0, "sample_num_tokens": 8446.0, "step": 9685, "total_num_tokens": 321056138.0, "z_loss": 0.00037695554783567786 }, { "copy_logits_max": -5.468367576599121, "copy_logits_min": -750000000.0, "copy_num_tokens": 231.4375, "epoch": 1.9780954812356395, "gen_logits_max": 4.882774353027344, "gen_logits_mean": -17.205341339111328, "gen_logits_min": -29.037368774414062, "gen_logits_std": 3.505420207977295, "gen_loss": 0.2984374761581421, "grad_norm": 0.3552823346250461, "learning_rate": 1.8694736842105263e-05, "loss": 0.2731, "mean_copy_accuracy": 0.9959360957145691, "mean_gen_accuracy": 0.8809057474136353, "mean_token_accuracy": 0.9062849581241608, "num_tokens": 321284024.0, "sample_num_tokens": 7114.0, "step": 9686, "total_num_tokens": 321312480.0, "z_loss": 0.00040009524673223495 }, { "copy_logits_max": -4.404022216796875, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.75, "epoch": 1.9782997191728362, "gen_logits_max": 2.8695690631866455, "gen_logits_mean": -17.798419952392578, "gen_logits_min": -30.235397338867188, "gen_logits_std": 3.530886173248291, "gen_loss": 0.29407089948654175, "grad_norm": 0.3447619592098694, "learning_rate": 1.8693473684210527e-05, "loss": 0.2729, "mean_copy_accuracy": 0.9975538402795792, "mean_gen_accuracy": 0.8723300397396088, "mean_token_accuracy": 0.9069420546293259, "num_tokens": 321548913.0, "sample_num_tokens": 8308.25, "step": 9687, "total_num_tokens": 321582146.0, "z_loss": 0.0004485186072997749 }, { "copy_logits_max": -0.28607720136642456, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.5, "epoch": 1.9785039571100334, "gen_logits_max": 5.144871711730957, "gen_logits_mean": -14.582986831665039, "gen_logits_min": -26.83462142944336, "gen_logits_std": 3.4067063331604004, "gen_loss": 0.2847338318824768, "grad_norm": 0.3518977818915894, "learning_rate": 1.869221052631579e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9973867237567902, "mean_gen_accuracy": 0.87520532310009, "mean_token_accuracy": 0.9056137204170227, "num_tokens": 321837681.0, "sample_num_tokens": 7427.25, "step": 9688, "total_num_tokens": 321867390.0, "z_loss": 0.0004485221579670906 }, { "copy_logits_max": -4.474921703338623, "copy_logits_min": -687500032.0, "copy_num_tokens": 575.1875, "epoch": 1.97870819504723, "gen_logits_max": 4.0069732666015625, "gen_logits_mean": -16.00865936279297, "gen_logits_min": -28.30714988708496, "gen_logits_std": 3.4934959411621094, "gen_loss": 0.24051335453987122, "grad_norm": 0.34501078013220765, "learning_rate": 1.8690947368421052e-05, "loss": 0.2773, "mean_copy_accuracy": 0.9973869323730469, "mean_gen_accuracy": 0.8736674189567566, "mean_token_accuracy": 0.9060102105140686, "num_tokens": 322115697.0, "sample_num_tokens": 8980.75, "step": 9689, "total_num_tokens": 322151620.0, "z_loss": 0.00035924711846746504 }, { "copy_logits_max": -5.7928056716918945, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.0625, "epoch": 1.9789124329844268, "gen_logits_max": 3.6160435676574707, "gen_logits_mean": -18.367267608642578, "gen_logits_min": -30.835214614868164, "gen_logits_std": 3.5604822635650635, "gen_loss": 0.28295665979385376, "grad_norm": 0.33691026394458, "learning_rate": 1.8689684210526317e-05, "loss": 0.2664, "mean_copy_accuracy": 0.9964895099401474, "mean_gen_accuracy": 0.8801524639129639, "mean_token_accuracy": 0.9090830385684967, "num_tokens": 322410369.0, "sample_num_tokens": 6927.75, "step": 9690, "total_num_tokens": 322438080.0, "z_loss": 0.00041193515062332153 }, { "copy_logits_max": -5.436872482299805, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.8125, "epoch": 1.9791166709216237, "gen_logits_max": 3.5821194648742676, "gen_logits_mean": -17.88099479675293, "gen_logits_min": -30.216873168945312, "gen_logits_std": 3.5313358306884766, "gen_loss": 0.28730306029319763, "grad_norm": 0.3416640231454584, "learning_rate": 1.8688421052631578e-05, "loss": 0.277, "mean_copy_accuracy": 0.9971887618303299, "mean_gen_accuracy": 0.8737658709287643, "mean_token_accuracy": 0.9048285037279129, "num_tokens": 322686062.0, "sample_num_tokens": 7537.5, "step": 9691, "total_num_tokens": 322716212.0, "z_loss": 0.0004259036504663527 }, { "copy_logits_max": -7.20168399810791, "copy_logits_min": -750000000.0, "copy_num_tokens": 274.1875, "epoch": 1.9793209088588206, "gen_logits_max": 3.932685136795044, "gen_logits_mean": -18.65369415283203, "gen_logits_min": -31.066442489624023, "gen_logits_std": 3.550844669342041, "gen_loss": 0.2900469899177551, "grad_norm": 0.34152055210474624, "learning_rate": 1.8687157894736842e-05, "loss": 0.2736, "mean_copy_accuracy": 0.9972001910209656, "mean_gen_accuracy": 0.8811891674995422, "mean_token_accuracy": 0.9062710255384445, "num_tokens": 322945098.0, "sample_num_tokens": 7681.0, "step": 9692, "total_num_tokens": 322975822.0, "z_loss": 0.0004262230941094458 }, { "copy_logits_max": -5.4082536697387695, "copy_logits_min": -687500032.0, "copy_num_tokens": 796.0, "epoch": 1.9795251467960173, "gen_logits_max": 3.0842037200927734, "gen_logits_mean": -16.821998596191406, "gen_logits_min": -29.36318588256836, "gen_logits_std": 3.5002939701080322, "gen_loss": 0.21794389188289642, "grad_norm": 0.34619318197661847, "learning_rate": 1.8685894736842106e-05, "loss": 0.2723, "mean_copy_accuracy": 0.9975682944059372, "mean_gen_accuracy": 0.8764787018299103, "mean_token_accuracy": 0.9074691981077194, "num_tokens": 323224723.0, "sample_num_tokens": 11354.75, "step": 9693, "total_num_tokens": 323270142.0, "z_loss": 0.00033788621658459306 }, { "copy_logits_max": -7.076684951782227, "copy_logits_min": -750000000.0, "copy_num_tokens": 324.0, "epoch": 1.9797293847332142, "gen_logits_max": 4.625088214874268, "gen_logits_mean": -16.739585876464844, "gen_logits_min": -29.380205154418945, "gen_logits_std": 3.487504482269287, "gen_loss": 0.2650372385978699, "grad_norm": 0.3681278918277945, "learning_rate": 1.8684631578947367e-05, "loss": 0.2706, "mean_copy_accuracy": 0.9963911473751068, "mean_gen_accuracy": 0.8793955892324448, "mean_token_accuracy": 0.9063362181186676, "num_tokens": 323481600.0, "sample_num_tokens": 8350.5, "step": 9694, "total_num_tokens": 323515002.0, "z_loss": 0.00041563634295016527 }, { "copy_logits_max": -4.464900016784668, "copy_logits_min": -750000000.0, "copy_num_tokens": 502.5625, "epoch": 1.9799336226704112, "gen_logits_max": 3.4658663272857666, "gen_logits_mean": -16.272785186767578, "gen_logits_min": -28.827049255371094, "gen_logits_std": 3.487062692642212, "gen_loss": 0.237104594707489, "grad_norm": 0.31299057268802694, "learning_rate": 1.8683368421052635e-05, "loss": 0.2582, "mean_copy_accuracy": 0.9973214268684387, "mean_gen_accuracy": 0.8857192099094391, "mean_token_accuracy": 0.9119929671287537, "num_tokens": 323756551.0, "sample_num_tokens": 8637.25, "step": 9695, "total_num_tokens": 323791100.0, "z_loss": 0.0003513498813845217 }, { "copy_logits_max": -4.691408634185791, "copy_logits_min": -750000000.0, "copy_num_tokens": 359.0, "epoch": 1.9801378606076079, "gen_logits_max": 4.4771409034729, "gen_logits_mean": -16.177669525146484, "gen_logits_min": -28.809329986572266, "gen_logits_std": 3.4094491004943848, "gen_loss": 0.29338860511779785, "grad_norm": 0.3298556058491361, "learning_rate": 1.8682105263157896e-05, "loss": 0.2731, "mean_copy_accuracy": 0.997512549161911, "mean_gen_accuracy": 0.8773439973592758, "mean_token_accuracy": 0.9064667075872421, "num_tokens": 324020128.0, "sample_num_tokens": 8162.0, "step": 9696, "total_num_tokens": 324052776.0, "z_loss": 0.00041715160477906466 }, { "copy_logits_max": -7.354287147521973, "copy_logits_min": -750000000.0, "copy_num_tokens": 483.0, "epoch": 1.9803420985448046, "gen_logits_max": 3.2210006713867188, "gen_logits_mean": -17.55609130859375, "gen_logits_min": -30.157676696777344, "gen_logits_std": 3.5271987915039062, "gen_loss": 0.22724582254886627, "grad_norm": 0.33303403023937717, "learning_rate": 1.868084210526316e-05, "loss": 0.2655, "mean_copy_accuracy": 0.9961907118558884, "mean_gen_accuracy": 0.8837783932685852, "mean_token_accuracy": 0.9084430932998657, "num_tokens": 324280665.0, "sample_num_tokens": 8673.25, "step": 9697, "total_num_tokens": 324315358.0, "z_loss": 0.0003420644497964531 }, { "copy_logits_max": -4.721146106719971, "copy_logits_min": -750000000.0, "copy_num_tokens": 497.5, "epoch": 1.9805463364820015, "gen_logits_max": 3.503988742828369, "gen_logits_mean": -16.910541534423828, "gen_logits_min": -29.124465942382812, "gen_logits_std": 3.5029349327087402, "gen_loss": 0.27326563000679016, "grad_norm": 0.3364708817910085, "learning_rate": 1.867957894736842e-05, "loss": 0.2624, "mean_copy_accuracy": 0.9969407021999359, "mean_gen_accuracy": 0.8833251893520355, "mean_token_accuracy": 0.910431981086731, "num_tokens": 324541263.0, "sample_num_tokens": 8272.75, "step": 9698, "total_num_tokens": 324574354.0, "z_loss": 0.00043234843178652227 }, { "copy_logits_max": -6.927249431610107, "copy_logits_min": -750000000.0, "copy_num_tokens": 373.0625, "epoch": 1.9807505744191984, "gen_logits_max": 3.982461452484131, "gen_logits_mean": -16.67128562927246, "gen_logits_min": -29.09776496887207, "gen_logits_std": 3.4578137397766113, "gen_loss": 0.3186684250831604, "grad_norm": 0.3888346958762808, "learning_rate": 1.8678315789473685e-05, "loss": 0.2909, "mean_copy_accuracy": 0.9964497983455658, "mean_gen_accuracy": 0.8729014098644257, "mean_token_accuracy": 0.9006533026695251, "num_tokens": 324783414.0, "sample_num_tokens": 7707.5, "step": 9699, "total_num_tokens": 324814244.0, "z_loss": 0.00046963777276687324 }, { "copy_logits_max": -3.49721622467041, "copy_logits_min": -750000000.0, "copy_num_tokens": 582.6875, "epoch": 1.9809548123563951, "gen_logits_max": 4.297724723815918, "gen_logits_mean": -15.515705108642578, "gen_logits_min": -28.170759201049805, "gen_logits_std": 3.4222073554992676, "gen_loss": 0.22634641826152802, "grad_norm": 0.3185750957718909, "learning_rate": 1.8677052631578946e-05, "loss": 0.2545, "mean_copy_accuracy": 0.997571125626564, "mean_gen_accuracy": 0.8874266594648361, "mean_token_accuracy": 0.9126230478286743, "num_tokens": 325041387.0, "sample_num_tokens": 8767.75, "step": 9700, "total_num_tokens": 325076458.0, "z_loss": 0.000378006836399436 }, { "copy_logits_max": -7.414957046508789, "copy_logits_min": -687500032.0, "copy_num_tokens": 299.625, "epoch": 1.981159050293592, "gen_logits_max": 4.734487533569336, "gen_logits_mean": -16.736270904541016, "gen_logits_min": -28.800174713134766, "gen_logits_std": 3.4754350185394287, "gen_loss": 0.2989894151687622, "grad_norm": 0.3654708553769618, "learning_rate": 1.867578947368421e-05, "loss": 0.2925, "mean_copy_accuracy": 0.9956558048725128, "mean_gen_accuracy": 0.8771636039018631, "mean_token_accuracy": 0.8988778740167618, "num_tokens": 325275759.0, "sample_num_tokens": 7647.75, "step": 9701, "total_num_tokens": 325306350.0, "z_loss": 0.0004695239767897874 }, { "copy_logits_max": -3.7331430912017822, "copy_logits_min": -687500032.0, "copy_num_tokens": 417.6875, "epoch": 1.981363288230789, "gen_logits_max": 4.639734268188477, "gen_logits_mean": -16.05034065246582, "gen_logits_min": -28.0576171875, "gen_logits_std": 3.410989761352539, "gen_loss": 0.3449488580226898, "grad_norm": 0.377445391962647, "learning_rate": 1.867452631578947e-05, "loss": 0.2811, "mean_copy_accuracy": 0.9975209981203079, "mean_gen_accuracy": 0.8713892251253128, "mean_token_accuracy": 0.9044265300035477, "num_tokens": 325545639.0, "sample_num_tokens": 8731.75, "step": 9702, "total_num_tokens": 325580566.0, "z_loss": 0.0005132357473485172 }, { "copy_logits_max": -2.093519449234009, "copy_logits_min": -750000000.0, "copy_num_tokens": 585.375, "epoch": 1.9815675261679857, "gen_logits_max": 3.093651294708252, "gen_logits_mean": -16.63936996459961, "gen_logits_min": -29.114667892456055, "gen_logits_std": 3.466797351837158, "gen_loss": 0.2423686385154724, "grad_norm": 0.3397753078187806, "learning_rate": 1.867326315789474e-05, "loss": 0.257, "mean_copy_accuracy": 0.9976135045289993, "mean_gen_accuracy": 0.8828512132167816, "mean_token_accuracy": 0.9128234684467316, "num_tokens": 325812877.0, "sample_num_tokens": 8722.75, "step": 9703, "total_num_tokens": 325847768.0, "z_loss": 0.00038438753108493984 }, { "copy_logits_max": -3.677194118499756, "copy_logits_min": -750000000.0, "copy_num_tokens": 587.375, "epoch": 1.9817717641051824, "gen_logits_max": 3.1461739540100098, "gen_logits_mean": -17.259803771972656, "gen_logits_min": -29.284841537475586, "gen_logits_std": 3.467973232269287, "gen_loss": 0.273487389087677, "grad_norm": 0.34519120810685894, "learning_rate": 1.8672e-05, "loss": 0.2637, "mean_copy_accuracy": 0.9982977211475372, "mean_gen_accuracy": 0.8792815804481506, "mean_token_accuracy": 0.9096134454011917, "num_tokens": 326078040.0, "sample_num_tokens": 9226.0, "step": 9704, "total_num_tokens": 326114944.0, "z_loss": 0.000402999110519886 }, { "copy_logits_max": -4.582596778869629, "copy_logits_min": -750000000.0, "copy_num_tokens": 429.5, "epoch": 1.9819760020423793, "gen_logits_max": 3.408681869506836, "gen_logits_mean": -17.272985458374023, "gen_logits_min": -29.55865478515625, "gen_logits_std": 3.5218114852905273, "gen_loss": 0.2893901765346527, "grad_norm": 0.3418362104418695, "learning_rate": 1.8670736842105264e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9971616566181183, "mean_gen_accuracy": 0.8746621012687683, "mean_token_accuracy": 0.9035778045654297, "num_tokens": 326335291.0, "sample_num_tokens": 7270.75, "step": 9705, "total_num_tokens": 326364374.0, "z_loss": 0.0004165775899309665 }, { "copy_logits_max": -7.095189094543457, "copy_logits_min": -750000064.0, "copy_num_tokens": 326.75, "epoch": 1.9821802399795763, "gen_logits_max": 3.6095688343048096, "gen_logits_mean": -17.751075744628906, "gen_logits_min": -30.349185943603516, "gen_logits_std": 3.5328853130340576, "gen_loss": 0.25957876443862915, "grad_norm": 0.3330926422350998, "learning_rate": 1.8669473684210525e-05, "loss": 0.2708, "mean_copy_accuracy": 0.9977044016122818, "mean_gen_accuracy": 0.8811022043228149, "mean_token_accuracy": 0.9086776673793793, "num_tokens": 326613306.0, "sample_num_tokens": 7644.0, "step": 9706, "total_num_tokens": 326643882.0, "z_loss": 0.00037270988104864955 }, { "copy_logits_max": -4.504948616027832, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.3125, "epoch": 1.982384477916773, "gen_logits_max": 3.807150363922119, "gen_logits_mean": -15.793389320373535, "gen_logits_min": -28.04236602783203, "gen_logits_std": 3.470916271209717, "gen_loss": 0.26180508732795715, "grad_norm": 0.3391914737903879, "learning_rate": 1.866821052631579e-05, "loss": 0.2676, "mean_copy_accuracy": 0.9970444142818451, "mean_gen_accuracy": 0.8795359879732132, "mean_token_accuracy": 0.9098764359951019, "num_tokens": 326894380.0, "sample_num_tokens": 8202.0, "step": 9707, "total_num_tokens": 326927188.0, "z_loss": 0.00039798315265215933 }, { "copy_logits_max": -4.470290660858154, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.0, "epoch": 1.98258871585397, "gen_logits_max": 3.6421117782592773, "gen_logits_mean": -17.21271514892578, "gen_logits_min": -29.218069076538086, "gen_logits_std": 3.473695993423462, "gen_loss": 0.3318280279636383, "grad_norm": 0.38075203069917063, "learning_rate": 1.8666947368421054e-05, "loss": 0.3105, "mean_copy_accuracy": 0.9971970468759537, "mean_gen_accuracy": 0.8656569868326187, "mean_token_accuracy": 0.8950702399015427, "num_tokens": 327162011.0, "sample_num_tokens": 8279.25, "step": 9708, "total_num_tokens": 327195128.0, "z_loss": 0.000495394691824913 }, { "copy_logits_max": -3.818352222442627, "copy_logits_min": -687500032.0, "copy_num_tokens": 593.1875, "epoch": 1.9827929537911668, "gen_logits_max": 3.924701690673828, "gen_logits_mean": -16.406871795654297, "gen_logits_min": -29.029052734375, "gen_logits_std": 3.4955034255981445, "gen_loss": 0.26783210039138794, "grad_norm": 0.33615920719986353, "learning_rate": 1.8665684210526315e-05, "loss": 0.2702, "mean_copy_accuracy": 0.9980366826057434, "mean_gen_accuracy": 0.8782903552055359, "mean_token_accuracy": 0.9081113040447235, "num_tokens": 327435009.0, "sample_num_tokens": 9389.25, "step": 9709, "total_num_tokens": 327472566.0, "z_loss": 0.0003993496939074248 }, { "copy_logits_max": -4.375813007354736, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.3125, "epoch": 1.9829971917283635, "gen_logits_max": 3.2037906646728516, "gen_logits_mean": -17.42333221435547, "gen_logits_min": -29.64148712158203, "gen_logits_std": 3.5069870948791504, "gen_loss": 0.2869742512702942, "grad_norm": 0.33236452665219574, "learning_rate": 1.866442105263158e-05, "loss": 0.2665, "mean_copy_accuracy": 0.9982226490974426, "mean_gen_accuracy": 0.8749320358037949, "mean_token_accuracy": 0.9087613821029663, "num_tokens": 327727435.0, "sample_num_tokens": 8379.25, "step": 9710, "total_num_tokens": 327760952.0, "z_loss": 0.00045606071944348514 }, { "copy_logits_max": -5.680633068084717, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.625, "epoch": 1.9832014296655602, "gen_logits_max": 5.242682456970215, "gen_logits_mean": -13.920857429504395, "gen_logits_min": -26.425132751464844, "gen_logits_std": 3.386617660522461, "gen_loss": 0.27176612615585327, "grad_norm": 0.36828183311945667, "learning_rate": 1.8663157894736844e-05, "loss": 0.2774, "mean_copy_accuracy": 0.9973466098308563, "mean_gen_accuracy": 0.8793985992670059, "mean_token_accuracy": 0.9049296528100967, "num_tokens": 327981471.0, "sample_num_tokens": 8936.25, "step": 9711, "total_num_tokens": 328017216.0, "z_loss": 0.000409131491323933 }, { "copy_logits_max": -5.441689491271973, "copy_logits_min": -687500032.0, "copy_num_tokens": 541.1875, "epoch": 1.9834056676027572, "gen_logits_max": 3.234069347381592, "gen_logits_mean": -17.177139282226562, "gen_logits_min": -29.634244918823242, "gen_logits_std": 3.508880615234375, "gen_loss": 0.27324390411376953, "grad_norm": 0.3412095867393249, "learning_rate": 1.8661894736842108e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9960618019104004, "mean_gen_accuracy": 0.8752882778644562, "mean_token_accuracy": 0.9051958471536636, "num_tokens": 328254852.0, "sample_num_tokens": 8513.0, "step": 9712, "total_num_tokens": 328288904.0, "z_loss": 0.0004568842123262584 }, { "copy_logits_max": -2.916624069213867, "copy_logits_min": -750000000.0, "copy_num_tokens": 531.125, "epoch": 1.983609905539954, "gen_logits_max": 3.837078094482422, "gen_logits_mean": -16.13396453857422, "gen_logits_min": -28.822425842285156, "gen_logits_std": 3.484480857849121, "gen_loss": 0.25669345259666443, "grad_norm": 0.36399561913922, "learning_rate": 1.866063157894737e-05, "loss": 0.2733, "mean_copy_accuracy": 0.9969068318605423, "mean_gen_accuracy": 0.8800630122423172, "mean_token_accuracy": 0.9084243029356003, "num_tokens": 328523120.0, "sample_num_tokens": 8409.5, "step": 9713, "total_num_tokens": 328556758.0, "z_loss": 0.00034647915163077414 }, { "copy_logits_max": -6.533865928649902, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.4375, "epoch": 1.9838141434771508, "gen_logits_max": 3.8171403408050537, "gen_logits_mean": -17.615375518798828, "gen_logits_min": -30.046146392822266, "gen_logits_std": 3.533684253692627, "gen_loss": 0.2647920548915863, "grad_norm": 0.3514918359307093, "learning_rate": 1.8659368421052633e-05, "loss": 0.2531, "mean_copy_accuracy": 0.9971877485513687, "mean_gen_accuracy": 0.8888840675354004, "mean_token_accuracy": 0.911889523267746, "num_tokens": 328794637.0, "sample_num_tokens": 7240.75, "step": 9714, "total_num_tokens": 328823600.0, "z_loss": 0.00038717754068784416 }, { "copy_logits_max": -5.4710187911987305, "copy_logits_min": -750000000.0, "copy_num_tokens": 600.9375, "epoch": 1.9840183814143477, "gen_logits_max": 4.056027412414551, "gen_logits_mean": -17.072975158691406, "gen_logits_min": -29.956653594970703, "gen_logits_std": 3.5120742321014404, "gen_loss": 0.2962626814842224, "grad_norm": 0.3561521800265052, "learning_rate": 1.8658105263157894e-05, "loss": 0.2896, "mean_copy_accuracy": 0.9974076747894287, "mean_gen_accuracy": 0.8712856024503708, "mean_token_accuracy": 0.9011476635932922, "num_tokens": 329051353.0, "sample_num_tokens": 9463.25, "step": 9715, "total_num_tokens": 329089206.0, "z_loss": 0.0004461805219762027 }, { "copy_logits_max": -5.662330627441406, "copy_logits_min": -750000000.0, "copy_num_tokens": 449.3125, "epoch": 1.9842226193515446, "gen_logits_max": 3.1238491535186768, "gen_logits_mean": -18.137950897216797, "gen_logits_min": -30.786577224731445, "gen_logits_std": 3.5609006881713867, "gen_loss": 0.2666930854320526, "grad_norm": 0.34180970413969525, "learning_rate": 1.8656842105263158e-05, "loss": 0.2696, "mean_copy_accuracy": 0.9979466646909714, "mean_gen_accuracy": 0.8741292506456375, "mean_token_accuracy": 0.9081687033176422, "num_tokens": 329329471.0, "sample_num_tokens": 7885.75, "step": 9716, "total_num_tokens": 329361014.0, "z_loss": 0.0004342871834523976 }, { "copy_logits_max": -5.288383483886719, "copy_logits_min": -750000064.0, "copy_num_tokens": 479.3125, "epoch": 1.9844268572887414, "gen_logits_max": 3.8646082878112793, "gen_logits_mean": -16.882022857666016, "gen_logits_min": -29.75605010986328, "gen_logits_std": 3.503082513809204, "gen_loss": 0.2890854775905609, "grad_norm": 0.3559380756843887, "learning_rate": 1.865557894736842e-05, "loss": 0.2783, "mean_copy_accuracy": 0.997591957449913, "mean_gen_accuracy": 0.8761812150478363, "mean_token_accuracy": 0.905061736702919, "num_tokens": 329578793.0, "sample_num_tokens": 8270.75, "step": 9717, "total_num_tokens": 329611876.0, "z_loss": 0.00047471534344367683 }, { "copy_logits_max": -6.957519054412842, "copy_logits_min": -687500032.0, "copy_num_tokens": 666.375, "epoch": 1.984631095225938, "gen_logits_max": 4.030378341674805, "gen_logits_mean": -15.638636589050293, "gen_logits_min": -28.548337936401367, "gen_logits_std": 3.4499337673187256, "gen_loss": 0.2555314302444458, "grad_norm": 0.3363612791349818, "learning_rate": 1.8654315789473684e-05, "loss": 0.2771, "mean_copy_accuracy": 0.9980272799730301, "mean_gen_accuracy": 0.8787142932415009, "mean_token_accuracy": 0.9075476676225662, "num_tokens": 329836894.0, "sample_num_tokens": 9423.0, "step": 9718, "total_num_tokens": 329874586.0, "z_loss": 0.00038362122722901404 }, { "copy_logits_max": -5.529032230377197, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.3125, "epoch": 1.9848353331631352, "gen_logits_max": 4.246651649475098, "gen_logits_mean": -15.527902603149414, "gen_logits_min": -28.049015045166016, "gen_logits_std": 3.442763090133667, "gen_loss": 0.25574392080307007, "grad_norm": 0.35236069612010135, "learning_rate": 1.8653052631578948e-05, "loss": 0.2567, "mean_copy_accuracy": 0.9970870018005371, "mean_gen_accuracy": 0.8839959651231766, "mean_token_accuracy": 0.9117256551980972, "num_tokens": 330118422.0, "sample_num_tokens": 9270.5, "step": 9719, "total_num_tokens": 330155504.0, "z_loss": 0.0003761436091735959 }, { "copy_logits_max": -5.631595611572266, "copy_logits_min": -687500032.0, "copy_num_tokens": 465.0, "epoch": 1.985039571100332, "gen_logits_max": 3.11299991607666, "gen_logits_mean": -17.598735809326172, "gen_logits_min": -30.063899993896484, "gen_logits_std": 3.5485358238220215, "gen_loss": 0.24137218296527863, "grad_norm": 0.33463714512734843, "learning_rate": 1.8651789473684212e-05, "loss": 0.2618, "mean_copy_accuracy": 0.996845543384552, "mean_gen_accuracy": 0.8825011849403381, "mean_token_accuracy": 0.908994197845459, "num_tokens": 330372018.0, "sample_num_tokens": 7780.5, "step": 9720, "total_num_tokens": 330403140.0, "z_loss": 0.0004100279766134918 }, { "copy_logits_max": -7.247272968292236, "copy_logits_min": -750000000.0, "copy_num_tokens": 493.875, "epoch": 1.9852438090375286, "gen_logits_max": 3.6770224571228027, "gen_logits_mean": -17.483924865722656, "gen_logits_min": -29.889198303222656, "gen_logits_std": 3.5268516540527344, "gen_loss": 0.26299798488616943, "grad_norm": 0.3409618341068389, "learning_rate": 1.8650526315789476e-05, "loss": 0.2703, "mean_copy_accuracy": 0.9974056482315063, "mean_gen_accuracy": 0.8752250224351883, "mean_token_accuracy": 0.9079022407531738, "num_tokens": 330654273.0, "sample_num_tokens": 8351.25, "step": 9721, "total_num_tokens": 330687678.0, "z_loss": 0.00040620361687615514 }, { "copy_logits_max": -5.581743240356445, "copy_logits_min": -750000000.0, "copy_num_tokens": 473.0, "epoch": 1.9854480469747255, "gen_logits_max": 2.715771198272705, "gen_logits_mean": -18.195714950561523, "gen_logits_min": -30.899099349975586, "gen_logits_std": 3.547691822052002, "gen_loss": 0.2949577271938324, "grad_norm": 0.37842179652868835, "learning_rate": 1.8649263157894737e-05, "loss": 0.259, "mean_copy_accuracy": 0.9969232827425003, "mean_gen_accuracy": 0.8821730613708496, "mean_token_accuracy": 0.9123825430870056, "num_tokens": 330921151.0, "sample_num_tokens": 7977.25, "step": 9722, "total_num_tokens": 330953060.0, "z_loss": 0.0004727889026980847 }, { "copy_logits_max": -4.78234338760376, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.8125, "epoch": 1.9856522849119225, "gen_logits_max": 4.428123474121094, "gen_logits_mean": -15.056175231933594, "gen_logits_min": -28.194490432739258, "gen_logits_std": 3.423722267150879, "gen_loss": 0.31379008293151855, "grad_norm": 0.3615841146196248, "learning_rate": 1.8648000000000002e-05, "loss": 0.2968, "mean_copy_accuracy": 0.9973621666431427, "mean_gen_accuracy": 0.866620734333992, "mean_token_accuracy": 0.8984593749046326, "num_tokens": 331194157.0, "sample_num_tokens": 7509.75, "step": 9723, "total_num_tokens": 331224196.0, "z_loss": 0.00042171176755800843 }, { "copy_logits_max": -8.58582878112793, "copy_logits_min": -750000000.0, "copy_num_tokens": 401.125, "epoch": 1.9858565228491192, "gen_logits_max": 3.0430831909179688, "gen_logits_mean": -19.33129119873047, "gen_logits_min": -31.478961944580078, "gen_logits_std": 3.620145320892334, "gen_loss": 0.25307726860046387, "grad_norm": 0.32997354216852237, "learning_rate": 1.8646736842105263e-05, "loss": 0.2753, "mean_copy_accuracy": 0.997075229883194, "mean_gen_accuracy": 0.8767801821231842, "mean_token_accuracy": 0.9049721658229828, "num_tokens": 331455846.0, "sample_num_tokens": 8906.0, "step": 9724, "total_num_tokens": 331491470.0, "z_loss": 0.00039863918209448457 }, { "copy_logits_max": -4.425863742828369, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.3125, "epoch": 1.986060760786316, "gen_logits_max": 4.811237335205078, "gen_logits_mean": -17.36092758178711, "gen_logits_min": -29.823760986328125, "gen_logits_std": 3.517313003540039, "gen_loss": 0.28731197118759155, "grad_norm": 0.3698905410501554, "learning_rate": 1.8645473684210527e-05, "loss": 0.3058, "mean_copy_accuracy": 0.9963851124048233, "mean_gen_accuracy": 0.8694882392883301, "mean_token_accuracy": 0.8966579586267471, "num_tokens": 331721466.0, "sample_num_tokens": 7766.5, "step": 9725, "total_num_tokens": 331752532.0, "z_loss": 0.0004159938544034958 }, { "copy_logits_max": -7.445067405700684, "copy_logits_min": -687500032.0, "copy_num_tokens": 330.9375, "epoch": 1.986264998723513, "gen_logits_max": 3.642808437347412, "gen_logits_mean": -18.26837158203125, "gen_logits_min": -30.66775131225586, "gen_logits_std": 3.5595059394836426, "gen_loss": 0.27930378913879395, "grad_norm": 0.3530462798028047, "learning_rate": 1.8644210526315788e-05, "loss": 0.2681, "mean_copy_accuracy": 0.9976774007081985, "mean_gen_accuracy": 0.8836251646280289, "mean_token_accuracy": 0.9104509651660919, "num_tokens": 331989311.0, "sample_num_tokens": 7414.25, "step": 9726, "total_num_tokens": 332018968.0, "z_loss": 0.0003933607949875295 }, { "copy_logits_max": -3.372859477996826, "copy_logits_min": -750000128.0, "copy_num_tokens": 497.9375, "epoch": 1.9864692366607097, "gen_logits_max": 3.9326913356781006, "gen_logits_mean": -17.254623413085938, "gen_logits_min": -30.23729705810547, "gen_logits_std": 3.560203790664673, "gen_loss": 0.27172544598579407, "grad_norm": 0.34070026323857416, "learning_rate": 1.8642947368421052e-05, "loss": 0.2656, "mean_copy_accuracy": 0.99661485850811, "mean_gen_accuracy": 0.8763326853513718, "mean_token_accuracy": 0.9088330566883087, "num_tokens": 332269595.0, "sample_num_tokens": 8798.25, "step": 9727, "total_num_tokens": 332304788.0, "z_loss": 0.0004155987116973847 }, { "copy_logits_max": -2.120185613632202, "copy_logits_min": -625000064.0, "copy_num_tokens": 651.0, "epoch": 1.9866734745979064, "gen_logits_max": 4.391297340393066, "gen_logits_mean": -15.6235933303833, "gen_logits_min": -27.980388641357422, "gen_logits_std": 3.468878984451294, "gen_loss": 0.24898938834667206, "grad_norm": 0.3651283799592372, "learning_rate": 1.8641684210526316e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9968766868114471, "mean_gen_accuracy": 0.8791429549455643, "mean_token_accuracy": 0.9034508317708969, "num_tokens": 332529524.0, "sample_num_tokens": 9871.5, "step": 9728, "total_num_tokens": 332569010.0, "z_loss": 0.0003924538614228368 }, { "copy_logits_max": -4.948662757873535, "copy_logits_min": -750000000.0, "copy_num_tokens": 442.8125, "epoch": 1.9868777125351034, "gen_logits_max": 4.084014892578125, "gen_logits_mean": -16.64425277709961, "gen_logits_min": -29.019371032714844, "gen_logits_std": 3.506248950958252, "gen_loss": 0.24654555320739746, "grad_norm": 0.32440415152664137, "learning_rate": 1.864042105263158e-05, "loss": 0.2617, "mean_copy_accuracy": 0.9978309273719788, "mean_gen_accuracy": 0.8810356110334396, "mean_token_accuracy": 0.9106475710868835, "num_tokens": 332804225.0, "sample_num_tokens": 8794.75, "step": 9729, "total_num_tokens": 332839404.0, "z_loss": 0.0003220995422452688 }, { "copy_logits_max": -3.707334518432617, "copy_logits_min": -750000000.0, "copy_num_tokens": 591.5, "epoch": 1.9870819504723003, "gen_logits_max": 3.7880125045776367, "gen_logits_mean": -16.412790298461914, "gen_logits_min": -28.84300422668457, "gen_logits_std": 3.501047372817993, "gen_loss": 0.2343919426202774, "grad_norm": 0.33937801745756874, "learning_rate": 1.8639157894736842e-05, "loss": 0.2709, "mean_copy_accuracy": 0.9976286590099335, "mean_gen_accuracy": 0.8778292089700699, "mean_token_accuracy": 0.908674880862236, "num_tokens": 333081830.0, "sample_num_tokens": 9210.0, "step": 9730, "total_num_tokens": 333118670.0, "z_loss": 0.0003308792656753212 }, { "copy_logits_max": -5.5909318923950195, "copy_logits_min": -750000000.0, "copy_num_tokens": 349.5625, "epoch": 1.987286188409497, "gen_logits_max": 3.8705639839172363, "gen_logits_mean": -17.030607223510742, "gen_logits_min": -30.320419311523438, "gen_logits_std": 3.514583110809326, "gen_loss": 0.2779274582862854, "grad_norm": 0.3693485557443945, "learning_rate": 1.8637894736842106e-05, "loss": 0.2837, "mean_copy_accuracy": 0.9969596713781357, "mean_gen_accuracy": 0.878995269536972, "mean_token_accuracy": 0.9037525653839111, "num_tokens": 333330563.0, "sample_num_tokens": 7187.25, "step": 9731, "total_num_tokens": 333359312.0, "z_loss": 0.00040212093153968453 }, { "copy_logits_max": -1.8516874313354492, "copy_logits_min": -750000064.0, "copy_num_tokens": 584.8125, "epoch": 1.987490426346694, "gen_logits_max": 3.8256545066833496, "gen_logits_mean": -17.100894927978516, "gen_logits_min": -29.728137969970703, "gen_logits_std": 3.5622081756591797, "gen_loss": 0.20935139060020447, "grad_norm": 0.3273616264880712, "learning_rate": 1.8636631578947367e-05, "loss": 0.248, "mean_copy_accuracy": 0.9979183673858643, "mean_gen_accuracy": 0.8839178085327148, "mean_token_accuracy": 0.9138974994421005, "num_tokens": 333597420.0, "sample_num_tokens": 8611.5, "step": 9732, "total_num_tokens": 333631866.0, "z_loss": 0.00031197996577247977 }, { "copy_logits_max": -3.519407272338867, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.0625, "epoch": 1.9876946642838909, "gen_logits_max": 3.2694268226623535, "gen_logits_mean": -17.93292999267578, "gen_logits_min": -30.67339515686035, "gen_logits_std": 3.555488109588623, "gen_loss": 0.28287988901138306, "grad_norm": 0.3405429687516686, "learning_rate": 1.863536842105263e-05, "loss": 0.2675, "mean_copy_accuracy": 0.9976392537355423, "mean_gen_accuracy": 0.8786750137805939, "mean_token_accuracy": 0.9092824906110764, "num_tokens": 333886711.0, "sample_num_tokens": 8840.25, "step": 9733, "total_num_tokens": 333922072.0, "z_loss": 0.00041319173760712147 }, { "copy_logits_max": -0.9757578372955322, "copy_logits_min": -687500032.0, "copy_num_tokens": 423.875, "epoch": 1.9878989022210876, "gen_logits_max": 3.155565023422241, "gen_logits_mean": -17.107845306396484, "gen_logits_min": -29.499774932861328, "gen_logits_std": 3.5267179012298584, "gen_loss": 0.2841982841491699, "grad_norm": 0.3424523314787564, "learning_rate": 1.8634105263157896e-05, "loss": 0.2679, "mean_copy_accuracy": 0.9978366047143936, "mean_gen_accuracy": 0.8750660568475723, "mean_token_accuracy": 0.9086734354496002, "num_tokens": 334163691.0, "sample_num_tokens": 7265.25, "step": 9734, "total_num_tokens": 334192752.0, "z_loss": 0.00043096416629850864 }, { "copy_logits_max": 0.6266952753067017, "copy_logits_min": -750000064.0, "copy_num_tokens": 523.25, "epoch": 1.9881031401582843, "gen_logits_max": 4.959267616271973, "gen_logits_mean": -15.125910758972168, "gen_logits_min": -28.376697540283203, "gen_logits_std": 3.442148447036743, "gen_loss": 0.2620522081851959, "grad_norm": 0.33656809638057333, "learning_rate": 1.8632842105263157e-05, "loss": 0.2722, "mean_copy_accuracy": 0.9982472807168961, "mean_gen_accuracy": 0.8779165297746658, "mean_token_accuracy": 0.907416045665741, "num_tokens": 334418935.0, "sample_num_tokens": 9848.25, "step": 9735, "total_num_tokens": 334458328.0, "z_loss": 0.00044317872379906476 }, { "copy_logits_max": 1.663752555847168, "copy_logits_min": -687500032.0, "copy_num_tokens": 739.75, "epoch": 1.9883073780954812, "gen_logits_max": 3.802830696105957, "gen_logits_mean": -15.578189849853516, "gen_logits_min": -28.98249626159668, "gen_logits_std": 3.469050884246826, "gen_loss": 0.255790114402771, "grad_norm": 0.3275621091616099, "learning_rate": 1.8631578947368424e-05, "loss": 0.2812, "mean_copy_accuracy": 0.9978953897953033, "mean_gen_accuracy": 0.8690472394227982, "mean_token_accuracy": 0.9051903635263443, "num_tokens": 334698514.0, "sample_num_tokens": 9495.5, "step": 9736, "total_num_tokens": 334736496.0, "z_loss": 0.0004799086891580373 }, { "copy_logits_max": -2.4052205085754395, "copy_logits_min": -687500032.0, "copy_num_tokens": 602.625, "epoch": 1.9885116160326781, "gen_logits_max": 4.18064022064209, "gen_logits_mean": -15.819196701049805, "gen_logits_min": -28.920551300048828, "gen_logits_std": 3.4783012866973877, "gen_loss": 0.2608594298362732, "grad_norm": 0.3608324559163335, "learning_rate": 1.8630315789473685e-05, "loss": 0.2814, "mean_copy_accuracy": 0.9968911707401276, "mean_gen_accuracy": 0.8723291158676147, "mean_token_accuracy": 0.9054509699344635, "num_tokens": 334961694.0, "sample_num_tokens": 8249.5, "step": 9737, "total_num_tokens": 334994692.0, "z_loss": 0.0004735644324682653 }, { "copy_logits_max": -3.247567653656006, "copy_logits_min": -687500032.0, "copy_num_tokens": 351.25, "epoch": 1.9887158539698748, "gen_logits_max": 4.3895673751831055, "gen_logits_mean": -16.628292083740234, "gen_logits_min": -29.526046752929688, "gen_logits_std": 3.485755443572998, "gen_loss": 0.29444077610969543, "grad_norm": 0.3772468947513736, "learning_rate": 1.862905263157895e-05, "loss": 0.2786, "mean_copy_accuracy": 0.9967896789312363, "mean_gen_accuracy": 0.8765375167131424, "mean_token_accuracy": 0.9054038375616074, "num_tokens": 335239577.0, "sample_num_tokens": 7574.75, "step": 9738, "total_num_tokens": 335269876.0, "z_loss": 0.0004606957081705332 }, { "copy_logits_max": -4.001999855041504, "copy_logits_min": -750000000.0, "copy_num_tokens": 372.25, "epoch": 1.9889200919070718, "gen_logits_max": 3.9019010066986084, "gen_logits_mean": -17.81817626953125, "gen_logits_min": -30.837474822998047, "gen_logits_std": 3.5843820571899414, "gen_loss": 0.21546027064323425, "grad_norm": 0.3361562117361322, "learning_rate": 1.862778947368421e-05, "loss": 0.2572, "mean_copy_accuracy": 0.9968767762184143, "mean_gen_accuracy": 0.8832210153341293, "mean_token_accuracy": 0.9109801054000854, "num_tokens": 335506833.0, "sample_num_tokens": 7159.75, "step": 9739, "total_num_tokens": 335535472.0, "z_loss": 0.0003602250653784722 }, { "copy_logits_max": -2.5266215801239014, "copy_logits_min": -750000000.0, "copy_num_tokens": 433.4375, "epoch": 1.9891243298442687, "gen_logits_max": 3.0699400901794434, "gen_logits_mean": -18.386606216430664, "gen_logits_min": -31.22298240661621, "gen_logits_std": 3.548487663269043, "gen_loss": 0.3053118586540222, "grad_norm": 0.3574088257442118, "learning_rate": 1.8626526315789475e-05, "loss": 0.2779, "mean_copy_accuracy": 0.9970876127481461, "mean_gen_accuracy": 0.87491774559021, "mean_token_accuracy": 0.9049811065196991, "num_tokens": 335758956.0, "sample_num_tokens": 8166.5, "step": 9740, "total_num_tokens": 335791622.0, "z_loss": 0.0004584956041071564 }, { "copy_logits_max": -2.9635353088378906, "copy_logits_min": -750000000.0, "copy_num_tokens": 503.4375, "epoch": 1.9893285677814654, "gen_logits_max": 4.686574935913086, "gen_logits_mean": -16.793609619140625, "gen_logits_min": -30.369770050048828, "gen_logits_std": 3.528188943862915, "gen_loss": 0.2556959390640259, "grad_norm": 0.36360767757941737, "learning_rate": 1.8625263157894736e-05, "loss": 0.2799, "mean_copy_accuracy": 0.9973680824041367, "mean_gen_accuracy": 0.8800937533378601, "mean_token_accuracy": 0.9050607830286026, "num_tokens": 336030964.0, "sample_num_tokens": 9278.0, "step": 9741, "total_num_tokens": 336068076.0, "z_loss": 0.00037064211210235953 }, { "copy_logits_max": -2.4307384490966797, "copy_logits_min": -750000064.0, "copy_num_tokens": 421.125, "epoch": 1.989532805718662, "gen_logits_max": 3.8008604049682617, "gen_logits_mean": -17.474552154541016, "gen_logits_min": -30.180648803710938, "gen_logits_std": 3.5442099571228027, "gen_loss": 0.2670835554599762, "grad_norm": 0.37589795435821605, "learning_rate": 1.8624e-05, "loss": 0.2776, "mean_copy_accuracy": 0.9967762231826782, "mean_gen_accuracy": 0.8777066916227341, "mean_token_accuracy": 0.9056212157011032, "num_tokens": 336296137.0, "sample_num_tokens": 7944.25, "step": 9742, "total_num_tokens": 336327914.0, "z_loss": 0.00038955610943958163 }, { "copy_logits_max": -1.100074052810669, "copy_logits_min": -750000000.0, "copy_num_tokens": 528.5, "epoch": 1.9897370436558592, "gen_logits_max": 3.3413405418395996, "gen_logits_mean": -16.572280883789062, "gen_logits_min": -29.915178298950195, "gen_logits_std": 3.508884906768799, "gen_loss": 0.26974958181381226, "grad_norm": 0.34406568630947304, "learning_rate": 1.862273684210526e-05, "loss": 0.2684, "mean_copy_accuracy": 0.9976286441087723, "mean_gen_accuracy": 0.877845823764801, "mean_token_accuracy": 0.9084245562553406, "num_tokens": 336576861.0, "sample_num_tokens": 8337.75, "step": 9743, "total_num_tokens": 336610212.0, "z_loss": 0.00039209079113788903 }, { "copy_logits_max": -3.37792706489563, "copy_logits_min": -687500032.0, "copy_num_tokens": 490.5, "epoch": 1.989941281593056, "gen_logits_max": 4.002782821655273, "gen_logits_mean": -16.281457901000977, "gen_logits_min": -28.868061065673828, "gen_logits_std": 3.4853358268737793, "gen_loss": 0.2963927686214447, "grad_norm": 0.4997809354370775, "learning_rate": 1.862147368421053e-05, "loss": 0.2973, "mean_copy_accuracy": 0.996202290058136, "mean_gen_accuracy": 0.8734239637851715, "mean_token_accuracy": 0.899638906121254, "num_tokens": 336817855.0, "sample_num_tokens": 8470.25, "step": 9744, "total_num_tokens": 336851736.0, "z_loss": 0.00046397795085795224 }, { "copy_logits_max": -4.461273670196533, "copy_logits_min": -750000000.0, "copy_num_tokens": 553.0, "epoch": 1.9901455195302526, "gen_logits_max": 2.9296624660491943, "gen_logits_mean": -17.76582145690918, "gen_logits_min": -30.708938598632812, "gen_logits_std": 3.5559945106506348, "gen_loss": 0.26774150133132935, "grad_norm": 0.408098484684622, "learning_rate": 1.862021052631579e-05, "loss": 0.2577, "mean_copy_accuracy": 0.9965076148509979, "mean_gen_accuracy": 0.8810208439826965, "mean_token_accuracy": 0.9109031111001968, "num_tokens": 337091266.0, "sample_num_tokens": 8589.5, "step": 9745, "total_num_tokens": 337125624.0, "z_loss": 0.00042649114038795233 }, { "copy_logits_max": 0.7606325745582581, "copy_logits_min": -750000064.0, "copy_num_tokens": 676.5, "epoch": 1.9903497574674496, "gen_logits_max": 4.399144649505615, "gen_logits_mean": -14.689703941345215, "gen_logits_min": -28.10670280456543, "gen_logits_std": 3.4263195991516113, "gen_loss": 0.26853716373443604, "grad_norm": 0.3456326031090208, "learning_rate": 1.8618947368421054e-05, "loss": 0.2626, "mean_copy_accuracy": 0.9978403151035309, "mean_gen_accuracy": 0.8752626329660416, "mean_token_accuracy": 0.9105955362319946, "num_tokens": 337369903.0, "sample_num_tokens": 9725.25, "step": 9746, "total_num_tokens": 337408804.0, "z_loss": 0.0004240098060108721 }, { "copy_logits_max": -0.8775927424430847, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.25, "epoch": 1.9905539954046465, "gen_logits_max": 3.706109046936035, "gen_logits_mean": -17.48014259338379, "gen_logits_min": -29.6162109375, "gen_logits_std": 3.521566390991211, "gen_loss": 0.3125951290130615, "grad_norm": 0.36157035018331035, "learning_rate": 1.8617684210526318e-05, "loss": 0.277, "mean_copy_accuracy": 0.9975557923316956, "mean_gen_accuracy": 0.8744117021560669, "mean_token_accuracy": 0.9053885340690613, "num_tokens": 337653604.0, "sample_num_tokens": 8088.5, "step": 9747, "total_num_tokens": 337685958.0, "z_loss": 0.0005909351748414338 }, { "copy_logits_max": 2.1230568885803223, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.625, "epoch": 1.9907582333418432, "gen_logits_max": 2.9125795364379883, "gen_logits_mean": -17.906211853027344, "gen_logits_min": -30.551300048828125, "gen_logits_std": 3.5697579383850098, "gen_loss": 0.2788826823234558, "grad_norm": 0.364197437920073, "learning_rate": 1.861642105263158e-05, "loss": 0.2652, "mean_copy_accuracy": 0.997082069516182, "mean_gen_accuracy": 0.880561500787735, "mean_token_accuracy": 0.9090899080038071, "num_tokens": 337927196.0, "sample_num_tokens": 8785.0, "step": 9748, "total_num_tokens": 337962336.0, "z_loss": 0.000616333563812077 }, { "copy_logits_max": 0.13996374607086182, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.625, "epoch": 1.9909624712790401, "gen_logits_max": 4.608671188354492, "gen_logits_mean": -14.723114013671875, "gen_logits_min": -27.300018310546875, "gen_logits_std": 3.44287109375, "gen_loss": 0.2508067786693573, "grad_norm": 0.35055768485129035, "learning_rate": 1.8615157894736843e-05, "loss": 0.2677, "mean_copy_accuracy": 0.9966690093278885, "mean_gen_accuracy": 0.8817448019981384, "mean_token_accuracy": 0.9095900356769562, "num_tokens": 338193151.0, "sample_num_tokens": 9217.75, "step": 9749, "total_num_tokens": 338230022.0, "z_loss": 0.0005603985628113151 }, { "copy_logits_max": -4.077441692352295, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.25, "epoch": 1.991166709216237, "gen_logits_max": 2.510592460632324, "gen_logits_mean": -20.052526473999023, "gen_logits_min": -32.12028503417969, "gen_logits_std": 3.6750621795654297, "gen_loss": 0.2477165311574936, "grad_norm": 0.3788468755524082, "learning_rate": 1.8613894736842104e-05, "loss": 0.2827, "mean_copy_accuracy": 0.9977056682109833, "mean_gen_accuracy": 0.8792447298765182, "mean_token_accuracy": 0.9046571850776672, "num_tokens": 338469513.0, "sample_num_tokens": 7282.75, "step": 9750, "total_num_tokens": 338498644.0, "z_loss": 0.00036614545388147235 }, { "copy_logits_max": 0.9697845578193665, "copy_logits_min": -687500032.0, "copy_num_tokens": 554.9375, "epoch": 1.9913709471534338, "gen_logits_max": 4.174733638763428, "gen_logits_mean": -15.776761054992676, "gen_logits_min": -28.622859954833984, "gen_logits_std": 3.4850914478302, "gen_loss": 0.2595963478088379, "grad_norm": 0.3414578112136792, "learning_rate": 1.861263157894737e-05, "loss": 0.2572, "mean_copy_accuracy": 0.9961073249578476, "mean_gen_accuracy": 0.8880777060985565, "mean_token_accuracy": 0.9129601120948792, "num_tokens": 338727910.0, "sample_num_tokens": 9040.5, "step": 9751, "total_num_tokens": 338764072.0, "z_loss": 0.00041188974864780903 }, { "copy_logits_max": -1.2575905323028564, "copy_logits_min": -750000064.0, "copy_num_tokens": 528.0, "epoch": 1.9915751850906305, "gen_logits_max": 3.1060972213745117, "gen_logits_mean": -16.921756744384766, "gen_logits_min": -29.412979125976562, "gen_logits_std": 3.519961357116699, "gen_loss": 0.30411842465400696, "grad_norm": 0.37509248521165056, "learning_rate": 1.8611368421052633e-05, "loss": 0.2858, "mean_copy_accuracy": 0.9968039840459824, "mean_gen_accuracy": 0.8731626272201538, "mean_token_accuracy": 0.9036097526550293, "num_tokens": 338983902.0, "sample_num_tokens": 8442.0, "step": 9752, "total_num_tokens": 339017670.0, "z_loss": 0.00042268174001947045 }, { "copy_logits_max": 0.0764378011226654, "copy_logits_min": -750000064.0, "copy_num_tokens": 391.5625, "epoch": 1.9917794230278274, "gen_logits_max": 4.691073894500732, "gen_logits_mean": -15.385600090026855, "gen_logits_min": -28.151851654052734, "gen_logits_std": 3.446913242340088, "gen_loss": 0.345237672328949, "grad_norm": 0.36730173511268305, "learning_rate": 1.8610105263157897e-05, "loss": 0.2872, "mean_copy_accuracy": 0.9969346523284912, "mean_gen_accuracy": 0.8731620162725449, "mean_token_accuracy": 0.9022909849882126, "num_tokens": 339245911.0, "sample_num_tokens": 7605.75, "step": 9753, "total_num_tokens": 339276334.0, "z_loss": 0.0004267546464689076 }, { "copy_logits_max": -3.802447557449341, "copy_logits_min": -750000000.0, "copy_num_tokens": 547.0625, "epoch": 1.9919836609650243, "gen_logits_max": 2.4633121490478516, "gen_logits_mean": -18.17389678955078, "gen_logits_min": -30.545482635498047, "gen_logits_std": 3.6031837463378906, "gen_loss": 0.25441235303878784, "grad_norm": 0.34546100244545364, "learning_rate": 1.8608842105263158e-05, "loss": 0.2549, "mean_copy_accuracy": 0.9978928416967392, "mean_gen_accuracy": 0.8793098032474518, "mean_token_accuracy": 0.9123497158288956, "num_tokens": 339527701.0, "sample_num_tokens": 8579.25, "step": 9754, "total_num_tokens": 339562018.0, "z_loss": 0.00037181502557359636 }, { "copy_logits_max": -4.805638790130615, "copy_logits_min": -750000000.0, "copy_num_tokens": 299.4375, "epoch": 1.992187898902221, "gen_logits_max": 3.500410795211792, "gen_logits_mean": -17.736560821533203, "gen_logits_min": -29.719972610473633, "gen_logits_std": 3.541203022003174, "gen_loss": 0.27947670221328735, "grad_norm": 0.3524962565545875, "learning_rate": 1.8607578947368422e-05, "loss": 0.2842, "mean_copy_accuracy": 0.9970929175615311, "mean_gen_accuracy": 0.8737758696079254, "mean_token_accuracy": 0.9030650854110718, "num_tokens": 339802988.0, "sample_num_tokens": 6765.0, "step": 9755, "total_num_tokens": 339830048.0, "z_loss": 0.00039696384919807315 }, { "copy_logits_max": -5.693581581115723, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.9375, "epoch": 1.992392136839418, "gen_logits_max": 3.1741387844085693, "gen_logits_mean": -18.757381439208984, "gen_logits_min": -30.99675750732422, "gen_logits_std": 3.57181978225708, "gen_loss": 0.3173161745071411, "grad_norm": 0.4271445500377791, "learning_rate": 1.8606315789473683e-05, "loss": 0.2869, "mean_copy_accuracy": 0.9963569492101669, "mean_gen_accuracy": 0.8692998588085175, "mean_token_accuracy": 0.9037156999111176, "num_tokens": 340081827.0, "sample_num_tokens": 7932.75, "step": 9756, "total_num_tokens": 340113558.0, "z_loss": 0.00045876018702983856 }, { "copy_logits_max": -2.874169111251831, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.125, "epoch": 1.9925963747766149, "gen_logits_max": 3.7859537601470947, "gen_logits_mean": -17.869157791137695, "gen_logits_min": -30.547733306884766, "gen_logits_std": 3.5726184844970703, "gen_loss": 0.2801591157913208, "grad_norm": 0.36729379384564464, "learning_rate": 1.8605052631578948e-05, "loss": 0.2847, "mean_copy_accuracy": 0.9967093914747238, "mean_gen_accuracy": 0.8776397556066513, "mean_token_accuracy": 0.9049818217754364, "num_tokens": 340341123.0, "sample_num_tokens": 9150.25, "step": 9757, "total_num_tokens": 340377724.0, "z_loss": 0.00041258326382376254 }, { "copy_logits_max": 0.9555289149284363, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.0625, "epoch": 1.9928006127138116, "gen_logits_max": 4.069343090057373, "gen_logits_mean": -16.59576416015625, "gen_logits_min": -28.860475540161133, "gen_logits_std": 3.530496120452881, "gen_loss": 0.2615886628627777, "grad_norm": 0.3412744739458068, "learning_rate": 1.860378947368421e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9961965978145599, "mean_gen_accuracy": 0.8834197223186493, "mean_token_accuracy": 0.9094394892454147, "num_tokens": 340608745.0, "sample_num_tokens": 8864.75, "step": 9758, "total_num_tokens": 340644204.0, "z_loss": 0.00037540204357355833 }, { "copy_logits_max": -1.3569324016571045, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.75, "epoch": 1.9930048506510083, "gen_logits_max": 3.0291824340820312, "gen_logits_mean": -17.71333885192871, "gen_logits_min": -30.35846710205078, "gen_logits_std": 3.5593056678771973, "gen_loss": 0.2909274995326996, "grad_norm": 0.3664390940695585, "learning_rate": 1.8602526315789473e-05, "loss": 0.286, "mean_copy_accuracy": 0.9978559911251068, "mean_gen_accuracy": 0.8687815070152283, "mean_token_accuracy": 0.9039488881826401, "num_tokens": 340891864.0, "sample_num_tokens": 7546.0, "step": 9759, "total_num_tokens": 340922048.0, "z_loss": 0.00044031767174601555 }, { "copy_logits_max": -0.6914446353912354, "copy_logits_min": -687500032.0, "copy_num_tokens": 516.9375, "epoch": 1.9932090885882052, "gen_logits_max": 3.4771411418914795, "gen_logits_mean": -16.27082061767578, "gen_logits_min": -28.90167236328125, "gen_logits_std": 3.501939296722412, "gen_loss": 0.2703482508659363, "grad_norm": 0.3886130192605279, "learning_rate": 1.860126315789474e-05, "loss": 0.2744, "mean_copy_accuracy": 0.9966491907835007, "mean_gen_accuracy": 0.8767657279968262, "mean_token_accuracy": 0.9063381999731064, "num_tokens": 341162226.0, "sample_num_tokens": 8161.0, "step": 9760, "total_num_tokens": 341194870.0, "z_loss": 0.00041478831553831697 }, { "copy_logits_max": -3.2280187606811523, "copy_logits_min": -750000000.0, "copy_num_tokens": 485.5625, "epoch": 1.9934133265254022, "gen_logits_max": 3.467447519302368, "gen_logits_mean": -16.309894561767578, "gen_logits_min": -28.730228424072266, "gen_logits_std": 3.4970905780792236, "gen_loss": 0.24742108583450317, "grad_norm": 0.3524644349509815, "learning_rate": 1.86e-05, "loss": 0.2707, "mean_copy_accuracy": 0.9970995336771011, "mean_gen_accuracy": 0.877681240439415, "mean_token_accuracy": 0.9072080850601196, "num_tokens": 341429215.0, "sample_num_tokens": 8073.25, "step": 9761, "total_num_tokens": 341461508.0, "z_loss": 0.0003457414568401873 }, { "copy_logits_max": -2.3510355949401855, "copy_logits_min": -687500032.0, "copy_num_tokens": 535.625, "epoch": 1.9936175644625989, "gen_logits_max": 2.7276716232299805, "gen_logits_mean": -18.17100715637207, "gen_logits_min": -30.537561416625977, "gen_logits_std": 3.5759479999542236, "gen_loss": 0.2648131847381592, "grad_norm": 0.34084440331759785, "learning_rate": 1.8598736842105266e-05, "loss": 0.2443, "mean_copy_accuracy": 0.9978666305541992, "mean_gen_accuracy": 0.8873813599348068, "mean_token_accuracy": 0.9166848361492157, "num_tokens": 341714884.0, "sample_num_tokens": 8318.0, "step": 9762, "total_num_tokens": 341748156.0, "z_loss": 0.0003750439500436187 }, { "copy_logits_max": -2.140761375427246, "copy_logits_min": -750000000.0, "copy_num_tokens": 592.9375, "epoch": 1.9938218023997958, "gen_logits_max": 3.886380672454834, "gen_logits_mean": -16.208904266357422, "gen_logits_min": -28.577239990234375, "gen_logits_std": 3.5072131156921387, "gen_loss": 0.2552339434623718, "grad_norm": 0.33742625500174916, "learning_rate": 1.8597473684210527e-05, "loss": 0.2886, "mean_copy_accuracy": 0.9963053464889526, "mean_gen_accuracy": 0.8751733303070068, "mean_token_accuracy": 0.9029425233602524, "num_tokens": 341992431.0, "sample_num_tokens": 9743.25, "step": 9763, "total_num_tokens": 342031404.0, "z_loss": 0.000361693964805454 }, { "copy_logits_max": -0.38487398624420166, "copy_logits_min": -750000000.0, "copy_num_tokens": 390.4375, "epoch": 1.9940260403369927, "gen_logits_max": 3.432602882385254, "gen_logits_mean": -16.949499130249023, "gen_logits_min": -29.2066650390625, "gen_logits_std": 3.5153756141662598, "gen_loss": 0.27538537979125977, "grad_norm": 0.3392605245392766, "learning_rate": 1.859621052631579e-05, "loss": 0.2716, "mean_copy_accuracy": 0.9977823942899704, "mean_gen_accuracy": 0.8717075437307358, "mean_token_accuracy": 0.9054537266492844, "num_tokens": 342273720.0, "sample_num_tokens": 7642.0, "step": 9764, "total_num_tokens": 342304288.0, "z_loss": 0.00040147462277673185 }, { "copy_logits_max": -3.3322129249572754, "copy_logits_min": -750000000.0, "copy_num_tokens": 409.1875, "epoch": 1.9942302782741894, "gen_logits_max": 3.003474712371826, "gen_logits_mean": -18.5203914642334, "gen_logits_min": -30.83218765258789, "gen_logits_std": 3.5761353969573975, "gen_loss": 0.29918766021728516, "grad_norm": 0.3324036698159211, "learning_rate": 1.8594947368421052e-05, "loss": 0.2611, "mean_copy_accuracy": 0.9974563866853714, "mean_gen_accuracy": 0.8791649341583252, "mean_token_accuracy": 0.9130119681358337, "num_tokens": 342555589.0, "sample_num_tokens": 8648.25, "step": 9765, "total_num_tokens": 342590182.0, "z_loss": 0.00046067696530371904 }, { "copy_logits_max": -0.8956415057182312, "copy_logits_min": -625000000.0, "copy_num_tokens": 651.3125, "epoch": 1.9944345162113861, "gen_logits_max": 2.5203394889831543, "gen_logits_mean": -17.640369415283203, "gen_logits_min": -30.28295135498047, "gen_logits_std": 3.608508348464966, "gen_loss": 0.2163425087928772, "grad_norm": 0.33266995767303303, "learning_rate": 1.8593684210526316e-05, "loss": 0.2542, "mean_copy_accuracy": 0.9984332323074341, "mean_gen_accuracy": 0.8833049386739731, "mean_token_accuracy": 0.914475753903389, "num_tokens": 342848069.0, "sample_num_tokens": 9259.25, "step": 9766, "total_num_tokens": 342885106.0, "z_loss": 0.0003521306498441845 }, { "copy_logits_max": -2.339040756225586, "copy_logits_min": -750000000.0, "copy_num_tokens": 358.3125, "epoch": 1.994638754148583, "gen_logits_max": 2.6201424598693848, "gen_logits_mean": -18.14124298095703, "gen_logits_min": -30.476367950439453, "gen_logits_std": 3.5781164169311523, "gen_loss": 0.2618139386177063, "grad_norm": 0.3516100332594691, "learning_rate": 1.8592421052631577e-05, "loss": 0.2669, "mean_copy_accuracy": 0.9964590519666672, "mean_gen_accuracy": 0.8848631531000137, "mean_token_accuracy": 0.9093728810548782, "num_tokens": 343105064.0, "sample_num_tokens": 6968.0, "step": 9767, "total_num_tokens": 343132936.0, "z_loss": 0.0003407919721212238 }, { "copy_logits_max": -2.340538263320923, "copy_logits_min": -750000000.0, "copy_num_tokens": 340.125, "epoch": 1.99484299208578, "gen_logits_max": 3.7555789947509766, "gen_logits_mean": -17.375478744506836, "gen_logits_min": -30.02792739868164, "gen_logits_std": 3.553757667541504, "gen_loss": 0.23042592406272888, "grad_norm": 0.34699313852088826, "learning_rate": 1.8591157894736845e-05, "loss": 0.2714, "mean_copy_accuracy": 0.9970168173313141, "mean_gen_accuracy": 0.881460964679718, "mean_token_accuracy": 0.908039778470993, "num_tokens": 343371901.0, "sample_num_tokens": 7918.75, "step": 9768, "total_num_tokens": 343403576.0, "z_loss": 0.00030999176669865847 }, { "copy_logits_max": -3.4079935550689697, "copy_logits_min": -750000000.0, "copy_num_tokens": 238.75, "epoch": 1.9950472300229767, "gen_logits_max": 4.325803756713867, "gen_logits_mean": -18.19491958618164, "gen_logits_min": -30.786285400390625, "gen_logits_std": 3.543264627456665, "gen_loss": 0.33156922459602356, "grad_norm": 0.3775367700808855, "learning_rate": 1.8589894736842106e-05, "loss": 0.2821, "mean_copy_accuracy": 0.9967577308416367, "mean_gen_accuracy": 0.8753501325845718, "mean_token_accuracy": 0.9061461091041565, "num_tokens": 343639723.0, "sample_num_tokens": 7341.25, "step": 9769, "total_num_tokens": 343669088.0, "z_loss": 0.0004582410037983209 }, { "copy_logits_max": -1.214580774307251, "copy_logits_min": -687500032.0, "copy_num_tokens": 660.0, "epoch": 1.9952514679601736, "gen_logits_max": 2.178802967071533, "gen_logits_mean": -17.97097396850586, "gen_logits_min": -30.50996971130371, "gen_logits_std": 3.60099196434021, "gen_loss": 0.2181909680366516, "grad_norm": 0.33783800664018093, "learning_rate": 1.858863157894737e-05, "loss": 0.2494, "mean_copy_accuracy": 0.9975596219301224, "mean_gen_accuracy": 0.8824017494916916, "mean_token_accuracy": 0.9129675477743149, "num_tokens": 343910126.0, "sample_num_tokens": 10013.0, "step": 9770, "total_num_tokens": 343950178.0, "z_loss": 0.00031565630342811346 }, { "copy_logits_max": -0.5019772052764893, "copy_logits_min": -750000000.0, "copy_num_tokens": 648.75, "epoch": 1.9954557058973705, "gen_logits_max": 4.110202789306641, "gen_logits_mean": -15.485757827758789, "gen_logits_min": -29.640520095825195, "gen_logits_std": 3.4871702194213867, "gen_loss": 0.20351086556911469, "grad_norm": 0.3739099925828874, "learning_rate": 1.858736842105263e-05, "loss": 0.2593, "mean_copy_accuracy": 0.9969482719898224, "mean_gen_accuracy": 0.8846760541200638, "mean_token_accuracy": 0.9107549786567688, "num_tokens": 344157181.0, "sample_num_tokens": 8822.25, "step": 9771, "total_num_tokens": 344192470.0, "z_loss": 0.0003388700424693525 }, { "copy_logits_max": -1.0856660604476929, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.25, "epoch": 1.9956599438345672, "gen_logits_max": 2.604860305786133, "gen_logits_mean": -17.487586975097656, "gen_logits_min": -29.771682739257812, "gen_logits_std": 3.566523551940918, "gen_loss": 0.24531465768814087, "grad_norm": 0.33422989133888586, "learning_rate": 1.8586105263157895e-05, "loss": 0.2483, "mean_copy_accuracy": 0.9971269369125366, "mean_gen_accuracy": 0.8863344639539719, "mean_token_accuracy": 0.9152034670114517, "num_tokens": 344418582.0, "sample_num_tokens": 8741.5, "step": 9772, "total_num_tokens": 344453548.0, "z_loss": 0.00037781905848532915 }, { "copy_logits_max": -0.7722866535186768, "copy_logits_min": -750000000.0, "copy_num_tokens": 641.625, "epoch": 1.995864181771764, "gen_logits_max": 4.0711469650268555, "gen_logits_mean": -17.059246063232422, "gen_logits_min": -31.413497924804688, "gen_logits_std": 3.5875120162963867, "gen_loss": 0.2295149862766266, "grad_norm": 0.3552463535127465, "learning_rate": 1.8584842105263156e-05, "loss": 0.2595, "mean_copy_accuracy": 0.9979516714811325, "mean_gen_accuracy": 0.8834921270608902, "mean_token_accuracy": 0.911091759800911, "num_tokens": 344691571.0, "sample_num_tokens": 9874.25, "step": 9773, "total_num_tokens": 344731068.0, "z_loss": 0.0003266200073994696 }, { "copy_logits_max": -3.4404797554016113, "copy_logits_min": -750000000.0, "copy_num_tokens": 479.875, "epoch": 1.996068419708961, "gen_logits_max": 2.5846123695373535, "gen_logits_mean": -18.647228240966797, "gen_logits_min": -31.2847900390625, "gen_logits_std": 3.6088881492614746, "gen_loss": 0.24884013831615448, "grad_norm": 0.36320760952377207, "learning_rate": 1.858357894736842e-05, "loss": 0.2606, "mean_copy_accuracy": 0.9971666783094406, "mean_gen_accuracy": 0.8814681768417358, "mean_token_accuracy": 0.9101680815219879, "num_tokens": 344965309.0, "sample_num_tokens": 8661.25, "step": 9774, "total_num_tokens": 344999954.0, "z_loss": 0.00034221040550619364 }, { "copy_logits_max": -3.306847095489502, "copy_logits_min": -750000000.0, "copy_num_tokens": 426.5, "epoch": 1.9962726576461578, "gen_logits_max": 3.5347707271575928, "gen_logits_mean": -18.30248260498047, "gen_logits_min": -31.717620849609375, "gen_logits_std": 3.600822925567627, "gen_loss": 0.2750396728515625, "grad_norm": 0.3392566813727499, "learning_rate": 1.8582315789473685e-05, "loss": 0.2694, "mean_copy_accuracy": 0.9973882138729095, "mean_gen_accuracy": 0.8801310062408447, "mean_token_accuracy": 0.9092322587966919, "num_tokens": 345247213.0, "sample_num_tokens": 7724.25, "step": 9775, "total_num_tokens": 345278110.0, "z_loss": 0.00043602794175967574 }, { "copy_logits_max": -4.799171447753906, "copy_logits_min": -750000000.0, "copy_num_tokens": 320.375, "epoch": 1.9964768955833545, "gen_logits_max": 3.912609815597534, "gen_logits_mean": -18.55404281616211, "gen_logits_min": -31.345993041992188, "gen_logits_std": 3.5764315128326416, "gen_loss": 0.30960923433303833, "grad_norm": 0.3676192935398698, "learning_rate": 1.8581052631578946e-05, "loss": 0.2996, "mean_copy_accuracy": 0.9961382001638412, "mean_gen_accuracy": 0.8702838718891144, "mean_token_accuracy": 0.9001001864671707, "num_tokens": 345523750.0, "sample_num_tokens": 7533.5, "step": 9776, "total_num_tokens": 345553884.0, "z_loss": 0.0004455155576579273 }, { "copy_logits_max": -4.83919620513916, "copy_logits_min": -750000000.0, "copy_num_tokens": 350.9375, "epoch": 1.9966811335205514, "gen_logits_max": 4.477438449859619, "gen_logits_mean": -17.50713348388672, "gen_logits_min": -30.690162658691406, "gen_logits_std": 3.56298828125, "gen_loss": 0.2529703378677368, "grad_norm": 0.3408251108253686, "learning_rate": 1.8579789473684214e-05, "loss": 0.2539, "mean_copy_accuracy": 0.9977605044841766, "mean_gen_accuracy": 0.8855887651443481, "mean_token_accuracy": 0.9130191802978516, "num_tokens": 345798732.0, "sample_num_tokens": 7774.0, "step": 9777, "total_num_tokens": 345829828.0, "z_loss": 0.0003833176742773503 }, { "copy_logits_max": -3.7211551666259766, "copy_logits_min": -750000000.0, "copy_num_tokens": 580.4375, "epoch": 1.9968853714577484, "gen_logits_max": 2.9941227436065674, "gen_logits_mean": -17.80955696105957, "gen_logits_min": -30.49065399169922, "gen_logits_std": 3.5681357383728027, "gen_loss": 0.26531240344047546, "grad_norm": 0.32543157317314964, "learning_rate": 1.8578526315789474e-05, "loss": 0.2575, "mean_copy_accuracy": 0.997640922665596, "mean_gen_accuracy": 0.882579505443573, "mean_token_accuracy": 0.912619337439537, "num_tokens": 346076217.0, "sample_num_tokens": 8499.25, "step": 9778, "total_num_tokens": 346110214.0, "z_loss": 0.0003709426673594862 }, { "copy_logits_max": -3.8237216472625732, "copy_logits_min": -687500032.0, "copy_num_tokens": 607.8125, "epoch": 1.997089609394945, "gen_logits_max": 2.8266310691833496, "gen_logits_mean": -16.859426498413086, "gen_logits_min": -30.158525466918945, "gen_logits_std": 3.541163682937622, "gen_loss": 0.2388167530298233, "grad_norm": 0.3427987687765236, "learning_rate": 1.857726315789474e-05, "loss": 0.2599, "mean_copy_accuracy": 0.9978401362895966, "mean_gen_accuracy": 0.8806586712598801, "mean_token_accuracy": 0.9110058397054672, "num_tokens": 346354320.0, "sample_num_tokens": 8573.0, "step": 9779, "total_num_tokens": 346388612.0, "z_loss": 0.0004004164657089859 }, { "copy_logits_max": -7.116147041320801, "copy_logits_min": -750000000.0, "copy_num_tokens": 332.375, "epoch": 1.997293847332142, "gen_logits_max": 4.250545501708984, "gen_logits_mean": -16.64108657836914, "gen_logits_min": -30.046131134033203, "gen_logits_std": 3.4935481548309326, "gen_loss": 0.2704382538795471, "grad_norm": 0.35799551045056494, "learning_rate": 1.8576e-05, "loss": 0.2816, "mean_copy_accuracy": 0.9968538582324982, "mean_gen_accuracy": 0.8759170770645142, "mean_token_accuracy": 0.9047571420669556, "num_tokens": 346618164.0, "sample_num_tokens": 7548.0, "step": 9780, "total_num_tokens": 346648356.0, "z_loss": 0.0003595552989281714 }, { "copy_logits_max": -4.559593677520752, "copy_logits_min": -750000000.0, "copy_num_tokens": 537.625, "epoch": 1.997498085269339, "gen_logits_max": 3.267265796661377, "gen_logits_mean": -16.815196990966797, "gen_logits_min": -29.32488250732422, "gen_logits_std": 3.464735507965088, "gen_loss": 0.31141430139541626, "grad_norm": 0.33407032338120235, "learning_rate": 1.8574736842105264e-05, "loss": 0.2883, "mean_copy_accuracy": 0.9979429095983505, "mean_gen_accuracy": 0.8695071339607239, "mean_token_accuracy": 0.9025500416755676, "num_tokens": 346897837.0, "sample_num_tokens": 9178.25, "step": 9781, "total_num_tokens": 346934550.0, "z_loss": 0.00047472657752223313 }, { "copy_logits_max": -5.431333541870117, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.625, "epoch": 1.9977023232065356, "gen_logits_max": 3.1915810108184814, "gen_logits_mean": -17.206022262573242, "gen_logits_min": -29.86968421936035, "gen_logits_std": 3.4889814853668213, "gen_loss": 0.27622509002685547, "grad_norm": 0.3618281644012438, "learning_rate": 1.8573473684210525e-05, "loss": 0.28, "mean_copy_accuracy": 0.997277244925499, "mean_gen_accuracy": 0.8741570562124252, "mean_token_accuracy": 0.9041130840778351, "num_tokens": 347158279.0, "sample_num_tokens": 7363.75, "step": 9782, "total_num_tokens": 347187734.0, "z_loss": 0.00040225297561846673 }, { "copy_logits_max": -3.3867592811584473, "copy_logits_min": -687500032.0, "copy_num_tokens": 628.875, "epoch": 1.9979065611437323, "gen_logits_max": 3.2133629322052, "gen_logits_mean": -15.88558292388916, "gen_logits_min": -29.891124725341797, "gen_logits_std": 3.4209799766540527, "gen_loss": 0.24570026993751526, "grad_norm": 0.3407969162167984, "learning_rate": 1.857221052631579e-05, "loss": 0.2755, "mean_copy_accuracy": 0.9980048835277557, "mean_gen_accuracy": 0.8772457987070084, "mean_token_accuracy": 0.9057001173496246, "num_tokens": 347424104.0, "sample_num_tokens": 9681.0, "step": 9783, "total_num_tokens": 347462828.0, "z_loss": 0.00033607479417696595 }, { "copy_logits_max": -5.00612735748291, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.625, "epoch": 1.9981107990809293, "gen_logits_max": 3.8793954849243164, "gen_logits_mean": -16.2457275390625, "gen_logits_min": -28.071399688720703, "gen_logits_std": 3.4171342849731445, "gen_loss": 0.23657354712486267, "grad_norm": 0.32263426827290426, "learning_rate": 1.857094736842105e-05, "loss": 0.2638, "mean_copy_accuracy": 0.9977752268314362, "mean_gen_accuracy": 0.8839239776134491, "mean_token_accuracy": 0.9104221761226654, "num_tokens": 347713337.0, "sample_num_tokens": 9547.25, "step": 9784, "total_num_tokens": 347751526.0, "z_loss": 0.0003605308593250811 }, { "copy_logits_max": -3.5070960521698, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.75, "epoch": 1.9983150370181262, "gen_logits_max": 3.928497552871704, "gen_logits_mean": -16.488187789916992, "gen_logits_min": -29.539413452148438, "gen_logits_std": 3.4596123695373535, "gen_loss": 0.2701517343521118, "grad_norm": 0.3650788956413813, "learning_rate": 1.8569684210526318e-05, "loss": 0.282, "mean_copy_accuracy": 0.9970536679029465, "mean_gen_accuracy": 0.8766836524009705, "mean_token_accuracy": 0.9031137973070145, "num_tokens": 347972783.0, "sample_num_tokens": 8167.25, "step": 9785, "total_num_tokens": 348005452.0, "z_loss": 0.00042059726547449827 }, { "copy_logits_max": -4.586576461791992, "copy_logits_min": -750000128.0, "copy_num_tokens": 532.125, "epoch": 1.9985192749553229, "gen_logits_max": 3.695441484451294, "gen_logits_mean": -15.657539367675781, "gen_logits_min": -28.672054290771484, "gen_logits_std": 3.3865323066711426, "gen_loss": 0.24659767746925354, "grad_norm": 0.3437228485650938, "learning_rate": 1.856842105263158e-05, "loss": 0.2796, "mean_copy_accuracy": 0.9981590509414673, "mean_gen_accuracy": 0.8741312325000763, "mean_token_accuracy": 0.9056224077939987, "num_tokens": 348251473.0, "sample_num_tokens": 8553.75, "step": 9786, "total_num_tokens": 348285688.0, "z_loss": 0.0003840663703158498 }, { "copy_logits_max": -3.8318824768066406, "copy_logits_min": -750000000.0, "copy_num_tokens": 364.1875, "epoch": 1.9987235128925198, "gen_logits_max": 3.837745189666748, "gen_logits_mean": -17.671222686767578, "gen_logits_min": -30.455455780029297, "gen_logits_std": 3.495276927947998, "gen_loss": 0.30692753195762634, "grad_norm": 0.3494402875114593, "learning_rate": 1.8567157894736843e-05, "loss": 0.2921, "mean_copy_accuracy": 0.9973475784063339, "mean_gen_accuracy": 0.8724112063646317, "mean_token_accuracy": 0.9008609056472778, "num_tokens": 348509046.0, "sample_num_tokens": 8115.0, "step": 9787, "total_num_tokens": 348541506.0, "z_loss": 0.00042784929974004626 }, { "copy_logits_max": -3.793213367462158, "copy_logits_min": -687500032.0, "copy_num_tokens": 436.9375, "epoch": 1.9989277508297167, "gen_logits_max": 3.1592214107513428, "gen_logits_mean": -16.829788208007812, "gen_logits_min": -29.2799015045166, "gen_logits_std": 3.4187822341918945, "gen_loss": 0.29593905806541443, "grad_norm": 0.3334389845135467, "learning_rate": 1.8565894736842107e-05, "loss": 0.2632, "mean_copy_accuracy": 0.9978685826063156, "mean_gen_accuracy": 0.8802785873413086, "mean_token_accuracy": 0.9114318937063217, "num_tokens": 348802700.0, "sample_num_tokens": 8074.5, "step": 9788, "total_num_tokens": 348834998.0, "z_loss": 0.0004731961525976658 }, { "copy_logits_max": -7.512463569641113, "copy_logits_min": -687500032.0, "copy_num_tokens": 386.5625, "epoch": 1.9991319887669134, "gen_logits_max": 3.619666576385498, "gen_logits_mean": -18.347484588623047, "gen_logits_min": -30.57999038696289, "gen_logits_std": 3.4729573726654053, "gen_loss": 0.32634028792381287, "grad_norm": 0.36135084222882524, "learning_rate": 1.856463157894737e-05, "loss": 0.2823, "mean_copy_accuracy": 0.9974492639303207, "mean_gen_accuracy": 0.8782711625099182, "mean_token_accuracy": 0.9054424166679382, "num_tokens": 349066044.0, "sample_num_tokens": 8919.5, "step": 9789, "total_num_tokens": 349101722.0, "z_loss": 0.0005033565685153008 }, { "copy_logits_max": -5.341861724853516, "copy_logits_min": -687500032.0, "copy_num_tokens": 564.875, "epoch": 1.9993362267041102, "gen_logits_max": 1.919025182723999, "gen_logits_mean": -19.53227996826172, "gen_logits_min": -31.712560653686523, "gen_logits_std": 3.5682661533355713, "gen_loss": 0.27405381202697754, "grad_norm": 0.3634780029979705, "learning_rate": 1.8563368421052633e-05, "loss": 0.2901, "mean_copy_accuracy": 0.9968935996294022, "mean_gen_accuracy": 0.8720166832208633, "mean_token_accuracy": 0.9013007134199142, "num_tokens": 349337395.0, "sample_num_tokens": 9854.75, "step": 9790, "total_num_tokens": 349376814.0, "z_loss": 0.0004678880504798144 }, { "copy_logits_max": -5.754299163818359, "copy_logits_min": -750000000.0, "copy_num_tokens": 465.25, "epoch": 1.999540464641307, "gen_logits_max": 3.0652079582214355, "gen_logits_mean": -17.708629608154297, "gen_logits_min": -30.097042083740234, "gen_logits_std": 3.4866700172424316, "gen_loss": 0.25498104095458984, "grad_norm": 0.32298037323989803, "learning_rate": 1.8562105263157894e-05, "loss": 0.2705, "mean_copy_accuracy": 0.9965004622936249, "mean_gen_accuracy": 0.8813446015119553, "mean_token_accuracy": 0.9072006940841675, "num_tokens": 349609855.0, "sample_num_tokens": 8632.75, "step": 9791, "total_num_tokens": 349644386.0, "z_loss": 0.0004146714636590332 }, { "copy_logits_max": -5.268195629119873, "copy_logits_min": -750000000.0, "copy_num_tokens": 428.25, "epoch": 1.999744702578504, "gen_logits_max": 3.343752384185791, "gen_logits_mean": -17.272159576416016, "gen_logits_min": -29.85403823852539, "gen_logits_std": 3.4478049278259277, "gen_loss": 0.2691826820373535, "grad_norm": 0.35998842567375294, "learning_rate": 1.8560842105263158e-05, "loss": 0.2801, "mean_copy_accuracy": 0.9973633587360382, "mean_gen_accuracy": 0.8771536946296692, "mean_token_accuracy": 0.9041450619697571, "num_tokens": 349873011.0, "sample_num_tokens": 7911.25, "step": 9792, "total_num_tokens": 349904656.0, "z_loss": 0.0004075444885529578 }, { "copy_logits_max": -7.560436248779297, "copy_logits_min": -750000128.0, "copy_num_tokens": 275.4375, "epoch": 1.9999489405157007, "gen_logits_max": 4.01022481918335, "gen_logits_mean": -17.213560104370117, "gen_logits_min": -29.54190444946289, "gen_logits_std": 3.434825897216797, "gen_loss": 0.2630859613418579, "grad_norm": 0.366255259605028, "learning_rate": 1.8559578947368422e-05, "loss": 0.2804, "mean_copy_accuracy": 0.9965675324201584, "mean_gen_accuracy": 0.879756510257721, "mean_token_accuracy": 0.9039159119129181, "num_tokens": 350118453.0, "sample_num_tokens": 7423.75, "step": 9793, "total_num_tokens": 350148148.0, "z_loss": 0.00042295223101973534 }, { "copy_logits_max": -5.417348861694336, "copy_logits_min": -750000000.0, "copy_num_tokens": 397.125, "epoch": 2.0002042379371967, "gen_logits_max": 3.1879758834838867, "gen_logits_mean": -18.487672805786133, "gen_logits_min": -30.409847259521484, "gen_logits_std": 3.5410428047180176, "gen_loss": 0.23163197934627533, "grad_norm": 0.3390275632166226, "learning_rate": 1.8558315789473686e-05, "loss": 0.2806, "mean_copy_accuracy": 0.9978417158126831, "mean_gen_accuracy": 0.8947176694869995, "mean_token_accuracy": 0.921437430381775, "num_tokens": 350472771.0, "sample_num_tokens": 7911.25, "step": 9794, "total_num_tokens": 350504416.0, "z_loss": 0.00041221349965780973 }, { "copy_logits_max": -4.289750099182129, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.5, "epoch": 2.000408475874394, "gen_logits_max": 2.5645952224731445, "gen_logits_mean": -18.591060638427734, "gen_logits_min": -30.982524871826172, "gen_logits_std": 3.55641508102417, "gen_loss": 0.22424277663230896, "grad_norm": 0.37642196092362484, "learning_rate": 1.8557052631578947e-05, "loss": 0.2428, "mean_copy_accuracy": 0.9980156719684601, "mean_gen_accuracy": 0.8923608213663101, "mean_token_accuracy": 0.9157924205064774, "num_tokens": 350754791.0, "sample_num_tokens": 8187.25, "step": 9795, "total_num_tokens": 350787540.0, "z_loss": 0.0004358894075267017 }, { "copy_logits_max": -6.269612789154053, "copy_logits_min": -750000000.0, "copy_num_tokens": 406.9375, "epoch": 2.0006127138115906, "gen_logits_max": 2.8664212226867676, "gen_logits_mean": -18.908172607421875, "gen_logits_min": -31.74867820739746, "gen_logits_std": 3.583862781524658, "gen_loss": 0.22502630949020386, "grad_norm": 0.3836031564119072, "learning_rate": 1.8555789473684212e-05, "loss": 0.2336, "mean_copy_accuracy": 0.9962610453367233, "mean_gen_accuracy": 0.8927744328975677, "mean_token_accuracy": 0.919241264462471, "num_tokens": 351015698.0, "sample_num_tokens": 7781.5, "step": 9796, "total_num_tokens": 351046824.0, "z_loss": 0.0004224802542012185 }, { "copy_logits_max": -2.5938172340393066, "copy_logits_min": -750000064.0, "copy_num_tokens": 570.5625, "epoch": 2.0008169517487873, "gen_logits_max": 3.1612329483032227, "gen_logits_mean": -17.72644805908203, "gen_logits_min": -30.30562400817871, "gen_logits_std": 3.528003215789795, "gen_loss": 0.21466538310050964, "grad_norm": 0.3573090857439335, "learning_rate": 1.8554526315789473e-05, "loss": 0.2349, "mean_copy_accuracy": 0.9972152262926102, "mean_gen_accuracy": 0.8900784403085709, "mean_token_accuracy": 0.9193966686725616, "num_tokens": 351295530.0, "sample_num_tokens": 8856.0, "step": 9797, "total_num_tokens": 351330954.0, "z_loss": 0.0004912017611786723 }, { "copy_logits_max": -3.3856308460235596, "copy_logits_min": -750000000.0, "copy_num_tokens": 447.3125, "epoch": 2.001021189685984, "gen_logits_max": 3.362452268600464, "gen_logits_mean": -18.56327247619629, "gen_logits_min": -30.82037925720215, "gen_logits_std": 3.5698490142822266, "gen_loss": 0.24636030197143555, "grad_norm": 0.35980264390676087, "learning_rate": 1.8553263157894737e-05, "loss": 0.227, "mean_copy_accuracy": 0.9972626715898514, "mean_gen_accuracy": 0.8946605324745178, "mean_token_accuracy": 0.9196734726428986, "num_tokens": 351564980.0, "sample_num_tokens": 8084.0, "step": 9798, "total_num_tokens": 351597316.0, "z_loss": 0.0004506106488406658 }, { "copy_logits_max": -3.92533540725708, "copy_logits_min": -687500032.0, "copy_num_tokens": 399.75, "epoch": 2.001225427623181, "gen_logits_max": 3.6327922344207764, "gen_logits_mean": -18.67223358154297, "gen_logits_min": -30.8182315826416, "gen_logits_std": 3.571077823638916, "gen_loss": 0.27171432971954346, "grad_norm": 0.37137826219055303, "learning_rate": 1.8551999999999998e-05, "loss": 0.2251, "mean_copy_accuracy": 0.9977156817913055, "mean_gen_accuracy": 0.8948484361171722, "mean_token_accuracy": 0.9223020076751709, "num_tokens": 351846368.0, "sample_num_tokens": 7542.5, "step": 9799, "total_num_tokens": 351876538.0, "z_loss": 0.0005297265015542507 }, { "copy_logits_max": -4.930535793304443, "copy_logits_min": -750000000.0, "copy_num_tokens": 363.6875, "epoch": 2.001429665560378, "gen_logits_max": 3.982851505279541, "gen_logits_mean": -18.746849060058594, "gen_logits_min": -30.840648651123047, "gen_logits_std": 3.5862560272216797, "gen_loss": 0.24517899751663208, "grad_norm": 0.40084474750573873, "learning_rate": 1.8550736842105262e-05, "loss": 0.2199, "mean_copy_accuracy": 0.9974091649055481, "mean_gen_accuracy": 0.8952521681785583, "mean_token_accuracy": 0.9221112132072449, "num_tokens": 352110776.0, "sample_num_tokens": 8203.0, "step": 9800, "total_num_tokens": 352143588.0, "z_loss": 0.0004491051658987999 }, { "copy_logits_max": -6.097038269042969, "copy_logits_min": -687500032.0, "copy_num_tokens": 326.75, "epoch": 2.0016339034975745, "gen_logits_max": 4.35174036026001, "gen_logits_mean": -18.21847152709961, "gen_logits_min": -30.435832977294922, "gen_logits_std": 3.575237989425659, "gen_loss": 0.2517153024673462, "grad_norm": 0.383674169259957, "learning_rate": 1.854947368421053e-05, "loss": 0.2333, "mean_copy_accuracy": 0.9966669827699661, "mean_gen_accuracy": 0.8943801671266556, "mean_token_accuracy": 0.9183067828416824, "num_tokens": 352375781.0, "sample_num_tokens": 8529.75, "step": 9801, "total_num_tokens": 352409900.0, "z_loss": 0.0004953783354721963 }, { "copy_logits_max": -2.4677088260650635, "copy_logits_min": -687500032.0, "copy_num_tokens": 766.25, "epoch": 2.0018381414347717, "gen_logits_max": 2.217764377593994, "gen_logits_mean": -18.095476150512695, "gen_logits_min": -30.963890075683594, "gen_logits_std": 3.656986951828003, "gen_loss": 0.18860796093940735, "grad_norm": 0.34445721527442413, "learning_rate": 1.854821052631579e-05, "loss": 0.2067, "mean_copy_accuracy": 0.997255727648735, "mean_gen_accuracy": 0.8977513611316681, "mean_token_accuracy": 0.9274116605520248, "num_tokens": 352668695.0, "sample_num_tokens": 9390.25, "step": 9802, "total_num_tokens": 352706256.0, "z_loss": 0.00035719788866117597 }, { "copy_logits_max": -5.287834644317627, "copy_logits_min": -750000000.0, "copy_num_tokens": 316.375, "epoch": 2.0020423793719684, "gen_logits_max": 4.008988380432129, "gen_logits_mean": -17.388566970825195, "gen_logits_min": -29.800479888916016, "gen_logits_std": 3.547022819519043, "gen_loss": 0.2676984965801239, "grad_norm": 0.40092370262195565, "learning_rate": 1.8546947368421055e-05, "loss": 0.2503, "mean_copy_accuracy": 0.9964449256658554, "mean_gen_accuracy": 0.8891857862472534, "mean_token_accuracy": 0.9131641238927841, "num_tokens": 352916098.0, "sample_num_tokens": 7380.5, "step": 9803, "total_num_tokens": 352945620.0, "z_loss": 0.000440035859355703 }, { "copy_logits_max": -4.057468891143799, "copy_logits_min": -687500032.0, "copy_num_tokens": 395.25, "epoch": 2.002246617309165, "gen_logits_max": 2.734179973602295, "gen_logits_mean": -18.3289852142334, "gen_logits_min": -30.47357177734375, "gen_logits_std": 3.6097965240478516, "gen_loss": 0.2505756616592407, "grad_norm": 0.38912101882219907, "learning_rate": 1.8545684210526316e-05, "loss": 0.2337, "mean_copy_accuracy": 0.997230052947998, "mean_gen_accuracy": 0.890957236289978, "mean_token_accuracy": 0.9185760617256165, "num_tokens": 353159556.0, "sample_num_tokens": 7942.0, "step": 9804, "total_num_tokens": 353191324.0, "z_loss": 0.00039172667311504483 }, { "copy_logits_max": -4.697262763977051, "copy_logits_min": -750000000.0, "copy_num_tokens": 524.4375, "epoch": 2.002450855246362, "gen_logits_max": 1.8871355056762695, "gen_logits_mean": -19.06717872619629, "gen_logits_min": -31.577430725097656, "gen_logits_std": 3.6769485473632812, "gen_loss": 0.212815061211586, "grad_norm": 0.39227954827189204, "learning_rate": 1.854442105263158e-05, "loss": 0.2223, "mean_copy_accuracy": 0.9969549775123596, "mean_gen_accuracy": 0.8972955495119095, "mean_token_accuracy": 0.9213048070669174, "num_tokens": 353432173.0, "sample_num_tokens": 8274.25, "step": 9805, "total_num_tokens": 353465270.0, "z_loss": 0.00034119084011763334 }, { "copy_logits_max": -5.179055690765381, "copy_logits_min": -750000064.0, "copy_num_tokens": 389.75, "epoch": 2.002655093183559, "gen_logits_max": 3.2117977142333984, "gen_logits_mean": -18.443941116333008, "gen_logits_min": -30.691911697387695, "gen_logits_std": 3.6061971187591553, "gen_loss": 0.23060987889766693, "grad_norm": 0.3693374226389237, "learning_rate": 1.854315789473684e-05, "loss": 0.2324, "mean_copy_accuracy": 0.9975550025701523, "mean_gen_accuracy": 0.8941827118396759, "mean_token_accuracy": 0.91946080327034, "num_tokens": 353711015.0, "sample_num_tokens": 9205.75, "step": 9806, "total_num_tokens": 353747838.0, "z_loss": 0.00040268764132633805 }, { "copy_logits_max": -5.809444427490234, "copy_logits_min": -750000000.0, "copy_num_tokens": 649.3125, "epoch": 2.0028593311207556, "gen_logits_max": 2.7405924797058105, "gen_logits_mean": -17.943010330200195, "gen_logits_min": -30.812576293945312, "gen_logits_std": 3.638129711151123, "gen_loss": 0.21805007755756378, "grad_norm": 0.3565330710474908, "learning_rate": 1.8541894736842106e-05, "loss": 0.2176, "mean_copy_accuracy": 0.997841477394104, "mean_gen_accuracy": 0.8969888985157013, "mean_token_accuracy": 0.9259204864501953, "num_tokens": 354003921.0, "sample_num_tokens": 10329.25, "step": 9807, "total_num_tokens": 354045238.0, "z_loss": 0.00036965374602004886 }, { "copy_logits_max": -3.7027201652526855, "copy_logits_min": -750000000.0, "copy_num_tokens": 310.5, "epoch": 2.0030635690579524, "gen_logits_max": 3.6237125396728516, "gen_logits_mean": -16.91736602783203, "gen_logits_min": -29.514095306396484, "gen_logits_std": 3.5561931133270264, "gen_loss": 0.19519880414009094, "grad_norm": 0.3495025860298189, "learning_rate": 1.8540631578947367e-05, "loss": 0.2222, "mean_copy_accuracy": 0.9974667876958847, "mean_gen_accuracy": 0.8982967138290405, "mean_token_accuracy": 0.9221685379743576, "num_tokens": 354269335.0, "sample_num_tokens": 7015.75, "step": 9808, "total_num_tokens": 354297398.0, "z_loss": 0.00035326858051121235 }, { "copy_logits_max": -6.099678993225098, "copy_logits_min": -750000000.0, "copy_num_tokens": 313.625, "epoch": 2.0032678069951495, "gen_logits_max": 3.3686630725860596, "gen_logits_mean": -20.159225463867188, "gen_logits_min": -32.36351776123047, "gen_logits_std": 3.6768550872802734, "gen_loss": 0.2669495642185211, "grad_norm": 0.3394267251064564, "learning_rate": 1.8539368421052634e-05, "loss": 0.2334, "mean_copy_accuracy": 0.9974257051944733, "mean_gen_accuracy": 0.8922515362501144, "mean_token_accuracy": 0.9165777713060379, "num_tokens": 354531788.0, "sample_num_tokens": 8214.5, "step": 9809, "total_num_tokens": 354564646.0, "z_loss": 0.0005329701816663146 }, { "copy_logits_max": -3.2077605724334717, "copy_logits_min": -750000000.0, "copy_num_tokens": 517.0625, "epoch": 2.003472044932346, "gen_logits_max": 3.629335641860962, "gen_logits_mean": -17.793636322021484, "gen_logits_min": -30.647172927856445, "gen_logits_std": 3.6142375469207764, "gen_loss": 0.22411730885505676, "grad_norm": 0.37900491881891174, "learning_rate": 1.8538105263157895e-05, "loss": 0.2233, "mean_copy_accuracy": 0.9967869967222214, "mean_gen_accuracy": 0.8973422348499298, "mean_token_accuracy": 0.9222303628921509, "num_tokens": 354794925.0, "sample_num_tokens": 8656.75, "step": 9810, "total_num_tokens": 354829552.0, "z_loss": 0.0005807233392260969 }, { "copy_logits_max": -1.471153736114502, "copy_logits_min": -750000064.0, "copy_num_tokens": 688.0625, "epoch": 2.003676282869543, "gen_logits_max": 3.6195926666259766, "gen_logits_mean": -15.560731887817383, "gen_logits_min": -28.77954864501953, "gen_logits_std": 3.514565944671631, "gen_loss": 0.20393148064613342, "grad_norm": 0.34410837209703415, "learning_rate": 1.853684210526316e-05, "loss": 0.2175, "mean_copy_accuracy": 0.9978712052106857, "mean_gen_accuracy": 0.8912513852119446, "mean_token_accuracy": 0.9240066409111023, "num_tokens": 355099977.0, "sample_num_tokens": 8953.75, "step": 9811, "total_num_tokens": 355135792.0, "z_loss": 0.0004332246899139136 }, { "copy_logits_max": -5.947270393371582, "copy_logits_min": -687500032.0, "copy_num_tokens": 398.125, "epoch": 2.00388052080674, "gen_logits_max": 3.8134169578552246, "gen_logits_mean": -16.790691375732422, "gen_logits_min": -29.435590744018555, "gen_logits_std": 3.542076826095581, "gen_loss": 0.22967536747455597, "grad_norm": 0.3260094930688677, "learning_rate": 1.853557894736842e-05, "loss": 0.2215, "mean_copy_accuracy": 0.9972604811191559, "mean_gen_accuracy": 0.8997227102518082, "mean_token_accuracy": 0.9216375201940536, "num_tokens": 355366446.0, "sample_num_tokens": 8136.0, "step": 9812, "total_num_tokens": 355398990.0, "z_loss": 0.0004650692571885884 }, { "copy_logits_max": -5.050163269042969, "copy_logits_min": -750000064.0, "copy_num_tokens": 526.8125, "epoch": 2.0040847587439368, "gen_logits_max": 2.7723171710968018, "gen_logits_mean": -18.76691436767578, "gen_logits_min": -31.102581024169922, "gen_logits_std": 3.6720023155212402, "gen_loss": 0.21772465109825134, "grad_norm": 0.37796452578186696, "learning_rate": 1.8534315789473685e-05, "loss": 0.2311, "mean_copy_accuracy": 0.9975766688585281, "mean_gen_accuracy": 0.8921681493520737, "mean_token_accuracy": 0.9183139652013779, "num_tokens": 355639022.0, "sample_num_tokens": 8796.0, "step": 9813, "total_num_tokens": 355674206.0, "z_loss": 0.0004021428758278489 }, { "copy_logits_max": -3.750962018966675, "copy_logits_min": -750000000.0, "copy_num_tokens": 446.125, "epoch": 2.0042889966811335, "gen_logits_max": 3.983485698699951, "gen_logits_mean": -17.389528274536133, "gen_logits_min": -29.86771011352539, "gen_logits_std": 3.5408661365509033, "gen_loss": 0.22608304023742676, "grad_norm": 0.33855772859054706, "learning_rate": 1.853305263157895e-05, "loss": 0.2266, "mean_copy_accuracy": 0.9976627975702286, "mean_gen_accuracy": 0.8946338444948196, "mean_token_accuracy": 0.9209303557872772, "num_tokens": 355928223.0, "sample_num_tokens": 8604.75, "step": 9814, "total_num_tokens": 355962642.0, "z_loss": 0.00045216912985779345 }, { "copy_logits_max": -5.372345447540283, "copy_logits_min": -750000000.0, "copy_num_tokens": 392.1875, "epoch": 2.00449323461833, "gen_logits_max": 2.966066360473633, "gen_logits_mean": -17.905109405517578, "gen_logits_min": -30.607555389404297, "gen_logits_std": 3.61409854888916, "gen_loss": 0.1864987164735794, "grad_norm": 0.3769444268067411, "learning_rate": 1.853178947368421e-05, "loss": 0.2213, "mean_copy_accuracy": 0.9973163157701492, "mean_gen_accuracy": 0.9010512977838516, "mean_token_accuracy": 0.9226052761077881, "num_tokens": 356191178.0, "sample_num_tokens": 6997.0, "step": 9815, "total_num_tokens": 356219166.0, "z_loss": 0.0003770741750486195 }, { "copy_logits_max": -5.993446350097656, "copy_logits_min": -750000000.0, "copy_num_tokens": 419.375, "epoch": 2.0046974725555273, "gen_logits_max": 3.757950782775879, "gen_logits_mean": -17.518495559692383, "gen_logits_min": -30.267757415771484, "gen_logits_std": 3.5523478984832764, "gen_loss": 0.23136766254901886, "grad_norm": 0.3595492925796894, "learning_rate": 1.8530526315789474e-05, "loss": 0.2444, "mean_copy_accuracy": 0.9979075640439987, "mean_gen_accuracy": 0.8881649672985077, "mean_token_accuracy": 0.914254903793335, "num_tokens": 356451516.0, "sample_num_tokens": 8117.5, "step": 9816, "total_num_tokens": 356483986.0, "z_loss": 0.0004703249433077872 }, { "copy_logits_max": -7.473138809204102, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.3125, "epoch": 2.004901710492724, "gen_logits_max": 4.551643371582031, "gen_logits_mean": -17.36161994934082, "gen_logits_min": -30.05872917175293, "gen_logits_std": 3.579042434692383, "gen_loss": 0.2186296284198761, "grad_norm": 0.3531706526061477, "learning_rate": 1.852926315789474e-05, "loss": 0.2211, "mean_copy_accuracy": 0.9969245791435242, "mean_gen_accuracy": 0.9020845144987106, "mean_token_accuracy": 0.9216194152832031, "num_tokens": 356717030.0, "sample_num_tokens": 7851.5, "step": 9817, "total_num_tokens": 356748436.0, "z_loss": 0.00040124202496372163 }, { "copy_logits_max": -5.492033004760742, "copy_logits_min": -750000000.0, "copy_num_tokens": 609.3125, "epoch": 2.0051059484299207, "gen_logits_max": 3.2636187076568604, "gen_logits_mean": -17.852291107177734, "gen_logits_min": -30.623079299926758, "gen_logits_std": 3.6150565147399902, "gen_loss": 0.23775574564933777, "grad_norm": 0.34846335457257427, "learning_rate": 1.8528000000000003e-05, "loss": 0.2332, "mean_copy_accuracy": 0.9980176836252213, "mean_gen_accuracy": 0.8941961824893951, "mean_token_accuracy": 0.9173387289047241, "num_tokens": 356976615.0, "sample_num_tokens": 9244.75, "step": 9818, "total_num_tokens": 357013594.0, "z_loss": 0.00046152991126291454 }, { "copy_logits_max": -6.819974422454834, "copy_logits_min": -750000064.0, "copy_num_tokens": 545.0625, "epoch": 2.005310186367118, "gen_logits_max": 2.269317626953125, "gen_logits_mean": -19.447988510131836, "gen_logits_min": -32.227333068847656, "gen_logits_std": 3.6864266395568848, "gen_loss": 0.21175451576709747, "grad_norm": 0.3327998390096986, "learning_rate": 1.8526736842105264e-05, "loss": 0.2053, "mean_copy_accuracy": 0.9976108372211456, "mean_gen_accuracy": 0.8969983160495758, "mean_token_accuracy": 0.9273047298192978, "num_tokens": 357285275.0, "sample_num_tokens": 9121.25, "step": 9819, "total_num_tokens": 357321760.0, "z_loss": 0.0004139892407692969 }, { "copy_logits_max": -8.616531372070312, "copy_logits_min": -750000000.0, "copy_num_tokens": 293.0, "epoch": 2.0055144243043146, "gen_logits_max": 4.027197360992432, "gen_logits_mean": -18.402565002441406, "gen_logits_min": -30.729339599609375, "gen_logits_std": 3.6125993728637695, "gen_loss": 0.24596953392028809, "grad_norm": 0.3738578685648412, "learning_rate": 1.8525473684210528e-05, "loss": 0.2327, "mean_copy_accuracy": 0.9976908564567566, "mean_gen_accuracy": 0.8950870335102081, "mean_token_accuracy": 0.9185213297605515, "num_tokens": 357548977.0, "sample_num_tokens": 7887.25, "step": 9820, "total_num_tokens": 357580526.0, "z_loss": 0.00038930450682528317 }, { "copy_logits_max": -6.0402679443359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 404.0625, "epoch": 2.0057186622415113, "gen_logits_max": 2.5979185104370117, "gen_logits_mean": -19.334373474121094, "gen_logits_min": -32.23008728027344, "gen_logits_std": 3.6656508445739746, "gen_loss": 0.23633576929569244, "grad_norm": 0.37044281606140606, "learning_rate": 1.852421052631579e-05, "loss": 0.2223, "mean_copy_accuracy": 0.9981883019208908, "mean_gen_accuracy": 0.8985240459442139, "mean_token_accuracy": 0.9222319722175598, "num_tokens": 357809694.0, "sample_num_tokens": 8011.0, "step": 9821, "total_num_tokens": 357841738.0, "z_loss": 0.000477367895655334 }, { "copy_logits_max": -7.275378704071045, "copy_logits_min": -750000000.0, "copy_num_tokens": 452.875, "epoch": 2.005922900178708, "gen_logits_max": 1.8367804288864136, "gen_logits_mean": -21.406734466552734, "gen_logits_min": -33.948341369628906, "gen_logits_std": 3.7817161083221436, "gen_loss": 0.2120763659477234, "grad_norm": 0.3651153526168049, "learning_rate": 1.8522947368421053e-05, "loss": 0.225, "mean_copy_accuracy": 0.9976995587348938, "mean_gen_accuracy": 0.8926364630460739, "mean_token_accuracy": 0.9213750958442688, "num_tokens": 358093565.0, "sample_num_tokens": 8763.75, "step": 9822, "total_num_tokens": 358128620.0, "z_loss": 0.00040189866558648646 }, { "copy_logits_max": -7.396989822387695, "copy_logits_min": -750000000.0, "copy_num_tokens": 440.4375, "epoch": 2.006127138115905, "gen_logits_max": 3.477903366088867, "gen_logits_mean": -17.8818302154541, "gen_logits_min": -30.460538864135742, "gen_logits_std": 3.6058411598205566, "gen_loss": 0.20040304958820343, "grad_norm": 0.3456351328243379, "learning_rate": 1.8521684210526314e-05, "loss": 0.2235, "mean_copy_accuracy": 0.9978899508714676, "mean_gen_accuracy": 0.8998495787382126, "mean_token_accuracy": 0.9223128706216812, "num_tokens": 358351811.0, "sample_num_tokens": 7836.25, "step": 9823, "total_num_tokens": 358383156.0, "z_loss": 0.00031879174639470875 }, { "copy_logits_max": -7.337968349456787, "copy_logits_min": -750000000.0, "copy_num_tokens": 309.25, "epoch": 2.006331376053102, "gen_logits_max": 3.4718778133392334, "gen_logits_mean": -18.936588287353516, "gen_logits_min": -31.497587203979492, "gen_logits_std": 3.631800651550293, "gen_loss": 0.26550012826919556, "grad_norm": 0.34316711332785776, "learning_rate": 1.852042105263158e-05, "loss": 0.2305, "mean_copy_accuracy": 0.9967913031578064, "mean_gen_accuracy": 0.8978272378444672, "mean_token_accuracy": 0.9191480726003647, "num_tokens": 358625111.0, "sample_num_tokens": 7900.75, "step": 9824, "total_num_tokens": 358656714.0, "z_loss": 0.0004443214274942875 }, { "copy_logits_max": -7.717741012573242, "copy_logits_min": -750000000.0, "copy_num_tokens": 405.875, "epoch": 2.0065356139902986, "gen_logits_max": 3.5194153785705566, "gen_logits_mean": -18.77228546142578, "gen_logits_min": -31.580978393554688, "gen_logits_std": 3.6680641174316406, "gen_loss": 0.21867933869361877, "grad_norm": 0.39073154545493416, "learning_rate": 1.8519157894736843e-05, "loss": 0.2161, "mean_copy_accuracy": 0.9977063089609146, "mean_gen_accuracy": 0.9010859876871109, "mean_token_accuracy": 0.9238713979721069, "num_tokens": 358880410.0, "sample_num_tokens": 7662.5, "step": 9825, "total_num_tokens": 358911060.0, "z_loss": 0.00040114944567903876 }, { "copy_logits_max": -7.565990447998047, "copy_logits_min": -750000000.0, "copy_num_tokens": 385.0, "epoch": 2.0067398519274957, "gen_logits_max": 3.8864171504974365, "gen_logits_mean": -17.62744903564453, "gen_logits_min": -30.705474853515625, "gen_logits_std": 3.6134822368621826, "gen_loss": 0.23242530226707458, "grad_norm": 0.35558433824365976, "learning_rate": 1.8517894736842107e-05, "loss": 0.2184, "mean_copy_accuracy": 0.9975700378417969, "mean_gen_accuracy": 0.8992248624563217, "mean_token_accuracy": 0.9226823896169662, "num_tokens": 359151900.0, "sample_num_tokens": 7291.5, "step": 9826, "total_num_tokens": 359181066.0, "z_loss": 0.0003919617156498134 }, { "copy_logits_max": -2.3145551681518555, "copy_logits_min": -750000000.0, "copy_num_tokens": 504.9375, "epoch": 2.0069440898646924, "gen_logits_max": 2.789458751678467, "gen_logits_mean": -18.74527359008789, "gen_logits_min": -31.721271514892578, "gen_logits_std": 3.681511878967285, "gen_loss": 0.21567144989967346, "grad_norm": 0.36432778192421134, "learning_rate": 1.8516631578947368e-05, "loss": 0.2125, "mean_copy_accuracy": 0.9982125163078308, "mean_gen_accuracy": 0.8970162570476532, "mean_token_accuracy": 0.9267937690019608, "num_tokens": 359426283.0, "sample_num_tokens": 8234.75, "step": 9827, "total_num_tokens": 359459222.0, "z_loss": 0.00046707878937013447 }, { "copy_logits_max": -5.43345832824707, "copy_logits_min": -750000000.0, "copy_num_tokens": 305.1875, "epoch": 2.007148327801889, "gen_logits_max": 4.596378326416016, "gen_logits_mean": -16.8258113861084, "gen_logits_min": -30.07158660888672, "gen_logits_std": 3.557551145553589, "gen_loss": 0.2091750055551529, "grad_norm": 0.42258504430671096, "learning_rate": 1.8515368421052632e-05, "loss": 0.2371, "mean_copy_accuracy": 0.9968614131212234, "mean_gen_accuracy": 0.8961264193058014, "mean_token_accuracy": 0.915838286280632, "num_tokens": 359668496.0, "sample_num_tokens": 7289.0, "step": 9828, "total_num_tokens": 359697652.0, "z_loss": 0.0003734427154995501 }, { "copy_logits_max": -6.743307113647461, "copy_logits_min": -750000000.0, "copy_num_tokens": 455.6875, "epoch": 2.007352565739086, "gen_logits_max": 3.459298610687256, "gen_logits_mean": -17.870922088623047, "gen_logits_min": -30.900165557861328, "gen_logits_std": 3.6030664443969727, "gen_loss": 0.23124609887599945, "grad_norm": 0.3614620575489691, "learning_rate": 1.8514105263157897e-05, "loss": 0.226, "mean_copy_accuracy": 0.9964854270219803, "mean_gen_accuracy": 0.8943075984716415, "mean_token_accuracy": 0.9221267402172089, "num_tokens": 359917482.0, "sample_num_tokens": 8300.0, "step": 9829, "total_num_tokens": 359950682.0, "z_loss": 0.00047458428889513016 }, { "copy_logits_max": -3.3399910926818848, "copy_logits_min": -750000000.0, "copy_num_tokens": 689.5625, "epoch": 2.007556803676283, "gen_logits_max": 4.138240814208984, "gen_logits_mean": -15.62960147857666, "gen_logits_min": -28.565744400024414, "gen_logits_std": 3.5127134323120117, "gen_loss": 0.20360544323921204, "grad_norm": 0.34949457273127155, "learning_rate": 1.8512842105263158e-05, "loss": 0.2098, "mean_copy_accuracy": 0.9979386180639267, "mean_gen_accuracy": 0.901228278875351, "mean_token_accuracy": 0.9260823577642441, "num_tokens": 360202644.0, "sample_num_tokens": 10153.5, "step": 9830, "total_num_tokens": 360243258.0, "z_loss": 0.00037122509093023837 }, { "copy_logits_max": -6.9487528800964355, "copy_logits_min": -750000000.0, "copy_num_tokens": 521.5, "epoch": 2.0077610416134797, "gen_logits_max": 2.520317316055298, "gen_logits_mean": -19.420955657958984, "gen_logits_min": -32.513919830322266, "gen_logits_std": 3.683405637741089, "gen_loss": 0.24627289175987244, "grad_norm": 0.36087746406062765, "learning_rate": 1.8511578947368422e-05, "loss": 0.2203, "mean_copy_accuracy": 0.9979744702577591, "mean_gen_accuracy": 0.8971219062805176, "mean_token_accuracy": 0.9232051819562912, "num_tokens": 360473972.0, "sample_num_tokens": 8590.5, "step": 9831, "total_num_tokens": 360508334.0, "z_loss": 0.0004764723707921803 }, { "copy_logits_max": -6.778068542480469, "copy_logits_min": -750000000.0, "copy_num_tokens": 277.375, "epoch": 2.0079652795506764, "gen_logits_max": 4.486352920532227, "gen_logits_mean": -18.52057456970215, "gen_logits_min": -30.98607635498047, "gen_logits_std": 3.6424646377563477, "gen_loss": 0.22595196962356567, "grad_norm": 0.3824170714514731, "learning_rate": 1.8510315789473683e-05, "loss": 0.2342, "mean_copy_accuracy": 0.9970307946205139, "mean_gen_accuracy": 0.8939198702573776, "mean_token_accuracy": 0.916655644774437, "num_tokens": 360737825.0, "sample_num_tokens": 7705.75, "step": 9832, "total_num_tokens": 360768648.0, "z_loss": 0.00043441756861284375 }, { "copy_logits_max": -6.855430603027344, "copy_logits_min": -750000000.0, "copy_num_tokens": 550.1875, "epoch": 2.0081695174878735, "gen_logits_max": 2.62648868560791, "gen_logits_mean": -18.869476318359375, "gen_logits_min": -31.614749908447266, "gen_logits_std": 3.679655075073242, "gen_loss": 0.21498647332191467, "grad_norm": 0.3499379863717105, "learning_rate": 1.8509052631578947e-05, "loss": 0.2323, "mean_copy_accuracy": 0.9977269619703293, "mean_gen_accuracy": 0.8906685560941696, "mean_token_accuracy": 0.917996272444725, "num_tokens": 361014304.0, "sample_num_tokens": 8929.0, "step": 9833, "total_num_tokens": 361050020.0, "z_loss": 0.00037674358463846147 }, { "copy_logits_max": -5.8773651123046875, "copy_logits_min": -750000000.0, "copy_num_tokens": 486.0, "epoch": 2.0083737554250702, "gen_logits_max": 3.6177592277526855, "gen_logits_mean": -18.341997146606445, "gen_logits_min": -31.13184356689453, "gen_logits_std": 3.615720272064209, "gen_loss": 0.2598656415939331, "grad_norm": 0.3692283245458221, "learning_rate": 1.850778947368421e-05, "loss": 0.2376, "mean_copy_accuracy": 0.9974139481782913, "mean_gen_accuracy": 0.8935193121433258, "mean_token_accuracy": 0.9170736074447632, "num_tokens": 361258296.0, "sample_num_tokens": 9427.0, "step": 9834, "total_num_tokens": 361296004.0, "z_loss": 0.0005530681810341775 }, { "copy_logits_max": -6.130439758300781, "copy_logits_min": -750000000.0, "copy_num_tokens": 647.625, "epoch": 2.008577993362267, "gen_logits_max": 3.0902087688446045, "gen_logits_mean": -17.88638687133789, "gen_logits_min": -30.45464515686035, "gen_logits_std": 3.6351656913757324, "gen_loss": 0.21522068977355957, "grad_norm": 0.3523619527587988, "learning_rate": 1.8506526315789476e-05, "loss": 0.2187, "mean_copy_accuracy": 0.9975366741418839, "mean_gen_accuracy": 0.8941259682178497, "mean_token_accuracy": 0.9217707812786102, "num_tokens": 361538259.0, "sample_num_tokens": 9919.25, "step": 9835, "total_num_tokens": 361577936.0, "z_loss": 0.0003410699137020856 }, { "copy_logits_max": -6.169888019561768, "copy_logits_min": -687500032.0, "copy_num_tokens": 507.1875, "epoch": 2.008782231299464, "gen_logits_max": 5.015813827514648, "gen_logits_mean": -16.892486572265625, "gen_logits_min": -29.224815368652344, "gen_logits_std": 3.5764355659484863, "gen_loss": 0.20239610970020294, "grad_norm": 0.3477095122765146, "learning_rate": 1.8505263157894737e-05, "loss": 0.2161, "mean_copy_accuracy": 0.9973004758358002, "mean_gen_accuracy": 0.8973604291677475, "mean_token_accuracy": 0.9238449186086655, "num_tokens": 361806993.0, "sample_num_tokens": 7837.75, "step": 9836, "total_num_tokens": 361838344.0, "z_loss": 0.0003393960068933666 }, { "copy_logits_max": -5.277216911315918, "copy_logits_min": -750000000.0, "copy_num_tokens": 456.875, "epoch": 2.008986469236661, "gen_logits_max": 3.6447978019714355, "gen_logits_mean": -17.747817993164062, "gen_logits_min": -30.717247009277344, "gen_logits_std": 3.6061930656433105, "gen_loss": 0.22324247658252716, "grad_norm": 0.34302086775031, "learning_rate": 1.8504e-05, "loss": 0.2182, "mean_copy_accuracy": 0.9973280876874924, "mean_gen_accuracy": 0.8958801776170731, "mean_token_accuracy": 0.9227726757526398, "num_tokens": 362093449.0, "sample_num_tokens": 8142.25, "step": 9837, "total_num_tokens": 362126018.0, "z_loss": 0.0003846450708806515 }, { "copy_logits_max": -1.439125418663025, "copy_logits_min": -750000064.0, "copy_num_tokens": 650.9375, "epoch": 2.0091907071738575, "gen_logits_max": 4.14869499206543, "gen_logits_mean": -16.994449615478516, "gen_logits_min": -29.916004180908203, "gen_logits_std": 3.613307237625122, "gen_loss": 0.22063328325748444, "grad_norm": 0.3806482609283387, "learning_rate": 1.8502736842105262e-05, "loss": 0.2433, "mean_copy_accuracy": 0.9970388263463974, "mean_gen_accuracy": 0.8873982578516006, "mean_token_accuracy": 0.9153634607791901, "num_tokens": 362359497.0, "sample_num_tokens": 9569.75, "step": 9838, "total_num_tokens": 362397776.0, "z_loss": 0.0004440133925527334 }, { "copy_logits_max": -5.562440872192383, "copy_logits_min": -750000000.0, "copy_num_tokens": 297.0625, "epoch": 2.009394945111054, "gen_logits_max": 4.224817276000977, "gen_logits_mean": -19.16672134399414, "gen_logits_min": -32.0670166015625, "gen_logits_std": 3.678997039794922, "gen_loss": 0.2296450436115265, "grad_norm": 0.34752563760076505, "learning_rate": 1.8501473684210526e-05, "loss": 0.2255, "mean_copy_accuracy": 0.9978064745664597, "mean_gen_accuracy": 0.8963886946439743, "mean_token_accuracy": 0.9206161797046661, "num_tokens": 362627280.0, "sample_num_tokens": 7187.0, "step": 9839, "total_num_tokens": 362656028.0, "z_loss": 0.0004390508693177253 }, { "copy_logits_max": -3.214839220046997, "copy_logits_min": -750000000.0, "copy_num_tokens": 337.3125, "epoch": 2.0095991830482514, "gen_logits_max": 4.767226219177246, "gen_logits_mean": -17.509754180908203, "gen_logits_min": -30.405506134033203, "gen_logits_std": 3.6229984760284424, "gen_loss": 0.22630587220191956, "grad_norm": 0.36856386532045243, "learning_rate": 1.8500210526315787e-05, "loss": 0.2177, "mean_copy_accuracy": 0.9973016828298569, "mean_gen_accuracy": 0.9003660678863525, "mean_token_accuracy": 0.9218437969684601, "num_tokens": 362876325.0, "sample_num_tokens": 8069.75, "step": 9840, "total_num_tokens": 362908604.0, "z_loss": 0.0004574928607326001 }, { "copy_logits_max": -1.6185803413391113, "copy_logits_min": -750000000.0, "copy_num_tokens": 238.625, "epoch": 2.009803420985448, "gen_logits_max": 5.587804794311523, "gen_logits_mean": -16.61965560913086, "gen_logits_min": -29.336408615112305, "gen_logits_std": 3.5605309009552, "gen_loss": 0.23470577597618103, "grad_norm": 0.3618633093274379, "learning_rate": 1.849894736842105e-05, "loss": 0.2227, "mean_copy_accuracy": 0.9970624893903732, "mean_gen_accuracy": 0.8967692852020264, "mean_token_accuracy": 0.9204530715942383, "num_tokens": 363139218.0, "sample_num_tokens": 7226.0, "step": 9841, "total_num_tokens": 363168122.0, "z_loss": 0.0003942748880945146 }, { "copy_logits_max": -1.0983810424804688, "copy_logits_min": -625000064.0, "copy_num_tokens": 694.9375, "epoch": 2.0100076589226448, "gen_logits_max": 4.352985858917236, "gen_logits_mean": -16.49797821044922, "gen_logits_min": -30.143407821655273, "gen_logits_std": 3.5768141746520996, "gen_loss": 0.22228293120861053, "grad_norm": 0.3862520306454217, "learning_rate": 1.849768421052632e-05, "loss": 0.226, "mean_copy_accuracy": 0.9970917403697968, "mean_gen_accuracy": 0.8983190208673477, "mean_token_accuracy": 0.9208750873804092, "num_tokens": 363385703.0, "sample_num_tokens": 10311.75, "step": 9842, "total_num_tokens": 363426950.0, "z_loss": 0.00034838973078876734 }, { "copy_logits_max": -2.156888961791992, "copy_logits_min": -625000064.0, "copy_num_tokens": 420.375, "epoch": 2.010211896859842, "gen_logits_max": 4.784173965454102, "gen_logits_mean": -17.163721084594727, "gen_logits_min": -30.163005828857422, "gen_logits_std": 3.592979907989502, "gen_loss": 0.2318786382675171, "grad_norm": 0.3788600829863425, "learning_rate": 1.849642105263158e-05, "loss": 0.2324, "mean_copy_accuracy": 0.9971446394920349, "mean_gen_accuracy": 0.8920206129550934, "mean_token_accuracy": 0.9193509966135025, "num_tokens": 363653215.0, "sample_num_tokens": 7788.25, "step": 9843, "total_num_tokens": 363684368.0, "z_loss": 0.0004355431883595884 }, { "copy_logits_max": -3.794074535369873, "copy_logits_min": -687500032.0, "copy_num_tokens": 337.5625, "epoch": 2.0104161347970386, "gen_logits_max": 4.10650634765625, "gen_logits_mean": -18.777053833007812, "gen_logits_min": -31.681421279907227, "gen_logits_std": 3.6515207290649414, "gen_loss": 0.22615469992160797, "grad_norm": 0.42770709039722044, "learning_rate": 1.8495157894736844e-05, "loss": 0.2249, "mean_copy_accuracy": 0.9966828674077988, "mean_gen_accuracy": 0.8972554057836533, "mean_token_accuracy": 0.9208043962717056, "num_tokens": 363926268.0, "sample_num_tokens": 7669.0, "step": 9844, "total_num_tokens": 363956944.0, "z_loss": 0.00044979690574109554 }, { "copy_logits_max": -0.9011362791061401, "copy_logits_min": -687500032.0, "copy_num_tokens": 426.5, "epoch": 2.0106203727342353, "gen_logits_max": 3.759446144104004, "gen_logits_mean": -18.577190399169922, "gen_logits_min": -31.19651222229004, "gen_logits_std": 3.653317928314209, "gen_loss": 0.22307340800762177, "grad_norm": 0.3572234564945711, "learning_rate": 1.8493894736842105e-05, "loss": 0.222, "mean_copy_accuracy": 0.9978545159101486, "mean_gen_accuracy": 0.9006032645702362, "mean_token_accuracy": 0.9228048026561737, "num_tokens": 364187244.0, "sample_num_tokens": 8614.0, "step": 9845, "total_num_tokens": 364221700.0, "z_loss": 0.0004382878541946411 }, { "copy_logits_max": 1.8374450206756592, "copy_logits_min": -750000000.0, "copy_num_tokens": 546.625, "epoch": 2.010824610671432, "gen_logits_max": 4.755615234375, "gen_logits_mean": -15.918495178222656, "gen_logits_min": -29.171520233154297, "gen_logits_std": 3.5402069091796875, "gen_loss": 0.21371065080165863, "grad_norm": 0.36092884320403096, "learning_rate": 1.849263157894737e-05, "loss": 0.2139, "mean_copy_accuracy": 0.997405156493187, "mean_gen_accuracy": 0.9000416845083237, "mean_token_accuracy": 0.9238071739673615, "num_tokens": 364459863.0, "sample_num_tokens": 8802.75, "step": 9846, "total_num_tokens": 364495074.0, "z_loss": 0.0003835883107967675 }, { "copy_logits_max": -4.63137149810791, "copy_logits_min": -750000000.0, "copy_num_tokens": 418.9375, "epoch": 2.011028848608629, "gen_logits_max": 3.1605167388916016, "gen_logits_mean": -18.581762313842773, "gen_logits_min": -31.35831069946289, "gen_logits_std": 3.650514841079712, "gen_loss": 0.20285198092460632, "grad_norm": 0.3441358543905763, "learning_rate": 1.849136842105263e-05, "loss": 0.2121, "mean_copy_accuracy": 0.9980926364660263, "mean_gen_accuracy": 0.8993834853172302, "mean_token_accuracy": 0.9237255156040192, "num_tokens": 364752267.0, "sample_num_tokens": 7517.25, "step": 9847, "total_num_tokens": 364782336.0, "z_loss": 0.00034227946889586747 }, { "copy_logits_max": -3.3445849418640137, "copy_logits_min": -750000000.0, "copy_num_tokens": 464.5625, "epoch": 2.011233086545826, "gen_logits_max": 3.4116733074188232, "gen_logits_mean": -17.91318130493164, "gen_logits_min": -30.49140167236328, "gen_logits_std": 3.6130905151367188, "gen_loss": 0.2077731192111969, "grad_norm": 0.3933527337146251, "learning_rate": 1.8490105263157895e-05, "loss": 0.2281, "mean_copy_accuracy": 0.9976695477962494, "mean_gen_accuracy": 0.8948954641819, "mean_token_accuracy": 0.9201821833848953, "num_tokens": 365016790.0, "sample_num_tokens": 8276.0, "step": 9848, "total_num_tokens": 365049894.0, "z_loss": 0.0003699205699376762 }, { "copy_logits_max": 0.16290819644927979, "copy_logits_min": -750000000.0, "copy_num_tokens": 436.625, "epoch": 2.0114373244830226, "gen_logits_max": 4.3218231201171875, "gen_logits_mean": -16.485736846923828, "gen_logits_min": -29.80482292175293, "gen_logits_std": 3.5650312900543213, "gen_loss": 0.21552902460098267, "grad_norm": 0.35279619977669563, "learning_rate": 1.8488842105263156e-05, "loss": 0.2207, "mean_copy_accuracy": 0.9968253672122955, "mean_gen_accuracy": 0.8975192755460739, "mean_token_accuracy": 0.9214090555906296, "num_tokens": 365276202.0, "sample_num_tokens": 8215.0, "step": 9849, "total_num_tokens": 365309062.0, "z_loss": 0.00041095222695730627 }, { "copy_logits_max": -4.18573522567749, "copy_logits_min": -750000000.0, "copy_num_tokens": 432.6875, "epoch": 2.0116415624202197, "gen_logits_max": 4.179048538208008, "gen_logits_mean": -16.48343276977539, "gen_logits_min": -29.550792694091797, "gen_logits_std": 3.5542752742767334, "gen_loss": 0.22218260169029236, "grad_norm": 0.3327702808575381, "learning_rate": 1.8487578947368424e-05, "loss": 0.2197, "mean_copy_accuracy": 0.9979636371135712, "mean_gen_accuracy": 0.8987275511026382, "mean_token_accuracy": 0.922384724020958, "num_tokens": 365563018.0, "sample_num_tokens": 8220.5, "step": 9850, "total_num_tokens": 365595900.0, "z_loss": 0.0004248403711244464 }, { "copy_logits_max": -1.1415181159973145, "copy_logits_min": -750000000.0, "copy_num_tokens": 336.8125, "epoch": 2.0118458003574164, "gen_logits_max": 4.827807903289795, "gen_logits_mean": -16.591106414794922, "gen_logits_min": -29.136247634887695, "gen_logits_std": 3.5578055381774902, "gen_loss": 0.2404792308807373, "grad_norm": 0.38359929969940937, "learning_rate": 1.8486315789473684e-05, "loss": 0.2305, "mean_copy_accuracy": 0.9971199780702591, "mean_gen_accuracy": 0.8939340561628342, "mean_token_accuracy": 0.9186717718839645, "num_tokens": 365837034.0, "sample_num_tokens": 7485.5, "step": 9851, "total_num_tokens": 365866976.0, "z_loss": 0.0004654687363654375 }, { "copy_logits_max": -1.489332675933838, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.625, "epoch": 2.012050038294613, "gen_logits_max": 3.8252034187316895, "gen_logits_mean": -16.71697235107422, "gen_logits_min": -29.699039459228516, "gen_logits_std": 3.5444159507751465, "gen_loss": 0.21371576189994812, "grad_norm": 0.39056737569781885, "learning_rate": 1.848505263157895e-05, "loss": 0.2253, "mean_copy_accuracy": 0.9964707046747208, "mean_gen_accuracy": 0.9011845737695694, "mean_token_accuracy": 0.922937884926796, "num_tokens": 366117000.0, "sample_num_tokens": 8731.5, "step": 9852, "total_num_tokens": 366151926.0, "z_loss": 0.00039234571158885956 }, { "copy_logits_max": -2.027122974395752, "copy_logits_min": -750000000.0, "copy_num_tokens": 560.5625, "epoch": 2.01225427623181, "gen_logits_max": 2.5615525245666504, "gen_logits_mean": -18.96865463256836, "gen_logits_min": -31.407197952270508, "gen_logits_std": 3.662783145904541, "gen_loss": 0.2321242243051529, "grad_norm": 0.3716421021205107, "learning_rate": 1.848378947368421e-05, "loss": 0.2452, "mean_copy_accuracy": 0.9977638870477676, "mean_gen_accuracy": 0.8865914195775986, "mean_token_accuracy": 0.9125887751579285, "num_tokens": 366383389.0, "sample_num_tokens": 9265.25, "step": 9853, "total_num_tokens": 366420450.0, "z_loss": 0.0004228311008773744 }, { "copy_logits_max": -3.4013233184814453, "copy_logits_min": -687500032.0, "copy_num_tokens": 514.3125, "epoch": 2.012458514169007, "gen_logits_max": 3.1458020210266113, "gen_logits_mean": -18.608001708984375, "gen_logits_min": -31.147438049316406, "gen_logits_std": 3.6575279235839844, "gen_loss": 0.21339713037014008, "grad_norm": 0.35732958717053986, "learning_rate": 1.8482526315789474e-05, "loss": 0.2282, "mean_copy_accuracy": 0.9977125078439713, "mean_gen_accuracy": 0.8963326960802078, "mean_token_accuracy": 0.9189066886901855, "num_tokens": 366672023.0, "sample_num_tokens": 9301.25, "step": 9854, "total_num_tokens": 366709228.0, "z_loss": 0.0003562472411431372 }, { "copy_logits_max": -2.3127615451812744, "copy_logits_min": -750000000.0, "copy_num_tokens": 709.4375, "epoch": 2.0126627521062037, "gen_logits_max": 4.412789344787598, "gen_logits_mean": -16.454774856567383, "gen_logits_min": -29.102758407592773, "gen_logits_std": 3.5475730895996094, "gen_loss": 0.21839582920074463, "grad_norm": 0.3565155100988741, "learning_rate": 1.848126315789474e-05, "loss": 0.2224, "mean_copy_accuracy": 0.9968618452548981, "mean_gen_accuracy": 0.8976860642433167, "mean_token_accuracy": 0.9223012328147888, "num_tokens": 366952878.0, "sample_num_tokens": 10313.5, "step": 9855, "total_num_tokens": 366994132.0, "z_loss": 0.00037062494084239006 }, { "copy_logits_max": -1.450646162033081, "copy_logits_min": -750000000.0, "copy_num_tokens": 597.75, "epoch": 2.0128669900434004, "gen_logits_max": 2.767002582550049, "gen_logits_mean": -18.15273094177246, "gen_logits_min": -30.968534469604492, "gen_logits_std": 3.6617517471313477, "gen_loss": 0.21209454536437988, "grad_norm": 0.44773292333107617, "learning_rate": 1.848e-05, "loss": 0.2244, "mean_copy_accuracy": 0.9965294748544693, "mean_gen_accuracy": 0.8986302614212036, "mean_token_accuracy": 0.9209903627634048, "num_tokens": 367216319.0, "sample_num_tokens": 9431.25, "step": 9856, "total_num_tokens": 367254044.0, "z_loss": 0.00042400910751894116 }, { "copy_logits_max": -0.5399110317230225, "copy_logits_min": -750000000.0, "copy_num_tokens": 636.0, "epoch": 2.0130712279805976, "gen_logits_max": 4.0019636154174805, "gen_logits_mean": -17.10457992553711, "gen_logits_min": -30.406253814697266, "gen_logits_std": 3.592197895050049, "gen_loss": 0.21173372864723206, "grad_norm": 0.3374236373845365, "learning_rate": 1.8478736842105264e-05, "loss": 0.2076, "mean_copy_accuracy": 0.9984879791736603, "mean_gen_accuracy": 0.897430345416069, "mean_token_accuracy": 0.9268270283937454, "num_tokens": 367520785.0, "sample_num_tokens": 10038.25, "step": 9857, "total_num_tokens": 367560938.0, "z_loss": 0.00033602493931539357 }, { "copy_logits_max": -0.24717259407043457, "copy_logits_min": -750000000.0, "copy_num_tokens": 475.3125, "epoch": 2.0132754659177943, "gen_logits_max": 3.9989757537841797, "gen_logits_mean": -17.243507385253906, "gen_logits_min": -30.151912689208984, "gen_logits_std": 3.5906982421875, "gen_loss": 0.23494252562522888, "grad_norm": 0.3336119403283796, "learning_rate": 1.8477473684210528e-05, "loss": 0.216, "mean_copy_accuracy": 0.9982731938362122, "mean_gen_accuracy": 0.8961005061864853, "mean_token_accuracy": 0.9250800907611847, "num_tokens": 367802852.0, "sample_num_tokens": 7957.0, "step": 9858, "total_num_tokens": 367834680.0, "z_loss": 0.0004478541959542781 }, { "copy_logits_max": -0.7755345702171326, "copy_logits_min": -750000064.0, "copy_num_tokens": 357.375, "epoch": 2.013479703854991, "gen_logits_max": 4.2847089767456055, "gen_logits_mean": -17.504173278808594, "gen_logits_min": -30.022415161132812, "gen_logits_std": 3.611743211746216, "gen_loss": 0.2273109406232834, "grad_norm": 0.3632784488656855, "learning_rate": 1.8476210526315792e-05, "loss": 0.2197, "mean_copy_accuracy": 0.9979834854602814, "mean_gen_accuracy": 0.8955831974744797, "mean_token_accuracy": 0.9228280484676361, "num_tokens": 368085632.0, "sample_num_tokens": 7151.0, "step": 9859, "total_num_tokens": 368114236.0, "z_loss": 0.0004319317522458732 }, { "copy_logits_max": -2.145735263824463, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.5625, "epoch": 2.0136839417921877, "gen_logits_max": 3.7427191734313965, "gen_logits_mean": -18.610069274902344, "gen_logits_min": -31.44355010986328, "gen_logits_std": 3.6284403800964355, "gen_loss": 0.24770891666412354, "grad_norm": 0.3759077455324268, "learning_rate": 1.8474947368421053e-05, "loss": 0.2378, "mean_copy_accuracy": 0.9974165707826614, "mean_gen_accuracy": 0.8903523087501526, "mean_token_accuracy": 0.9159487932920456, "num_tokens": 368342540.0, "sample_num_tokens": 7593.0, "step": 9860, "total_num_tokens": 368372912.0, "z_loss": 0.00042813466279767454 }, { "copy_logits_max": -3.859715700149536, "copy_logits_min": -750000000.0, "copy_num_tokens": 301.5625, "epoch": 2.013888179729385, "gen_logits_max": 4.121125221252441, "gen_logits_mean": -18.994705200195312, "gen_logits_min": -31.495887756347656, "gen_logits_std": 3.664234161376953, "gen_loss": 0.22003278136253357, "grad_norm": 0.4217931674741083, "learning_rate": 1.8473684210526317e-05, "loss": 0.2396, "mean_copy_accuracy": 0.996906116604805, "mean_gen_accuracy": 0.8905606716871262, "mean_token_accuracy": 0.9159769713878632, "num_tokens": 368596597.0, "sample_num_tokens": 7383.25, "step": 9861, "total_num_tokens": 368626130.0, "z_loss": 0.0004569779266603291 }, { "copy_logits_max": -2.0894827842712402, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.25, "epoch": 2.0140924176665815, "gen_logits_max": 4.118178367614746, "gen_logits_mean": -18.197246551513672, "gen_logits_min": -30.70297622680664, "gen_logits_std": 3.609043836593628, "gen_loss": 0.23423966765403748, "grad_norm": 0.36093885058697245, "learning_rate": 1.847242105263158e-05, "loss": 0.2242, "mean_copy_accuracy": 0.9972774386405945, "mean_gen_accuracy": 0.8971394151449203, "mean_token_accuracy": 0.9198482036590576, "num_tokens": 368852479.0, "sample_num_tokens": 8811.25, "step": 9862, "total_num_tokens": 368887724.0, "z_loss": 0.0004279145214240998 }, { "copy_logits_max": -2.2005581855773926, "copy_logits_min": -687500032.0, "copy_num_tokens": 586.3125, "epoch": 2.0142966556037782, "gen_logits_max": 3.076812744140625, "gen_logits_mean": -18.352279663085938, "gen_logits_min": -30.976577758789062, "gen_logits_std": 3.619802236557007, "gen_loss": 0.22298304736614227, "grad_norm": 0.3493744961234975, "learning_rate": 1.8471157894736843e-05, "loss": 0.2189, "mean_copy_accuracy": 0.9983542561531067, "mean_gen_accuracy": 0.8945974111557007, "mean_token_accuracy": 0.923601970076561, "num_tokens": 369115324.0, "sample_num_tokens": 9517.0, "step": 9863, "total_num_tokens": 369153392.0, "z_loss": 0.0003967714146710932 }, { "copy_logits_max": -1.8682551383972168, "copy_logits_min": -750000000.0, "copy_num_tokens": 353.25, "epoch": 2.0145008935409754, "gen_logits_max": 4.997181415557861, "gen_logits_mean": -16.95555877685547, "gen_logits_min": -29.29144287109375, "gen_logits_std": 3.5621209144592285, "gen_loss": 0.2192153036594391, "grad_norm": 0.3222607746272961, "learning_rate": 1.8469894736842104e-05, "loss": 0.2067, "mean_copy_accuracy": 0.9976586401462555, "mean_gen_accuracy": 0.9015648066997528, "mean_token_accuracy": 0.9263166040182114, "num_tokens": 369413782.0, "sample_num_tokens": 7672.5, "step": 9864, "total_num_tokens": 369444472.0, "z_loss": 0.0003845698665827513 }, { "copy_logits_max": -1.0481332540512085, "copy_logits_min": -750000000.0, "copy_num_tokens": 212.1875, "epoch": 2.014705131478172, "gen_logits_max": 5.292110919952393, "gen_logits_mean": -16.813785552978516, "gen_logits_min": -29.29776954650879, "gen_logits_std": 3.527022123336792, "gen_loss": 0.24276399612426758, "grad_norm": 0.3822498398091233, "learning_rate": 1.8468631578947368e-05, "loss": 0.2371, "mean_copy_accuracy": 0.9958272576332092, "mean_gen_accuracy": 0.8968409895896912, "mean_token_accuracy": 0.9160865247249603, "num_tokens": 369664931.0, "sample_num_tokens": 6820.25, "step": 9865, "total_num_tokens": 369692212.0, "z_loss": 0.0004985284758731723 }, { "copy_logits_max": -1.3627467155456543, "copy_logits_min": -750000000.0, "copy_num_tokens": 551.75, "epoch": 2.014909369415369, "gen_logits_max": 2.253427028656006, "gen_logits_mean": -19.4730281829834, "gen_logits_min": -32.078216552734375, "gen_logits_std": 3.6714465618133545, "gen_loss": 0.2338871955871582, "grad_norm": 0.36776116799312075, "learning_rate": 1.8467368421052632e-05, "loss": 0.2215, "mean_copy_accuracy": 0.9975180774927139, "mean_gen_accuracy": 0.8936899453401566, "mean_token_accuracy": 0.9216820150613785, "num_tokens": 369914846.0, "sample_num_tokens": 9124.0, "step": 9866, "total_num_tokens": 369951342.0, "z_loss": 0.00043502560583874583 }, { "copy_logits_max": 3.398677349090576, "copy_logits_min": -687500032.0, "copy_num_tokens": 473.6875, "epoch": 2.015113607352566, "gen_logits_max": 4.318795204162598, "gen_logits_mean": -16.411312103271484, "gen_logits_min": -29.018192291259766, "gen_logits_std": 3.494727611541748, "gen_loss": 0.2538484036922455, "grad_norm": 0.3417792318807474, "learning_rate": 1.8466105263157897e-05, "loss": 0.2279, "mean_copy_accuracy": 0.9972877949476242, "mean_gen_accuracy": 0.8965625613927841, "mean_token_accuracy": 0.9209388345479965, "num_tokens": 370179140.0, "sample_num_tokens": 8842.5, "step": 9867, "total_num_tokens": 370214510.0, "z_loss": 0.0004957499913871288 }, { "copy_logits_max": 1.7400734424591064, "copy_logits_min": -687500032.0, "copy_num_tokens": 843.625, "epoch": 2.0153178452897627, "gen_logits_max": 2.836992025375366, "gen_logits_mean": -16.77908706665039, "gen_logits_min": -29.732818603515625, "gen_logits_std": 3.5203301906585693, "gen_loss": 0.22688481211662292, "grad_norm": 0.3460032575740254, "learning_rate": 1.846484210526316e-05, "loss": 0.2245, "mean_copy_accuracy": 0.9972429573535919, "mean_gen_accuracy": 0.895598366856575, "mean_token_accuracy": 0.9212346225976944, "num_tokens": 370443350.0, "sample_num_tokens": 10580.5, "step": 9868, "total_num_tokens": 370485672.0, "z_loss": 0.0004238284891471267 }, { "copy_logits_max": 1.2830597162246704, "copy_logits_min": -750000000.0, "copy_num_tokens": 564.6875, "epoch": 2.0155220832269594, "gen_logits_max": 3.611865282058716, "gen_logits_mean": -16.774250030517578, "gen_logits_min": -29.55914306640625, "gen_logits_std": 3.556955099105835, "gen_loss": 0.23248568177223206, "grad_norm": 0.36549019736621946, "learning_rate": 1.8463578947368422e-05, "loss": 0.232, "mean_copy_accuracy": 0.9975888580083847, "mean_gen_accuracy": 0.8925042897462845, "mean_token_accuracy": 0.918458878993988, "num_tokens": 370725645.0, "sample_num_tokens": 9221.25, "step": 9869, "total_num_tokens": 370762530.0, "z_loss": 0.0003874427347909659 }, { "copy_logits_max": -1.7855534553527832, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.8125, "epoch": 2.015726321164156, "gen_logits_max": 2.6982741355895996, "gen_logits_mean": -18.720169067382812, "gen_logits_min": -31.214977264404297, "gen_logits_std": 3.6516709327697754, "gen_loss": 0.19935348629951477, "grad_norm": 0.3620114146833358, "learning_rate": 1.8462315789473686e-05, "loss": 0.2161, "mean_copy_accuracy": 0.9967721551656723, "mean_gen_accuracy": 0.8998824208974838, "mean_token_accuracy": 0.9236719757318497, "num_tokens": 370984870.0, "sample_num_tokens": 8057.5, "step": 9870, "total_num_tokens": 371017100.0, "z_loss": 0.00031749848858453333 }, { "copy_logits_max": 1.6519429683685303, "copy_logits_min": -750000000.0, "copy_num_tokens": 410.0625, "epoch": 2.015930559101353, "gen_logits_max": 4.278121471405029, "gen_logits_mean": -17.191558837890625, "gen_logits_min": -30.126842498779297, "gen_logits_std": 3.600020170211792, "gen_loss": 0.22428464889526367, "grad_norm": 0.3473765809673246, "learning_rate": 1.8461052631578947e-05, "loss": 0.2253, "mean_copy_accuracy": 0.997793659567833, "mean_gen_accuracy": 0.8948179930448532, "mean_token_accuracy": 0.9214654713869095, "num_tokens": 371263648.0, "sample_num_tokens": 7493.0, "step": 9871, "total_num_tokens": 371293620.0, "z_loss": 0.00037916397559456527 }, { "copy_logits_max": -0.15487223863601685, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.5625, "epoch": 2.01613479703855, "gen_logits_max": 3.5659127235412598, "gen_logits_mean": -17.211223602294922, "gen_logits_min": -30.053537368774414, "gen_logits_std": 3.586977243423462, "gen_loss": 0.2261500507593155, "grad_norm": 0.34956857165746547, "learning_rate": 1.845978947368421e-05, "loss": 0.2128, "mean_copy_accuracy": 0.9970996528863907, "mean_gen_accuracy": 0.899167075753212, "mean_token_accuracy": 0.9262273758649826, "num_tokens": 371520551.0, "sample_num_tokens": 8989.25, "step": 9872, "total_num_tokens": 371556508.0, "z_loss": 0.000356673524947837 }, { "copy_logits_max": 1.6859407424926758, "copy_logits_min": -750000000.0, "copy_num_tokens": 628.1875, "epoch": 2.0163390349757466, "gen_logits_max": 3.9639792442321777, "gen_logits_mean": -15.279326438903809, "gen_logits_min": -27.82563018798828, "gen_logits_std": 3.5051803588867188, "gen_loss": 0.18002985417842865, "grad_norm": 0.35609358987904494, "learning_rate": 1.8458526315789472e-05, "loss": 0.212, "mean_copy_accuracy": 0.997451901435852, "mean_gen_accuracy": 0.89984230697155, "mean_token_accuracy": 0.9249382466077805, "num_tokens": 371806006.0, "sample_num_tokens": 9620.5, "step": 9873, "total_num_tokens": 371844488.0, "z_loss": 0.0002579312422312796 }, { "copy_logits_max": -3.586526870727539, "copy_logits_min": -750000000.0, "copy_num_tokens": 322.25, "epoch": 2.0165432729129438, "gen_logits_max": 3.7353787422180176, "gen_logits_mean": -18.923879623413086, "gen_logits_min": -31.077333450317383, "gen_logits_std": 3.649141788482666, "gen_loss": 0.23469671607017517, "grad_norm": 0.3169683041446386, "learning_rate": 1.8457263157894737e-05, "loss": 0.2235, "mean_copy_accuracy": 0.9975527226924896, "mean_gen_accuracy": 0.8989487439393997, "mean_token_accuracy": 0.9224632829427719, "num_tokens": 372111585.0, "sample_num_tokens": 7921.75, "step": 9874, "total_num_tokens": 372143272.0, "z_loss": 0.0003481063758954406 }, { "copy_logits_max": -3.8694846630096436, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.875, "epoch": 2.0167475108501405, "gen_logits_max": 3.620925188064575, "gen_logits_mean": -18.86412811279297, "gen_logits_min": -31.010366439819336, "gen_logits_std": 3.642256259918213, "gen_loss": 0.2670578956604004, "grad_norm": 0.3706569798903117, "learning_rate": 1.8456e-05, "loss": 0.2472, "mean_copy_accuracy": 0.9969534277915955, "mean_gen_accuracy": 0.8887946605682373, "mean_token_accuracy": 0.9149154424667358, "num_tokens": 372401916.0, "sample_num_tokens": 8007.0, "step": 9875, "total_num_tokens": 372433944.0, "z_loss": 0.0004624594294000417 }, { "copy_logits_max": -3.5386576652526855, "copy_logits_min": -750000000.0, "copy_num_tokens": 506.6875, "epoch": 2.016951748787337, "gen_logits_max": 3.0454959869384766, "gen_logits_mean": -18.60721206665039, "gen_logits_min": -31.061054229736328, "gen_logits_std": 3.636841297149658, "gen_loss": 0.2298029214143753, "grad_norm": 0.352017735544636, "learning_rate": 1.8454736842105265e-05, "loss": 0.2251, "mean_copy_accuracy": 0.9965840429067612, "mean_gen_accuracy": 0.8982000797986984, "mean_token_accuracy": 0.9203723967075348, "num_tokens": 372682524.0, "sample_num_tokens": 9536.5, "step": 9876, "total_num_tokens": 372720670.0, "z_loss": 0.000357322976924479 }, { "copy_logits_max": -0.37771135568618774, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.25, "epoch": 2.017155986724534, "gen_logits_max": 4.287606716156006, "gen_logits_mean": -16.15582275390625, "gen_logits_min": -29.002775192260742, "gen_logits_std": 3.499502658843994, "gen_loss": 0.2439991980791092, "grad_norm": 0.3638931042691213, "learning_rate": 1.8453473684210526e-05, "loss": 0.2357, "mean_copy_accuracy": 0.9967113733291626, "mean_gen_accuracy": 0.8926299065351486, "mean_token_accuracy": 0.917670413851738, "num_tokens": 372970337.0, "sample_num_tokens": 8489.75, "step": 9877, "total_num_tokens": 373004296.0, "z_loss": 0.00039240269688889384 }, { "copy_logits_max": -2.2507567405700684, "copy_logits_min": -750000000.0, "copy_num_tokens": 417.25, "epoch": 2.017360224661731, "gen_logits_max": 4.001596450805664, "gen_logits_mean": -16.64466667175293, "gen_logits_min": -29.171890258789062, "gen_logits_std": 3.5506019592285156, "gen_loss": 0.23274944722652435, "grad_norm": 0.38923627513071163, "learning_rate": 1.845221052631579e-05, "loss": 0.2427, "mean_copy_accuracy": 0.9977224767208099, "mean_gen_accuracy": 0.8935828506946564, "mean_token_accuracy": 0.9157149195671082, "num_tokens": 373237086.0, "sample_num_tokens": 8517.5, "step": 9878, "total_num_tokens": 373271156.0, "z_loss": 0.00040193312452174723 }, { "copy_logits_max": -1.053133249282837, "copy_logits_min": -750000000.0, "copy_num_tokens": 590.9375, "epoch": 2.0175644625989277, "gen_logits_max": 3.995542526245117, "gen_logits_mean": -16.571914672851562, "gen_logits_min": -29.205493927001953, "gen_logits_std": 3.563070774078369, "gen_loss": 0.19761836528778076, "grad_norm": 0.34663239066918283, "learning_rate": 1.845094736842105e-05, "loss": 0.2165, "mean_copy_accuracy": 0.997533917427063, "mean_gen_accuracy": 0.900832936167717, "mean_token_accuracy": 0.9234737306833267, "num_tokens": 373505246.0, "sample_num_tokens": 8920.0, "step": 9879, "total_num_tokens": 373540926.0, "z_loss": 0.0003874695976264775 }, { "copy_logits_max": -5.0914812088012695, "copy_logits_min": -687500032.0, "copy_num_tokens": 534.875, "epoch": 2.0177687005361244, "gen_logits_max": 2.6803040504455566, "gen_logits_mean": -19.361480712890625, "gen_logits_min": -31.889793395996094, "gen_logits_std": 3.687399387359619, "gen_loss": 0.23228123784065247, "grad_norm": 0.33401589008389165, "learning_rate": 1.8449684210526316e-05, "loss": 0.2149, "mean_copy_accuracy": 0.9980466961860657, "mean_gen_accuracy": 0.9008119255304337, "mean_token_accuracy": 0.9251393526792526, "num_tokens": 373785124.0, "sample_num_tokens": 8870.5, "step": 9880, "total_num_tokens": 373820606.0, "z_loss": 0.00038293295074254274 }, { "copy_logits_max": -5.5920586585998535, "copy_logits_min": -750000000.0, "copy_num_tokens": 329.25, "epoch": 2.0179729384733216, "gen_logits_max": 3.77557373046875, "gen_logits_mean": -18.127145767211914, "gen_logits_min": -30.80084991455078, "gen_logits_std": 3.5972681045532227, "gen_loss": 0.24548010528087616, "grad_norm": 0.35039526520181286, "learning_rate": 1.844842105263158e-05, "loss": 0.232, "mean_copy_accuracy": 0.9970496743917465, "mean_gen_accuracy": 0.896304726600647, "mean_token_accuracy": 0.9188359826803207, "num_tokens": 374062396.0, "sample_num_tokens": 7166.0, "step": 9881, "total_num_tokens": 374091060.0, "z_loss": 0.000467551639303565 }, { "copy_logits_max": -4.859025955200195, "copy_logits_min": -750000000.0, "copy_num_tokens": 355.875, "epoch": 2.0181771764105183, "gen_logits_max": 3.6744625568389893, "gen_logits_mean": -17.415403366088867, "gen_logits_min": -29.87401580810547, "gen_logits_std": 3.5182204246520996, "gen_loss": 0.25807470083236694, "grad_norm": 0.3854877467416806, "learning_rate": 1.844715789473684e-05, "loss": 0.2365, "mean_copy_accuracy": 0.9977871924638748, "mean_gen_accuracy": 0.8940413147211075, "mean_token_accuracy": 0.9164036363363266, "num_tokens": 374314847.0, "sample_num_tokens": 8065.25, "step": 9882, "total_num_tokens": 374347108.0, "z_loss": 0.00044044930837117136 }, { "copy_logits_max": -1.3981897830963135, "copy_logits_min": -750000000.0, "copy_num_tokens": 416.1875, "epoch": 2.018381414347715, "gen_logits_max": 3.595266819000244, "gen_logits_mean": -17.47878074645996, "gen_logits_min": -29.927204132080078, "gen_logits_std": 3.590144157409668, "gen_loss": 0.22562864422798157, "grad_norm": 0.3642063867287724, "learning_rate": 1.844589473684211e-05, "loss": 0.222, "mean_copy_accuracy": 0.9964070916175842, "mean_gen_accuracy": 0.8959807902574539, "mean_token_accuracy": 0.9217175394296646, "num_tokens": 374597606.0, "sample_num_tokens": 8069.0, "step": 9883, "total_num_tokens": 374629882.0, "z_loss": 0.0004339540610089898 }, { "copy_logits_max": 0.7409087419509888, "copy_logits_min": -750000000.0, "copy_num_tokens": 527.625, "epoch": 2.0185856522849117, "gen_logits_max": 3.654531478881836, "gen_logits_mean": -16.945053100585938, "gen_logits_min": -29.767702102661133, "gen_logits_std": 3.547152280807495, "gen_loss": 0.2151734083890915, "grad_norm": 0.33792625986453634, "learning_rate": 1.844463157894737e-05, "loss": 0.2183, "mean_copy_accuracy": 0.9976273030042648, "mean_gen_accuracy": 0.9006140381097794, "mean_token_accuracy": 0.9223776161670685, "num_tokens": 374882529.0, "sample_num_tokens": 9580.25, "step": 9884, "total_num_tokens": 374920850.0, "z_loss": 0.00042410718742758036 }, { "copy_logits_max": -1.653020977973938, "copy_logits_min": -750000000.0, "copy_num_tokens": 491.125, "epoch": 2.018789890222109, "gen_logits_max": 2.538940906524658, "gen_logits_mean": -18.829729080200195, "gen_logits_min": -31.5849552154541, "gen_logits_std": 3.6196813583374023, "gen_loss": 0.24276329576969147, "grad_norm": 0.34579704408126505, "learning_rate": 1.8443368421052634e-05, "loss": 0.2324, "mean_copy_accuracy": 0.9981965124607086, "mean_gen_accuracy": 0.8898163437843323, "mean_token_accuracy": 0.9176361858844757, "num_tokens": 375185950.0, "sample_num_tokens": 8635.5, "step": 9885, "total_num_tokens": 375220492.0, "z_loss": 0.0005259050521999598 }, { "copy_logits_max": -1.7400505542755127, "copy_logits_min": -750000000.0, "copy_num_tokens": 477.25, "epoch": 2.0189941281593056, "gen_logits_max": 3.993244171142578, "gen_logits_mean": -17.175079345703125, "gen_logits_min": -29.553730010986328, "gen_logits_std": 3.5352530479431152, "gen_loss": 0.23636803030967712, "grad_norm": 0.35836394794111565, "learning_rate": 1.8442105263157895e-05, "loss": 0.2294, "mean_copy_accuracy": 0.9980104118585587, "mean_gen_accuracy": 0.8925431668758392, "mean_token_accuracy": 0.918743908405304, "num_tokens": 375457193.0, "sample_num_tokens": 8638.75, "step": 9886, "total_num_tokens": 375491748.0, "z_loss": 0.0004910330171696842 }, { "copy_logits_max": -1.7142958641052246, "copy_logits_min": -750000000.0, "copy_num_tokens": 505.125, "epoch": 2.0191983660965023, "gen_logits_max": 2.5614001750946045, "gen_logits_mean": -19.07621192932129, "gen_logits_min": -31.717771530151367, "gen_logits_std": 3.660343885421753, "gen_loss": 0.20450854301452637, "grad_norm": 0.3840412582047212, "learning_rate": 1.844084210526316e-05, "loss": 0.2249, "mean_copy_accuracy": 0.9981402903795242, "mean_gen_accuracy": 0.8947586566209793, "mean_token_accuracy": 0.9205851256847382, "num_tokens": 375726793.0, "sample_num_tokens": 8740.25, "step": 9887, "total_num_tokens": 375761754.0, "z_loss": 0.00036997650749981403 }, { "copy_logits_max": -4.164531707763672, "copy_logits_min": -750000000.0, "copy_num_tokens": 391.3125, "epoch": 2.0194026040336994, "gen_logits_max": 2.60024356842041, "gen_logits_mean": -19.349830627441406, "gen_logits_min": -32.021568298339844, "gen_logits_std": 3.6561944484710693, "gen_loss": 0.2236224114894867, "grad_norm": 0.34986235747959454, "learning_rate": 1.843957894736842e-05, "loss": 0.2213, "mean_copy_accuracy": 0.9975135177373886, "mean_gen_accuracy": 0.8978204429149628, "mean_token_accuracy": 0.9219266027212143, "num_tokens": 375999743.0, "sample_num_tokens": 7504.75, "step": 9888, "total_num_tokens": 376029762.0, "z_loss": 0.00039601651951670647 }, { "copy_logits_max": -2.318016529083252, "copy_logits_min": -750000000.0, "copy_num_tokens": 686.25, "epoch": 2.019606841970896, "gen_logits_max": 2.815225601196289, "gen_logits_mean": -16.980615615844727, "gen_logits_min": -29.91281509399414, "gen_logits_std": 3.5353026390075684, "gen_loss": 0.19841404259204865, "grad_norm": 0.3646019097705739, "learning_rate": 1.8438315789473684e-05, "loss": 0.2209, "mean_copy_accuracy": 0.997552677989006, "mean_gen_accuracy": 0.8947495520114899, "mean_token_accuracy": 0.9213204085826874, "num_tokens": 376269141.0, "sample_num_tokens": 9528.75, "step": 9889, "total_num_tokens": 376307256.0, "z_loss": 0.00032173324143514037 }, { "copy_logits_max": -5.271095275878906, "copy_logits_min": -750000064.0, "copy_num_tokens": 532.125, "epoch": 2.019811079908093, "gen_logits_max": 3.0979723930358887, "gen_logits_mean": -17.359838485717773, "gen_logits_min": -29.927841186523438, "gen_logits_std": 3.568207263946533, "gen_loss": 0.22287267446517944, "grad_norm": 0.3548113085290579, "learning_rate": 1.8437052631578945e-05, "loss": 0.2357, "mean_copy_accuracy": 0.9973045289516449, "mean_gen_accuracy": 0.8913449198007584, "mean_token_accuracy": 0.916593998670578, "num_tokens": 376551705.0, "sample_num_tokens": 9084.25, "step": 9890, "total_num_tokens": 376588042.0, "z_loss": 0.00038451547152362764 }, { "copy_logits_max": -4.402266025543213, "copy_logits_min": -750000000.0, "copy_num_tokens": 402.25, "epoch": 2.02001531784529, "gen_logits_max": 3.5432348251342773, "gen_logits_mean": -17.97069549560547, "gen_logits_min": -30.369606018066406, "gen_logits_std": 3.6027259826660156, "gen_loss": 0.21833181381225586, "grad_norm": 0.33973225662650886, "learning_rate": 1.8435789473684213e-05, "loss": 0.2186, "mean_copy_accuracy": 0.997282087802887, "mean_gen_accuracy": 0.9019956439733505, "mean_token_accuracy": 0.9229916781187057, "num_tokens": 376812618.0, "sample_num_tokens": 7990.5, "step": 9891, "total_num_tokens": 376844580.0, "z_loss": 0.00036839215317741036 }, { "copy_logits_max": -2.8386476039886475, "copy_logits_min": -687500032.0, "copy_num_tokens": 307.75, "epoch": 2.0202195557824867, "gen_logits_max": 4.366971969604492, "gen_logits_mean": -17.885202407836914, "gen_logits_min": -30.400978088378906, "gen_logits_std": 3.6059951782226562, "gen_loss": 0.24472512304782867, "grad_norm": 0.35331672930348623, "learning_rate": 1.8434526315789474e-05, "loss": 0.2183, "mean_copy_accuracy": 0.9978824406862259, "mean_gen_accuracy": 0.8976760655641556, "mean_token_accuracy": 0.92367222905159, "num_tokens": 377077375.0, "sample_num_tokens": 7368.75, "step": 9892, "total_num_tokens": 377106850.0, "z_loss": 0.00036482923314906657 }, { "copy_logits_max": -5.612761974334717, "copy_logits_min": -750000000.0, "copy_num_tokens": 403.625, "epoch": 2.0204237937196834, "gen_logits_max": 3.504082679748535, "gen_logits_mean": -17.83643341064453, "gen_logits_min": -30.713733673095703, "gen_logits_std": 3.5734777450561523, "gen_loss": 0.22705379128456116, "grad_norm": 0.37391805001804956, "learning_rate": 1.8433263157894738e-05, "loss": 0.2335, "mean_copy_accuracy": 0.9973787516355515, "mean_gen_accuracy": 0.8938586860895157, "mean_token_accuracy": 0.9194155633449554, "num_tokens": 377357432.0, "sample_num_tokens": 8424.0, "step": 9893, "total_num_tokens": 377391128.0, "z_loss": 0.000372190261259675 }, { "copy_logits_max": -6.319336891174316, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.8125, "epoch": 2.02062803165688, "gen_logits_max": 3.4350640773773193, "gen_logits_mean": -17.782154083251953, "gen_logits_min": -30.360694885253906, "gen_logits_std": 3.5933218002319336, "gen_loss": 0.2595115005970001, "grad_norm": 0.3579362956627066, "learning_rate": 1.8432e-05, "loss": 0.2258, "mean_copy_accuracy": 0.9968277961015701, "mean_gen_accuracy": 0.8961290419101715, "mean_token_accuracy": 0.9201507717370987, "num_tokens": 377631606.0, "sample_num_tokens": 8386.0, "step": 9894, "total_num_tokens": 377665150.0, "z_loss": 0.0003780905972234905 }, { "copy_logits_max": -5.4685869216918945, "copy_logits_min": -687500032.0, "copy_num_tokens": 561.875, "epoch": 2.0208322695940772, "gen_logits_max": 3.496328115463257, "gen_logits_mean": -17.2536678314209, "gen_logits_min": -30.08938217163086, "gen_logits_std": 3.5906896591186523, "gen_loss": 0.2076970338821411, "grad_norm": 0.373218409153873, "learning_rate": 1.8430736842105263e-05, "loss": 0.2202, "mean_copy_accuracy": 0.9979929625988007, "mean_gen_accuracy": 0.8954620659351349, "mean_token_accuracy": 0.9233731031417847, "num_tokens": 377925053.0, "sample_num_tokens": 8861.75, "step": 9895, "total_num_tokens": 377960500.0, "z_loss": 0.0003361128328833729 }, { "copy_logits_max": -3.7642650604248047, "copy_logits_min": -687500032.0, "copy_num_tokens": 579.0, "epoch": 2.021036507531274, "gen_logits_max": 3.166581392288208, "gen_logits_mean": -18.45101547241211, "gen_logits_min": -31.140724182128906, "gen_logits_std": 3.6451401710510254, "gen_loss": 0.2455040067434311, "grad_norm": 0.3736113601299631, "learning_rate": 1.8429473684210528e-05, "loss": 0.2349, "mean_copy_accuracy": 0.997356042265892, "mean_gen_accuracy": 0.8942786902189255, "mean_token_accuracy": 0.9195120334625244, "num_tokens": 378189988.0, "sample_num_tokens": 8858.0, "step": 9896, "total_num_tokens": 378225420.0, "z_loss": 0.0004549868172034621 }, { "copy_logits_max": -3.096066951751709, "copy_logits_min": -750000000.0, "copy_num_tokens": 515.25, "epoch": 2.0212407454684707, "gen_logits_max": 3.717362880706787, "gen_logits_mean": -17.355731964111328, "gen_logits_min": -29.649669647216797, "gen_logits_std": 3.579852819442749, "gen_loss": 0.21593746542930603, "grad_norm": 0.3805701057334636, "learning_rate": 1.842821052631579e-05, "loss": 0.2256, "mean_copy_accuracy": 0.9966500401496887, "mean_gen_accuracy": 0.8971039950847626, "mean_token_accuracy": 0.9208968579769135, "num_tokens": 378455034.0, "sample_num_tokens": 9028.5, "step": 9897, "total_num_tokens": 378491148.0, "z_loss": 0.00033612753031775355 }, { "copy_logits_max": -2.654129981994629, "copy_logits_min": -750000000.0, "copy_num_tokens": 629.6875, "epoch": 2.021444983405668, "gen_logits_max": 3.3775339126586914, "gen_logits_mean": -17.03292465209961, "gen_logits_min": -29.61646270751953, "gen_logits_std": 3.5528624057769775, "gen_loss": 0.21023693680763245, "grad_norm": 0.37195145781235434, "learning_rate": 1.8426947368421053e-05, "loss": 0.2166, "mean_copy_accuracy": 0.9976223707199097, "mean_gen_accuracy": 0.8970666974782944, "mean_token_accuracy": 0.9239615947008133, "num_tokens": 378717536.0, "sample_num_tokens": 9637.0, "step": 9898, "total_num_tokens": 378756084.0, "z_loss": 0.000411936838645488 }, { "copy_logits_max": -2.367875814437866, "copy_logits_min": -687500032.0, "copy_num_tokens": 664.5625, "epoch": 2.0216492213428645, "gen_logits_max": 2.4316701889038086, "gen_logits_mean": -18.187196731567383, "gen_logits_min": -30.91907501220703, "gen_logits_std": 3.6117959022521973, "gen_loss": 0.20680615305900574, "grad_norm": 0.3770562319193425, "learning_rate": 1.8425684210526317e-05, "loss": 0.2434, "mean_copy_accuracy": 0.9976108223199844, "mean_gen_accuracy": 0.8887735307216644, "mean_token_accuracy": 0.9138822704553604, "num_tokens": 378994446.0, "sample_num_tokens": 9429.0, "step": 9899, "total_num_tokens": 379032162.0, "z_loss": 0.0004367380461189896 }, { "copy_logits_max": -5.035593032836914, "copy_logits_min": -687500032.0, "copy_num_tokens": 430.0, "epoch": 2.021853459280061, "gen_logits_max": 3.2084193229675293, "gen_logits_mean": -18.503482818603516, "gen_logits_min": -31.047256469726562, "gen_logits_std": 3.633573055267334, "gen_loss": 0.23663213849067688, "grad_norm": 0.3524091555481791, "learning_rate": 1.842442105263158e-05, "loss": 0.2121, "mean_copy_accuracy": 0.9977474212646484, "mean_gen_accuracy": 0.8986944854259491, "mean_token_accuracy": 0.9253229200839996, "num_tokens": 379264205.0, "sample_num_tokens": 8268.25, "step": 9900, "total_num_tokens": 379297278.0, "z_loss": 0.00039158464642241597 }, { "copy_logits_max": -7.317686557769775, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.0, "epoch": 2.022057697217258, "gen_logits_max": 3.4925265312194824, "gen_logits_mean": -19.606212615966797, "gen_logits_min": -31.87036895751953, "gen_logits_std": 3.667171001434326, "gen_loss": 0.24020960927009583, "grad_norm": 0.36399273402354304, "learning_rate": 1.8423157894736842e-05, "loss": 0.2362, "mean_copy_accuracy": 0.9963885843753815, "mean_gen_accuracy": 0.8972150087356567, "mean_token_accuracy": 0.91694675385952, "num_tokens": 379535651.0, "sample_num_tokens": 8649.25, "step": 9901, "total_num_tokens": 379570248.0, "z_loss": 0.0004249716876074672 }, { "copy_logits_max": -2.3664684295654297, "copy_logits_min": -687500032.0, "copy_num_tokens": 658.0, "epoch": 2.022261935154455, "gen_logits_max": 2.6579532623291016, "gen_logits_mean": -17.271705627441406, "gen_logits_min": -30.397319793701172, "gen_logits_std": 3.567256450653076, "gen_loss": 0.21607859432697296, "grad_norm": 0.3592358293582621, "learning_rate": 1.8421894736842107e-05, "loss": 0.2238, "mean_copy_accuracy": 0.9973543733358383, "mean_gen_accuracy": 0.8939987272024155, "mean_token_accuracy": 0.9219237267971039, "num_tokens": 379837742.0, "sample_num_tokens": 9542.5, "step": 9902, "total_num_tokens": 379875912.0, "z_loss": 0.0004009940894320607 }, { "copy_logits_max": -1.1644716262817383, "copy_logits_min": -687500032.0, "copy_num_tokens": 571.1875, "epoch": 2.0224661730916518, "gen_logits_max": 3.630309581756592, "gen_logits_mean": -16.727962493896484, "gen_logits_min": -29.667705535888672, "gen_logits_std": 3.538621425628662, "gen_loss": 0.22081278264522552, "grad_norm": 0.37699573223337635, "learning_rate": 1.8420631578947368e-05, "loss": 0.2266, "mean_copy_accuracy": 0.9972608983516693, "mean_gen_accuracy": 0.8941072523593903, "mean_token_accuracy": 0.9197950959205627, "num_tokens": 380099588.0, "sample_num_tokens": 10437.5, "step": 9903, "total_num_tokens": 380141338.0, "z_loss": 0.0003555073926690966 }, { "copy_logits_max": -4.4658942222595215, "copy_logits_min": -750000000.0, "copy_num_tokens": 434.375, "epoch": 2.0226704110288485, "gen_logits_max": 2.875422239303589, "gen_logits_mean": -18.22854995727539, "gen_logits_min": -31.328542709350586, "gen_logits_std": 3.6170239448547363, "gen_loss": 0.21510568261146545, "grad_norm": 0.351336754042427, "learning_rate": 1.8419368421052632e-05, "loss": 0.2284, "mean_copy_accuracy": 0.9978175014257431, "mean_gen_accuracy": 0.8942428678274155, "mean_token_accuracy": 0.9207302778959274, "num_tokens": 380385723.0, "sample_num_tokens": 8044.75, "step": 9904, "total_num_tokens": 380417902.0, "z_loss": 0.0003476952842902392 }, { "copy_logits_max": -6.230531692504883, "copy_logits_min": -750000000.0, "copy_num_tokens": 569.5625, "epoch": 2.0228746489660456, "gen_logits_max": 2.2103724479675293, "gen_logits_mean": -19.80591583251953, "gen_logits_min": -32.13145065307617, "gen_logits_std": 3.676050901412964, "gen_loss": 0.2067742943763733, "grad_norm": 0.3276085874597246, "learning_rate": 1.8418105263157893e-05, "loss": 0.206, "mean_copy_accuracy": 0.9975263476371765, "mean_gen_accuracy": 0.9015516489744186, "mean_token_accuracy": 0.9284887909889221, "num_tokens": 380650686.0, "sample_num_tokens": 9454.5, "step": 9905, "total_num_tokens": 380688504.0, "z_loss": 0.00035514490446075797 }, { "copy_logits_max": -4.059976100921631, "copy_logits_min": -687500032.0, "copy_num_tokens": 560.4375, "epoch": 2.0230788869032423, "gen_logits_max": 2.7095608711242676, "gen_logits_mean": -18.5242919921875, "gen_logits_min": -30.82225227355957, "gen_logits_std": 3.626805305480957, "gen_loss": 0.23627927899360657, "grad_norm": 0.35503699150530543, "learning_rate": 1.8416842105263157e-05, "loss": 0.2293, "mean_copy_accuracy": 0.9976173043251038, "mean_gen_accuracy": 0.8925229460000992, "mean_token_accuracy": 0.9201874136924744, "num_tokens": 380927207.0, "sample_num_tokens": 9228.25, "step": 9906, "total_num_tokens": 380964120.0, "z_loss": 0.0004001391353085637 }, { "copy_logits_max": -5.088318347930908, "copy_logits_min": -750000000.0, "copy_num_tokens": 448.625, "epoch": 2.023283124840439, "gen_logits_max": 3.068366527557373, "gen_logits_mean": -18.63547134399414, "gen_logits_min": -31.383197784423828, "gen_logits_std": 3.640599250793457, "gen_loss": 0.2089950442314148, "grad_norm": 0.3562055510065923, "learning_rate": 1.841557894736842e-05, "loss": 0.2257, "mean_copy_accuracy": 0.9972254633903503, "mean_gen_accuracy": 0.890975832939148, "mean_token_accuracy": 0.921180248260498, "num_tokens": 381219318.0, "sample_num_tokens": 8501.0, "step": 9907, "total_num_tokens": 381253322.0, "z_loss": 0.0003650156722869724 }, { "copy_logits_max": -3.8417842388153076, "copy_logits_min": -750000000.0, "copy_num_tokens": 300.5, "epoch": 2.0234873627776357, "gen_logits_max": 3.6009626388549805, "gen_logits_mean": -17.535322189331055, "gen_logits_min": -30.01772689819336, "gen_logits_std": 3.5459651947021484, "gen_loss": 0.23653559386730194, "grad_norm": 0.3795286908268887, "learning_rate": 1.8414315789473686e-05, "loss": 0.2389, "mean_copy_accuracy": 0.9962079972028732, "mean_gen_accuracy": 0.8929729461669922, "mean_token_accuracy": 0.915609285235405, "num_tokens": 381479031.0, "sample_num_tokens": 6511.25, "step": 9908, "total_num_tokens": 381505076.0, "z_loss": 0.000399683223804459 }, { "copy_logits_max": -1.0692569017410278, "copy_logits_min": -750000064.0, "copy_num_tokens": 590.375, "epoch": 2.023691600714833, "gen_logits_max": 3.2439072132110596, "gen_logits_mean": -17.16034698486328, "gen_logits_min": -30.034332275390625, "gen_logits_std": 3.581631660461426, "gen_loss": 0.21333011984825134, "grad_norm": 0.3906569418495073, "learning_rate": 1.841305263157895e-05, "loss": 0.2341, "mean_copy_accuracy": 0.998184934258461, "mean_gen_accuracy": 0.8909498602151871, "mean_token_accuracy": 0.9172961860895157, "num_tokens": 381735386.0, "sample_num_tokens": 8912.0, "step": 9909, "total_num_tokens": 381771034.0, "z_loss": 0.0003735937352757901 }, { "copy_logits_max": -3.732126235961914, "copy_logits_min": -750000000.0, "copy_num_tokens": 312.25, "epoch": 2.0238958386520296, "gen_logits_max": 4.015322208404541, "gen_logits_mean": -18.105327606201172, "gen_logits_min": -30.640308380126953, "gen_logits_std": 3.6070845127105713, "gen_loss": 0.23059040307998657, "grad_norm": 0.3587727151427517, "learning_rate": 1.841178947368421e-05, "loss": 0.2343, "mean_copy_accuracy": 0.9966111034154892, "mean_gen_accuracy": 0.8965931832790375, "mean_token_accuracy": 0.9182200282812119, "num_tokens": 382000735.0, "sample_num_tokens": 7236.25, "step": 9910, "total_num_tokens": 382029680.0, "z_loss": 0.0004426585219334811 }, { "copy_logits_max": -2.055269241333008, "copy_logits_min": -750000000.0, "copy_num_tokens": 354.125, "epoch": 2.0241000765892263, "gen_logits_max": 2.933983087539673, "gen_logits_mean": -19.218650817871094, "gen_logits_min": -31.542755126953125, "gen_logits_std": 3.6516993045806885, "gen_loss": 0.235252246260643, "grad_norm": 0.3277169708673342, "learning_rate": 1.8410526315789475e-05, "loss": 0.2175, "mean_copy_accuracy": 0.9974223226308823, "mean_gen_accuracy": 0.9006834328174591, "mean_token_accuracy": 0.9245027154684067, "num_tokens": 382268734.0, "sample_num_tokens": 7211.0, "step": 9911, "total_num_tokens": 382297578.0, "z_loss": 0.00042919276165775955 }, { "copy_logits_max": -1.241217851638794, "copy_logits_min": -750000000.0, "copy_num_tokens": 459.0, "epoch": 2.0243043145264235, "gen_logits_max": 4.983911514282227, "gen_logits_mean": -16.273784637451172, "gen_logits_min": -29.208433151245117, "gen_logits_std": 3.501526355743408, "gen_loss": 0.2512887716293335, "grad_norm": 0.36653524569284657, "learning_rate": 1.8409263157894736e-05, "loss": 0.2262, "mean_copy_accuracy": 0.9974970072507858, "mean_gen_accuracy": 0.897579163312912, "mean_token_accuracy": 0.9217033088207245, "num_tokens": 382520184.0, "sample_num_tokens": 8591.0, "step": 9912, "total_num_tokens": 382554548.0, "z_loss": 0.0004861382767558098 }, { "copy_logits_max": -7.180281639099121, "copy_logits_min": -750000000.0, "copy_num_tokens": 407.8125, "epoch": 2.02450855246362, "gen_logits_max": 2.964676856994629, "gen_logits_mean": -19.868019104003906, "gen_logits_min": -32.47894287109375, "gen_logits_std": 3.687314510345459, "gen_loss": 0.2377355992794037, "grad_norm": 0.36904623859504027, "learning_rate": 1.8408e-05, "loss": 0.2434, "mean_copy_accuracy": 0.9970528185367584, "mean_gen_accuracy": 0.8929847478866577, "mean_token_accuracy": 0.9144954979419708, "num_tokens": 382789157.0, "sample_num_tokens": 8134.75, "step": 9913, "total_num_tokens": 382821696.0, "z_loss": 0.00044070143485441804 }, { "copy_logits_max": -1.9443658590316772, "copy_logits_min": -750000000.0, "copy_num_tokens": 579.75, "epoch": 2.024712790400817, "gen_logits_max": 4.0062103271484375, "gen_logits_mean": -15.99659252166748, "gen_logits_min": -28.724613189697266, "gen_logits_std": 3.5071372985839844, "gen_loss": 0.2285403162240982, "grad_norm": 0.35960720132911345, "learning_rate": 1.840673684210526e-05, "loss": 0.2287, "mean_copy_accuracy": 0.9970563501119614, "mean_gen_accuracy": 0.8957203924655914, "mean_token_accuracy": 0.9193249344825745, "num_tokens": 383051028.0, "sample_num_tokens": 9116.5, "step": 9914, "total_num_tokens": 383087494.0, "z_loss": 0.00042036897502839565 }, { "copy_logits_max": -4.048228740692139, "copy_logits_min": -750000000.0, "copy_num_tokens": 321.8125, "epoch": 2.0249170283380136, "gen_logits_max": 4.173548698425293, "gen_logits_mean": -17.362388610839844, "gen_logits_min": -29.56509017944336, "gen_logits_std": 3.5555191040039062, "gen_loss": 0.24212592840194702, "grad_norm": 0.3461160746112835, "learning_rate": 1.840547368421053e-05, "loss": 0.2374, "mean_copy_accuracy": 0.9962740689516068, "mean_gen_accuracy": 0.8909590095281601, "mean_token_accuracy": 0.9164854139089584, "num_tokens": 383312985.0, "sample_num_tokens": 6928.75, "step": 9915, "total_num_tokens": 383340700.0, "z_loss": 0.00043127278331667185 }, { "copy_logits_max": -4.496777057647705, "copy_logits_min": -687500032.0, "copy_num_tokens": 539.8125, "epoch": 2.0251212662752107, "gen_logits_max": 3.3450398445129395, "gen_logits_mean": -17.842578887939453, "gen_logits_min": -30.491561889648438, "gen_logits_std": 3.589200496673584, "gen_loss": 0.2178390622138977, "grad_norm": 0.3757967922352889, "learning_rate": 1.840421052631579e-05, "loss": 0.2283, "mean_copy_accuracy": 0.9978504478931427, "mean_gen_accuracy": 0.89327771961689, "mean_token_accuracy": 0.9197804480791092, "num_tokens": 383580945.0, "sample_num_tokens": 9212.75, "step": 9916, "total_num_tokens": 383617796.0, "z_loss": 0.00039362331153824925 }, { "copy_logits_max": -6.574028968811035, "copy_logits_min": -750000000.0, "copy_num_tokens": 439.375, "epoch": 2.0253255042124074, "gen_logits_max": 2.418224334716797, "gen_logits_mean": -19.996601104736328, "gen_logits_min": -32.232505798339844, "gen_logits_std": 3.6786224842071533, "gen_loss": 0.21935220062732697, "grad_norm": 0.3432714640719814, "learning_rate": 1.8402947368421054e-05, "loss": 0.2201, "mean_copy_accuracy": 0.9978507459163666, "mean_gen_accuracy": 0.8982512056827545, "mean_token_accuracy": 0.9226548224687576, "num_tokens": 383856992.0, "sample_num_tokens": 8517.5, "step": 9917, "total_num_tokens": 383891062.0, "z_loss": 0.0003968040691688657 }, { "copy_logits_max": -5.228417873382568, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.0625, "epoch": 2.025529742149604, "gen_logits_max": 3.4089670181274414, "gen_logits_mean": -19.105056762695312, "gen_logits_min": -31.556255340576172, "gen_logits_std": 3.63896107673645, "gen_loss": 0.2294708490371704, "grad_norm": 0.36453273937473724, "learning_rate": 1.8401684210526315e-05, "loss": 0.2357, "mean_copy_accuracy": 0.9968117475509644, "mean_gen_accuracy": 0.8964918553829193, "mean_token_accuracy": 0.9166894555091858, "num_tokens": 384131281.0, "sample_num_tokens": 8195.75, "step": 9918, "total_num_tokens": 384164064.0, "z_loss": 0.00044115079799667 }, { "copy_logits_max": -4.0250372886657715, "copy_logits_min": -750000000.0, "copy_num_tokens": 415.5, "epoch": 2.0257339800868013, "gen_logits_max": 4.754461765289307, "gen_logits_mean": -16.112897872924805, "gen_logits_min": -28.80328369140625, "gen_logits_std": 3.4958112239837646, "gen_loss": 0.2165282964706421, "grad_norm": 0.33902925498384484, "learning_rate": 1.840042105263158e-05, "loss": 0.2224, "mean_copy_accuracy": 0.997955247759819, "mean_gen_accuracy": 0.8932046592235565, "mean_token_accuracy": 0.9214693158864975, "num_tokens": 384414944.0, "sample_num_tokens": 8231.5, "step": 9919, "total_num_tokens": 384447870.0, "z_loss": 0.0003538173041306436 }, { "copy_logits_max": -2.6890203952789307, "copy_logits_min": -750000000.0, "copy_num_tokens": 454.4375, "epoch": 2.025938218023998, "gen_logits_max": 3.287285327911377, "gen_logits_mean": -17.895015716552734, "gen_logits_min": -30.472896575927734, "gen_logits_std": 3.5901412963867188, "gen_loss": 0.252114474773407, "grad_norm": 0.3668900036536347, "learning_rate": 1.839915789473684e-05, "loss": 0.2416, "mean_copy_accuracy": 0.9966835528612137, "mean_gen_accuracy": 0.8926368057727814, "mean_token_accuracy": 0.9147969335317612, "num_tokens": 384675482.0, "sample_num_tokens": 7962.0, "step": 9920, "total_num_tokens": 384707330.0, "z_loss": 0.00044777296716347337 }, { "copy_logits_max": -3.4097280502319336, "copy_logits_min": -750000000.0, "copy_num_tokens": 558.0, "epoch": 2.0261424559611947, "gen_logits_max": 3.605064630508423, "gen_logits_mean": -17.436702728271484, "gen_logits_min": -30.05399513244629, "gen_logits_std": 3.574409008026123, "gen_loss": 0.21649786829948425, "grad_norm": 0.37733303411945307, "learning_rate": 1.8397894736842105e-05, "loss": 0.2329, "mean_copy_accuracy": 0.9969289749860764, "mean_gen_accuracy": 0.8925624936819077, "mean_token_accuracy": 0.9185228496789932, "num_tokens": 384942178.0, "sample_num_tokens": 8701.0, "step": 9921, "total_num_tokens": 384976982.0, "z_loss": 0.0004094931937288493 }, { "copy_logits_max": -3.2802233695983887, "copy_logits_min": -687500032.0, "copy_num_tokens": 416.5625, "epoch": 2.026346693898392, "gen_logits_max": 4.3612165451049805, "gen_logits_mean": -16.380340576171875, "gen_logits_min": -28.788074493408203, "gen_logits_std": 3.5057685375213623, "gen_loss": 0.26481783390045166, "grad_norm": 0.36599226051493094, "learning_rate": 1.839663157894737e-05, "loss": 0.2425, "mean_copy_accuracy": 0.9971765577793121, "mean_gen_accuracy": 0.888105183839798, "mean_token_accuracy": 0.9151023477315903, "num_tokens": 385198766.0, "sample_num_tokens": 8520.0, "step": 9922, "total_num_tokens": 385232846.0, "z_loss": 0.0005001329118385911 }, { "copy_logits_max": -1.0824832916259766, "copy_logits_min": -750000000.0, "copy_num_tokens": 507.9375, "epoch": 2.0265509318355885, "gen_logits_max": 2.7794346809387207, "gen_logits_mean": -17.801294326782227, "gen_logits_min": -30.646472930908203, "gen_logits_std": 3.6087076663970947, "gen_loss": 0.22629079222679138, "grad_norm": 0.3572782362962347, "learning_rate": 1.839536842105263e-05, "loss": 0.2213, "mean_copy_accuracy": 0.9976667612791061, "mean_gen_accuracy": 0.8958197236061096, "mean_token_accuracy": 0.9218890964984894, "num_tokens": 385459284.0, "sample_num_tokens": 8273.5, "step": 9923, "total_num_tokens": 385492378.0, "z_loss": 0.0004152458568569273 }, { "copy_logits_max": -3.7584729194641113, "copy_logits_min": -750000000.0, "copy_num_tokens": 287.6875, "epoch": 2.0267551697727852, "gen_logits_max": 3.9571568965911865, "gen_logits_mean": -17.679515838623047, "gen_logits_min": -30.172164916992188, "gen_logits_std": 3.5790843963623047, "gen_loss": 0.2525526285171509, "grad_norm": 0.3768678899952898, "learning_rate": 1.8394105263157898e-05, "loss": 0.2422, "mean_copy_accuracy": 0.9959754347801208, "mean_gen_accuracy": 0.894824281334877, "mean_token_accuracy": 0.9149302244186401, "num_tokens": 385700059.0, "sample_num_tokens": 6985.25, "step": 9924, "total_num_tokens": 385728000.0, "z_loss": 0.0004352233372628689 }, { "copy_logits_max": -4.61245584487915, "copy_logits_min": -750000000.0, "copy_num_tokens": 460.8125, "epoch": 2.026959407709982, "gen_logits_max": 2.214738368988037, "gen_logits_mean": -19.7605037689209, "gen_logits_min": -32.45214080810547, "gen_logits_std": 3.725802421569824, "gen_loss": 0.2093517929315567, "grad_norm": 0.38484111168885066, "learning_rate": 1.839284210526316e-05, "loss": 0.2312, "mean_copy_accuracy": 0.9975987523794174, "mean_gen_accuracy": 0.8950743973255157, "mean_token_accuracy": 0.9192391782999039, "num_tokens": 385950162.0, "sample_num_tokens": 8091.5, "step": 9925, "total_num_tokens": 385982528.0, "z_loss": 0.0003824652230832726 }, { "copy_logits_max": -2.3817825317382812, "copy_logits_min": -687500032.0, "copy_num_tokens": 568.4375, "epoch": 2.027163645647179, "gen_logits_max": 2.6949775218963623, "gen_logits_mean": -18.171119689941406, "gen_logits_min": -30.703994750976562, "gen_logits_std": 3.6530070304870605, "gen_loss": 0.2213805913925171, "grad_norm": 0.343858910206618, "learning_rate": 1.8391578947368423e-05, "loss": 0.2097, "mean_copy_accuracy": 0.9972489476203918, "mean_gen_accuracy": 0.9025892466306686, "mean_token_accuracy": 0.926960289478302, "num_tokens": 386223832.0, "sample_num_tokens": 9092.5, "step": 9926, "total_num_tokens": 386260202.0, "z_loss": 0.0003657422203104943 }, { "copy_logits_max": 2.568906307220459, "copy_logits_min": -750000128.0, "copy_num_tokens": 549.875, "epoch": 2.027367883584376, "gen_logits_max": 4.34096622467041, "gen_logits_mean": -16.46890640258789, "gen_logits_min": -29.215686798095703, "gen_logits_std": 3.5611462593078613, "gen_loss": 0.2185194194316864, "grad_norm": 0.3659063045330686, "learning_rate": 1.8390315789473684e-05, "loss": 0.2259, "mean_copy_accuracy": 0.9968286454677582, "mean_gen_accuracy": 0.8968989551067352, "mean_token_accuracy": 0.9208913892507553, "num_tokens": 386514838.0, "sample_num_tokens": 9546.0, "step": 9927, "total_num_tokens": 386553022.0, "z_loss": 0.00033436634112149477 }, { "copy_logits_max": 0.06407874822616577, "copy_logits_min": -750000000.0, "copy_num_tokens": 806.3125, "epoch": 2.0275721215215725, "gen_logits_max": 3.2119827270507812, "gen_logits_mean": -16.163494110107422, "gen_logits_min": -28.821453094482422, "gen_logits_std": 3.5518436431884766, "gen_loss": 0.22365827858448029, "grad_norm": 0.3510345461802253, "learning_rate": 1.838905263157895e-05, "loss": 0.2178, "mean_copy_accuracy": 0.9975704848766327, "mean_gen_accuracy": 0.894175797700882, "mean_token_accuracy": 0.9236569106578827, "num_tokens": 386801968.0, "sample_num_tokens": 10831.0, "step": 9928, "total_num_tokens": 386845292.0, "z_loss": 0.0003572292625904083 }, { "copy_logits_max": 0.17740660905838013, "copy_logits_min": -750000064.0, "copy_num_tokens": 384.25, "epoch": 2.0277763594587697, "gen_logits_max": 4.1805925369262695, "gen_logits_mean": -17.39946746826172, "gen_logits_min": -30.060705184936523, "gen_logits_std": 3.60579252243042, "gen_loss": 0.20574533939361572, "grad_norm": 0.37074234789171456, "learning_rate": 1.838778947368421e-05, "loss": 0.2213, "mean_copy_accuracy": 0.9970279335975647, "mean_gen_accuracy": 0.9039719998836517, "mean_token_accuracy": 0.9227954745292664, "num_tokens": 387060783.0, "sample_num_tokens": 8226.75, "step": 9929, "total_num_tokens": 387093690.0, "z_loss": 0.0003786483430303633 }, { "copy_logits_max": -3.3026702404022217, "copy_logits_min": -687500032.0, "copy_num_tokens": 318.9375, "epoch": 2.0279805973959664, "gen_logits_max": 3.536510467529297, "gen_logits_mean": -18.276077270507812, "gen_logits_min": -30.766786575317383, "gen_logits_std": 3.6236953735351562, "gen_loss": 0.2356526255607605, "grad_norm": 0.37285283879013825, "learning_rate": 1.8386526315789474e-05, "loss": 0.2295, "mean_copy_accuracy": 0.9974103718996048, "mean_gen_accuracy": 0.8973834812641144, "mean_token_accuracy": 0.9189772605895996, "num_tokens": 387306464.0, "sample_num_tokens": 7243.0, "step": 9930, "total_num_tokens": 387335436.0, "z_loss": 0.0003437016566749662 }, { "copy_logits_max": -3.505532741546631, "copy_logits_min": -687500032.0, "copy_num_tokens": 536.1875, "epoch": 2.028184835333163, "gen_logits_max": 2.955432415008545, "gen_logits_mean": -18.135967254638672, "gen_logits_min": -30.89935302734375, "gen_logits_std": 3.65246844291687, "gen_loss": 0.2322055697441101, "grad_norm": 0.3617420704744584, "learning_rate": 1.8385263157894735e-05, "loss": 0.2302, "mean_copy_accuracy": 0.9967441856861115, "mean_gen_accuracy": 0.8959683328866959, "mean_token_accuracy": 0.9183742254972458, "num_tokens": 387571998.0, "sample_num_tokens": 8999.5, "step": 9931, "total_num_tokens": 387607996.0, "z_loss": 0.0003887144266627729 }, { "copy_logits_max": -1.1751604080200195, "copy_logits_min": -687500096.0, "copy_num_tokens": 534.25, "epoch": 2.0283890732703598, "gen_logits_max": 3.7856345176696777, "gen_logits_mean": -17.210636138916016, "gen_logits_min": -29.872791290283203, "gen_logits_std": 3.579315185546875, "gen_loss": 0.25486642122268677, "grad_norm": 0.3764134898971249, "learning_rate": 1.8384000000000002e-05, "loss": 0.245, "mean_copy_accuracy": 0.9977674633264542, "mean_gen_accuracy": 0.8880730420351028, "mean_token_accuracy": 0.9146726280450821, "num_tokens": 387853253.0, "sample_num_tokens": 9318.75, "step": 9932, "total_num_tokens": 387890528.0, "z_loss": 0.00034811318619176745 }, { "copy_logits_max": -5.71476411819458, "copy_logits_min": -750000064.0, "copy_num_tokens": 277.625, "epoch": 2.028593311207557, "gen_logits_max": 4.015036582946777, "gen_logits_mean": -17.106952667236328, "gen_logits_min": -29.240463256835938, "gen_logits_std": 3.5416743755340576, "gen_loss": 0.23910997807979584, "grad_norm": 0.37998481835851455, "learning_rate": 1.8382736842105263e-05, "loss": 0.2317, "mean_copy_accuracy": 0.9975217133760452, "mean_gen_accuracy": 0.8962424546480179, "mean_token_accuracy": 0.9204878509044647, "num_tokens": 388120032.0, "sample_num_tokens": 6498.0, "step": 9933, "total_num_tokens": 388146024.0, "z_loss": 0.0003472238313406706 }, { "copy_logits_max": -2.328096389770508, "copy_logits_min": -750000000.0, "copy_num_tokens": 718.0625, "epoch": 2.0287975491447536, "gen_logits_max": 2.1785953044891357, "gen_logits_mean": -16.618389129638672, "gen_logits_min": -29.024524688720703, "gen_logits_std": 3.4754629135131836, "gen_loss": 0.18605634570121765, "grad_norm": 0.35997707689269903, "learning_rate": 1.8381473684210527e-05, "loss": 0.2087, "mean_copy_accuracy": 0.9977429807186127, "mean_gen_accuracy": 0.8962407559156418, "mean_token_accuracy": 0.926718458533287, "num_tokens": 388402095.0, "sample_num_tokens": 9261.75, "step": 9934, "total_num_tokens": 388439142.0, "z_loss": 0.0003208154230378568 }, { "copy_logits_max": -5.148334503173828, "copy_logits_min": -750000000.0, "copy_num_tokens": 384.4375, "epoch": 2.0290017870819503, "gen_logits_max": 3.5004091262817383, "gen_logits_mean": -18.011768341064453, "gen_logits_min": -30.213703155517578, "gen_logits_std": 3.613354206085205, "gen_loss": 0.22055548429489136, "grad_norm": 0.3387157255412576, "learning_rate": 1.8380210526315792e-05, "loss": 0.219, "mean_copy_accuracy": 0.9970599561929703, "mean_gen_accuracy": 0.8982856720685959, "mean_token_accuracy": 0.9232207387685776, "num_tokens": 388682533.0, "sample_num_tokens": 8232.25, "step": 9935, "total_num_tokens": 388715462.0, "z_loss": 0.00033529181382618845 }, { "copy_logits_max": -4.3135457038879395, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.9375, "epoch": 2.0292060250191475, "gen_logits_max": 1.5617704391479492, "gen_logits_mean": -19.524866104125977, "gen_logits_min": -31.92405891418457, "gen_logits_std": 3.679311752319336, "gen_loss": 0.22137413918972015, "grad_norm": 0.35808184488348255, "learning_rate": 1.8378947368421053e-05, "loss": 0.2307, "mean_copy_accuracy": 0.9979092478752136, "mean_gen_accuracy": 0.8910005539655685, "mean_token_accuracy": 0.9192825108766556, "num_tokens": 388962183.0, "sample_num_tokens": 7458.75, "step": 9936, "total_num_tokens": 388992018.0, "z_loss": 0.0003884285979438573 }, { "copy_logits_max": -6.9573974609375, "copy_logits_min": -687500032.0, "copy_num_tokens": 551.125, "epoch": 2.029410262956344, "gen_logits_max": 2.870307683944702, "gen_logits_mean": -17.634260177612305, "gen_logits_min": -30.07636260986328, "gen_logits_std": 3.571748733520508, "gen_loss": 0.21674159169197083, "grad_norm": 0.35067456705098476, "learning_rate": 1.8377684210526317e-05, "loss": 0.216, "mean_copy_accuracy": 0.9976692050695419, "mean_gen_accuracy": 0.8972876816987991, "mean_token_accuracy": 0.9236136972904205, "num_tokens": 389248536.0, "sample_num_tokens": 8812.5, "step": 9937, "total_num_tokens": 389283786.0, "z_loss": 0.00036757916677743196 }, { "copy_logits_max": -4.455068588256836, "copy_logits_min": -687500032.0, "copy_num_tokens": 630.5625, "epoch": 2.029614500893541, "gen_logits_max": 2.283376693725586, "gen_logits_mean": -18.358623504638672, "gen_logits_min": -31.008134841918945, "gen_logits_std": 3.6268675327301025, "gen_loss": 0.23018446564674377, "grad_norm": 0.3526759564015671, "learning_rate": 1.8376421052631578e-05, "loss": 0.2269, "mean_copy_accuracy": 0.998020738363266, "mean_gen_accuracy": 0.8907782882452011, "mean_token_accuracy": 0.9196416586637497, "num_tokens": 389529278.0, "sample_num_tokens": 9281.5, "step": 9938, "total_num_tokens": 389566404.0, "z_loss": 0.0004433886497281492 }, { "copy_logits_max": -6.094605445861816, "copy_logits_min": -687500032.0, "copy_num_tokens": 619.0, "epoch": 2.0298187388307376, "gen_logits_max": 2.1433606147766113, "gen_logits_mean": -19.620708465576172, "gen_logits_min": -32.449066162109375, "gen_logits_std": 3.6887834072113037, "gen_loss": 0.18647262454032898, "grad_norm": 0.3637937942372586, "learning_rate": 1.8375157894736842e-05, "loss": 0.2069, "mean_copy_accuracy": 0.9968285411596298, "mean_gen_accuracy": 0.9055152982473373, "mean_token_accuracy": 0.927070677280426, "num_tokens": 389817984.0, "sample_num_tokens": 10157.0, "step": 9939, "total_num_tokens": 389858612.0, "z_loss": 0.00032193129300139844 }, { "copy_logits_max": -5.450436592102051, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.25, "epoch": 2.0300229767679348, "gen_logits_max": 3.060305118560791, "gen_logits_mean": -19.34543228149414, "gen_logits_min": -31.921627044677734, "gen_logits_std": 3.6600875854492188, "gen_loss": 0.2048014998435974, "grad_norm": 0.34913353614449044, "learning_rate": 1.8373894736842107e-05, "loss": 0.2206, "mean_copy_accuracy": 0.9969914555549622, "mean_gen_accuracy": 0.8985795825719833, "mean_token_accuracy": 0.9206242859363556, "num_tokens": 390064681.0, "sample_num_tokens": 8085.25, "step": 9940, "total_num_tokens": 390097022.0, "z_loss": 0.0003992367419414222 }, { "copy_logits_max": -5.454257965087891, "copy_logits_min": -750000000.0, "copy_num_tokens": 608.375, "epoch": 2.0302272147051315, "gen_logits_max": 2.2741146087646484, "gen_logits_mean": -19.19170379638672, "gen_logits_min": -31.688234329223633, "gen_logits_std": 3.6180832386016846, "gen_loss": 0.23265229165554047, "grad_norm": 0.35821703766539986, "learning_rate": 1.837263157894737e-05, "loss": 0.2253, "mean_copy_accuracy": 0.9983527511358261, "mean_gen_accuracy": 0.8962053060531616, "mean_token_accuracy": 0.9216998070478439, "num_tokens": 390342273.0, "sample_num_tokens": 9350.25, "step": 9941, "total_num_tokens": 390379674.0, "z_loss": 0.0004372875555418432 }, { "copy_logits_max": -6.8266754150390625, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.3125, "epoch": 2.030431452642328, "gen_logits_max": 2.0945253372192383, "gen_logits_mean": -19.688888549804688, "gen_logits_min": -32.08781051635742, "gen_logits_std": 3.650183916091919, "gen_loss": 0.23237884044647217, "grad_norm": 0.3693228379666331, "learning_rate": 1.8371368421052632e-05, "loss": 0.2312, "mean_copy_accuracy": 0.9974917620420456, "mean_gen_accuracy": 0.8950539529323578, "mean_token_accuracy": 0.9195836633443832, "num_tokens": 390632847.0, "sample_num_tokens": 9606.75, "step": 9942, "total_num_tokens": 390671274.0, "z_loss": 0.0003812338400166482 }, { "copy_logits_max": -6.255095958709717, "copy_logits_min": -750000000.0, "copy_num_tokens": 444.875, "epoch": 2.0306356905795253, "gen_logits_max": 2.9316537380218506, "gen_logits_mean": -18.930601119995117, "gen_logits_min": -31.38582420349121, "gen_logits_std": 3.6138007640838623, "gen_loss": 0.25247839093208313, "grad_norm": 0.3416926815500547, "learning_rate": 1.8370105263157896e-05, "loss": 0.2203, "mean_copy_accuracy": 0.9972622692584991, "mean_gen_accuracy": 0.8977752774953842, "mean_token_accuracy": 0.9219011068344116, "num_tokens": 390900122.0, "sample_num_tokens": 8413.5, "step": 9943, "total_num_tokens": 390933776.0, "z_loss": 0.0004502791562117636 }, { "copy_logits_max": -5.61663818359375, "copy_logits_min": -750000000.0, "copy_num_tokens": 381.6875, "epoch": 2.030839928516722, "gen_logits_max": 3.5221309661865234, "gen_logits_mean": -17.849491119384766, "gen_logits_min": -30.130950927734375, "gen_logits_std": 3.567974090576172, "gen_loss": 0.23473350703716278, "grad_norm": 0.37921148983281283, "learning_rate": 1.8368842105263157e-05, "loss": 0.2242, "mean_copy_accuracy": 0.9968533664941788, "mean_gen_accuracy": 0.8976217657327652, "mean_token_accuracy": 0.9212805926799774, "num_tokens": 391167771.0, "sample_num_tokens": 7532.75, "step": 9944, "total_num_tokens": 391197902.0, "z_loss": 0.00042867311276495457 }, { "copy_logits_max": -5.952274799346924, "copy_logits_min": -750000000.0, "copy_num_tokens": 278.3125, "epoch": 2.0310441664539187, "gen_logits_max": 3.9347386360168457, "gen_logits_mean": -18.651615142822266, "gen_logits_min": -31.19867706298828, "gen_logits_std": 3.6003923416137695, "gen_loss": 0.22826650738716125, "grad_norm": 0.35693849976948316, "learning_rate": 1.836757894736842e-05, "loss": 0.2344, "mean_copy_accuracy": 0.9974957853555679, "mean_gen_accuracy": 0.8914079666137695, "mean_token_accuracy": 0.9174569845199585, "num_tokens": 391445410.0, "sample_num_tokens": 7317.5, "step": 9945, "total_num_tokens": 391474680.0, "z_loss": 0.0003799491096287966 }, { "copy_logits_max": -4.826190948486328, "copy_logits_min": -687500032.0, "copy_num_tokens": 403.625, "epoch": 2.031248404391116, "gen_logits_max": 3.3301501274108887, "gen_logits_mean": -18.048629760742188, "gen_logits_min": -29.945968627929688, "gen_logits_std": 3.534945487976074, "gen_loss": 0.23934277892112732, "grad_norm": 0.35057273085379675, "learning_rate": 1.8366315789473682e-05, "loss": 0.2253, "mean_copy_accuracy": 0.9974023848772049, "mean_gen_accuracy": 0.8976290673017502, "mean_token_accuracy": 0.9207106232643127, "num_tokens": 391728480.0, "sample_num_tokens": 8415.5, "step": 9946, "total_num_tokens": 391762142.0, "z_loss": 0.0004013144935015589 }, { "copy_logits_max": -7.035600662231445, "copy_logits_min": -687500032.0, "copy_num_tokens": 408.8125, "epoch": 2.0314526423283126, "gen_logits_max": 2.6801533699035645, "gen_logits_mean": -19.670978546142578, "gen_logits_min": -31.943260192871094, "gen_logits_std": 3.6487772464752197, "gen_loss": 0.23239421844482422, "grad_norm": 0.4173417165609641, "learning_rate": 1.8365052631578947e-05, "loss": 0.2247, "mean_copy_accuracy": 0.9973070025444031, "mean_gen_accuracy": 0.8977174013853073, "mean_token_accuracy": 0.9213944673538208, "num_tokens": 392001399.0, "sample_num_tokens": 8899.75, "step": 9947, "total_num_tokens": 392036998.0, "z_loss": 0.0004106515261810273 }, { "copy_logits_max": -3.5279433727264404, "copy_logits_min": -750000000.0, "copy_num_tokens": 408.875, "epoch": 2.0316568802655093, "gen_logits_max": 3.6332807540893555, "gen_logits_mean": -17.7254638671875, "gen_logits_min": -30.32470703125, "gen_logits_std": 3.602379322052002, "gen_loss": 0.2269921600818634, "grad_norm": 0.37425326837737716, "learning_rate": 1.836378947368421e-05, "loss": 0.2194, "mean_copy_accuracy": 0.9981268495321274, "mean_gen_accuracy": 0.8952114135026932, "mean_token_accuracy": 0.9230818897485733, "num_tokens": 392290619.0, "sample_num_tokens": 7910.25, "step": 9948, "total_num_tokens": 392322260.0, "z_loss": 0.0003495583951007575 }, { "copy_logits_max": -1.6410613059997559, "copy_logits_min": -750000000.0, "copy_num_tokens": 703.875, "epoch": 2.031861118202706, "gen_logits_max": 3.4105043411254883, "gen_logits_mean": -16.82201385498047, "gen_logits_min": -29.729263305664062, "gen_logits_std": 3.607635021209717, "gen_loss": 0.18497274816036224, "grad_norm": 0.34198424620706347, "learning_rate": 1.8362526315789475e-05, "loss": 0.2185, "mean_copy_accuracy": 0.996480330824852, "mean_gen_accuracy": 0.8986791968345642, "mean_token_accuracy": 0.9225906282663345, "num_tokens": 392561749.0, "sample_num_tokens": 9831.25, "step": 9949, "total_num_tokens": 392601074.0, "z_loss": 0.0002918581012636423 }, { "copy_logits_max": -3.5025291442871094, "copy_logits_min": -750000000.0, "copy_num_tokens": 242.25, "epoch": 2.032065356139903, "gen_logits_max": 4.660129547119141, "gen_logits_mean": -17.271770477294922, "gen_logits_min": -29.808372497558594, "gen_logits_std": 3.55507493019104, "gen_loss": 0.26559963822364807, "grad_norm": 0.3600633604852514, "learning_rate": 1.836126315789474e-05, "loss": 0.2365, "mean_copy_accuracy": 0.9965205937623978, "mean_gen_accuracy": 0.894284576177597, "mean_token_accuracy": 0.9155071824789047, "num_tokens": 392836164.0, "sample_num_tokens": 7482.5, "step": 9950, "total_num_tokens": 392866094.0, "z_loss": 0.00043700714013539255 }, { "copy_logits_max": -0.8731579780578613, "copy_logits_min": -750000000.0, "copy_num_tokens": 603.5625, "epoch": 2.0322695940771, "gen_logits_max": 3.4101247787475586, "gen_logits_mean": -17.251293182373047, "gen_logits_min": -30.17408561706543, "gen_logits_std": 3.5916426181793213, "gen_loss": 0.2234608232975006, "grad_norm": 0.3590032984236985, "learning_rate": 1.836e-05, "loss": 0.2103, "mean_copy_accuracy": 0.9979911297559738, "mean_gen_accuracy": 0.896697998046875, "mean_token_accuracy": 0.9269072860479355, "num_tokens": 393113706.0, "sample_num_tokens": 9225.0, "step": 9951, "total_num_tokens": 393150606.0, "z_loss": 0.0003074676205869764 }, { "copy_logits_max": -2.1449079513549805, "copy_logits_min": -687500032.0, "copy_num_tokens": 387.75, "epoch": 2.0324738320142965, "gen_logits_max": 4.164043426513672, "gen_logits_mean": -16.917217254638672, "gen_logits_min": -30.264690399169922, "gen_logits_std": 3.568659543991089, "gen_loss": 0.2085997611284256, "grad_norm": 0.390303155481155, "learning_rate": 1.8358736842105265e-05, "loss": 0.2464, "mean_copy_accuracy": 0.9976961314678192, "mean_gen_accuracy": 0.8924379497766495, "mean_token_accuracy": 0.9123902171850204, "num_tokens": 393361093.0, "sample_num_tokens": 8135.25, "step": 9952, "total_num_tokens": 393393634.0, "z_loss": 0.000401905010221526 }, { "copy_logits_max": -1.7266227006912231, "copy_logits_min": -750000000.0, "copy_num_tokens": 520.375, "epoch": 2.0326780699514937, "gen_logits_max": 3.947270393371582, "gen_logits_mean": -17.226205825805664, "gen_logits_min": -30.17633056640625, "gen_logits_std": 3.583153247833252, "gen_loss": 0.20318132638931274, "grad_norm": 0.36249345646662506, "learning_rate": 1.8357473684210526e-05, "loss": 0.2157, "mean_copy_accuracy": 0.9978689402341843, "mean_gen_accuracy": 0.9003089517354965, "mean_token_accuracy": 0.9239436089992523, "num_tokens": 393631261.0, "sample_num_tokens": 9516.25, "step": 9953, "total_num_tokens": 393669326.0, "z_loss": 0.00033830784377641976 }, { "copy_logits_max": -2.534708261489868, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.25, "epoch": 2.0328823078886904, "gen_logits_max": 3.8976364135742188, "gen_logits_mean": -17.462966918945312, "gen_logits_min": -30.845476150512695, "gen_logits_std": 3.5952396392822266, "gen_loss": 0.23624636232852936, "grad_norm": 0.3667184336616662, "learning_rate": 1.835621052631579e-05, "loss": 0.2215, "mean_copy_accuracy": 0.9974935501813889, "mean_gen_accuracy": 0.8959139734506607, "mean_token_accuracy": 0.9223785102367401, "num_tokens": 393892933.0, "sample_num_tokens": 8580.75, "step": 9954, "total_num_tokens": 393927256.0, "z_loss": 0.00040624465327709913 }, { "copy_logits_max": -0.4839647114276886, "copy_logits_min": -687500032.0, "copy_num_tokens": 767.875, "epoch": 2.033086545825887, "gen_logits_max": 3.780491590499878, "gen_logits_mean": -16.270919799804688, "gen_logits_min": -29.474807739257812, "gen_logits_std": 3.5520825386047363, "gen_loss": 0.20532813668251038, "grad_norm": 0.35319868248188074, "learning_rate": 1.835494736842105e-05, "loss": 0.215, "mean_copy_accuracy": 0.9979778975248337, "mean_gen_accuracy": 0.8923700302839279, "mean_token_accuracy": 0.9249740988016129, "num_tokens": 394175428.0, "sample_num_tokens": 9834.0, "step": 9955, "total_num_tokens": 394214764.0, "z_loss": 0.00046610867138952017 }, { "copy_logits_max": -4.627693176269531, "copy_logits_min": -750000000.0, "copy_num_tokens": 341.1875, "epoch": 2.033290783763084, "gen_logits_max": 4.813828468322754, "gen_logits_mean": -18.156705856323242, "gen_logits_min": -30.950908660888672, "gen_logits_std": 3.619781017303467, "gen_loss": 0.2794191241264343, "grad_norm": 0.34787175569646767, "learning_rate": 1.835368421052632e-05, "loss": 0.2271, "mean_copy_accuracy": 0.9970013052225113, "mean_gen_accuracy": 0.897562175989151, "mean_token_accuracy": 0.919976219534874, "num_tokens": 394453224.0, "sample_num_tokens": 7993.0, "step": 9956, "total_num_tokens": 394485196.0, "z_loss": 0.0005276541924104095 }, { "copy_logits_max": 2.047614574432373, "copy_logits_min": -750000064.0, "copy_num_tokens": 606.5, "epoch": 2.033495021700281, "gen_logits_max": 3.4296207427978516, "gen_logits_mean": -17.682357788085938, "gen_logits_min": -30.54781723022461, "gen_logits_std": 3.6469154357910156, "gen_loss": 0.2088029384613037, "grad_norm": 0.3948603026163812, "learning_rate": 1.835242105263158e-05, "loss": 0.2207, "mean_copy_accuracy": 0.9968201518058777, "mean_gen_accuracy": 0.8966867923736572, "mean_token_accuracy": 0.9224725663661957, "num_tokens": 394726957.0, "sample_num_tokens": 9355.75, "step": 9957, "total_num_tokens": 394764380.0, "z_loss": 0.0004130186280235648 }, { "copy_logits_max": -2.9207568168640137, "copy_logits_min": -687500032.0, "copy_num_tokens": 446.5, "epoch": 2.0336992596374777, "gen_logits_max": 2.987739086151123, "gen_logits_mean": -19.32650375366211, "gen_logits_min": -32.145652770996094, "gen_logits_std": 3.695681095123291, "gen_loss": 0.24511146545410156, "grad_norm": 0.35549161382308253, "learning_rate": 1.8351157894736844e-05, "loss": 0.2228, "mean_copy_accuracy": 0.9973094910383224, "mean_gen_accuracy": 0.8968862146139145, "mean_token_accuracy": 0.9210273325443268, "num_tokens": 394989197.0, "sample_num_tokens": 8509.25, "step": 9958, "total_num_tokens": 395023234.0, "z_loss": 0.00046908247168175876 }, { "copy_logits_max": -1.1071698665618896, "copy_logits_min": -687500032.0, "copy_num_tokens": 445.0625, "epoch": 2.0339034975746744, "gen_logits_max": 4.161684036254883, "gen_logits_mean": -16.338973999023438, "gen_logits_min": -29.87575912475586, "gen_logits_std": 3.569606304168701, "gen_loss": 0.22921311855316162, "grad_norm": 0.36025670523586434, "learning_rate": 1.8349894736842105e-05, "loss": 0.2243, "mean_copy_accuracy": 0.9981246739625931, "mean_gen_accuracy": 0.8943579345941544, "mean_token_accuracy": 0.9208637028932571, "num_tokens": 395260074.0, "sample_num_tokens": 7614.0, "step": 9959, "total_num_tokens": 395290530.0, "z_loss": 0.0004550911544356495 }, { "copy_logits_max": 0.30296677350997925, "copy_logits_min": -750000000.0, "copy_num_tokens": 681.9375, "epoch": 2.0341077355118715, "gen_logits_max": 3.4564170837402344, "gen_logits_mean": -16.302722930908203, "gen_logits_min": -29.221206665039062, "gen_logits_std": 3.568270683288574, "gen_loss": 0.198697030544281, "grad_norm": 0.3778560730472978, "learning_rate": 1.834863157894737e-05, "loss": 0.2083, "mean_copy_accuracy": 0.9983552992343903, "mean_gen_accuracy": 0.8988928645849228, "mean_token_accuracy": 0.9279195815324783, "num_tokens": 395545444.0, "sample_num_tokens": 9219.5, "step": 9960, "total_num_tokens": 395582322.0, "z_loss": 0.00034176441840827465 }, { "copy_logits_max": -0.7263818383216858, "copy_logits_min": -750000000.0, "copy_num_tokens": 412.8125, "epoch": 2.0343119734490682, "gen_logits_max": 3.7106635570526123, "gen_logits_mean": -17.527315139770508, "gen_logits_min": -30.22275161743164, "gen_logits_std": 3.5904700756073, "gen_loss": 0.24953079223632812, "grad_norm": 0.3794336853741169, "learning_rate": 1.834736842105263e-05, "loss": 0.2464, "mean_copy_accuracy": 0.9969956576824188, "mean_gen_accuracy": 0.8867372870445251, "mean_token_accuracy": 0.9145926535129547, "num_tokens": 395821371.0, "sample_num_tokens": 8148.25, "step": 9961, "total_num_tokens": 395853964.0, "z_loss": 0.0004255874955561012 }, { "copy_logits_max": -3.138615131378174, "copy_logits_min": -750000000.0, "copy_num_tokens": 435.5625, "epoch": 2.034516211386265, "gen_logits_max": 2.8431100845336914, "gen_logits_mean": -19.476247787475586, "gen_logits_min": -31.753223419189453, "gen_logits_std": 3.6762938499450684, "gen_loss": 0.23263448476791382, "grad_norm": 0.3671012807887085, "learning_rate": 1.8346105263157894e-05, "loss": 0.2279, "mean_copy_accuracy": 0.9963268339633942, "mean_gen_accuracy": 0.8997132480144501, "mean_token_accuracy": 0.9197394102811813, "num_tokens": 396085507.0, "sample_num_tokens": 8440.75, "step": 9962, "total_num_tokens": 396119270.0, "z_loss": 0.0003476325946394354 }, { "copy_logits_max": -3.3364431858062744, "copy_logits_min": -750000000.0, "copy_num_tokens": 333.375, "epoch": 2.0347204493234616, "gen_logits_max": 4.317593574523926, "gen_logits_mean": -17.047809600830078, "gen_logits_min": -29.724674224853516, "gen_logits_std": 3.5806007385253906, "gen_loss": 0.21918362379074097, "grad_norm": 0.33609684574427673, "learning_rate": 1.834484210526316e-05, "loss": 0.2219, "mean_copy_accuracy": 0.997818186879158, "mean_gen_accuracy": 0.8972517400979996, "mean_token_accuracy": 0.9214719980955124, "num_tokens": 396353733.0, "sample_num_tokens": 7181.25, "step": 9963, "total_num_tokens": 396382458.0, "z_loss": 0.00029785081278532743 }, { "copy_logits_max": -3.989778995513916, "copy_logits_min": -687500032.0, "copy_num_tokens": 360.5625, "epoch": 2.034924687260659, "gen_logits_max": 3.8938848972320557, "gen_logits_mean": -18.072265625, "gen_logits_min": -30.49114418029785, "gen_logits_std": 3.652642250061035, "gen_loss": 0.2195207178592682, "grad_norm": 0.3268218777551755, "learning_rate": 1.8343578947368423e-05, "loss": 0.2157, "mean_copy_accuracy": 0.9980068355798721, "mean_gen_accuracy": 0.8976452648639679, "mean_token_accuracy": 0.92454893887043, "num_tokens": 396642757.0, "sample_num_tokens": 6881.75, "step": 9964, "total_num_tokens": 396670284.0, "z_loss": 0.0003669059951789677 }, { "copy_logits_max": -3.877913236618042, "copy_logits_min": -750000000.0, "copy_num_tokens": 345.6875, "epoch": 2.0351289251978555, "gen_logits_max": 4.638523101806641, "gen_logits_mean": -17.07019805908203, "gen_logits_min": -29.547964096069336, "gen_logits_std": 3.5654566287994385, "gen_loss": 0.2748167812824249, "grad_norm": 0.3384479217874269, "learning_rate": 1.8342315789473687e-05, "loss": 0.2343, "mean_copy_accuracy": 0.9969470202922821, "mean_gen_accuracy": 0.8938968032598495, "mean_token_accuracy": 0.9173660129308701, "num_tokens": 396894168.0, "sample_num_tokens": 8779.0, "step": 9965, "total_num_tokens": 396929284.0, "z_loss": 0.0005337223410606384 }, { "copy_logits_max": -2.255218029022217, "copy_logits_min": -750000064.0, "copy_num_tokens": 561.3125, "epoch": 2.035333163135052, "gen_logits_max": 3.7257347106933594, "gen_logits_mean": -16.38116455078125, "gen_logits_min": -29.329736709594727, "gen_logits_std": 3.5715599060058594, "gen_loss": 0.1929812729358673, "grad_norm": 0.35931202844361954, "learning_rate": 1.8341052631578948e-05, "loss": 0.2033, "mean_copy_accuracy": 0.9977690875530243, "mean_gen_accuracy": 0.9003748744726181, "mean_token_accuracy": 0.9281962662935257, "num_tokens": 397176045.0, "sample_num_tokens": 8500.75, "step": 9966, "total_num_tokens": 397210048.0, "z_loss": 0.0003295274800620973 }, { "copy_logits_max": -3.921849012374878, "copy_logits_min": -750000000.0, "copy_num_tokens": 588.1875, "epoch": 2.0355374010722493, "gen_logits_max": 3.5599400997161865, "gen_logits_mean": -18.582088470458984, "gen_logits_min": -31.180503845214844, "gen_logits_std": 3.669626235961914, "gen_loss": 0.2433127462863922, "grad_norm": 0.3385654624960556, "learning_rate": 1.8339789473684212e-05, "loss": 0.2234, "mean_copy_accuracy": 0.9981922209262848, "mean_gen_accuracy": 0.8979518860578537, "mean_token_accuracy": 0.9230504333972931, "num_tokens": 397452882.0, "sample_num_tokens": 9388.0, "step": 9967, "total_num_tokens": 397490434.0, "z_loss": 0.0004210731422062963 }, { "copy_logits_max": -5.38999080657959, "copy_logits_min": -687500032.0, "copy_num_tokens": 349.0625, "epoch": 2.035741639009446, "gen_logits_max": 3.8639063835144043, "gen_logits_mean": -18.196521759033203, "gen_logits_min": -31.02469825744629, "gen_logits_std": 3.6427574157714844, "gen_loss": 0.24747344851493835, "grad_norm": 0.33992781930258553, "learning_rate": 1.8338526315789473e-05, "loss": 0.2297, "mean_copy_accuracy": 0.997336283326149, "mean_gen_accuracy": 0.8946217596530914, "mean_token_accuracy": 0.9203498065471649, "num_tokens": 397721921.0, "sample_num_tokens": 7602.75, "step": 9968, "total_num_tokens": 397752332.0, "z_loss": 0.00042478187242522836 }, { "copy_logits_max": -4.207427978515625, "copy_logits_min": -750000000.0, "copy_num_tokens": 476.4375, "epoch": 2.0359458769466428, "gen_logits_max": 3.5096144676208496, "gen_logits_mean": -16.825218200683594, "gen_logits_min": -29.793537139892578, "gen_logits_std": 3.567150115966797, "gen_loss": 0.19125467538833618, "grad_norm": 0.3879628150404598, "learning_rate": 1.8337263157894738e-05, "loss": 0.2259, "mean_copy_accuracy": 0.9968594610691071, "mean_gen_accuracy": 0.898330807685852, "mean_token_accuracy": 0.9211088120937347, "num_tokens": 397989523.0, "sample_num_tokens": 7743.25, "step": 9969, "total_num_tokens": 398020496.0, "z_loss": 0.0003599023330025375 }, { "copy_logits_max": -4.163137435913086, "copy_logits_min": -750000000.0, "copy_num_tokens": 534.5625, "epoch": 2.0361501148838395, "gen_logits_max": 2.175762176513672, "gen_logits_mean": -19.04004669189453, "gen_logits_min": -32.01919174194336, "gen_logits_std": 3.6887927055358887, "gen_loss": 0.1929721236228943, "grad_norm": 0.3804504996015084, "learning_rate": 1.8336e-05, "loss": 0.2208, "mean_copy_accuracy": 0.9978196620941162, "mean_gen_accuracy": 0.8962907493114471, "mean_token_accuracy": 0.9223009645938873, "num_tokens": 398251626.0, "sample_num_tokens": 8326.0, "step": 9970, "total_num_tokens": 398284930.0, "z_loss": 0.00042432668851688504 }, { "copy_logits_max": -8.848546981811523, "copy_logits_min": -750000000.0, "copy_num_tokens": 218.5, "epoch": 2.0363543528210366, "gen_logits_max": 3.1782329082489014, "gen_logits_mean": -21.287097930908203, "gen_logits_min": -33.360416412353516, "gen_logits_std": 3.755563735961914, "gen_loss": 0.2613130211830139, "grad_norm": 0.36628330309180157, "learning_rate": 1.8334736842105263e-05, "loss": 0.2449, "mean_copy_accuracy": 0.9972477853298187, "mean_gen_accuracy": 0.8941887617111206, "mean_token_accuracy": 0.9139223396778107, "num_tokens": 398507599.0, "sample_num_tokens": 6764.75, "step": 9971, "total_num_tokens": 398534658.0, "z_loss": 0.000411216082284227 }, { "copy_logits_max": -2.594392776489258, "copy_logits_min": -687500032.0, "copy_num_tokens": 669.625, "epoch": 2.0365585907582333, "gen_logits_max": 4.338289737701416, "gen_logits_mean": -15.296844482421875, "gen_logits_min": -28.052152633666992, "gen_logits_std": 3.4752197265625, "gen_loss": 0.19366610050201416, "grad_norm": 0.3579297891102766, "learning_rate": 1.8333473684210524e-05, "loss": 0.2194, "mean_copy_accuracy": 0.9974003583192825, "mean_gen_accuracy": 0.8969528824090958, "mean_token_accuracy": 0.9232160747051239, "num_tokens": 398777402.0, "sample_num_tokens": 9949.0, "step": 9972, "total_num_tokens": 398817198.0, "z_loss": 0.00038170345942489803 }, { "copy_logits_max": -5.380802154541016, "copy_logits_min": -687500032.0, "copy_num_tokens": 594.1875, "epoch": 2.03676282869543, "gen_logits_max": 3.405738353729248, "gen_logits_mean": -16.032318115234375, "gen_logits_min": -28.57377815246582, "gen_logits_std": 3.4383490085601807, "gen_loss": 0.18476828932762146, "grad_norm": 0.3473796732977709, "learning_rate": 1.833221052631579e-05, "loss": 0.2168, "mean_copy_accuracy": 0.9981869012117386, "mean_gen_accuracy": 0.89323291182518, "mean_token_accuracy": 0.9242424368858337, "num_tokens": 399072621.0, "sample_num_tokens": 8162.25, "step": 9973, "total_num_tokens": 399105270.0, "z_loss": 0.00037355811218731105 }, { "copy_logits_max": -4.143908500671387, "copy_logits_min": -750000000.0, "copy_num_tokens": 368.1875, "epoch": 2.036967066632627, "gen_logits_max": 3.6617560386657715, "gen_logits_mean": -17.661571502685547, "gen_logits_min": -29.978683471679688, "gen_logits_std": 3.585685968399048, "gen_loss": 0.23431456089019775, "grad_norm": 0.48527215620887776, "learning_rate": 1.8330947368421052e-05, "loss": 0.2404, "mean_copy_accuracy": 0.9971052259206772, "mean_gen_accuracy": 0.8913829475641251, "mean_token_accuracy": 0.9151712954044342, "num_tokens": 399324193.0, "sample_num_tokens": 7593.25, "step": 9974, "total_num_tokens": 399354566.0, "z_loss": 0.00041535162017680705 }, { "copy_logits_max": -2.94454288482666, "copy_logits_min": -750000000.0, "copy_num_tokens": 411.9375, "epoch": 2.037171304569824, "gen_logits_max": 4.876650810241699, "gen_logits_mean": -15.43442440032959, "gen_logits_min": -27.771684646606445, "gen_logits_std": 3.4682631492614746, "gen_loss": 0.24868124723434448, "grad_norm": 0.34328545917984693, "learning_rate": 1.8329684210526317e-05, "loss": 0.2289, "mean_copy_accuracy": 0.9970213025808334, "mean_gen_accuracy": 0.8975484669208527, "mean_token_accuracy": 0.9199105501174927, "num_tokens": 399609052.0, "sample_num_tokens": 9358.5, "step": 9975, "total_num_tokens": 399646486.0, "z_loss": 0.0004200462717562914 }, { "copy_logits_max": -2.123688220977783, "copy_logits_min": -750000000.0, "copy_num_tokens": 652.1875, "epoch": 2.0373755425070206, "gen_logits_max": 2.913856029510498, "gen_logits_mean": -17.080873489379883, "gen_logits_min": -29.968822479248047, "gen_logits_std": 3.599543571472168, "gen_loss": 0.19517070055007935, "grad_norm": 0.3504461517887889, "learning_rate": 1.832842105263158e-05, "loss": 0.2315, "mean_copy_accuracy": 0.9980487674474716, "mean_gen_accuracy": 0.8881008177995682, "mean_token_accuracy": 0.9174302071332932, "num_tokens": 399909033.0, "sample_num_tokens": 8732.25, "step": 9976, "total_num_tokens": 399943962.0, "z_loss": 0.00036025571171194315 }, { "copy_logits_max": -3.9959182739257812, "copy_logits_min": -750000064.0, "copy_num_tokens": 352.3125, "epoch": 2.0375797804442177, "gen_logits_max": 4.912537574768066, "gen_logits_mean": -15.801773071289062, "gen_logits_min": -28.09851837158203, "gen_logits_std": 3.5095622539520264, "gen_loss": 0.25343358516693115, "grad_norm": 0.34630668714433793, "learning_rate": 1.8327157894736842e-05, "loss": 0.2284, "mean_copy_accuracy": 0.9971605688333511, "mean_gen_accuracy": 0.8957033306360245, "mean_token_accuracy": 0.9204027950763702, "num_tokens": 400196377.0, "sample_num_tokens": 7611.75, "step": 9977, "total_num_tokens": 400226824.0, "z_loss": 0.0004920205683447421 }, { "copy_logits_max": -4.640720367431641, "copy_logits_min": -687500032.0, "copy_num_tokens": 342.4375, "epoch": 2.0377840183814144, "gen_logits_max": 2.8536124229431152, "gen_logits_mean": -19.996334075927734, "gen_logits_min": -32.555458068847656, "gen_logits_std": 3.6882400512695312, "gen_loss": 0.2275463342666626, "grad_norm": 0.37678365770926564, "learning_rate": 1.8325894736842106e-05, "loss": 0.2301, "mean_copy_accuracy": 0.9977963715791702, "mean_gen_accuracy": 0.8971319049596786, "mean_token_accuracy": 0.9190339744091034, "num_tokens": 400462839.0, "sample_num_tokens": 7493.25, "step": 9978, "total_num_tokens": 400492812.0, "z_loss": 0.0004119429213460535 }, { "copy_logits_max": -4.518458366394043, "copy_logits_min": -750000000.0, "copy_num_tokens": 568.0, "epoch": 2.037988256318611, "gen_logits_max": 4.036471366882324, "gen_logits_mean": -15.605454444885254, "gen_logits_min": -28.087635040283203, "gen_logits_std": 3.4913578033447266, "gen_loss": 0.23067748546600342, "grad_norm": 0.35562666643299196, "learning_rate": 1.8324631578947367e-05, "loss": 0.2239, "mean_copy_accuracy": 0.9971123784780502, "mean_gen_accuracy": 0.896159902215004, "mean_token_accuracy": 0.9203767776489258, "num_tokens": 400722452.0, "sample_num_tokens": 9285.5, "step": 9979, "total_num_tokens": 400759594.0, "z_loss": 0.00046229769941419363 }, { "copy_logits_max": -6.011686325073242, "copy_logits_min": -750000000.0, "copy_num_tokens": 375.125, "epoch": 2.038192494255808, "gen_logits_max": 3.788435935974121, "gen_logits_mean": -17.921846389770508, "gen_logits_min": -30.22644805908203, "gen_logits_std": 3.576310634613037, "gen_loss": 0.22757413983345032, "grad_norm": 0.3649989477100517, "learning_rate": 1.832336842105263e-05, "loss": 0.2368, "mean_copy_accuracy": 0.9965128898620605, "mean_gen_accuracy": 0.8931195139884949, "mean_token_accuracy": 0.9161362200975418, "num_tokens": 400977729.0, "sample_num_tokens": 7818.25, "step": 9980, "total_num_tokens": 401009002.0, "z_loss": 0.00042867494630627334 }, { "copy_logits_max": -6.764619827270508, "copy_logits_min": -750000000.0, "copy_num_tokens": 461.625, "epoch": 2.038396732193005, "gen_logits_max": 3.3118555545806885, "gen_logits_mean": -17.904773712158203, "gen_logits_min": -30.462072372436523, "gen_logits_std": 3.592801809310913, "gen_loss": 0.19615492224693298, "grad_norm": 0.36171266537934654, "learning_rate": 1.8322105263157896e-05, "loss": 0.2155, "mean_copy_accuracy": 0.9981390088796616, "mean_gen_accuracy": 0.8978553116321564, "mean_token_accuracy": 0.9240129590034485, "num_tokens": 401251555.0, "sample_num_tokens": 7693.25, "step": 9981, "total_num_tokens": 401282328.0, "z_loss": 0.0003438241546973586 }, { "copy_logits_max": -6.300056457519531, "copy_logits_min": -750000000.0, "copy_num_tokens": 453.0, "epoch": 2.0386009701302017, "gen_logits_max": 4.487710952758789, "gen_logits_mean": -16.73537826538086, "gen_logits_min": -29.079570770263672, "gen_logits_std": 3.552415132522583, "gen_loss": 0.19805195927619934, "grad_norm": 0.34772533496530217, "learning_rate": 1.832084210526316e-05, "loss": 0.2138, "mean_copy_accuracy": 0.9967563599348068, "mean_gen_accuracy": 0.9039362370967865, "mean_token_accuracy": 0.9246296584606171, "num_tokens": 401512027.0, "sample_num_tokens": 8928.75, "step": 9982, "total_num_tokens": 401547742.0, "z_loss": 0.0003857990959659219 }, { "copy_logits_max": -4.393317222595215, "copy_logits_min": -687500032.0, "copy_num_tokens": 524.0, "epoch": 2.0388052080673984, "gen_logits_max": 4.759955406188965, "gen_logits_mean": -15.448999404907227, "gen_logits_min": -27.78369140625, "gen_logits_std": 3.506819725036621, "gen_loss": 0.2104860544204712, "grad_norm": 0.351350920207934, "learning_rate": 1.831957894736842e-05, "loss": 0.2234, "mean_copy_accuracy": 0.9974386841058731, "mean_gen_accuracy": 0.8998153358697891, "mean_token_accuracy": 0.9207265228033066, "num_tokens": 401773169.0, "sample_num_tokens": 9566.75, "step": 9983, "total_num_tokens": 401811436.0, "z_loss": 0.00037833626265637577 }, { "copy_logits_max": -4.987585544586182, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.75, "epoch": 2.0390094460045955, "gen_logits_max": 4.196708679199219, "gen_logits_mean": -17.329708099365234, "gen_logits_min": -29.615583419799805, "gen_logits_std": 3.5853347778320312, "gen_loss": 0.23526424169540405, "grad_norm": 0.3903040153430783, "learning_rate": 1.8318315789473685e-05, "loss": 0.2389, "mean_copy_accuracy": 0.9976712465286255, "mean_gen_accuracy": 0.8916324079036713, "mean_token_accuracy": 0.9166795909404755, "num_tokens": 402063196.0, "sample_num_tokens": 8540.5, "step": 9984, "total_num_tokens": 402097358.0, "z_loss": 0.0004392662667669356 }, { "copy_logits_max": -5.736530303955078, "copy_logits_min": -750000000.0, "copy_num_tokens": 374.125, "epoch": 2.0392136839417923, "gen_logits_max": 4.31901216506958, "gen_logits_mean": -17.752655029296875, "gen_logits_min": -30.35284996032715, "gen_logits_std": 3.5991597175598145, "gen_loss": 0.24099785089492798, "grad_norm": 0.35774513492868826, "learning_rate": 1.8317052631578946e-05, "loss": 0.2378, "mean_copy_accuracy": 0.9978845119476318, "mean_gen_accuracy": 0.8897878527641296, "mean_token_accuracy": 0.9150227904319763, "num_tokens": 402325672.0, "sample_num_tokens": 7530.5, "step": 9985, "total_num_tokens": 402355794.0, "z_loss": 0.0004165954887866974 }, { "copy_logits_max": -4.474068641662598, "copy_logits_min": -750000000.0, "copy_num_tokens": 484.125, "epoch": 2.039417921878989, "gen_logits_max": 5.791853427886963, "gen_logits_mean": -14.344583511352539, "gen_logits_min": -26.813533782958984, "gen_logits_std": 3.4624733924865723, "gen_loss": 0.2320929616689682, "grad_norm": 0.3549483990996802, "learning_rate": 1.831578947368421e-05, "loss": 0.2228, "mean_copy_accuracy": 0.9978515803813934, "mean_gen_accuracy": 0.8926634937524796, "mean_token_accuracy": 0.9225957989692688, "num_tokens": 402604901.0, "sample_num_tokens": 7756.75, "step": 9986, "total_num_tokens": 402635928.0, "z_loss": 0.0003735912614502013 }, { "copy_logits_max": -4.6585373878479, "copy_logits_min": -562500096.0, "copy_num_tokens": 620.5625, "epoch": 2.0396221598161857, "gen_logits_max": 3.5574800968170166, "gen_logits_mean": -18.28704071044922, "gen_logits_min": -30.816089630126953, "gen_logits_std": 3.6499650478363037, "gen_loss": 0.2212422490119934, "grad_norm": 0.3819246108285895, "learning_rate": 1.831452631578947e-05, "loss": 0.2256, "mean_copy_accuracy": 0.997580274939537, "mean_gen_accuracy": 0.8953027576208115, "mean_token_accuracy": 0.9227248132228851, "num_tokens": 402879115.0, "sample_num_tokens": 9768.75, "step": 9987, "total_num_tokens": 402918190.0, "z_loss": 0.0004002996429335326 }, { "copy_logits_max": -4.583660125732422, "copy_logits_min": -750000000.0, "copy_num_tokens": 533.5625, "epoch": 2.039826397753383, "gen_logits_max": 3.434353828430176, "gen_logits_mean": -17.79783058166504, "gen_logits_min": -30.306884765625, "gen_logits_std": 3.6320595741271973, "gen_loss": 0.2189951390028, "grad_norm": 0.3665408394914767, "learning_rate": 1.8313263157894736e-05, "loss": 0.2208, "mean_copy_accuracy": 0.9978478252887726, "mean_gen_accuracy": 0.8941423147916794, "mean_token_accuracy": 0.9217443913221359, "num_tokens": 403164894.0, "sample_num_tokens": 8824.5, "step": 9988, "total_num_tokens": 403200192.0, "z_loss": 0.0004226798773743212 }, { "copy_logits_max": -4.0248847007751465, "copy_logits_min": -625000064.0, "copy_num_tokens": 470.3125, "epoch": 2.0400306356905795, "gen_logits_max": 3.139057159423828, "gen_logits_mean": -18.308391571044922, "gen_logits_min": -31.10013771057129, "gen_logits_std": 3.6426472663879395, "gen_loss": 0.24999389052391052, "grad_norm": 0.36300202718793884, "learning_rate": 1.8312000000000004e-05, "loss": 0.2222, "mean_copy_accuracy": 0.99737249314785, "mean_gen_accuracy": 0.8953984975814819, "mean_token_accuracy": 0.9220826625823975, "num_tokens": 403442265.0, "sample_num_tokens": 8398.75, "step": 9989, "total_num_tokens": 403475860.0, "z_loss": 0.00041889879503287375 }, { "copy_logits_max": -2.0618247985839844, "copy_logits_min": -750000000.0, "copy_num_tokens": 463.9375, "epoch": 2.0402348736277762, "gen_logits_max": 3.392383098602295, "gen_logits_mean": -17.73636245727539, "gen_logits_min": -30.54787254333496, "gen_logits_std": 3.630984306335449, "gen_loss": 0.2503007650375366, "grad_norm": 0.37832910164460526, "learning_rate": 1.8310736842105264e-05, "loss": 0.237, "mean_copy_accuracy": 0.9964265525341034, "mean_gen_accuracy": 0.8895393908023834, "mean_token_accuracy": 0.9155734181404114, "num_tokens": 403700939.0, "sample_num_tokens": 8060.75, "step": 9990, "total_num_tokens": 403733182.0, "z_loss": 0.0005419980152510107 }, { "copy_logits_max": -5.2761054039001465, "copy_logits_min": -750000000.0, "copy_num_tokens": 346.1875, "epoch": 2.0404391115649734, "gen_logits_max": 4.515624046325684, "gen_logits_mean": -16.79722023010254, "gen_logits_min": -29.192564010620117, "gen_logits_std": 3.5708467960357666, "gen_loss": 0.2527132034301758, "grad_norm": 0.34582664892620235, "learning_rate": 1.830947368421053e-05, "loss": 0.2196, "mean_copy_accuracy": 0.9978498369455338, "mean_gen_accuracy": 0.8941136300563812, "mean_token_accuracy": 0.9225676655769348, "num_tokens": 403985130.0, "sample_num_tokens": 7481.0, "step": 9991, "total_num_tokens": 404015054.0, "z_loss": 0.00043701191316358745 }, { "copy_logits_max": -4.949383735656738, "copy_logits_min": -687500032.0, "copy_num_tokens": 550.0625, "epoch": 2.04064334950217, "gen_logits_max": 2.8608007431030273, "gen_logits_mean": -19.103252410888672, "gen_logits_min": -31.741416931152344, "gen_logits_std": 3.7057619094848633, "gen_loss": 0.20324207842350006, "grad_norm": 0.34425901002269954, "learning_rate": 1.830821052631579e-05, "loss": 0.2184, "mean_copy_accuracy": 0.9982933104038239, "mean_gen_accuracy": 0.8916401863098145, "mean_token_accuracy": 0.922189250588417, "num_tokens": 404286208.0, "sample_num_tokens": 8985.5, "step": 9992, "total_num_tokens": 404322150.0, "z_loss": 0.0003978920285589993 }, { "copy_logits_max": -3.7052149772644043, "copy_logits_min": -750000000.0, "copy_num_tokens": 443.4375, "epoch": 2.040847587439367, "gen_logits_max": 4.101239204406738, "gen_logits_mean": -16.63786506652832, "gen_logits_min": -29.14920425415039, "gen_logits_std": 3.568937063217163, "gen_loss": 0.2250145971775055, "grad_norm": 0.36133352843883365, "learning_rate": 1.8306947368421054e-05, "loss": 0.2287, "mean_copy_accuracy": 0.9971023350954056, "mean_gen_accuracy": 0.8951653689146042, "mean_token_accuracy": 0.9191493690013885, "num_tokens": 404551016.0, "sample_num_tokens": 7955.5, "step": 9993, "total_num_tokens": 404582838.0, "z_loss": 0.00040635853656567633 }, { "copy_logits_max": -4.972138404846191, "copy_logits_min": -687500032.0, "copy_num_tokens": 605.375, "epoch": 2.0410518253765635, "gen_logits_max": 2.8388166427612305, "gen_logits_mean": -18.514812469482422, "gen_logits_min": -30.941850662231445, "gen_logits_std": 3.6764116287231445, "gen_loss": 0.21218480169773102, "grad_norm": 0.350671003605274, "learning_rate": 1.8305684210526315e-05, "loss": 0.2168, "mean_copy_accuracy": 0.9974102526903152, "mean_gen_accuracy": 0.8971199989318848, "mean_token_accuracy": 0.922450378537178, "num_tokens": 404830608.0, "sample_num_tokens": 9580.5, "step": 9994, "total_num_tokens": 404868930.0, "z_loss": 0.00034194139880128205 }, { "copy_logits_max": -5.165256977081299, "copy_logits_min": -750000000.0, "copy_num_tokens": 474.125, "epoch": 2.0412560633137606, "gen_logits_max": 3.1685380935668945, "gen_logits_mean": -18.89763641357422, "gen_logits_min": -31.30032730102539, "gen_logits_std": 3.692896604537964, "gen_loss": 0.22817838191986084, "grad_norm": 0.39193239931130225, "learning_rate": 1.830442105263158e-05, "loss": 0.2293, "mean_copy_accuracy": 0.9971730709075928, "mean_gen_accuracy": 0.8937199860811234, "mean_token_accuracy": 0.9192734956741333, "num_tokens": 405114577.0, "sample_num_tokens": 8314.75, "step": 9995, "total_num_tokens": 405147836.0, "z_loss": 0.00038141824188642204 }, { "copy_logits_max": -2.668680191040039, "copy_logits_min": -750000064.0, "copy_num_tokens": 550.0625, "epoch": 2.0414603012509573, "gen_logits_max": 3.9797134399414062, "gen_logits_mean": -17.5064754486084, "gen_logits_min": -29.90787124633789, "gen_logits_std": 3.619288444519043, "gen_loss": 0.21816694736480713, "grad_norm": 0.36138631412603905, "learning_rate": 1.830315789473684e-05, "loss": 0.2252, "mean_copy_accuracy": 0.9975354373455048, "mean_gen_accuracy": 0.8933209180831909, "mean_token_accuracy": 0.9196672737598419, "num_tokens": 405372542.0, "sample_num_tokens": 8809.5, "step": 9996, "total_num_tokens": 405407780.0, "z_loss": 0.0003952909028157592 }, { "copy_logits_max": 1.497296690940857, "copy_logits_min": -750000000.0, "copy_num_tokens": 606.5, "epoch": 2.041664539188154, "gen_logits_max": 4.0712785720825195, "gen_logits_mean": -15.7421875, "gen_logits_min": -28.674758911132812, "gen_logits_std": 3.56211519241333, "gen_loss": 0.2287202626466751, "grad_norm": 0.35144277426302833, "learning_rate": 1.8301894736842108e-05, "loss": 0.2241, "mean_copy_accuracy": 0.9983143508434296, "mean_gen_accuracy": 0.8889874815940857, "mean_token_accuracy": 0.9223493188619614, "num_tokens": 405655938.0, "sample_num_tokens": 8371.0, "step": 9997, "total_num_tokens": 405689422.0, "z_loss": 0.0004024432855658233 }, { "copy_logits_max": -2.7493550777435303, "copy_logits_min": -750000000.0, "copy_num_tokens": 492.25, "epoch": 2.041868777125351, "gen_logits_max": 3.3413472175598145, "gen_logits_mean": -18.23599624633789, "gen_logits_min": -30.607746124267578, "gen_logits_std": 3.645132541656494, "gen_loss": 0.2418379932641983, "grad_norm": 0.36074456214651185, "learning_rate": 1.830063157894737e-05, "loss": 0.2242, "mean_copy_accuracy": 0.9976040124893188, "mean_gen_accuracy": 0.8925790339708328, "mean_token_accuracy": 0.9198756963014603, "num_tokens": 405927368.0, "sample_num_tokens": 8560.0, "step": 9998, "total_num_tokens": 405961608.0, "z_loss": 0.0004356403078418225 }, { "copy_logits_max": -4.2946977615356445, "copy_logits_min": -687500032.0, "copy_num_tokens": 495.0, "epoch": 2.042073015062548, "gen_logits_max": 3.3182270526885986, "gen_logits_mean": -18.683137893676758, "gen_logits_min": -31.01253318786621, "gen_logits_std": 3.649661064147949, "gen_loss": 0.1939184069633484, "grad_norm": 0.3849863035288485, "learning_rate": 1.8299368421052633e-05, "loss": 0.219, "mean_copy_accuracy": 0.998215839266777, "mean_gen_accuracy": 0.9022073149681091, "mean_token_accuracy": 0.9226230084896088, "num_tokens": 406177474.0, "sample_num_tokens": 9934.5, "step": 9999, "total_num_tokens": 406217212.0, "z_loss": 0.0003104803035967052 }, { "epoch": 2.0422772529997446, "grad_norm": 0.3560294000725266, "learning_rate": 1.8298105263157894e-05, "loss": 0.2345, "step": 10000 }, { "epoch": 2.0422772529997446, "eval_copy_logits_max": -8.297331809997559, "eval_copy_logits_min": -88.58103942871094, "eval_gen_logits_max": 1.6382560729980469, "eval_gen_logits_mean": -23.99137306213379, "eval_gen_logits_min": -35.729408264160156, "eval_gen_logits_std": 3.8419671058654785, "eval_gen_loss": 0.27392101287841797, "eval_loss": 0.24540361762046814, "eval_mean_copy_accuracy": 0.9988262951374054, "eval_mean_gen_accuracy": 0.8977096080780029, "eval_mean_token_accuracy": 0.9107895195484161, "eval_num_tokens": 406470728.0, "eval_runtime": 0.6806, "eval_samples_per_second": 11.755, "eval_steps_per_second": 2.939, "eval_total_num_tokens": 406470728.0, "eval_z_loss": 0.00036569469375535846, "step": 10000 } ], "logging_steps": 1, "max_steps": 24485, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.5072272302092583e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }