{ "best_global_step": 68000, "best_metric": 36.41809445833699, "best_model_checkpoint": "./hebrew_yiddish_model_improved/checkpoint-68000", "epoch": 31.16710875331565, "eval_steps": 500, "global_step": 70500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04420866489832007, "grad_norm": 170.889404296875, "learning_rate": 9.300000000000001e-07, "loss": 33.1983, "step": 100 }, { "epoch": 0.08841732979664015, "grad_norm": 20.797151565551758, "learning_rate": 1.93e-06, "loss": 6.805, "step": 200 }, { "epoch": 0.13262599469496023, "grad_norm": 14.6142578125, "learning_rate": 2.93e-06, "loss": 3.8098, "step": 300 }, { "epoch": 0.1768346595932803, "grad_norm": 10.495699882507324, "learning_rate": 3.9300000000000005e-06, "loss": 3.1019, "step": 400 }, { "epoch": 0.22104332449160036, "grad_norm": 9.405308723449707, "learning_rate": 4.93e-06, "loss": 2.6451, "step": 500 }, { "epoch": 0.22104332449160036, "eval_bleu": 0.01170015112301577, "eval_char_accuracy": 9.10356725183984, "eval_loss": 2.2571728229522705, "eval_runtime": 3004.8734, "eval_samples_per_second": 0.753, "eval_steps_per_second": 0.188, "step": 500 }, { "epoch": 0.26525198938992045, "grad_norm": 7.3878984451293945, "learning_rate": 5.93e-06, "loss": 2.3713, "step": 600 }, { "epoch": 0.3094606542882405, "grad_norm": 9.516755104064941, "learning_rate": 6.93e-06, "loss": 2.2177, "step": 700 }, { "epoch": 0.3536693191865606, "grad_norm": 8.333394050598145, "learning_rate": 7.93e-06, "loss": 2.1358, "step": 800 }, { "epoch": 0.3978779840848806, "grad_norm": 9.205838203430176, "learning_rate": 8.930000000000001e-06, "loss": 2.0759, "step": 900 }, { "epoch": 0.4420866489832007, "grad_norm": 9.194506645202637, "learning_rate": 9.930000000000001e-06, "loss": 2.0163, "step": 1000 }, { "epoch": 0.4420866489832007, "eval_bleu": 0.028655490079497793, "eval_char_accuracy": 9.236345779839631, "eval_loss": 1.8678231239318848, "eval_runtime": 2335.6327, "eval_samples_per_second": 0.969, "eval_steps_per_second": 0.242, "step": 1000 }, { "epoch": 0.48629531388152075, "grad_norm": 9.440414428710938, "learning_rate": 9.9958703374778e-06, "loss": 1.9759, "step": 1100 }, { "epoch": 0.5305039787798409, "grad_norm": 6.935140132904053, "learning_rate": 9.991429840142097e-06, "loss": 1.9348, "step": 1200 }, { "epoch": 0.5747126436781609, "grad_norm": 6.917918682098389, "learning_rate": 9.986989342806395e-06, "loss": 1.8887, "step": 1300 }, { "epoch": 0.618921308576481, "grad_norm": 7.898908615112305, "learning_rate": 9.982548845470693e-06, "loss": 1.8457, "step": 1400 }, { "epoch": 0.6631299734748011, "grad_norm": 5.319510459899902, "learning_rate": 9.978108348134992e-06, "loss": 1.8222, "step": 1500 }, { "epoch": 0.6631299734748011, "eval_bleu": 0.3214852150708254, "eval_char_accuracy": 9.11972035743592, "eval_loss": 1.664724588394165, "eval_runtime": 1712.1381, "eval_samples_per_second": 1.322, "eval_steps_per_second": 0.331, "step": 1500 }, { "epoch": 0.7073386383731212, "grad_norm": 6.94521427154541, "learning_rate": 9.97366785079929e-06, "loss": 1.7814, "step": 1600 }, { "epoch": 0.7515473032714411, "grad_norm": 9.468025207519531, "learning_rate": 9.96922735346359e-06, "loss": 1.7581, "step": 1700 }, { "epoch": 0.7957559681697612, "grad_norm": 6.772793769836426, "learning_rate": 9.964786856127887e-06, "loss": 1.7, "step": 1800 }, { "epoch": 0.8399646330680813, "grad_norm": 7.173699855804443, "learning_rate": 9.960346358792185e-06, "loss": 1.6909, "step": 1900 }, { "epoch": 0.8841732979664014, "grad_norm": 7.751150608062744, "learning_rate": 9.955905861456485e-06, "loss": 1.653, "step": 2000 }, { "epoch": 0.8841732979664014, "eval_bleu": 1.411049640874672, "eval_char_accuracy": 9.168179674224167, "eval_loss": 1.5142065286636353, "eval_runtime": 1480.5995, "eval_samples_per_second": 1.528, "eval_steps_per_second": 0.382, "step": 2000 }, { "epoch": 0.9283819628647215, "grad_norm": 8.334531784057617, "learning_rate": 9.951465364120782e-06, "loss": 1.6277, "step": 2100 }, { "epoch": 0.9725906277630415, "grad_norm": 7.158090114593506, "learning_rate": 9.947024866785082e-06, "loss": 1.5924, "step": 2200 }, { "epoch": 1.0167992926613616, "grad_norm": 6.8820672035217285, "learning_rate": 9.94258436944938e-06, "loss": 1.5838, "step": 2300 }, { "epoch": 1.0610079575596818, "grad_norm": 7.375690937042236, "learning_rate": 9.938143872113677e-06, "loss": 1.5664, "step": 2400 }, { "epoch": 1.1052166224580018, "grad_norm": 6.44377326965332, "learning_rate": 9.933703374777975e-06, "loss": 1.5474, "step": 2500 }, { "epoch": 1.1052166224580018, "eval_bleu": 2.4519410766932324, "eval_char_accuracy": 9.739676550213543, "eval_loss": 1.4180608987808228, "eval_runtime": 1501.1274, "eval_samples_per_second": 1.508, "eval_steps_per_second": 0.377, "step": 2500 }, { "epoch": 1.1494252873563218, "grad_norm": 6.70504093170166, "learning_rate": 9.929262877442275e-06, "loss": 1.5386, "step": 2600 }, { "epoch": 1.193633952254642, "grad_norm": 6.540093421936035, "learning_rate": 9.924822380106573e-06, "loss": 1.5148, "step": 2700 }, { "epoch": 1.237842617152962, "grad_norm": 9.24384880065918, "learning_rate": 9.92038188277087e-06, "loss": 1.483, "step": 2800 }, { "epoch": 1.282051282051282, "grad_norm": 8.332944869995117, "learning_rate": 9.91594138543517e-06, "loss": 1.4803, "step": 2900 }, { "epoch": 1.3262599469496021, "grad_norm": 6.69514799118042, "learning_rate": 9.911500888099468e-06, "loss": 1.4624, "step": 3000 }, { "epoch": 1.3262599469496021, "eval_bleu": 3.751276752491243, "eval_char_accuracy": 9.844348674476155, "eval_loss": 1.3348913192749023, "eval_runtime": 1400.2215, "eval_samples_per_second": 1.616, "eval_steps_per_second": 0.404, "step": 3000 }, { "epoch": 1.3704686118479221, "grad_norm": 8.663880348205566, "learning_rate": 9.907060390763767e-06, "loss": 1.4425, "step": 3100 }, { "epoch": 1.4146772767462423, "grad_norm": 5.447376251220703, "learning_rate": 9.902619893428065e-06, "loss": 1.4375, "step": 3200 }, { "epoch": 1.4588859416445623, "grad_norm": 6.044620513916016, "learning_rate": 9.898179396092363e-06, "loss": 1.4305, "step": 3300 }, { "epoch": 1.5030946065428825, "grad_norm": 8.018180847167969, "learning_rate": 9.893738898756662e-06, "loss": 1.4029, "step": 3400 }, { "epoch": 1.5473032714412025, "grad_norm": 6.330356597900391, "learning_rate": 9.88929840142096e-06, "loss": 1.3924, "step": 3500 }, { "epoch": 1.5473032714412025, "eval_bleu": 3.9398728773843197, "eval_char_accuracy": 10.569946177852154, "eval_loss": 1.2733469009399414, "eval_runtime": 1549.5103, "eval_samples_per_second": 1.46, "eval_steps_per_second": 0.365, "step": 3500 }, { "epoch": 1.5915119363395225, "grad_norm": 6.058270454406738, "learning_rate": 9.88485790408526e-06, "loss": 1.3903, "step": 3600 }, { "epoch": 1.6357206012378427, "grad_norm": 7.059033393859863, "learning_rate": 9.880417406749557e-06, "loss": 1.375, "step": 3700 }, { "epoch": 1.6799292661361627, "grad_norm": 7.247815132141113, "learning_rate": 9.875976909413855e-06, "loss": 1.3698, "step": 3800 }, { "epoch": 1.7241379310344827, "grad_norm": 9.74676513671875, "learning_rate": 9.871536412078153e-06, "loss": 1.3778, "step": 3900 }, { "epoch": 1.7683465959328029, "grad_norm": 8.913097381591797, "learning_rate": 9.867095914742452e-06, "loss": 1.3476, "step": 4000 }, { "epoch": 1.7683465959328029, "eval_bleu": 5.538908824668547, "eval_char_accuracy": 10.403892252324432, "eval_loss": 1.2210185527801514, "eval_runtime": 1317.2772, "eval_samples_per_second": 1.718, "eval_steps_per_second": 0.43, "step": 4000 }, { "epoch": 1.812555260831123, "grad_norm": 6.1718292236328125, "learning_rate": 9.86265541740675e-06, "loss": 1.3225, "step": 4100 }, { "epoch": 1.8567639257294428, "grad_norm": 6.037962913513184, "learning_rate": 9.858214920071048e-06, "loss": 1.3229, "step": 4200 }, { "epoch": 1.900972590627763, "grad_norm": 7.1866230964660645, "learning_rate": 9.853774422735348e-06, "loss": 1.3215, "step": 4300 }, { "epoch": 1.9451812555260832, "grad_norm": 6.122922420501709, "learning_rate": 9.849333925399645e-06, "loss": 1.3049, "step": 4400 }, { "epoch": 1.9893899204244032, "grad_norm": 8.162379264831543, "learning_rate": 9.844893428063945e-06, "loss": 1.3023, "step": 4500 }, { "epoch": 1.9893899204244032, "eval_bleu": 6.11387726505584, "eval_char_accuracy": 10.990249985462205, "eval_loss": 1.1699895858764648, "eval_runtime": 1642.8974, "eval_samples_per_second": 1.377, "eval_steps_per_second": 0.345, "step": 4500 }, { "epoch": 2.033598585322723, "grad_norm": 7.18614387512207, "learning_rate": 9.840452930728243e-06, "loss": 1.2704, "step": 4600 }, { "epoch": 2.0778072502210434, "grad_norm": 6.672239780426025, "learning_rate": 9.83601243339254e-06, "loss": 1.2762, "step": 4700 }, { "epoch": 2.1220159151193636, "grad_norm": 5.55977725982666, "learning_rate": 9.83157193605684e-06, "loss": 1.2591, "step": 4800 }, { "epoch": 2.1662245800176834, "grad_norm": 5.068392276763916, "learning_rate": 9.827131438721138e-06, "loss": 1.2592, "step": 4900 }, { "epoch": 2.2104332449160036, "grad_norm": 6.189523220062256, "learning_rate": 9.822690941385436e-06, "loss": 1.2404, "step": 5000 }, { "epoch": 2.2104332449160036, "eval_bleu": 8.191454703513129, "eval_char_accuracy": 10.766691004012431, "eval_loss": 1.1257007122039795, "eval_runtime": 1245.1612, "eval_samples_per_second": 1.817, "eval_steps_per_second": 0.455, "step": 5000 }, { "epoch": 2.2546419098143238, "grad_norm": 6.778228282928467, "learning_rate": 9.818250444049733e-06, "loss": 1.2415, "step": 5100 }, { "epoch": 2.2988505747126435, "grad_norm": 6.770288944244385, "learning_rate": 9.813809946714033e-06, "loss": 1.2195, "step": 5200 }, { "epoch": 2.3430592396109637, "grad_norm": 7.563081741333008, "learning_rate": 9.80936944937833e-06, "loss": 1.2234, "step": 5300 }, { "epoch": 2.387267904509284, "grad_norm": 5.849658489227295, "learning_rate": 9.80492895204263e-06, "loss": 1.213, "step": 5400 }, { "epoch": 2.4314765694076037, "grad_norm": 5.621692180633545, "learning_rate": 9.800488454706928e-06, "loss": 1.1884, "step": 5500 }, { "epoch": 2.4314765694076037, "eval_bleu": 8.483674545541598, "eval_char_accuracy": 11.35207955081444, "eval_loss": 1.0881929397583008, "eval_runtime": 1340.836, "eval_samples_per_second": 1.688, "eval_steps_per_second": 0.422, "step": 5500 }, { "epoch": 2.475685234305924, "grad_norm": 4.707101821899414, "learning_rate": 9.796047957371226e-06, "loss": 1.2185, "step": 5600 }, { "epoch": 2.519893899204244, "grad_norm": 7.06864070892334, "learning_rate": 9.791607460035525e-06, "loss": 1.1935, "step": 5700 }, { "epoch": 2.564102564102564, "grad_norm": 5.334202289581299, "learning_rate": 9.787166962699823e-06, "loss": 1.179, "step": 5800 }, { "epoch": 2.608311229000884, "grad_norm": 6.151766777038574, "learning_rate": 9.782726465364123e-06, "loss": 1.1809, "step": 5900 }, { "epoch": 2.6525198938992043, "grad_norm": 6.352200031280518, "learning_rate": 9.77828596802842e-06, "loss": 1.1872, "step": 6000 }, { "epoch": 2.6525198938992043, "eval_bleu": 10.180532150513427, "eval_char_accuracy": 11.06616958176379, "eval_loss": 1.0602151155471802, "eval_runtime": 1235.3352, "eval_samples_per_second": 1.832, "eval_steps_per_second": 0.458, "step": 6000 }, { "epoch": 2.696728558797524, "grad_norm": 5.816124439239502, "learning_rate": 9.773845470692718e-06, "loss": 1.1664, "step": 6100 }, { "epoch": 2.7409372236958442, "grad_norm": 5.711726188659668, "learning_rate": 9.769404973357018e-06, "loss": 1.1548, "step": 6200 }, { "epoch": 2.7851458885941645, "grad_norm": 7.0409979820251465, "learning_rate": 9.764964476021314e-06, "loss": 1.1517, "step": 6300 }, { "epoch": 2.8293545534924847, "grad_norm": 5.6412153244018555, "learning_rate": 9.760523978685613e-06, "loss": 1.1324, "step": 6400 }, { "epoch": 2.873563218390805, "grad_norm": 5.695923328399658, "learning_rate": 9.756083481349911e-06, "loss": 1.1515, "step": 6500 }, { "epoch": 2.873563218390805, "eval_bleu": 11.115878050984174, "eval_char_accuracy": 11.366940407962835, "eval_loss": 1.0245429277420044, "eval_runtime": 1311.2076, "eval_samples_per_second": 1.726, "eval_steps_per_second": 0.432, "step": 6500 }, { "epoch": 2.9177718832891246, "grad_norm": 8.059999465942383, "learning_rate": 9.75164298401421e-06, "loss": 1.1394, "step": 6600 }, { "epoch": 2.961980548187445, "grad_norm": 6.020379543304443, "learning_rate": 9.747202486678508e-06, "loss": 1.1391, "step": 6700 }, { "epoch": 3.0061892130857646, "grad_norm": 6.8791961669921875, "learning_rate": 9.742761989342808e-06, "loss": 1.1363, "step": 6800 }, { "epoch": 3.050397877984085, "grad_norm": 6.068972587585449, "learning_rate": 9.738321492007106e-06, "loss": 1.1078, "step": 6900 }, { "epoch": 3.094606542882405, "grad_norm": 6.407805919647217, "learning_rate": 9.733880994671404e-06, "loss": 1.0998, "step": 7000 }, { "epoch": 3.094606542882405, "eval_bleu": 11.884849220200902, "eval_char_accuracy": 11.544301507407814, "eval_loss": 1.0018765926361084, "eval_runtime": 1146.9434, "eval_samples_per_second": 1.973, "eval_steps_per_second": 0.493, "step": 7000 }, { "epoch": 3.138815207780725, "grad_norm": 5.876919269561768, "learning_rate": 9.729440497335703e-06, "loss": 1.1001, "step": 7100 }, { "epoch": 3.183023872679045, "grad_norm": 6.593369483947754, "learning_rate": 9.725000000000001e-06, "loss": 1.0866, "step": 7200 }, { "epoch": 3.227232537577365, "grad_norm": 5.931233882904053, "learning_rate": 9.7205595026643e-06, "loss": 1.0837, "step": 7300 }, { "epoch": 3.2714412024756854, "grad_norm": 5.210250377655029, "learning_rate": 9.716119005328598e-06, "loss": 1.0835, "step": 7400 }, { "epoch": 3.315649867374005, "grad_norm": 7.105567455291748, "learning_rate": 9.711678507992896e-06, "loss": 1.0855, "step": 7500 }, { "epoch": 3.315649867374005, "eval_bleu": 12.652778111315618, "eval_char_accuracy": 11.785628905013278, "eval_loss": 0.9757761359214783, "eval_runtime": 1196.0685, "eval_samples_per_second": 1.892, "eval_steps_per_second": 0.473, "step": 7500 }, { "epoch": 3.3598585322723253, "grad_norm": 6.592043399810791, "learning_rate": 9.707238010657194e-06, "loss": 1.0731, "step": 7600 }, { "epoch": 3.4040671971706455, "grad_norm": 5.485996723175049, "learning_rate": 9.702797513321492e-06, "loss": 1.0854, "step": 7700 }, { "epoch": 3.4482758620689653, "grad_norm": 5.682703495025635, "learning_rate": 9.698357015985791e-06, "loss": 1.0587, "step": 7800 }, { "epoch": 3.4924845269672855, "grad_norm": 5.363906383514404, "learning_rate": 9.693916518650089e-06, "loss": 1.0532, "step": 7900 }, { "epoch": 3.5366931918656057, "grad_norm": 6.669486999511719, "learning_rate": 9.689476021314388e-06, "loss": 1.0669, "step": 8000 }, { "epoch": 3.5366931918656057, "eval_bleu": 13.377471655820932, "eval_char_accuracy": 11.98721966285238, "eval_loss": 0.9500051140785217, "eval_runtime": 1248.4469, "eval_samples_per_second": 1.813, "eval_steps_per_second": 0.453, "step": 8000 }, { "epoch": 3.5809018567639255, "grad_norm": 5.867100238800049, "learning_rate": 9.685035523978686e-06, "loss": 1.061, "step": 8100 }, { "epoch": 3.6251105216622457, "grad_norm": 5.563074588775635, "learning_rate": 9.680595026642984e-06, "loss": 1.0486, "step": 8200 }, { "epoch": 3.669319186560566, "grad_norm": 5.551767826080322, "learning_rate": 9.676154529307284e-06, "loss": 1.0485, "step": 8300 }, { "epoch": 3.713527851458886, "grad_norm": 5.244560718536377, "learning_rate": 9.671714031971581e-06, "loss": 1.0351, "step": 8400 }, { "epoch": 3.757736516357206, "grad_norm": 6.0703887939453125, "learning_rate": 9.667273534635881e-06, "loss": 1.0327, "step": 8500 }, { "epoch": 3.757736516357206, "eval_bleu": 13.946298488171834, "eval_char_accuracy": 12.173949563543086, "eval_loss": 0.9292075037956238, "eval_runtime": 1246.2262, "eval_samples_per_second": 1.816, "eval_steps_per_second": 0.454, "step": 8500 }, { "epoch": 3.801945181255526, "grad_norm": 7.2362446784973145, "learning_rate": 9.662833037300179e-06, "loss": 1.0359, "step": 8600 }, { "epoch": 3.8461538461538463, "grad_norm": 6.165930271148682, "learning_rate": 9.658392539964478e-06, "loss": 1.0257, "step": 8700 }, { "epoch": 3.8903625110521665, "grad_norm": 5.4346923828125, "learning_rate": 9.653952042628774e-06, "loss": 1.0175, "step": 8800 }, { "epoch": 3.934571175950486, "grad_norm": 7.020615577697754, "learning_rate": 9.649511545293074e-06, "loss": 1.0088, "step": 8900 }, { "epoch": 3.9787798408488064, "grad_norm": 5.138782501220703, "learning_rate": 9.645071047957372e-06, "loss": 1.0, "step": 9000 }, { "epoch": 3.9787798408488064, "eval_bleu": 14.901762096368834, "eval_char_accuracy": 12.284436805820286, "eval_loss": 0.9140157103538513, "eval_runtime": 1119.2721, "eval_samples_per_second": 2.022, "eval_steps_per_second": 0.506, "step": 9000 }, { "epoch": 4.022988505747127, "grad_norm": 5.2244439125061035, "learning_rate": 9.64063055062167e-06, "loss": 1.0143, "step": 9100 }, { "epoch": 4.067197170645446, "grad_norm": 5.707061767578125, "learning_rate": 9.636190053285969e-06, "loss": 0.9982, "step": 9200 }, { "epoch": 4.111405835543766, "grad_norm": 7.022563934326172, "learning_rate": 9.631749555950267e-06, "loss": 0.9793, "step": 9300 }, { "epoch": 4.155614500442087, "grad_norm": 5.185485363006592, "learning_rate": 9.627309058614566e-06, "loss": 0.9709, "step": 9400 }, { "epoch": 4.199823165340407, "grad_norm": 5.606160640716553, "learning_rate": 9.622868561278864e-06, "loss": 0.9811, "step": 9500 }, { "epoch": 4.199823165340407, "eval_bleu": 15.481314802793468, "eval_char_accuracy": 12.530610135104576, "eval_loss": 0.892221212387085, "eval_runtime": 1141.1564, "eval_samples_per_second": 1.983, "eval_steps_per_second": 0.496, "step": 9500 }, { "epoch": 4.244031830238727, "grad_norm": 5.275352954864502, "learning_rate": 9.618428063943162e-06, "loss": 0.9869, "step": 9600 }, { "epoch": 4.288240495137047, "grad_norm": 5.318814277648926, "learning_rate": 9.613987566607461e-06, "loss": 0.9837, "step": 9700 }, { "epoch": 4.332449160035367, "grad_norm": 4.683131217956543, "learning_rate": 9.609547069271759e-06, "loss": 1.0033, "step": 9800 }, { "epoch": 4.376657824933687, "grad_norm": 5.217209339141846, "learning_rate": 9.605106571936059e-06, "loss": 0.9797, "step": 9900 }, { "epoch": 4.420866489832007, "grad_norm": 4.965491771697998, "learning_rate": 9.600666074600355e-06, "loss": 0.9676, "step": 10000 }, { "epoch": 4.420866489832007, "eval_bleu": 15.69531458571486, "eval_char_accuracy": 12.86142573771233, "eval_loss": 0.8761848211288452, "eval_runtime": 1222.4474, "eval_samples_per_second": 1.851, "eval_steps_per_second": 0.463, "step": 10000 }, { "epoch": 4.465075154730327, "grad_norm": 5.212737560272217, "learning_rate": 9.596225577264654e-06, "loss": 0.9805, "step": 10100 }, { "epoch": 4.5092838196286475, "grad_norm": 4.708688735961914, "learning_rate": 9.591785079928952e-06, "loss": 0.9562, "step": 10200 }, { "epoch": 4.553492484526967, "grad_norm": 5.798556327819824, "learning_rate": 9.587344582593252e-06, "loss": 0.9515, "step": 10300 }, { "epoch": 4.597701149425287, "grad_norm": 5.048225402832031, "learning_rate": 9.58290408525755e-06, "loss": 0.9641, "step": 10400 }, { "epoch": 4.641909814323608, "grad_norm": 5.326527118682861, "learning_rate": 9.578463587921847e-06, "loss": 0.9438, "step": 10500 }, { "epoch": 4.641909814323608, "eval_bleu": 16.565099319963913, "eval_char_accuracy": 12.801336184894907, "eval_loss": 0.8577448725700378, "eval_runtime": 1223.7218, "eval_samples_per_second": 1.849, "eval_steps_per_second": 0.463, "step": 10500 }, { "epoch": 4.6861184792219275, "grad_norm": 4.620062828063965, "learning_rate": 9.574023090586147e-06, "loss": 0.9324, "step": 10600 }, { "epoch": 4.730327144120247, "grad_norm": 5.602158546447754, "learning_rate": 9.569582593250444e-06, "loss": 0.9399, "step": 10700 }, { "epoch": 4.774535809018568, "grad_norm": 5.920672416687012, "learning_rate": 9.565142095914744e-06, "loss": 0.9628, "step": 10800 }, { "epoch": 4.818744473916888, "grad_norm": 6.023122787475586, "learning_rate": 9.560701598579042e-06, "loss": 0.9365, "step": 10900 }, { "epoch": 4.862953138815207, "grad_norm": 4.93074893951416, "learning_rate": 9.55626110124334e-06, "loss": 0.9336, "step": 11000 }, { "epoch": 4.862953138815207, "eval_bleu": 16.96890615437053, "eval_char_accuracy": 12.860133489264648, "eval_loss": 0.8414692282676697, "eval_runtime": 1163.3819, "eval_samples_per_second": 1.945, "eval_steps_per_second": 0.487, "step": 11000 }, { "epoch": 4.907161803713528, "grad_norm": 4.739640235900879, "learning_rate": 9.551820603907639e-06, "loss": 0.9326, "step": 11100 }, { "epoch": 4.951370468611848, "grad_norm": 5.271428108215332, "learning_rate": 9.547380106571937e-06, "loss": 0.9364, "step": 11200 }, { "epoch": 4.995579133510168, "grad_norm": 5.060763835906982, "learning_rate": 9.542939609236235e-06, "loss": 0.9371, "step": 11300 }, { "epoch": 5.039787798408488, "grad_norm": 5.890202045440674, "learning_rate": 9.538499111900532e-06, "loss": 0.928, "step": 11400 }, { "epoch": 5.083996463306808, "grad_norm": 5.524294376373291, "learning_rate": 9.534058614564832e-06, "loss": 0.9049, "step": 11500 }, { "epoch": 5.083996463306808, "eval_bleu": 17.294611693602207, "eval_char_accuracy": 12.83525770664668, "eval_loss": 0.8323912620544434, "eval_runtime": 1278.8595, "eval_samples_per_second": 1.77, "eval_steps_per_second": 0.443, "step": 11500 }, { "epoch": 5.128205128205128, "grad_norm": 5.51794958114624, "learning_rate": 9.52961811722913e-06, "loss": 0.9036, "step": 11600 }, { "epoch": 5.172413793103448, "grad_norm": 4.87642240524292, "learning_rate": 9.52517761989343e-06, "loss": 0.9008, "step": 11700 }, { "epoch": 5.216622458001768, "grad_norm": 5.899691581726074, "learning_rate": 9.520737122557727e-06, "loss": 0.9028, "step": 11800 }, { "epoch": 5.260831122900089, "grad_norm": 4.414923191070557, "learning_rate": 9.516296625222025e-06, "loss": 0.9253, "step": 11900 }, { "epoch": 5.305039787798409, "grad_norm": 5.061226844787598, "learning_rate": 9.511856127886324e-06, "loss": 0.8959, "step": 12000 }, { "epoch": 5.305039787798409, "eval_bleu": 17.916342701783332, "eval_char_accuracy": 12.91408486195556, "eval_loss": 0.8161113262176514, "eval_runtime": 1221.2267, "eval_samples_per_second": 1.853, "eval_steps_per_second": 0.463, "step": 12000 }, { "epoch": 5.349248452696728, "grad_norm": 4.727482318878174, "learning_rate": 9.50746003552398e-06, "loss": 0.8969, "step": 12100 }, { "epoch": 5.393457117595049, "grad_norm": 5.119187355041504, "learning_rate": 9.50301953818828e-06, "loss": 0.8876, "step": 12200 }, { "epoch": 5.437665782493369, "grad_norm": 6.153038024902344, "learning_rate": 9.498579040852575e-06, "loss": 0.884, "step": 12300 }, { "epoch": 5.4818744473916885, "grad_norm": 4.792954444885254, "learning_rate": 9.494138543516875e-06, "loss": 0.9114, "step": 12400 }, { "epoch": 5.526083112290009, "grad_norm": 5.180963516235352, "learning_rate": 9.489698046181173e-06, "loss": 0.9043, "step": 12500 }, { "epoch": 5.526083112290009, "eval_bleu": 18.15402344379284, "eval_char_accuracy": 13.32631211676757, "eval_loss": 0.8073405623435974, "eval_runtime": 1314.4294, "eval_samples_per_second": 1.722, "eval_steps_per_second": 0.431, "step": 12500 }, { "epoch": 5.570291777188329, "grad_norm": 4.507122039794922, "learning_rate": 9.48525754884547e-06, "loss": 0.88, "step": 12600 }, { "epoch": 5.614500442086649, "grad_norm": 5.46723747253418, "learning_rate": 9.48081705150977e-06, "loss": 0.8821, "step": 12700 }, { "epoch": 5.658709106984969, "grad_norm": 3.8277740478515625, "learning_rate": 9.476376554174068e-06, "loss": 0.881, "step": 12800 }, { "epoch": 5.702917771883289, "grad_norm": 6.790221214294434, "learning_rate": 9.471936056838367e-06, "loss": 0.8677, "step": 12900 }, { "epoch": 5.747126436781609, "grad_norm": 5.373092174530029, "learning_rate": 9.467495559502665e-06, "loss": 0.8828, "step": 13000 }, { "epoch": 5.747126436781609, "eval_bleu": 18.918836184739146, "eval_char_accuracy": 13.315328004962234, "eval_loss": 0.7943041324615479, "eval_runtime": 1225.6765, "eval_samples_per_second": 1.846, "eval_steps_per_second": 0.462, "step": 13000 }, { "epoch": 5.7913351016799295, "grad_norm": 5.485777854919434, "learning_rate": 9.463055062166965e-06, "loss": 0.8839, "step": 13100 }, { "epoch": 5.835543766578249, "grad_norm": 4.476625442504883, "learning_rate": 9.458614564831262e-06, "loss": 0.8751, "step": 13200 }, { "epoch": 5.87975243147657, "grad_norm": 5.016860485076904, "learning_rate": 9.45417406749556e-06, "loss": 0.874, "step": 13300 }, { "epoch": 5.92396109637489, "grad_norm": 5.216977596282959, "learning_rate": 9.44973357015986e-06, "loss": 0.8742, "step": 13400 }, { "epoch": 5.968169761273209, "grad_norm": 5.697312355041504, "learning_rate": 9.445293072824156e-06, "loss": 0.8663, "step": 13500 }, { "epoch": 5.968169761273209, "eval_bleu": 19.262115723990707, "eval_char_accuracy": 13.577331377730683, "eval_loss": 0.7794021368026733, "eval_runtime": 1237.7528, "eval_samples_per_second": 1.828, "eval_steps_per_second": 0.457, "step": 13500 }, { "epoch": 6.012378426171529, "grad_norm": 6.139482021331787, "learning_rate": 9.440852575488455e-06, "loss": 0.8541, "step": 13600 }, { "epoch": 6.05658709106985, "grad_norm": 6.129231929779053, "learning_rate": 9.436412078152753e-06, "loss": 0.8633, "step": 13700 }, { "epoch": 6.10079575596817, "grad_norm": 5.568439960479736, "learning_rate": 9.431971580817053e-06, "loss": 0.8642, "step": 13800 }, { "epoch": 6.14500442086649, "grad_norm": 3.72407603263855, "learning_rate": 9.42753108348135e-06, "loss": 0.8551, "step": 13900 }, { "epoch": 6.18921308576481, "grad_norm": 4.565580368041992, "learning_rate": 9.423090586145648e-06, "loss": 0.8339, "step": 14000 }, { "epoch": 6.18921308576481, "eval_bleu": 19.714087575830415, "eval_char_accuracy": 13.603499408796335, "eval_loss": 0.773350715637207, "eval_runtime": 1194.9872, "eval_samples_per_second": 1.894, "eval_steps_per_second": 0.474, "step": 14000 }, { "epoch": 6.23342175066313, "grad_norm": 4.000091552734375, "learning_rate": 9.418694493783305e-06, "loss": 0.8236, "step": 14100 }, { "epoch": 6.27763041556145, "grad_norm": 5.196188449859619, "learning_rate": 9.414253996447603e-06, "loss": 0.8418, "step": 14200 }, { "epoch": 6.32183908045977, "grad_norm": 4.242846488952637, "learning_rate": 9.409813499111902e-06, "loss": 0.8479, "step": 14300 }, { "epoch": 6.36604774535809, "grad_norm": 4.858129024505615, "learning_rate": 9.4053730017762e-06, "loss": 0.8574, "step": 14400 }, { "epoch": 6.410256410256411, "grad_norm": 5.038287162780762, "learning_rate": 9.400932504440498e-06, "loss": 0.8342, "step": 14500 }, { "epoch": 6.410256410256411, "eval_bleu": 20.043456229638707, "eval_char_accuracy": 13.644528297010384, "eval_loss": 0.7611100673675537, "eval_runtime": 1184.4791, "eval_samples_per_second": 1.911, "eval_steps_per_second": 0.478, "step": 14500 }, { "epoch": 6.45446507515473, "grad_norm": 5.464291095733643, "learning_rate": 9.396492007104796e-06, "loss": 0.824, "step": 14600 }, { "epoch": 6.49867374005305, "grad_norm": 5.205833435058594, "learning_rate": 9.392051509769095e-06, "loss": 0.8257, "step": 14700 }, { "epoch": 6.542882404951371, "grad_norm": 5.267154693603516, "learning_rate": 9.387611012433393e-06, "loss": 0.8266, "step": 14800 }, { "epoch": 6.5870910698496905, "grad_norm": 5.024267673492432, "learning_rate": 9.383170515097691e-06, "loss": 0.8244, "step": 14900 }, { "epoch": 6.63129973474801, "grad_norm": 6.4435601234436035, "learning_rate": 9.37873001776199e-06, "loss": 0.8424, "step": 15000 }, { "epoch": 6.63129973474801, "eval_bleu": 20.615410170391435, "eval_char_accuracy": 13.852257234975996, "eval_loss": 0.7521194219589233, "eval_runtime": 1203.1153, "eval_samples_per_second": 1.881, "eval_steps_per_second": 0.47, "step": 15000 }, { "epoch": 6.675508399646331, "grad_norm": 6.099647045135498, "learning_rate": 9.374289520426288e-06, "loss": 0.8348, "step": 15100 }, { "epoch": 6.719717064544651, "grad_norm": 5.707744598388672, "learning_rate": 9.369849023090588e-06, "loss": 0.8232, "step": 15200 }, { "epoch": 6.76392572944297, "grad_norm": 5.657838821411133, "learning_rate": 9.365408525754886e-06, "loss": 0.8219, "step": 15300 }, { "epoch": 6.808134394341291, "grad_norm": 5.4049506187438965, "learning_rate": 9.360968028419183e-06, "loss": 0.8247, "step": 15400 }, { "epoch": 6.852343059239611, "grad_norm": 4.4286088943481445, "learning_rate": 9.356527531083483e-06, "loss": 0.8285, "step": 15500 }, { "epoch": 6.852343059239611, "eval_bleu": 20.858850417596773, "eval_char_accuracy": 13.96112916669359, "eval_loss": 0.7454356551170349, "eval_runtime": 1198.6235, "eval_samples_per_second": 1.888, "eval_steps_per_second": 0.472, "step": 15500 }, { "epoch": 6.896551724137931, "grad_norm": 5.1050004959106445, "learning_rate": 9.35208703374778e-06, "loss": 0.8268, "step": 15600 }, { "epoch": 6.940760389036251, "grad_norm": 5.541866302490234, "learning_rate": 9.34764653641208e-06, "loss": 0.8082, "step": 15700 }, { "epoch": 6.984969053934571, "grad_norm": 5.256848335266113, "learning_rate": 9.343206039076376e-06, "loss": 0.8161, "step": 15800 }, { "epoch": 7.029177718832892, "grad_norm": 4.822115898132324, "learning_rate": 9.338765541740676e-06, "loss": 0.797, "step": 15900 }, { "epoch": 7.073386383731211, "grad_norm": 5.608044624328613, "learning_rate": 9.334325044404974e-06, "loss": 0.8203, "step": 16000 }, { "epoch": 7.073386383731211, "eval_bleu": 21.19771007109446, "eval_char_accuracy": 13.90782391822652, "eval_loss": 0.7372470498085022, "eval_runtime": 1148.4214, "eval_samples_per_second": 1.971, "eval_steps_per_second": 0.493, "step": 16000 }, { "epoch": 7.117595048629531, "grad_norm": 4.8734822273254395, "learning_rate": 9.329928952042629e-06, "loss": 0.8003, "step": 16100 }, { "epoch": 7.161803713527852, "grad_norm": 5.697593688964844, "learning_rate": 9.325488454706928e-06, "loss": 0.7959, "step": 16200 }, { "epoch": 7.206012378426172, "grad_norm": 4.409221172332764, "learning_rate": 9.321047957371226e-06, "loss": 0.7931, "step": 16300 }, { "epoch": 7.250221043324491, "grad_norm": 3.9396543502807617, "learning_rate": 9.316607460035526e-06, "loss": 0.8081, "step": 16400 }, { "epoch": 7.294429708222812, "grad_norm": 4.0121684074401855, "learning_rate": 9.312166962699823e-06, "loss": 0.7966, "step": 16500 }, { "epoch": 7.294429708222812, "eval_bleu": 21.597191065507623, "eval_char_accuracy": 14.009265421369912, "eval_loss": 0.7275057435035706, "eval_runtime": 1160.1006, "eval_samples_per_second": 1.951, "eval_steps_per_second": 0.488, "step": 16500 }, { "epoch": 7.338638373121132, "grad_norm": 4.6430463790893555, "learning_rate": 9.307726465364121e-06, "loss": 0.7973, "step": 16600 }, { "epoch": 7.3828470380194515, "grad_norm": 3.840611219406128, "learning_rate": 9.30328596802842e-06, "loss": 0.802, "step": 16700 }, { "epoch": 7.427055702917772, "grad_norm": 4.910470485687256, "learning_rate": 9.298845470692719e-06, "loss": 0.7806, "step": 16800 }, { "epoch": 7.471264367816092, "grad_norm": 4.984016418457031, "learning_rate": 9.294404973357016e-06, "loss": 0.7974, "step": 16900 }, { "epoch": 7.515473032714412, "grad_norm": 4.00908899307251, "learning_rate": 9.289964476021314e-06, "loss": 0.7603, "step": 17000 }, { "epoch": 7.515473032714412, "eval_bleu": 22.037363701432863, "eval_char_accuracy": 14.14915131583198, "eval_loss": 0.720319926738739, "eval_runtime": 1174.9005, "eval_samples_per_second": 1.926, "eval_steps_per_second": 0.482, "step": 17000 }, { "epoch": 7.559681697612732, "grad_norm": 4.46291446685791, "learning_rate": 9.285523978685614e-06, "loss": 0.8005, "step": 17100 }, { "epoch": 7.603890362511052, "grad_norm": 5.122077465057373, "learning_rate": 9.281083481349911e-06, "loss": 0.7707, "step": 17200 }, { "epoch": 7.648099027409372, "grad_norm": 4.963642597198486, "learning_rate": 9.276642984014211e-06, "loss": 0.8, "step": 17300 }, { "epoch": 7.6923076923076925, "grad_norm": 4.689663410186768, "learning_rate": 9.272202486678509e-06, "loss": 0.7793, "step": 17400 }, { "epoch": 7.736516357206012, "grad_norm": 3.97963809967041, "learning_rate": 9.267761989342807e-06, "loss": 0.7752, "step": 17500 }, { "epoch": 7.736516357206012, "eval_bleu": 22.13474948629954, "eval_char_accuracy": 14.175642409009557, "eval_loss": 0.7108041048049927, "eval_runtime": 1164.3154, "eval_samples_per_second": 1.944, "eval_steps_per_second": 0.486, "step": 17500 }, { "epoch": 7.780725022104333, "grad_norm": 4.921281814575195, "learning_rate": 9.263321492007106e-06, "loss": 0.7841, "step": 17600 }, { "epoch": 7.824933687002653, "grad_norm": 5.92393684387207, "learning_rate": 9.258880994671404e-06, "loss": 0.7847, "step": 17700 }, { "epoch": 7.869142351900972, "grad_norm": 4.8740644454956055, "learning_rate": 9.254440497335703e-06, "loss": 0.7515, "step": 17800 }, { "epoch": 7.913351016799293, "grad_norm": 4.831662178039551, "learning_rate": 9.250000000000001e-06, "loss": 0.7687, "step": 17900 }, { "epoch": 7.957559681697613, "grad_norm": 4.552217960357666, "learning_rate": 9.245559502664299e-06, "loss": 0.784, "step": 18000 }, { "epoch": 7.957559681697613, "eval_bleu": 22.510864155757993, "eval_char_accuracy": 14.26900735935491, "eval_loss": 0.7021002173423767, "eval_runtime": 1171.7338, "eval_samples_per_second": 1.931, "eval_steps_per_second": 0.483, "step": 18000 }, { "epoch": 8.001768346595933, "grad_norm": 6.623974800109863, "learning_rate": 9.241163410301954e-06, "loss": 0.7428, "step": 18100 }, { "epoch": 8.045977011494253, "grad_norm": 4.071683406829834, "learning_rate": 9.236722912966252e-06, "loss": 0.7523, "step": 18200 }, { "epoch": 8.090185676392572, "grad_norm": 4.206130504608154, "learning_rate": 9.232282415630552e-06, "loss": 0.7608, "step": 18300 }, { "epoch": 8.134394341290893, "grad_norm": 4.29170560836792, "learning_rate": 9.22784191829485e-06, "loss": 0.7538, "step": 18400 }, { "epoch": 8.178603006189213, "grad_norm": 5.72777795791626, "learning_rate": 9.223401420959149e-06, "loss": 0.7571, "step": 18500 }, { "epoch": 8.178603006189213, "eval_bleu": 22.71644669334606, "eval_char_accuracy": 14.289360272405974, "eval_loss": 0.6967329978942871, "eval_runtime": 1151.7127, "eval_samples_per_second": 1.965, "eval_steps_per_second": 0.491, "step": 18500 }, { "epoch": 8.222811671087532, "grad_norm": 4.11366605758667, "learning_rate": 9.218960923623447e-06, "loss": 0.7632, "step": 18600 }, { "epoch": 8.267020335985853, "grad_norm": 4.444884300231934, "learning_rate": 9.214520426287745e-06, "loss": 0.7452, "step": 18700 }, { "epoch": 8.311229000884174, "grad_norm": 4.654515743255615, "learning_rate": 9.210079928952044e-06, "loss": 0.7538, "step": 18800 }, { "epoch": 8.355437665782492, "grad_norm": 4.036041259765625, "learning_rate": 9.205639431616342e-06, "loss": 0.7597, "step": 18900 }, { "epoch": 8.399646330680813, "grad_norm": 4.778904914855957, "learning_rate": 9.201198934280641e-06, "loss": 0.7602, "step": 19000 }, { "epoch": 8.399646330680813, "eval_bleu": 23.184253769517653, "eval_char_accuracy": 14.43506128488263, "eval_loss": 0.6898081302642822, "eval_runtime": 1151.4129, "eval_samples_per_second": 1.965, "eval_steps_per_second": 0.492, "step": 19000 }, { "epoch": 8.443854995579134, "grad_norm": 4.703603744506836, "learning_rate": 9.196758436944937e-06, "loss": 0.7289, "step": 19100 }, { "epoch": 8.488063660477454, "grad_norm": 5.273664474487305, "learning_rate": 9.192317939609237e-06, "loss": 0.7457, "step": 19200 }, { "epoch": 8.532272325375773, "grad_norm": 6.892702102661133, "learning_rate": 9.187877442273535e-06, "loss": 0.7535, "step": 19300 }, { "epoch": 8.576480990274094, "grad_norm": 4.708390235900879, "learning_rate": 9.183436944937834e-06, "loss": 0.7484, "step": 19400 }, { "epoch": 8.620689655172415, "grad_norm": 4.149359226226807, "learning_rate": 9.178996447602132e-06, "loss": 0.7412, "step": 19500 }, { "epoch": 8.620689655172415, "eval_bleu": 23.22985023871008, "eval_char_accuracy": 14.53101073212336, "eval_loss": 0.6817213892936707, "eval_runtime": 1193.2506, "eval_samples_per_second": 1.897, "eval_steps_per_second": 0.474, "step": 19500 }, { "epoch": 8.664898320070733, "grad_norm": 4.802102088928223, "learning_rate": 9.17455595026643e-06, "loss": 0.7396, "step": 19600 }, { "epoch": 8.709106984969054, "grad_norm": 4.649740219116211, "learning_rate": 9.17011545293073e-06, "loss": 0.7356, "step": 19700 }, { "epoch": 8.753315649867375, "grad_norm": 4.58019495010376, "learning_rate": 9.165674955595027e-06, "loss": 0.7406, "step": 19800 }, { "epoch": 8.797524314765694, "grad_norm": 4.4533467292785645, "learning_rate": 9.161234458259327e-06, "loss": 0.7576, "step": 19900 }, { "epoch": 8.841732979664014, "grad_norm": 3.671436309814453, "learning_rate": 9.156793960923624e-06, "loss": 0.7492, "step": 20000 }, { "epoch": 8.841732979664014, "eval_bleu": 23.705592308275204, "eval_char_accuracy": 14.807551899928281, "eval_loss": 0.6738815903663635, "eval_runtime": 1161.4533, "eval_samples_per_second": 1.948, "eval_steps_per_second": 0.487, "step": 20000 }, { "epoch": 8.885941644562335, "grad_norm": 3.7899646759033203, "learning_rate": 9.152353463587922e-06, "loss": 0.7202, "step": 20100 }, { "epoch": 8.930150309460654, "grad_norm": 4.754965782165527, "learning_rate": 9.147957371225578e-06, "loss": 0.7218, "step": 20200 }, { "epoch": 8.974358974358974, "grad_norm": 4.5088911056518555, "learning_rate": 9.143516873889875e-06, "loss": 0.7483, "step": 20300 }, { "epoch": 9.018567639257295, "grad_norm": 4.284600734710693, "learning_rate": 9.139076376554175e-06, "loss": 0.7555, "step": 20400 }, { "epoch": 9.062776304155614, "grad_norm": 4.548980236053467, "learning_rate": 9.134635879218473e-06, "loss": 0.7229, "step": 20500 }, { "epoch": 9.062776304155614, "eval_bleu": 23.971806431263627, "eval_char_accuracy": 15.066970775801355, "eval_loss": 0.6719352006912231, "eval_runtime": 1187.4295, "eval_samples_per_second": 1.906, "eval_steps_per_second": 0.477, "step": 20500 }, { "epoch": 9.106984969053935, "grad_norm": 3.9492175579071045, "learning_rate": 9.130195381882772e-06, "loss": 0.7114, "step": 20600 }, { "epoch": 9.151193633952255, "grad_norm": 4.431577682495117, "learning_rate": 9.12575488454707e-06, "loss": 0.7237, "step": 20700 }, { "epoch": 9.195402298850574, "grad_norm": 5.588457107543945, "learning_rate": 9.121314387211368e-06, "loss": 0.7072, "step": 20800 }, { "epoch": 9.239610963748895, "grad_norm": 4.083963871002197, "learning_rate": 9.116873889875667e-06, "loss": 0.709, "step": 20900 }, { "epoch": 9.283819628647215, "grad_norm": 4.946596145629883, "learning_rate": 9.112433392539965e-06, "loss": 0.7189, "step": 21000 }, { "epoch": 9.283819628647215, "eval_bleu": 24.0525573328491, "eval_char_accuracy": 14.789137359548747, "eval_loss": 0.6652024388313293, "eval_runtime": 1170.2153, "eval_samples_per_second": 1.934, "eval_steps_per_second": 0.484, "step": 21000 }, { "epoch": 9.328028293545534, "grad_norm": 5.005494594573975, "learning_rate": 9.107992895204265e-06, "loss": 0.7038, "step": 21100 }, { "epoch": 9.372236958443855, "grad_norm": 4.802905559539795, "learning_rate": 9.103552397868562e-06, "loss": 0.7237, "step": 21200 }, { "epoch": 9.416445623342176, "grad_norm": 5.240441799163818, "learning_rate": 9.09911190053286e-06, "loss": 0.7182, "step": 21300 }, { "epoch": 9.460654288240494, "grad_norm": 4.5221452713012695, "learning_rate": 9.094671403197158e-06, "loss": 0.7202, "step": 21400 }, { "epoch": 9.504862953138815, "grad_norm": 5.610441207885742, "learning_rate": 9.090230905861457e-06, "loss": 0.7181, "step": 21500 }, { "epoch": 9.504862953138815, "eval_bleu": 24.34401492446839, "eval_char_accuracy": 14.892194173251749, "eval_loss": 0.6596232056617737, "eval_runtime": 1167.6698, "eval_samples_per_second": 1.938, "eval_steps_per_second": 0.485, "step": 21500 }, { "epoch": 9.549071618037136, "grad_norm": 4.156918048858643, "learning_rate": 9.085790408525755e-06, "loss": 0.6997, "step": 21600 }, { "epoch": 9.593280282935455, "grad_norm": 4.7859392166137695, "learning_rate": 9.081349911190053e-06, "loss": 0.705, "step": 21700 }, { "epoch": 9.637488947833775, "grad_norm": 4.572556972503662, "learning_rate": 9.076909413854353e-06, "loss": 0.704, "step": 21800 }, { "epoch": 9.681697612732096, "grad_norm": 4.419326305389404, "learning_rate": 9.07246891651865e-06, "loss": 0.7244, "step": 21900 }, { "epoch": 9.725906277630415, "grad_norm": 6.216968059539795, "learning_rate": 9.06802841918295e-06, "loss": 0.7222, "step": 22000 }, { "epoch": 9.725906277630415, "eval_bleu": 24.86242485342125, "eval_char_accuracy": 14.812074769495185, "eval_loss": 0.6533311009407043, "eval_runtime": 1154.9104, "eval_samples_per_second": 1.959, "eval_steps_per_second": 0.49, "step": 22000 }, { "epoch": 9.770114942528735, "grad_norm": 4.113579750061035, "learning_rate": 9.063587921847248e-06, "loss": 0.7052, "step": 22100 }, { "epoch": 9.814323607427056, "grad_norm": 3.747920274734497, "learning_rate": 9.059191829484903e-06, "loss": 0.7171, "step": 22200 }, { "epoch": 9.858532272325375, "grad_norm": 4.183202743530273, "learning_rate": 9.054751332149202e-06, "loss": 0.7113, "step": 22300 }, { "epoch": 9.902740937223696, "grad_norm": 5.176643371582031, "learning_rate": 9.050310834813499e-06, "loss": 0.7136, "step": 22400 }, { "epoch": 9.946949602122016, "grad_norm": 4.606767177581787, "learning_rate": 9.045870337477798e-06, "loss": 0.709, "step": 22500 }, { "epoch": 9.946949602122016, "eval_bleu": 25.087229996759017, "eval_char_accuracy": 15.1228605211638, "eval_loss": 0.6474445462226868, "eval_runtime": 1161.1823, "eval_samples_per_second": 1.949, "eval_steps_per_second": 0.487, "step": 22500 }, { "epoch": 9.991158267020335, "grad_norm": 3.8971004486083984, "learning_rate": 9.041429840142096e-06, "loss": 0.6905, "step": 22600 }, { "epoch": 10.035366931918656, "grad_norm": 3.8808772563934326, "learning_rate": 9.036989342806395e-06, "loss": 0.7088, "step": 22700 }, { "epoch": 10.079575596816976, "grad_norm": 3.6594464778900146, "learning_rate": 9.032548845470693e-06, "loss": 0.6775, "step": 22800 }, { "epoch": 10.123784261715295, "grad_norm": 4.82035493850708, "learning_rate": 9.028108348134991e-06, "loss": 0.6816, "step": 22900 }, { "epoch": 10.167992926613616, "grad_norm": 5.821552276611328, "learning_rate": 9.02366785079929e-06, "loss": 0.7089, "step": 23000 }, { "epoch": 10.167992926613616, "eval_bleu": 25.151932394216136, "eval_char_accuracy": 15.101538421776972, "eval_loss": 0.6455413699150085, "eval_runtime": 1140.3678, "eval_samples_per_second": 1.984, "eval_steps_per_second": 0.496, "step": 23000 }, { "epoch": 10.212201591511937, "grad_norm": 5.743397235870361, "learning_rate": 9.019227353463588e-06, "loss": 0.6966, "step": 23100 }, { "epoch": 10.256410256410255, "grad_norm": 4.763890266418457, "learning_rate": 9.014786856127888e-06, "loss": 0.6778, "step": 23200 }, { "epoch": 10.300618921308576, "grad_norm": 4.760409832000732, "learning_rate": 9.010346358792186e-06, "loss": 0.6773, "step": 23300 }, { "epoch": 10.344827586206897, "grad_norm": 5.199061393737793, "learning_rate": 9.005905861456483e-06, "loss": 0.6649, "step": 23400 }, { "epoch": 10.389036251105217, "grad_norm": 4.921041011810303, "learning_rate": 9.001465364120783e-06, "loss": 0.7005, "step": 23500 }, { "epoch": 10.389036251105217, "eval_bleu": 25.370282919525533, "eval_char_accuracy": 15.11510703047768, "eval_loss": 0.636418342590332, "eval_runtime": 1183.3386, "eval_samples_per_second": 1.912, "eval_steps_per_second": 0.478, "step": 23500 }, { "epoch": 10.433244916003536, "grad_norm": 5.0500874519348145, "learning_rate": 8.99702486678508e-06, "loss": 0.6939, "step": 23600 }, { "epoch": 10.477453580901857, "grad_norm": 4.551395893096924, "learning_rate": 8.992584369449379e-06, "loss": 0.6871, "step": 23700 }, { "epoch": 10.521662245800178, "grad_norm": 4.41419792175293, "learning_rate": 8.988143872113676e-06, "loss": 0.6912, "step": 23800 }, { "epoch": 10.565870910698496, "grad_norm": 4.153188705444336, "learning_rate": 8.983703374777976e-06, "loss": 0.6729, "step": 23900 }, { "epoch": 10.610079575596817, "grad_norm": 4.6854987144470215, "learning_rate": 8.979262877442274e-06, "loss": 0.6638, "step": 24000 }, { "epoch": 10.610079575596817, "eval_bleu": 25.76428121912705, "eval_char_accuracy": 15.263715601961634, "eval_loss": 0.6331145763397217, "eval_runtime": 1165.7617, "eval_samples_per_second": 1.941, "eval_steps_per_second": 0.486, "step": 24000 }, { "epoch": 10.654288240495138, "grad_norm": 4.550932884216309, "learning_rate": 8.974822380106573e-06, "loss": 0.6917, "step": 24100 }, { "epoch": 10.698496905393457, "grad_norm": 3.438528537750244, "learning_rate": 8.970381882770871e-06, "loss": 0.6769, "step": 24200 }, { "epoch": 10.742705570291777, "grad_norm": 5.151716232299805, "learning_rate": 8.965985790408526e-06, "loss": 0.6919, "step": 24300 }, { "epoch": 10.786914235190098, "grad_norm": 4.224579334259033, "learning_rate": 8.961545293072826e-06, "loss": 0.6836, "step": 24400 }, { "epoch": 10.831122900088417, "grad_norm": 4.475201606750488, "learning_rate": 8.957104795737124e-06, "loss": 0.6678, "step": 24500 }, { "epoch": 10.831122900088417, "eval_bleu": 25.915426080954088, "eval_char_accuracy": 15.355788303859299, "eval_loss": 0.6263041496276855, "eval_runtime": 1217.1755, "eval_samples_per_second": 1.859, "eval_steps_per_second": 0.465, "step": 24500 }, { "epoch": 10.875331564986737, "grad_norm": 4.936059474945068, "learning_rate": 8.952664298401421e-06, "loss": 0.6859, "step": 24600 }, { "epoch": 10.919540229885058, "grad_norm": 4.178312301635742, "learning_rate": 8.948223801065719e-06, "loss": 0.6848, "step": 24700 }, { "epoch": 10.963748894783377, "grad_norm": 4.409966468811035, "learning_rate": 8.943783303730019e-06, "loss": 0.6601, "step": 24800 }, { "epoch": 11.007957559681698, "grad_norm": 4.514906883239746, "learning_rate": 8.939342806394316e-06, "loss": 0.6607, "step": 24900 }, { "epoch": 11.052166224580018, "grad_norm": 4.496754169464111, "learning_rate": 8.934902309058614e-06, "loss": 0.6816, "step": 25000 }, { "epoch": 11.052166224580018, "eval_bleu": 26.197943745133696, "eval_char_accuracy": 15.380341024365343, "eval_loss": 0.6242961287498474, "eval_runtime": 1149.7114, "eval_samples_per_second": 1.968, "eval_steps_per_second": 0.492, "step": 25000 }, { "epoch": 11.096374889478337, "grad_norm": 5.08333158493042, "learning_rate": 8.930461811722914e-06, "loss": 0.6527, "step": 25100 }, { "epoch": 11.140583554376658, "grad_norm": 4.983675479888916, "learning_rate": 8.926021314387212e-06, "loss": 0.6653, "step": 25200 }, { "epoch": 11.184792219274978, "grad_norm": 3.601630926132202, "learning_rate": 8.921580817051511e-06, "loss": 0.6636, "step": 25300 }, { "epoch": 11.229000884173297, "grad_norm": 4.501917839050293, "learning_rate": 8.917140319715809e-06, "loss": 0.6526, "step": 25400 }, { "epoch": 11.273209549071618, "grad_norm": 5.131682872772217, "learning_rate": 8.912699822380108e-06, "loss": 0.6677, "step": 25500 }, { "epoch": 11.273209549071618, "eval_bleu": 26.250769263420604, "eval_char_accuracy": 15.740555279157972, "eval_loss": 0.6175717711448669, "eval_runtime": 1189.8087, "eval_samples_per_second": 1.902, "eval_steps_per_second": 0.476, "step": 25500 }, { "epoch": 11.317418213969939, "grad_norm": 4.299444675445557, "learning_rate": 8.908259325044406e-06, "loss": 0.6456, "step": 25600 }, { "epoch": 11.361626878868258, "grad_norm": 5.007399082183838, "learning_rate": 8.903818827708704e-06, "loss": 0.6561, "step": 25700 }, { "epoch": 11.405835543766578, "grad_norm": 3.582151174545288, "learning_rate": 8.899378330373003e-06, "loss": 0.6512, "step": 25800 }, { "epoch": 11.450044208664899, "grad_norm": 3.890277147293091, "learning_rate": 8.8949378330373e-06, "loss": 0.6507, "step": 25900 }, { "epoch": 11.494252873563218, "grad_norm": 3.8269190788269043, "learning_rate": 8.890497335701599e-06, "loss": 0.6611, "step": 26000 }, { "epoch": 11.494252873563218, "eval_bleu": 26.35429050723288, "eval_char_accuracy": 15.820351620802617, "eval_loss": 0.6139821410179138, "eval_runtime": 1186.1922, "eval_samples_per_second": 1.908, "eval_steps_per_second": 0.477, "step": 26000 }, { "epoch": 11.538461538461538, "grad_norm": 5.021885871887207, "learning_rate": 8.886056838365897e-06, "loss": 0.6617, "step": 26100 }, { "epoch": 11.582670203359859, "grad_norm": 4.4750142097473145, "learning_rate": 8.881616341030196e-06, "loss": 0.6535, "step": 26200 }, { "epoch": 11.626878868258178, "grad_norm": 4.10865592956543, "learning_rate": 8.877175843694494e-06, "loss": 0.6373, "step": 26300 }, { "epoch": 11.671087533156498, "grad_norm": 3.9609320163726807, "learning_rate": 8.87277975133215e-06, "loss": 0.6661, "step": 26400 }, { "epoch": 11.71529619805482, "grad_norm": 3.6800761222839355, "learning_rate": 8.868339253996449e-06, "loss": 0.6737, "step": 26500 }, { "epoch": 11.71529619805482, "eval_bleu": 26.71210227502706, "eval_char_accuracy": 15.76995393134284, "eval_loss": 0.6083924770355225, "eval_runtime": 1224.3025, "eval_samples_per_second": 1.848, "eval_steps_per_second": 0.462, "step": 26500 }, { "epoch": 11.759504862953138, "grad_norm": 3.5443811416625977, "learning_rate": 8.863898756660747e-06, "loss": 0.6446, "step": 26600 }, { "epoch": 11.803713527851459, "grad_norm": 3.9354844093322754, "learning_rate": 8.859458259325046e-06, "loss": 0.6503, "step": 26700 }, { "epoch": 11.84792219274978, "grad_norm": 4.526693344116211, "learning_rate": 8.855017761989344e-06, "loss": 0.6566, "step": 26800 }, { "epoch": 11.892130857648098, "grad_norm": 4.615814685821533, "learning_rate": 8.850577264653642e-06, "loss": 0.6502, "step": 26900 }, { "epoch": 11.936339522546419, "grad_norm": 5.460728645324707, "learning_rate": 8.84613676731794e-06, "loss": 0.6412, "step": 27000 }, { "epoch": 11.936339522546419, "eval_bleu": 27.117206449424618, "eval_char_accuracy": 15.925992931400993, "eval_loss": 0.606685221195221, "eval_runtime": 1222.8976, "eval_samples_per_second": 1.851, "eval_steps_per_second": 0.463, "step": 27000 }, { "epoch": 11.98054818744474, "grad_norm": 3.2317235469818115, "learning_rate": 8.84169626998224e-06, "loss": 0.6522, "step": 27100 }, { "epoch": 12.024756852343058, "grad_norm": 4.462329864501953, "learning_rate": 8.837255772646537e-06, "loss": 0.6305, "step": 27200 }, { "epoch": 12.068965517241379, "grad_norm": 4.46343469619751, "learning_rate": 8.832815275310835e-06, "loss": 0.6463, "step": 27300 }, { "epoch": 12.1131741821397, "grad_norm": 3.8137142658233643, "learning_rate": 8.828374777975134e-06, "loss": 0.6441, "step": 27400 }, { "epoch": 12.15738284703802, "grad_norm": 4.623500347137451, "learning_rate": 8.823934280639432e-06, "loss": 0.6546, "step": 27500 }, { "epoch": 12.15738284703802, "eval_bleu": 26.879696064806325, "eval_char_accuracy": 15.695326583488942, "eval_loss": 0.604811429977417, "eval_runtime": 1272.9217, "eval_samples_per_second": 1.778, "eval_steps_per_second": 0.445, "step": 27500 }, { "epoch": 12.20159151193634, "grad_norm": 4.691837787628174, "learning_rate": 8.819493783303732e-06, "loss": 0.627, "step": 27600 }, { "epoch": 12.24580017683466, "grad_norm": 3.8595967292785645, "learning_rate": 8.81505328596803e-06, "loss": 0.6359, "step": 27700 }, { "epoch": 12.29000884173298, "grad_norm": 4.391679286956787, "learning_rate": 8.810612788632327e-06, "loss": 0.6399, "step": 27800 }, { "epoch": 12.3342175066313, "grad_norm": 4.603471279144287, "learning_rate": 8.806172291296627e-06, "loss": 0.6246, "step": 27900 }, { "epoch": 12.37842617152962, "grad_norm": 4.229820251464844, "learning_rate": 8.801731793960925e-06, "loss": 0.632, "step": 28000 }, { "epoch": 12.37842617152962, "eval_bleu": 27.359804950978926, "eval_char_accuracy": 16.147613540179233, "eval_loss": 0.5997804999351501, "eval_runtime": 1169.0186, "eval_samples_per_second": 1.936, "eval_steps_per_second": 0.484, "step": 28000 }, { "epoch": 24.84526967285588, "grad_norm": 2.881751775741577, "learning_rate": 7.58385370205174e-06, "loss": 0.6249, "step": 28100 }, { "epoch": 24.93368700265252, "grad_norm": 3.160602569580078, "learning_rate": 7.574933095450491e-06, "loss": 0.6307, "step": 28200 }, { "epoch": 25.02210433244916, "grad_norm": 2.78442645072937, "learning_rate": 7.5660124888492425e-06, "loss": 0.6257, "step": 28300 }, { "epoch": 25.1105216622458, "grad_norm": 3.264493227005005, "learning_rate": 7.5570918822479935e-06, "loss": 0.6202, "step": 28400 }, { "epoch": 25.19893899204244, "grad_norm": 2.8973817825317383, "learning_rate": 7.548171275646745e-06, "loss": 0.615, "step": 28500 }, { "epoch": 25.19893899204244, "eval_bleu": 27.63506474619099, "eval_char_accuracy": 15.836181664286775, "eval_loss": 0.5920193195343018, "eval_runtime": 1166.2916, "eval_samples_per_second": 1.94, "eval_steps_per_second": 0.485, "step": 28500 }, { "epoch": 25.28735632183908, "grad_norm": 3.0750694274902344, "learning_rate": 7.5392506690454955e-06, "loss": 0.6107, "step": 28600 }, { "epoch": 25.375773651635722, "grad_norm": 3.805293083190918, "learning_rate": 7.5303300624442465e-06, "loss": 0.6148, "step": 28700 }, { "epoch": 25.46419098143236, "grad_norm": 3.0559804439544678, "learning_rate": 7.521409455842998e-06, "loss": 0.6115, "step": 28800 }, { "epoch": 25.552608311229, "grad_norm": 3.6253883838653564, "learning_rate": 7.512488849241749e-06, "loss": 0.6288, "step": 28900 }, { "epoch": 25.641025641025642, "grad_norm": 3.109668493270874, "learning_rate": 7.5035682426405e-06, "loss": 0.6304, "step": 29000 }, { "epoch": 25.641025641025642, "eval_bleu": 27.95070105367552, "eval_char_accuracy": 16.013865825843677, "eval_loss": 0.5859384536743164, "eval_runtime": 1167.2601, "eval_samples_per_second": 1.939, "eval_steps_per_second": 0.485, "step": 29000 }, { "epoch": 25.72944297082228, "grad_norm": 3.3268916606903076, "learning_rate": 7.494647636039251e-06, "loss": 0.6171, "step": 29100 }, { "epoch": 25.81786030061892, "grad_norm": 3.303433656692505, "learning_rate": 7.485727029438002e-06, "loss": 0.6173, "step": 29200 }, { "epoch": 25.906277630415563, "grad_norm": 3.157514810562134, "learning_rate": 7.476806422836753e-06, "loss": 0.6275, "step": 29300 }, { "epoch": 25.9946949602122, "grad_norm": 3.1818809509277344, "learning_rate": 7.467885816235505e-06, "loss": 0.6177, "step": 29400 }, { "epoch": 26.08311229000884, "grad_norm": 2.89052414894104, "learning_rate": 7.458965209634256e-06, "loss": 0.6207, "step": 29500 }, { "epoch": 26.08311229000884, "eval_bleu": 28.10992519261199, "eval_char_accuracy": 16.068786384870354, "eval_loss": 0.5828626751899719, "eval_runtime": 1192.0532, "eval_samples_per_second": 1.898, "eval_steps_per_second": 0.475, "step": 29500 }, { "epoch": 26.171529619805483, "grad_norm": 2.901235580444336, "learning_rate": 7.450044603033006e-06, "loss": 0.6098, "step": 29600 }, { "epoch": 26.25994694960212, "grad_norm": 3.0310065746307373, "learning_rate": 7.441123996431758e-06, "loss": 0.612, "step": 29700 }, { "epoch": 26.348364279398762, "grad_norm": 2.713298797607422, "learning_rate": 7.432203389830509e-06, "loss": 0.6072, "step": 29800 }, { "epoch": 26.436781609195403, "grad_norm": 2.767160654067993, "learning_rate": 7.42328278322926e-06, "loss": 0.602, "step": 29900 }, { "epoch": 26.52519893899204, "grad_norm": 2.705537796020508, "learning_rate": 7.414362176628012e-06, "loss": 0.593, "step": 30000 }, { "epoch": 26.52519893899204, "eval_bleu": 28.489369366652767, "eval_char_accuracy": 16.198657353862853, "eval_loss": 0.5803427696228027, "eval_runtime": 1172.366, "eval_samples_per_second": 1.93, "eval_steps_per_second": 0.483, "step": 30000 }, { "epoch": 26.613616268788682, "grad_norm": 2.6609673500061035, "learning_rate": 7.405441570026762e-06, "loss": 0.6088, "step": 30100 }, { "epoch": 26.702033598585324, "grad_norm": 2.791049003601074, "learning_rate": 7.396520963425513e-06, "loss": 0.6023, "step": 30200 }, { "epoch": 26.79045092838196, "grad_norm": 3.140176773071289, "learning_rate": 7.387600356824265e-06, "loss": 0.6017, "step": 30300 }, { "epoch": 26.878868258178603, "grad_norm": 2.9931647777557373, "learning_rate": 7.378768956289029e-06, "loss": 0.6024, "step": 30400 }, { "epoch": 26.967285587975244, "grad_norm": 4.075024127960205, "learning_rate": 7.369848349687779e-06, "loss": 0.5965, "step": 30500 }, { "epoch": 26.967285587975244, "eval_bleu": 28.79122843268101, "eval_char_accuracy": 16.25519322344914, "eval_loss": 0.5732487440109253, "eval_runtime": 1209.8195, "eval_samples_per_second": 1.871, "eval_steps_per_second": 0.468, "step": 30500 }, { "epoch": 27.05570291777188, "grad_norm": 3.099680185317993, "learning_rate": 7.36092774308653e-06, "loss": 0.593, "step": 30600 }, { "epoch": 27.144120247568523, "grad_norm": 3.1157305240631104, "learning_rate": 7.352007136485282e-06, "loss": 0.5946, "step": 30700 }, { "epoch": 27.232537577365164, "grad_norm": 3.04852557182312, "learning_rate": 7.343086529884033e-06, "loss": 0.5993, "step": 30800 }, { "epoch": 27.320954907161802, "grad_norm": 3.061809539794922, "learning_rate": 7.334165923282783e-06, "loss": 0.6005, "step": 30900 }, { "epoch": 27.409372236958443, "grad_norm": 2.8910889625549316, "learning_rate": 7.325245316681535e-06, "loss": 0.5913, "step": 31000 }, { "epoch": 27.409372236958443, "eval_bleu": 28.938039726178697, "eval_char_accuracy": 16.45419948439287, "eval_loss": 0.5735604166984558, "eval_runtime": 1241.1593, "eval_samples_per_second": 1.823, "eval_steps_per_second": 0.456, "step": 31000 }, { "epoch": 27.497789566755085, "grad_norm": 2.7394607067108154, "learning_rate": 7.316324710080286e-06, "loss": 0.6026, "step": 31100 }, { "epoch": 27.586206896551722, "grad_norm": 3.392388105392456, "learning_rate": 7.307404103479038e-06, "loss": 0.5924, "step": 31200 }, { "epoch": 27.674624226348364, "grad_norm": 3.1995959281921387, "learning_rate": 7.298483496877789e-06, "loss": 0.5901, "step": 31300 }, { "epoch": 27.763041556145005, "grad_norm": 2.861524820327759, "learning_rate": 7.289562890276539e-06, "loss": 0.5968, "step": 31400 }, { "epoch": 27.851458885941646, "grad_norm": 3.4911625385284424, "learning_rate": 7.280642283675291e-06, "loss": 0.5773, "step": 31500 }, { "epoch": 27.851458885941646, "eval_bleu": 29.039628428258037, "eval_char_accuracy": 16.476813832227382, "eval_loss": 0.5664685964584351, "eval_runtime": 1228.3935, "eval_samples_per_second": 1.842, "eval_steps_per_second": 0.461, "step": 31500 }, { "epoch": 27.939876215738284, "grad_norm": 2.8297295570373535, "learning_rate": 7.271721677074042e-06, "loss": 0.5998, "step": 31600 }, { "epoch": 28.028293545534925, "grad_norm": 2.918090581893921, "learning_rate": 7.262801070472793e-06, "loss": 0.5764, "step": 31700 }, { "epoch": 28.116710875331567, "grad_norm": 3.2439045906066895, "learning_rate": 7.2538804638715445e-06, "loss": 0.5902, "step": 31800 }, { "epoch": 28.205128205128204, "grad_norm": 2.908038854598999, "learning_rate": 7.2449598572702955e-06, "loss": 0.5677, "step": 31900 }, { "epoch": 28.293545534924846, "grad_norm": 2.714308023452759, "learning_rate": 7.236039250669046e-06, "loss": 0.5829, "step": 32000 }, { "epoch": 28.293545534924846, "eval_bleu": 29.286824185675332, "eval_char_accuracy": 16.39152543468007, "eval_loss": 0.5646906495094299, "eval_runtime": 1187.3897, "eval_samples_per_second": 1.906, "eval_steps_per_second": 0.477, "step": 32000 }, { "epoch": 28.381962864721487, "grad_norm": 3.1929683685302734, "learning_rate": 7.2271186440677975e-06, "loss": 0.5652, "step": 32100 }, { "epoch": 28.470380194518125, "grad_norm": 3.1815273761749268, "learning_rate": 7.2181980374665485e-06, "loss": 0.5863, "step": 32200 }, { "epoch": 28.558797524314766, "grad_norm": 2.6531717777252197, "learning_rate": 7.2092774308652995e-06, "loss": 0.575, "step": 32300 }, { "epoch": 28.647214854111407, "grad_norm": 3.3200690746307373, "learning_rate": 7.200356824264051e-06, "loss": 0.5812, "step": 32400 }, { "epoch": 28.735632183908045, "grad_norm": 3.211838722229004, "learning_rate": 7.191525423728814e-06, "loss": 0.5892, "step": 32500 }, { "epoch": 28.735632183908045, "eval_bleu": 29.548191226969195, "eval_char_accuracy": 16.650944310553147, "eval_loss": 0.5602275133132935, "eval_runtime": 1167.5468, "eval_samples_per_second": 1.938, "eval_steps_per_second": 0.485, "step": 32500 }, { "epoch": 28.824049513704686, "grad_norm": 2.816077947616577, "learning_rate": 7.182604817127565e-06, "loss": 0.5879, "step": 32600 }, { "epoch": 28.912466843501328, "grad_norm": 2.4974288940429688, "learning_rate": 7.173684210526316e-06, "loss": 0.5933, "step": 32700 }, { "epoch": 29.000884173297965, "grad_norm": 2.8034393787384033, "learning_rate": 7.164763603925068e-06, "loss": 0.5809, "step": 32800 }, { "epoch": 29.089301503094607, "grad_norm": 3.2486841678619385, "learning_rate": 7.155842997323818e-06, "loss": 0.5566, "step": 32900 }, { "epoch": 29.177718832891248, "grad_norm": 3.039702892303467, "learning_rate": 7.146922390722569e-06, "loss": 0.5758, "step": 33000 }, { "epoch": 29.177718832891248, "eval_bleu": 29.750314689195577, "eval_char_accuracy": 16.647713689433928, "eval_loss": 0.559512197971344, "eval_runtime": 1170.2717, "eval_samples_per_second": 1.934, "eval_steps_per_second": 0.484, "step": 33000 }, { "epoch": 29.266136162687886, "grad_norm": 2.6964523792266846, "learning_rate": 7.138001784121321e-06, "loss": 0.582, "step": 33100 }, { "epoch": 29.354553492484527, "grad_norm": 2.7977049350738525, "learning_rate": 7.129081177520072e-06, "loss": 0.5766, "step": 33200 }, { "epoch": 29.44297082228117, "grad_norm": 2.906003475189209, "learning_rate": 7.120160570918822e-06, "loss": 0.5665, "step": 33300 }, { "epoch": 29.531388152077806, "grad_norm": 2.39634108543396, "learning_rate": 7.111239964317574e-06, "loss": 0.5894, "step": 33400 }, { "epoch": 29.619805481874447, "grad_norm": 3.000608205795288, "learning_rate": 7.102319357716325e-06, "loss": 0.5548, "step": 33500 }, { "epoch": 29.619805481874447, "eval_bleu": 29.854475706201974, "eval_char_accuracy": 16.609269298115255, "eval_loss": 0.5555862784385681, "eval_runtime": 1181.3689, "eval_samples_per_second": 1.916, "eval_steps_per_second": 0.479, "step": 33500 }, { "epoch": 29.70822281167109, "grad_norm": 2.2232987880706787, "learning_rate": 7.093398751115076e-06, "loss": 0.5663, "step": 33600 }, { "epoch": 29.796640141467726, "grad_norm": 2.7053003311157227, "learning_rate": 7.084478144513828e-06, "loss": 0.582, "step": 33700 }, { "epoch": 29.885057471264368, "grad_norm": 2.741225481033325, "learning_rate": 7.075557537912578e-06, "loss": 0.5732, "step": 33800 }, { "epoch": 29.97347480106101, "grad_norm": 3.0870234966278076, "learning_rate": 7.066636931311329e-06, "loss": 0.5639, "step": 33900 }, { "epoch": 30.061892130857647, "grad_norm": 3.0533647537231445, "learning_rate": 7.057716324710081e-06, "loss": 0.5492, "step": 34000 }, { "epoch": 30.061892130857647, "eval_bleu": 30.108687084215376, "eval_char_accuracy": 16.658051677015422, "eval_loss": 0.551268458366394, "eval_runtime": 1235.9032, "eval_samples_per_second": 1.831, "eval_steps_per_second": 0.458, "step": 34000 }, { "epoch": 30.150309460654288, "grad_norm": 2.3661723136901855, "learning_rate": 7.048795718108832e-06, "loss": 0.5578, "step": 34100 }, { "epoch": 30.23872679045093, "grad_norm": 2.9469873905181885, "learning_rate": 7.039875111507584e-06, "loss": 0.5583, "step": 34200 }, { "epoch": 30.327144120247567, "grad_norm": 2.678952217102051, "learning_rate": 7.030954504906334e-06, "loss": 0.5464, "step": 34300 }, { "epoch": 30.41556145004421, "grad_norm": 3.1619069576263428, "learning_rate": 7.022033898305085e-06, "loss": 0.5521, "step": 34400 }, { "epoch": 30.50397877984085, "grad_norm": 2.292602300643921, "learning_rate": 7.013202497769849e-06, "loss": 0.5572, "step": 34500 }, { "epoch": 30.50397877984085, "eval_bleu": 30.365494070530787, "eval_char_accuracy": 16.97949847837745, "eval_loss": 0.5476405024528503, "eval_runtime": 1246.3763, "eval_samples_per_second": 1.816, "eval_steps_per_second": 0.454, "step": 34500 }, { "epoch": 30.592396109637487, "grad_norm": 3.0454201698303223, "learning_rate": 7.004281891168601e-06, "loss": 0.5707, "step": 34600 }, { "epoch": 30.68081343943413, "grad_norm": 2.8100173473358154, "learning_rate": 6.995361284567352e-06, "loss": 0.5556, "step": 34700 }, { "epoch": 30.76923076923077, "grad_norm": 3.1831016540527344, "learning_rate": 6.986440677966102e-06, "loss": 0.5672, "step": 34800 }, { "epoch": 30.857648099027408, "grad_norm": 2.431419610977173, "learning_rate": 6.977520071364854e-06, "loss": 0.5584, "step": 34900 }, { "epoch": 30.94606542882405, "grad_norm": 3.3351900577545166, "learning_rate": 6.968599464763605e-06, "loss": 0.5624, "step": 35000 }, { "epoch": 30.94606542882405, "eval_bleu": 30.539524601247965, "eval_char_accuracy": 16.909070937978534, "eval_loss": 0.5452527403831482, "eval_runtime": 1236.3807, "eval_samples_per_second": 1.83, "eval_steps_per_second": 0.458, "step": 35000 }, { "epoch": 31.03448275862069, "grad_norm": 3.0198135375976562, "learning_rate": 6.959678858162356e-06, "loss": 0.5552, "step": 35100 }, { "epoch": 31.122900088417328, "grad_norm": 2.9131996631622314, "learning_rate": 6.9507582515611075e-06, "loss": 0.5659, "step": 35200 }, { "epoch": 31.21131741821397, "grad_norm": 2.9061875343322754, "learning_rate": 6.941837644959858e-06, "loss": 0.5433, "step": 35300 }, { "epoch": 31.29973474801061, "grad_norm": 2.784221887588501, "learning_rate": 6.932917038358609e-06, "loss": 0.5368, "step": 35400 }, { "epoch": 31.38815207780725, "grad_norm": 3.2255430221557617, "learning_rate": 6.9239964317573605e-06, "loss": 0.5487, "step": 35500 }, { "epoch": 31.38815207780725, "eval_bleu": 30.650750659315953, "eval_char_accuracy": 17.46183021147646, "eval_loss": 0.5419927835464478, "eval_runtime": 1253.2408, "eval_samples_per_second": 1.806, "eval_steps_per_second": 0.452, "step": 35500 }, { "epoch": 31.47656940760389, "grad_norm": 2.7081570625305176, "learning_rate": 6.9150758251561115e-06, "loss": 0.547, "step": 35600 }, { "epoch": 31.56498673740053, "grad_norm": 3.0587680339813232, "learning_rate": 6.906155218554862e-06, "loss": 0.5375, "step": 35700 }, { "epoch": 31.653404067197172, "grad_norm": 3.988107681274414, "learning_rate": 6.8972346119536135e-06, "loss": 0.5662, "step": 35800 }, { "epoch": 31.74182139699381, "grad_norm": 2.761192798614502, "learning_rate": 6.8883140053523645e-06, "loss": 0.5511, "step": 35900 }, { "epoch": 31.83023872679045, "grad_norm": 3.2426974773406982, "learning_rate": 6.8793933987511155e-06, "loss": 0.5513, "step": 36000 }, { "epoch": 31.83023872679045, "eval_bleu": 30.80655740304501, "eval_char_accuracy": 17.068340559155903, "eval_loss": 0.5397204756736755, "eval_runtime": 1227.592, "eval_samples_per_second": 1.843, "eval_steps_per_second": 0.461, "step": 36000 }, { "epoch": 31.918656056587093, "grad_norm": 2.841747760772705, "learning_rate": 6.870472792149867e-06, "loss": 0.5494, "step": 36100 }, { "epoch": 32.00707338638373, "grad_norm": 2.91367244720459, "learning_rate": 6.8615521855486175e-06, "loss": 0.5437, "step": 36200 }, { "epoch": 32.09549071618037, "grad_norm": 2.6152751445770264, "learning_rate": 6.8526315789473685e-06, "loss": 0.5364, "step": 36300 }, { "epoch": 32.18390804597701, "grad_norm": 3.3306331634521484, "learning_rate": 6.84371097234612e-06, "loss": 0.5484, "step": 36400 }, { "epoch": 32.272325375773654, "grad_norm": 2.918839454650879, "learning_rate": 6.834879571810884e-06, "loss": 0.5403, "step": 36500 }, { "epoch": 32.272325375773654, "eval_bleu": 30.90212928374284, "eval_char_accuracy": 17.014066124353068, "eval_loss": 0.5365396738052368, "eval_runtime": 1212.5776, "eval_samples_per_second": 1.866, "eval_steps_per_second": 0.467, "step": 36500 }, { "epoch": 32.36074270557029, "grad_norm": 2.9582695960998535, "learning_rate": 6.825958965209634e-06, "loss": 0.5437, "step": 36600 }, { "epoch": 32.44916003536693, "grad_norm": 2.4590444564819336, "learning_rate": 6.817038358608385e-06, "loss": 0.5322, "step": 36700 }, { "epoch": 32.53757736516357, "grad_norm": 3.097669839859009, "learning_rate": 6.808117752007137e-06, "loss": 0.5522, "step": 36800 }, { "epoch": 32.62599469496021, "grad_norm": 3.6139161586761475, "learning_rate": 6.799197145405888e-06, "loss": 0.5482, "step": 36900 }, { "epoch": 32.714412024756854, "grad_norm": 2.6905388832092285, "learning_rate": 6.790276538804638e-06, "loss": 0.552, "step": 37000 }, { "epoch": 32.714412024756854, "eval_bleu": 31.095921116173244, "eval_char_accuracy": 16.863842242309506, "eval_loss": 0.5337512493133545, "eval_runtime": 1244.667, "eval_samples_per_second": 1.818, "eval_steps_per_second": 0.455, "step": 37000 }, { "epoch": 32.802829354553495, "grad_norm": 2.960695505142212, "learning_rate": 6.78135593220339e-06, "loss": 0.5371, "step": 37100 }, { "epoch": 32.89124668435013, "grad_norm": 2.452554702758789, "learning_rate": 6.772435325602141e-06, "loss": 0.5366, "step": 37200 }, { "epoch": 32.97966401414677, "grad_norm": 2.612982988357544, "learning_rate": 6.763514719000892e-06, "loss": 0.5338, "step": 37300 }, { "epoch": 33.06808134394341, "grad_norm": 2.8043036460876465, "learning_rate": 6.754594112399644e-06, "loss": 0.5387, "step": 37400 }, { "epoch": 33.15649867374005, "grad_norm": 3.143969774246216, "learning_rate": 6.745673505798395e-06, "loss": 0.5366, "step": 37500 }, { "epoch": 33.15649867374005, "eval_bleu": 31.331024045557687, "eval_char_accuracy": 17.135860540547526, "eval_loss": 0.5289241075515747, "eval_runtime": 1180.6163, "eval_samples_per_second": 1.917, "eval_steps_per_second": 0.479, "step": 37500 }, { "epoch": 16.622458001768347, "grad_norm": 4.168571949005127, "learning_rate": 8.375666074600355e-06, "loss": 0.5358, "step": 37600 }, { "epoch": 16.666666666666668, "grad_norm": 4.212833404541016, "learning_rate": 8.371225577264655e-06, "loss": 0.5383, "step": 37700 }, { "epoch": 16.710875331564985, "grad_norm": 4.949775218963623, "learning_rate": 8.366785079928953e-06, "loss": 0.5236, "step": 37800 }, { "epoch": 16.755083996463306, "grad_norm": 4.116398811340332, "learning_rate": 8.362344582593252e-06, "loss": 0.5608, "step": 37900 }, { "epoch": 16.799292661361626, "grad_norm": 3.553568124771118, "learning_rate": 8.35790408525755e-06, "loss": 0.536, "step": 38000 }, { "epoch": 16.799292661361626, "eval_bleu": 31.34562539257945, "eval_char_accuracy": 17.162674695837023, "eval_loss": 0.5315833687782288, "eval_runtime": 6444.9524, "eval_samples_per_second": 0.351, "eval_steps_per_second": 0.088, "step": 38000 }, { "epoch": 16.843501326259947, "grad_norm": 3.927187442779541, "learning_rate": 8.353463587921848e-06, "loss": 0.5325, "step": 38100 }, { "epoch": 16.887709991158268, "grad_norm": 4.272421360015869, "learning_rate": 8.349023090586146e-06, "loss": 0.5529, "step": 38200 }, { "epoch": 16.931918656056588, "grad_norm": 4.071782112121582, "learning_rate": 8.344582593250443e-06, "loss": 0.5237, "step": 38300 }, { "epoch": 16.97612732095491, "grad_norm": 4.790865898132324, "learning_rate": 8.340142095914743e-06, "loss": 0.5314, "step": 38400 }, { "epoch": 17.020335985853226, "grad_norm": 4.098753452301025, "learning_rate": 8.33570159857904e-06, "loss": 0.5383, "step": 38500 }, { "epoch": 17.020335985853226, "eval_bleu": 31.10960052484551, "eval_char_accuracy": 17.275423372897674, "eval_loss": 0.5304023623466492, "eval_runtime": 3895.5967, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.073, "step": 38500 }, { "epoch": 17.064544650751547, "grad_norm": 3.7190957069396973, "learning_rate": 8.33126110124334e-06, "loss": 0.5287, "step": 38600 }, { "epoch": 17.108753315649867, "grad_norm": 3.1610472202301025, "learning_rate": 8.326820603907638e-06, "loss": 0.5162, "step": 38700 }, { "epoch": 17.152961980548188, "grad_norm": 3.6475327014923096, "learning_rate": 8.322380106571936e-06, "loss": 0.5482, "step": 38800 }, { "epoch": 17.19717064544651, "grad_norm": 3.818483829498291, "learning_rate": 8.317939609236235e-06, "loss": 0.5241, "step": 38900 }, { "epoch": 17.24137931034483, "grad_norm": 4.884592056274414, "learning_rate": 8.313499111900533e-06, "loss": 0.5196, "step": 39000 }, { "epoch": 17.24137931034483, "eval_bleu": 31.1896879356928, "eval_char_accuracy": 17.38138774560797, "eval_loss": 0.5308491587638855, "eval_runtime": 3774.7192, "eval_samples_per_second": 0.6, "eval_steps_per_second": 0.075, "step": 39000 }, { "epoch": 17.285587975243146, "grad_norm": 4.993014335632324, "learning_rate": 8.309058614564833e-06, "loss": 0.545, "step": 39100 }, { "epoch": 17.329796640141467, "grad_norm": 4.632749557495117, "learning_rate": 8.30461811722913e-06, "loss": 0.5348, "step": 39200 }, { "epoch": 17.374005305039788, "grad_norm": 4.033156871795654, "learning_rate": 8.30017761989343e-06, "loss": 0.5331, "step": 39300 }, { "epoch": 17.418213969938108, "grad_norm": 4.257448196411133, "learning_rate": 8.295737122557728e-06, "loss": 0.5319, "step": 39400 }, { "epoch": 17.46242263483643, "grad_norm": 3.5236194133758545, "learning_rate": 8.291296625222026e-06, "loss": 0.5325, "step": 39500 }, { "epoch": 17.46242263483643, "eval_bleu": 31.606665496352097, "eval_char_accuracy": 17.43824667730618, "eval_loss": 0.5248032808303833, "eval_runtime": 3876.1071, "eval_samples_per_second": 0.584, "eval_steps_per_second": 0.073, "step": 39500 }, { "epoch": 17.50663129973475, "grad_norm": 3.5886051654815674, "learning_rate": 8.286900532859681e-06, "loss": 0.5285, "step": 39600 }, { "epoch": 17.550839964633067, "grad_norm": 3.325591802597046, "learning_rate": 8.282460035523979e-06, "loss": 0.5413, "step": 39700 }, { "epoch": 17.595048629531387, "grad_norm": 3.3814034461975098, "learning_rate": 8.278019538188278e-06, "loss": 0.5305, "step": 39800 }, { "epoch": 17.639257294429708, "grad_norm": 3.455845355987549, "learning_rate": 8.273579040852576e-06, "loss": 0.5304, "step": 39900 }, { "epoch": 17.68346595932803, "grad_norm": 4.927883625030518, "learning_rate": 8.269138543516875e-06, "loss": 0.5312, "step": 40000 }, { "epoch": 17.68346595932803, "eval_bleu": 31.727441350111246, "eval_char_accuracy": 17.487998242542112, "eval_loss": 0.5207505226135254, "eval_runtime": 3828.174, "eval_samples_per_second": 0.591, "eval_steps_per_second": 0.074, "step": 40000 }, { "epoch": 17.72767462422635, "grad_norm": 3.817272663116455, "learning_rate": 8.264698046181173e-06, "loss": 0.5332, "step": 40100 }, { "epoch": 17.77188328912467, "grad_norm": 4.887513637542725, "learning_rate": 8.260257548845471e-06, "loss": 0.547, "step": 40200 }, { "epoch": 17.816091954022987, "grad_norm": 3.4486873149871826, "learning_rate": 8.25581705150977e-06, "loss": 0.5212, "step": 40300 }, { "epoch": 17.860300618921308, "grad_norm": 5.059577941894531, "learning_rate": 8.251376554174068e-06, "loss": 0.531, "step": 40400 }, { "epoch": 17.90450928381963, "grad_norm": 3.6564972400665283, "learning_rate": 8.246936056838366e-06, "loss": 0.5241, "step": 40500 }, { "epoch": 17.90450928381963, "eval_bleu": 31.919289640871927, "eval_char_accuracy": 17.495105609004387, "eval_loss": 0.5174447894096375, "eval_runtime": 3798.4901, "eval_samples_per_second": 0.596, "eval_steps_per_second": 0.075, "step": 40500 }, { "epoch": 17.94871794871795, "grad_norm": 4.924537658691406, "learning_rate": 8.242495559502664e-06, "loss": 0.5033, "step": 40600 }, { "epoch": 17.99292661361627, "grad_norm": 3.774033546447754, "learning_rate": 8.238055062166964e-06, "loss": 0.51, "step": 40700 }, { "epoch": 18.03713527851459, "grad_norm": 3.3910040855407715, "learning_rate": 8.233614564831261e-06, "loss": 0.5293, "step": 40800 }, { "epoch": 18.081343943412907, "grad_norm": 3.2107174396514893, "learning_rate": 8.22917406749556e-06, "loss": 0.5302, "step": 40900 }, { "epoch": 18.125552608311228, "grad_norm": 4.6659440994262695, "learning_rate": 8.224733570159859e-06, "loss": 0.5202, "step": 41000 }, { "epoch": 18.125552608311228, "eval_bleu": 31.89653556055136, "eval_char_accuracy": 17.5849168761186, "eval_loss": 0.5214531421661377, "eval_runtime": 3977.6788, "eval_samples_per_second": 0.569, "eval_steps_per_second": 0.071, "step": 41000 }, { "epoch": 18.16976127320955, "grad_norm": 3.9466700553894043, "learning_rate": 8.220293072824156e-06, "loss": 0.5254, "step": 41100 }, { "epoch": 18.21396993810787, "grad_norm": 4.094106674194336, "learning_rate": 8.215852575488456e-06, "loss": 0.5117, "step": 41200 }, { "epoch": 18.25817860300619, "grad_norm": 3.436887741088867, "learning_rate": 8.211412078152754e-06, "loss": 0.5081, "step": 41300 }, { "epoch": 18.30238726790451, "grad_norm": 3.9109930992126465, "learning_rate": 8.206971580817053e-06, "loss": 0.4987, "step": 41400 }, { "epoch": 18.346595932802828, "grad_norm": 4.281425952911377, "learning_rate": 8.202531083481351e-06, "loss": 0.5185, "step": 41500 }, { "epoch": 18.346595932802828, "eval_bleu": 31.940986398903032, "eval_char_accuracy": 17.502859099690507, "eval_loss": 0.5167058110237122, "eval_runtime": 3891.2894, "eval_samples_per_second": 0.582, "eval_steps_per_second": 0.073, "step": 41500 }, { "epoch": 18.39080459770115, "grad_norm": 3.432302713394165, "learning_rate": 8.198134991119006e-06, "loss": 0.5038, "step": 41600 }, { "epoch": 18.43501326259947, "grad_norm": 4.24243688583374, "learning_rate": 8.193694493783304e-06, "loss": 0.5121, "step": 41700 }, { "epoch": 18.47922192749779, "grad_norm": 5.033168792724609, "learning_rate": 8.189253996447602e-06, "loss": 0.5118, "step": 41800 }, { "epoch": 18.52343059239611, "grad_norm": 4.706720352172852, "learning_rate": 8.184813499111901e-06, "loss": 0.5186, "step": 41900 }, { "epoch": 18.56763925729443, "grad_norm": 3.9910097122192383, "learning_rate": 8.1803730017762e-06, "loss": 0.5292, "step": 42000 }, { "epoch": 18.56763925729443, "eval_bleu": 32.07993378964389, "eval_char_accuracy": 17.514166273607763, "eval_loss": 0.5128440260887146, "eval_runtime": 3778.7977, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 42000 }, { "epoch": 18.61184792219275, "grad_norm": 3.151885747909546, "learning_rate": 8.175932504440499e-06, "loss": 0.507, "step": 42100 }, { "epoch": 18.65605658709107, "grad_norm": 4.533958911895752, "learning_rate": 8.171492007104797e-06, "loss": 0.5127, "step": 42200 }, { "epoch": 18.70026525198939, "grad_norm": 3.7578976154327393, "learning_rate": 8.167051509769094e-06, "loss": 0.4941, "step": 42300 }, { "epoch": 18.74447391688771, "grad_norm": 4.0137224197387695, "learning_rate": 8.162611012433394e-06, "loss": 0.528, "step": 42400 }, { "epoch": 18.78868258178603, "grad_norm": 3.5136590003967285, "learning_rate": 8.158170515097692e-06, "loss": 0.5182, "step": 42500 }, { "epoch": 18.78868258178603, "eval_bleu": 32.17307661125679, "eval_char_accuracy": 17.587178310902054, "eval_loss": 0.5164754986763, "eval_runtime": 3685.9406, "eval_samples_per_second": 0.614, "eval_steps_per_second": 0.077, "step": 42500 }, { "epoch": 18.83289124668435, "grad_norm": 3.985119342803955, "learning_rate": 8.153730017761991e-06, "loss": 0.5044, "step": 42600 }, { "epoch": 18.877099911582672, "grad_norm": 3.7768166065216064, "learning_rate": 8.149289520426289e-06, "loss": 0.5207, "step": 42700 }, { "epoch": 18.92130857648099, "grad_norm": 4.0938873291015625, "learning_rate": 8.144849023090587e-06, "loss": 0.4994, "step": 42800 }, { "epoch": 18.96551724137931, "grad_norm": 5.211061477661133, "learning_rate": 8.140408525754885e-06, "loss": 0.5064, "step": 42900 }, { "epoch": 19.00972590627763, "grad_norm": 5.048656463623047, "learning_rate": 8.135968028419184e-06, "loss": 0.513, "step": 43000 }, { "epoch": 19.00972590627763, "eval_bleu": 32.43805248126529, "eval_char_accuracy": 17.854350677461248, "eval_loss": 0.5138335824012756, "eval_runtime": 3805.5821, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.074, "step": 43000 }, { "epoch": 19.05393457117595, "grad_norm": 4.034296035766602, "learning_rate": 8.131527531083482e-06, "loss": 0.5046, "step": 43100 }, { "epoch": 19.09814323607427, "grad_norm": 4.680954933166504, "learning_rate": 8.12708703374778e-06, "loss": 0.5055, "step": 43200 }, { "epoch": 19.142351900972592, "grad_norm": 3.713162422180176, "learning_rate": 8.12264653641208e-06, "loss": 0.4984, "step": 43300 }, { "epoch": 19.18656056587091, "grad_norm": 4.682282447814941, "learning_rate": 8.118206039076377e-06, "loss": 0.5058, "step": 43400 }, { "epoch": 19.23076923076923, "grad_norm": 3.0817902088165283, "learning_rate": 8.113765541740676e-06, "loss": 0.5137, "step": 43500 }, { "epoch": 19.23076923076923, "eval_bleu": 32.76016868380882, "eval_char_accuracy": 18.078555783134863, "eval_loss": 0.5067340731620789, "eval_runtime": 3782.9515, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 43500 }, { "epoch": 19.27497789566755, "grad_norm": 3.184718370437622, "learning_rate": 8.109369449378332e-06, "loss": 0.4898, "step": 43600 }, { "epoch": 19.31918656056587, "grad_norm": 4.698686122894287, "learning_rate": 8.10492895204263e-06, "loss": 0.4895, "step": 43700 }, { "epoch": 19.363395225464192, "grad_norm": 4.166170597076416, "learning_rate": 8.100488454706927e-06, "loss": 0.5295, "step": 43800 }, { "epoch": 19.407603890362513, "grad_norm": 4.268864631652832, "learning_rate": 8.096047957371225e-06, "loss": 0.5108, "step": 43900 }, { "epoch": 19.45181255526083, "grad_norm": 4.044534206390381, "learning_rate": 8.091607460035525e-06, "loss": 0.4982, "step": 44000 }, { "epoch": 19.45181255526083, "eval_bleu": 32.58406366351333, "eval_char_accuracy": 17.866627037714274, "eval_loss": 0.5083564519882202, "eval_runtime": 3790.2739, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.075, "step": 44000 }, { "epoch": 19.49602122015915, "grad_norm": 3.902708053588867, "learning_rate": 8.087166962699822e-06, "loss": 0.4929, "step": 44100 }, { "epoch": 19.54022988505747, "grad_norm": 3.303893804550171, "learning_rate": 8.082726465364122e-06, "loss": 0.5016, "step": 44200 }, { "epoch": 19.58443854995579, "grad_norm": 3.6734492778778076, "learning_rate": 8.07828596802842e-06, "loss": 0.5026, "step": 44300 }, { "epoch": 19.628647214854112, "grad_norm": 3.167238473892212, "learning_rate": 8.073845470692718e-06, "loss": 0.5043, "step": 44400 }, { "epoch": 19.672855879752433, "grad_norm": 5.456859111785889, "learning_rate": 8.069404973357017e-06, "loss": 0.5124, "step": 44500 }, { "epoch": 19.672855879752433, "eval_bleu": 32.77558667165614, "eval_char_accuracy": 17.848212497334735, "eval_loss": 0.5059965252876282, "eval_runtime": 3783.8082, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 44500 }, { "epoch": 19.71706454465075, "grad_norm": 3.123551845550537, "learning_rate": 8.064964476021315e-06, "loss": 0.5179, "step": 44600 }, { "epoch": 19.76127320954907, "grad_norm": 3.3199622631073, "learning_rate": 8.060523978685614e-06, "loss": 0.4894, "step": 44700 }, { "epoch": 19.80548187444739, "grad_norm": 4.65825891494751, "learning_rate": 8.056083481349912e-06, "loss": 0.5107, "step": 44800 }, { "epoch": 19.849690539345712, "grad_norm": 4.311479091644287, "learning_rate": 8.05164298401421e-06, "loss": 0.5079, "step": 44900 }, { "epoch": 19.893899204244033, "grad_norm": 4.2873029708862305, "learning_rate": 8.047202486678508e-06, "loss": 0.4965, "step": 45000 }, { "epoch": 19.893899204244033, "eval_bleu": 32.841179711000166, "eval_char_accuracy": 17.94351582035162, "eval_loss": 0.50505131483078, "eval_runtime": 3785.565, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 45000 }, { "epoch": 19.938107869142353, "grad_norm": 3.7949116230010986, "learning_rate": 8.042761989342807e-06, "loss": 0.5077, "step": 45100 }, { "epoch": 19.98231653404067, "grad_norm": 3.2026827335357666, "learning_rate": 8.038321492007105e-06, "loss": 0.4988, "step": 45200 }, { "epoch": 20.02652519893899, "grad_norm": 2.805671215057373, "learning_rate": 8.033880994671403e-06, "loss": 0.5001, "step": 45300 }, { "epoch": 20.07073386383731, "grad_norm": 3.3434596061706543, "learning_rate": 8.029440497335702e-06, "loss": 0.5035, "step": 45400 }, { "epoch": 20.114942528735632, "grad_norm": 3.6001150608062744, "learning_rate": 8.025e-06, "loss": 0.4853, "step": 45500 }, { "epoch": 20.114942528735632, "eval_bleu": 32.81648144223395, "eval_char_accuracy": 17.863396416595055, "eval_loss": 0.5046483874320984, "eval_runtime": 4031.9059, "eval_samples_per_second": 0.561, "eval_steps_per_second": 0.07, "step": 45500 }, { "epoch": 20.159151193633953, "grad_norm": 3.5448977947235107, "learning_rate": 8.0205595026643e-06, "loss": 0.4825, "step": 45600 }, { "epoch": 20.203359858532274, "grad_norm": 4.382631778717041, "learning_rate": 8.016119005328598e-06, "loss": 0.4914, "step": 45700 }, { "epoch": 20.24756852343059, "grad_norm": 4.451620101928711, "learning_rate": 8.011722912966253e-06, "loss": 0.4968, "step": 45800 }, { "epoch": 20.29177718832891, "grad_norm": 3.3159124851226807, "learning_rate": 8.007282415630552e-06, "loss": 0.5039, "step": 45900 }, { "epoch": 20.335985853227232, "grad_norm": 2.9183428287506104, "learning_rate": 8.002841918294848e-06, "loss": 0.4777, "step": 46000 }, { "epoch": 20.335985853227232, "eval_bleu": 33.100392782196394, "eval_char_accuracy": 18.033004025353915, "eval_loss": 0.5001059174537659, "eval_runtime": 3896.3823, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.073, "step": 46000 }, { "epoch": 20.380194518125553, "grad_norm": 3.8907320499420166, "learning_rate": 7.998401420959148e-06, "loss": 0.5048, "step": 46100 }, { "epoch": 20.424403183023873, "grad_norm": 4.246114253997803, "learning_rate": 7.993960923623446e-06, "loss": 0.5167, "step": 46200 }, { "epoch": 20.468611847922194, "grad_norm": 3.472891092300415, "learning_rate": 7.989520426287745e-06, "loss": 0.4833, "step": 46300 }, { "epoch": 20.51282051282051, "grad_norm": 3.8271119594573975, "learning_rate": 7.985079928952043e-06, "loss": 0.5011, "step": 46400 }, { "epoch": 20.55702917771883, "grad_norm": 3.3347439765930176, "learning_rate": 7.980639431616341e-06, "loss": 0.4948, "step": 46500 }, { "epoch": 20.55702917771883, "eval_bleu": 33.22640095236002, "eval_char_accuracy": 18.046572634054623, "eval_loss": 0.5035552382469177, "eval_runtime": 3940.4224, "eval_samples_per_second": 0.574, "eval_steps_per_second": 0.072, "step": 46500 }, { "epoch": 20.601237842617152, "grad_norm": 4.584412574768066, "learning_rate": 7.97619893428064e-06, "loss": 0.4876, "step": 46600 }, { "epoch": 20.645446507515473, "grad_norm": 3.1262519359588623, "learning_rate": 7.971758436944938e-06, "loss": 0.4782, "step": 46700 }, { "epoch": 20.689655172413794, "grad_norm": 3.8017396926879883, "learning_rate": 7.967317939609238e-06, "loss": 0.4916, "step": 46800 }, { "epoch": 20.733863837312114, "grad_norm": 3.410078763961792, "learning_rate": 7.962877442273535e-06, "loss": 0.4879, "step": 46900 }, { "epoch": 20.778072502210435, "grad_norm": 3.9219188690185547, "learning_rate": 7.958436944937833e-06, "loss": 0.49, "step": 47000 }, { "epoch": 20.778072502210435, "eval_bleu": 33.48835531194773, "eval_char_accuracy": 18.2791773546382, "eval_loss": 0.496384859085083, "eval_runtime": 3906.6126, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.072, "step": 47000 }, { "epoch": 20.822281167108752, "grad_norm": 3.1248579025268555, "learning_rate": 7.953996447602133e-06, "loss": 0.4684, "step": 47100 }, { "epoch": 20.866489832007073, "grad_norm": 3.54280948638916, "learning_rate": 7.94955595026643e-06, "loss": 0.5028, "step": 47200 }, { "epoch": 20.910698496905393, "grad_norm": 3.795173168182373, "learning_rate": 7.945115452930728e-06, "loss": 0.4943, "step": 47300 }, { "epoch": 20.954907161803714, "grad_norm": 4.261209487915039, "learning_rate": 7.940674955595026e-06, "loss": 0.5021, "step": 47400 }, { "epoch": 20.999115826702035, "grad_norm": 3.6436707973480225, "learning_rate": 7.936234458259326e-06, "loss": 0.4788, "step": 47500 }, { "epoch": 20.999115826702035, "eval_bleu": 33.175611109241466, "eval_char_accuracy": 17.980021838998766, "eval_loss": 0.4984375238418579, "eval_runtime": 3855.045, "eval_samples_per_second": 0.587, "eval_steps_per_second": 0.073, "step": 47500 }, { "epoch": 21.043324491600355, "grad_norm": 3.6348063945770264, "learning_rate": 7.931793960923623e-06, "loss": 0.4844, "step": 47600 }, { "epoch": 21.087533156498672, "grad_norm": 3.3817226886749268, "learning_rate": 7.927353463587923e-06, "loss": 0.4784, "step": 47700 }, { "epoch": 21.131741821396993, "grad_norm": 3.7965996265411377, "learning_rate": 7.922957371225578e-06, "loss": 0.4838, "step": 47800 }, { "epoch": 21.175950486295314, "grad_norm": 3.5606181621551514, "learning_rate": 7.918516873889876e-06, "loss": 0.475, "step": 47900 }, { "epoch": 21.220159151193634, "grad_norm": 3.3465354442596436, "learning_rate": 7.914076376554176e-06, "loss": 0.467, "step": 48000 }, { "epoch": 21.220159151193634, "eval_bleu": 33.60385109048305, "eval_char_accuracy": 18.187427714852458, "eval_loss": 0.4971466660499573, "eval_runtime": 3900.3759, "eval_samples_per_second": 0.58, "eval_steps_per_second": 0.073, "step": 48000 }, { "epoch": 21.264367816091955, "grad_norm": 3.9273407459259033, "learning_rate": 7.909635879218473e-06, "loss": 0.4851, "step": 48100 }, { "epoch": 21.308576480990276, "grad_norm": 3.9251596927642822, "learning_rate": 7.905195381882771e-06, "loss": 0.4892, "step": 48200 }, { "epoch": 21.352785145888593, "grad_norm": 3.9308271408081055, "learning_rate": 7.900754884547069e-06, "loss": 0.4862, "step": 48300 }, { "epoch": 21.396993810786913, "grad_norm": 3.829035758972168, "learning_rate": 7.896314387211368e-06, "loss": 0.4731, "step": 48400 }, { "epoch": 21.441202475685234, "grad_norm": 3.8976125717163086, "learning_rate": 7.891873889875666e-06, "loss": 0.4891, "step": 48500 }, { "epoch": 21.441202475685234, "eval_bleu": 33.54744627448891, "eval_char_accuracy": 17.926070466307852, "eval_loss": 0.4960884749889374, "eval_runtime": 3686.1916, "eval_samples_per_second": 0.614, "eval_steps_per_second": 0.077, "step": 48500 }, { "epoch": 21.485411140583555, "grad_norm": 4.9077534675598145, "learning_rate": 7.887433392539964e-06, "loss": 0.4847, "step": 48600 }, { "epoch": 21.529619805481875, "grad_norm": 3.5922646522521973, "learning_rate": 7.882992895204264e-06, "loss": 0.4796, "step": 48700 }, { "epoch": 21.573828470380196, "grad_norm": 4.6223225593566895, "learning_rate": 7.878552397868561e-06, "loss": 0.4857, "step": 48800 }, { "epoch": 21.618037135278513, "grad_norm": 3.919811487197876, "learning_rate": 7.874111900532861e-06, "loss": 0.4809, "step": 48900 }, { "epoch": 21.662245800176834, "grad_norm": 3.9013800621032715, "learning_rate": 7.869671403197159e-06, "loss": 0.4915, "step": 49000 }, { "epoch": 21.662245800176834, "eval_bleu": 33.882009033019564, "eval_char_accuracy": 18.27691591985475, "eval_loss": 0.49374356865882874, "eval_runtime": 3896.3549, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.073, "step": 49000 }, { "epoch": 21.706454465075154, "grad_norm": 3.9991354942321777, "learning_rate": 7.865230905861457e-06, "loss": 0.481, "step": 49100 }, { "epoch": 21.750663129973475, "grad_norm": 3.6628310680389404, "learning_rate": 7.860790408525756e-06, "loss": 0.4828, "step": 49200 }, { "epoch": 21.794871794871796, "grad_norm": 4.16771125793457, "learning_rate": 7.856349911190054e-06, "loss": 0.4661, "step": 49300 }, { "epoch": 21.839080459770116, "grad_norm": 3.6251885890960693, "learning_rate": 7.851909413854353e-06, "loss": 0.477, "step": 49400 }, { "epoch": 21.883289124668433, "grad_norm": 3.231438636779785, "learning_rate": 7.847468916518651e-06, "loss": 0.4912, "step": 49500 }, { "epoch": 21.883289124668433, "eval_bleu": 33.56616396049602, "eval_char_accuracy": 18.571871628039208, "eval_loss": 0.48948654532432556, "eval_runtime": 3999.9475, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.071, "step": 49500 }, { "epoch": 21.927497789566754, "grad_norm": 3.362558603286743, "learning_rate": 7.843028419182949e-06, "loss": 0.466, "step": 49600 }, { "epoch": 21.971706454465075, "grad_norm": 3.2716805934906006, "learning_rate": 7.838587921847247e-06, "loss": 0.4823, "step": 49700 }, { "epoch": 22.015915119363395, "grad_norm": 3.9514899253845215, "learning_rate": 7.834147424511546e-06, "loss": 0.473, "step": 49800 }, { "epoch": 22.060123784261716, "grad_norm": 3.976128339767456, "learning_rate": 7.829751332149202e-06, "loss": 0.4677, "step": 49900 }, { "epoch": 22.104332449160037, "grad_norm": 3.5756001472473145, "learning_rate": 7.8253108348135e-06, "loss": 0.4782, "step": 50000 }, { "epoch": 22.104332449160037, "eval_bleu": 33.8602811247665, "eval_char_accuracy": 18.0223429756605, "eval_loss": 0.49151143431663513, "eval_runtime": 3783.7963, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 50000 }, { "epoch": 22.148541114058354, "grad_norm": 4.115743160247803, "learning_rate": 7.820870337477799e-06, "loss": 0.4663, "step": 50100 }, { "epoch": 22.192749778956674, "grad_norm": 3.9690654277801514, "learning_rate": 7.816429840142097e-06, "loss": 0.4672, "step": 50200 }, { "epoch": 22.236958443854995, "grad_norm": 6.096304893493652, "learning_rate": 7.811989342806396e-06, "loss": 0.4781, "step": 50300 }, { "epoch": 22.281167108753316, "grad_norm": 3.4138641357421875, "learning_rate": 7.807548845470694e-06, "loss": 0.4601, "step": 50400 }, { "epoch": 22.325375773651636, "grad_norm": 4.135013103485107, "learning_rate": 7.803108348134992e-06, "loss": 0.457, "step": 50500 }, { "epoch": 22.325375773651636, "eval_bleu": 33.92082534280131, "eval_char_accuracy": 18.250101764565255, "eval_loss": 0.49074092507362366, "eval_runtime": 3785.5189, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 50500 }, { "epoch": 22.369584438549957, "grad_norm": 3.6282777786254883, "learning_rate": 7.798712255772647e-06, "loss": 0.4682, "step": 50600 }, { "epoch": 22.413793103448278, "grad_norm": 4.353074073791504, "learning_rate": 7.794271758436945e-06, "loss": 0.4848, "step": 50700 }, { "epoch": 22.458001768346595, "grad_norm": 4.046987533569336, "learning_rate": 7.789831261101244e-06, "loss": 0.4741, "step": 50800 }, { "epoch": 22.502210433244915, "grad_norm": 3.5357377529144287, "learning_rate": 7.785390763765542e-06, "loss": 0.4679, "step": 50900 }, { "epoch": 22.546419098143236, "grad_norm": 4.168898105621338, "learning_rate": 7.780950266429842e-06, "loss": 0.4574, "step": 51000 }, { "epoch": 22.546419098143236, "eval_bleu": 34.07493475879003, "eval_char_accuracy": 18.467199503776595, "eval_loss": 0.4876865744590759, "eval_runtime": 3757.8345, "eval_samples_per_second": 0.602, "eval_steps_per_second": 0.075, "step": 51000 }, { "epoch": 22.590627763041557, "grad_norm": 3.8833634853363037, "learning_rate": 7.77650976909414e-06, "loss": 0.4509, "step": 51100 }, { "epoch": 22.634836427939877, "grad_norm": 4.101656436920166, "learning_rate": 7.772069271758437e-06, "loss": 0.4789, "step": 51200 }, { "epoch": 22.679045092838198, "grad_norm": 3.6229770183563232, "learning_rate": 7.767628774422737e-06, "loss": 0.4788, "step": 51300 }, { "epoch": 22.723253757736515, "grad_norm": 4.151303291320801, "learning_rate": 7.763188277087035e-06, "loss": 0.4644, "step": 51400 }, { "epoch": 22.767462422634836, "grad_norm": 3.8464596271514893, "learning_rate": 7.758747779751334e-06, "loss": 0.479, "step": 51500 }, { "epoch": 22.767462422634836, "eval_bleu": 33.912574515618175, "eval_char_accuracy": 18.20390388256046, "eval_loss": 0.4850179851055145, "eval_runtime": 3777.1639, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 51500 }, { "epoch": 22.811671087533156, "grad_norm": 3.5864808559417725, "learning_rate": 7.75430728241563e-06, "loss": 0.4759, "step": 51600 }, { "epoch": 22.855879752431477, "grad_norm": 4.002447605133057, "learning_rate": 7.74986678507993e-06, "loss": 0.4818, "step": 51700 }, { "epoch": 22.900088417329798, "grad_norm": 3.2455673217773438, "learning_rate": 7.745426287744227e-06, "loss": 0.4706, "step": 51800 }, { "epoch": 22.94429708222812, "grad_norm": 4.484466075897217, "learning_rate": 7.740985790408527e-06, "loss": 0.472, "step": 51900 }, { "epoch": 22.988505747126435, "grad_norm": 3.7575151920318604, "learning_rate": 7.736545293072825e-06, "loss": 0.4815, "step": 52000 }, { "epoch": 22.988505747126435, "eval_bleu": 34.03985124136929, "eval_char_accuracy": 18.30211476458464, "eval_loss": 0.4866866171360016, "eval_runtime": 3737.2976, "eval_samples_per_second": 0.606, "eval_steps_per_second": 0.076, "step": 52000 }, { "epoch": 23.032714412024756, "grad_norm": 3.872447967529297, "learning_rate": 7.732104795737123e-06, "loss": 0.469, "step": 52100 }, { "epoch": 23.076923076923077, "grad_norm": 4.61568546295166, "learning_rate": 7.727664298401422e-06, "loss": 0.4691, "step": 52200 }, { "epoch": 23.121131741821397, "grad_norm": 5.293509483337402, "learning_rate": 7.72322380106572e-06, "loss": 0.4683, "step": 52300 }, { "epoch": 23.165340406719718, "grad_norm": 3.319977045059204, "learning_rate": 7.71878330373002e-06, "loss": 0.4557, "step": 52400 }, { "epoch": 23.20954907161804, "grad_norm": 4.643522262573242, "learning_rate": 7.714342806394317e-06, "loss": 0.4751, "step": 52500 }, { "epoch": 23.20954907161804, "eval_bleu": 33.79355059548053, "eval_char_accuracy": 18.33700547267218, "eval_loss": 0.4858049154281616, "eval_runtime": 3861.3188, "eval_samples_per_second": 0.586, "eval_steps_per_second": 0.073, "step": 52500 }, { "epoch": 23.253757736516356, "grad_norm": 4.489968776702881, "learning_rate": 7.709902309058615e-06, "loss": 0.4697, "step": 52600 }, { "epoch": 23.297966401414676, "grad_norm": 3.9281039237976074, "learning_rate": 7.705461811722914e-06, "loss": 0.4642, "step": 52700 }, { "epoch": 23.342175066312997, "grad_norm": 3.1341047286987305, "learning_rate": 7.701021314387212e-06, "loss": 0.472, "step": 52800 }, { "epoch": 23.386383731211318, "grad_norm": 3.0309391021728516, "learning_rate": 7.69658081705151e-06, "loss": 0.4554, "step": 52900 }, { "epoch": 23.43059239610964, "grad_norm": 3.582554578781128, "learning_rate": 7.692140319715808e-06, "loss": 0.4594, "step": 53000 }, { "epoch": 23.43059239610964, "eval_bleu": 34.19229367353568, "eval_char_accuracy": 18.368665559640498, "eval_loss": 0.4822302460670471, "eval_runtime": 3827.9702, "eval_samples_per_second": 0.591, "eval_steps_per_second": 0.074, "step": 53000 }, { "epoch": 23.47480106100796, "grad_norm": 3.209322690963745, "learning_rate": 7.687699822380107e-06, "loss": 0.464, "step": 53100 }, { "epoch": 23.519009725906276, "grad_norm": 3.785881280899048, "learning_rate": 7.683259325044405e-06, "loss": 0.4654, "step": 53200 }, { "epoch": 23.563218390804597, "grad_norm": 3.9173619747161865, "learning_rate": 7.678818827708705e-06, "loss": 0.4456, "step": 53300 }, { "epoch": 23.607427055702917, "grad_norm": 3.420377731323242, "learning_rate": 7.674378330373003e-06, "loss": 0.4582, "step": 53400 }, { "epoch": 23.651635720601238, "grad_norm": 4.483918190002441, "learning_rate": 7.6699378330373e-06, "loss": 0.4505, "step": 53500 }, { "epoch": 23.651635720601238, "eval_bleu": 34.41827867118556, "eval_char_accuracy": 18.375449863990852, "eval_loss": 0.48489418625831604, "eval_runtime": 3773.4269, "eval_samples_per_second": 0.6, "eval_steps_per_second": 0.075, "step": 53500 }, { "epoch": 23.69584438549956, "grad_norm": 3.412243604660034, "learning_rate": 7.6654973357016e-06, "loss": 0.4483, "step": 53600 }, { "epoch": 23.74005305039788, "grad_norm": 3.7695186138153076, "learning_rate": 7.661056838365898e-06, "loss": 0.4574, "step": 53700 }, { "epoch": 23.784261715296196, "grad_norm": 3.107638120651245, "learning_rate": 7.656616341030197e-06, "loss": 0.4546, "step": 53800 }, { "epoch": 23.828470380194517, "grad_norm": 3.5435268878936768, "learning_rate": 7.652175843694495e-06, "loss": 0.4619, "step": 53900 }, { "epoch": 23.872679045092838, "grad_norm": 3.298466920852661, "learning_rate": 7.647735346358793e-06, "loss": 0.4553, "step": 54000 }, { "epoch": 23.872679045092838, "eval_bleu": 34.04104253811371, "eval_char_accuracy": 18.544088286413945, "eval_loss": 0.48133185505867004, "eval_runtime": 3777.4246, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 54000 }, { "epoch": 23.91688770999116, "grad_norm": 4.650321006774902, "learning_rate": 7.64329484902309e-06, "loss": 0.4716, "step": 54100 }, { "epoch": 23.96109637488948, "grad_norm": 4.385313034057617, "learning_rate": 7.63885435168739e-06, "loss": 0.4607, "step": 54200 }, { "epoch": 24.0053050397878, "grad_norm": 3.387877941131592, "learning_rate": 7.634413854351688e-06, "loss": 0.4474, "step": 54300 }, { "epoch": 24.049513704686117, "grad_norm": 3.485687494277954, "learning_rate": 7.629973357015986e-06, "loss": 0.4437, "step": 54400 }, { "epoch": 24.093722369584437, "grad_norm": 3.425071954727173, "learning_rate": 7.625532859680284e-06, "loss": 0.4505, "step": 54500 }, { "epoch": 24.093722369584437, "eval_bleu": 34.48651387006091, "eval_char_accuracy": 18.625499938618198, "eval_loss": 0.4781510829925537, "eval_runtime": 3771.0921, "eval_samples_per_second": 0.6, "eval_steps_per_second": 0.075, "step": 54500 }, { "epoch": 24.137931034482758, "grad_norm": 3.568829298019409, "learning_rate": 7.62113676731794e-06, "loss": 0.4577, "step": 54600 }, { "epoch": 24.18213969938108, "grad_norm": 3.818176746368408, "learning_rate": 7.616696269982239e-06, "loss": 0.4422, "step": 54700 }, { "epoch": 24.2263483642794, "grad_norm": 3.4810068607330322, "learning_rate": 7.612255772646538e-06, "loss": 0.4451, "step": 54800 }, { "epoch": 24.27055702917772, "grad_norm": 3.9345364570617676, "learning_rate": 7.6078152753108355e-06, "loss": 0.4579, "step": 54900 }, { "epoch": 24.31476569407604, "grad_norm": 3.4327445030212402, "learning_rate": 7.603374777975134e-06, "loss": 0.4602, "step": 55000 }, { "epoch": 24.31476569407604, "eval_bleu": 34.359813330803235, "eval_char_accuracy": 18.598362721216784, "eval_loss": 0.4790441393852234, "eval_runtime": 3813.2931, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.074, "step": 55000 }, { "epoch": 24.358974358974358, "grad_norm": 4.094949245452881, "learning_rate": 7.598934280639432e-06, "loss": 0.447, "step": 55100 }, { "epoch": 24.40318302387268, "grad_norm": 2.9666781425476074, "learning_rate": 7.59449378330373e-06, "loss": 0.4476, "step": 55200 }, { "epoch": 24.447391688771, "grad_norm": 4.096363544464111, "learning_rate": 7.5900532859680285e-06, "loss": 0.4663, "step": 55300 }, { "epoch": 24.49160035366932, "grad_norm": 3.9940991401672363, "learning_rate": 7.585612788632327e-06, "loss": 0.4462, "step": 55400 }, { "epoch": 24.53580901856764, "grad_norm": 3.8744585514068604, "learning_rate": 7.581172291296626e-06, "loss": 0.4524, "step": 55500 }, { "epoch": 24.53580901856764, "eval_bleu": 34.529394728459145, "eval_char_accuracy": 18.360912068954377, "eval_loss": 0.47978290915489197, "eval_runtime": 3766.9597, "eval_samples_per_second": 0.601, "eval_steps_per_second": 0.075, "step": 55500 }, { "epoch": 24.58001768346596, "grad_norm": 3.914504051208496, "learning_rate": 7.576731793960924e-06, "loss": 0.4486, "step": 55600 }, { "epoch": 24.624226348364278, "grad_norm": 3.662569761276245, "learning_rate": 7.572291296625223e-06, "loss": 0.4485, "step": 55700 }, { "epoch": 24.6684350132626, "grad_norm": 4.710388660430908, "learning_rate": 7.567850799289521e-06, "loss": 0.4501, "step": 55800 }, { "epoch": 24.71264367816092, "grad_norm": 3.7036995887756348, "learning_rate": 7.5634103019538195e-06, "loss": 0.4664, "step": 55900 }, { "epoch": 24.75685234305924, "grad_norm": 4.236560344696045, "learning_rate": 7.558969804618118e-06, "loss": 0.44, "step": 56000 }, { "epoch": 24.75685234305924, "eval_bleu": 34.737769815496804, "eval_char_accuracy": 18.640683857878514, "eval_loss": 0.4822787642478943, "eval_runtime": 3937.4711, "eval_samples_per_second": 0.575, "eval_steps_per_second": 0.072, "step": 56000 }, { "epoch": 24.80106100795756, "grad_norm": 3.5183193683624268, "learning_rate": 7.554529307282417e-06, "loss": 0.4709, "step": 56100 }, { "epoch": 24.84526967285588, "grad_norm": 3.629310369491577, "learning_rate": 7.5500888099467155e-06, "loss": 0.4488, "step": 56200 }, { "epoch": 24.8894783377542, "grad_norm": 3.5047855377197266, "learning_rate": 7.545648312611013e-06, "loss": 0.4366, "step": 56300 }, { "epoch": 24.93368700265252, "grad_norm": 3.6567394733428955, "learning_rate": 7.541207815275311e-06, "loss": 0.4398, "step": 56400 }, { "epoch": 24.97789566755084, "grad_norm": 3.3886139392852783, "learning_rate": 7.53676731793961e-06, "loss": 0.4409, "step": 56500 }, { "epoch": 24.97789566755084, "eval_bleu": 34.95026601924133, "eval_char_accuracy": 18.627115249177805, "eval_loss": 0.47342270612716675, "eval_runtime": 3910.743, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.072, "step": 56500 }, { "epoch": 25.02210433244916, "grad_norm": 3.62349009513855, "learning_rate": 7.5323268206039076e-06, "loss": 0.451, "step": 56600 }, { "epoch": 25.06631299734748, "grad_norm": 3.046114683151245, "learning_rate": 7.527886323268206e-06, "loss": 0.4431, "step": 56700 }, { "epoch": 25.1105216622458, "grad_norm": 3.627570629119873, "learning_rate": 7.523445825932505e-06, "loss": 0.4378, "step": 56800 }, { "epoch": 25.15473032714412, "grad_norm": 3.7135891914367676, "learning_rate": 7.5190053285968035e-06, "loss": 0.4466, "step": 56900 }, { "epoch": 25.19893899204244, "grad_norm": 3.590561866760254, "learning_rate": 7.514564831261102e-06, "loss": 0.4285, "step": 57000 }, { "epoch": 25.19893899204244, "eval_bleu": 34.69054725352436, "eval_char_accuracy": 18.843566864165304, "eval_loss": 0.47307994961738586, "eval_runtime": 3775.1614, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 57000 }, { "epoch": 25.24314765694076, "grad_norm": 3.0212132930755615, "learning_rate": 7.5101687388987574e-06, "loss": 0.4312, "step": 57100 }, { "epoch": 25.28735632183908, "grad_norm": 3.4297900199890137, "learning_rate": 7.505728241563056e-06, "loss": 0.4366, "step": 57200 }, { "epoch": 25.3315649867374, "grad_norm": 3.4311418533325195, "learning_rate": 7.501287744227355e-06, "loss": 0.46, "step": 57300 }, { "epoch": 25.375773651635722, "grad_norm": 4.873203754425049, "learning_rate": 7.496847246891652e-06, "loss": 0.4179, "step": 57400 }, { "epoch": 25.41998231653404, "grad_norm": 3.9877772331237793, "learning_rate": 7.49240674955595e-06, "loss": 0.4333, "step": 57500 }, { "epoch": 25.41998231653404, "eval_bleu": 35.02144316999377, "eval_char_accuracy": 18.679451311309112, "eval_loss": 0.47394952178001404, "eval_runtime": 3800.8583, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.074, "step": 57500 }, { "epoch": 25.46419098143236, "grad_norm": 3.1034445762634277, "learning_rate": 7.487966252220249e-06, "loss": 0.4389, "step": 57600 }, { "epoch": 25.50839964633068, "grad_norm": 3.4718830585479736, "learning_rate": 7.483525754884548e-06, "loss": 0.445, "step": 57700 }, { "epoch": 25.552608311229, "grad_norm": 5.202772617340088, "learning_rate": 7.479085257548846e-06, "loss": 0.4574, "step": 57800 }, { "epoch": 25.59681697612732, "grad_norm": 3.9256880283355713, "learning_rate": 7.474644760213144e-06, "loss": 0.4533, "step": 57900 }, { "epoch": 25.641025641025642, "grad_norm": 4.3005805015563965, "learning_rate": 7.470204262877443e-06, "loss": 0.46, "step": 58000 }, { "epoch": 25.641025641025642, "eval_bleu": 35.103539727809235, "eval_char_accuracy": 18.619038696379768, "eval_loss": 0.47134506702423096, "eval_runtime": 3949.2616, "eval_samples_per_second": 0.573, "eval_steps_per_second": 0.072, "step": 58000 }, { "epoch": 25.68523430592396, "grad_norm": 3.6098217964172363, "learning_rate": 7.4657637655417414e-06, "loss": 0.4412, "step": 58100 }, { "epoch": 25.72944297082228, "grad_norm": 4.803073883056641, "learning_rate": 7.46132326820604e-06, "loss": 0.4426, "step": 58200 }, { "epoch": 25.7736516357206, "grad_norm": 3.652554988861084, "learning_rate": 7.456882770870339e-06, "loss": 0.4238, "step": 58300 }, { "epoch": 25.81786030061892, "grad_norm": 3.131455659866333, "learning_rate": 7.4524422735346365e-06, "loss": 0.4547, "step": 58400 }, { "epoch": 25.862068965517242, "grad_norm": 2.8206682205200195, "learning_rate": 7.448001776198935e-06, "loss": 0.4413, "step": 58500 }, { "epoch": 25.862068965517242, "eval_bleu": 35.07528923164432, "eval_char_accuracy": 18.641653044214284, "eval_loss": 0.4746817350387573, "eval_runtime": 3743.0073, "eval_samples_per_second": 0.605, "eval_steps_per_second": 0.076, "step": 58500 }, { "epoch": 25.906277630415563, "grad_norm": 4.257846355438232, "learning_rate": 7.443561278863233e-06, "loss": 0.4573, "step": 58600 }, { "epoch": 25.95048629531388, "grad_norm": 3.686765670776367, "learning_rate": 7.439120781527532e-06, "loss": 0.4327, "step": 58700 }, { "epoch": 25.9946949602122, "grad_norm": 3.5389842987060547, "learning_rate": 7.4346802841918295e-06, "loss": 0.4518, "step": 58800 }, { "epoch": 26.03890362511052, "grad_norm": 4.141404628753662, "learning_rate": 7.430239786856128e-06, "loss": 0.4305, "step": 58900 }, { "epoch": 26.08311229000884, "grad_norm": 3.526571750640869, "learning_rate": 7.425799289520427e-06, "loss": 0.4552, "step": 59000 }, { "epoch": 26.08311229000884, "eval_bleu": 34.96524917230938, "eval_char_accuracy": 18.792199988369767, "eval_loss": 0.46930044889450073, "eval_runtime": 3810.8214, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.074, "step": 59000 }, { "epoch": 26.127320954907162, "grad_norm": 3.8201069831848145, "learning_rate": 7.421358792184725e-06, "loss": 0.4426, "step": 59100 }, { "epoch": 26.171529619805483, "grad_norm": 3.8380045890808105, "learning_rate": 7.416918294849024e-06, "loss": 0.4325, "step": 59200 }, { "epoch": 26.215738284703804, "grad_norm": 3.6785879135131836, "learning_rate": 7.412477797513322e-06, "loss": 0.429, "step": 59300 }, { "epoch": 26.25994694960212, "grad_norm": 3.6245944499969482, "learning_rate": 7.408081705150978e-06, "loss": 0.4397, "step": 59400 }, { "epoch": 26.30415561450044, "grad_norm": 2.7912895679473877, "learning_rate": 7.403641207815277e-06, "loss": 0.4315, "step": 59500 }, { "epoch": 26.30415561450044, "eval_bleu": 35.15527441849868, "eval_char_accuracy": 19.020281839386442, "eval_loss": 0.4680403172969818, "eval_runtime": 3740.058, "eval_samples_per_second": 0.605, "eval_steps_per_second": 0.076, "step": 59500 }, { "epoch": 26.348364279398762, "grad_norm": 3.258089303970337, "learning_rate": 7.399200710479574e-06, "loss": 0.4338, "step": 59600 }, { "epoch": 26.392572944297083, "grad_norm": 3.7382047176361084, "learning_rate": 7.394760213143872e-06, "loss": 0.4333, "step": 59700 }, { "epoch": 26.436781609195403, "grad_norm": 3.468820571899414, "learning_rate": 7.390319715808171e-06, "loss": 0.4252, "step": 59800 }, { "epoch": 26.480990274093724, "grad_norm": 3.170513391494751, "learning_rate": 7.3858792184724696e-06, "loss": 0.418, "step": 59900 }, { "epoch": 26.52519893899204, "grad_norm": 2.8892738819122314, "learning_rate": 7.381438721136767e-06, "loss": 0.423, "step": 60000 }, { "epoch": 26.52519893899204, "eval_bleu": 35.164909682330304, "eval_char_accuracy": 18.629053621849337, "eval_loss": 0.46949729323387146, "eval_runtime": 3793.0573, "eval_samples_per_second": 0.597, "eval_steps_per_second": 0.075, "step": 60000 }, { "epoch": 26.56940760389036, "grad_norm": 3.0883092880249023, "learning_rate": 7.376998223801066e-06, "loss": 0.4263, "step": 60100 }, { "epoch": 26.613616268788682, "grad_norm": 3.8994991779327393, "learning_rate": 7.372557726465365e-06, "loss": 0.4491, "step": 60200 }, { "epoch": 26.657824933687003, "grad_norm": 3.9781856536865234, "learning_rate": 7.368117229129663e-06, "loss": 0.4273, "step": 60300 }, { "epoch": 26.702033598585324, "grad_norm": 3.1654317378997803, "learning_rate": 7.363676731793962e-06, "loss": 0.4341, "step": 60400 }, { "epoch": 26.746242263483644, "grad_norm": 3.5967812538146973, "learning_rate": 7.35923623445826e-06, "loss": 0.4258, "step": 60500 }, { "epoch": 26.746242263483644, "eval_bleu": 35.29520947974874, "eval_char_accuracy": 18.62969974607318, "eval_loss": 0.46655765175819397, "eval_runtime": 3876.7178, "eval_samples_per_second": 0.584, "eval_steps_per_second": 0.073, "step": 60500 }, { "epoch": 26.79045092838196, "grad_norm": 3.2295174598693848, "learning_rate": 7.3547957371225584e-06, "loss": 0.4444, "step": 60600 }, { "epoch": 26.834659593280282, "grad_norm": 3.1186392307281494, "learning_rate": 7.350355239786857e-06, "loss": 0.4306, "step": 60700 }, { "epoch": 26.878868258178603, "grad_norm": 3.502453565597534, "learning_rate": 7.345914742451156e-06, "loss": 0.4354, "step": 60800 }, { "epoch": 26.923076923076923, "grad_norm": 3.5879507064819336, "learning_rate": 7.341474245115453e-06, "loss": 0.4335, "step": 60900 }, { "epoch": 26.967285587975244, "grad_norm": 3.889507293701172, "learning_rate": 7.337033747779751e-06, "loss": 0.4281, "step": 61000 }, { "epoch": 26.967285587975244, "eval_bleu": 35.375365100345206, "eval_char_accuracy": 19.059049292817036, "eval_loss": 0.4675982594490051, "eval_runtime": 3723.5779, "eval_samples_per_second": 0.608, "eval_steps_per_second": 0.076, "step": 61000 }, { "epoch": 27.011494252873565, "grad_norm": 3.7909648418426514, "learning_rate": 7.33259325044405e-06, "loss": 0.4296, "step": 61100 }, { "epoch": 27.05570291777188, "grad_norm": 3.4306418895721436, "learning_rate": 7.328152753108349e-06, "loss": 0.4141, "step": 61200 }, { "epoch": 27.099911582670202, "grad_norm": 3.8300061225891113, "learning_rate": 7.323712255772647e-06, "loss": 0.425, "step": 61300 }, { "epoch": 27.144120247568523, "grad_norm": 4.119434356689453, "learning_rate": 7.319271758436945e-06, "loss": 0.4244, "step": 61400 }, { "epoch": 27.188328912466844, "grad_norm": 4.19444465637207, "learning_rate": 7.314875666074601e-06, "loss": 0.4125, "step": 61500 }, { "epoch": 27.188328912466844, "eval_bleu": 35.36073596507612, "eval_char_accuracy": 18.821598640554633, "eval_loss": 0.4684741497039795, "eval_runtime": 3805.0784, "eval_samples_per_second": 0.595, "eval_steps_per_second": 0.074, "step": 61500 }, { "epoch": 27.232537577365164, "grad_norm": 3.978120803833008, "learning_rate": 7.3104351687389e-06, "loss": 0.4289, "step": 61600 }, { "epoch": 27.276746242263485, "grad_norm": 3.4959115982055664, "learning_rate": 7.3059946714031985e-06, "loss": 0.4161, "step": 61700 }, { "epoch": 27.320954907161802, "grad_norm": 3.2108712196350098, "learning_rate": 7.301554174067496e-06, "loss": 0.414, "step": 61800 }, { "epoch": 27.365163572060123, "grad_norm": 3.39117431640625, "learning_rate": 7.297113676731794e-06, "loss": 0.4117, "step": 61900 }, { "epoch": 27.409372236958443, "grad_norm": 4.122698783874512, "learning_rate": 7.292673179396093e-06, "loss": 0.4195, "step": 62000 }, { "epoch": 27.409372236958443, "eval_bleu": 35.73906581464263, "eval_char_accuracy": 18.893964553625082, "eval_loss": 0.46775731444358826, "eval_runtime": 4002.3604, "eval_samples_per_second": 0.565, "eval_steps_per_second": 0.071, "step": 62000 }, { "epoch": 27.453580901856764, "grad_norm": 4.0328497886657715, "learning_rate": 7.288232682060391e-06, "loss": 0.4175, "step": 62100 }, { "epoch": 27.497789566755085, "grad_norm": 3.621716260910034, "learning_rate": 7.283792184724689e-06, "loss": 0.4312, "step": 62200 }, { "epoch": 27.541998231653405, "grad_norm": 3.467021942138672, "learning_rate": 7.279351687388988e-06, "loss": 0.413, "step": 62300 }, { "epoch": 27.586206896551722, "grad_norm": 4.264924049377441, "learning_rate": 7.2749111900532866e-06, "loss": 0.4434, "step": 62400 }, { "epoch": 27.630415561450043, "grad_norm": 4.692573070526123, "learning_rate": 7.270470692717585e-06, "loss": 0.411, "step": 62500 }, { "epoch": 27.630415561450043, "eval_bleu": 35.56251676783941, "eval_char_accuracy": 18.76377052252066, "eval_loss": 0.4648979902267456, "eval_runtime": 3895.7724, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.073, "step": 62500 }, { "epoch": 27.674624226348364, "grad_norm": 4.678711414337158, "learning_rate": 7.266030195381884e-06, "loss": 0.414, "step": 62600 }, { "epoch": 27.718832891246684, "grad_norm": 3.940443754196167, "learning_rate": 7.261589698046182e-06, "loss": 0.4207, "step": 62700 }, { "epoch": 27.763041556145005, "grad_norm": 4.18531608581543, "learning_rate": 7.25714920071048e-06, "loss": 0.4361, "step": 62800 }, { "epoch": 27.807250221043326, "grad_norm": 3.8297817707061768, "learning_rate": 7.252708703374779e-06, "loss": 0.4191, "step": 62900 }, { "epoch": 27.851458885941646, "grad_norm": 2.9988715648651123, "learning_rate": 7.248268206039078e-06, "loss": 0.4261, "step": 63000 }, { "epoch": 27.851458885941646, "eval_bleu": 35.726217975853956, "eval_char_accuracy": 18.974407019493565, "eval_loss": 0.46203893423080444, "eval_runtime": 3779.7515, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 63000 }, { "epoch": 27.895667550839963, "grad_norm": 4.304018497467041, "learning_rate": 7.243827708703375e-06, "loss": 0.4257, "step": 63100 }, { "epoch": 27.939876215738284, "grad_norm": 2.8642430305480957, "learning_rate": 7.239387211367673e-06, "loss": 0.4234, "step": 63200 }, { "epoch": 27.984084880636605, "grad_norm": 2.9354429244995117, "learning_rate": 7.234946714031972e-06, "loss": 0.4129, "step": 63300 }, { "epoch": 28.028293545534925, "grad_norm": 3.6690609455108643, "learning_rate": 7.2305062166962706e-06, "loss": 0.421, "step": 63400 }, { "epoch": 28.072502210433246, "grad_norm": 3.8248374462127686, "learning_rate": 7.226065719360568e-06, "loss": 0.4207, "step": 63500 }, { "epoch": 28.072502210433246, "eval_bleu": 35.45932241928952, "eval_char_accuracy": 18.81093759086122, "eval_loss": 0.4623502492904663, "eval_runtime": 3871.1085, "eval_samples_per_second": 0.585, "eval_steps_per_second": 0.073, "step": 63500 }, { "epoch": 28.116710875331567, "grad_norm": 3.820878028869629, "learning_rate": 7.221625222024867e-06, "loss": 0.4292, "step": 63600 }, { "epoch": 28.160919540229884, "grad_norm": 3.8852899074554443, "learning_rate": 7.217229129662523e-06, "loss": 0.4122, "step": 63700 }, { "epoch": 28.205128205128204, "grad_norm": 2.952660322189331, "learning_rate": 7.212788632326822e-06, "loss": 0.395, "step": 63800 }, { "epoch": 28.249336870026525, "grad_norm": 3.2147085666656494, "learning_rate": 7.20834813499112e-06, "loss": 0.426, "step": 63900 }, { "epoch": 28.293545534924846, "grad_norm": 2.5556905269622803, "learning_rate": 7.203907637655418e-06, "loss": 0.4101, "step": 64000 }, { "epoch": 28.293545534924846, "eval_bleu": 35.66812604147292, "eval_char_accuracy": 19.085863448106533, "eval_loss": 0.46580231189727783, "eval_runtime": 3780.6393, "eval_samples_per_second": 0.599, "eval_steps_per_second": 0.075, "step": 64000 }, { "epoch": 28.337754199823166, "grad_norm": 3.6312379837036133, "learning_rate": 7.199467140319717e-06, "loss": 0.3968, "step": 64100 }, { "epoch": 28.381962864721487, "grad_norm": 3.2395622730255127, "learning_rate": 7.195026642984015e-06, "loss": 0.4069, "step": 64200 }, { "epoch": 28.426171529619804, "grad_norm": 2.855431079864502, "learning_rate": 7.1905861456483125e-06, "loss": 0.4118, "step": 64300 }, { "epoch": 28.470380194518125, "grad_norm": 4.113373756408691, "learning_rate": 7.186145648312611e-06, "loss": 0.4336, "step": 64400 }, { "epoch": 28.514588859416445, "grad_norm": 3.138721466064453, "learning_rate": 7.18170515097691e-06, "loss": 0.4164, "step": 64500 }, { "epoch": 28.514588859416445, "eval_bleu": 35.82463364744782, "eval_char_accuracy": 19.077786895308492, "eval_loss": 0.46322816610336304, "eval_runtime": 3765.7659, "eval_samples_per_second": 0.601, "eval_steps_per_second": 0.075, "step": 64500 }, { "epoch": 28.558797524314766, "grad_norm": 3.114168167114258, "learning_rate": 7.1772646536412085e-06, "loss": 0.4034, "step": 64600 }, { "epoch": 28.603006189213087, "grad_norm": 3.366145133972168, "learning_rate": 7.172824156305507e-06, "loss": 0.4163, "step": 64700 }, { "epoch": 28.647214854111407, "grad_norm": 3.4970927238464355, "learning_rate": 7.168383658969805e-06, "loss": 0.4186, "step": 64800 }, { "epoch": 28.691423519009724, "grad_norm": 3.682201385498047, "learning_rate": 7.163943161634104e-06, "loss": 0.4228, "step": 64900 }, { "epoch": 28.735632183908045, "grad_norm": 3.4790029525756836, "learning_rate": 7.159502664298402e-06, "loss": 0.4267, "step": 65000 }, { "epoch": 28.735632183908045, "eval_bleu": 36.010872307770896, "eval_char_accuracy": 19.013174472924163, "eval_loss": 0.46450960636138916, "eval_runtime": 3760.2447, "eval_samples_per_second": 0.602, "eval_steps_per_second": 0.075, "step": 65000 }, { "epoch": 28.779840848806366, "grad_norm": 3.1226418018341064, "learning_rate": 7.155062166962701e-06, "loss": 0.417, "step": 65100 }, { "epoch": 28.824049513704686, "grad_norm": 3.663931131362915, "learning_rate": 7.1506216696269995e-06, "loss": 0.4338, "step": 65200 }, { "epoch": 28.868258178603007, "grad_norm": 3.786334276199341, "learning_rate": 7.146181172291297e-06, "loss": 0.4386, "step": 65300 }, { "epoch": 28.912466843501328, "grad_norm": 3.8385679721832275, "learning_rate": 7.141740674955595e-06, "loss": 0.4245, "step": 65400 }, { "epoch": 28.956675508399645, "grad_norm": 3.7037734985351562, "learning_rate": 7.137300177619894e-06, "loss": 0.4162, "step": 65500 }, { "epoch": 28.956675508399645, "eval_bleu": 35.95501404953156, "eval_char_accuracy": 19.133999702782855, "eval_loss": 0.4584737718105316, "eval_runtime": 3842.767, "eval_samples_per_second": 0.589, "eval_steps_per_second": 0.074, "step": 65500 }, { "epoch": 29.000884173297965, "grad_norm": 3.640958786010742, "learning_rate": 7.1328596802841925e-06, "loss": 0.4192, "step": 65600 }, { "epoch": 29.045092838196286, "grad_norm": 4.846155166625977, "learning_rate": 7.128463587921848e-06, "loss": 0.391, "step": 65700 }, { "epoch": 29.089301503094607, "grad_norm": 4.6066060066223145, "learning_rate": 7.124023090586146e-06, "loss": 0.4022, "step": 65800 }, { "epoch": 29.133510167992927, "grad_norm": 4.162872791290283, "learning_rate": 7.119582593250445e-06, "loss": 0.4241, "step": 65900 }, { "epoch": 29.177718832891248, "grad_norm": 3.4699435234069824, "learning_rate": 7.115142095914743e-06, "loss": 0.3953, "step": 66000 }, { "epoch": 29.177718832891248, "eval_bleu": 36.04277308795141, "eval_char_accuracy": 19.24319469661237, "eval_loss": 0.46185824275016785, "eval_runtime": 3784.8936, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 66000 }, { "epoch": 29.221927497789565, "grad_norm": 3.7216436862945557, "learning_rate": 7.1107015985790415e-06, "loss": 0.4122, "step": 66100 }, { "epoch": 29.266136162687886, "grad_norm": 2.719778537750244, "learning_rate": 7.10626110124334e-06, "loss": 0.426, "step": 66200 }, { "epoch": 29.310344827586206, "grad_norm": 3.169494390487671, "learning_rate": 7.101820603907639e-06, "loss": 0.4127, "step": 66300 }, { "epoch": 29.354553492484527, "grad_norm": 3.290057420730591, "learning_rate": 7.097380106571936e-06, "loss": 0.4203, "step": 66400 }, { "epoch": 29.398762157382848, "grad_norm": 4.106680870056152, "learning_rate": 7.092939609236234e-06, "loss": 0.4161, "step": 66500 }, { "epoch": 29.398762157382848, "eval_bleu": 36.2912278817503, "eval_char_accuracy": 19.171151845653846, "eval_loss": 0.46162787079811096, "eval_runtime": 3994.9848, "eval_samples_per_second": 0.566, "eval_steps_per_second": 0.071, "step": 66500 }, { "epoch": 29.44297082228117, "grad_norm": 4.476029396057129, "learning_rate": 7.088499111900533e-06, "loss": 0.4, "step": 66600 }, { "epoch": 29.487179487179485, "grad_norm": 3.536804676055908, "learning_rate": 7.084058614564832e-06, "loss": 0.4384, "step": 66700 }, { "epoch": 29.531388152077806, "grad_norm": 3.252019166946411, "learning_rate": 7.07961811722913e-06, "loss": 0.418, "step": 66800 }, { "epoch": 29.575596816976127, "grad_norm": 3.5663766860961914, "learning_rate": 7.075177619893428e-06, "loss": 0.3903, "step": 66900 }, { "epoch": 29.619805481874447, "grad_norm": 3.640294313430786, "learning_rate": 7.070737122557727e-06, "loss": 0.4036, "step": 67000 }, { "epoch": 29.619805481874447, "eval_bleu": 36.2315118467333, "eval_char_accuracy": 19.165336727639257, "eval_loss": 0.46111467480659485, "eval_runtime": 3910.201, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.072, "step": 67000 }, { "epoch": 29.664014146772768, "grad_norm": 3.011579990386963, "learning_rate": 7.0662966252220255e-06, "loss": 0.4098, "step": 67100 }, { "epoch": 29.70822281167109, "grad_norm": 3.2572829723358154, "learning_rate": 7.061856127886324e-06, "loss": 0.4086, "step": 67200 }, { "epoch": 29.752431476569406, "grad_norm": 3.091500997543335, "learning_rate": 7.057415630550623e-06, "loss": 0.4221, "step": 67300 }, { "epoch": 29.796640141467726, "grad_norm": 3.5932700634002686, "learning_rate": 7.052975133214921e-06, "loss": 0.4148, "step": 67400 }, { "epoch": 29.840848806366047, "grad_norm": 3.8023266792297363, "learning_rate": 7.048534635879219e-06, "loss": 0.422, "step": 67500 }, { "epoch": 29.840848806366047, "eval_bleu": 36.145179122624285, "eval_char_accuracy": 19.604378137740763, "eval_loss": 0.4553970992565155, "eval_runtime": 3812.0469, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.074, "step": 67500 }, { "epoch": 29.885057471264368, "grad_norm": 3.5005674362182617, "learning_rate": 7.044094138543518e-06, "loss": 0.4087, "step": 67600 }, { "epoch": 29.92926613616269, "grad_norm": 3.4551196098327637, "learning_rate": 7.039653641207816e-06, "loss": 0.4034, "step": 67700 }, { "epoch": 29.97347480106101, "grad_norm": 3.8400402069091797, "learning_rate": 7.035257548845471e-06, "loss": 0.4113, "step": 67800 }, { "epoch": 30.01768346595933, "grad_norm": 3.8718202114105225, "learning_rate": 7.03081705150977e-06, "loss": 0.3976, "step": 67900 }, { "epoch": 30.061892130857647, "grad_norm": 4.010491371154785, "learning_rate": 7.026376554174068e-06, "loss": 0.3775, "step": 68000 }, { "epoch": 30.061892130857647, "eval_bleu": 36.41809445833699, "eval_char_accuracy": 19.088124882889986, "eval_loss": 0.45908066630363464, "eval_runtime": 3782.7245, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.075, "step": 68000 }, { "epoch": 30.106100795755967, "grad_norm": 3.099717140197754, "learning_rate": 7.021936056838367e-06, "loss": 0.417, "step": 68100 }, { "epoch": 30.150309460654288, "grad_norm": 4.091557502746582, "learning_rate": 7.017495559502665e-06, "loss": 0.4014, "step": 68200 }, { "epoch": 30.19451812555261, "grad_norm": 3.8190557956695557, "learning_rate": 7.013055062166963e-06, "loss": 0.4015, "step": 68300 }, { "epoch": 30.23872679045093, "grad_norm": 2.748532295227051, "learning_rate": 7.008614564831262e-06, "loss": 0.4198, "step": 68400 }, { "epoch": 30.28293545534925, "grad_norm": 3.2780096530914307, "learning_rate": 7.004174067495561e-06, "loss": 0.411, "step": 68500 }, { "epoch": 30.28293545534925, "eval_bleu": 36.015845962146955, "eval_char_accuracy": 19.275823969916456, "eval_loss": 0.4581856429576874, "eval_runtime": 3924.9875, "eval_samples_per_second": 0.577, "eval_steps_per_second": 0.072, "step": 68500 }, { "epoch": 30.327144120247567, "grad_norm": 4.234321117401123, "learning_rate": 6.999733570159859e-06, "loss": 0.4, "step": 68600 }, { "epoch": 30.371352785145888, "grad_norm": 3.7523138523101807, "learning_rate": 6.995293072824156e-06, "loss": 0.4088, "step": 68700 }, { "epoch": 30.41556145004421, "grad_norm": 3.919811725616455, "learning_rate": 6.990852575488455e-06, "loss": 0.3969, "step": 68800 }, { "epoch": 30.45977011494253, "grad_norm": 3.606421709060669, "learning_rate": 6.986412078152754e-06, "loss": 0.4065, "step": 68900 }, { "epoch": 30.50397877984085, "grad_norm": 3.1725118160247803, "learning_rate": 6.9819715808170514e-06, "loss": 0.3857, "step": 69000 }, { "epoch": 30.50397877984085, "eval_bleu": 36.22784353872542, "eval_char_accuracy": 18.95728472756172, "eval_loss": 0.4572690725326538, "eval_runtime": 3747.2374, "eval_samples_per_second": 0.604, "eval_steps_per_second": 0.076, "step": 69000 }, { "epoch": 30.54818744473917, "grad_norm": 4.069666385650635, "learning_rate": 6.97753108348135e-06, "loss": 0.4134, "step": 69100 }, { "epoch": 30.592396109637487, "grad_norm": 4.094821453094482, "learning_rate": 6.973090586145649e-06, "loss": 0.411, "step": 69200 }, { "epoch": 30.636604774535808, "grad_norm": 4.401938438415527, "learning_rate": 6.968650088809947e-06, "loss": 0.4038, "step": 69300 }, { "epoch": 30.68081343943413, "grad_norm": 4.26141357421875, "learning_rate": 6.964209591474246e-06, "loss": 0.4107, "step": 69400 }, { "epoch": 30.72502210433245, "grad_norm": 5.528987407684326, "learning_rate": 6.959769094138545e-06, "loss": 0.418, "step": 69500 }, { "epoch": 30.72502210433245, "eval_bleu": 36.36220877499752, "eval_char_accuracy": 19.370804230821417, "eval_loss": 0.45316198468208313, "eval_runtime": 3895.2486, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.073, "step": 69500 }, { "epoch": 30.76923076923077, "grad_norm": 3.351813554763794, "learning_rate": 6.9553285968028425e-06, "loss": 0.4068, "step": 69600 }, { "epoch": 30.81343943412909, "grad_norm": 3.1283462047576904, "learning_rate": 6.950888099467141e-06, "loss": 0.3927, "step": 69700 }, { "epoch": 30.857648099027408, "grad_norm": 3.6665456295013428, "learning_rate": 6.94644760213144e-06, "loss": 0.3918, "step": 69800 }, { "epoch": 30.90185676392573, "grad_norm": 3.402538299560547, "learning_rate": 6.942051509769094e-06, "loss": 0.4, "step": 69900 }, { "epoch": 30.94606542882405, "grad_norm": 3.561265468597412, "learning_rate": 6.937611012433393e-06, "loss": 0.3973, "step": 70000 }, { "epoch": 30.94606542882405, "eval_bleu": 36.38479979605211, "eval_char_accuracy": 19.52716629299149, "eval_loss": 0.45286065340042114, "eval_runtime": 3848.9845, "eval_samples_per_second": 0.588, "eval_steps_per_second": 0.074, "step": 70000 }, { "epoch": 30.99027409372237, "grad_norm": 3.6247286796569824, "learning_rate": 6.9331705150976915e-06, "loss": 0.3968, "step": 70100 }, { "epoch": 31.03448275862069, "grad_norm": 4.450321197509766, "learning_rate": 6.92873001776199e-06, "loss": 0.3961, "step": 70200 }, { "epoch": 31.07869142351901, "grad_norm": 3.2013375759124756, "learning_rate": 6.924289520426288e-06, "loss": 0.414, "step": 70300 }, { "epoch": 31.122900088417328, "grad_norm": 3.0451788902282715, "learning_rate": 6.919849023090587e-06, "loss": 0.4002, "step": 70400 }, { "epoch": 31.16710875331565, "grad_norm": 4.11805534362793, "learning_rate": 6.915408525754885e-06, "loss": 0.3935, "step": 70500 }, { "epoch": 31.16710875331565, "eval_bleu": 36.3836435610879, "eval_char_accuracy": 19.145629938812036, "eval_loss": 0.4523950517177582, "eval_runtime": 3959.4332, "eval_samples_per_second": 0.572, "eval_steps_per_second": 0.071, "step": 70500 }, { "epoch": 31.16710875331565, "step": 70500, "total_flos": 1.1474206696931328e+16, "train_loss": 0.05265532381314758, "train_runtime": 73728.1886, "train_samples_per_second": 24.544, "train_steps_per_second": 3.068 } ], "logging_steps": 100, "max_steps": 226200, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1474206696931328e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }