{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 216015,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006943962224845497,
"grad_norm": 2.206239700317383,
"learning_rate": 0.00029930699256996036,
"loss": 4.6213,
"step": 500
},
{
"epoch": 0.013887924449690994,
"grad_norm": 1.519214153289795,
"learning_rate": 0.0002986125963474758,
"loss": 4.5637,
"step": 1000
},
{
"epoch": 0.02083188667453649,
"grad_norm": 2.3049542903900146,
"learning_rate": 0.0002979182001249913,
"loss": 4.514,
"step": 1500
},
{
"epoch": 0.027775848899381988,
"grad_norm": 2.0959904193878174,
"learning_rate": 0.00029722380390250673,
"loss": 4.4586,
"step": 2000
},
{
"epoch": 0.034719811124227486,
"grad_norm": 2.667476177215576,
"learning_rate": 0.0002965294076800222,
"loss": 4.4977,
"step": 2500
},
{
"epoch": 0.04166377334907298,
"grad_norm": 1.8514024019241333,
"learning_rate": 0.00029583501145753765,
"loss": 4.4347,
"step": 3000
},
{
"epoch": 0.04860773557391848,
"grad_norm": 1.47382390499115,
"learning_rate": 0.00029514061523505306,
"loss": 4.4555,
"step": 3500
},
{
"epoch": 0.055551697798763976,
"grad_norm": 1.5439454317092896,
"learning_rate": 0.0002944462190125685,
"loss": 4.4132,
"step": 4000
},
{
"epoch": 0.06249566002360947,
"grad_norm": 2.2414743900299072,
"learning_rate": 0.000293751822790084,
"loss": 4.4148,
"step": 4500
},
{
"epoch": 0.06943962224845497,
"grad_norm": 2.522726058959961,
"learning_rate": 0.00029305742656759944,
"loss": 4.3732,
"step": 5000
},
{
"epoch": 0.07638358447330046,
"grad_norm": 2.1512997150421143,
"learning_rate": 0.0002923630303451149,
"loss": 4.3563,
"step": 5500
},
{
"epoch": 0.08332754669814596,
"grad_norm": 1.3137620687484741,
"learning_rate": 0.00029166863412263035,
"loss": 4.3309,
"step": 6000
},
{
"epoch": 0.09027150892299146,
"grad_norm": 2.2317123413085938,
"learning_rate": 0.0002909742379001458,
"loss": 4.335,
"step": 6500
},
{
"epoch": 0.09721547114783696,
"grad_norm": 1.4936091899871826,
"learning_rate": 0.0002902798416776612,
"loss": 4.3287,
"step": 7000
},
{
"epoch": 0.10415943337268245,
"grad_norm": 1.4944448471069336,
"learning_rate": 0.0002895854454551767,
"loss": 4.2728,
"step": 7500
},
{
"epoch": 0.11110339559752795,
"grad_norm": 2.103372573852539,
"learning_rate": 0.00028889104923269214,
"loss": 4.2945,
"step": 8000
},
{
"epoch": 0.11804735782237345,
"grad_norm": 1.6695603132247925,
"learning_rate": 0.0002881966530102076,
"loss": 4.3126,
"step": 8500
},
{
"epoch": 0.12499132004721894,
"grad_norm": 2.2535531520843506,
"learning_rate": 0.00028750225678772306,
"loss": 4.3014,
"step": 9000
},
{
"epoch": 0.13193528227206444,
"grad_norm": 2.5216352939605713,
"learning_rate": 0.0002868078605652385,
"loss": 4.233,
"step": 9500
},
{
"epoch": 0.13887924449690994,
"grad_norm": 2.3059794902801514,
"learning_rate": 0.0002861134643427539,
"loss": 4.2875,
"step": 10000
},
{
"epoch": 0.14582320672175544,
"grad_norm": 2.7461228370666504,
"learning_rate": 0.0002854190681202694,
"loss": 4.2872,
"step": 10500
},
{
"epoch": 0.15276716894660092,
"grad_norm": 1.7328039407730103,
"learning_rate": 0.00028472467189778484,
"loss": 4.2354,
"step": 11000
},
{
"epoch": 0.15971113117144642,
"grad_norm": 1.904403805732727,
"learning_rate": 0.0002840302756753003,
"loss": 4.2339,
"step": 11500
},
{
"epoch": 0.16665509339629192,
"grad_norm": 1.4885430335998535,
"learning_rate": 0.00028333587945281576,
"loss": 4.2214,
"step": 12000
},
{
"epoch": 0.17359905562113742,
"grad_norm": 1.5213295221328735,
"learning_rate": 0.0002826414832303312,
"loss": 4.1642,
"step": 12500
},
{
"epoch": 0.18054301784598292,
"grad_norm": 1.5499557256698608,
"learning_rate": 0.0002819470870078466,
"loss": 4.2083,
"step": 13000
},
{
"epoch": 0.18748698007082842,
"grad_norm": 1.7087458372116089,
"learning_rate": 0.0002812526907853621,
"loss": 4.192,
"step": 13500
},
{
"epoch": 0.19443094229567393,
"grad_norm": 1.4186779260635376,
"learning_rate": 0.00028055829456287754,
"loss": 4.1686,
"step": 14000
},
{
"epoch": 0.2013749045205194,
"grad_norm": 1.7402822971343994,
"learning_rate": 0.000279863898340393,
"loss": 4.1988,
"step": 14500
},
{
"epoch": 0.2083188667453649,
"grad_norm": 1.821722149848938,
"learning_rate": 0.00027916950211790846,
"loss": 4.1941,
"step": 15000
},
{
"epoch": 0.2152628289702104,
"grad_norm": 1.207729458808899,
"learning_rate": 0.0002784751058954239,
"loss": 4.1868,
"step": 15500
},
{
"epoch": 0.2222067911950559,
"grad_norm": 1.804909110069275,
"learning_rate": 0.0002777807096729393,
"loss": 4.1657,
"step": 16000
},
{
"epoch": 0.2291507534199014,
"grad_norm": 2.126279830932617,
"learning_rate": 0.0002770863134504548,
"loss": 4.1083,
"step": 16500
},
{
"epoch": 0.2360947156447469,
"grad_norm": 1.3076670169830322,
"learning_rate": 0.00027639191722797024,
"loss": 4.1446,
"step": 17000
},
{
"epoch": 0.24303867786959238,
"grad_norm": 2.152813196182251,
"learning_rate": 0.0002756975210054857,
"loss": 4.1444,
"step": 17500
},
{
"epoch": 0.24998264009443788,
"grad_norm": 1.9263052940368652,
"learning_rate": 0.00027500312478300116,
"loss": 4.1581,
"step": 18000
},
{
"epoch": 0.2569266023192834,
"grad_norm": 1.7251839637756348,
"learning_rate": 0.0002743087285605166,
"loss": 4.1468,
"step": 18500
},
{
"epoch": 0.2638705645441289,
"grad_norm": 1.2336386442184448,
"learning_rate": 0.0002736143323380321,
"loss": 4.1253,
"step": 19000
},
{
"epoch": 0.2708145267689744,
"grad_norm": 1.2728581428527832,
"learning_rate": 0.0002729199361155475,
"loss": 4.0935,
"step": 19500
},
{
"epoch": 0.2777584889938199,
"grad_norm": 1.6062270402908325,
"learning_rate": 0.00027222553989306294,
"loss": 4.1383,
"step": 20000
},
{
"epoch": 0.2847024512186654,
"grad_norm": 1.4133198261260986,
"learning_rate": 0.0002715311436705784,
"loss": 4.1044,
"step": 20500
},
{
"epoch": 0.2916464134435109,
"grad_norm": 1.421473741531372,
"learning_rate": 0.00027083674744809386,
"loss": 4.103,
"step": 21000
},
{
"epoch": 0.2985903756683564,
"grad_norm": 1.92391836643219,
"learning_rate": 0.0002701423512256093,
"loss": 4.0771,
"step": 21500
},
{
"epoch": 0.30553433789320184,
"grad_norm": 2.6180472373962402,
"learning_rate": 0.0002694479550031248,
"loss": 4.0713,
"step": 22000
},
{
"epoch": 0.31247830011804734,
"grad_norm": 2.3145902156829834,
"learning_rate": 0.0002687535587806402,
"loss": 4.0287,
"step": 22500
},
{
"epoch": 0.31942226234289284,
"grad_norm": 1.9222602844238281,
"learning_rate": 0.00026805916255815565,
"loss": 4.0714,
"step": 23000
},
{
"epoch": 0.32636622456773834,
"grad_norm": 1.6106317043304443,
"learning_rate": 0.0002673647663356711,
"loss": 4.0375,
"step": 23500
},
{
"epoch": 0.33331018679258384,
"grad_norm": 1.6297123432159424,
"learning_rate": 0.00026667037011318657,
"loss": 4.0668,
"step": 24000
},
{
"epoch": 0.34025414901742934,
"grad_norm": 2.0038726329803467,
"learning_rate": 0.000265975973890702,
"loss": 4.0266,
"step": 24500
},
{
"epoch": 0.34719811124227484,
"grad_norm": 1.7728261947631836,
"learning_rate": 0.0002652815776682175,
"loss": 4.0499,
"step": 25000
},
{
"epoch": 0.35414207346712034,
"grad_norm": 1.9266184568405151,
"learning_rate": 0.0002645871814457329,
"loss": 4.0055,
"step": 25500
},
{
"epoch": 0.36108603569196585,
"grad_norm": 3.5189244747161865,
"learning_rate": 0.00026389278522324835,
"loss": 4.004,
"step": 26000
},
{
"epoch": 0.36802999791681135,
"grad_norm": 1.9975138902664185,
"learning_rate": 0.0002631983890007638,
"loss": 3.9998,
"step": 26500
},
{
"epoch": 0.37497396014165685,
"grad_norm": 3.087763547897339,
"learning_rate": 0.00026250399277827927,
"loss": 4.0146,
"step": 27000
},
{
"epoch": 0.38191792236650235,
"grad_norm": 1.297499179840088,
"learning_rate": 0.0002618095965557947,
"loss": 4.0206,
"step": 27500
},
{
"epoch": 0.38886188459134785,
"grad_norm": 1.4603493213653564,
"learning_rate": 0.0002611152003333102,
"loss": 4.0337,
"step": 28000
},
{
"epoch": 0.3958058468161933,
"grad_norm": 1.5912282466888428,
"learning_rate": 0.0002604208041108256,
"loss": 3.9878,
"step": 28500
},
{
"epoch": 0.4027498090410388,
"grad_norm": 1.4256983995437622,
"learning_rate": 0.00025972640788834105,
"loss": 4.0107,
"step": 29000
},
{
"epoch": 0.4096937712658843,
"grad_norm": 1.6172006130218506,
"learning_rate": 0.0002590320116658565,
"loss": 4.0011,
"step": 29500
},
{
"epoch": 0.4166377334907298,
"grad_norm": 2.2637939453125,
"learning_rate": 0.00025833761544337197,
"loss": 4.0057,
"step": 30000
},
{
"epoch": 0.4235816957155753,
"grad_norm": 1.6595959663391113,
"learning_rate": 0.00025764321922088743,
"loss": 4.0112,
"step": 30500
},
{
"epoch": 0.4305256579404208,
"grad_norm": 1.7675671577453613,
"learning_rate": 0.0002569488229984029,
"loss": 3.9575,
"step": 31000
},
{
"epoch": 0.4374696201652663,
"grad_norm": 1.9230527877807617,
"learning_rate": 0.00025625442677591835,
"loss": 3.9734,
"step": 31500
},
{
"epoch": 0.4444135823901118,
"grad_norm": 1.6587070226669312,
"learning_rate": 0.00025556003055343375,
"loss": 3.973,
"step": 32000
},
{
"epoch": 0.4513575446149573,
"grad_norm": 1.8445744514465332,
"learning_rate": 0.0002548656343309492,
"loss": 3.9792,
"step": 32500
},
{
"epoch": 0.4583015068398028,
"grad_norm": 2.5224626064300537,
"learning_rate": 0.00025417123810846467,
"loss": 3.9501,
"step": 33000
},
{
"epoch": 0.4652454690646483,
"grad_norm": 1.8237272500991821,
"learning_rate": 0.00025347684188598013,
"loss": 3.9696,
"step": 33500
},
{
"epoch": 0.4721894312894938,
"grad_norm": 3.2028214931488037,
"learning_rate": 0.0002527824456634956,
"loss": 3.9386,
"step": 34000
},
{
"epoch": 0.47913339351433926,
"grad_norm": 1.7500147819519043,
"learning_rate": 0.00025208804944101105,
"loss": 3.9328,
"step": 34500
},
{
"epoch": 0.48607735573918476,
"grad_norm": 1.8961389064788818,
"learning_rate": 0.00025139365321852645,
"loss": 3.9132,
"step": 35000
},
{
"epoch": 0.49302131796403026,
"grad_norm": 1.3247839212417603,
"learning_rate": 0.0002506992569960419,
"loss": 3.9139,
"step": 35500
},
{
"epoch": 0.49996528018887576,
"grad_norm": 1.6709811687469482,
"learning_rate": 0.00025000486077355737,
"loss": 3.9646,
"step": 36000
},
{
"epoch": 0.5069092424137213,
"grad_norm": 1.717537760734558,
"learning_rate": 0.00024931046455107283,
"loss": 3.9328,
"step": 36500
},
{
"epoch": 0.5138532046385668,
"grad_norm": 2.0001790523529053,
"learning_rate": 0.0002486160683285883,
"loss": 3.9522,
"step": 37000
},
{
"epoch": 0.5207971668634123,
"grad_norm": 1.53634774684906,
"learning_rate": 0.00024792167210610375,
"loss": 3.8908,
"step": 37500
},
{
"epoch": 0.5277411290882578,
"grad_norm": 1.6681393384933472,
"learning_rate": 0.00024722727588361916,
"loss": 3.9057,
"step": 38000
},
{
"epoch": 0.5346850913131033,
"grad_norm": 1.4480671882629395,
"learning_rate": 0.0002465328796611346,
"loss": 3.9144,
"step": 38500
},
{
"epoch": 0.5416290535379488,
"grad_norm": 2.7067551612854004,
"learning_rate": 0.0002458384834386501,
"loss": 3.9379,
"step": 39000
},
{
"epoch": 0.5485730157627943,
"grad_norm": 1.919639229774475,
"learning_rate": 0.00024514408721616553,
"loss": 3.8559,
"step": 39500
},
{
"epoch": 0.5555169779876398,
"grad_norm": 1.9291149377822876,
"learning_rate": 0.000244449690993681,
"loss": 3.8821,
"step": 40000
},
{
"epoch": 0.5624609402124853,
"grad_norm": 2.4123694896698,
"learning_rate": 0.00024375529477119642,
"loss": 3.9055,
"step": 40500
},
{
"epoch": 0.5694049024373308,
"grad_norm": 1.5772641897201538,
"learning_rate": 0.00024306089854871186,
"loss": 3.9045,
"step": 41000
},
{
"epoch": 0.5763488646621763,
"grad_norm": 2.2179367542266846,
"learning_rate": 0.00024236650232622732,
"loss": 3.9022,
"step": 41500
},
{
"epoch": 0.5832928268870218,
"grad_norm": 1.1816768646240234,
"learning_rate": 0.00024167210610374278,
"loss": 3.8836,
"step": 42000
},
{
"epoch": 0.5902367891118673,
"grad_norm": 1.7630631923675537,
"learning_rate": 0.00024097770988125823,
"loss": 3.8657,
"step": 42500
},
{
"epoch": 0.5971807513367128,
"grad_norm": 1.777095913887024,
"learning_rate": 0.00024028331365877367,
"loss": 3.8529,
"step": 43000
},
{
"epoch": 0.6041247135615583,
"grad_norm": 1.7181869745254517,
"learning_rate": 0.00023958891743628913,
"loss": 3.8781,
"step": 43500
},
{
"epoch": 0.6110686757864037,
"grad_norm": 1.855504035949707,
"learning_rate": 0.00023889452121380459,
"loss": 3.8391,
"step": 44000
},
{
"epoch": 0.6180126380112492,
"grad_norm": 1.7183347940444946,
"learning_rate": 0.00023820012499132002,
"loss": 3.836,
"step": 44500
},
{
"epoch": 0.6249566002360947,
"grad_norm": 1.8155463933944702,
"learning_rate": 0.00023750572876883548,
"loss": 3.8238,
"step": 45000
},
{
"epoch": 0.6319005624609402,
"grad_norm": 1.4531205892562866,
"learning_rate": 0.00023681133254635094,
"loss": 3.8348,
"step": 45500
},
{
"epoch": 0.6388445246857857,
"grad_norm": 2.0805277824401855,
"learning_rate": 0.00023611693632386637,
"loss": 3.8441,
"step": 46000
},
{
"epoch": 0.6457884869106312,
"grad_norm": 2.2948648929595947,
"learning_rate": 0.00023542254010138183,
"loss": 3.8511,
"step": 46500
},
{
"epoch": 0.6527324491354767,
"grad_norm": 1.958189606666565,
"learning_rate": 0.0002347281438788973,
"loss": 3.8678,
"step": 47000
},
{
"epoch": 0.6596764113603222,
"grad_norm": 1.9029563665390015,
"learning_rate": 0.00023403374765641272,
"loss": 3.8204,
"step": 47500
},
{
"epoch": 0.6666203735851677,
"grad_norm": 1.6925806999206543,
"learning_rate": 0.00023333935143392818,
"loss": 3.861,
"step": 48000
},
{
"epoch": 0.6735643358100132,
"grad_norm": 2.417433023452759,
"learning_rate": 0.00023264495521144364,
"loss": 3.8151,
"step": 48500
},
{
"epoch": 0.6805082980348587,
"grad_norm": 1.94263756275177,
"learning_rate": 0.00023195055898895907,
"loss": 3.8351,
"step": 49000
},
{
"epoch": 0.6874522602597042,
"grad_norm": 2.0970757007598877,
"learning_rate": 0.00023125616276647453,
"loss": 3.8728,
"step": 49500
},
{
"epoch": 0.6943962224845497,
"grad_norm": 1.8286621570587158,
"learning_rate": 0.00023056176654399,
"loss": 3.8262,
"step": 50000
},
{
"epoch": 0.7013401847093952,
"grad_norm": 1.3233591318130493,
"learning_rate": 0.00022986737032150542,
"loss": 3.8186,
"step": 50500
},
{
"epoch": 0.7082841469342407,
"grad_norm": 1.760081171989441,
"learning_rate": 0.00022917297409902088,
"loss": 3.8268,
"step": 51000
},
{
"epoch": 0.7152281091590862,
"grad_norm": 2.040560722351074,
"learning_rate": 0.00022847857787653634,
"loss": 3.8425,
"step": 51500
},
{
"epoch": 0.7221720713839317,
"grad_norm": 2.493685007095337,
"learning_rate": 0.00022778418165405177,
"loss": 3.8362,
"step": 52000
},
{
"epoch": 0.7291160336087772,
"grad_norm": 1.7292836904525757,
"learning_rate": 0.00022708978543156723,
"loss": 3.8317,
"step": 52500
},
{
"epoch": 0.7360599958336227,
"grad_norm": 2.2565951347351074,
"learning_rate": 0.0002263953892090827,
"loss": 3.7835,
"step": 53000
},
{
"epoch": 0.7430039580584682,
"grad_norm": 1.6440356969833374,
"learning_rate": 0.00022570099298659812,
"loss": 3.8079,
"step": 53500
},
{
"epoch": 0.7499479202833137,
"grad_norm": 1.8633214235305786,
"learning_rate": 0.00022500659676411358,
"loss": 3.803,
"step": 54000
},
{
"epoch": 0.7568918825081592,
"grad_norm": 2.401519775390625,
"learning_rate": 0.00022431220054162904,
"loss": 3.7996,
"step": 54500
},
{
"epoch": 0.7638358447330047,
"grad_norm": 1.3482192754745483,
"learning_rate": 0.0002236178043191445,
"loss": 3.7957,
"step": 55000
},
{
"epoch": 0.7707798069578502,
"grad_norm": 2.4375321865081787,
"learning_rate": 0.00022292340809665993,
"loss": 3.8011,
"step": 55500
},
{
"epoch": 0.7777237691826957,
"grad_norm": 1.4207526445388794,
"learning_rate": 0.0002222290118741754,
"loss": 3.7809,
"step": 56000
},
{
"epoch": 0.7846677314075411,
"grad_norm": 2.166013717651367,
"learning_rate": 0.00022153461565169085,
"loss": 3.8016,
"step": 56500
},
{
"epoch": 0.7916116936323866,
"grad_norm": 1.4218441247940063,
"learning_rate": 0.00022084021942920628,
"loss": 3.7636,
"step": 57000
},
{
"epoch": 0.7985556558572321,
"grad_norm": 1.5661506652832031,
"learning_rate": 0.00022014582320672174,
"loss": 3.7958,
"step": 57500
},
{
"epoch": 0.8054996180820776,
"grad_norm": 1.311798095703125,
"learning_rate": 0.0002194514269842372,
"loss": 3.7593,
"step": 58000
},
{
"epoch": 0.8124435803069231,
"grad_norm": 1.3802398443222046,
"learning_rate": 0.00021875703076175264,
"loss": 3.7758,
"step": 58500
},
{
"epoch": 0.8193875425317686,
"grad_norm": 1.7688322067260742,
"learning_rate": 0.0002180626345392681,
"loss": 3.7685,
"step": 59000
},
{
"epoch": 0.8263315047566141,
"grad_norm": 1.8496917486190796,
"learning_rate": 0.00021736823831678355,
"loss": 3.8083,
"step": 59500
},
{
"epoch": 0.8332754669814596,
"grad_norm": 1.2840275764465332,
"learning_rate": 0.00021667384209429899,
"loss": 3.7912,
"step": 60000
},
{
"epoch": 0.8402194292063051,
"grad_norm": 1.4152112007141113,
"learning_rate": 0.00021597944587181445,
"loss": 3.75,
"step": 60500
},
{
"epoch": 0.8471633914311506,
"grad_norm": 2.001692771911621,
"learning_rate": 0.0002152850496493299,
"loss": 3.7748,
"step": 61000
},
{
"epoch": 0.8541073536559961,
"grad_norm": 2.6924116611480713,
"learning_rate": 0.00021459065342684534,
"loss": 3.7576,
"step": 61500
},
{
"epoch": 0.8610513158808416,
"grad_norm": 1.5775929689407349,
"learning_rate": 0.0002138962572043608,
"loss": 3.7357,
"step": 62000
},
{
"epoch": 0.8679952781056871,
"grad_norm": 2.122657060623169,
"learning_rate": 0.00021320186098187626,
"loss": 3.7513,
"step": 62500
},
{
"epoch": 0.8749392403305326,
"grad_norm": 1.8863738775253296,
"learning_rate": 0.0002125074647593917,
"loss": 3.7259,
"step": 63000
},
{
"epoch": 0.8818832025553781,
"grad_norm": 1.46346914768219,
"learning_rate": 0.00021181306853690715,
"loss": 3.7851,
"step": 63500
},
{
"epoch": 0.8888271647802236,
"grad_norm": 2.3657708168029785,
"learning_rate": 0.0002111186723144226,
"loss": 3.7523,
"step": 64000
},
{
"epoch": 0.8957711270050691,
"grad_norm": 1.5897114276885986,
"learning_rate": 0.00021042427609193804,
"loss": 3.7526,
"step": 64500
},
{
"epoch": 0.9027150892299146,
"grad_norm": 1.8869891166687012,
"learning_rate": 0.0002097298798694535,
"loss": 3.7406,
"step": 65000
},
{
"epoch": 0.9096590514547601,
"grad_norm": 1.891735315322876,
"learning_rate": 0.00020903548364696896,
"loss": 3.7229,
"step": 65500
},
{
"epoch": 0.9166030136796056,
"grad_norm": 1.4305230379104614,
"learning_rate": 0.0002083410874244844,
"loss": 3.7712,
"step": 66000
},
{
"epoch": 0.9235469759044511,
"grad_norm": 1.571385145187378,
"learning_rate": 0.00020764669120199985,
"loss": 3.739,
"step": 66500
},
{
"epoch": 0.9304909381292966,
"grad_norm": 1.64103102684021,
"learning_rate": 0.0002069522949795153,
"loss": 3.7291,
"step": 67000
},
{
"epoch": 0.9374349003541421,
"grad_norm": 1.683289647102356,
"learning_rate": 0.00020625789875703077,
"loss": 3.7349,
"step": 67500
},
{
"epoch": 0.9443788625789876,
"grad_norm": 1.9319536685943604,
"learning_rate": 0.0002055635025345462,
"loss": 3.7298,
"step": 68000
},
{
"epoch": 0.9513228248038331,
"grad_norm": 1.2139371633529663,
"learning_rate": 0.00020486910631206166,
"loss": 3.7077,
"step": 68500
},
{
"epoch": 0.9582667870286785,
"grad_norm": 2.366407871246338,
"learning_rate": 0.00020417471008957712,
"loss": 3.7156,
"step": 69000
},
{
"epoch": 0.965210749253524,
"grad_norm": 1.2618952989578247,
"learning_rate": 0.00020348031386709255,
"loss": 3.6964,
"step": 69500
},
{
"epoch": 0.9721547114783695,
"grad_norm": 1.3639748096466064,
"learning_rate": 0.000202785917644608,
"loss": 3.7082,
"step": 70000
},
{
"epoch": 0.979098673703215,
"grad_norm": 1.7581994533538818,
"learning_rate": 0.00020209152142212347,
"loss": 3.7073,
"step": 70500
},
{
"epoch": 0.9860426359280605,
"grad_norm": 2.42798113822937,
"learning_rate": 0.00020139712519963887,
"loss": 3.7116,
"step": 71000
},
{
"epoch": 0.992986598152906,
"grad_norm": 1.6432359218597412,
"learning_rate": 0.00020070272897715436,
"loss": 3.6997,
"step": 71500
},
{
"epoch": 0.9999305603777515,
"grad_norm": 1.079620361328125,
"learning_rate": 0.00020000833275466982,
"loss": 3.6916,
"step": 72000
},
{
"epoch": 1.0,
"eval_loss": 3.4631764888763428,
"eval_rouge1": 0.04941977533882873,
"eval_rouge2": 0.009172388840507199,
"eval_rougeL": 0.04889121785661922,
"eval_rougeLsum": 0.04914022943695945,
"eval_runtime": 3964.6307,
"eval_samples_per_second": 4.036,
"eval_steps_per_second": 2.018,
"step": 72005
},
{
"epoch": 1.006874522602597,
"grad_norm": 1.7696194648742676,
"learning_rate": 0.00019931393653218523,
"loss": 3.6153,
"step": 72500
},
{
"epoch": 1.0138184848274425,
"grad_norm": 1.091058611869812,
"learning_rate": 0.00019861954030970068,
"loss": 3.6146,
"step": 73000
},
{
"epoch": 1.020762447052288,
"grad_norm": 1.8638333082199097,
"learning_rate": 0.00019792514408721617,
"loss": 3.609,
"step": 73500
},
{
"epoch": 1.0277064092771335,
"grad_norm": 2.0278666019439697,
"learning_rate": 0.00019723074786473158,
"loss": 3.6129,
"step": 74000
},
{
"epoch": 1.034650371501979,
"grad_norm": 2.3207359313964844,
"learning_rate": 0.00019653635164224704,
"loss": 3.601,
"step": 74500
},
{
"epoch": 1.0415943337268245,
"grad_norm": 1.4969432353973389,
"learning_rate": 0.00019584195541976252,
"loss": 3.604,
"step": 75000
},
{
"epoch": 1.04853829595167,
"grad_norm": 1.567814588546753,
"learning_rate": 0.00019514755919727793,
"loss": 3.5989,
"step": 75500
},
{
"epoch": 1.0554822581765155,
"grad_norm": 1.7745885848999023,
"learning_rate": 0.00019445316297479339,
"loss": 3.5884,
"step": 76000
},
{
"epoch": 1.062426220401361,
"grad_norm": 1.9692455530166626,
"learning_rate": 0.00019375876675230885,
"loss": 3.6422,
"step": 76500
},
{
"epoch": 1.0693701826262065,
"grad_norm": 1.5432820320129395,
"learning_rate": 0.00019306437052982428,
"loss": 3.5908,
"step": 77000
},
{
"epoch": 1.076314144851052,
"grad_norm": 1.8240996599197388,
"learning_rate": 0.00019236997430733974,
"loss": 3.6187,
"step": 77500
},
{
"epoch": 1.0832581070758975,
"grad_norm": 1.2577378749847412,
"learning_rate": 0.0001916755780848552,
"loss": 3.537,
"step": 78000
},
{
"epoch": 1.090202069300743,
"grad_norm": 1.969597339630127,
"learning_rate": 0.00019098118186237063,
"loss": 3.6229,
"step": 78500
},
{
"epoch": 1.0971460315255885,
"grad_norm": 2.464380979537964,
"learning_rate": 0.0001902867856398861,
"loss": 3.6009,
"step": 79000
},
{
"epoch": 1.104089993750434,
"grad_norm": 2.1590375900268555,
"learning_rate": 0.00018959238941740155,
"loss": 3.5834,
"step": 79500
},
{
"epoch": 1.1110339559752795,
"grad_norm": 1.2929880619049072,
"learning_rate": 0.000188897993194917,
"loss": 3.605,
"step": 80000
},
{
"epoch": 1.117977918200125,
"grad_norm": 2.7584567070007324,
"learning_rate": 0.00018820359697243244,
"loss": 3.5863,
"step": 80500
},
{
"epoch": 1.1249218804249705,
"grad_norm": 2.8105475902557373,
"learning_rate": 0.0001875092007499479,
"loss": 3.5646,
"step": 81000
},
{
"epoch": 1.131865842649816,
"grad_norm": 1.6573877334594727,
"learning_rate": 0.00018681480452746336,
"loss": 3.6216,
"step": 81500
},
{
"epoch": 1.1388098048746615,
"grad_norm": 1.3650224208831787,
"learning_rate": 0.0001861204083049788,
"loss": 3.6155,
"step": 82000
},
{
"epoch": 1.145753767099507,
"grad_norm": 1.3206992149353027,
"learning_rate": 0.00018542601208249425,
"loss": 3.62,
"step": 82500
},
{
"epoch": 1.1526977293243525,
"grad_norm": 2.5373497009277344,
"learning_rate": 0.0001847316158600097,
"loss": 3.595,
"step": 83000
},
{
"epoch": 1.159641691549198,
"grad_norm": 1.503808856010437,
"learning_rate": 0.00018403721963752514,
"loss": 3.6213,
"step": 83500
},
{
"epoch": 1.1665856537740436,
"grad_norm": 2.725497007369995,
"learning_rate": 0.0001833428234150406,
"loss": 3.6006,
"step": 84000
},
{
"epoch": 1.173529615998889,
"grad_norm": 1.50645112991333,
"learning_rate": 0.00018264842719255606,
"loss": 3.6263,
"step": 84500
},
{
"epoch": 1.1804735782237346,
"grad_norm": 1.8035708665847778,
"learning_rate": 0.0001819540309700715,
"loss": 3.5539,
"step": 85000
},
{
"epoch": 1.18741754044858,
"grad_norm": 1.9327325820922852,
"learning_rate": 0.00018125963474758695,
"loss": 3.5822,
"step": 85500
},
{
"epoch": 1.1943615026734253,
"grad_norm": 1.14821195602417,
"learning_rate": 0.0001805652385251024,
"loss": 3.6075,
"step": 86000
},
{
"epoch": 1.201305464898271,
"grad_norm": 1.6110094785690308,
"learning_rate": 0.00017987084230261784,
"loss": 3.5879,
"step": 86500
},
{
"epoch": 1.2082494271231163,
"grad_norm": 1.9771841764450073,
"learning_rate": 0.0001791764460801333,
"loss": 3.5976,
"step": 87000
},
{
"epoch": 1.215193389347962,
"grad_norm": 1.6200594902038574,
"learning_rate": 0.00017848204985764876,
"loss": 3.5788,
"step": 87500
},
{
"epoch": 1.2221373515728073,
"grad_norm": 1.9356050491333008,
"learning_rate": 0.0001777876536351642,
"loss": 3.5611,
"step": 88000
},
{
"epoch": 1.229081313797653,
"grad_norm": 2.2079310417175293,
"learning_rate": 0.00017709325741267965,
"loss": 3.5828,
"step": 88500
},
{
"epoch": 1.2360252760224983,
"grad_norm": 1.6722863912582397,
"learning_rate": 0.0001763988611901951,
"loss": 3.5509,
"step": 89000
},
{
"epoch": 1.2429692382473438,
"grad_norm": 1.2027846574783325,
"learning_rate": 0.00017570446496771054,
"loss": 3.5512,
"step": 89500
},
{
"epoch": 1.2499132004721893,
"grad_norm": 3.1451849937438965,
"learning_rate": 0.000175010068745226,
"loss": 3.5704,
"step": 90000
},
{
"epoch": 1.2568571626970348,
"grad_norm": 1.64677095413208,
"learning_rate": 0.00017431567252274146,
"loss": 3.5879,
"step": 90500
},
{
"epoch": 1.2638011249218803,
"grad_norm": 1.2925798892974854,
"learning_rate": 0.0001736212763002569,
"loss": 3.5499,
"step": 91000
},
{
"epoch": 1.2707450871467258,
"grad_norm": 1.2768690586090088,
"learning_rate": 0.00017292688007777235,
"loss": 3.5538,
"step": 91500
},
{
"epoch": 1.2776890493715714,
"grad_norm": 2.6654281616210938,
"learning_rate": 0.0001722324838552878,
"loss": 3.5431,
"step": 92000
},
{
"epoch": 1.2846330115964169,
"grad_norm": 1.8698071241378784,
"learning_rate": 0.00017153808763280327,
"loss": 3.5646,
"step": 92500
},
{
"epoch": 1.2915769738212624,
"grad_norm": 1.6036585569381714,
"learning_rate": 0.0001708436914103187,
"loss": 3.5595,
"step": 93000
},
{
"epoch": 1.2985209360461079,
"grad_norm": 1.4960416555404663,
"learning_rate": 0.00017014929518783416,
"loss": 3.549,
"step": 93500
},
{
"epoch": 1.3054648982709534,
"grad_norm": 1.5892603397369385,
"learning_rate": 0.00016945489896534962,
"loss": 3.5144,
"step": 94000
},
{
"epoch": 1.3124088604957989,
"grad_norm": 1.2791684865951538,
"learning_rate": 0.00016876050274286506,
"loss": 3.5494,
"step": 94500
},
{
"epoch": 1.3193528227206444,
"grad_norm": 4.176353454589844,
"learning_rate": 0.00016806610652038052,
"loss": 3.5705,
"step": 95000
},
{
"epoch": 1.3262967849454899,
"grad_norm": 1.6479995250701904,
"learning_rate": 0.00016737171029789597,
"loss": 3.5644,
"step": 95500
},
{
"epoch": 1.3332407471703354,
"grad_norm": 1.6295863389968872,
"learning_rate": 0.0001666773140754114,
"loss": 3.5614,
"step": 96000
},
{
"epoch": 1.3401847093951809,
"grad_norm": 2.4127180576324463,
"learning_rate": 0.00016598291785292687,
"loss": 3.5488,
"step": 96500
},
{
"epoch": 1.3471286716200264,
"grad_norm": 2.4236507415771484,
"learning_rate": 0.00016528852163044233,
"loss": 3.5188,
"step": 97000
},
{
"epoch": 1.3540726338448719,
"grad_norm": 1.2415298223495483,
"learning_rate": 0.00016459412540795776,
"loss": 3.5843,
"step": 97500
},
{
"epoch": 1.3610165960697174,
"grad_norm": 2.39335298538208,
"learning_rate": 0.00016389972918547322,
"loss": 3.5445,
"step": 98000
},
{
"epoch": 1.3679605582945629,
"grad_norm": 1.481112003326416,
"learning_rate": 0.00016320533296298868,
"loss": 3.5725,
"step": 98500
},
{
"epoch": 1.3749045205194084,
"grad_norm": 1.8762099742889404,
"learning_rate": 0.0001625109367405041,
"loss": 3.5422,
"step": 99000
},
{
"epoch": 1.3818484827442539,
"grad_norm": 1.4844539165496826,
"learning_rate": 0.00016181654051801957,
"loss": 3.5469,
"step": 99500
},
{
"epoch": 1.3887924449690994,
"grad_norm": 1.776289701461792,
"learning_rate": 0.00016112214429553503,
"loss": 3.5329,
"step": 100000
},
{
"epoch": 1.3957364071939449,
"grad_norm": 1.566076636314392,
"learning_rate": 0.00016042774807305046,
"loss": 3.5146,
"step": 100500
},
{
"epoch": 1.4026803694187904,
"grad_norm": 1.8773123025894165,
"learning_rate": 0.00015973335185056592,
"loss": 3.5298,
"step": 101000
},
{
"epoch": 1.4096243316436359,
"grad_norm": 1.92935049533844,
"learning_rate": 0.00015903895562808138,
"loss": 3.5298,
"step": 101500
},
{
"epoch": 1.4165682938684814,
"grad_norm": 1.807790994644165,
"learning_rate": 0.0001583445594055968,
"loss": 3.5586,
"step": 102000
},
{
"epoch": 1.4235122560933269,
"grad_norm": 1.67229425907135,
"learning_rate": 0.00015765016318311227,
"loss": 3.5202,
"step": 102500
},
{
"epoch": 1.4304562183181724,
"grad_norm": 1.2355769872665405,
"learning_rate": 0.00015695576696062773,
"loss": 3.5161,
"step": 103000
},
{
"epoch": 1.4374001805430179,
"grad_norm": 1.7655647993087769,
"learning_rate": 0.00015626137073814316,
"loss": 3.5094,
"step": 103500
},
{
"epoch": 1.4443441427678634,
"grad_norm": 1.4021390676498413,
"learning_rate": 0.00015556697451565862,
"loss": 3.5533,
"step": 104000
},
{
"epoch": 1.4512881049927089,
"grad_norm": 1.7360609769821167,
"learning_rate": 0.00015487257829317408,
"loss": 3.4831,
"step": 104500
},
{
"epoch": 1.4582320672175544,
"grad_norm": 1.5841504335403442,
"learning_rate": 0.00015417818207068954,
"loss": 3.5008,
"step": 105000
},
{
"epoch": 1.4651760294423999,
"grad_norm": 1.7698231935501099,
"learning_rate": 0.00015348378584820497,
"loss": 3.4889,
"step": 105500
},
{
"epoch": 1.4721199916672454,
"grad_norm": 1.3631160259246826,
"learning_rate": 0.00015278938962572043,
"loss": 3.5313,
"step": 106000
},
{
"epoch": 1.4790639538920909,
"grad_norm": 1.3617082834243774,
"learning_rate": 0.0001520949934032359,
"loss": 3.5399,
"step": 106500
},
{
"epoch": 1.4860079161169364,
"grad_norm": 1.367946743965149,
"learning_rate": 0.00015140059718075132,
"loss": 3.545,
"step": 107000
},
{
"epoch": 1.492951878341782,
"grad_norm": 1.3500925302505493,
"learning_rate": 0.00015070620095826678,
"loss": 3.5544,
"step": 107500
},
{
"epoch": 1.4998958405666274,
"grad_norm": 3.89847731590271,
"learning_rate": 0.00015001180473578224,
"loss": 3.4885,
"step": 108000
},
{
"epoch": 1.5068398027914727,
"grad_norm": 2.2299306392669678,
"learning_rate": 0.00014931740851329767,
"loss": 3.4893,
"step": 108500
},
{
"epoch": 1.5137837650163184,
"grad_norm": 2.350405693054199,
"learning_rate": 0.00014862301229081313,
"loss": 3.4992,
"step": 109000
},
{
"epoch": 1.5207277272411637,
"grad_norm": 1.3148006200790405,
"learning_rate": 0.00014792861606832856,
"loss": 3.5146,
"step": 109500
},
{
"epoch": 1.5276716894660094,
"grad_norm": 1.533084750175476,
"learning_rate": 0.00014723421984584402,
"loss": 3.5149,
"step": 110000
},
{
"epoch": 1.5346156516908547,
"grad_norm": 1.3361338376998901,
"learning_rate": 0.00014653982362335948,
"loss": 3.4956,
"step": 110500
},
{
"epoch": 1.5415596139157004,
"grad_norm": 1.581416130065918,
"learning_rate": 0.00014584542740087492,
"loss": 3.5074,
"step": 111000
},
{
"epoch": 1.5485035761405457,
"grad_norm": 1.6259821653366089,
"learning_rate": 0.00014515103117839037,
"loss": 3.4961,
"step": 111500
},
{
"epoch": 1.5554475383653914,
"grad_norm": 1.981719732284546,
"learning_rate": 0.00014445663495590583,
"loss": 3.5192,
"step": 112000
},
{
"epoch": 1.5623915005902367,
"grad_norm": 1.2760684490203857,
"learning_rate": 0.00014376223873342127,
"loss": 3.5265,
"step": 112500
},
{
"epoch": 1.5693354628150824,
"grad_norm": 7.409369468688965,
"learning_rate": 0.00014306784251093673,
"loss": 3.5335,
"step": 113000
},
{
"epoch": 1.5762794250399277,
"grad_norm": 2.6644575595855713,
"learning_rate": 0.00014237344628845218,
"loss": 3.4924,
"step": 113500
},
{
"epoch": 1.5832233872647734,
"grad_norm": 1.3111194372177124,
"learning_rate": 0.00014167905006596764,
"loss": 3.5339,
"step": 114000
},
{
"epoch": 1.5901673494896187,
"grad_norm": 4.329043388366699,
"learning_rate": 0.00014098465384348308,
"loss": 3.4945,
"step": 114500
},
{
"epoch": 1.5971113117144644,
"grad_norm": 1.5919106006622314,
"learning_rate": 0.00014029025762099854,
"loss": 3.5081,
"step": 115000
},
{
"epoch": 1.6040552739393097,
"grad_norm": 1.7565652132034302,
"learning_rate": 0.000139595861398514,
"loss": 3.5337,
"step": 115500
},
{
"epoch": 1.6109992361641554,
"grad_norm": 1.8960776329040527,
"learning_rate": 0.00013890146517602943,
"loss": 3.4879,
"step": 116000
},
{
"epoch": 1.6179431983890007,
"grad_norm": 1.8651204109191895,
"learning_rate": 0.0001382070689535449,
"loss": 3.483,
"step": 116500
},
{
"epoch": 1.6248871606138464,
"grad_norm": 2.5513360500335693,
"learning_rate": 0.00013751267273106035,
"loss": 3.4763,
"step": 117000
},
{
"epoch": 1.6318311228386917,
"grad_norm": 1.5704069137573242,
"learning_rate": 0.00013681827650857578,
"loss": 3.4932,
"step": 117500
},
{
"epoch": 1.6387750850635374,
"grad_norm": 1.619181513786316,
"learning_rate": 0.00013612388028609124,
"loss": 3.5177,
"step": 118000
},
{
"epoch": 1.6457190472883827,
"grad_norm": 1.3884799480438232,
"learning_rate": 0.0001354294840636067,
"loss": 3.5247,
"step": 118500
},
{
"epoch": 1.6526630095132282,
"grad_norm": 1.5763874053955078,
"learning_rate": 0.00013473508784112213,
"loss": 3.4879,
"step": 119000
},
{
"epoch": 1.6596069717380737,
"grad_norm": 1.2959508895874023,
"learning_rate": 0.00013404069161863756,
"loss": 3.4924,
"step": 119500
},
{
"epoch": 1.6665509339629192,
"grad_norm": 3.481456756591797,
"learning_rate": 0.00013334629539615305,
"loss": 3.4886,
"step": 120000
},
{
"epoch": 1.6734948961877647,
"grad_norm": 6.646812438964844,
"learning_rate": 0.00013265189917366848,
"loss": 3.5136,
"step": 120500
},
{
"epoch": 1.6804388584126102,
"grad_norm": 1.200080394744873,
"learning_rate": 0.00013195750295118394,
"loss": 3.5017,
"step": 121000
},
{
"epoch": 1.6873828206374557,
"grad_norm": 1.4992713928222656,
"learning_rate": 0.0001312631067286994,
"loss": 3.4533,
"step": 121500
},
{
"epoch": 1.6943267828623012,
"grad_norm": 2.5522916316986084,
"learning_rate": 0.00013056871050621483,
"loss": 3.48,
"step": 122000
},
{
"epoch": 1.7012707450871467,
"grad_norm": 1.5243773460388184,
"learning_rate": 0.0001298743142837303,
"loss": 3.5058,
"step": 122500
},
{
"epoch": 1.7082147073119922,
"grad_norm": 1.4201898574829102,
"learning_rate": 0.00012917991806124572,
"loss": 3.4667,
"step": 123000
},
{
"epoch": 1.7151586695368377,
"grad_norm": 1.7786469459533691,
"learning_rate": 0.00012848552183876118,
"loss": 3.4931,
"step": 123500
},
{
"epoch": 1.7221026317616832,
"grad_norm": 3.3978912830352783,
"learning_rate": 0.00012779112561627664,
"loss": 3.4855,
"step": 124000
},
{
"epoch": 1.7290465939865287,
"grad_norm": 4.56933069229126,
"learning_rate": 0.00012709672939379207,
"loss": 3.4691,
"step": 124500
},
{
"epoch": 1.7359905562113742,
"grad_norm": 2.483752489089966,
"learning_rate": 0.00012640233317130753,
"loss": 3.4761,
"step": 125000
},
{
"epoch": 1.7429345184362197,
"grad_norm": 1.5909661054611206,
"learning_rate": 0.000125707936948823,
"loss": 3.5052,
"step": 125500
},
{
"epoch": 1.7498784806610652,
"grad_norm": 1.670730471611023,
"learning_rate": 0.00012501354072633842,
"loss": 3.4758,
"step": 126000
},
{
"epoch": 1.7568224428859107,
"grad_norm": 1.2424238920211792,
"learning_rate": 0.00012431914450385388,
"loss": 3.4405,
"step": 126500
},
{
"epoch": 1.7637664051107562,
"grad_norm": 1.6950280666351318,
"learning_rate": 0.00012362474828136934,
"loss": 3.5296,
"step": 127000
},
{
"epoch": 1.7707103673356017,
"grad_norm": 2.1729133129119873,
"learning_rate": 0.00012293035205888477,
"loss": 3.4733,
"step": 127500
},
{
"epoch": 1.7776543295604472,
"grad_norm": 1.6061240434646606,
"learning_rate": 0.00012223595583640023,
"loss": 3.4562,
"step": 128000
},
{
"epoch": 1.7845982917852927,
"grad_norm": 2.095271587371826,
"learning_rate": 0.0001215415596139157,
"loss": 3.4618,
"step": 128500
},
{
"epoch": 1.7915422540101382,
"grad_norm": 2.206932306289673,
"learning_rate": 0.00012084716339143114,
"loss": 3.4577,
"step": 129000
},
{
"epoch": 1.7984862162349837,
"grad_norm": 1.7425895929336548,
"learning_rate": 0.0001201527671689466,
"loss": 3.4979,
"step": 129500
},
{
"epoch": 1.8054301784598292,
"grad_norm": 2.1199731826782227,
"learning_rate": 0.00011945837094646204,
"loss": 3.4951,
"step": 130000
},
{
"epoch": 1.8123741406846747,
"grad_norm": 1.4702428579330444,
"learning_rate": 0.00011876397472397749,
"loss": 3.4778,
"step": 130500
},
{
"epoch": 1.81931810290952,
"grad_norm": 1.5938681364059448,
"learning_rate": 0.00011806957850149295,
"loss": 3.4782,
"step": 131000
},
{
"epoch": 1.8262620651343657,
"grad_norm": 1.5015869140625,
"learning_rate": 0.0001173751822790084,
"loss": 3.4367,
"step": 131500
},
{
"epoch": 1.833206027359211,
"grad_norm": 1.8470075130462646,
"learning_rate": 0.00011668078605652385,
"loss": 3.4463,
"step": 132000
},
{
"epoch": 1.8401499895840567,
"grad_norm": 2.0054242610931396,
"learning_rate": 0.0001159863898340393,
"loss": 3.4445,
"step": 132500
},
{
"epoch": 1.847093951808902,
"grad_norm": 1.4376716613769531,
"learning_rate": 0.00011529199361155473,
"loss": 3.4449,
"step": 133000
},
{
"epoch": 1.8540379140337477,
"grad_norm": 2.702432870864868,
"learning_rate": 0.0001145975973890702,
"loss": 3.4499,
"step": 133500
},
{
"epoch": 1.860981876258593,
"grad_norm": 1.6058772802352905,
"learning_rate": 0.00011390320116658565,
"loss": 3.4567,
"step": 134000
},
{
"epoch": 1.8679258384834387,
"grad_norm": 3.2056682109832764,
"learning_rate": 0.00011320880494410108,
"loss": 3.4845,
"step": 134500
},
{
"epoch": 1.874869800708284,
"grad_norm": 1.6341466903686523,
"learning_rate": 0.00011251440872161656,
"loss": 3.4699,
"step": 135000
},
{
"epoch": 1.8818137629331297,
"grad_norm": 1.4821678400039673,
"learning_rate": 0.00011182001249913199,
"loss": 3.453,
"step": 135500
},
{
"epoch": 1.888757725157975,
"grad_norm": 2.204435110092163,
"learning_rate": 0.00011112561627664743,
"loss": 3.3845,
"step": 136000
},
{
"epoch": 1.8957016873828207,
"grad_norm": 2.4527945518493652,
"learning_rate": 0.0001104312200541629,
"loss": 3.4595,
"step": 136500
},
{
"epoch": 1.902645649607666,
"grad_norm": 1.7573269605636597,
"learning_rate": 0.00010973682383167834,
"loss": 3.4554,
"step": 137000
},
{
"epoch": 1.9095896118325117,
"grad_norm": 2.0512332916259766,
"learning_rate": 0.00010904242760919379,
"loss": 3.4537,
"step": 137500
},
{
"epoch": 1.916533574057357,
"grad_norm": 2.056835174560547,
"learning_rate": 0.00010834803138670924,
"loss": 3.441,
"step": 138000
},
{
"epoch": 1.9234775362822027,
"grad_norm": 4.340169906616211,
"learning_rate": 0.00010765363516422469,
"loss": 3.4806,
"step": 138500
},
{
"epoch": 1.930421498507048,
"grad_norm": 2.0016748905181885,
"learning_rate": 0.00010695923894174015,
"loss": 3.4363,
"step": 139000
},
{
"epoch": 1.9373654607318938,
"grad_norm": 1.8290444612503052,
"learning_rate": 0.0001062648427192556,
"loss": 3.4428,
"step": 139500
},
{
"epoch": 1.944309422956739,
"grad_norm": 1.6090489625930786,
"learning_rate": 0.00010557044649677104,
"loss": 3.4514,
"step": 140000
},
{
"epoch": 1.9512533851815848,
"grad_norm": 1.5943214893341064,
"learning_rate": 0.0001048760502742865,
"loss": 3.425,
"step": 140500
},
{
"epoch": 1.95819734740643,
"grad_norm": 1.9066240787506104,
"learning_rate": 0.00010418165405180195,
"loss": 3.4637,
"step": 141000
},
{
"epoch": 1.9651413096312758,
"grad_norm": 1.718125820159912,
"learning_rate": 0.00010348725782931739,
"loss": 3.4784,
"step": 141500
},
{
"epoch": 1.972085271856121,
"grad_norm": 1.8587770462036133,
"learning_rate": 0.00010279286160683285,
"loss": 3.4058,
"step": 142000
},
{
"epoch": 1.9790292340809668,
"grad_norm": 2.709913492202759,
"learning_rate": 0.0001020984653843483,
"loss": 3.4442,
"step": 142500
},
{
"epoch": 1.985973196305812,
"grad_norm": 1.3006025552749634,
"learning_rate": 0.00010140406916186374,
"loss": 3.394,
"step": 143000
},
{
"epoch": 1.9929171585306575,
"grad_norm": 1.3316704034805298,
"learning_rate": 0.0001007096729393792,
"loss": 3.4307,
"step": 143500
},
{
"epoch": 1.999861120755503,
"grad_norm": 2.0763180255889893,
"learning_rate": 0.00010001527671689465,
"loss": 3.4408,
"step": 144000
},
{
"epoch": 2.0,
"eval_loss": 3.239363193511963,
"eval_rouge1": 0.055521599130116214,
"eval_rouge2": 0.013459250920580113,
"eval_rougeL": 0.054963256550744514,
"eval_rougeLsum": 0.05513512231622069,
"eval_runtime": 3954.943,
"eval_samples_per_second": 4.046,
"eval_steps_per_second": 2.023,
"step": 144010
},
{
"epoch": 2.0068050829803488,
"grad_norm": 2.819636106491089,
"learning_rate": 9.932088049441011e-05,
"loss": 3.3738,
"step": 144500
},
{
"epoch": 2.013749045205194,
"grad_norm": 1.5194923877716064,
"learning_rate": 9.862648427192555e-05,
"loss": 3.3729,
"step": 145000
},
{
"epoch": 2.0206930074300398,
"grad_norm": 1.9506241083145142,
"learning_rate": 9.7932088049441e-05,
"loss": 3.3605,
"step": 145500
},
{
"epoch": 2.027636969654885,
"grad_norm": 2.37030029296875,
"learning_rate": 9.723769182695646e-05,
"loss": 3.354,
"step": 146000
},
{
"epoch": 2.0345809318797308,
"grad_norm": 1.798161506652832,
"learning_rate": 9.65432956044719e-05,
"loss": 3.3393,
"step": 146500
},
{
"epoch": 2.041524894104576,
"grad_norm": 1.7773220539093018,
"learning_rate": 9.584889938198735e-05,
"loss": 3.3564,
"step": 147000
},
{
"epoch": 2.0484688563294218,
"grad_norm": 1.383130669593811,
"learning_rate": 9.515450315950281e-05,
"loss": 3.3493,
"step": 147500
},
{
"epoch": 2.055412818554267,
"grad_norm": 1.8702272176742554,
"learning_rate": 9.446010693701825e-05,
"loss": 3.3706,
"step": 148000
},
{
"epoch": 2.0623567807791128,
"grad_norm": 2.419377326965332,
"learning_rate": 9.37657107145337e-05,
"loss": 3.369,
"step": 148500
},
{
"epoch": 2.069300743003958,
"grad_norm": 1.842032551765442,
"learning_rate": 9.307131449204916e-05,
"loss": 3.3367,
"step": 149000
},
{
"epoch": 2.0762447052288033,
"grad_norm": 1.890652060508728,
"learning_rate": 9.23769182695646e-05,
"loss": 3.3245,
"step": 149500
},
{
"epoch": 2.083188667453649,
"grad_norm": 1.6845180988311768,
"learning_rate": 9.168252204708005e-05,
"loss": 3.3037,
"step": 150000
},
{
"epoch": 2.0901326296784943,
"grad_norm": 1.8656178712844849,
"learning_rate": 9.098812582459551e-05,
"loss": 3.3236,
"step": 150500
},
{
"epoch": 2.09707659190334,
"grad_norm": 1.933976173400879,
"learning_rate": 9.029372960211096e-05,
"loss": 3.3425,
"step": 151000
},
{
"epoch": 2.1040205541281853,
"grad_norm": 1.2388135194778442,
"learning_rate": 8.959933337962642e-05,
"loss": 3.3654,
"step": 151500
},
{
"epoch": 2.110964516353031,
"grad_norm": 2.0735113620758057,
"learning_rate": 8.890493715714186e-05,
"loss": 3.329,
"step": 152000
},
{
"epoch": 2.1179084785778763,
"grad_norm": 1.460747241973877,
"learning_rate": 8.821054093465731e-05,
"loss": 3.3325,
"step": 152500
},
{
"epoch": 2.124852440802722,
"grad_norm": 1.381603479385376,
"learning_rate": 8.751614471217277e-05,
"loss": 3.3549,
"step": 153000
},
{
"epoch": 2.1317964030275673,
"grad_norm": 2.0302138328552246,
"learning_rate": 8.682174848968821e-05,
"loss": 3.3099,
"step": 153500
},
{
"epoch": 2.138740365252413,
"grad_norm": 1.4594683647155762,
"learning_rate": 8.612735226720366e-05,
"loss": 3.3213,
"step": 154000
},
{
"epoch": 2.1456843274772583,
"grad_norm": 1.5398012399673462,
"learning_rate": 8.543295604471912e-05,
"loss": 3.3918,
"step": 154500
},
{
"epoch": 2.152628289702104,
"grad_norm": 1.6090102195739746,
"learning_rate": 8.473855982223456e-05,
"loss": 3.3241,
"step": 155000
},
{
"epoch": 2.1595722519269493,
"grad_norm": 1.5653716325759888,
"learning_rate": 8.404416359975001e-05,
"loss": 3.3673,
"step": 155500
},
{
"epoch": 2.166516214151795,
"grad_norm": 1.3755892515182495,
"learning_rate": 8.334976737726547e-05,
"loss": 3.3085,
"step": 156000
},
{
"epoch": 2.1734601763766404,
"grad_norm": 2.4337687492370605,
"learning_rate": 8.265537115478091e-05,
"loss": 3.3441,
"step": 156500
},
{
"epoch": 2.180404138601486,
"grad_norm": 1.5850881338119507,
"learning_rate": 8.196097493229637e-05,
"loss": 3.3193,
"step": 157000
},
{
"epoch": 2.1873481008263314,
"grad_norm": 2.200465679168701,
"learning_rate": 8.126657870981182e-05,
"loss": 3.3384,
"step": 157500
},
{
"epoch": 2.194292063051177,
"grad_norm": 2.115725517272949,
"learning_rate": 8.057218248732725e-05,
"loss": 3.3447,
"step": 158000
},
{
"epoch": 2.2012360252760224,
"grad_norm": 1.528279423713684,
"learning_rate": 7.987778626484272e-05,
"loss": 3.3155,
"step": 158500
},
{
"epoch": 2.208179987500868,
"grad_norm": 1.796518087387085,
"learning_rate": 7.918339004235816e-05,
"loss": 3.3501,
"step": 159000
},
{
"epoch": 2.2151239497257134,
"grad_norm": 2.301734685897827,
"learning_rate": 7.84889938198736e-05,
"loss": 3.3132,
"step": 159500
},
{
"epoch": 2.222067911950559,
"grad_norm": 1.7091695070266724,
"learning_rate": 7.779459759738908e-05,
"loss": 3.3373,
"step": 160000
},
{
"epoch": 2.2290118741754044,
"grad_norm": 1.247341275215149,
"learning_rate": 7.710020137490451e-05,
"loss": 3.3235,
"step": 160500
},
{
"epoch": 2.23595583640025,
"grad_norm": 3.0286383628845215,
"learning_rate": 7.640580515241995e-05,
"loss": 3.3162,
"step": 161000
},
{
"epoch": 2.2428997986250954,
"grad_norm": 1.669455885887146,
"learning_rate": 7.571140892993541e-05,
"loss": 3.2822,
"step": 161500
},
{
"epoch": 2.249843760849941,
"grad_norm": 1.5718942880630493,
"learning_rate": 7.501701270745086e-05,
"loss": 3.3213,
"step": 162000
},
{
"epoch": 2.2567877230747864,
"grad_norm": 1.445328950881958,
"learning_rate": 7.432261648496632e-05,
"loss": 3.3117,
"step": 162500
},
{
"epoch": 2.263731685299632,
"grad_norm": 2.205052614212036,
"learning_rate": 7.362822026248176e-05,
"loss": 3.3613,
"step": 163000
},
{
"epoch": 2.2706756475244774,
"grad_norm": 2.702474594116211,
"learning_rate": 7.293382403999721e-05,
"loss": 3.3368,
"step": 163500
},
{
"epoch": 2.277619609749323,
"grad_norm": 2.0145928859710693,
"learning_rate": 7.223942781751267e-05,
"loss": 3.3162,
"step": 164000
},
{
"epoch": 2.2845635719741684,
"grad_norm": 1.5684378147125244,
"learning_rate": 7.154503159502811e-05,
"loss": 3.3532,
"step": 164500
},
{
"epoch": 2.291507534199014,
"grad_norm": 1.6014593839645386,
"learning_rate": 7.085063537254357e-05,
"loss": 3.3014,
"step": 165000
},
{
"epoch": 2.2984514964238594,
"grad_norm": 1.8588491678237915,
"learning_rate": 7.015623915005902e-05,
"loss": 3.353,
"step": 165500
},
{
"epoch": 2.305395458648705,
"grad_norm": 1.7128487825393677,
"learning_rate": 6.946184292757447e-05,
"loss": 3.31,
"step": 166000
},
{
"epoch": 2.3123394208735504,
"grad_norm": 1.5588475465774536,
"learning_rate": 6.876744670508992e-05,
"loss": 3.3193,
"step": 166500
},
{
"epoch": 2.319283383098396,
"grad_norm": 1.8460384607315063,
"learning_rate": 6.807305048260537e-05,
"loss": 3.3158,
"step": 167000
},
{
"epoch": 2.3262273453232414,
"grad_norm": 1.634889006614685,
"learning_rate": 6.737865426012082e-05,
"loss": 3.3089,
"step": 167500
},
{
"epoch": 2.333171307548087,
"grad_norm": 2.4914541244506836,
"learning_rate": 6.668425803763628e-05,
"loss": 3.328,
"step": 168000
},
{
"epoch": 2.3401152697729324,
"grad_norm": 1.2565484046936035,
"learning_rate": 6.598986181515172e-05,
"loss": 3.2968,
"step": 168500
},
{
"epoch": 2.347059231997778,
"grad_norm": 2.460926055908203,
"learning_rate": 6.529546559266717e-05,
"loss": 3.353,
"step": 169000
},
{
"epoch": 2.3540031942226234,
"grad_norm": 2.0668790340423584,
"learning_rate": 6.460106937018263e-05,
"loss": 3.3118,
"step": 169500
},
{
"epoch": 2.360947156447469,
"grad_norm": 2.0417802333831787,
"learning_rate": 6.390667314769807e-05,
"loss": 3.3098,
"step": 170000
},
{
"epoch": 2.3678911186723144,
"grad_norm": 4.113938808441162,
"learning_rate": 6.321227692521352e-05,
"loss": 3.3298,
"step": 170500
},
{
"epoch": 2.37483508089716,
"grad_norm": 1.9370335340499878,
"learning_rate": 6.251788070272896e-05,
"loss": 3.3009,
"step": 171000
},
{
"epoch": 2.3817790431220054,
"grad_norm": 2.2328431606292725,
"learning_rate": 6.182348448024442e-05,
"loss": 3.3067,
"step": 171500
},
{
"epoch": 2.3887230053468507,
"grad_norm": 1.4481734037399292,
"learning_rate": 6.112908825775988e-05,
"loss": 3.3408,
"step": 172000
},
{
"epoch": 2.3956669675716964,
"grad_norm": 1.1176230907440186,
"learning_rate": 6.043469203527532e-05,
"loss": 3.3586,
"step": 172500
},
{
"epoch": 2.402610929796542,
"grad_norm": 2.566291332244873,
"learning_rate": 5.9740295812790774e-05,
"loss": 3.2948,
"step": 173000
},
{
"epoch": 2.4095548920213874,
"grad_norm": 1.9052931070327759,
"learning_rate": 5.9045899590306226e-05,
"loss": 3.3025,
"step": 173500
},
{
"epoch": 2.4164988542462327,
"grad_norm": 1.9683756828308105,
"learning_rate": 5.835150336782167e-05,
"loss": 3.3348,
"step": 174000
},
{
"epoch": 2.4234428164710784,
"grad_norm": 3.174572706222534,
"learning_rate": 5.7657107145337125e-05,
"loss": 3.329,
"step": 174500
},
{
"epoch": 2.430386778695924,
"grad_norm": 2.3149008750915527,
"learning_rate": 5.696271092285258e-05,
"loss": 3.2889,
"step": 175000
},
{
"epoch": 2.4373307409207694,
"grad_norm": 1.6363497972488403,
"learning_rate": 5.626831470036803e-05,
"loss": 3.3258,
"step": 175500
},
{
"epoch": 2.4442747031456147,
"grad_norm": 1.2083947658538818,
"learning_rate": 5.5573918477883476e-05,
"loss": 3.3456,
"step": 176000
},
{
"epoch": 2.4512186653704604,
"grad_norm": 2.0313804149627686,
"learning_rate": 5.487952225539893e-05,
"loss": 3.3154,
"step": 176500
},
{
"epoch": 2.458162627595306,
"grad_norm": 1.4387134313583374,
"learning_rate": 5.418512603291438e-05,
"loss": 3.303,
"step": 177000
},
{
"epoch": 2.4651065898201514,
"grad_norm": 1.9549680948257446,
"learning_rate": 5.349072981042983e-05,
"loss": 3.309,
"step": 177500
},
{
"epoch": 2.4720505520449967,
"grad_norm": 1.7797924280166626,
"learning_rate": 5.279633358794527e-05,
"loss": 3.3338,
"step": 178000
},
{
"epoch": 2.4789945142698424,
"grad_norm": 1.5036529302597046,
"learning_rate": 5.2101937365460725e-05,
"loss": 3.3122,
"step": 178500
},
{
"epoch": 2.4859384764946877,
"grad_norm": 2.212462902069092,
"learning_rate": 5.140754114297618e-05,
"loss": 3.2841,
"step": 179000
},
{
"epoch": 2.4928824387195334,
"grad_norm": 3.3562114238739014,
"learning_rate": 5.071314492049162e-05,
"loss": 3.3017,
"step": 179500
},
{
"epoch": 2.4998264009443787,
"grad_norm": 1.547029733657837,
"learning_rate": 5.0018748698007076e-05,
"loss": 3.3182,
"step": 180000
},
{
"epoch": 2.5067703631692244,
"grad_norm": 1.6302788257598877,
"learning_rate": 4.932435247552253e-05,
"loss": 3.2912,
"step": 180500
},
{
"epoch": 2.5137143253940697,
"grad_norm": 2.3086509704589844,
"learning_rate": 4.862995625303798e-05,
"loss": 3.2617,
"step": 181000
},
{
"epoch": 2.5206582876189154,
"grad_norm": 1.8361918926239014,
"learning_rate": 4.7935560030553427e-05,
"loss": 3.3025,
"step": 181500
},
{
"epoch": 2.5276022498437607,
"grad_norm": 2.591750144958496,
"learning_rate": 4.724116380806888e-05,
"loss": 3.3148,
"step": 182000
},
{
"epoch": 2.5345462120686064,
"grad_norm": 1.2800657749176025,
"learning_rate": 4.654676758558433e-05,
"loss": 3.3176,
"step": 182500
},
{
"epoch": 2.5414901742934517,
"grad_norm": 1.7381142377853394,
"learning_rate": 4.585237136309978e-05,
"loss": 3.2937,
"step": 183000
},
{
"epoch": 2.5484341365182974,
"grad_norm": 1.6898601055145264,
"learning_rate": 4.515797514061523e-05,
"loss": 3.2955,
"step": 183500
},
{
"epoch": 2.5553780987431427,
"grad_norm": 1.9518952369689941,
"learning_rate": 4.446357891813068e-05,
"loss": 3.3093,
"step": 184000
},
{
"epoch": 2.5623220609679884,
"grad_norm": 1.4132803678512573,
"learning_rate": 4.3769182695646135e-05,
"loss": 3.3142,
"step": 184500
},
{
"epoch": 2.5692660231928337,
"grad_norm": 1.304002046585083,
"learning_rate": 4.307478647316158e-05,
"loss": 3.2875,
"step": 185000
},
{
"epoch": 2.5762099854176794,
"grad_norm": 3.1700334548950195,
"learning_rate": 4.2380390250677033e-05,
"loss": 3.3445,
"step": 185500
},
{
"epoch": 2.5831539476425247,
"grad_norm": 1.5128083229064941,
"learning_rate": 4.1685994028192486e-05,
"loss": 3.3174,
"step": 186000
},
{
"epoch": 2.5900979098673704,
"grad_norm": 2.8518435955047607,
"learning_rate": 4.099159780570794e-05,
"loss": 3.2766,
"step": 186500
},
{
"epoch": 2.5970418720922157,
"grad_norm": 1.2636958360671997,
"learning_rate": 4.0297201583223384e-05,
"loss": 3.3032,
"step": 187000
},
{
"epoch": 2.6039858343170614,
"grad_norm": 2.1779420375823975,
"learning_rate": 3.960280536073884e-05,
"loss": 3.3202,
"step": 187500
},
{
"epoch": 2.6109297965419067,
"grad_norm": 1.8627387285232544,
"learning_rate": 3.890840913825429e-05,
"loss": 3.2831,
"step": 188000
},
{
"epoch": 2.6178737587667524,
"grad_norm": 2.000037670135498,
"learning_rate": 3.821401291576973e-05,
"loss": 3.3431,
"step": 188500
},
{
"epoch": 2.6248177209915977,
"grad_norm": 1.3186124563217163,
"learning_rate": 3.751961669328519e-05,
"loss": 3.3161,
"step": 189000
},
{
"epoch": 2.6317616832164434,
"grad_norm": 1.7129555940628052,
"learning_rate": 3.682522047080064e-05,
"loss": 3.2901,
"step": 189500
},
{
"epoch": 2.6387056454412887,
"grad_norm": 1.727120041847229,
"learning_rate": 3.6130824248316086e-05,
"loss": 3.3028,
"step": 190000
},
{
"epoch": 2.6456496076661344,
"grad_norm": 2.223973512649536,
"learning_rate": 3.543642802583154e-05,
"loss": 3.3096,
"step": 190500
},
{
"epoch": 2.6525935698909797,
"grad_norm": 1.908118486404419,
"learning_rate": 3.4742031803346984e-05,
"loss": 3.2971,
"step": 191000
},
{
"epoch": 2.6595375321158254,
"grad_norm": 1.4966055154800415,
"learning_rate": 3.404763558086244e-05,
"loss": 3.3041,
"step": 191500
},
{
"epoch": 2.6664814943406707,
"grad_norm": 1.5852959156036377,
"learning_rate": 3.335323935837789e-05,
"loss": 3.3396,
"step": 192000
},
{
"epoch": 2.6734254565655164,
"grad_norm": 1.956778883934021,
"learning_rate": 3.2658843135893335e-05,
"loss": 3.3153,
"step": 192500
},
{
"epoch": 2.6803694187903617,
"grad_norm": 1.6665066480636597,
"learning_rate": 3.196444691340879e-05,
"loss": 3.3051,
"step": 193000
},
{
"epoch": 2.6873133810152074,
"grad_norm": 1.5020607709884644,
"learning_rate": 3.127005069092424e-05,
"loss": 3.2956,
"step": 193500
},
{
"epoch": 2.6942573432400527,
"grad_norm": 1.861676573753357,
"learning_rate": 3.057565446843969e-05,
"loss": 3.3266,
"step": 194000
},
{
"epoch": 2.701201305464898,
"grad_norm": 1.6980831623077393,
"learning_rate": 2.988125824595514e-05,
"loss": 3.2896,
"step": 194500
},
{
"epoch": 2.7081452676897437,
"grad_norm": 3.289989709854126,
"learning_rate": 2.9186862023470588e-05,
"loss": 3.2898,
"step": 195000
},
{
"epoch": 2.7150892299145895,
"grad_norm": 1.6162209510803223,
"learning_rate": 2.849246580098604e-05,
"loss": 3.2698,
"step": 195500
},
{
"epoch": 2.7220331921394347,
"grad_norm": 4.295835018157959,
"learning_rate": 2.779806957850149e-05,
"loss": 3.2747,
"step": 196000
},
{
"epoch": 2.72897715436428,
"grad_norm": 2.021383762359619,
"learning_rate": 2.7103673356016942e-05,
"loss": 3.2866,
"step": 196500
},
{
"epoch": 2.7359211165891257,
"grad_norm": 5.153833389282227,
"learning_rate": 2.640927713353239e-05,
"loss": 3.3057,
"step": 197000
},
{
"epoch": 2.7428650788139715,
"grad_norm": 1.4507516622543335,
"learning_rate": 2.5714880911047844e-05,
"loss": 3.2617,
"step": 197500
},
{
"epoch": 2.7498090410388167,
"grad_norm": 1.0930436849594116,
"learning_rate": 2.5020484688563293e-05,
"loss": 3.3113,
"step": 198000
},
{
"epoch": 2.756753003263662,
"grad_norm": 2.5461559295654297,
"learning_rate": 2.4326088466078745e-05,
"loss": 3.3073,
"step": 198500
},
{
"epoch": 2.7636969654885077,
"grad_norm": 1.3845510482788086,
"learning_rate": 2.363169224359419e-05,
"loss": 3.2805,
"step": 199000
},
{
"epoch": 2.7706409277133535,
"grad_norm": 1.5489321947097778,
"learning_rate": 2.293729602110964e-05,
"loss": 3.2863,
"step": 199500
},
{
"epoch": 2.7775848899381987,
"grad_norm": 1.2201488018035889,
"learning_rate": 2.2242899798625093e-05,
"loss": 3.2959,
"step": 200000
},
{
"epoch": 2.784528852163044,
"grad_norm": 1.8551557064056396,
"learning_rate": 2.1548503576140542e-05,
"loss": 3.3299,
"step": 200500
},
{
"epoch": 2.7914728143878897,
"grad_norm": 2.28908371925354,
"learning_rate": 2.0854107353655995e-05,
"loss": 3.2962,
"step": 201000
},
{
"epoch": 2.7984167766127355,
"grad_norm": 2.0773096084594727,
"learning_rate": 2.0159711131171444e-05,
"loss": 3.2798,
"step": 201500
},
{
"epoch": 2.8053607388375807,
"grad_norm": 2.611323833465576,
"learning_rate": 1.9465314908686896e-05,
"loss": 3.3228,
"step": 202000
},
{
"epoch": 2.812304701062426,
"grad_norm": 2.0584192276000977,
"learning_rate": 1.8770918686202346e-05,
"loss": 3.3112,
"step": 202500
},
{
"epoch": 2.8192486632872718,
"grad_norm": 3.291172981262207,
"learning_rate": 1.8076522463717795e-05,
"loss": 3.3141,
"step": 203000
},
{
"epoch": 2.8261926255121175,
"grad_norm": 2.5255441665649414,
"learning_rate": 1.7382126241233247e-05,
"loss": 3.2951,
"step": 203500
},
{
"epoch": 2.8331365877369628,
"grad_norm": 1.3763819932937622,
"learning_rate": 1.6687730018748696e-05,
"loss": 3.294,
"step": 204000
},
{
"epoch": 2.840080549961808,
"grad_norm": 1.9026843309402466,
"learning_rate": 1.599333379626415e-05,
"loss": 3.3003,
"step": 204500
},
{
"epoch": 2.8470245121866538,
"grad_norm": 1.6121410131454468,
"learning_rate": 1.5298937573779598e-05,
"loss": 3.3096,
"step": 205000
},
{
"epoch": 2.8539684744114995,
"grad_norm": 2.3993430137634277,
"learning_rate": 1.4604541351295047e-05,
"loss": 3.3075,
"step": 205500
},
{
"epoch": 2.8609124366363448,
"grad_norm": 1.6766456365585327,
"learning_rate": 1.3910145128810498e-05,
"loss": 3.2873,
"step": 206000
},
{
"epoch": 2.86785639886119,
"grad_norm": 2.1749913692474365,
"learning_rate": 1.3215748906325947e-05,
"loss": 3.2432,
"step": 206500
},
{
"epoch": 2.8748003610860358,
"grad_norm": 1.5734447240829468,
"learning_rate": 1.2521352683841398e-05,
"loss": 3.2501,
"step": 207000
},
{
"epoch": 2.881744323310881,
"grad_norm": 1.3672767877578735,
"learning_rate": 1.1826956461356849e-05,
"loss": 3.292,
"step": 207500
},
{
"epoch": 2.8886882855357268,
"grad_norm": 3.4438602924346924,
"learning_rate": 1.11325602388723e-05,
"loss": 3.2755,
"step": 208000
},
{
"epoch": 2.895632247760572,
"grad_norm": 3.021101474761963,
"learning_rate": 1.043816401638775e-05,
"loss": 3.2599,
"step": 208500
},
{
"epoch": 2.9025762099854178,
"grad_norm": 1.2644829750061035,
"learning_rate": 9.743767793903202e-06,
"loss": 3.291,
"step": 209000
},
{
"epoch": 2.909520172210263,
"grad_norm": 1.7406469583511353,
"learning_rate": 9.04937157141865e-06,
"loss": 3.2871,
"step": 209500
},
{
"epoch": 2.9164641344351088,
"grad_norm": 1.8715460300445557,
"learning_rate": 8.354975348934102e-06,
"loss": 3.3359,
"step": 210000
},
{
"epoch": 2.923408096659954,
"grad_norm": 1.7464805841445923,
"learning_rate": 7.660579126449552e-06,
"loss": 3.263,
"step": 210500
},
{
"epoch": 2.9303520588847998,
"grad_norm": 1.6525601148605347,
"learning_rate": 6.966182903965002e-06,
"loss": 3.2654,
"step": 211000
},
{
"epoch": 2.937296021109645,
"grad_norm": 2.9503705501556396,
"learning_rate": 6.2717866814804524e-06,
"loss": 3.2797,
"step": 211500
},
{
"epoch": 2.9442399833344908,
"grad_norm": 1.702645182609558,
"learning_rate": 5.5773904589959024e-06,
"loss": 3.291,
"step": 212000
},
{
"epoch": 2.951183945559336,
"grad_norm": 1.7340339422225952,
"learning_rate": 4.882994236511353e-06,
"loss": 3.3051,
"step": 212500
},
{
"epoch": 2.9581279077841818,
"grad_norm": 1.9832649230957031,
"learning_rate": 4.188598014026803e-06,
"loss": 3.2941,
"step": 213000
},
{
"epoch": 2.965071870009027,
"grad_norm": 2.080734968185425,
"learning_rate": 3.4942017915422537e-06,
"loss": 3.3281,
"step": 213500
},
{
"epoch": 2.972015832233873,
"grad_norm": 1.7874020338058472,
"learning_rate": 2.799805569057704e-06,
"loss": 3.2567,
"step": 214000
},
{
"epoch": 2.978959794458718,
"grad_norm": 1.32713782787323,
"learning_rate": 2.1054093465731546e-06,
"loss": 3.3156,
"step": 214500
},
{
"epoch": 2.985903756683564,
"grad_norm": 2.1903092861175537,
"learning_rate": 1.4110131240886048e-06,
"loss": 3.2949,
"step": 215000
},
{
"epoch": 2.992847718908409,
"grad_norm": 1.8510949611663818,
"learning_rate": 7.166169016040551e-07,
"loss": 3.3084,
"step": 215500
},
{
"epoch": 2.999791681133255,
"grad_norm": 1.916053056716919,
"learning_rate": 2.222067911950559e-08,
"loss": 3.2611,
"step": 216000
}
],
"logging_steps": 500,
"max_steps": 216015,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3301724526811136e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}