PumeTu's picture
Add files using upload-large-folder tool
4d3ff7d verified
raw
history blame
52 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 10,
"global_step": 261,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011494252873563218,
"grad_norm": 0.45773613452911377,
"learning_rate": 0.0,
"loss": 1.065,
"step": 1
},
{
"epoch": 0.022988505747126436,
"grad_norm": 0.4720604717731476,
"learning_rate": 7.4074074074074075e-06,
"loss": 1.1025,
"step": 2
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.4526257812976837,
"learning_rate": 1.4814814814814815e-05,
"loss": 1.067,
"step": 3
},
{
"epoch": 0.04597701149425287,
"grad_norm": 0.44910383224487305,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.0694,
"step": 4
},
{
"epoch": 0.05747126436781609,
"grad_norm": 0.4383523762226105,
"learning_rate": 2.962962962962963e-05,
"loss": 1.0767,
"step": 5
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.4290314018726349,
"learning_rate": 3.7037037037037037e-05,
"loss": 1.0414,
"step": 6
},
{
"epoch": 0.08045977011494253,
"grad_norm": 0.4098808467388153,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.021,
"step": 7
},
{
"epoch": 0.09195402298850575,
"grad_norm": 0.4100661277770996,
"learning_rate": 5.185185185185185e-05,
"loss": 1.0113,
"step": 8
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.37694185972213745,
"learning_rate": 5.925925925925926e-05,
"loss": 0.9566,
"step": 9
},
{
"epoch": 0.11494252873563218,
"grad_norm": 0.3378658890724182,
"learning_rate": 6.666666666666667e-05,
"loss": 0.9539,
"step": 10
},
{
"epoch": 0.11494252873563218,
"eval_loss": 0.9960598945617676,
"eval_runtime": 605.6954,
"eval_samples_per_second": 16.342,
"eval_steps_per_second": 0.129,
"step": 10
},
{
"epoch": 0.12643678160919541,
"grad_norm": 0.29339659214019775,
"learning_rate": 7.407407407407407e-05,
"loss": 0.9163,
"step": 11
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.23120510578155518,
"learning_rate": 8.148148148148148e-05,
"loss": 0.8886,
"step": 12
},
{
"epoch": 0.14942528735632185,
"grad_norm": 0.18651129305362701,
"learning_rate": 8.888888888888889e-05,
"loss": 0.8748,
"step": 13
},
{
"epoch": 0.16091954022988506,
"grad_norm": 0.14475475251674652,
"learning_rate": 9.62962962962963e-05,
"loss": 0.8235,
"step": 14
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.11758769303560257,
"learning_rate": 0.0001037037037037037,
"loss": 0.8539,
"step": 15
},
{
"epoch": 0.1839080459770115,
"grad_norm": 0.10646044462919235,
"learning_rate": 0.00011111111111111112,
"loss": 0.8363,
"step": 16
},
{
"epoch": 0.19540229885057472,
"grad_norm": 0.10539838671684265,
"learning_rate": 0.00011851851851851852,
"loss": 0.819,
"step": 17
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.11887332051992416,
"learning_rate": 0.00012592592592592592,
"loss": 0.8069,
"step": 18
},
{
"epoch": 0.21839080459770116,
"grad_norm": 0.1281956136226654,
"learning_rate": 0.00013333333333333334,
"loss": 0.8042,
"step": 19
},
{
"epoch": 0.22988505747126436,
"grad_norm": 0.1338774859905243,
"learning_rate": 0.00014074074074074076,
"loss": 0.8283,
"step": 20
},
{
"epoch": 0.22988505747126436,
"eval_loss": 0.813089907169342,
"eval_runtime": 601.317,
"eval_samples_per_second": 16.461,
"eval_steps_per_second": 0.13,
"step": 20
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.12568038702011108,
"learning_rate": 0.00014814814814814815,
"loss": 0.8142,
"step": 21
},
{
"epoch": 0.25287356321839083,
"grad_norm": 0.11528006941080093,
"learning_rate": 0.00015555555555555556,
"loss": 0.794,
"step": 22
},
{
"epoch": 0.26436781609195403,
"grad_norm": 0.10474701225757599,
"learning_rate": 0.00016296296296296295,
"loss": 0.8231,
"step": 23
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.09240291267633438,
"learning_rate": 0.00017037037037037037,
"loss": 0.7874,
"step": 24
},
{
"epoch": 0.28735632183908044,
"grad_norm": 0.07213829457759857,
"learning_rate": 0.00017777777777777779,
"loss": 0.8075,
"step": 25
},
{
"epoch": 0.2988505747126437,
"grad_norm": 0.0564056895673275,
"learning_rate": 0.0001851851851851852,
"loss": 0.779,
"step": 26
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.04973220080137253,
"learning_rate": 0.0001925925925925926,
"loss": 0.7878,
"step": 27
},
{
"epoch": 0.3218390804597701,
"grad_norm": 0.04492342844605446,
"learning_rate": 0.0002,
"loss": 0.7876,
"step": 28
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.050321951508522034,
"learning_rate": 0.00019999098778567212,
"loss": 0.7686,
"step": 29
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.05602027848362923,
"learning_rate": 0.00019996395276708856,
"loss": 0.7525,
"step": 30
},
{
"epoch": 0.3448275862068966,
"eval_loss": 0.7719228863716125,
"eval_runtime": 606.7346,
"eval_samples_per_second": 16.314,
"eval_steps_per_second": 0.129,
"step": 30
},
{
"epoch": 0.3563218390804598,
"grad_norm": 0.055378761142492294,
"learning_rate": 0.00019991889981715698,
"loss": 0.777,
"step": 31
},
{
"epoch": 0.367816091954023,
"grad_norm": 0.052891045808792114,
"learning_rate": 0.00019985583705641418,
"loss": 0.7797,
"step": 32
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.044340070337057114,
"learning_rate": 0.00019977477585156252,
"loss": 0.7744,
"step": 33
},
{
"epoch": 0.39080459770114945,
"grad_norm": 0.037772953510284424,
"learning_rate": 0.00019967573081342103,
"loss": 0.7631,
"step": 34
},
{
"epoch": 0.40229885057471265,
"grad_norm": 0.03553105518221855,
"learning_rate": 0.0001995587197942919,
"loss": 0.7512,
"step": 35
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.0346490740776062,
"learning_rate": 0.0001994237638847428,
"loss": 0.7567,
"step": 36
},
{
"epoch": 0.42528735632183906,
"grad_norm": 0.03471450135111809,
"learning_rate": 0.0001992708874098054,
"loss": 0.7723,
"step": 37
},
{
"epoch": 0.4367816091954023,
"grad_norm": 0.03868038207292557,
"learning_rate": 0.00019910011792459087,
"loss": 0.77,
"step": 38
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.04141271859407425,
"learning_rate": 0.00019891148620932318,
"loss": 0.7764,
"step": 39
},
{
"epoch": 0.45977011494252873,
"grad_norm": 0.04017995670437813,
"learning_rate": 0.00019870502626379127,
"loss": 0.7418,
"step": 40
},
{
"epoch": 0.45977011494252873,
"eval_loss": 0.7480100393295288,
"eval_runtime": 601.4446,
"eval_samples_per_second": 16.457,
"eval_steps_per_second": 0.13,
"step": 40
},
{
"epoch": 0.47126436781609193,
"grad_norm": 0.04055652394890785,
"learning_rate": 0.00019848077530122083,
"loss": 0.7487,
"step": 41
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.03968408331274986,
"learning_rate": 0.00019823877374156647,
"loss": 0.7417,
"step": 42
},
{
"epoch": 0.4942528735632184,
"grad_norm": 0.03533465415239334,
"learning_rate": 0.00019797906520422677,
"loss": 0.7437,
"step": 43
},
{
"epoch": 0.5057471264367817,
"grad_norm": 0.03385720029473305,
"learning_rate": 0.00019770169650018172,
"loss": 0.7752,
"step": 44
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.030534420162439346,
"learning_rate": 0.00019740671762355548,
"loss": 0.7363,
"step": 45
},
{
"epoch": 0.5287356321839081,
"grad_norm": 0.02837216667830944,
"learning_rate": 0.0001970941817426052,
"loss": 0.74,
"step": 46
},
{
"epoch": 0.5402298850574713,
"grad_norm": 0.02879689820110798,
"learning_rate": 0.00019676414519013781,
"loss": 0.7246,
"step": 47
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.02807699516415596,
"learning_rate": 0.00019641666745335624,
"loss": 0.7465,
"step": 48
},
{
"epoch": 0.5632183908045977,
"grad_norm": 0.029897579923272133,
"learning_rate": 0.00019605181116313724,
"loss": 0.721,
"step": 49
},
{
"epoch": 0.5747126436781609,
"grad_norm": 0.028458919376134872,
"learning_rate": 0.00019566964208274254,
"loss": 0.7291,
"step": 50
},
{
"epoch": 0.5747126436781609,
"eval_loss": 0.7319945096969604,
"eval_runtime": 601.7402,
"eval_samples_per_second": 16.449,
"eval_steps_per_second": 0.13,
"step": 50
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.028347671031951904,
"learning_rate": 0.00019527022909596536,
"loss": 0.7396,
"step": 51
},
{
"epoch": 0.5977011494252874,
"grad_norm": 0.028352508321404457,
"learning_rate": 0.00019485364419471454,
"loss": 0.733,
"step": 52
},
{
"epoch": 0.6091954022988506,
"grad_norm": 0.025768019258975983,
"learning_rate": 0.00019441996246603846,
"loss": 0.7378,
"step": 53
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.0259398240596056,
"learning_rate": 0.00019396926207859084,
"loss": 0.7293,
"step": 54
},
{
"epoch": 0.632183908045977,
"grad_norm": 0.024649152532219887,
"learning_rate": 0.0001935016242685415,
"loss": 0.7287,
"step": 55
},
{
"epoch": 0.6436781609195402,
"grad_norm": 0.026843328028917313,
"learning_rate": 0.00019301713332493386,
"loss": 0.7354,
"step": 56
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.028928296640515327,
"learning_rate": 0.00019251587657449236,
"loss": 0.7376,
"step": 57
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.027994588017463684,
"learning_rate": 0.00019199794436588243,
"loss": 0.7389,
"step": 58
},
{
"epoch": 0.6781609195402298,
"grad_norm": 0.027422698214650154,
"learning_rate": 0.00019146343005342547,
"loss": 0.7275,
"step": 59
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.02938409335911274,
"learning_rate": 0.0001909124299802724,
"loss": 0.7244,
"step": 60
},
{
"epoch": 0.6896551724137931,
"eval_loss": 0.7189474701881409,
"eval_runtime": 605.038,
"eval_samples_per_second": 16.359,
"eval_steps_per_second": 0.129,
"step": 60
},
{
"epoch": 0.7011494252873564,
"grad_norm": 0.02854277938604355,
"learning_rate": 0.00019034504346103823,
"loss": 0.7339,
"step": 61
},
{
"epoch": 0.7126436781609196,
"grad_norm": 0.028162814676761627,
"learning_rate": 0.0001897613727639014,
"loss": 0.7349,
"step": 62
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.027026742696762085,
"learning_rate": 0.0001891615230921703,
"loss": 0.7008,
"step": 63
},
{
"epoch": 0.735632183908046,
"grad_norm": 0.02867995575070381,
"learning_rate": 0.000188545602565321,
"loss": 0.7404,
"step": 64
},
{
"epoch": 0.7471264367816092,
"grad_norm": 0.025570319965481758,
"learning_rate": 0.00018791372219950948,
"loss": 0.7203,
"step": 65
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.026673492044210434,
"learning_rate": 0.00018726599588756145,
"loss": 0.7188,
"step": 66
},
{
"epoch": 0.7701149425287356,
"grad_norm": 0.028060389682650566,
"learning_rate": 0.00018660254037844388,
"loss": 0.7129,
"step": 67
},
{
"epoch": 0.7816091954022989,
"grad_norm": 0.025640789419412613,
"learning_rate": 0.0001859234752562217,
"loss": 0.7081,
"step": 68
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.026264475658535957,
"learning_rate": 0.00018522892291850335,
"loss": 0.6971,
"step": 69
},
{
"epoch": 0.8045977011494253,
"grad_norm": 0.028900163248181343,
"learning_rate": 0.0001845190085543795,
"loss": 0.721,
"step": 70
},
{
"epoch": 0.8045977011494253,
"eval_loss": 0.7084506154060364,
"eval_runtime": 603.3937,
"eval_samples_per_second": 16.404,
"eval_steps_per_second": 0.129,
"step": 70
},
{
"epoch": 0.8160919540229885,
"grad_norm": 0.02878301776945591,
"learning_rate": 0.00018379386012185814,
"loss": 0.7192,
"step": 71
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.028177903965115547,
"learning_rate": 0.00018305360832480117,
"loss": 0.7258,
"step": 72
},
{
"epoch": 0.8390804597701149,
"grad_norm": 0.027689015492796898,
"learning_rate": 0.00018229838658936564,
"loss": 0.7327,
"step": 73
},
{
"epoch": 0.8505747126436781,
"grad_norm": 0.027420515194535255,
"learning_rate": 0.00018152833103995443,
"loss": 0.7007,
"step": 74
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.02628767490386963,
"learning_rate": 0.0001807435804746807,
"loss": 0.7106,
"step": 75
},
{
"epoch": 0.8735632183908046,
"grad_norm": 0.027107784524559975,
"learning_rate": 0.00017994427634035015,
"loss": 0.7056,
"step": 76
},
{
"epoch": 0.8850574712643678,
"grad_norm": 0.026692209765315056,
"learning_rate": 0.0001791305627069662,
"loss": 0.6898,
"step": 77
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.02739943191409111,
"learning_rate": 0.00017830258624176225,
"loss": 0.71,
"step": 78
},
{
"epoch": 0.9080459770114943,
"grad_norm": 0.027159228920936584,
"learning_rate": 0.00017746049618276545,
"loss": 0.7104,
"step": 79
},
{
"epoch": 0.9195402298850575,
"grad_norm": 0.027961738407611847,
"learning_rate": 0.0001766044443118978,
"loss": 0.6964,
"step": 80
},
{
"epoch": 0.9195402298850575,
"eval_loss": 0.6998673677444458,
"eval_runtime": 599.046,
"eval_samples_per_second": 16.523,
"eval_steps_per_second": 0.13,
"step": 80
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.028458958491683006,
"learning_rate": 0.00017573458492761801,
"loss": 0.7048,
"step": 81
},
{
"epoch": 0.9425287356321839,
"grad_norm": 0.0295415036380291,
"learning_rate": 0.00017485107481711012,
"loss": 0.7089,
"step": 82
},
{
"epoch": 0.9540229885057471,
"grad_norm": 0.027641592547297478,
"learning_rate": 0.00017395407322802372,
"loss": 0.7184,
"step": 83
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.029828142374753952,
"learning_rate": 0.00017304374183977033,
"loss": 0.6878,
"step": 84
},
{
"epoch": 0.9770114942528736,
"grad_norm": 0.029184194281697273,
"learning_rate": 0.00017212024473438147,
"loss": 0.7223,
"step": 85
},
{
"epoch": 0.9885057471264368,
"grad_norm": 0.02929309941828251,
"learning_rate": 0.00017118374836693406,
"loss": 0.6936,
"step": 86
},
{
"epoch": 1.0,
"grad_norm": 0.03450490161776543,
"learning_rate": 0.00017023442153554777,
"loss": 0.6906,
"step": 87
},
{
"epoch": 1.0114942528735633,
"grad_norm": 0.03116275928914547,
"learning_rate": 0.00016927243535095997,
"loss": 0.6915,
"step": 88
},
{
"epoch": 1.0229885057471264,
"grad_norm": 0.03145065903663635,
"learning_rate": 0.00016829796320568416,
"loss": 0.6792,
"step": 89
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.03122427873313427,
"learning_rate": 0.00016731118074275704,
"loss": 0.6965,
"step": 90
},
{
"epoch": 1.0344827586206897,
"eval_loss": 0.6928127408027649,
"eval_runtime": 606.6499,
"eval_samples_per_second": 16.316,
"eval_steps_per_second": 0.129,
"step": 90
},
{
"epoch": 1.0459770114942528,
"grad_norm": 0.030179064720869064,
"learning_rate": 0.00016631226582407952,
"loss": 0.6726,
"step": 91
},
{
"epoch": 1.0574712643678161,
"grad_norm": 0.029219962656497955,
"learning_rate": 0.0001653013984983585,
"loss": 0.6792,
"step": 92
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.02873355709016323,
"learning_rate": 0.00016427876096865394,
"loss": 0.6921,
"step": 93
},
{
"epoch": 1.0804597701149425,
"grad_norm": 0.029455283656716347,
"learning_rate": 0.00016324453755953773,
"loss": 0.6829,
"step": 94
},
{
"epoch": 1.0919540229885056,
"grad_norm": 0.030247965827584267,
"learning_rate": 0.0001621989146838704,
"loss": 0.6737,
"step": 95
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.029829107224941254,
"learning_rate": 0.00016114208080920123,
"loss": 0.6852,
"step": 96
},
{
"epoch": 1.1149425287356323,
"grad_norm": 0.031125420704483986,
"learning_rate": 0.0001600742264237979,
"loss": 0.6863,
"step": 97
},
{
"epoch": 1.1264367816091954,
"grad_norm": 0.029601775109767914,
"learning_rate": 0.00015899554400231232,
"loss": 0.6785,
"step": 98
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.031090950593352318,
"learning_rate": 0.0001579062279710879,
"loss": 0.7001,
"step": 99
},
{
"epoch": 1.1494252873563218,
"grad_norm": 0.030084125697612762,
"learning_rate": 0.00015680647467311557,
"loss": 0.6856,
"step": 100
},
{
"epoch": 1.1494252873563218,
"eval_loss": 0.686887264251709,
"eval_runtime": 599.4005,
"eval_samples_per_second": 16.513,
"eval_steps_per_second": 0.13,
"step": 100
},
{
"epoch": 1.160919540229885,
"grad_norm": 0.03157448023557663,
"learning_rate": 0.00015569648233264394,
"loss": 0.6836,
"step": 101
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.032123371958732605,
"learning_rate": 0.00015457645101945046,
"loss": 0.6827,
"step": 102
},
{
"epoch": 1.1839080459770115,
"grad_norm": 0.03208347037434578,
"learning_rate": 0.0001534465826127801,
"loss": 0.6823,
"step": 103
},
{
"epoch": 1.1954022988505748,
"grad_norm": 0.030994586646556854,
"learning_rate": 0.00015230708076495775,
"loss": 0.6729,
"step": 104
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.03224639222025871,
"learning_rate": 0.00015115815086468102,
"loss": 0.6778,
"step": 105
},
{
"epoch": 1.2183908045977012,
"grad_norm": 0.030437005683779716,
"learning_rate": 0.00015000000000000001,
"loss": 0.674,
"step": 106
},
{
"epoch": 1.2298850574712643,
"grad_norm": 0.030820753425359726,
"learning_rate": 0.00014883283692099112,
"loss": 0.6799,
"step": 107
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.03301486000418663,
"learning_rate": 0.0001476568720021308,
"loss": 0.6851,
"step": 108
},
{
"epoch": 1.2528735632183907,
"grad_norm": 0.033183734863996506,
"learning_rate": 0.00014647231720437686,
"loss": 0.7094,
"step": 109
},
{
"epoch": 1.264367816091954,
"grad_norm": 0.03154841437935829,
"learning_rate": 0.00014527938603696376,
"loss": 0.6812,
"step": 110
},
{
"epoch": 1.264367816091954,
"eval_loss": 0.6802834868431091,
"eval_runtime": 597.3843,
"eval_samples_per_second": 16.569,
"eval_steps_per_second": 0.131,
"step": 110
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.03554477170109749,
"learning_rate": 0.00014407829351891857,
"loss": 0.679,
"step": 111
},
{
"epoch": 1.2873563218390804,
"grad_norm": 0.03362204134464264,
"learning_rate": 0.00014286925614030542,
"loss": 0.6722,
"step": 112
},
{
"epoch": 1.2988505747126438,
"grad_norm": 0.032853253185749054,
"learning_rate": 0.00014165249182320402,
"loss": 0.6879,
"step": 113
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.033823542296886444,
"learning_rate": 0.0001404282198824305,
"loss": 0.6627,
"step": 114
},
{
"epoch": 1.3218390804597702,
"grad_norm": 0.0326109379529953,
"learning_rate": 0.00013919666098600753,
"loss": 0.6751,
"step": 115
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.03431953117251396,
"learning_rate": 0.00013795803711538966,
"loss": 0.6827,
"step": 116
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.03328808769583702,
"learning_rate": 0.00013671257152545277,
"loss": 0.6648,
"step": 117
},
{
"epoch": 1.3563218390804597,
"grad_norm": 0.035666704177856445,
"learning_rate": 0.00013546048870425356,
"loss": 0.6771,
"step": 118
},
{
"epoch": 1.367816091954023,
"grad_norm": 0.03441452234983444,
"learning_rate": 0.00013420201433256689,
"loss": 0.6623,
"step": 119
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.03363391384482384,
"learning_rate": 0.00013293737524320797,
"loss": 0.6734,
"step": 120
},
{
"epoch": 1.3793103448275863,
"eval_loss": 0.6736027002334595,
"eval_runtime": 601.0855,
"eval_samples_per_second": 16.467,
"eval_steps_per_second": 0.13,
"step": 120
},
{
"epoch": 1.3908045977011494,
"grad_norm": 0.03441128134727478,
"learning_rate": 0.00013166679938014726,
"loss": 0.6625,
"step": 121
},
{
"epoch": 1.4022988505747127,
"grad_norm": 0.03332269564270973,
"learning_rate": 0.0001303905157574247,
"loss": 0.6738,
"step": 122
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.03411813825368881,
"learning_rate": 0.00012910875441787128,
"loss": 0.6511,
"step": 123
},
{
"epoch": 1.4252873563218391,
"grad_norm": 0.034489769488573074,
"learning_rate": 0.0001278217463916453,
"loss": 0.6579,
"step": 124
},
{
"epoch": 1.4367816091954024,
"grad_norm": 0.03562283143401146,
"learning_rate": 0.0001265297236545901,
"loss": 0.6508,
"step": 125
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.034433409571647644,
"learning_rate": 0.00012523291908642217,
"loss": 0.6867,
"step": 126
},
{
"epoch": 1.4597701149425286,
"grad_norm": 0.034091442823410034,
"learning_rate": 0.0001239315664287558,
"loss": 0.6609,
"step": 127
},
{
"epoch": 1.471264367816092,
"grad_norm": 0.034659866243600845,
"learning_rate": 0.00012262590024297225,
"loss": 0.6708,
"step": 128
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.03355137258768082,
"learning_rate": 0.0001213161558679416,
"loss": 0.6853,
"step": 129
},
{
"epoch": 1.4942528735632183,
"grad_norm": 0.03497845306992531,
"learning_rate": 0.00012000256937760445,
"loss": 0.658,
"step": 130
},
{
"epoch": 1.4942528735632183,
"eval_loss": 0.6697064638137817,
"eval_runtime": 595.4108,
"eval_samples_per_second": 16.624,
"eval_steps_per_second": 0.131,
"step": 130
},
{
"epoch": 1.5057471264367817,
"grad_norm": 0.03422487527132034,
"learning_rate": 0.00011868537753842051,
"loss": 0.6688,
"step": 131
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.034450776875019073,
"learning_rate": 0.00011736481776669306,
"loss": 0.6828,
"step": 132
},
{
"epoch": 1.528735632183908,
"grad_norm": 0.034574706107378006,
"learning_rate": 0.00011604112808577603,
"loss": 0.6776,
"step": 133
},
{
"epoch": 1.5402298850574714,
"grad_norm": 0.03596516326069832,
"learning_rate": 0.00011471454708317162,
"loss": 0.6276,
"step": 134
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.035429686307907104,
"learning_rate": 0.00011338531386752618,
"loss": 0.6427,
"step": 135
},
{
"epoch": 1.5632183908045976,
"grad_norm": 0.033536769449710846,
"learning_rate": 0.0001120536680255323,
"loss": 0.652,
"step": 136
},
{
"epoch": 1.5747126436781609,
"grad_norm": 0.03417116403579712,
"learning_rate": 0.00011071984957874479,
"loss": 0.6725,
"step": 137
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.03480486571788788,
"learning_rate": 0.00010938409894031794,
"loss": 0.6479,
"step": 138
},
{
"epoch": 1.5977011494252875,
"grad_norm": 0.03562786802649498,
"learning_rate": 0.00010804665687167262,
"loss": 0.6681,
"step": 139
},
{
"epoch": 1.6091954022988506,
"grad_norm": 0.034858204424381256,
"learning_rate": 0.00010670776443910024,
"loss": 0.6703,
"step": 140
},
{
"epoch": 1.6091954022988506,
"eval_loss": 0.6661256551742554,
"eval_runtime": 590.9429,
"eval_samples_per_second": 16.75,
"eval_steps_per_second": 0.132,
"step": 140
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.03447253257036209,
"learning_rate": 0.00010536766297031215,
"loss": 0.6643,
"step": 141
},
{
"epoch": 1.632183908045977,
"grad_norm": 0.03508715331554413,
"learning_rate": 0.00010402659401094152,
"loss": 0.6583,
"step": 142
},
{
"epoch": 1.6436781609195403,
"grad_norm": 0.0343579463660717,
"learning_rate": 0.00010268479928100614,
"loss": 0.6753,
"step": 143
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.0350567027926445,
"learning_rate": 0.00010134252063133975,
"loss": 0.6493,
"step": 144
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.034686826169490814,
"learning_rate": 0.0001,
"loss": 0.6515,
"step": 145
},
{
"epoch": 1.6781609195402298,
"grad_norm": 0.035702142864465714,
"learning_rate": 9.865747936866027e-05,
"loss": 0.6638,
"step": 146
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.034140318632125854,
"learning_rate": 9.73152007189939e-05,
"loss": 0.6496,
"step": 147
},
{
"epoch": 1.7011494252873565,
"grad_norm": 0.038083408027887344,
"learning_rate": 9.597340598905852e-05,
"loss": 0.6893,
"step": 148
},
{
"epoch": 1.7126436781609196,
"grad_norm": 0.03810959309339523,
"learning_rate": 9.463233702968783e-05,
"loss": 0.6728,
"step": 149
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.033978622406721115,
"learning_rate": 9.329223556089975e-05,
"loss": 0.6814,
"step": 150
},
{
"epoch": 1.7241379310344827,
"eval_loss": 0.6631415486335754,
"eval_runtime": 596.1091,
"eval_samples_per_second": 16.604,
"eval_steps_per_second": 0.131,
"step": 150
},
{
"epoch": 1.735632183908046,
"grad_norm": 0.03391426429152489,
"learning_rate": 9.195334312832742e-05,
"loss": 0.6421,
"step": 151
},
{
"epoch": 1.7471264367816093,
"grad_norm": 0.03562890738248825,
"learning_rate": 9.061590105968208e-05,
"loss": 0.6739,
"step": 152
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.03706149384379387,
"learning_rate": 8.928015042125523e-05,
"loss": 0.6579,
"step": 153
},
{
"epoch": 1.7701149425287355,
"grad_norm": 0.03591468557715416,
"learning_rate": 8.79463319744677e-05,
"loss": 0.645,
"step": 154
},
{
"epoch": 1.7816091954022988,
"grad_norm": 0.03639413043856621,
"learning_rate": 8.661468613247387e-05,
"loss": 0.6594,
"step": 155
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.03598083183169365,
"learning_rate": 8.528545291682838e-05,
"loss": 0.6723,
"step": 156
},
{
"epoch": 1.8045977011494254,
"grad_norm": 0.03628537803888321,
"learning_rate": 8.395887191422397e-05,
"loss": 0.6715,
"step": 157
},
{
"epoch": 1.8160919540229885,
"grad_norm": 0.03826047480106354,
"learning_rate": 8.263518223330697e-05,
"loss": 0.6767,
"step": 158
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.03769649192690849,
"learning_rate": 8.131462246157953e-05,
"loss": 0.6552,
"step": 159
},
{
"epoch": 1.839080459770115,
"grad_norm": 0.03599262982606888,
"learning_rate": 7.999743062239557e-05,
"loss": 0.6688,
"step": 160
},
{
"epoch": 1.839080459770115,
"eval_loss": 0.6607028245925903,
"eval_runtime": 589.6737,
"eval_samples_per_second": 16.786,
"eval_steps_per_second": 0.132,
"step": 160
},
{
"epoch": 1.8505747126436782,
"grad_norm": 0.03584510087966919,
"learning_rate": 7.868384413205842e-05,
"loss": 0.672,
"step": 161
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.03545341268181801,
"learning_rate": 7.73740997570278e-05,
"loss": 0.6629,
"step": 162
},
{
"epoch": 1.8735632183908046,
"grad_norm": 0.035858072340488434,
"learning_rate": 7.606843357124426e-05,
"loss": 0.6391,
"step": 163
},
{
"epoch": 1.8850574712643677,
"grad_norm": 0.035794083029031754,
"learning_rate": 7.476708091357782e-05,
"loss": 0.6516,
"step": 164
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.03674920275807381,
"learning_rate": 7.347027634540993e-05,
"loss": 0.6624,
"step": 165
},
{
"epoch": 1.9080459770114944,
"grad_norm": 0.034701887518167496,
"learning_rate": 7.217825360835473e-05,
"loss": 0.6574,
"step": 166
},
{
"epoch": 1.9195402298850575,
"grad_norm": 0.03606560826301575,
"learning_rate": 7.089124558212871e-05,
"loss": 0.6668,
"step": 167
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.03570757061243057,
"learning_rate": 6.960948424257532e-05,
"loss": 0.6358,
"step": 168
},
{
"epoch": 1.9425287356321839,
"grad_norm": 0.034459397196769714,
"learning_rate": 6.833320061985277e-05,
"loss": 0.6294,
"step": 169
},
{
"epoch": 1.9540229885057472,
"grad_norm": 0.035828083753585815,
"learning_rate": 6.706262475679205e-05,
"loss": 0.6781,
"step": 170
},
{
"epoch": 1.9540229885057472,
"eval_loss": 0.6586677432060242,
"eval_runtime": 590.832,
"eval_samples_per_second": 16.753,
"eval_steps_per_second": 0.132,
"step": 170
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.03741836175322533,
"learning_rate": 6.579798566743314e-05,
"loss": 0.6679,
"step": 171
},
{
"epoch": 1.9770114942528736,
"grad_norm": 0.03573041409254074,
"learning_rate": 6.453951129574644e-05,
"loss": 0.6805,
"step": 172
},
{
"epoch": 1.9885057471264367,
"grad_norm": 0.03586418181657791,
"learning_rate": 6.328742847454724e-05,
"loss": 0.6606,
"step": 173
},
{
"epoch": 2.0,
"grad_norm": 0.03636249899864197,
"learning_rate": 6.204196288461037e-05,
"loss": 0.6962,
"step": 174
},
{
"epoch": 2.0114942528735633,
"grad_norm": 0.04032210260629654,
"learning_rate": 6.080333901399251e-05,
"loss": 0.6426,
"step": 175
},
{
"epoch": 2.0229885057471266,
"grad_norm": 0.039117176085710526,
"learning_rate": 5.957178011756952e-05,
"loss": 0.6418,
"step": 176
},
{
"epoch": 2.0344827586206895,
"grad_norm": 0.037345871329307556,
"learning_rate": 5.834750817679606e-05,
"loss": 0.6406,
"step": 177
},
{
"epoch": 2.045977011494253,
"grad_norm": 0.03830842301249504,
"learning_rate": 5.713074385969457e-05,
"loss": 0.6262,
"step": 178
},
{
"epoch": 2.057471264367816,
"grad_norm": 0.037265144288539886,
"learning_rate": 5.59217064810814e-05,
"loss": 0.6385,
"step": 179
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.03784786909818649,
"learning_rate": 5.472061396303629e-05,
"loss": 0.618,
"step": 180
},
{
"epoch": 2.0689655172413794,
"eval_loss": 0.6581570506095886,
"eval_runtime": 593.2384,
"eval_samples_per_second": 16.685,
"eval_steps_per_second": 0.131,
"step": 180
},
{
"epoch": 2.0804597701149423,
"grad_norm": 0.03711731731891632,
"learning_rate": 5.3527682795623146e-05,
"loss": 0.6354,
"step": 181
},
{
"epoch": 2.0919540229885056,
"grad_norm": 0.037277545779943466,
"learning_rate": 5.234312799786921e-05,
"loss": 0.631,
"step": 182
},
{
"epoch": 2.103448275862069,
"grad_norm": 0.040995873510837555,
"learning_rate": 5.116716307900893e-05,
"loss": 0.631,
"step": 183
},
{
"epoch": 2.1149425287356323,
"grad_norm": 0.038000449538230896,
"learning_rate": 5.000000000000002e-05,
"loss": 0.6691,
"step": 184
},
{
"epoch": 2.1264367816091956,
"grad_norm": 0.03552795201539993,
"learning_rate": 4.884184913531902e-05,
"loss": 0.6147,
"step": 185
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.03610774129629135,
"learning_rate": 4.7692919235042255e-05,
"loss": 0.6259,
"step": 186
},
{
"epoch": 2.1494252873563218,
"grad_norm": 0.038160186260938644,
"learning_rate": 4.6553417387219886e-05,
"loss": 0.6375,
"step": 187
},
{
"epoch": 2.160919540229885,
"grad_norm": 0.03843645006418228,
"learning_rate": 4.542354898054953e-05,
"loss": 0.6329,
"step": 188
},
{
"epoch": 2.1724137931034484,
"grad_norm": 0.03745341673493385,
"learning_rate": 4.430351766735609e-05,
"loss": 0.6379,
"step": 189
},
{
"epoch": 2.1839080459770113,
"grad_norm": 0.03651278465986252,
"learning_rate": 4.3193525326884435e-05,
"loss": 0.6575,
"step": 190
},
{
"epoch": 2.1839080459770113,
"eval_loss": 0.657158374786377,
"eval_runtime": 591.1387,
"eval_samples_per_second": 16.744,
"eval_steps_per_second": 0.132,
"step": 190
},
{
"epoch": 2.1954022988505746,
"grad_norm": 0.03700386360287666,
"learning_rate": 4.209377202891212e-05,
"loss": 0.643,
"step": 191
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.038101959973573685,
"learning_rate": 4.100445599768774e-05,
"loss": 0.6349,
"step": 192
},
{
"epoch": 2.218390804597701,
"grad_norm": 0.036965154111385345,
"learning_rate": 3.99257735762021e-05,
"loss": 0.6404,
"step": 193
},
{
"epoch": 2.2298850574712645,
"grad_norm": 0.03756578266620636,
"learning_rate": 3.885791919079878e-05,
"loss": 0.6615,
"step": 194
},
{
"epoch": 2.2413793103448274,
"grad_norm": 0.035037923604249954,
"learning_rate": 3.7801085316129615e-05,
"loss": 0.6237,
"step": 195
},
{
"epoch": 2.2528735632183907,
"grad_norm": 0.03736288473010063,
"learning_rate": 3.675546244046228e-05,
"loss": 0.6482,
"step": 196
},
{
"epoch": 2.264367816091954,
"grad_norm": 0.03758701682090759,
"learning_rate": 3.5721239031346066e-05,
"loss": 0.6101,
"step": 197
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.03676440194249153,
"learning_rate": 3.469860150164152e-05,
"loss": 0.6499,
"step": 198
},
{
"epoch": 2.2873563218390807,
"grad_norm": 0.03628651425242424,
"learning_rate": 3.36877341759205e-05,
"loss": 0.629,
"step": 199
},
{
"epoch": 2.2988505747126435,
"grad_norm": 0.03841525688767433,
"learning_rate": 3.268881925724297e-05,
"loss": 0.6372,
"step": 200
},
{
"epoch": 2.2988505747126435,
"eval_loss": 0.6563421487808228,
"eval_runtime": 591.4937,
"eval_samples_per_second": 16.734,
"eval_steps_per_second": 0.132,
"step": 200
},
{
"epoch": 2.310344827586207,
"grad_norm": 0.03878968209028244,
"learning_rate": 3.170203679431584e-05,
"loss": 0.6417,
"step": 201
},
{
"epoch": 2.32183908045977,
"grad_norm": 0.03762966766953468,
"learning_rate": 3.072756464904006e-05,
"loss": 0.6434,
"step": 202
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.03814293071627617,
"learning_rate": 2.976557846445225e-05,
"loss": 0.643,
"step": 203
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.03540797904133797,
"learning_rate": 2.881625163306596e-05,
"loss": 0.6374,
"step": 204
},
{
"epoch": 2.3563218390804597,
"grad_norm": 0.03709061071276665,
"learning_rate": 2.7879755265618555e-05,
"loss": 0.6418,
"step": 205
},
{
"epoch": 2.367816091954023,
"grad_norm": 0.03803767263889313,
"learning_rate": 2.6956258160229695e-05,
"loss": 0.632,
"step": 206
},
{
"epoch": 2.3793103448275863,
"grad_norm": 0.03654790297150612,
"learning_rate": 2.6045926771976303e-05,
"loss": 0.6538,
"step": 207
},
{
"epoch": 2.3908045977011496,
"grad_norm": 0.03603474050760269,
"learning_rate": 2.514892518288988e-05,
"loss": 0.6145,
"step": 208
},
{
"epoch": 2.4022988505747125,
"grad_norm": 0.036107271909713745,
"learning_rate": 2.4265415072382016e-05,
"loss": 0.6222,
"step": 209
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.036303840577602386,
"learning_rate": 2.339555568810221e-05,
"loss": 0.617,
"step": 210
},
{
"epoch": 2.413793103448276,
"eval_loss": 0.6558452248573303,
"eval_runtime": 595.1508,
"eval_samples_per_second": 16.631,
"eval_steps_per_second": 0.131,
"step": 210
},
{
"epoch": 2.425287356321839,
"grad_norm": 0.037371959537267685,
"learning_rate": 2.2539503817234553e-05,
"loss": 0.654,
"step": 211
},
{
"epoch": 2.4367816091954024,
"grad_norm": 0.037925343960523605,
"learning_rate": 2.1697413758237784e-05,
"loss": 0.6322,
"step": 212
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.03855719789862633,
"learning_rate": 2.0869437293033835e-05,
"loss": 0.6335,
"step": 213
},
{
"epoch": 2.4597701149425286,
"grad_norm": 0.037308286875486374,
"learning_rate": 2.0055723659649904e-05,
"loss": 0.6184,
"step": 214
},
{
"epoch": 2.471264367816092,
"grad_norm": 0.03688681870698929,
"learning_rate": 1.9256419525319313e-05,
"loss": 0.627,
"step": 215
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.038670193403959274,
"learning_rate": 1.8471668960045574e-05,
"loss": 0.6487,
"step": 216
},
{
"epoch": 2.4942528735632186,
"grad_norm": 0.03826979547739029,
"learning_rate": 1.7701613410634365e-05,
"loss": 0.6337,
"step": 217
},
{
"epoch": 2.5057471264367814,
"grad_norm": 0.03768225386738777,
"learning_rate": 1.6946391675198836e-05,
"loss": 0.6356,
"step": 218
},
{
"epoch": 2.5172413793103448,
"grad_norm": 0.037600524723529816,
"learning_rate": 1.620613987814189e-05,
"loss": 0.6237,
"step": 219
},
{
"epoch": 2.528735632183908,
"grad_norm": 0.036940865218639374,
"learning_rate": 1.5480991445620542e-05,
"loss": 0.6152,
"step": 220
},
{
"epoch": 2.528735632183908,
"eval_loss": 0.655238926410675,
"eval_runtime": 592.9652,
"eval_samples_per_second": 16.692,
"eval_steps_per_second": 0.132,
"step": 220
},
{
"epoch": 2.5402298850574714,
"grad_norm": 0.03826668858528137,
"learning_rate": 1.4771077081496654e-05,
"loss": 0.6511,
"step": 221
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.0361490473151207,
"learning_rate": 1.4076524743778319e-05,
"loss": 0.6311,
"step": 222
},
{
"epoch": 2.5632183908045976,
"grad_norm": 0.03727561607956886,
"learning_rate": 1.339745962155613e-05,
"loss": 0.6324,
"step": 223
},
{
"epoch": 2.574712643678161,
"grad_norm": 0.038558244705200195,
"learning_rate": 1.2734004112438568e-05,
"loss": 0.6568,
"step": 224
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.03707597777247429,
"learning_rate": 1.2086277800490554e-05,
"loss": 0.649,
"step": 225
},
{
"epoch": 2.5977011494252875,
"grad_norm": 0.037075500935316086,
"learning_rate": 1.1454397434679021e-05,
"loss": 0.6603,
"step": 226
},
{
"epoch": 2.609195402298851,
"grad_norm": 0.03692416474223137,
"learning_rate": 1.083847690782972e-05,
"loss": 0.6295,
"step": 227
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.03675093874335289,
"learning_rate": 1.0238627236098619e-05,
"loss": 0.6209,
"step": 228
},
{
"epoch": 2.632183908045977,
"grad_norm": 0.03640436753630638,
"learning_rate": 9.65495653896179e-06,
"loss": 0.6339,
"step": 229
},
{
"epoch": 2.6436781609195403,
"grad_norm": 0.037452854216098785,
"learning_rate": 9.08757001972762e-06,
"loss": 0.6407,
"step": 230
},
{
"epoch": 2.6436781609195403,
"eval_loss": 0.6549434065818787,
"eval_runtime": 592.4603,
"eval_samples_per_second": 16.707,
"eval_steps_per_second": 0.132,
"step": 230
},
{
"epoch": 2.655172413793103,
"grad_norm": 0.03795592859387398,
"learning_rate": 8.536569946574546e-06,
"loss": 0.6534,
"step": 231
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.03788486495614052,
"learning_rate": 8.002055634117578e-06,
"loss": 0.6507,
"step": 232
},
{
"epoch": 2.67816091954023,
"grad_norm": 0.03641374036669731,
"learning_rate": 7.4841234255076495e-06,
"loss": 0.6282,
"step": 233
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.03832123428583145,
"learning_rate": 6.9828666750661795e-06,
"loss": 0.6276,
"step": 234
},
{
"epoch": 2.7011494252873565,
"grad_norm": 0.03677581995725632,
"learning_rate": 6.498375731458528e-06,
"loss": 0.639,
"step": 235
},
{
"epoch": 2.7126436781609193,
"grad_norm": 0.03623311221599579,
"learning_rate": 6.030737921409169e-06,
"loss": 0.6425,
"step": 236
},
{
"epoch": 2.7241379310344827,
"grad_norm": 0.036890316754579544,
"learning_rate": 5.580037533961546e-06,
"loss": 0.6438,
"step": 237
},
{
"epoch": 2.735632183908046,
"grad_norm": 0.03658117353916168,
"learning_rate": 5.146355805285452e-06,
"loss": 0.6275,
"step": 238
},
{
"epoch": 2.7471264367816093,
"grad_norm": 0.03849633410573006,
"learning_rate": 4.729770904034647e-06,
"loss": 0.6357,
"step": 239
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.03720705956220627,
"learning_rate": 4.3303579172574885e-06,
"loss": 0.6498,
"step": 240
},
{
"epoch": 2.7586206896551726,
"eval_loss": 0.6547917723655701,
"eval_runtime": 593.6976,
"eval_samples_per_second": 16.672,
"eval_steps_per_second": 0.131,
"step": 240
},
{
"epoch": 2.7701149425287355,
"grad_norm": 0.03713015094399452,
"learning_rate": 3.948188836862776e-06,
"loss": 0.63,
"step": 241
},
{
"epoch": 2.781609195402299,
"grad_norm": 0.03703474998474121,
"learning_rate": 3.5833325466437694e-06,
"loss": 0.6267,
"step": 242
},
{
"epoch": 2.793103448275862,
"grad_norm": 0.0391690619289875,
"learning_rate": 3.2358548098621932e-06,
"loss": 0.6297,
"step": 243
},
{
"epoch": 2.8045977011494254,
"grad_norm": 0.03748522326350212,
"learning_rate": 2.905818257394799e-06,
"loss": 0.6289,
"step": 244
},
{
"epoch": 2.8160919540229887,
"grad_norm": 0.036214690655469894,
"learning_rate": 2.5932823764445392e-06,
"loss": 0.6274,
"step": 245
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.03697674348950386,
"learning_rate": 2.2983034998182997e-06,
"loss": 0.6205,
"step": 246
},
{
"epoch": 2.839080459770115,
"grad_norm": 0.0381477065384388,
"learning_rate": 2.0209347957732328e-06,
"loss": 0.6585,
"step": 247
},
{
"epoch": 2.8505747126436782,
"grad_norm": 0.03660466521978378,
"learning_rate": 1.7612262584335237e-06,
"loss": 0.6215,
"step": 248
},
{
"epoch": 2.862068965517241,
"grad_norm": 0.03698161989450455,
"learning_rate": 1.5192246987791981e-06,
"loss": 0.6454,
"step": 249
},
{
"epoch": 2.873563218390805,
"grad_norm": 0.037139616906642914,
"learning_rate": 1.2949737362087156e-06,
"loss": 0.6365,
"step": 250
},
{
"epoch": 2.873563218390805,
"eval_loss": 0.6547266840934753,
"eval_runtime": 593.9462,
"eval_samples_per_second": 16.665,
"eval_steps_per_second": 0.131,
"step": 250
},
{
"epoch": 2.8850574712643677,
"grad_norm": 0.03719855844974518,
"learning_rate": 1.0885137906768372e-06,
"loss": 0.628,
"step": 251
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.03709800913929939,
"learning_rate": 8.998820754091531e-07,
"loss": 0.6447,
"step": 252
},
{
"epoch": 2.9080459770114944,
"grad_norm": 0.03928203135728836,
"learning_rate": 7.291125901946027e-07,
"loss": 0.6615,
"step": 253
},
{
"epoch": 2.9195402298850572,
"grad_norm": 0.03835693374276161,
"learning_rate": 5.762361152572115e-07,
"loss": 0.6382,
"step": 254
},
{
"epoch": 2.9310344827586206,
"grad_norm": 0.037969332188367844,
"learning_rate": 4.412802057081278e-07,
"loss": 0.6339,
"step": 255
},
{
"epoch": 2.942528735632184,
"grad_norm": 0.0369732528924942,
"learning_rate": 3.2426918657900704e-07,
"loss": 0.6557,
"step": 256
},
{
"epoch": 2.954022988505747,
"grad_norm": 0.03954119607806206,
"learning_rate": 2.2522414843748618e-07,
"loss": 0.6419,
"step": 257
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.03722945228219032,
"learning_rate": 1.4416294358582384e-07,
"loss": 0.6224,
"step": 258
},
{
"epoch": 2.9770114942528734,
"grad_norm": 0.037498198449611664,
"learning_rate": 8.110018284304133e-08,
"loss": 0.6472,
"step": 259
},
{
"epoch": 2.9885057471264367,
"grad_norm": 0.03706786781549454,
"learning_rate": 3.60472329114625e-08,
"loss": 0.6532,
"step": 260
},
{
"epoch": 2.9885057471264367,
"eval_loss": 0.6546847820281982,
"eval_runtime": 591.4156,
"eval_samples_per_second": 16.736,
"eval_steps_per_second": 0.132,
"step": 260
},
{
"epoch": 3.0,
"grad_norm": 0.0390935055911541,
"learning_rate": 9.012214327897006e-09,
"loss": 0.6416,
"step": 261
},
{
"epoch": 3.0,
"step": 261,
"total_flos": 4.878497427033686e+19,
"train_loss": 0.6974607110023499,
"train_runtime": 68284.7248,
"train_samples_per_second": 3.914,
"train_steps_per_second": 0.004
}
],
"logging_steps": 1.0,
"max_steps": 261,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.878497427033686e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}