fin_simplifier / trainer_state.json
combe4259's picture
Upload 12 files
23a425a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 3600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027777777777777776,
"grad_norm": 104.33584594726562,
"learning_rate": 1.35e-06,
"loss": 13.5343,
"step": 10
},
{
"epoch": 0.05555555555555555,
"grad_norm": 57.29974365234375,
"learning_rate": 2.8500000000000002e-06,
"loss": 12.1877,
"step": 20
},
{
"epoch": 0.08333333333333333,
"grad_norm": 35.94429016113281,
"learning_rate": 4.35e-06,
"loss": 10.9908,
"step": 30
},
{
"epoch": 0.1111111111111111,
"grad_norm": 36.21198272705078,
"learning_rate": 5.850000000000001e-06,
"loss": 10.8169,
"step": 40
},
{
"epoch": 0.1388888888888889,
"grad_norm": 35.18545150756836,
"learning_rate": 7.35e-06,
"loss": 10.226,
"step": 50
},
{
"epoch": 0.16666666666666666,
"grad_norm": 30.034957885742188,
"learning_rate": 8.85e-06,
"loss": 9.8243,
"step": 60
},
{
"epoch": 0.19444444444444445,
"grad_norm": 26.107460021972656,
"learning_rate": 1.035e-05,
"loss": 9.7202,
"step": 70
},
{
"epoch": 0.2222222222222222,
"grad_norm": 28.405378341674805,
"learning_rate": 1.185e-05,
"loss": 9.655,
"step": 80
},
{
"epoch": 0.25,
"grad_norm": 31.871849060058594,
"learning_rate": 1.3350000000000001e-05,
"loss": 9.1121,
"step": 90
},
{
"epoch": 0.2777777777777778,
"grad_norm": 31.50947380065918,
"learning_rate": 1.485e-05,
"loss": 9.1896,
"step": 100
},
{
"epoch": 0.3055555555555556,
"grad_norm": 28.950199127197266,
"learning_rate": 1.635e-05,
"loss": 9.2017,
"step": 110
},
{
"epoch": 0.3333333333333333,
"grad_norm": 25.37229347229004,
"learning_rate": 1.785e-05,
"loss": 9.2022,
"step": 120
},
{
"epoch": 0.3611111111111111,
"grad_norm": 24.216697692871094,
"learning_rate": 1.935e-05,
"loss": 9.1344,
"step": 130
},
{
"epoch": 0.3888888888888889,
"grad_norm": 24.010272979736328,
"learning_rate": 2.085e-05,
"loss": 8.5873,
"step": 140
},
{
"epoch": 0.4166666666666667,
"grad_norm": 24.390907287597656,
"learning_rate": 2.235e-05,
"loss": 8.7195,
"step": 150
},
{
"epoch": 0.4444444444444444,
"grad_norm": 21.80168914794922,
"learning_rate": 2.385e-05,
"loss": 8.5485,
"step": 160
},
{
"epoch": 0.4722222222222222,
"grad_norm": 20.435110092163086,
"learning_rate": 2.535e-05,
"loss": 8.6359,
"step": 170
},
{
"epoch": 0.5,
"grad_norm": 24.11395835876465,
"learning_rate": 2.6850000000000002e-05,
"loss": 8.4246,
"step": 180
},
{
"epoch": 0.5277777777777778,
"grad_norm": 25.995811462402344,
"learning_rate": 2.8349999999999998e-05,
"loss": 8.5049,
"step": 190
},
{
"epoch": 0.5555555555555556,
"grad_norm": 23.826154708862305,
"learning_rate": 2.985e-05,
"loss": 8.4623,
"step": 200
},
{
"epoch": 0.5833333333333334,
"grad_norm": 20.094274520874023,
"learning_rate": 2.9920588235294118e-05,
"loss": 8.4495,
"step": 210
},
{
"epoch": 0.6111111111111112,
"grad_norm": 22.793804168701172,
"learning_rate": 2.983235294117647e-05,
"loss": 8.4131,
"step": 220
},
{
"epoch": 0.6388888888888888,
"grad_norm": 26.08730125427246,
"learning_rate": 2.9744117647058824e-05,
"loss": 8.3116,
"step": 230
},
{
"epoch": 0.6666666666666666,
"grad_norm": 24.06973648071289,
"learning_rate": 2.9655882352941176e-05,
"loss": 8.4003,
"step": 240
},
{
"epoch": 0.6944444444444444,
"grad_norm": 22.93770408630371,
"learning_rate": 2.956764705882353e-05,
"loss": 8.2579,
"step": 250
},
{
"epoch": 0.7222222222222222,
"grad_norm": 20.36606216430664,
"learning_rate": 2.9479411764705883e-05,
"loss": 8.2888,
"step": 260
},
{
"epoch": 0.75,
"grad_norm": 21.670818328857422,
"learning_rate": 2.9391176470588238e-05,
"loss": 8.1623,
"step": 270
},
{
"epoch": 0.7777777777777778,
"grad_norm": 17.550491333007812,
"learning_rate": 2.930294117647059e-05,
"loss": 8.0683,
"step": 280
},
{
"epoch": 0.8055555555555556,
"grad_norm": 25.335660934448242,
"learning_rate": 2.9214705882352944e-05,
"loss": 8.2784,
"step": 290
},
{
"epoch": 0.8333333333333334,
"grad_norm": 20.76925277709961,
"learning_rate": 2.9126470588235292e-05,
"loss": 8.2923,
"step": 300
},
{
"epoch": 0.8611111111111112,
"grad_norm": 23.108144760131836,
"learning_rate": 2.9038235294117647e-05,
"loss": 8.1554,
"step": 310
},
{
"epoch": 0.8888888888888888,
"grad_norm": 16.563575744628906,
"learning_rate": 2.895e-05,
"loss": 7.8543,
"step": 320
},
{
"epoch": 0.9166666666666666,
"grad_norm": 19.03083038330078,
"learning_rate": 2.8861764705882354e-05,
"loss": 8.0434,
"step": 330
},
{
"epoch": 0.9444444444444444,
"grad_norm": 18.121356964111328,
"learning_rate": 2.8773529411764706e-05,
"loss": 7.93,
"step": 340
},
{
"epoch": 0.9722222222222222,
"grad_norm": 21.65605926513672,
"learning_rate": 2.868529411764706e-05,
"loss": 8.0124,
"step": 350
},
{
"epoch": 1.0,
"grad_norm": 24.55741310119629,
"learning_rate": 2.8597058823529412e-05,
"loss": 7.547,
"step": 360
},
{
"epoch": 1.0277777777777777,
"grad_norm": 18.358055114746094,
"learning_rate": 2.8508823529411767e-05,
"loss": 6.9283,
"step": 370
},
{
"epoch": 1.0555555555555556,
"grad_norm": 18.642698287963867,
"learning_rate": 2.842058823529412e-05,
"loss": 7.1341,
"step": 380
},
{
"epoch": 1.0833333333333333,
"grad_norm": 30.198606491088867,
"learning_rate": 2.8332352941176474e-05,
"loss": 7.113,
"step": 390
},
{
"epoch": 1.1111111111111112,
"grad_norm": 19.851511001586914,
"learning_rate": 2.8244117647058825e-05,
"loss": 7.1696,
"step": 400
},
{
"epoch": 1.1388888888888888,
"grad_norm": 20.48548698425293,
"learning_rate": 2.8155882352941177e-05,
"loss": 7.1552,
"step": 410
},
{
"epoch": 1.1666666666666667,
"grad_norm": 18.470821380615234,
"learning_rate": 2.806764705882353e-05,
"loss": 6.8519,
"step": 420
},
{
"epoch": 1.1944444444444444,
"grad_norm": 20.821441650390625,
"learning_rate": 2.7979411764705883e-05,
"loss": 7.0113,
"step": 430
},
{
"epoch": 1.2222222222222223,
"grad_norm": 19.294477462768555,
"learning_rate": 2.7891176470588235e-05,
"loss": 7.3983,
"step": 440
},
{
"epoch": 1.25,
"grad_norm": 19.5776309967041,
"learning_rate": 2.780294117647059e-05,
"loss": 7.0649,
"step": 450
},
{
"epoch": 1.2777777777777777,
"grad_norm": 18.286705017089844,
"learning_rate": 2.771470588235294e-05,
"loss": 7.1587,
"step": 460
},
{
"epoch": 1.3055555555555556,
"grad_norm": 19.803571701049805,
"learning_rate": 2.7626470588235297e-05,
"loss": 6.9439,
"step": 470
},
{
"epoch": 1.3333333333333333,
"grad_norm": 20.182926177978516,
"learning_rate": 2.7538235294117648e-05,
"loss": 6.9072,
"step": 480
},
{
"epoch": 1.3611111111111112,
"grad_norm": 21.21506690979004,
"learning_rate": 2.7450000000000003e-05,
"loss": 6.7805,
"step": 490
},
{
"epoch": 1.3888888888888888,
"grad_norm": 18.884464263916016,
"learning_rate": 2.7361764705882355e-05,
"loss": 6.7809,
"step": 500
},
{
"epoch": 1.4166666666666667,
"grad_norm": 18.7882080078125,
"learning_rate": 2.7273529411764706e-05,
"loss": 7.1101,
"step": 510
},
{
"epoch": 1.4444444444444444,
"grad_norm": 17.172212600708008,
"learning_rate": 2.7185294117647058e-05,
"loss": 6.8579,
"step": 520
},
{
"epoch": 1.4722222222222223,
"grad_norm": 18.599943161010742,
"learning_rate": 2.7097058823529413e-05,
"loss": 6.9192,
"step": 530
},
{
"epoch": 1.5,
"grad_norm": 22.43140411376953,
"learning_rate": 2.7008823529411765e-05,
"loss": 6.9068,
"step": 540
},
{
"epoch": 1.5277777777777777,
"grad_norm": 16.94782257080078,
"learning_rate": 2.692058823529412e-05,
"loss": 7.0826,
"step": 550
},
{
"epoch": 1.5555555555555556,
"grad_norm": 18.50872802734375,
"learning_rate": 2.683235294117647e-05,
"loss": 6.8281,
"step": 560
},
{
"epoch": 1.5833333333333335,
"grad_norm": 22.08888053894043,
"learning_rate": 2.6744117647058826e-05,
"loss": 6.9378,
"step": 570
},
{
"epoch": 1.6111111111111112,
"grad_norm": 17.366363525390625,
"learning_rate": 2.6655882352941178e-05,
"loss": 6.6598,
"step": 580
},
{
"epoch": 1.6388888888888888,
"grad_norm": 17.86391830444336,
"learning_rate": 2.6567647058823533e-05,
"loss": 6.8419,
"step": 590
},
{
"epoch": 1.6666666666666665,
"grad_norm": 16.568017959594727,
"learning_rate": 2.6479411764705884e-05,
"loss": 6.9137,
"step": 600
},
{
"epoch": 1.6944444444444444,
"grad_norm": 21.780588150024414,
"learning_rate": 2.6391176470588236e-05,
"loss": 7.0719,
"step": 610
},
{
"epoch": 1.7222222222222223,
"grad_norm": 20.24903678894043,
"learning_rate": 2.6302941176470587e-05,
"loss": 6.8612,
"step": 620
},
{
"epoch": 1.75,
"grad_norm": 23.022014617919922,
"learning_rate": 2.621470588235294e-05,
"loss": 6.815,
"step": 630
},
{
"epoch": 1.7777777777777777,
"grad_norm": 19.738136291503906,
"learning_rate": 2.6126470588235294e-05,
"loss": 7.0848,
"step": 640
},
{
"epoch": 1.8055555555555556,
"grad_norm": 19.904956817626953,
"learning_rate": 2.6038235294117646e-05,
"loss": 7.1127,
"step": 650
},
{
"epoch": 1.8333333333333335,
"grad_norm": 20.379528045654297,
"learning_rate": 2.595e-05,
"loss": 7.1542,
"step": 660
},
{
"epoch": 1.8611111111111112,
"grad_norm": 17.21288299560547,
"learning_rate": 2.5861764705882352e-05,
"loss": 6.4236,
"step": 670
},
{
"epoch": 1.8888888888888888,
"grad_norm": 19.684776306152344,
"learning_rate": 2.5773529411764707e-05,
"loss": 6.97,
"step": 680
},
{
"epoch": 1.9166666666666665,
"grad_norm": 22.62903594970703,
"learning_rate": 2.568529411764706e-05,
"loss": 6.7521,
"step": 690
},
{
"epoch": 1.9444444444444444,
"grad_norm": 18.688766479492188,
"learning_rate": 2.5597058823529414e-05,
"loss": 6.7485,
"step": 700
},
{
"epoch": 1.9722222222222223,
"grad_norm": 19.558578491210938,
"learning_rate": 2.5508823529411765e-05,
"loss": 6.6582,
"step": 710
},
{
"epoch": 2.0,
"grad_norm": 20.0054874420166,
"learning_rate": 2.542058823529412e-05,
"loss": 6.4649,
"step": 720
},
{
"epoch": 2.0277777777777777,
"grad_norm": 17.48706817626953,
"learning_rate": 2.533235294117647e-05,
"loss": 6.0975,
"step": 730
},
{
"epoch": 2.0555555555555554,
"grad_norm": 21.419391632080078,
"learning_rate": 2.5244117647058823e-05,
"loss": 5.8201,
"step": 740
},
{
"epoch": 2.0833333333333335,
"grad_norm": 19.62574005126953,
"learning_rate": 2.5155882352941175e-05,
"loss": 5.792,
"step": 750
},
{
"epoch": 2.111111111111111,
"grad_norm": 24.182546615600586,
"learning_rate": 2.506764705882353e-05,
"loss": 6.002,
"step": 760
},
{
"epoch": 2.138888888888889,
"grad_norm": 17.403255462646484,
"learning_rate": 2.497941176470588e-05,
"loss": 5.8743,
"step": 770
},
{
"epoch": 2.1666666666666665,
"grad_norm": 19.421232223510742,
"learning_rate": 2.4891176470588237e-05,
"loss": 6.1047,
"step": 780
},
{
"epoch": 2.1944444444444446,
"grad_norm": 22.9908504486084,
"learning_rate": 2.4802941176470588e-05,
"loss": 5.9951,
"step": 790
},
{
"epoch": 2.2222222222222223,
"grad_norm": 18.04509925842285,
"learning_rate": 2.4714705882352943e-05,
"loss": 6.1774,
"step": 800
},
{
"epoch": 2.25,
"grad_norm": 16.365985870361328,
"learning_rate": 2.4626470588235295e-05,
"loss": 5.9879,
"step": 810
},
{
"epoch": 2.2777777777777777,
"grad_norm": 19.309423446655273,
"learning_rate": 2.453823529411765e-05,
"loss": 5.9292,
"step": 820
},
{
"epoch": 2.3055555555555554,
"grad_norm": 24.4521541595459,
"learning_rate": 2.4449999999999998e-05,
"loss": 6.0532,
"step": 830
},
{
"epoch": 2.3333333333333335,
"grad_norm": 22.1500244140625,
"learning_rate": 2.4361764705882353e-05,
"loss": 5.7139,
"step": 840
},
{
"epoch": 2.361111111111111,
"grad_norm": 18.496788024902344,
"learning_rate": 2.4273529411764705e-05,
"loss": 5.9176,
"step": 850
},
{
"epoch": 2.388888888888889,
"grad_norm": 19.089303970336914,
"learning_rate": 2.418529411764706e-05,
"loss": 6.0821,
"step": 860
},
{
"epoch": 2.4166666666666665,
"grad_norm": 18.071590423583984,
"learning_rate": 2.409705882352941e-05,
"loss": 5.9829,
"step": 870
},
{
"epoch": 2.4444444444444446,
"grad_norm": 19.58377456665039,
"learning_rate": 2.4008823529411766e-05,
"loss": 5.894,
"step": 880
},
{
"epoch": 2.4722222222222223,
"grad_norm": 20.550006866455078,
"learning_rate": 2.3920588235294118e-05,
"loss": 6.0293,
"step": 890
},
{
"epoch": 2.5,
"grad_norm": 22.616382598876953,
"learning_rate": 2.3832352941176473e-05,
"loss": 6.037,
"step": 900
},
{
"epoch": 2.5277777777777777,
"grad_norm": 22.372953414916992,
"learning_rate": 2.3744117647058824e-05,
"loss": 5.9353,
"step": 910
},
{
"epoch": 2.5555555555555554,
"grad_norm": 18.03106117248535,
"learning_rate": 2.365588235294118e-05,
"loss": 5.8287,
"step": 920
},
{
"epoch": 2.5833333333333335,
"grad_norm": 16.170204162597656,
"learning_rate": 2.356764705882353e-05,
"loss": 5.7742,
"step": 930
},
{
"epoch": 2.611111111111111,
"grad_norm": 17.15852165222168,
"learning_rate": 2.3479411764705882e-05,
"loss": 6.0491,
"step": 940
},
{
"epoch": 2.638888888888889,
"grad_norm": 20.76968002319336,
"learning_rate": 2.3391176470588234e-05,
"loss": 5.9122,
"step": 950
},
{
"epoch": 2.6666666666666665,
"grad_norm": 21.134782791137695,
"learning_rate": 2.330294117647059e-05,
"loss": 5.8734,
"step": 960
},
{
"epoch": 2.6944444444444446,
"grad_norm": 18.72968864440918,
"learning_rate": 2.321470588235294e-05,
"loss": 5.9093,
"step": 970
},
{
"epoch": 2.7222222222222223,
"grad_norm": 23.75607681274414,
"learning_rate": 2.3126470588235296e-05,
"loss": 5.9551,
"step": 980
},
{
"epoch": 2.75,
"grad_norm": 19.960378646850586,
"learning_rate": 2.3038235294117647e-05,
"loss": 6.0045,
"step": 990
},
{
"epoch": 2.7777777777777777,
"grad_norm": 16.736736297607422,
"learning_rate": 2.2950000000000002e-05,
"loss": 5.9759,
"step": 1000
},
{
"epoch": 2.8055555555555554,
"grad_norm": 22.561372756958008,
"learning_rate": 2.2861764705882354e-05,
"loss": 5.9796,
"step": 1010
},
{
"epoch": 2.8333333333333335,
"grad_norm": 16.96518898010254,
"learning_rate": 2.277352941176471e-05,
"loss": 6.0321,
"step": 1020
},
{
"epoch": 2.861111111111111,
"grad_norm": 20.031225204467773,
"learning_rate": 2.268529411764706e-05,
"loss": 5.9221,
"step": 1030
},
{
"epoch": 2.888888888888889,
"grad_norm": 16.219301223754883,
"learning_rate": 2.2597058823529415e-05,
"loss": 6.0382,
"step": 1040
},
{
"epoch": 2.9166666666666665,
"grad_norm": 19.537729263305664,
"learning_rate": 2.2508823529411764e-05,
"loss": 5.8556,
"step": 1050
},
{
"epoch": 2.9444444444444446,
"grad_norm": 20.713224411010742,
"learning_rate": 2.242058823529412e-05,
"loss": 5.9832,
"step": 1060
},
{
"epoch": 2.9722222222222223,
"grad_norm": 18.94963836669922,
"learning_rate": 2.233235294117647e-05,
"loss": 6.0023,
"step": 1070
},
{
"epoch": 3.0,
"grad_norm": 20.285676956176758,
"learning_rate": 2.2244117647058825e-05,
"loss": 5.8155,
"step": 1080
},
{
"epoch": 3.0277777777777777,
"grad_norm": 17.27924346923828,
"learning_rate": 2.2155882352941177e-05,
"loss": 5.2819,
"step": 1090
},
{
"epoch": 3.0555555555555554,
"grad_norm": 21.477184295654297,
"learning_rate": 2.206764705882353e-05,
"loss": 5.1979,
"step": 1100
},
{
"epoch": 3.0833333333333335,
"grad_norm": 15.144262313842773,
"learning_rate": 2.1979411764705883e-05,
"loss": 5.1172,
"step": 1110
},
{
"epoch": 3.111111111111111,
"grad_norm": 22.37119483947754,
"learning_rate": 2.1891176470588238e-05,
"loss": 5.2262,
"step": 1120
},
{
"epoch": 3.138888888888889,
"grad_norm": 20.329545974731445,
"learning_rate": 2.180294117647059e-05,
"loss": 5.2281,
"step": 1130
},
{
"epoch": 3.1666666666666665,
"grad_norm": 17.549114227294922,
"learning_rate": 2.1714705882352945e-05,
"loss": 5.1716,
"step": 1140
},
{
"epoch": 3.1944444444444446,
"grad_norm": 16.489368438720703,
"learning_rate": 2.1626470588235293e-05,
"loss": 5.3562,
"step": 1150
},
{
"epoch": 3.2222222222222223,
"grad_norm": 18.964805603027344,
"learning_rate": 2.1538235294117648e-05,
"loss": 5.5189,
"step": 1160
},
{
"epoch": 3.25,
"grad_norm": 18.049579620361328,
"learning_rate": 2.145e-05,
"loss": 5.1967,
"step": 1170
},
{
"epoch": 3.2777777777777777,
"grad_norm": 18.76259422302246,
"learning_rate": 2.136176470588235e-05,
"loss": 5.2686,
"step": 1180
},
{
"epoch": 3.3055555555555554,
"grad_norm": 20.977890014648438,
"learning_rate": 2.1273529411764706e-05,
"loss": 5.1385,
"step": 1190
},
{
"epoch": 3.3333333333333335,
"grad_norm": 19.02361488342285,
"learning_rate": 2.1185294117647058e-05,
"loss": 5.3758,
"step": 1200
},
{
"epoch": 3.361111111111111,
"grad_norm": 17.559446334838867,
"learning_rate": 2.1097058823529413e-05,
"loss": 5.3187,
"step": 1210
},
{
"epoch": 3.388888888888889,
"grad_norm": 21.838626861572266,
"learning_rate": 2.1008823529411764e-05,
"loss": 5.3414,
"step": 1220
},
{
"epoch": 3.4166666666666665,
"grad_norm": 19.832502365112305,
"learning_rate": 2.092058823529412e-05,
"loss": 5.3392,
"step": 1230
},
{
"epoch": 3.4444444444444446,
"grad_norm": 17.20013427734375,
"learning_rate": 2.083235294117647e-05,
"loss": 5.2786,
"step": 1240
},
{
"epoch": 3.4722222222222223,
"grad_norm": 19.66887855529785,
"learning_rate": 2.0744117647058826e-05,
"loss": 5.3352,
"step": 1250
},
{
"epoch": 3.5,
"grad_norm": 19.046735763549805,
"learning_rate": 2.0655882352941174e-05,
"loss": 5.2161,
"step": 1260
},
{
"epoch": 3.5277777777777777,
"grad_norm": 17.690959930419922,
"learning_rate": 2.056764705882353e-05,
"loss": 5.3777,
"step": 1270
},
{
"epoch": 3.5555555555555554,
"grad_norm": 19.2017822265625,
"learning_rate": 2.047941176470588e-05,
"loss": 5.3262,
"step": 1280
},
{
"epoch": 3.5833333333333335,
"grad_norm": 22.48141098022461,
"learning_rate": 2.0391176470588236e-05,
"loss": 5.2689,
"step": 1290
},
{
"epoch": 3.611111111111111,
"grad_norm": 21.33321189880371,
"learning_rate": 2.0302941176470587e-05,
"loss": 5.1867,
"step": 1300
},
{
"epoch": 3.638888888888889,
"grad_norm": 20.216909408569336,
"learning_rate": 2.0214705882352942e-05,
"loss": 5.3314,
"step": 1310
},
{
"epoch": 3.6666666666666665,
"grad_norm": 19.15237808227539,
"learning_rate": 2.0126470588235294e-05,
"loss": 5.3206,
"step": 1320
},
{
"epoch": 3.6944444444444446,
"grad_norm": 17.66857147216797,
"learning_rate": 2.003823529411765e-05,
"loss": 5.4498,
"step": 1330
},
{
"epoch": 3.7222222222222223,
"grad_norm": 20.253646850585938,
"learning_rate": 1.995e-05,
"loss": 5.4104,
"step": 1340
},
{
"epoch": 3.75,
"grad_norm": 19.723072052001953,
"learning_rate": 1.9861764705882355e-05,
"loss": 5.214,
"step": 1350
},
{
"epoch": 3.7777777777777777,
"grad_norm": 17.601451873779297,
"learning_rate": 1.9773529411764704e-05,
"loss": 5.356,
"step": 1360
},
{
"epoch": 3.8055555555555554,
"grad_norm": 19.1397705078125,
"learning_rate": 1.968529411764706e-05,
"loss": 5.4965,
"step": 1370
},
{
"epoch": 3.8333333333333335,
"grad_norm": 18.656843185424805,
"learning_rate": 1.959705882352941e-05,
"loss": 5.0704,
"step": 1380
},
{
"epoch": 3.861111111111111,
"grad_norm": 16.182132720947266,
"learning_rate": 1.9508823529411765e-05,
"loss": 5.1655,
"step": 1390
},
{
"epoch": 3.888888888888889,
"grad_norm": 18.828876495361328,
"learning_rate": 1.9420588235294117e-05,
"loss": 5.0816,
"step": 1400
},
{
"epoch": 3.9166666666666665,
"grad_norm": 18.60540199279785,
"learning_rate": 1.9332352941176472e-05,
"loss": 5.2968,
"step": 1410
},
{
"epoch": 3.9444444444444446,
"grad_norm": 17.49435806274414,
"learning_rate": 1.9244117647058823e-05,
"loss": 5.3383,
"step": 1420
},
{
"epoch": 3.9722222222222223,
"grad_norm": 19.32575798034668,
"learning_rate": 1.915588235294118e-05,
"loss": 5.1811,
"step": 1430
},
{
"epoch": 4.0,
"grad_norm": 23.117151260375977,
"learning_rate": 1.906764705882353e-05,
"loss": 5.3958,
"step": 1440
},
{
"epoch": 4.027777777777778,
"grad_norm": 14.918752670288086,
"learning_rate": 1.8979411764705885e-05,
"loss": 4.7873,
"step": 1450
},
{
"epoch": 4.055555555555555,
"grad_norm": 16.663442611694336,
"learning_rate": 1.8891176470588236e-05,
"loss": 4.812,
"step": 1460
},
{
"epoch": 4.083333333333333,
"grad_norm": 17.157150268554688,
"learning_rate": 1.8802941176470588e-05,
"loss": 4.7987,
"step": 1470
},
{
"epoch": 4.111111111111111,
"grad_norm": 18.143966674804688,
"learning_rate": 1.871470588235294e-05,
"loss": 4.6237,
"step": 1480
},
{
"epoch": 4.138888888888889,
"grad_norm": 16.317506790161133,
"learning_rate": 1.8626470588235295e-05,
"loss": 4.6253,
"step": 1490
},
{
"epoch": 4.166666666666667,
"grad_norm": 23.109792709350586,
"learning_rate": 1.8538235294117646e-05,
"loss": 4.7575,
"step": 1500
},
{
"epoch": 4.194444444444445,
"grad_norm": 16.878374099731445,
"learning_rate": 1.845e-05,
"loss": 4.6636,
"step": 1510
},
{
"epoch": 4.222222222222222,
"grad_norm": 18.059541702270508,
"learning_rate": 1.8361764705882353e-05,
"loss": 4.7979,
"step": 1520
},
{
"epoch": 4.25,
"grad_norm": 16.914222717285156,
"learning_rate": 1.8273529411764708e-05,
"loss": 4.7786,
"step": 1530
},
{
"epoch": 4.277777777777778,
"grad_norm": 18.730314254760742,
"learning_rate": 1.818529411764706e-05,
"loss": 4.7931,
"step": 1540
},
{
"epoch": 4.305555555555555,
"grad_norm": 17.19397735595703,
"learning_rate": 1.8097058823529414e-05,
"loss": 4.6813,
"step": 1550
},
{
"epoch": 4.333333333333333,
"grad_norm": 17.824308395385742,
"learning_rate": 1.8008823529411766e-05,
"loss": 4.8799,
"step": 1560
},
{
"epoch": 4.361111111111111,
"grad_norm": 17.733112335205078,
"learning_rate": 1.792058823529412e-05,
"loss": 4.7475,
"step": 1570
},
{
"epoch": 4.388888888888889,
"grad_norm": 17.163602828979492,
"learning_rate": 1.783235294117647e-05,
"loss": 4.8217,
"step": 1580
},
{
"epoch": 4.416666666666667,
"grad_norm": 19.913904190063477,
"learning_rate": 1.7744117647058824e-05,
"loss": 4.8949,
"step": 1590
},
{
"epoch": 4.444444444444445,
"grad_norm": 18.19684600830078,
"learning_rate": 1.7655882352941176e-05,
"loss": 4.9527,
"step": 1600
},
{
"epoch": 4.472222222222222,
"grad_norm": 19.120418548583984,
"learning_rate": 1.756764705882353e-05,
"loss": 4.8928,
"step": 1610
},
{
"epoch": 4.5,
"grad_norm": 18.0157470703125,
"learning_rate": 1.7479411764705882e-05,
"loss": 4.8177,
"step": 1620
},
{
"epoch": 4.527777777777778,
"grad_norm": 20.340044021606445,
"learning_rate": 1.7391176470588237e-05,
"loss": 4.6013,
"step": 1630
},
{
"epoch": 4.555555555555555,
"grad_norm": 16.55755043029785,
"learning_rate": 1.730294117647059e-05,
"loss": 4.8209,
"step": 1640
},
{
"epoch": 4.583333333333333,
"grad_norm": 17.506793975830078,
"learning_rate": 1.7214705882352944e-05,
"loss": 5.0675,
"step": 1650
},
{
"epoch": 4.611111111111111,
"grad_norm": 22.037546157836914,
"learning_rate": 1.7126470588235295e-05,
"loss": 4.8392,
"step": 1660
},
{
"epoch": 4.638888888888889,
"grad_norm": 17.09819221496582,
"learning_rate": 1.703823529411765e-05,
"loss": 4.7789,
"step": 1670
},
{
"epoch": 4.666666666666667,
"grad_norm": 23.13201904296875,
"learning_rate": 1.695e-05,
"loss": 4.8017,
"step": 1680
},
{
"epoch": 4.694444444444445,
"grad_norm": 19.751766204833984,
"learning_rate": 1.6861764705882354e-05,
"loss": 4.7095,
"step": 1690
},
{
"epoch": 4.722222222222222,
"grad_norm": 17.58235740661621,
"learning_rate": 1.6773529411764705e-05,
"loss": 4.7251,
"step": 1700
},
{
"epoch": 4.75,
"grad_norm": 17.987777709960938,
"learning_rate": 1.668529411764706e-05,
"loss": 4.8577,
"step": 1710
},
{
"epoch": 4.777777777777778,
"grad_norm": 21.950464248657227,
"learning_rate": 1.6597058823529412e-05,
"loss": 4.8194,
"step": 1720
},
{
"epoch": 4.805555555555555,
"grad_norm": 18.807058334350586,
"learning_rate": 1.6508823529411767e-05,
"loss": 4.8405,
"step": 1730
},
{
"epoch": 4.833333333333333,
"grad_norm": 19.066274642944336,
"learning_rate": 1.642058823529412e-05,
"loss": 4.8167,
"step": 1740
},
{
"epoch": 4.861111111111111,
"grad_norm": 16.53117561340332,
"learning_rate": 1.633235294117647e-05,
"loss": 4.9061,
"step": 1750
},
{
"epoch": 4.888888888888889,
"grad_norm": 19.067041397094727,
"learning_rate": 1.6244117647058825e-05,
"loss": 4.772,
"step": 1760
},
{
"epoch": 4.916666666666667,
"grad_norm": 17.190061569213867,
"learning_rate": 1.6155882352941177e-05,
"loss": 4.8326,
"step": 1770
},
{
"epoch": 4.944444444444445,
"grad_norm": 22.074304580688477,
"learning_rate": 1.606764705882353e-05,
"loss": 5.0498,
"step": 1780
},
{
"epoch": 4.972222222222222,
"grad_norm": 17.11249542236328,
"learning_rate": 1.597941176470588e-05,
"loss": 4.8728,
"step": 1790
},
{
"epoch": 5.0,
"grad_norm": 17.489036560058594,
"learning_rate": 1.5891176470588235e-05,
"loss": 4.7712,
"step": 1800
},
{
"epoch": 5.027777777777778,
"grad_norm": 18.673629760742188,
"learning_rate": 1.5802941176470586e-05,
"loss": 4.318,
"step": 1810
},
{
"epoch": 5.055555555555555,
"grad_norm": 18.343915939331055,
"learning_rate": 1.571470588235294e-05,
"loss": 4.4046,
"step": 1820
},
{
"epoch": 5.083333333333333,
"grad_norm": 18.69623565673828,
"learning_rate": 1.5626470588235293e-05,
"loss": 4.4717,
"step": 1830
},
{
"epoch": 5.111111111111111,
"grad_norm": 17.767257690429688,
"learning_rate": 1.5538235294117648e-05,
"loss": 4.3915,
"step": 1840
},
{
"epoch": 5.138888888888889,
"grad_norm": 18.257091522216797,
"learning_rate": 1.545e-05,
"loss": 4.4195,
"step": 1850
},
{
"epoch": 5.166666666666667,
"grad_norm": 18.255918502807617,
"learning_rate": 1.5361764705882354e-05,
"loss": 4.466,
"step": 1860
},
{
"epoch": 5.194444444444445,
"grad_norm": 17.16253089904785,
"learning_rate": 1.5273529411764706e-05,
"loss": 4.4639,
"step": 1870
},
{
"epoch": 5.222222222222222,
"grad_norm": 23.036542892456055,
"learning_rate": 1.5185294117647061e-05,
"loss": 4.4155,
"step": 1880
},
{
"epoch": 5.25,
"grad_norm": 16.37624168395996,
"learning_rate": 1.5097058823529414e-05,
"loss": 4.4856,
"step": 1890
},
{
"epoch": 5.277777777777778,
"grad_norm": 15.804655075073242,
"learning_rate": 1.5008823529411764e-05,
"loss": 4.3916,
"step": 1900
},
{
"epoch": 5.305555555555555,
"grad_norm": 18.654705047607422,
"learning_rate": 1.492058823529412e-05,
"loss": 4.5315,
"step": 1910
},
{
"epoch": 5.333333333333333,
"grad_norm": 17.40276336669922,
"learning_rate": 1.483235294117647e-05,
"loss": 4.4648,
"step": 1920
},
{
"epoch": 5.361111111111111,
"grad_norm": 14.804203987121582,
"learning_rate": 1.4744117647058824e-05,
"loss": 4.5326,
"step": 1930
},
{
"epoch": 5.388888888888889,
"grad_norm": 18.393539428710938,
"learning_rate": 1.4655882352941177e-05,
"loss": 4.3771,
"step": 1940
},
{
"epoch": 5.416666666666667,
"grad_norm": 21.74736213684082,
"learning_rate": 1.456764705882353e-05,
"loss": 4.424,
"step": 1950
},
{
"epoch": 5.444444444444445,
"grad_norm": 18.945545196533203,
"learning_rate": 1.4479411764705884e-05,
"loss": 4.3918,
"step": 1960
},
{
"epoch": 5.472222222222222,
"grad_norm": 21.048032760620117,
"learning_rate": 1.4391176470588237e-05,
"loss": 4.4165,
"step": 1970
},
{
"epoch": 5.5,
"grad_norm": 17.606342315673828,
"learning_rate": 1.4302941176470589e-05,
"loss": 4.4718,
"step": 1980
},
{
"epoch": 5.527777777777778,
"grad_norm": 16.075519561767578,
"learning_rate": 1.4214705882352942e-05,
"loss": 4.3442,
"step": 1990
},
{
"epoch": 5.555555555555555,
"grad_norm": 15.067805290222168,
"learning_rate": 1.4126470588235295e-05,
"loss": 4.4641,
"step": 2000
},
{
"epoch": 5.583333333333333,
"grad_norm": 16.70688819885254,
"learning_rate": 1.4038235294117649e-05,
"loss": 4.4619,
"step": 2010
},
{
"epoch": 5.611111111111111,
"grad_norm": 20.45810317993164,
"learning_rate": 1.395e-05,
"loss": 4.4448,
"step": 2020
},
{
"epoch": 5.638888888888889,
"grad_norm": 19.671323776245117,
"learning_rate": 1.3861764705882352e-05,
"loss": 4.5246,
"step": 2030
},
{
"epoch": 5.666666666666667,
"grad_norm": 17.035329818725586,
"learning_rate": 1.3773529411764705e-05,
"loss": 4.3425,
"step": 2040
},
{
"epoch": 5.694444444444445,
"grad_norm": 18.257850646972656,
"learning_rate": 1.3685294117647058e-05,
"loss": 4.3883,
"step": 2050
},
{
"epoch": 5.722222222222222,
"grad_norm": 17.29006004333496,
"learning_rate": 1.3597058823529412e-05,
"loss": 4.4048,
"step": 2060
},
{
"epoch": 5.75,
"grad_norm": 16.57717514038086,
"learning_rate": 1.3508823529411765e-05,
"loss": 4.4456,
"step": 2070
},
{
"epoch": 5.777777777777778,
"grad_norm": 21.21561622619629,
"learning_rate": 1.3420588235294117e-05,
"loss": 4.5329,
"step": 2080
},
{
"epoch": 5.805555555555555,
"grad_norm": 18.407976150512695,
"learning_rate": 1.333235294117647e-05,
"loss": 4.6047,
"step": 2090
},
{
"epoch": 5.833333333333333,
"grad_norm": 15.929880142211914,
"learning_rate": 1.3244117647058823e-05,
"loss": 4.421,
"step": 2100
},
{
"epoch": 5.861111111111111,
"grad_norm": 16.92407989501953,
"learning_rate": 1.3155882352941176e-05,
"loss": 4.4857,
"step": 2110
},
{
"epoch": 5.888888888888889,
"grad_norm": 17.74669647216797,
"learning_rate": 1.306764705882353e-05,
"loss": 4.4654,
"step": 2120
},
{
"epoch": 5.916666666666667,
"grad_norm": 19.744373321533203,
"learning_rate": 1.2979411764705881e-05,
"loss": 4.3837,
"step": 2130
},
{
"epoch": 5.944444444444445,
"grad_norm": 15.323151588439941,
"learning_rate": 1.2891176470588235e-05,
"loss": 4.3457,
"step": 2140
},
{
"epoch": 5.972222222222222,
"grad_norm": 18.93714141845703,
"learning_rate": 1.2802941176470588e-05,
"loss": 4.3657,
"step": 2150
},
{
"epoch": 6.0,
"grad_norm": 19.87427520751953,
"learning_rate": 1.2714705882352941e-05,
"loss": 4.4179,
"step": 2160
},
{
"epoch": 6.027777777777778,
"grad_norm": 15.826274871826172,
"learning_rate": 1.2626470588235295e-05,
"loss": 4.1858,
"step": 2170
},
{
"epoch": 6.055555555555555,
"grad_norm": 16.08074378967285,
"learning_rate": 1.2538235294117648e-05,
"loss": 4.1317,
"step": 2180
},
{
"epoch": 6.083333333333333,
"grad_norm": 15.976150512695312,
"learning_rate": 1.245e-05,
"loss": 4.1214,
"step": 2190
},
{
"epoch": 6.111111111111111,
"grad_norm": 17.065427780151367,
"learning_rate": 1.2361764705882353e-05,
"loss": 4.0471,
"step": 2200
},
{
"epoch": 6.138888888888889,
"grad_norm": 15.556553840637207,
"learning_rate": 1.2273529411764706e-05,
"loss": 4.0961,
"step": 2210
},
{
"epoch": 6.166666666666667,
"grad_norm": 16.178955078125,
"learning_rate": 1.218529411764706e-05,
"loss": 4.1129,
"step": 2220
},
{
"epoch": 6.194444444444445,
"grad_norm": 17.971904754638672,
"learning_rate": 1.2097058823529413e-05,
"loss": 4.111,
"step": 2230
},
{
"epoch": 6.222222222222222,
"grad_norm": 17.23676109313965,
"learning_rate": 1.2008823529411764e-05,
"loss": 4.209,
"step": 2240
},
{
"epoch": 6.25,
"grad_norm": 18.453350067138672,
"learning_rate": 1.1920588235294117e-05,
"loss": 4.1997,
"step": 2250
},
{
"epoch": 6.277777777777778,
"grad_norm": 17.884140014648438,
"learning_rate": 1.183235294117647e-05,
"loss": 4.1343,
"step": 2260
},
{
"epoch": 6.305555555555555,
"grad_norm": 18.340280532836914,
"learning_rate": 1.1744117647058824e-05,
"loss": 4.1262,
"step": 2270
},
{
"epoch": 6.333333333333333,
"grad_norm": 18.307395935058594,
"learning_rate": 1.1655882352941177e-05,
"loss": 4.1307,
"step": 2280
},
{
"epoch": 6.361111111111111,
"grad_norm": 17.725727081298828,
"learning_rate": 1.1567647058823529e-05,
"loss": 4.0369,
"step": 2290
},
{
"epoch": 6.388888888888889,
"grad_norm": 14.209752082824707,
"learning_rate": 1.1479411764705882e-05,
"loss": 4.1071,
"step": 2300
},
{
"epoch": 6.416666666666667,
"grad_norm": 21.862701416015625,
"learning_rate": 1.1391176470588235e-05,
"loss": 4.223,
"step": 2310
},
{
"epoch": 6.444444444444445,
"grad_norm": 13.577404975891113,
"learning_rate": 1.1302941176470589e-05,
"loss": 4.2441,
"step": 2320
},
{
"epoch": 6.472222222222222,
"grad_norm": 16.62024688720703,
"learning_rate": 1.1214705882352942e-05,
"loss": 4.1793,
"step": 2330
},
{
"epoch": 6.5,
"grad_norm": 17.887189865112305,
"learning_rate": 1.1126470588235295e-05,
"loss": 4.1837,
"step": 2340
},
{
"epoch": 6.527777777777778,
"grad_norm": 16.476367950439453,
"learning_rate": 1.1038235294117647e-05,
"loss": 4.1526,
"step": 2350
},
{
"epoch": 6.555555555555555,
"grad_norm": 17.012876510620117,
"learning_rate": 1.095e-05,
"loss": 4.1965,
"step": 2360
},
{
"epoch": 6.583333333333333,
"grad_norm": 17.46245765686035,
"learning_rate": 1.0861764705882353e-05,
"loss": 4.1394,
"step": 2370
},
{
"epoch": 6.611111111111111,
"grad_norm": 17.8790340423584,
"learning_rate": 1.0773529411764707e-05,
"loss": 4.1404,
"step": 2380
},
{
"epoch": 6.638888888888889,
"grad_norm": 16.047483444213867,
"learning_rate": 1.068529411764706e-05,
"loss": 4.1839,
"step": 2390
},
{
"epoch": 6.666666666666667,
"grad_norm": 15.634733200073242,
"learning_rate": 1.0597058823529412e-05,
"loss": 4.1608,
"step": 2400
},
{
"epoch": 6.694444444444445,
"grad_norm": 15.73880386352539,
"learning_rate": 1.0508823529411765e-05,
"loss": 4.1989,
"step": 2410
},
{
"epoch": 6.722222222222222,
"grad_norm": 18.751541137695312,
"learning_rate": 1.0420588235294118e-05,
"loss": 4.215,
"step": 2420
},
{
"epoch": 6.75,
"grad_norm": 18.6307430267334,
"learning_rate": 1.0332352941176472e-05,
"loss": 4.1345,
"step": 2430
},
{
"epoch": 6.777777777777778,
"grad_norm": 18.48142433166504,
"learning_rate": 1.0244117647058825e-05,
"loss": 4.2856,
"step": 2440
},
{
"epoch": 6.805555555555555,
"grad_norm": 17.484237670898438,
"learning_rate": 1.0155882352941176e-05,
"loss": 4.223,
"step": 2450
},
{
"epoch": 6.833333333333333,
"grad_norm": 16.67469024658203,
"learning_rate": 1.006764705882353e-05,
"loss": 4.1748,
"step": 2460
},
{
"epoch": 6.861111111111111,
"grad_norm": 18.385995864868164,
"learning_rate": 9.979411764705883e-06,
"loss": 4.1886,
"step": 2470
},
{
"epoch": 6.888888888888889,
"grad_norm": 20.845624923706055,
"learning_rate": 9.891176470588236e-06,
"loss": 4.1832,
"step": 2480
},
{
"epoch": 6.916666666666667,
"grad_norm": 18.856298446655273,
"learning_rate": 9.80294117647059e-06,
"loss": 4.2403,
"step": 2490
},
{
"epoch": 6.944444444444445,
"grad_norm": 16.965234756469727,
"learning_rate": 9.714705882352943e-06,
"loss": 4.2439,
"step": 2500
},
{
"epoch": 6.972222222222222,
"grad_norm": 16.04508399963379,
"learning_rate": 9.626470588235294e-06,
"loss": 4.1588,
"step": 2510
},
{
"epoch": 7.0,
"grad_norm": 17.192468643188477,
"learning_rate": 9.538235294117648e-06,
"loss": 4.2357,
"step": 2520
},
{
"epoch": 7.027777777777778,
"grad_norm": 14.738517761230469,
"learning_rate": 9.450000000000001e-06,
"loss": 3.8684,
"step": 2530
},
{
"epoch": 7.055555555555555,
"grad_norm": 16.8319034576416,
"learning_rate": 9.361764705882354e-06,
"loss": 3.8483,
"step": 2540
},
{
"epoch": 7.083333333333333,
"grad_norm": 14.79024600982666,
"learning_rate": 9.273529411764708e-06,
"loss": 3.9072,
"step": 2550
},
{
"epoch": 7.111111111111111,
"grad_norm": 12.333316802978516,
"learning_rate": 9.18529411764706e-06,
"loss": 3.9609,
"step": 2560
},
{
"epoch": 7.138888888888889,
"grad_norm": 14.567424774169922,
"learning_rate": 9.097058823529412e-06,
"loss": 3.9129,
"step": 2570
},
{
"epoch": 7.166666666666667,
"grad_norm": 18.647308349609375,
"learning_rate": 9.008823529411766e-06,
"loss": 3.9278,
"step": 2580
},
{
"epoch": 7.194444444444445,
"grad_norm": 16.815906524658203,
"learning_rate": 8.920588235294117e-06,
"loss": 3.9316,
"step": 2590
},
{
"epoch": 7.222222222222222,
"grad_norm": 19.90522575378418,
"learning_rate": 8.83235294117647e-06,
"loss": 3.9444,
"step": 2600
},
{
"epoch": 7.25,
"grad_norm": 15.773906707763672,
"learning_rate": 8.744117647058822e-06,
"loss": 4.037,
"step": 2610
},
{
"epoch": 7.277777777777778,
"grad_norm": 14.414016723632812,
"learning_rate": 8.655882352941176e-06,
"loss": 3.9834,
"step": 2620
},
{
"epoch": 7.305555555555555,
"grad_norm": 18.313098907470703,
"learning_rate": 8.567647058823529e-06,
"loss": 4.0229,
"step": 2630
},
{
"epoch": 7.333333333333333,
"grad_norm": 15.021878242492676,
"learning_rate": 8.479411764705882e-06,
"loss": 3.9891,
"step": 2640
},
{
"epoch": 7.361111111111111,
"grad_norm": 16.228864669799805,
"learning_rate": 8.391176470588235e-06,
"loss": 3.9871,
"step": 2650
},
{
"epoch": 7.388888888888889,
"grad_norm": 11.939830780029297,
"learning_rate": 8.302941176470589e-06,
"loss": 3.8708,
"step": 2660
},
{
"epoch": 7.416666666666667,
"grad_norm": 13.20190143585205,
"learning_rate": 8.21470588235294e-06,
"loss": 3.8982,
"step": 2670
},
{
"epoch": 7.444444444444445,
"grad_norm": 15.111628532409668,
"learning_rate": 8.126470588235294e-06,
"loss": 3.9871,
"step": 2680
},
{
"epoch": 7.472222222222222,
"grad_norm": 15.908904075622559,
"learning_rate": 8.038235294117647e-06,
"loss": 3.976,
"step": 2690
},
{
"epoch": 7.5,
"grad_norm": 18.098617553710938,
"learning_rate": 7.95e-06,
"loss": 4.0415,
"step": 2700
},
{
"epoch": 7.527777777777778,
"grad_norm": 16.921981811523438,
"learning_rate": 7.861764705882353e-06,
"loss": 4.0462,
"step": 2710
},
{
"epoch": 7.555555555555555,
"grad_norm": 14.457706451416016,
"learning_rate": 7.773529411764705e-06,
"loss": 3.9405,
"step": 2720
},
{
"epoch": 7.583333333333333,
"grad_norm": 17.50982093811035,
"learning_rate": 7.685294117647058e-06,
"loss": 3.9974,
"step": 2730
},
{
"epoch": 7.611111111111111,
"grad_norm": 15.285282135009766,
"learning_rate": 7.5970588235294116e-06,
"loss": 3.9969,
"step": 2740
},
{
"epoch": 7.638888888888889,
"grad_norm": 14.417180061340332,
"learning_rate": 7.508823529411765e-06,
"loss": 3.97,
"step": 2750
},
{
"epoch": 7.666666666666667,
"grad_norm": 14.781904220581055,
"learning_rate": 7.420588235294117e-06,
"loss": 3.9797,
"step": 2760
},
{
"epoch": 7.694444444444445,
"grad_norm": 16.317949295043945,
"learning_rate": 7.332352941176471e-06,
"loss": 3.9236,
"step": 2770
},
{
"epoch": 7.722222222222222,
"grad_norm": 16.340526580810547,
"learning_rate": 7.244117647058824e-06,
"loss": 4.0323,
"step": 2780
},
{
"epoch": 7.75,
"grad_norm": 13.924703598022461,
"learning_rate": 7.155882352941176e-06,
"loss": 3.8719,
"step": 2790
},
{
"epoch": 7.777777777777778,
"grad_norm": 19.019325256347656,
"learning_rate": 7.06764705882353e-06,
"loss": 3.9529,
"step": 2800
},
{
"epoch": 7.805555555555555,
"grad_norm": 17.188499450683594,
"learning_rate": 6.979411764705882e-06,
"loss": 4.0392,
"step": 2810
},
{
"epoch": 7.833333333333333,
"grad_norm": 13.893120765686035,
"learning_rate": 6.891176470588235e-06,
"loss": 3.9727,
"step": 2820
},
{
"epoch": 7.861111111111111,
"grad_norm": 13.257885932922363,
"learning_rate": 6.802941176470589e-06,
"loss": 4.0164,
"step": 2830
},
{
"epoch": 7.888888888888889,
"grad_norm": 15.29055118560791,
"learning_rate": 6.714705882352941e-06,
"loss": 4.0283,
"step": 2840
},
{
"epoch": 7.916666666666667,
"grad_norm": 15.208207130432129,
"learning_rate": 6.626470588235294e-06,
"loss": 3.9515,
"step": 2850
},
{
"epoch": 7.944444444444445,
"grad_norm": 15.335555076599121,
"learning_rate": 6.538235294117647e-06,
"loss": 4.0672,
"step": 2860
},
{
"epoch": 7.972222222222222,
"grad_norm": 14.554072380065918,
"learning_rate": 6.45e-06,
"loss": 4.0757,
"step": 2870
},
{
"epoch": 8.0,
"grad_norm": 17.360929489135742,
"learning_rate": 6.361764705882353e-06,
"loss": 3.9486,
"step": 2880
},
{
"epoch": 8.027777777777779,
"grad_norm": 11.244579315185547,
"learning_rate": 6.273529411764706e-06,
"loss": 3.8162,
"step": 2890
},
{
"epoch": 8.055555555555555,
"grad_norm": 16.21691131591797,
"learning_rate": 6.185294117647059e-06,
"loss": 3.8226,
"step": 2900
},
{
"epoch": 8.083333333333334,
"grad_norm": 16.201114654541016,
"learning_rate": 6.097058823529412e-06,
"loss": 3.7959,
"step": 2910
},
{
"epoch": 8.11111111111111,
"grad_norm": 17.415403366088867,
"learning_rate": 6.008823529411765e-06,
"loss": 3.7922,
"step": 2920
},
{
"epoch": 8.13888888888889,
"grad_norm": 10.362262725830078,
"learning_rate": 5.920588235294118e-06,
"loss": 3.7602,
"step": 2930
},
{
"epoch": 8.166666666666666,
"grad_norm": 12.301746368408203,
"learning_rate": 5.8323529411764706e-06,
"loss": 3.8236,
"step": 2940
},
{
"epoch": 8.194444444444445,
"grad_norm": 15.563980102539062,
"learning_rate": 5.744117647058824e-06,
"loss": 3.7988,
"step": 2950
},
{
"epoch": 8.222222222222221,
"grad_norm": 14.712089538574219,
"learning_rate": 5.655882352941177e-06,
"loss": 3.8601,
"step": 2960
},
{
"epoch": 8.25,
"grad_norm": 16.73733901977539,
"learning_rate": 5.5676470588235296e-06,
"loss": 3.7956,
"step": 2970
},
{
"epoch": 8.277777777777779,
"grad_norm": 15.459855079650879,
"learning_rate": 5.479411764705883e-06,
"loss": 3.7525,
"step": 2980
},
{
"epoch": 8.305555555555555,
"grad_norm": 16.97991943359375,
"learning_rate": 5.391176470588236e-06,
"loss": 3.7536,
"step": 2990
},
{
"epoch": 8.333333333333334,
"grad_norm": 13.196775436401367,
"learning_rate": 5.302941176470589e-06,
"loss": 3.88,
"step": 3000
},
{
"epoch": 8.36111111111111,
"grad_norm": 15.121062278747559,
"learning_rate": 5.214705882352941e-06,
"loss": 3.8269,
"step": 3010
},
{
"epoch": 8.38888888888889,
"grad_norm": 17.251754760742188,
"learning_rate": 5.1264705882352935e-06,
"loss": 3.8579,
"step": 3020
},
{
"epoch": 8.416666666666666,
"grad_norm": 14.106273651123047,
"learning_rate": 5.038235294117647e-06,
"loss": 3.8255,
"step": 3030
},
{
"epoch": 8.444444444444445,
"grad_norm": 16.304101943969727,
"learning_rate": 4.95e-06,
"loss": 3.8521,
"step": 3040
},
{
"epoch": 8.472222222222221,
"grad_norm": 25.067447662353516,
"learning_rate": 4.8617647058823525e-06,
"loss": 3.8901,
"step": 3050
},
{
"epoch": 8.5,
"grad_norm": 15.186288833618164,
"learning_rate": 4.773529411764706e-06,
"loss": 3.8275,
"step": 3060
},
{
"epoch": 8.527777777777779,
"grad_norm": 14.004067420959473,
"learning_rate": 4.685294117647059e-06,
"loss": 3.9064,
"step": 3070
},
{
"epoch": 8.555555555555555,
"grad_norm": 14.653326988220215,
"learning_rate": 4.5970588235294115e-06,
"loss": 3.7708,
"step": 3080
},
{
"epoch": 8.583333333333334,
"grad_norm": 15.567551612854004,
"learning_rate": 4.508823529411765e-06,
"loss": 3.8827,
"step": 3090
},
{
"epoch": 8.61111111111111,
"grad_norm": 15.521193504333496,
"learning_rate": 4.420588235294117e-06,
"loss": 3.7874,
"step": 3100
},
{
"epoch": 8.63888888888889,
"grad_norm": 13.801298141479492,
"learning_rate": 4.3323529411764705e-06,
"loss": 3.8132,
"step": 3110
},
{
"epoch": 8.666666666666666,
"grad_norm": 15.118513107299805,
"learning_rate": 4.244117647058824e-06,
"loss": 3.7819,
"step": 3120
},
{
"epoch": 8.694444444444445,
"grad_norm": 14.111237525939941,
"learning_rate": 4.155882352941176e-06,
"loss": 3.8452,
"step": 3130
},
{
"epoch": 8.722222222222221,
"grad_norm": 13.785816192626953,
"learning_rate": 4.0676470588235295e-06,
"loss": 3.8541,
"step": 3140
},
{
"epoch": 8.75,
"grad_norm": 13.524584770202637,
"learning_rate": 3.979411764705883e-06,
"loss": 3.8521,
"step": 3150
},
{
"epoch": 8.777777777777779,
"grad_norm": 15.52719497680664,
"learning_rate": 3.891176470588235e-06,
"loss": 3.8354,
"step": 3160
},
{
"epoch": 8.805555555555555,
"grad_norm": 17.874053955078125,
"learning_rate": 3.8029411764705885e-06,
"loss": 3.794,
"step": 3170
},
{
"epoch": 8.833333333333334,
"grad_norm": 15.086000442504883,
"learning_rate": 3.7147058823529414e-06,
"loss": 3.7661,
"step": 3180
},
{
"epoch": 8.86111111111111,
"grad_norm": 18.586685180664062,
"learning_rate": 3.6264705882352943e-06,
"loss": 3.8781,
"step": 3190
},
{
"epoch": 8.88888888888889,
"grad_norm": 16.09383201599121,
"learning_rate": 3.538235294117647e-06,
"loss": 3.8242,
"step": 3200
},
{
"epoch": 8.916666666666666,
"grad_norm": 13.737457275390625,
"learning_rate": 3.4500000000000004e-06,
"loss": 3.8439,
"step": 3210
},
{
"epoch": 8.944444444444445,
"grad_norm": 14.633013725280762,
"learning_rate": 3.361764705882353e-06,
"loss": 3.8111,
"step": 3220
},
{
"epoch": 8.972222222222221,
"grad_norm": 12.145355224609375,
"learning_rate": 3.2735294117647057e-06,
"loss": 3.8591,
"step": 3230
},
{
"epoch": 9.0,
"grad_norm": 15.101285934448242,
"learning_rate": 3.1852941176470586e-06,
"loss": 3.8327,
"step": 3240
},
{
"epoch": 9.027777777777779,
"grad_norm": 12.12878131866455,
"learning_rate": 3.097058823529412e-06,
"loss": 3.7079,
"step": 3250
},
{
"epoch": 9.055555555555555,
"grad_norm": 11.098604202270508,
"learning_rate": 3.0088235294117647e-06,
"loss": 3.6859,
"step": 3260
},
{
"epoch": 9.083333333333334,
"grad_norm": 11.60837459564209,
"learning_rate": 2.9205882352941176e-06,
"loss": 3.6906,
"step": 3270
},
{
"epoch": 9.11111111111111,
"grad_norm": 11.516129493713379,
"learning_rate": 2.8323529411764705e-06,
"loss": 3.7629,
"step": 3280
},
{
"epoch": 9.13888888888889,
"grad_norm": 11.439606666564941,
"learning_rate": 2.7441176470588238e-06,
"loss": 3.7087,
"step": 3290
},
{
"epoch": 9.166666666666666,
"grad_norm": 12.872289657592773,
"learning_rate": 2.6558823529411766e-06,
"loss": 3.754,
"step": 3300
},
{
"epoch": 9.194444444444445,
"grad_norm": 13.252586364746094,
"learning_rate": 2.5676470588235295e-06,
"loss": 3.7477,
"step": 3310
},
{
"epoch": 9.222222222222221,
"grad_norm": 14.191280364990234,
"learning_rate": 2.4794117647058824e-06,
"loss": 3.7703,
"step": 3320
},
{
"epoch": 9.25,
"grad_norm": 17.2321720123291,
"learning_rate": 2.3911764705882356e-06,
"loss": 3.7569,
"step": 3330
},
{
"epoch": 9.277777777777779,
"grad_norm": 15.821660995483398,
"learning_rate": 2.3029411764705885e-06,
"loss": 3.7321,
"step": 3340
},
{
"epoch": 9.305555555555555,
"grad_norm": 10.092562675476074,
"learning_rate": 2.2147058823529414e-06,
"loss": 3.7033,
"step": 3350
},
{
"epoch": 9.333333333333334,
"grad_norm": 19.321290969848633,
"learning_rate": 2.1264705882352942e-06,
"loss": 3.7683,
"step": 3360
},
{
"epoch": 9.36111111111111,
"grad_norm": 15.781285285949707,
"learning_rate": 2.038235294117647e-06,
"loss": 3.7304,
"step": 3370
},
{
"epoch": 9.38888888888889,
"grad_norm": 13.418021202087402,
"learning_rate": 1.95e-06,
"loss": 3.7117,
"step": 3380
},
{
"epoch": 9.416666666666666,
"grad_norm": 14.020906448364258,
"learning_rate": 1.861764705882353e-06,
"loss": 3.7131,
"step": 3390
},
{
"epoch": 9.444444444444445,
"grad_norm": 13.270752906799316,
"learning_rate": 1.773529411764706e-06,
"loss": 3.6744,
"step": 3400
},
{
"epoch": 9.472222222222221,
"grad_norm": 12.589795112609863,
"learning_rate": 1.6852941176470588e-06,
"loss": 3.7544,
"step": 3410
},
{
"epoch": 9.5,
"grad_norm": 13.454065322875977,
"learning_rate": 1.5970588235294118e-06,
"loss": 3.7782,
"step": 3420
},
{
"epoch": 9.527777777777779,
"grad_norm": 14.777534484863281,
"learning_rate": 1.5088235294117647e-06,
"loss": 3.7203,
"step": 3430
},
{
"epoch": 9.555555555555555,
"grad_norm": 14.142292022705078,
"learning_rate": 1.4205882352941178e-06,
"loss": 3.7331,
"step": 3440
},
{
"epoch": 9.583333333333334,
"grad_norm": 17.070104598999023,
"learning_rate": 1.3323529411764706e-06,
"loss": 3.7318,
"step": 3450
},
{
"epoch": 9.61111111111111,
"grad_norm": 13.819690704345703,
"learning_rate": 1.2441176470588237e-06,
"loss": 3.7379,
"step": 3460
},
{
"epoch": 9.63888888888889,
"grad_norm": 13.859333038330078,
"learning_rate": 1.1558823529411764e-06,
"loss": 3.6641,
"step": 3470
},
{
"epoch": 9.666666666666666,
"grad_norm": 12.934626579284668,
"learning_rate": 1.0676470588235295e-06,
"loss": 3.7164,
"step": 3480
},
{
"epoch": 9.694444444444445,
"grad_norm": 13.10987377166748,
"learning_rate": 9.794117647058823e-07,
"loss": 3.7427,
"step": 3490
},
{
"epoch": 9.722222222222221,
"grad_norm": 13.990032196044922,
"learning_rate": 8.911764705882353e-07,
"loss": 3.7195,
"step": 3500
},
{
"epoch": 9.75,
"grad_norm": 13.015247344970703,
"learning_rate": 8.029411764705883e-07,
"loss": 3.7378,
"step": 3510
},
{
"epoch": 9.777777777777779,
"grad_norm": 13.674323081970215,
"learning_rate": 7.147058823529411e-07,
"loss": 3.7118,
"step": 3520
},
{
"epoch": 9.805555555555555,
"grad_norm": 11.697880744934082,
"learning_rate": 6.264705882352941e-07,
"loss": 3.7387,
"step": 3530
},
{
"epoch": 9.833333333333334,
"grad_norm": 13.14771842956543,
"learning_rate": 5.382352941176471e-07,
"loss": 3.6967,
"step": 3540
},
{
"epoch": 9.86111111111111,
"grad_norm": 12.617823600769043,
"learning_rate": 4.5e-07,
"loss": 3.7122,
"step": 3550
},
{
"epoch": 9.88888888888889,
"grad_norm": 14.002346992492676,
"learning_rate": 3.6176470588235295e-07,
"loss": 3.7429,
"step": 3560
},
{
"epoch": 9.916666666666666,
"grad_norm": 14.497004508972168,
"learning_rate": 2.7352941176470587e-07,
"loss": 3.7298,
"step": 3570
},
{
"epoch": 9.944444444444445,
"grad_norm": 12.17514419555664,
"learning_rate": 1.852941176470588e-07,
"loss": 3.7254,
"step": 3580
},
{
"epoch": 9.972222222222221,
"grad_norm": 17.909778594970703,
"learning_rate": 9.705882352941177e-08,
"loss": 3.7097,
"step": 3590
},
{
"epoch": 10.0,
"grad_norm": 13.537755012512207,
"learning_rate": 8.823529411764706e-09,
"loss": 3.7612,
"step": 3600
}
],
"logging_steps": 10,
"max_steps": 3600,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4403097501696000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}