{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 3600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027777777777777776, "grad_norm": 104.33584594726562, "learning_rate": 1.35e-06, "loss": 13.5343, "step": 10 }, { "epoch": 0.05555555555555555, "grad_norm": 57.29974365234375, "learning_rate": 2.8500000000000002e-06, "loss": 12.1877, "step": 20 }, { "epoch": 0.08333333333333333, "grad_norm": 35.94429016113281, "learning_rate": 4.35e-06, "loss": 10.9908, "step": 30 }, { "epoch": 0.1111111111111111, "grad_norm": 36.21198272705078, "learning_rate": 5.850000000000001e-06, "loss": 10.8169, "step": 40 }, { "epoch": 0.1388888888888889, "grad_norm": 35.18545150756836, "learning_rate": 7.35e-06, "loss": 10.226, "step": 50 }, { "epoch": 0.16666666666666666, "grad_norm": 30.034957885742188, "learning_rate": 8.85e-06, "loss": 9.8243, "step": 60 }, { "epoch": 0.19444444444444445, "grad_norm": 26.107460021972656, "learning_rate": 1.035e-05, "loss": 9.7202, "step": 70 }, { "epoch": 0.2222222222222222, "grad_norm": 28.405378341674805, "learning_rate": 1.185e-05, "loss": 9.655, "step": 80 }, { "epoch": 0.25, "grad_norm": 31.871849060058594, "learning_rate": 1.3350000000000001e-05, "loss": 9.1121, "step": 90 }, { "epoch": 0.2777777777777778, "grad_norm": 31.50947380065918, "learning_rate": 1.485e-05, "loss": 9.1896, "step": 100 }, { "epoch": 0.3055555555555556, "grad_norm": 28.950199127197266, "learning_rate": 1.635e-05, "loss": 9.2017, "step": 110 }, { "epoch": 0.3333333333333333, "grad_norm": 25.37229347229004, "learning_rate": 1.785e-05, "loss": 9.2022, "step": 120 }, { "epoch": 0.3611111111111111, "grad_norm": 24.216697692871094, "learning_rate": 1.935e-05, "loss": 9.1344, "step": 130 }, { "epoch": 0.3888888888888889, "grad_norm": 24.010272979736328, "learning_rate": 2.085e-05, "loss": 8.5873, "step": 140 }, { "epoch": 0.4166666666666667, "grad_norm": 24.390907287597656, "learning_rate": 2.235e-05, "loss": 8.7195, "step": 150 }, { "epoch": 0.4444444444444444, "grad_norm": 21.80168914794922, "learning_rate": 2.385e-05, "loss": 8.5485, "step": 160 }, { "epoch": 0.4722222222222222, "grad_norm": 20.435110092163086, "learning_rate": 2.535e-05, "loss": 8.6359, "step": 170 }, { "epoch": 0.5, "grad_norm": 24.11395835876465, "learning_rate": 2.6850000000000002e-05, "loss": 8.4246, "step": 180 }, { "epoch": 0.5277777777777778, "grad_norm": 25.995811462402344, "learning_rate": 2.8349999999999998e-05, "loss": 8.5049, "step": 190 }, { "epoch": 0.5555555555555556, "grad_norm": 23.826154708862305, "learning_rate": 2.985e-05, "loss": 8.4623, "step": 200 }, { "epoch": 0.5833333333333334, "grad_norm": 20.094274520874023, "learning_rate": 2.9920588235294118e-05, "loss": 8.4495, "step": 210 }, { "epoch": 0.6111111111111112, "grad_norm": 22.793804168701172, "learning_rate": 2.983235294117647e-05, "loss": 8.4131, "step": 220 }, { "epoch": 0.6388888888888888, "grad_norm": 26.08730125427246, "learning_rate": 2.9744117647058824e-05, "loss": 8.3116, "step": 230 }, { "epoch": 0.6666666666666666, "grad_norm": 24.06973648071289, "learning_rate": 2.9655882352941176e-05, "loss": 8.4003, "step": 240 }, { "epoch": 0.6944444444444444, "grad_norm": 22.93770408630371, "learning_rate": 2.956764705882353e-05, "loss": 8.2579, "step": 250 }, { "epoch": 0.7222222222222222, "grad_norm": 20.36606216430664, "learning_rate": 2.9479411764705883e-05, "loss": 8.2888, "step": 260 }, { "epoch": 0.75, "grad_norm": 21.670818328857422, "learning_rate": 2.9391176470588238e-05, "loss": 8.1623, "step": 270 }, { "epoch": 0.7777777777777778, "grad_norm": 17.550491333007812, "learning_rate": 2.930294117647059e-05, "loss": 8.0683, "step": 280 }, { "epoch": 0.8055555555555556, "grad_norm": 25.335660934448242, "learning_rate": 2.9214705882352944e-05, "loss": 8.2784, "step": 290 }, { "epoch": 0.8333333333333334, "grad_norm": 20.76925277709961, "learning_rate": 2.9126470588235292e-05, "loss": 8.2923, "step": 300 }, { "epoch": 0.8611111111111112, "grad_norm": 23.108144760131836, "learning_rate": 2.9038235294117647e-05, "loss": 8.1554, "step": 310 }, { "epoch": 0.8888888888888888, "grad_norm": 16.563575744628906, "learning_rate": 2.895e-05, "loss": 7.8543, "step": 320 }, { "epoch": 0.9166666666666666, "grad_norm": 19.03083038330078, "learning_rate": 2.8861764705882354e-05, "loss": 8.0434, "step": 330 }, { "epoch": 0.9444444444444444, "grad_norm": 18.121356964111328, "learning_rate": 2.8773529411764706e-05, "loss": 7.93, "step": 340 }, { "epoch": 0.9722222222222222, "grad_norm": 21.65605926513672, "learning_rate": 2.868529411764706e-05, "loss": 8.0124, "step": 350 }, { "epoch": 1.0, "grad_norm": 24.55741310119629, "learning_rate": 2.8597058823529412e-05, "loss": 7.547, "step": 360 }, { "epoch": 1.0277777777777777, "grad_norm": 18.358055114746094, "learning_rate": 2.8508823529411767e-05, "loss": 6.9283, "step": 370 }, { "epoch": 1.0555555555555556, "grad_norm": 18.642698287963867, "learning_rate": 2.842058823529412e-05, "loss": 7.1341, "step": 380 }, { "epoch": 1.0833333333333333, "grad_norm": 30.198606491088867, "learning_rate": 2.8332352941176474e-05, "loss": 7.113, "step": 390 }, { "epoch": 1.1111111111111112, "grad_norm": 19.851511001586914, "learning_rate": 2.8244117647058825e-05, "loss": 7.1696, "step": 400 }, { "epoch": 1.1388888888888888, "grad_norm": 20.48548698425293, "learning_rate": 2.8155882352941177e-05, "loss": 7.1552, "step": 410 }, { "epoch": 1.1666666666666667, "grad_norm": 18.470821380615234, "learning_rate": 2.806764705882353e-05, "loss": 6.8519, "step": 420 }, { "epoch": 1.1944444444444444, "grad_norm": 20.821441650390625, "learning_rate": 2.7979411764705883e-05, "loss": 7.0113, "step": 430 }, { "epoch": 1.2222222222222223, "grad_norm": 19.294477462768555, "learning_rate": 2.7891176470588235e-05, "loss": 7.3983, "step": 440 }, { "epoch": 1.25, "grad_norm": 19.5776309967041, "learning_rate": 2.780294117647059e-05, "loss": 7.0649, "step": 450 }, { "epoch": 1.2777777777777777, "grad_norm": 18.286705017089844, "learning_rate": 2.771470588235294e-05, "loss": 7.1587, "step": 460 }, { "epoch": 1.3055555555555556, "grad_norm": 19.803571701049805, "learning_rate": 2.7626470588235297e-05, "loss": 6.9439, "step": 470 }, { "epoch": 1.3333333333333333, "grad_norm": 20.182926177978516, "learning_rate": 2.7538235294117648e-05, "loss": 6.9072, "step": 480 }, { "epoch": 1.3611111111111112, "grad_norm": 21.21506690979004, "learning_rate": 2.7450000000000003e-05, "loss": 6.7805, "step": 490 }, { "epoch": 1.3888888888888888, "grad_norm": 18.884464263916016, "learning_rate": 2.7361764705882355e-05, "loss": 6.7809, "step": 500 }, { "epoch": 1.4166666666666667, "grad_norm": 18.7882080078125, "learning_rate": 2.7273529411764706e-05, "loss": 7.1101, "step": 510 }, { "epoch": 1.4444444444444444, "grad_norm": 17.172212600708008, "learning_rate": 2.7185294117647058e-05, "loss": 6.8579, "step": 520 }, { "epoch": 1.4722222222222223, "grad_norm": 18.599943161010742, "learning_rate": 2.7097058823529413e-05, "loss": 6.9192, "step": 530 }, { "epoch": 1.5, "grad_norm": 22.43140411376953, "learning_rate": 2.7008823529411765e-05, "loss": 6.9068, "step": 540 }, { "epoch": 1.5277777777777777, "grad_norm": 16.94782257080078, "learning_rate": 2.692058823529412e-05, "loss": 7.0826, "step": 550 }, { "epoch": 1.5555555555555556, "grad_norm": 18.50872802734375, "learning_rate": 2.683235294117647e-05, "loss": 6.8281, "step": 560 }, { "epoch": 1.5833333333333335, "grad_norm": 22.08888053894043, "learning_rate": 2.6744117647058826e-05, "loss": 6.9378, "step": 570 }, { "epoch": 1.6111111111111112, "grad_norm": 17.366363525390625, "learning_rate": 2.6655882352941178e-05, "loss": 6.6598, "step": 580 }, { "epoch": 1.6388888888888888, "grad_norm": 17.86391830444336, "learning_rate": 2.6567647058823533e-05, "loss": 6.8419, "step": 590 }, { "epoch": 1.6666666666666665, "grad_norm": 16.568017959594727, "learning_rate": 2.6479411764705884e-05, "loss": 6.9137, "step": 600 }, { "epoch": 1.6944444444444444, "grad_norm": 21.780588150024414, "learning_rate": 2.6391176470588236e-05, "loss": 7.0719, "step": 610 }, { "epoch": 1.7222222222222223, "grad_norm": 20.24903678894043, "learning_rate": 2.6302941176470587e-05, "loss": 6.8612, "step": 620 }, { "epoch": 1.75, "grad_norm": 23.022014617919922, "learning_rate": 2.621470588235294e-05, "loss": 6.815, "step": 630 }, { "epoch": 1.7777777777777777, "grad_norm": 19.738136291503906, "learning_rate": 2.6126470588235294e-05, "loss": 7.0848, "step": 640 }, { "epoch": 1.8055555555555556, "grad_norm": 19.904956817626953, "learning_rate": 2.6038235294117646e-05, "loss": 7.1127, "step": 650 }, { "epoch": 1.8333333333333335, "grad_norm": 20.379528045654297, "learning_rate": 2.595e-05, "loss": 7.1542, "step": 660 }, { "epoch": 1.8611111111111112, "grad_norm": 17.21288299560547, "learning_rate": 2.5861764705882352e-05, "loss": 6.4236, "step": 670 }, { "epoch": 1.8888888888888888, "grad_norm": 19.684776306152344, "learning_rate": 2.5773529411764707e-05, "loss": 6.97, "step": 680 }, { "epoch": 1.9166666666666665, "grad_norm": 22.62903594970703, "learning_rate": 2.568529411764706e-05, "loss": 6.7521, "step": 690 }, { "epoch": 1.9444444444444444, "grad_norm": 18.688766479492188, "learning_rate": 2.5597058823529414e-05, "loss": 6.7485, "step": 700 }, { "epoch": 1.9722222222222223, "grad_norm": 19.558578491210938, "learning_rate": 2.5508823529411765e-05, "loss": 6.6582, "step": 710 }, { "epoch": 2.0, "grad_norm": 20.0054874420166, "learning_rate": 2.542058823529412e-05, "loss": 6.4649, "step": 720 }, { "epoch": 2.0277777777777777, "grad_norm": 17.48706817626953, "learning_rate": 2.533235294117647e-05, "loss": 6.0975, "step": 730 }, { "epoch": 2.0555555555555554, "grad_norm": 21.419391632080078, "learning_rate": 2.5244117647058823e-05, "loss": 5.8201, "step": 740 }, { "epoch": 2.0833333333333335, "grad_norm": 19.62574005126953, "learning_rate": 2.5155882352941175e-05, "loss": 5.792, "step": 750 }, { "epoch": 2.111111111111111, "grad_norm": 24.182546615600586, "learning_rate": 2.506764705882353e-05, "loss": 6.002, "step": 760 }, { "epoch": 2.138888888888889, "grad_norm": 17.403255462646484, "learning_rate": 2.497941176470588e-05, "loss": 5.8743, "step": 770 }, { "epoch": 2.1666666666666665, "grad_norm": 19.421232223510742, "learning_rate": 2.4891176470588237e-05, "loss": 6.1047, "step": 780 }, { "epoch": 2.1944444444444446, "grad_norm": 22.9908504486084, "learning_rate": 2.4802941176470588e-05, "loss": 5.9951, "step": 790 }, { "epoch": 2.2222222222222223, "grad_norm": 18.04509925842285, "learning_rate": 2.4714705882352943e-05, "loss": 6.1774, "step": 800 }, { "epoch": 2.25, "grad_norm": 16.365985870361328, "learning_rate": 2.4626470588235295e-05, "loss": 5.9879, "step": 810 }, { "epoch": 2.2777777777777777, "grad_norm": 19.309423446655273, "learning_rate": 2.453823529411765e-05, "loss": 5.9292, "step": 820 }, { "epoch": 2.3055555555555554, "grad_norm": 24.4521541595459, "learning_rate": 2.4449999999999998e-05, "loss": 6.0532, "step": 830 }, { "epoch": 2.3333333333333335, "grad_norm": 22.1500244140625, "learning_rate": 2.4361764705882353e-05, "loss": 5.7139, "step": 840 }, { "epoch": 2.361111111111111, "grad_norm": 18.496788024902344, "learning_rate": 2.4273529411764705e-05, "loss": 5.9176, "step": 850 }, { "epoch": 2.388888888888889, "grad_norm": 19.089303970336914, "learning_rate": 2.418529411764706e-05, "loss": 6.0821, "step": 860 }, { "epoch": 2.4166666666666665, "grad_norm": 18.071590423583984, "learning_rate": 2.409705882352941e-05, "loss": 5.9829, "step": 870 }, { "epoch": 2.4444444444444446, "grad_norm": 19.58377456665039, "learning_rate": 2.4008823529411766e-05, "loss": 5.894, "step": 880 }, { "epoch": 2.4722222222222223, "grad_norm": 20.550006866455078, "learning_rate": 2.3920588235294118e-05, "loss": 6.0293, "step": 890 }, { "epoch": 2.5, "grad_norm": 22.616382598876953, "learning_rate": 2.3832352941176473e-05, "loss": 6.037, "step": 900 }, { "epoch": 2.5277777777777777, "grad_norm": 22.372953414916992, "learning_rate": 2.3744117647058824e-05, "loss": 5.9353, "step": 910 }, { "epoch": 2.5555555555555554, "grad_norm": 18.03106117248535, "learning_rate": 2.365588235294118e-05, "loss": 5.8287, "step": 920 }, { "epoch": 2.5833333333333335, "grad_norm": 16.170204162597656, "learning_rate": 2.356764705882353e-05, "loss": 5.7742, "step": 930 }, { "epoch": 2.611111111111111, "grad_norm": 17.15852165222168, "learning_rate": 2.3479411764705882e-05, "loss": 6.0491, "step": 940 }, { "epoch": 2.638888888888889, "grad_norm": 20.76968002319336, "learning_rate": 2.3391176470588234e-05, "loss": 5.9122, "step": 950 }, { "epoch": 2.6666666666666665, "grad_norm": 21.134782791137695, "learning_rate": 2.330294117647059e-05, "loss": 5.8734, "step": 960 }, { "epoch": 2.6944444444444446, "grad_norm": 18.72968864440918, "learning_rate": 2.321470588235294e-05, "loss": 5.9093, "step": 970 }, { "epoch": 2.7222222222222223, "grad_norm": 23.75607681274414, "learning_rate": 2.3126470588235296e-05, "loss": 5.9551, "step": 980 }, { "epoch": 2.75, "grad_norm": 19.960378646850586, "learning_rate": 2.3038235294117647e-05, "loss": 6.0045, "step": 990 }, { "epoch": 2.7777777777777777, "grad_norm": 16.736736297607422, "learning_rate": 2.2950000000000002e-05, "loss": 5.9759, "step": 1000 }, { "epoch": 2.8055555555555554, "grad_norm": 22.561372756958008, "learning_rate": 2.2861764705882354e-05, "loss": 5.9796, "step": 1010 }, { "epoch": 2.8333333333333335, "grad_norm": 16.96518898010254, "learning_rate": 2.277352941176471e-05, "loss": 6.0321, "step": 1020 }, { "epoch": 2.861111111111111, "grad_norm": 20.031225204467773, "learning_rate": 2.268529411764706e-05, "loss": 5.9221, "step": 1030 }, { "epoch": 2.888888888888889, "grad_norm": 16.219301223754883, "learning_rate": 2.2597058823529415e-05, "loss": 6.0382, "step": 1040 }, { "epoch": 2.9166666666666665, "grad_norm": 19.537729263305664, "learning_rate": 2.2508823529411764e-05, "loss": 5.8556, "step": 1050 }, { "epoch": 2.9444444444444446, "grad_norm": 20.713224411010742, "learning_rate": 2.242058823529412e-05, "loss": 5.9832, "step": 1060 }, { "epoch": 2.9722222222222223, "grad_norm": 18.94963836669922, "learning_rate": 2.233235294117647e-05, "loss": 6.0023, "step": 1070 }, { "epoch": 3.0, "grad_norm": 20.285676956176758, "learning_rate": 2.2244117647058825e-05, "loss": 5.8155, "step": 1080 }, { "epoch": 3.0277777777777777, "grad_norm": 17.27924346923828, "learning_rate": 2.2155882352941177e-05, "loss": 5.2819, "step": 1090 }, { "epoch": 3.0555555555555554, "grad_norm": 21.477184295654297, "learning_rate": 2.206764705882353e-05, "loss": 5.1979, "step": 1100 }, { "epoch": 3.0833333333333335, "grad_norm": 15.144262313842773, "learning_rate": 2.1979411764705883e-05, "loss": 5.1172, "step": 1110 }, { "epoch": 3.111111111111111, "grad_norm": 22.37119483947754, "learning_rate": 2.1891176470588238e-05, "loss": 5.2262, "step": 1120 }, { "epoch": 3.138888888888889, "grad_norm": 20.329545974731445, "learning_rate": 2.180294117647059e-05, "loss": 5.2281, "step": 1130 }, { "epoch": 3.1666666666666665, "grad_norm": 17.549114227294922, "learning_rate": 2.1714705882352945e-05, "loss": 5.1716, "step": 1140 }, { "epoch": 3.1944444444444446, "grad_norm": 16.489368438720703, "learning_rate": 2.1626470588235293e-05, "loss": 5.3562, "step": 1150 }, { "epoch": 3.2222222222222223, "grad_norm": 18.964805603027344, "learning_rate": 2.1538235294117648e-05, "loss": 5.5189, "step": 1160 }, { "epoch": 3.25, "grad_norm": 18.049579620361328, "learning_rate": 2.145e-05, "loss": 5.1967, "step": 1170 }, { "epoch": 3.2777777777777777, "grad_norm": 18.76259422302246, "learning_rate": 2.136176470588235e-05, "loss": 5.2686, "step": 1180 }, { "epoch": 3.3055555555555554, "grad_norm": 20.977890014648438, "learning_rate": 2.1273529411764706e-05, "loss": 5.1385, "step": 1190 }, { "epoch": 3.3333333333333335, "grad_norm": 19.02361488342285, "learning_rate": 2.1185294117647058e-05, "loss": 5.3758, "step": 1200 }, { "epoch": 3.361111111111111, "grad_norm": 17.559446334838867, "learning_rate": 2.1097058823529413e-05, "loss": 5.3187, "step": 1210 }, { "epoch": 3.388888888888889, "grad_norm": 21.838626861572266, "learning_rate": 2.1008823529411764e-05, "loss": 5.3414, "step": 1220 }, { "epoch": 3.4166666666666665, "grad_norm": 19.832502365112305, "learning_rate": 2.092058823529412e-05, "loss": 5.3392, "step": 1230 }, { "epoch": 3.4444444444444446, "grad_norm": 17.20013427734375, "learning_rate": 2.083235294117647e-05, "loss": 5.2786, "step": 1240 }, { "epoch": 3.4722222222222223, "grad_norm": 19.66887855529785, "learning_rate": 2.0744117647058826e-05, "loss": 5.3352, "step": 1250 }, { "epoch": 3.5, "grad_norm": 19.046735763549805, "learning_rate": 2.0655882352941174e-05, "loss": 5.2161, "step": 1260 }, { "epoch": 3.5277777777777777, "grad_norm": 17.690959930419922, "learning_rate": 2.056764705882353e-05, "loss": 5.3777, "step": 1270 }, { "epoch": 3.5555555555555554, "grad_norm": 19.2017822265625, "learning_rate": 2.047941176470588e-05, "loss": 5.3262, "step": 1280 }, { "epoch": 3.5833333333333335, "grad_norm": 22.48141098022461, "learning_rate": 2.0391176470588236e-05, "loss": 5.2689, "step": 1290 }, { "epoch": 3.611111111111111, "grad_norm": 21.33321189880371, "learning_rate": 2.0302941176470587e-05, "loss": 5.1867, "step": 1300 }, { "epoch": 3.638888888888889, "grad_norm": 20.216909408569336, "learning_rate": 2.0214705882352942e-05, "loss": 5.3314, "step": 1310 }, { "epoch": 3.6666666666666665, "grad_norm": 19.15237808227539, "learning_rate": 2.0126470588235294e-05, "loss": 5.3206, "step": 1320 }, { "epoch": 3.6944444444444446, "grad_norm": 17.66857147216797, "learning_rate": 2.003823529411765e-05, "loss": 5.4498, "step": 1330 }, { "epoch": 3.7222222222222223, "grad_norm": 20.253646850585938, "learning_rate": 1.995e-05, "loss": 5.4104, "step": 1340 }, { "epoch": 3.75, "grad_norm": 19.723072052001953, "learning_rate": 1.9861764705882355e-05, "loss": 5.214, "step": 1350 }, { "epoch": 3.7777777777777777, "grad_norm": 17.601451873779297, "learning_rate": 1.9773529411764704e-05, "loss": 5.356, "step": 1360 }, { "epoch": 3.8055555555555554, "grad_norm": 19.1397705078125, "learning_rate": 1.968529411764706e-05, "loss": 5.4965, "step": 1370 }, { "epoch": 3.8333333333333335, "grad_norm": 18.656843185424805, "learning_rate": 1.959705882352941e-05, "loss": 5.0704, "step": 1380 }, { "epoch": 3.861111111111111, "grad_norm": 16.182132720947266, "learning_rate": 1.9508823529411765e-05, "loss": 5.1655, "step": 1390 }, { "epoch": 3.888888888888889, "grad_norm": 18.828876495361328, "learning_rate": 1.9420588235294117e-05, "loss": 5.0816, "step": 1400 }, { "epoch": 3.9166666666666665, "grad_norm": 18.60540199279785, "learning_rate": 1.9332352941176472e-05, "loss": 5.2968, "step": 1410 }, { "epoch": 3.9444444444444446, "grad_norm": 17.49435806274414, "learning_rate": 1.9244117647058823e-05, "loss": 5.3383, "step": 1420 }, { "epoch": 3.9722222222222223, "grad_norm": 19.32575798034668, "learning_rate": 1.915588235294118e-05, "loss": 5.1811, "step": 1430 }, { "epoch": 4.0, "grad_norm": 23.117151260375977, "learning_rate": 1.906764705882353e-05, "loss": 5.3958, "step": 1440 }, { "epoch": 4.027777777777778, "grad_norm": 14.918752670288086, "learning_rate": 1.8979411764705885e-05, "loss": 4.7873, "step": 1450 }, { "epoch": 4.055555555555555, "grad_norm": 16.663442611694336, "learning_rate": 1.8891176470588236e-05, "loss": 4.812, "step": 1460 }, { "epoch": 4.083333333333333, "grad_norm": 17.157150268554688, "learning_rate": 1.8802941176470588e-05, "loss": 4.7987, "step": 1470 }, { "epoch": 4.111111111111111, "grad_norm": 18.143966674804688, "learning_rate": 1.871470588235294e-05, "loss": 4.6237, "step": 1480 }, { "epoch": 4.138888888888889, "grad_norm": 16.317506790161133, "learning_rate": 1.8626470588235295e-05, "loss": 4.6253, "step": 1490 }, { "epoch": 4.166666666666667, "grad_norm": 23.109792709350586, "learning_rate": 1.8538235294117646e-05, "loss": 4.7575, "step": 1500 }, { "epoch": 4.194444444444445, "grad_norm": 16.878374099731445, "learning_rate": 1.845e-05, "loss": 4.6636, "step": 1510 }, { "epoch": 4.222222222222222, "grad_norm": 18.059541702270508, "learning_rate": 1.8361764705882353e-05, "loss": 4.7979, "step": 1520 }, { "epoch": 4.25, "grad_norm": 16.914222717285156, "learning_rate": 1.8273529411764708e-05, "loss": 4.7786, "step": 1530 }, { "epoch": 4.277777777777778, "grad_norm": 18.730314254760742, "learning_rate": 1.818529411764706e-05, "loss": 4.7931, "step": 1540 }, { "epoch": 4.305555555555555, "grad_norm": 17.19397735595703, "learning_rate": 1.8097058823529414e-05, "loss": 4.6813, "step": 1550 }, { "epoch": 4.333333333333333, "grad_norm": 17.824308395385742, "learning_rate": 1.8008823529411766e-05, "loss": 4.8799, "step": 1560 }, { "epoch": 4.361111111111111, "grad_norm": 17.733112335205078, "learning_rate": 1.792058823529412e-05, "loss": 4.7475, "step": 1570 }, { "epoch": 4.388888888888889, "grad_norm": 17.163602828979492, "learning_rate": 1.783235294117647e-05, "loss": 4.8217, "step": 1580 }, { "epoch": 4.416666666666667, "grad_norm": 19.913904190063477, "learning_rate": 1.7744117647058824e-05, "loss": 4.8949, "step": 1590 }, { "epoch": 4.444444444444445, "grad_norm": 18.19684600830078, "learning_rate": 1.7655882352941176e-05, "loss": 4.9527, "step": 1600 }, { "epoch": 4.472222222222222, "grad_norm": 19.120418548583984, "learning_rate": 1.756764705882353e-05, "loss": 4.8928, "step": 1610 }, { "epoch": 4.5, "grad_norm": 18.0157470703125, "learning_rate": 1.7479411764705882e-05, "loss": 4.8177, "step": 1620 }, { "epoch": 4.527777777777778, "grad_norm": 20.340044021606445, "learning_rate": 1.7391176470588237e-05, "loss": 4.6013, "step": 1630 }, { "epoch": 4.555555555555555, "grad_norm": 16.55755043029785, "learning_rate": 1.730294117647059e-05, "loss": 4.8209, "step": 1640 }, { "epoch": 4.583333333333333, "grad_norm": 17.506793975830078, "learning_rate": 1.7214705882352944e-05, "loss": 5.0675, "step": 1650 }, { "epoch": 4.611111111111111, "grad_norm": 22.037546157836914, "learning_rate": 1.7126470588235295e-05, "loss": 4.8392, "step": 1660 }, { "epoch": 4.638888888888889, "grad_norm": 17.09819221496582, "learning_rate": 1.703823529411765e-05, "loss": 4.7789, "step": 1670 }, { "epoch": 4.666666666666667, "grad_norm": 23.13201904296875, "learning_rate": 1.695e-05, "loss": 4.8017, "step": 1680 }, { "epoch": 4.694444444444445, "grad_norm": 19.751766204833984, "learning_rate": 1.6861764705882354e-05, "loss": 4.7095, "step": 1690 }, { "epoch": 4.722222222222222, "grad_norm": 17.58235740661621, "learning_rate": 1.6773529411764705e-05, "loss": 4.7251, "step": 1700 }, { "epoch": 4.75, "grad_norm": 17.987777709960938, "learning_rate": 1.668529411764706e-05, "loss": 4.8577, "step": 1710 }, { "epoch": 4.777777777777778, "grad_norm": 21.950464248657227, "learning_rate": 1.6597058823529412e-05, "loss": 4.8194, "step": 1720 }, { "epoch": 4.805555555555555, "grad_norm": 18.807058334350586, "learning_rate": 1.6508823529411767e-05, "loss": 4.8405, "step": 1730 }, { "epoch": 4.833333333333333, "grad_norm": 19.066274642944336, "learning_rate": 1.642058823529412e-05, "loss": 4.8167, "step": 1740 }, { "epoch": 4.861111111111111, "grad_norm": 16.53117561340332, "learning_rate": 1.633235294117647e-05, "loss": 4.9061, "step": 1750 }, { "epoch": 4.888888888888889, "grad_norm": 19.067041397094727, "learning_rate": 1.6244117647058825e-05, "loss": 4.772, "step": 1760 }, { "epoch": 4.916666666666667, "grad_norm": 17.190061569213867, "learning_rate": 1.6155882352941177e-05, "loss": 4.8326, "step": 1770 }, { "epoch": 4.944444444444445, "grad_norm": 22.074304580688477, "learning_rate": 1.606764705882353e-05, "loss": 5.0498, "step": 1780 }, { "epoch": 4.972222222222222, "grad_norm": 17.11249542236328, "learning_rate": 1.597941176470588e-05, "loss": 4.8728, "step": 1790 }, { "epoch": 5.0, "grad_norm": 17.489036560058594, "learning_rate": 1.5891176470588235e-05, "loss": 4.7712, "step": 1800 }, { "epoch": 5.027777777777778, "grad_norm": 18.673629760742188, "learning_rate": 1.5802941176470586e-05, "loss": 4.318, "step": 1810 }, { "epoch": 5.055555555555555, "grad_norm": 18.343915939331055, "learning_rate": 1.571470588235294e-05, "loss": 4.4046, "step": 1820 }, { "epoch": 5.083333333333333, "grad_norm": 18.69623565673828, "learning_rate": 1.5626470588235293e-05, "loss": 4.4717, "step": 1830 }, { "epoch": 5.111111111111111, "grad_norm": 17.767257690429688, "learning_rate": 1.5538235294117648e-05, "loss": 4.3915, "step": 1840 }, { "epoch": 5.138888888888889, "grad_norm": 18.257091522216797, "learning_rate": 1.545e-05, "loss": 4.4195, "step": 1850 }, { "epoch": 5.166666666666667, "grad_norm": 18.255918502807617, "learning_rate": 1.5361764705882354e-05, "loss": 4.466, "step": 1860 }, { "epoch": 5.194444444444445, "grad_norm": 17.16253089904785, "learning_rate": 1.5273529411764706e-05, "loss": 4.4639, "step": 1870 }, { "epoch": 5.222222222222222, "grad_norm": 23.036542892456055, "learning_rate": 1.5185294117647061e-05, "loss": 4.4155, "step": 1880 }, { "epoch": 5.25, "grad_norm": 16.37624168395996, "learning_rate": 1.5097058823529414e-05, "loss": 4.4856, "step": 1890 }, { "epoch": 5.277777777777778, "grad_norm": 15.804655075073242, "learning_rate": 1.5008823529411764e-05, "loss": 4.3916, "step": 1900 }, { "epoch": 5.305555555555555, "grad_norm": 18.654705047607422, "learning_rate": 1.492058823529412e-05, "loss": 4.5315, "step": 1910 }, { "epoch": 5.333333333333333, "grad_norm": 17.40276336669922, "learning_rate": 1.483235294117647e-05, "loss": 4.4648, "step": 1920 }, { "epoch": 5.361111111111111, "grad_norm": 14.804203987121582, "learning_rate": 1.4744117647058824e-05, "loss": 4.5326, "step": 1930 }, { "epoch": 5.388888888888889, "grad_norm": 18.393539428710938, "learning_rate": 1.4655882352941177e-05, "loss": 4.3771, "step": 1940 }, { "epoch": 5.416666666666667, "grad_norm": 21.74736213684082, "learning_rate": 1.456764705882353e-05, "loss": 4.424, "step": 1950 }, { "epoch": 5.444444444444445, "grad_norm": 18.945545196533203, "learning_rate": 1.4479411764705884e-05, "loss": 4.3918, "step": 1960 }, { "epoch": 5.472222222222222, "grad_norm": 21.048032760620117, "learning_rate": 1.4391176470588237e-05, "loss": 4.4165, "step": 1970 }, { "epoch": 5.5, "grad_norm": 17.606342315673828, "learning_rate": 1.4302941176470589e-05, "loss": 4.4718, "step": 1980 }, { "epoch": 5.527777777777778, "grad_norm": 16.075519561767578, "learning_rate": 1.4214705882352942e-05, "loss": 4.3442, "step": 1990 }, { "epoch": 5.555555555555555, "grad_norm": 15.067805290222168, "learning_rate": 1.4126470588235295e-05, "loss": 4.4641, "step": 2000 }, { "epoch": 5.583333333333333, "grad_norm": 16.70688819885254, "learning_rate": 1.4038235294117649e-05, "loss": 4.4619, "step": 2010 }, { "epoch": 5.611111111111111, "grad_norm": 20.45810317993164, "learning_rate": 1.395e-05, "loss": 4.4448, "step": 2020 }, { "epoch": 5.638888888888889, "grad_norm": 19.671323776245117, "learning_rate": 1.3861764705882352e-05, "loss": 4.5246, "step": 2030 }, { "epoch": 5.666666666666667, "grad_norm": 17.035329818725586, "learning_rate": 1.3773529411764705e-05, "loss": 4.3425, "step": 2040 }, { "epoch": 5.694444444444445, "grad_norm": 18.257850646972656, "learning_rate": 1.3685294117647058e-05, "loss": 4.3883, "step": 2050 }, { "epoch": 5.722222222222222, "grad_norm": 17.29006004333496, "learning_rate": 1.3597058823529412e-05, "loss": 4.4048, "step": 2060 }, { "epoch": 5.75, "grad_norm": 16.57717514038086, "learning_rate": 1.3508823529411765e-05, "loss": 4.4456, "step": 2070 }, { "epoch": 5.777777777777778, "grad_norm": 21.21561622619629, "learning_rate": 1.3420588235294117e-05, "loss": 4.5329, "step": 2080 }, { "epoch": 5.805555555555555, "grad_norm": 18.407976150512695, "learning_rate": 1.333235294117647e-05, "loss": 4.6047, "step": 2090 }, { "epoch": 5.833333333333333, "grad_norm": 15.929880142211914, "learning_rate": 1.3244117647058823e-05, "loss": 4.421, "step": 2100 }, { "epoch": 5.861111111111111, "grad_norm": 16.92407989501953, "learning_rate": 1.3155882352941176e-05, "loss": 4.4857, "step": 2110 }, { "epoch": 5.888888888888889, "grad_norm": 17.74669647216797, "learning_rate": 1.306764705882353e-05, "loss": 4.4654, "step": 2120 }, { "epoch": 5.916666666666667, "grad_norm": 19.744373321533203, "learning_rate": 1.2979411764705881e-05, "loss": 4.3837, "step": 2130 }, { "epoch": 5.944444444444445, "grad_norm": 15.323151588439941, "learning_rate": 1.2891176470588235e-05, "loss": 4.3457, "step": 2140 }, { "epoch": 5.972222222222222, "grad_norm": 18.93714141845703, "learning_rate": 1.2802941176470588e-05, "loss": 4.3657, "step": 2150 }, { "epoch": 6.0, "grad_norm": 19.87427520751953, "learning_rate": 1.2714705882352941e-05, "loss": 4.4179, "step": 2160 }, { "epoch": 6.027777777777778, "grad_norm": 15.826274871826172, "learning_rate": 1.2626470588235295e-05, "loss": 4.1858, "step": 2170 }, { "epoch": 6.055555555555555, "grad_norm": 16.08074378967285, "learning_rate": 1.2538235294117648e-05, "loss": 4.1317, "step": 2180 }, { "epoch": 6.083333333333333, "grad_norm": 15.976150512695312, "learning_rate": 1.245e-05, "loss": 4.1214, "step": 2190 }, { "epoch": 6.111111111111111, "grad_norm": 17.065427780151367, "learning_rate": 1.2361764705882353e-05, "loss": 4.0471, "step": 2200 }, { "epoch": 6.138888888888889, "grad_norm": 15.556553840637207, "learning_rate": 1.2273529411764706e-05, "loss": 4.0961, "step": 2210 }, { "epoch": 6.166666666666667, "grad_norm": 16.178955078125, "learning_rate": 1.218529411764706e-05, "loss": 4.1129, "step": 2220 }, { "epoch": 6.194444444444445, "grad_norm": 17.971904754638672, "learning_rate": 1.2097058823529413e-05, "loss": 4.111, "step": 2230 }, { "epoch": 6.222222222222222, "grad_norm": 17.23676109313965, "learning_rate": 1.2008823529411764e-05, "loss": 4.209, "step": 2240 }, { "epoch": 6.25, "grad_norm": 18.453350067138672, "learning_rate": 1.1920588235294117e-05, "loss": 4.1997, "step": 2250 }, { "epoch": 6.277777777777778, "grad_norm": 17.884140014648438, "learning_rate": 1.183235294117647e-05, "loss": 4.1343, "step": 2260 }, { "epoch": 6.305555555555555, "grad_norm": 18.340280532836914, "learning_rate": 1.1744117647058824e-05, "loss": 4.1262, "step": 2270 }, { "epoch": 6.333333333333333, "grad_norm": 18.307395935058594, "learning_rate": 1.1655882352941177e-05, "loss": 4.1307, "step": 2280 }, { "epoch": 6.361111111111111, "grad_norm": 17.725727081298828, "learning_rate": 1.1567647058823529e-05, "loss": 4.0369, "step": 2290 }, { "epoch": 6.388888888888889, "grad_norm": 14.209752082824707, "learning_rate": 1.1479411764705882e-05, "loss": 4.1071, "step": 2300 }, { "epoch": 6.416666666666667, "grad_norm": 21.862701416015625, "learning_rate": 1.1391176470588235e-05, "loss": 4.223, "step": 2310 }, { "epoch": 6.444444444444445, "grad_norm": 13.577404975891113, "learning_rate": 1.1302941176470589e-05, "loss": 4.2441, "step": 2320 }, { "epoch": 6.472222222222222, "grad_norm": 16.62024688720703, "learning_rate": 1.1214705882352942e-05, "loss": 4.1793, "step": 2330 }, { "epoch": 6.5, "grad_norm": 17.887189865112305, "learning_rate": 1.1126470588235295e-05, "loss": 4.1837, "step": 2340 }, { "epoch": 6.527777777777778, "grad_norm": 16.476367950439453, "learning_rate": 1.1038235294117647e-05, "loss": 4.1526, "step": 2350 }, { "epoch": 6.555555555555555, "grad_norm": 17.012876510620117, "learning_rate": 1.095e-05, "loss": 4.1965, "step": 2360 }, { "epoch": 6.583333333333333, "grad_norm": 17.46245765686035, "learning_rate": 1.0861764705882353e-05, "loss": 4.1394, "step": 2370 }, { "epoch": 6.611111111111111, "grad_norm": 17.8790340423584, "learning_rate": 1.0773529411764707e-05, "loss": 4.1404, "step": 2380 }, { "epoch": 6.638888888888889, "grad_norm": 16.047483444213867, "learning_rate": 1.068529411764706e-05, "loss": 4.1839, "step": 2390 }, { "epoch": 6.666666666666667, "grad_norm": 15.634733200073242, "learning_rate": 1.0597058823529412e-05, "loss": 4.1608, "step": 2400 }, { "epoch": 6.694444444444445, "grad_norm": 15.73880386352539, "learning_rate": 1.0508823529411765e-05, "loss": 4.1989, "step": 2410 }, { "epoch": 6.722222222222222, "grad_norm": 18.751541137695312, "learning_rate": 1.0420588235294118e-05, "loss": 4.215, "step": 2420 }, { "epoch": 6.75, "grad_norm": 18.6307430267334, "learning_rate": 1.0332352941176472e-05, "loss": 4.1345, "step": 2430 }, { "epoch": 6.777777777777778, "grad_norm": 18.48142433166504, "learning_rate": 1.0244117647058825e-05, "loss": 4.2856, "step": 2440 }, { "epoch": 6.805555555555555, "grad_norm": 17.484237670898438, "learning_rate": 1.0155882352941176e-05, "loss": 4.223, "step": 2450 }, { "epoch": 6.833333333333333, "grad_norm": 16.67469024658203, "learning_rate": 1.006764705882353e-05, "loss": 4.1748, "step": 2460 }, { "epoch": 6.861111111111111, "grad_norm": 18.385995864868164, "learning_rate": 9.979411764705883e-06, "loss": 4.1886, "step": 2470 }, { "epoch": 6.888888888888889, "grad_norm": 20.845624923706055, "learning_rate": 9.891176470588236e-06, "loss": 4.1832, "step": 2480 }, { "epoch": 6.916666666666667, "grad_norm": 18.856298446655273, "learning_rate": 9.80294117647059e-06, "loss": 4.2403, "step": 2490 }, { "epoch": 6.944444444444445, "grad_norm": 16.965234756469727, "learning_rate": 9.714705882352943e-06, "loss": 4.2439, "step": 2500 }, { "epoch": 6.972222222222222, "grad_norm": 16.04508399963379, "learning_rate": 9.626470588235294e-06, "loss": 4.1588, "step": 2510 }, { "epoch": 7.0, "grad_norm": 17.192468643188477, "learning_rate": 9.538235294117648e-06, "loss": 4.2357, "step": 2520 }, { "epoch": 7.027777777777778, "grad_norm": 14.738517761230469, "learning_rate": 9.450000000000001e-06, "loss": 3.8684, "step": 2530 }, { "epoch": 7.055555555555555, "grad_norm": 16.8319034576416, "learning_rate": 9.361764705882354e-06, "loss": 3.8483, "step": 2540 }, { "epoch": 7.083333333333333, "grad_norm": 14.79024600982666, "learning_rate": 9.273529411764708e-06, "loss": 3.9072, "step": 2550 }, { "epoch": 7.111111111111111, "grad_norm": 12.333316802978516, "learning_rate": 9.18529411764706e-06, "loss": 3.9609, "step": 2560 }, { "epoch": 7.138888888888889, "grad_norm": 14.567424774169922, "learning_rate": 9.097058823529412e-06, "loss": 3.9129, "step": 2570 }, { "epoch": 7.166666666666667, "grad_norm": 18.647308349609375, "learning_rate": 9.008823529411766e-06, "loss": 3.9278, "step": 2580 }, { "epoch": 7.194444444444445, "grad_norm": 16.815906524658203, "learning_rate": 8.920588235294117e-06, "loss": 3.9316, "step": 2590 }, { "epoch": 7.222222222222222, "grad_norm": 19.90522575378418, "learning_rate": 8.83235294117647e-06, "loss": 3.9444, "step": 2600 }, { "epoch": 7.25, "grad_norm": 15.773906707763672, "learning_rate": 8.744117647058822e-06, "loss": 4.037, "step": 2610 }, { "epoch": 7.277777777777778, "grad_norm": 14.414016723632812, "learning_rate": 8.655882352941176e-06, "loss": 3.9834, "step": 2620 }, { "epoch": 7.305555555555555, "grad_norm": 18.313098907470703, "learning_rate": 8.567647058823529e-06, "loss": 4.0229, "step": 2630 }, { "epoch": 7.333333333333333, "grad_norm": 15.021878242492676, "learning_rate": 8.479411764705882e-06, "loss": 3.9891, "step": 2640 }, { "epoch": 7.361111111111111, "grad_norm": 16.228864669799805, "learning_rate": 8.391176470588235e-06, "loss": 3.9871, "step": 2650 }, { "epoch": 7.388888888888889, "grad_norm": 11.939830780029297, "learning_rate": 8.302941176470589e-06, "loss": 3.8708, "step": 2660 }, { "epoch": 7.416666666666667, "grad_norm": 13.20190143585205, "learning_rate": 8.21470588235294e-06, "loss": 3.8982, "step": 2670 }, { "epoch": 7.444444444444445, "grad_norm": 15.111628532409668, "learning_rate": 8.126470588235294e-06, "loss": 3.9871, "step": 2680 }, { "epoch": 7.472222222222222, "grad_norm": 15.908904075622559, "learning_rate": 8.038235294117647e-06, "loss": 3.976, "step": 2690 }, { "epoch": 7.5, "grad_norm": 18.098617553710938, "learning_rate": 7.95e-06, "loss": 4.0415, "step": 2700 }, { "epoch": 7.527777777777778, "grad_norm": 16.921981811523438, "learning_rate": 7.861764705882353e-06, "loss": 4.0462, "step": 2710 }, { "epoch": 7.555555555555555, "grad_norm": 14.457706451416016, "learning_rate": 7.773529411764705e-06, "loss": 3.9405, "step": 2720 }, { "epoch": 7.583333333333333, "grad_norm": 17.50982093811035, "learning_rate": 7.685294117647058e-06, "loss": 3.9974, "step": 2730 }, { "epoch": 7.611111111111111, "grad_norm": 15.285282135009766, "learning_rate": 7.5970588235294116e-06, "loss": 3.9969, "step": 2740 }, { "epoch": 7.638888888888889, "grad_norm": 14.417180061340332, "learning_rate": 7.508823529411765e-06, "loss": 3.97, "step": 2750 }, { "epoch": 7.666666666666667, "grad_norm": 14.781904220581055, "learning_rate": 7.420588235294117e-06, "loss": 3.9797, "step": 2760 }, { "epoch": 7.694444444444445, "grad_norm": 16.317949295043945, "learning_rate": 7.332352941176471e-06, "loss": 3.9236, "step": 2770 }, { "epoch": 7.722222222222222, "grad_norm": 16.340526580810547, "learning_rate": 7.244117647058824e-06, "loss": 4.0323, "step": 2780 }, { "epoch": 7.75, "grad_norm": 13.924703598022461, "learning_rate": 7.155882352941176e-06, "loss": 3.8719, "step": 2790 }, { "epoch": 7.777777777777778, "grad_norm": 19.019325256347656, "learning_rate": 7.06764705882353e-06, "loss": 3.9529, "step": 2800 }, { "epoch": 7.805555555555555, "grad_norm": 17.188499450683594, "learning_rate": 6.979411764705882e-06, "loss": 4.0392, "step": 2810 }, { "epoch": 7.833333333333333, "grad_norm": 13.893120765686035, "learning_rate": 6.891176470588235e-06, "loss": 3.9727, "step": 2820 }, { "epoch": 7.861111111111111, "grad_norm": 13.257885932922363, "learning_rate": 6.802941176470589e-06, "loss": 4.0164, "step": 2830 }, { "epoch": 7.888888888888889, "grad_norm": 15.29055118560791, "learning_rate": 6.714705882352941e-06, "loss": 4.0283, "step": 2840 }, { "epoch": 7.916666666666667, "grad_norm": 15.208207130432129, "learning_rate": 6.626470588235294e-06, "loss": 3.9515, "step": 2850 }, { "epoch": 7.944444444444445, "grad_norm": 15.335555076599121, "learning_rate": 6.538235294117647e-06, "loss": 4.0672, "step": 2860 }, { "epoch": 7.972222222222222, "grad_norm": 14.554072380065918, "learning_rate": 6.45e-06, "loss": 4.0757, "step": 2870 }, { "epoch": 8.0, "grad_norm": 17.360929489135742, "learning_rate": 6.361764705882353e-06, "loss": 3.9486, "step": 2880 }, { "epoch": 8.027777777777779, "grad_norm": 11.244579315185547, "learning_rate": 6.273529411764706e-06, "loss": 3.8162, "step": 2890 }, { "epoch": 8.055555555555555, "grad_norm": 16.21691131591797, "learning_rate": 6.185294117647059e-06, "loss": 3.8226, "step": 2900 }, { "epoch": 8.083333333333334, "grad_norm": 16.201114654541016, "learning_rate": 6.097058823529412e-06, "loss": 3.7959, "step": 2910 }, { "epoch": 8.11111111111111, "grad_norm": 17.415403366088867, "learning_rate": 6.008823529411765e-06, "loss": 3.7922, "step": 2920 }, { "epoch": 8.13888888888889, "grad_norm": 10.362262725830078, "learning_rate": 5.920588235294118e-06, "loss": 3.7602, "step": 2930 }, { "epoch": 8.166666666666666, "grad_norm": 12.301746368408203, "learning_rate": 5.8323529411764706e-06, "loss": 3.8236, "step": 2940 }, { "epoch": 8.194444444444445, "grad_norm": 15.563980102539062, "learning_rate": 5.744117647058824e-06, "loss": 3.7988, "step": 2950 }, { "epoch": 8.222222222222221, "grad_norm": 14.712089538574219, "learning_rate": 5.655882352941177e-06, "loss": 3.8601, "step": 2960 }, { "epoch": 8.25, "grad_norm": 16.73733901977539, "learning_rate": 5.5676470588235296e-06, "loss": 3.7956, "step": 2970 }, { "epoch": 8.277777777777779, "grad_norm": 15.459855079650879, "learning_rate": 5.479411764705883e-06, "loss": 3.7525, "step": 2980 }, { "epoch": 8.305555555555555, "grad_norm": 16.97991943359375, "learning_rate": 5.391176470588236e-06, "loss": 3.7536, "step": 2990 }, { "epoch": 8.333333333333334, "grad_norm": 13.196775436401367, "learning_rate": 5.302941176470589e-06, "loss": 3.88, "step": 3000 }, { "epoch": 8.36111111111111, "grad_norm": 15.121062278747559, "learning_rate": 5.214705882352941e-06, "loss": 3.8269, "step": 3010 }, { "epoch": 8.38888888888889, "grad_norm": 17.251754760742188, "learning_rate": 5.1264705882352935e-06, "loss": 3.8579, "step": 3020 }, { "epoch": 8.416666666666666, "grad_norm": 14.106273651123047, "learning_rate": 5.038235294117647e-06, "loss": 3.8255, "step": 3030 }, { "epoch": 8.444444444444445, "grad_norm": 16.304101943969727, "learning_rate": 4.95e-06, "loss": 3.8521, "step": 3040 }, { "epoch": 8.472222222222221, "grad_norm": 25.067447662353516, "learning_rate": 4.8617647058823525e-06, "loss": 3.8901, "step": 3050 }, { "epoch": 8.5, "grad_norm": 15.186288833618164, "learning_rate": 4.773529411764706e-06, "loss": 3.8275, "step": 3060 }, { "epoch": 8.527777777777779, "grad_norm": 14.004067420959473, "learning_rate": 4.685294117647059e-06, "loss": 3.9064, "step": 3070 }, { "epoch": 8.555555555555555, "grad_norm": 14.653326988220215, "learning_rate": 4.5970588235294115e-06, "loss": 3.7708, "step": 3080 }, { "epoch": 8.583333333333334, "grad_norm": 15.567551612854004, "learning_rate": 4.508823529411765e-06, "loss": 3.8827, "step": 3090 }, { "epoch": 8.61111111111111, "grad_norm": 15.521193504333496, "learning_rate": 4.420588235294117e-06, "loss": 3.7874, "step": 3100 }, { "epoch": 8.63888888888889, "grad_norm": 13.801298141479492, "learning_rate": 4.3323529411764705e-06, "loss": 3.8132, "step": 3110 }, { "epoch": 8.666666666666666, "grad_norm": 15.118513107299805, "learning_rate": 4.244117647058824e-06, "loss": 3.7819, "step": 3120 }, { "epoch": 8.694444444444445, "grad_norm": 14.111237525939941, "learning_rate": 4.155882352941176e-06, "loss": 3.8452, "step": 3130 }, { "epoch": 8.722222222222221, "grad_norm": 13.785816192626953, "learning_rate": 4.0676470588235295e-06, "loss": 3.8541, "step": 3140 }, { "epoch": 8.75, "grad_norm": 13.524584770202637, "learning_rate": 3.979411764705883e-06, "loss": 3.8521, "step": 3150 }, { "epoch": 8.777777777777779, "grad_norm": 15.52719497680664, "learning_rate": 3.891176470588235e-06, "loss": 3.8354, "step": 3160 }, { "epoch": 8.805555555555555, "grad_norm": 17.874053955078125, "learning_rate": 3.8029411764705885e-06, "loss": 3.794, "step": 3170 }, { "epoch": 8.833333333333334, "grad_norm": 15.086000442504883, "learning_rate": 3.7147058823529414e-06, "loss": 3.7661, "step": 3180 }, { "epoch": 8.86111111111111, "grad_norm": 18.586685180664062, "learning_rate": 3.6264705882352943e-06, "loss": 3.8781, "step": 3190 }, { "epoch": 8.88888888888889, "grad_norm": 16.09383201599121, "learning_rate": 3.538235294117647e-06, "loss": 3.8242, "step": 3200 }, { "epoch": 8.916666666666666, "grad_norm": 13.737457275390625, "learning_rate": 3.4500000000000004e-06, "loss": 3.8439, "step": 3210 }, { "epoch": 8.944444444444445, "grad_norm": 14.633013725280762, "learning_rate": 3.361764705882353e-06, "loss": 3.8111, "step": 3220 }, { "epoch": 8.972222222222221, "grad_norm": 12.145355224609375, "learning_rate": 3.2735294117647057e-06, "loss": 3.8591, "step": 3230 }, { "epoch": 9.0, "grad_norm": 15.101285934448242, "learning_rate": 3.1852941176470586e-06, "loss": 3.8327, "step": 3240 }, { "epoch": 9.027777777777779, "grad_norm": 12.12878131866455, "learning_rate": 3.097058823529412e-06, "loss": 3.7079, "step": 3250 }, { "epoch": 9.055555555555555, "grad_norm": 11.098604202270508, "learning_rate": 3.0088235294117647e-06, "loss": 3.6859, "step": 3260 }, { "epoch": 9.083333333333334, "grad_norm": 11.60837459564209, "learning_rate": 2.9205882352941176e-06, "loss": 3.6906, "step": 3270 }, { "epoch": 9.11111111111111, "grad_norm": 11.516129493713379, "learning_rate": 2.8323529411764705e-06, "loss": 3.7629, "step": 3280 }, { "epoch": 9.13888888888889, "grad_norm": 11.439606666564941, "learning_rate": 2.7441176470588238e-06, "loss": 3.7087, "step": 3290 }, { "epoch": 9.166666666666666, "grad_norm": 12.872289657592773, "learning_rate": 2.6558823529411766e-06, "loss": 3.754, "step": 3300 }, { "epoch": 9.194444444444445, "grad_norm": 13.252586364746094, "learning_rate": 2.5676470588235295e-06, "loss": 3.7477, "step": 3310 }, { "epoch": 9.222222222222221, "grad_norm": 14.191280364990234, "learning_rate": 2.4794117647058824e-06, "loss": 3.7703, "step": 3320 }, { "epoch": 9.25, "grad_norm": 17.2321720123291, "learning_rate": 2.3911764705882356e-06, "loss": 3.7569, "step": 3330 }, { "epoch": 9.277777777777779, "grad_norm": 15.821660995483398, "learning_rate": 2.3029411764705885e-06, "loss": 3.7321, "step": 3340 }, { "epoch": 9.305555555555555, "grad_norm": 10.092562675476074, "learning_rate": 2.2147058823529414e-06, "loss": 3.7033, "step": 3350 }, { "epoch": 9.333333333333334, "grad_norm": 19.321290969848633, "learning_rate": 2.1264705882352942e-06, "loss": 3.7683, "step": 3360 }, { "epoch": 9.36111111111111, "grad_norm": 15.781285285949707, "learning_rate": 2.038235294117647e-06, "loss": 3.7304, "step": 3370 }, { "epoch": 9.38888888888889, "grad_norm": 13.418021202087402, "learning_rate": 1.95e-06, "loss": 3.7117, "step": 3380 }, { "epoch": 9.416666666666666, "grad_norm": 14.020906448364258, "learning_rate": 1.861764705882353e-06, "loss": 3.7131, "step": 3390 }, { "epoch": 9.444444444444445, "grad_norm": 13.270752906799316, "learning_rate": 1.773529411764706e-06, "loss": 3.6744, "step": 3400 }, { "epoch": 9.472222222222221, "grad_norm": 12.589795112609863, "learning_rate": 1.6852941176470588e-06, "loss": 3.7544, "step": 3410 }, { "epoch": 9.5, "grad_norm": 13.454065322875977, "learning_rate": 1.5970588235294118e-06, "loss": 3.7782, "step": 3420 }, { "epoch": 9.527777777777779, "grad_norm": 14.777534484863281, "learning_rate": 1.5088235294117647e-06, "loss": 3.7203, "step": 3430 }, { "epoch": 9.555555555555555, "grad_norm": 14.142292022705078, "learning_rate": 1.4205882352941178e-06, "loss": 3.7331, "step": 3440 }, { "epoch": 9.583333333333334, "grad_norm": 17.070104598999023, "learning_rate": 1.3323529411764706e-06, "loss": 3.7318, "step": 3450 }, { "epoch": 9.61111111111111, "grad_norm": 13.819690704345703, "learning_rate": 1.2441176470588237e-06, "loss": 3.7379, "step": 3460 }, { "epoch": 9.63888888888889, "grad_norm": 13.859333038330078, "learning_rate": 1.1558823529411764e-06, "loss": 3.6641, "step": 3470 }, { "epoch": 9.666666666666666, "grad_norm": 12.934626579284668, "learning_rate": 1.0676470588235295e-06, "loss": 3.7164, "step": 3480 }, { "epoch": 9.694444444444445, "grad_norm": 13.10987377166748, "learning_rate": 9.794117647058823e-07, "loss": 3.7427, "step": 3490 }, { "epoch": 9.722222222222221, "grad_norm": 13.990032196044922, "learning_rate": 8.911764705882353e-07, "loss": 3.7195, "step": 3500 }, { "epoch": 9.75, "grad_norm": 13.015247344970703, "learning_rate": 8.029411764705883e-07, "loss": 3.7378, "step": 3510 }, { "epoch": 9.777777777777779, "grad_norm": 13.674323081970215, "learning_rate": 7.147058823529411e-07, "loss": 3.7118, "step": 3520 }, { "epoch": 9.805555555555555, "grad_norm": 11.697880744934082, "learning_rate": 6.264705882352941e-07, "loss": 3.7387, "step": 3530 }, { "epoch": 9.833333333333334, "grad_norm": 13.14771842956543, "learning_rate": 5.382352941176471e-07, "loss": 3.6967, "step": 3540 }, { "epoch": 9.86111111111111, "grad_norm": 12.617823600769043, "learning_rate": 4.5e-07, "loss": 3.7122, "step": 3550 }, { "epoch": 9.88888888888889, "grad_norm": 14.002346992492676, "learning_rate": 3.6176470588235295e-07, "loss": 3.7429, "step": 3560 }, { "epoch": 9.916666666666666, "grad_norm": 14.497004508972168, "learning_rate": 2.7352941176470587e-07, "loss": 3.7298, "step": 3570 }, { "epoch": 9.944444444444445, "grad_norm": 12.17514419555664, "learning_rate": 1.852941176470588e-07, "loss": 3.7254, "step": 3580 }, { "epoch": 9.972222222222221, "grad_norm": 17.909778594970703, "learning_rate": 9.705882352941177e-08, "loss": 3.7097, "step": 3590 }, { "epoch": 10.0, "grad_norm": 13.537755012512207, "learning_rate": 8.823529411764706e-09, "loss": 3.7612, "step": 3600 } ], "logging_steps": 10, "max_steps": 3600, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4403097501696000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }