| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 3600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.027777777777777776, | |
| "grad_norm": 104.33584594726562, | |
| "learning_rate": 1.35e-06, | |
| "loss": 13.5343, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05555555555555555, | |
| "grad_norm": 57.29974365234375, | |
| "learning_rate": 2.8500000000000002e-06, | |
| "loss": 12.1877, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 35.94429016113281, | |
| "learning_rate": 4.35e-06, | |
| "loss": 10.9908, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1111111111111111, | |
| "grad_norm": 36.21198272705078, | |
| "learning_rate": 5.850000000000001e-06, | |
| "loss": 10.8169, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1388888888888889, | |
| "grad_norm": 35.18545150756836, | |
| "learning_rate": 7.35e-06, | |
| "loss": 10.226, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.16666666666666666, | |
| "grad_norm": 30.034957885742188, | |
| "learning_rate": 8.85e-06, | |
| "loss": 9.8243, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.19444444444444445, | |
| "grad_norm": 26.107460021972656, | |
| "learning_rate": 1.035e-05, | |
| "loss": 9.7202, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 28.405378341674805, | |
| "learning_rate": 1.185e-05, | |
| "loss": 9.655, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 31.871849060058594, | |
| "learning_rate": 1.3350000000000001e-05, | |
| "loss": 9.1121, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2777777777777778, | |
| "grad_norm": 31.50947380065918, | |
| "learning_rate": 1.485e-05, | |
| "loss": 9.1896, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3055555555555556, | |
| "grad_norm": 28.950199127197266, | |
| "learning_rate": 1.635e-05, | |
| "loss": 9.2017, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 25.37229347229004, | |
| "learning_rate": 1.785e-05, | |
| "loss": 9.2022, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.3611111111111111, | |
| "grad_norm": 24.216697692871094, | |
| "learning_rate": 1.935e-05, | |
| "loss": 9.1344, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.3888888888888889, | |
| "grad_norm": 24.010272979736328, | |
| "learning_rate": 2.085e-05, | |
| "loss": 8.5873, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.4166666666666667, | |
| "grad_norm": 24.390907287597656, | |
| "learning_rate": 2.235e-05, | |
| "loss": 8.7195, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 21.80168914794922, | |
| "learning_rate": 2.385e-05, | |
| "loss": 8.5485, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.4722222222222222, | |
| "grad_norm": 20.435110092163086, | |
| "learning_rate": 2.535e-05, | |
| "loss": 8.6359, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 24.11395835876465, | |
| "learning_rate": 2.6850000000000002e-05, | |
| "loss": 8.4246, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.5277777777777778, | |
| "grad_norm": 25.995811462402344, | |
| "learning_rate": 2.8349999999999998e-05, | |
| "loss": 8.5049, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5555555555555556, | |
| "grad_norm": 23.826154708862305, | |
| "learning_rate": 2.985e-05, | |
| "loss": 8.4623, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5833333333333334, | |
| "grad_norm": 20.094274520874023, | |
| "learning_rate": 2.9920588235294118e-05, | |
| "loss": 8.4495, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6111111111111112, | |
| "grad_norm": 22.793804168701172, | |
| "learning_rate": 2.983235294117647e-05, | |
| "loss": 8.4131, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.6388888888888888, | |
| "grad_norm": 26.08730125427246, | |
| "learning_rate": 2.9744117647058824e-05, | |
| "loss": 8.3116, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 24.06973648071289, | |
| "learning_rate": 2.9655882352941176e-05, | |
| "loss": 8.4003, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.6944444444444444, | |
| "grad_norm": 22.93770408630371, | |
| "learning_rate": 2.956764705882353e-05, | |
| "loss": 8.2579, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.7222222222222222, | |
| "grad_norm": 20.36606216430664, | |
| "learning_rate": 2.9479411764705883e-05, | |
| "loss": 8.2888, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 21.670818328857422, | |
| "learning_rate": 2.9391176470588238e-05, | |
| "loss": 8.1623, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7777777777777778, | |
| "grad_norm": 17.550491333007812, | |
| "learning_rate": 2.930294117647059e-05, | |
| "loss": 8.0683, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.8055555555555556, | |
| "grad_norm": 25.335660934448242, | |
| "learning_rate": 2.9214705882352944e-05, | |
| "loss": 8.2784, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.8333333333333334, | |
| "grad_norm": 20.76925277709961, | |
| "learning_rate": 2.9126470588235292e-05, | |
| "loss": 8.2923, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.8611111111111112, | |
| "grad_norm": 23.108144760131836, | |
| "learning_rate": 2.9038235294117647e-05, | |
| "loss": 8.1554, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 16.563575744628906, | |
| "learning_rate": 2.895e-05, | |
| "loss": 7.8543, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.9166666666666666, | |
| "grad_norm": 19.03083038330078, | |
| "learning_rate": 2.8861764705882354e-05, | |
| "loss": 8.0434, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.9444444444444444, | |
| "grad_norm": 18.121356964111328, | |
| "learning_rate": 2.8773529411764706e-05, | |
| "loss": 7.93, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.9722222222222222, | |
| "grad_norm": 21.65605926513672, | |
| "learning_rate": 2.868529411764706e-05, | |
| "loss": 8.0124, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 24.55741310119629, | |
| "learning_rate": 2.8597058823529412e-05, | |
| "loss": 7.547, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.0277777777777777, | |
| "grad_norm": 18.358055114746094, | |
| "learning_rate": 2.8508823529411767e-05, | |
| "loss": 6.9283, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.0555555555555556, | |
| "grad_norm": 18.642698287963867, | |
| "learning_rate": 2.842058823529412e-05, | |
| "loss": 7.1341, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.0833333333333333, | |
| "grad_norm": 30.198606491088867, | |
| "learning_rate": 2.8332352941176474e-05, | |
| "loss": 7.113, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 19.851511001586914, | |
| "learning_rate": 2.8244117647058825e-05, | |
| "loss": 7.1696, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.1388888888888888, | |
| "grad_norm": 20.48548698425293, | |
| "learning_rate": 2.8155882352941177e-05, | |
| "loss": 7.1552, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.1666666666666667, | |
| "grad_norm": 18.470821380615234, | |
| "learning_rate": 2.806764705882353e-05, | |
| "loss": 6.8519, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.1944444444444444, | |
| "grad_norm": 20.821441650390625, | |
| "learning_rate": 2.7979411764705883e-05, | |
| "loss": 7.0113, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.2222222222222223, | |
| "grad_norm": 19.294477462768555, | |
| "learning_rate": 2.7891176470588235e-05, | |
| "loss": 7.3983, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 19.5776309967041, | |
| "learning_rate": 2.780294117647059e-05, | |
| "loss": 7.0649, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.2777777777777777, | |
| "grad_norm": 18.286705017089844, | |
| "learning_rate": 2.771470588235294e-05, | |
| "loss": 7.1587, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.3055555555555556, | |
| "grad_norm": 19.803571701049805, | |
| "learning_rate": 2.7626470588235297e-05, | |
| "loss": 6.9439, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 20.182926177978516, | |
| "learning_rate": 2.7538235294117648e-05, | |
| "loss": 6.9072, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.3611111111111112, | |
| "grad_norm": 21.21506690979004, | |
| "learning_rate": 2.7450000000000003e-05, | |
| "loss": 6.7805, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.3888888888888888, | |
| "grad_norm": 18.884464263916016, | |
| "learning_rate": 2.7361764705882355e-05, | |
| "loss": 6.7809, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.4166666666666667, | |
| "grad_norm": 18.7882080078125, | |
| "learning_rate": 2.7273529411764706e-05, | |
| "loss": 7.1101, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.4444444444444444, | |
| "grad_norm": 17.172212600708008, | |
| "learning_rate": 2.7185294117647058e-05, | |
| "loss": 6.8579, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.4722222222222223, | |
| "grad_norm": 18.599943161010742, | |
| "learning_rate": 2.7097058823529413e-05, | |
| "loss": 6.9192, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 22.43140411376953, | |
| "learning_rate": 2.7008823529411765e-05, | |
| "loss": 6.9068, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.5277777777777777, | |
| "grad_norm": 16.94782257080078, | |
| "learning_rate": 2.692058823529412e-05, | |
| "loss": 7.0826, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 18.50872802734375, | |
| "learning_rate": 2.683235294117647e-05, | |
| "loss": 6.8281, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.5833333333333335, | |
| "grad_norm": 22.08888053894043, | |
| "learning_rate": 2.6744117647058826e-05, | |
| "loss": 6.9378, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.6111111111111112, | |
| "grad_norm": 17.366363525390625, | |
| "learning_rate": 2.6655882352941178e-05, | |
| "loss": 6.6598, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.6388888888888888, | |
| "grad_norm": 17.86391830444336, | |
| "learning_rate": 2.6567647058823533e-05, | |
| "loss": 6.8419, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 16.568017959594727, | |
| "learning_rate": 2.6479411764705884e-05, | |
| "loss": 6.9137, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.6944444444444444, | |
| "grad_norm": 21.780588150024414, | |
| "learning_rate": 2.6391176470588236e-05, | |
| "loss": 7.0719, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.7222222222222223, | |
| "grad_norm": 20.24903678894043, | |
| "learning_rate": 2.6302941176470587e-05, | |
| "loss": 6.8612, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 23.022014617919922, | |
| "learning_rate": 2.621470588235294e-05, | |
| "loss": 6.815, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 19.738136291503906, | |
| "learning_rate": 2.6126470588235294e-05, | |
| "loss": 7.0848, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.8055555555555556, | |
| "grad_norm": 19.904956817626953, | |
| "learning_rate": 2.6038235294117646e-05, | |
| "loss": 7.1127, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.8333333333333335, | |
| "grad_norm": 20.379528045654297, | |
| "learning_rate": 2.595e-05, | |
| "loss": 7.1542, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.8611111111111112, | |
| "grad_norm": 17.21288299560547, | |
| "learning_rate": 2.5861764705882352e-05, | |
| "loss": 6.4236, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 19.684776306152344, | |
| "learning_rate": 2.5773529411764707e-05, | |
| "loss": 6.97, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.9166666666666665, | |
| "grad_norm": 22.62903594970703, | |
| "learning_rate": 2.568529411764706e-05, | |
| "loss": 6.7521, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.9444444444444444, | |
| "grad_norm": 18.688766479492188, | |
| "learning_rate": 2.5597058823529414e-05, | |
| "loss": 6.7485, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.9722222222222223, | |
| "grad_norm": 19.558578491210938, | |
| "learning_rate": 2.5508823529411765e-05, | |
| "loss": 6.6582, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 20.0054874420166, | |
| "learning_rate": 2.542058823529412e-05, | |
| "loss": 6.4649, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.0277777777777777, | |
| "grad_norm": 17.48706817626953, | |
| "learning_rate": 2.533235294117647e-05, | |
| "loss": 6.0975, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.0555555555555554, | |
| "grad_norm": 21.419391632080078, | |
| "learning_rate": 2.5244117647058823e-05, | |
| "loss": 5.8201, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.0833333333333335, | |
| "grad_norm": 19.62574005126953, | |
| "learning_rate": 2.5155882352941175e-05, | |
| "loss": 5.792, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.111111111111111, | |
| "grad_norm": 24.182546615600586, | |
| "learning_rate": 2.506764705882353e-05, | |
| "loss": 6.002, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.138888888888889, | |
| "grad_norm": 17.403255462646484, | |
| "learning_rate": 2.497941176470588e-05, | |
| "loss": 5.8743, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.1666666666666665, | |
| "grad_norm": 19.421232223510742, | |
| "learning_rate": 2.4891176470588237e-05, | |
| "loss": 6.1047, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.1944444444444446, | |
| "grad_norm": 22.9908504486084, | |
| "learning_rate": 2.4802941176470588e-05, | |
| "loss": 5.9951, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 18.04509925842285, | |
| "learning_rate": 2.4714705882352943e-05, | |
| "loss": 6.1774, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 16.365985870361328, | |
| "learning_rate": 2.4626470588235295e-05, | |
| "loss": 5.9879, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.2777777777777777, | |
| "grad_norm": 19.309423446655273, | |
| "learning_rate": 2.453823529411765e-05, | |
| "loss": 5.9292, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.3055555555555554, | |
| "grad_norm": 24.4521541595459, | |
| "learning_rate": 2.4449999999999998e-05, | |
| "loss": 6.0532, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 22.1500244140625, | |
| "learning_rate": 2.4361764705882353e-05, | |
| "loss": 5.7139, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.361111111111111, | |
| "grad_norm": 18.496788024902344, | |
| "learning_rate": 2.4273529411764705e-05, | |
| "loss": 5.9176, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.388888888888889, | |
| "grad_norm": 19.089303970336914, | |
| "learning_rate": 2.418529411764706e-05, | |
| "loss": 6.0821, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.4166666666666665, | |
| "grad_norm": 18.071590423583984, | |
| "learning_rate": 2.409705882352941e-05, | |
| "loss": 5.9829, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 19.58377456665039, | |
| "learning_rate": 2.4008823529411766e-05, | |
| "loss": 5.894, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.4722222222222223, | |
| "grad_norm": 20.550006866455078, | |
| "learning_rate": 2.3920588235294118e-05, | |
| "loss": 6.0293, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 22.616382598876953, | |
| "learning_rate": 2.3832352941176473e-05, | |
| "loss": 6.037, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.5277777777777777, | |
| "grad_norm": 22.372953414916992, | |
| "learning_rate": 2.3744117647058824e-05, | |
| "loss": 5.9353, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.5555555555555554, | |
| "grad_norm": 18.03106117248535, | |
| "learning_rate": 2.365588235294118e-05, | |
| "loss": 5.8287, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.5833333333333335, | |
| "grad_norm": 16.170204162597656, | |
| "learning_rate": 2.356764705882353e-05, | |
| "loss": 5.7742, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.611111111111111, | |
| "grad_norm": 17.15852165222168, | |
| "learning_rate": 2.3479411764705882e-05, | |
| "loss": 6.0491, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.638888888888889, | |
| "grad_norm": 20.76968002319336, | |
| "learning_rate": 2.3391176470588234e-05, | |
| "loss": 5.9122, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 21.134782791137695, | |
| "learning_rate": 2.330294117647059e-05, | |
| "loss": 5.8734, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.6944444444444446, | |
| "grad_norm": 18.72968864440918, | |
| "learning_rate": 2.321470588235294e-05, | |
| "loss": 5.9093, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.7222222222222223, | |
| "grad_norm": 23.75607681274414, | |
| "learning_rate": 2.3126470588235296e-05, | |
| "loss": 5.9551, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 19.960378646850586, | |
| "learning_rate": 2.3038235294117647e-05, | |
| "loss": 6.0045, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.7777777777777777, | |
| "grad_norm": 16.736736297607422, | |
| "learning_rate": 2.2950000000000002e-05, | |
| "loss": 5.9759, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.8055555555555554, | |
| "grad_norm": 22.561372756958008, | |
| "learning_rate": 2.2861764705882354e-05, | |
| "loss": 5.9796, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.8333333333333335, | |
| "grad_norm": 16.96518898010254, | |
| "learning_rate": 2.277352941176471e-05, | |
| "loss": 6.0321, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.861111111111111, | |
| "grad_norm": 20.031225204467773, | |
| "learning_rate": 2.268529411764706e-05, | |
| "loss": 5.9221, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 16.219301223754883, | |
| "learning_rate": 2.2597058823529415e-05, | |
| "loss": 6.0382, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.9166666666666665, | |
| "grad_norm": 19.537729263305664, | |
| "learning_rate": 2.2508823529411764e-05, | |
| "loss": 5.8556, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.9444444444444446, | |
| "grad_norm": 20.713224411010742, | |
| "learning_rate": 2.242058823529412e-05, | |
| "loss": 5.9832, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.9722222222222223, | |
| "grad_norm": 18.94963836669922, | |
| "learning_rate": 2.233235294117647e-05, | |
| "loss": 6.0023, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 20.285676956176758, | |
| "learning_rate": 2.2244117647058825e-05, | |
| "loss": 5.8155, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.0277777777777777, | |
| "grad_norm": 17.27924346923828, | |
| "learning_rate": 2.2155882352941177e-05, | |
| "loss": 5.2819, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.0555555555555554, | |
| "grad_norm": 21.477184295654297, | |
| "learning_rate": 2.206764705882353e-05, | |
| "loss": 5.1979, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.0833333333333335, | |
| "grad_norm": 15.144262313842773, | |
| "learning_rate": 2.1979411764705883e-05, | |
| "loss": 5.1172, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.111111111111111, | |
| "grad_norm": 22.37119483947754, | |
| "learning_rate": 2.1891176470588238e-05, | |
| "loss": 5.2262, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.138888888888889, | |
| "grad_norm": 20.329545974731445, | |
| "learning_rate": 2.180294117647059e-05, | |
| "loss": 5.2281, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 3.1666666666666665, | |
| "grad_norm": 17.549114227294922, | |
| "learning_rate": 2.1714705882352945e-05, | |
| "loss": 5.1716, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 3.1944444444444446, | |
| "grad_norm": 16.489368438720703, | |
| "learning_rate": 2.1626470588235293e-05, | |
| "loss": 5.3562, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 3.2222222222222223, | |
| "grad_norm": 18.964805603027344, | |
| "learning_rate": 2.1538235294117648e-05, | |
| "loss": 5.5189, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 18.049579620361328, | |
| "learning_rate": 2.145e-05, | |
| "loss": 5.1967, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 3.2777777777777777, | |
| "grad_norm": 18.76259422302246, | |
| "learning_rate": 2.136176470588235e-05, | |
| "loss": 5.2686, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 3.3055555555555554, | |
| "grad_norm": 20.977890014648438, | |
| "learning_rate": 2.1273529411764706e-05, | |
| "loss": 5.1385, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 19.02361488342285, | |
| "learning_rate": 2.1185294117647058e-05, | |
| "loss": 5.3758, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 3.361111111111111, | |
| "grad_norm": 17.559446334838867, | |
| "learning_rate": 2.1097058823529413e-05, | |
| "loss": 5.3187, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 3.388888888888889, | |
| "grad_norm": 21.838626861572266, | |
| "learning_rate": 2.1008823529411764e-05, | |
| "loss": 5.3414, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 3.4166666666666665, | |
| "grad_norm": 19.832502365112305, | |
| "learning_rate": 2.092058823529412e-05, | |
| "loss": 5.3392, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 17.20013427734375, | |
| "learning_rate": 2.083235294117647e-05, | |
| "loss": 5.2786, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 3.4722222222222223, | |
| "grad_norm": 19.66887855529785, | |
| "learning_rate": 2.0744117647058826e-05, | |
| "loss": 5.3352, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 19.046735763549805, | |
| "learning_rate": 2.0655882352941174e-05, | |
| "loss": 5.2161, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 3.5277777777777777, | |
| "grad_norm": 17.690959930419922, | |
| "learning_rate": 2.056764705882353e-05, | |
| "loss": 5.3777, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 19.2017822265625, | |
| "learning_rate": 2.047941176470588e-05, | |
| "loss": 5.3262, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 3.5833333333333335, | |
| "grad_norm": 22.48141098022461, | |
| "learning_rate": 2.0391176470588236e-05, | |
| "loss": 5.2689, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 3.611111111111111, | |
| "grad_norm": 21.33321189880371, | |
| "learning_rate": 2.0302941176470587e-05, | |
| "loss": 5.1867, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.638888888888889, | |
| "grad_norm": 20.216909408569336, | |
| "learning_rate": 2.0214705882352942e-05, | |
| "loss": 5.3314, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 3.6666666666666665, | |
| "grad_norm": 19.15237808227539, | |
| "learning_rate": 2.0126470588235294e-05, | |
| "loss": 5.3206, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 3.6944444444444446, | |
| "grad_norm": 17.66857147216797, | |
| "learning_rate": 2.003823529411765e-05, | |
| "loss": 5.4498, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 3.7222222222222223, | |
| "grad_norm": 20.253646850585938, | |
| "learning_rate": 1.995e-05, | |
| "loss": 5.4104, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 19.723072052001953, | |
| "learning_rate": 1.9861764705882355e-05, | |
| "loss": 5.214, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.7777777777777777, | |
| "grad_norm": 17.601451873779297, | |
| "learning_rate": 1.9773529411764704e-05, | |
| "loss": 5.356, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 3.8055555555555554, | |
| "grad_norm": 19.1397705078125, | |
| "learning_rate": 1.968529411764706e-05, | |
| "loss": 5.4965, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 3.8333333333333335, | |
| "grad_norm": 18.656843185424805, | |
| "learning_rate": 1.959705882352941e-05, | |
| "loss": 5.0704, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 3.861111111111111, | |
| "grad_norm": 16.182132720947266, | |
| "learning_rate": 1.9508823529411765e-05, | |
| "loss": 5.1655, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 3.888888888888889, | |
| "grad_norm": 18.828876495361328, | |
| "learning_rate": 1.9420588235294117e-05, | |
| "loss": 5.0816, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.9166666666666665, | |
| "grad_norm": 18.60540199279785, | |
| "learning_rate": 1.9332352941176472e-05, | |
| "loss": 5.2968, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.9444444444444446, | |
| "grad_norm": 17.49435806274414, | |
| "learning_rate": 1.9244117647058823e-05, | |
| "loss": 5.3383, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.9722222222222223, | |
| "grad_norm": 19.32575798034668, | |
| "learning_rate": 1.915588235294118e-05, | |
| "loss": 5.1811, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 23.117151260375977, | |
| "learning_rate": 1.906764705882353e-05, | |
| "loss": 5.3958, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 4.027777777777778, | |
| "grad_norm": 14.918752670288086, | |
| "learning_rate": 1.8979411764705885e-05, | |
| "loss": 4.7873, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 4.055555555555555, | |
| "grad_norm": 16.663442611694336, | |
| "learning_rate": 1.8891176470588236e-05, | |
| "loss": 4.812, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 4.083333333333333, | |
| "grad_norm": 17.157150268554688, | |
| "learning_rate": 1.8802941176470588e-05, | |
| "loss": 4.7987, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 4.111111111111111, | |
| "grad_norm": 18.143966674804688, | |
| "learning_rate": 1.871470588235294e-05, | |
| "loss": 4.6237, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 4.138888888888889, | |
| "grad_norm": 16.317506790161133, | |
| "learning_rate": 1.8626470588235295e-05, | |
| "loss": 4.6253, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 4.166666666666667, | |
| "grad_norm": 23.109792709350586, | |
| "learning_rate": 1.8538235294117646e-05, | |
| "loss": 4.7575, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 4.194444444444445, | |
| "grad_norm": 16.878374099731445, | |
| "learning_rate": 1.845e-05, | |
| "loss": 4.6636, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 18.059541702270508, | |
| "learning_rate": 1.8361764705882353e-05, | |
| "loss": 4.7979, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 16.914222717285156, | |
| "learning_rate": 1.8273529411764708e-05, | |
| "loss": 4.7786, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 4.277777777777778, | |
| "grad_norm": 18.730314254760742, | |
| "learning_rate": 1.818529411764706e-05, | |
| "loss": 4.7931, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 4.305555555555555, | |
| "grad_norm": 17.19397735595703, | |
| "learning_rate": 1.8097058823529414e-05, | |
| "loss": 4.6813, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 4.333333333333333, | |
| "grad_norm": 17.824308395385742, | |
| "learning_rate": 1.8008823529411766e-05, | |
| "loss": 4.8799, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 4.361111111111111, | |
| "grad_norm": 17.733112335205078, | |
| "learning_rate": 1.792058823529412e-05, | |
| "loss": 4.7475, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 4.388888888888889, | |
| "grad_norm": 17.163602828979492, | |
| "learning_rate": 1.783235294117647e-05, | |
| "loss": 4.8217, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 4.416666666666667, | |
| "grad_norm": 19.913904190063477, | |
| "learning_rate": 1.7744117647058824e-05, | |
| "loss": 4.8949, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 18.19684600830078, | |
| "learning_rate": 1.7655882352941176e-05, | |
| "loss": 4.9527, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 4.472222222222222, | |
| "grad_norm": 19.120418548583984, | |
| "learning_rate": 1.756764705882353e-05, | |
| "loss": 4.8928, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 18.0157470703125, | |
| "learning_rate": 1.7479411764705882e-05, | |
| "loss": 4.8177, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 4.527777777777778, | |
| "grad_norm": 20.340044021606445, | |
| "learning_rate": 1.7391176470588237e-05, | |
| "loss": 4.6013, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 4.555555555555555, | |
| "grad_norm": 16.55755043029785, | |
| "learning_rate": 1.730294117647059e-05, | |
| "loss": 4.8209, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 4.583333333333333, | |
| "grad_norm": 17.506793975830078, | |
| "learning_rate": 1.7214705882352944e-05, | |
| "loss": 5.0675, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 4.611111111111111, | |
| "grad_norm": 22.037546157836914, | |
| "learning_rate": 1.7126470588235295e-05, | |
| "loss": 4.8392, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 4.638888888888889, | |
| "grad_norm": 17.09819221496582, | |
| "learning_rate": 1.703823529411765e-05, | |
| "loss": 4.7789, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 23.13201904296875, | |
| "learning_rate": 1.695e-05, | |
| "loss": 4.8017, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 4.694444444444445, | |
| "grad_norm": 19.751766204833984, | |
| "learning_rate": 1.6861764705882354e-05, | |
| "loss": 4.7095, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 4.722222222222222, | |
| "grad_norm": 17.58235740661621, | |
| "learning_rate": 1.6773529411764705e-05, | |
| "loss": 4.7251, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 17.987777709960938, | |
| "learning_rate": 1.668529411764706e-05, | |
| "loss": 4.8577, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 4.777777777777778, | |
| "grad_norm": 21.950464248657227, | |
| "learning_rate": 1.6597058823529412e-05, | |
| "loss": 4.8194, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 4.805555555555555, | |
| "grad_norm": 18.807058334350586, | |
| "learning_rate": 1.6508823529411767e-05, | |
| "loss": 4.8405, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 4.833333333333333, | |
| "grad_norm": 19.066274642944336, | |
| "learning_rate": 1.642058823529412e-05, | |
| "loss": 4.8167, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 4.861111111111111, | |
| "grad_norm": 16.53117561340332, | |
| "learning_rate": 1.633235294117647e-05, | |
| "loss": 4.9061, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 19.067041397094727, | |
| "learning_rate": 1.6244117647058825e-05, | |
| "loss": 4.772, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 4.916666666666667, | |
| "grad_norm": 17.190061569213867, | |
| "learning_rate": 1.6155882352941177e-05, | |
| "loss": 4.8326, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 4.944444444444445, | |
| "grad_norm": 22.074304580688477, | |
| "learning_rate": 1.606764705882353e-05, | |
| "loss": 5.0498, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 4.972222222222222, | |
| "grad_norm": 17.11249542236328, | |
| "learning_rate": 1.597941176470588e-05, | |
| "loss": 4.8728, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 17.489036560058594, | |
| "learning_rate": 1.5891176470588235e-05, | |
| "loss": 4.7712, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 5.027777777777778, | |
| "grad_norm": 18.673629760742188, | |
| "learning_rate": 1.5802941176470586e-05, | |
| "loss": 4.318, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 5.055555555555555, | |
| "grad_norm": 18.343915939331055, | |
| "learning_rate": 1.571470588235294e-05, | |
| "loss": 4.4046, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 5.083333333333333, | |
| "grad_norm": 18.69623565673828, | |
| "learning_rate": 1.5626470588235293e-05, | |
| "loss": 4.4717, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 5.111111111111111, | |
| "grad_norm": 17.767257690429688, | |
| "learning_rate": 1.5538235294117648e-05, | |
| "loss": 4.3915, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 5.138888888888889, | |
| "grad_norm": 18.257091522216797, | |
| "learning_rate": 1.545e-05, | |
| "loss": 4.4195, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 5.166666666666667, | |
| "grad_norm": 18.255918502807617, | |
| "learning_rate": 1.5361764705882354e-05, | |
| "loss": 4.466, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 5.194444444444445, | |
| "grad_norm": 17.16253089904785, | |
| "learning_rate": 1.5273529411764706e-05, | |
| "loss": 4.4639, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 5.222222222222222, | |
| "grad_norm": 23.036542892456055, | |
| "learning_rate": 1.5185294117647061e-05, | |
| "loss": 4.4155, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 5.25, | |
| "grad_norm": 16.37624168395996, | |
| "learning_rate": 1.5097058823529414e-05, | |
| "loss": 4.4856, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 5.277777777777778, | |
| "grad_norm": 15.804655075073242, | |
| "learning_rate": 1.5008823529411764e-05, | |
| "loss": 4.3916, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 5.305555555555555, | |
| "grad_norm": 18.654705047607422, | |
| "learning_rate": 1.492058823529412e-05, | |
| "loss": 4.5315, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 5.333333333333333, | |
| "grad_norm": 17.40276336669922, | |
| "learning_rate": 1.483235294117647e-05, | |
| "loss": 4.4648, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 5.361111111111111, | |
| "grad_norm": 14.804203987121582, | |
| "learning_rate": 1.4744117647058824e-05, | |
| "loss": 4.5326, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 5.388888888888889, | |
| "grad_norm": 18.393539428710938, | |
| "learning_rate": 1.4655882352941177e-05, | |
| "loss": 4.3771, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 5.416666666666667, | |
| "grad_norm": 21.74736213684082, | |
| "learning_rate": 1.456764705882353e-05, | |
| "loss": 4.424, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 5.444444444444445, | |
| "grad_norm": 18.945545196533203, | |
| "learning_rate": 1.4479411764705884e-05, | |
| "loss": 4.3918, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 5.472222222222222, | |
| "grad_norm": 21.048032760620117, | |
| "learning_rate": 1.4391176470588237e-05, | |
| "loss": 4.4165, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 5.5, | |
| "grad_norm": 17.606342315673828, | |
| "learning_rate": 1.4302941176470589e-05, | |
| "loss": 4.4718, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 5.527777777777778, | |
| "grad_norm": 16.075519561767578, | |
| "learning_rate": 1.4214705882352942e-05, | |
| "loss": 4.3442, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 5.555555555555555, | |
| "grad_norm": 15.067805290222168, | |
| "learning_rate": 1.4126470588235295e-05, | |
| "loss": 4.4641, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 5.583333333333333, | |
| "grad_norm": 16.70688819885254, | |
| "learning_rate": 1.4038235294117649e-05, | |
| "loss": 4.4619, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 5.611111111111111, | |
| "grad_norm": 20.45810317993164, | |
| "learning_rate": 1.395e-05, | |
| "loss": 4.4448, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 5.638888888888889, | |
| "grad_norm": 19.671323776245117, | |
| "learning_rate": 1.3861764705882352e-05, | |
| "loss": 4.5246, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 5.666666666666667, | |
| "grad_norm": 17.035329818725586, | |
| "learning_rate": 1.3773529411764705e-05, | |
| "loss": 4.3425, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 5.694444444444445, | |
| "grad_norm": 18.257850646972656, | |
| "learning_rate": 1.3685294117647058e-05, | |
| "loss": 4.3883, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 5.722222222222222, | |
| "grad_norm": 17.29006004333496, | |
| "learning_rate": 1.3597058823529412e-05, | |
| "loss": 4.4048, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 5.75, | |
| "grad_norm": 16.57717514038086, | |
| "learning_rate": 1.3508823529411765e-05, | |
| "loss": 4.4456, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 5.777777777777778, | |
| "grad_norm": 21.21561622619629, | |
| "learning_rate": 1.3420588235294117e-05, | |
| "loss": 4.5329, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 5.805555555555555, | |
| "grad_norm": 18.407976150512695, | |
| "learning_rate": 1.333235294117647e-05, | |
| "loss": 4.6047, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 5.833333333333333, | |
| "grad_norm": 15.929880142211914, | |
| "learning_rate": 1.3244117647058823e-05, | |
| "loss": 4.421, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 5.861111111111111, | |
| "grad_norm": 16.92407989501953, | |
| "learning_rate": 1.3155882352941176e-05, | |
| "loss": 4.4857, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 5.888888888888889, | |
| "grad_norm": 17.74669647216797, | |
| "learning_rate": 1.306764705882353e-05, | |
| "loss": 4.4654, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 5.916666666666667, | |
| "grad_norm": 19.744373321533203, | |
| "learning_rate": 1.2979411764705881e-05, | |
| "loss": 4.3837, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 5.944444444444445, | |
| "grad_norm": 15.323151588439941, | |
| "learning_rate": 1.2891176470588235e-05, | |
| "loss": 4.3457, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 5.972222222222222, | |
| "grad_norm": 18.93714141845703, | |
| "learning_rate": 1.2802941176470588e-05, | |
| "loss": 4.3657, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 19.87427520751953, | |
| "learning_rate": 1.2714705882352941e-05, | |
| "loss": 4.4179, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 6.027777777777778, | |
| "grad_norm": 15.826274871826172, | |
| "learning_rate": 1.2626470588235295e-05, | |
| "loss": 4.1858, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 6.055555555555555, | |
| "grad_norm": 16.08074378967285, | |
| "learning_rate": 1.2538235294117648e-05, | |
| "loss": 4.1317, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 6.083333333333333, | |
| "grad_norm": 15.976150512695312, | |
| "learning_rate": 1.245e-05, | |
| "loss": 4.1214, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 6.111111111111111, | |
| "grad_norm": 17.065427780151367, | |
| "learning_rate": 1.2361764705882353e-05, | |
| "loss": 4.0471, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 6.138888888888889, | |
| "grad_norm": 15.556553840637207, | |
| "learning_rate": 1.2273529411764706e-05, | |
| "loss": 4.0961, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 6.166666666666667, | |
| "grad_norm": 16.178955078125, | |
| "learning_rate": 1.218529411764706e-05, | |
| "loss": 4.1129, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 6.194444444444445, | |
| "grad_norm": 17.971904754638672, | |
| "learning_rate": 1.2097058823529413e-05, | |
| "loss": 4.111, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 6.222222222222222, | |
| "grad_norm": 17.23676109313965, | |
| "learning_rate": 1.2008823529411764e-05, | |
| "loss": 4.209, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 6.25, | |
| "grad_norm": 18.453350067138672, | |
| "learning_rate": 1.1920588235294117e-05, | |
| "loss": 4.1997, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 6.277777777777778, | |
| "grad_norm": 17.884140014648438, | |
| "learning_rate": 1.183235294117647e-05, | |
| "loss": 4.1343, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 6.305555555555555, | |
| "grad_norm": 18.340280532836914, | |
| "learning_rate": 1.1744117647058824e-05, | |
| "loss": 4.1262, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 6.333333333333333, | |
| "grad_norm": 18.307395935058594, | |
| "learning_rate": 1.1655882352941177e-05, | |
| "loss": 4.1307, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 6.361111111111111, | |
| "grad_norm": 17.725727081298828, | |
| "learning_rate": 1.1567647058823529e-05, | |
| "loss": 4.0369, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 6.388888888888889, | |
| "grad_norm": 14.209752082824707, | |
| "learning_rate": 1.1479411764705882e-05, | |
| "loss": 4.1071, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 6.416666666666667, | |
| "grad_norm": 21.862701416015625, | |
| "learning_rate": 1.1391176470588235e-05, | |
| "loss": 4.223, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 6.444444444444445, | |
| "grad_norm": 13.577404975891113, | |
| "learning_rate": 1.1302941176470589e-05, | |
| "loss": 4.2441, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 6.472222222222222, | |
| "grad_norm": 16.62024688720703, | |
| "learning_rate": 1.1214705882352942e-05, | |
| "loss": 4.1793, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 6.5, | |
| "grad_norm": 17.887189865112305, | |
| "learning_rate": 1.1126470588235295e-05, | |
| "loss": 4.1837, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 6.527777777777778, | |
| "grad_norm": 16.476367950439453, | |
| "learning_rate": 1.1038235294117647e-05, | |
| "loss": 4.1526, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 6.555555555555555, | |
| "grad_norm": 17.012876510620117, | |
| "learning_rate": 1.095e-05, | |
| "loss": 4.1965, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 6.583333333333333, | |
| "grad_norm": 17.46245765686035, | |
| "learning_rate": 1.0861764705882353e-05, | |
| "loss": 4.1394, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 6.611111111111111, | |
| "grad_norm": 17.8790340423584, | |
| "learning_rate": 1.0773529411764707e-05, | |
| "loss": 4.1404, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 6.638888888888889, | |
| "grad_norm": 16.047483444213867, | |
| "learning_rate": 1.068529411764706e-05, | |
| "loss": 4.1839, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 15.634733200073242, | |
| "learning_rate": 1.0597058823529412e-05, | |
| "loss": 4.1608, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 6.694444444444445, | |
| "grad_norm": 15.73880386352539, | |
| "learning_rate": 1.0508823529411765e-05, | |
| "loss": 4.1989, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 6.722222222222222, | |
| "grad_norm": 18.751541137695312, | |
| "learning_rate": 1.0420588235294118e-05, | |
| "loss": 4.215, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 6.75, | |
| "grad_norm": 18.6307430267334, | |
| "learning_rate": 1.0332352941176472e-05, | |
| "loss": 4.1345, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 6.777777777777778, | |
| "grad_norm": 18.48142433166504, | |
| "learning_rate": 1.0244117647058825e-05, | |
| "loss": 4.2856, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 6.805555555555555, | |
| "grad_norm": 17.484237670898438, | |
| "learning_rate": 1.0155882352941176e-05, | |
| "loss": 4.223, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 6.833333333333333, | |
| "grad_norm": 16.67469024658203, | |
| "learning_rate": 1.006764705882353e-05, | |
| "loss": 4.1748, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 6.861111111111111, | |
| "grad_norm": 18.385995864868164, | |
| "learning_rate": 9.979411764705883e-06, | |
| "loss": 4.1886, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 6.888888888888889, | |
| "grad_norm": 20.845624923706055, | |
| "learning_rate": 9.891176470588236e-06, | |
| "loss": 4.1832, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 6.916666666666667, | |
| "grad_norm": 18.856298446655273, | |
| "learning_rate": 9.80294117647059e-06, | |
| "loss": 4.2403, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 6.944444444444445, | |
| "grad_norm": 16.965234756469727, | |
| "learning_rate": 9.714705882352943e-06, | |
| "loss": 4.2439, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 6.972222222222222, | |
| "grad_norm": 16.04508399963379, | |
| "learning_rate": 9.626470588235294e-06, | |
| "loss": 4.1588, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 17.192468643188477, | |
| "learning_rate": 9.538235294117648e-06, | |
| "loss": 4.2357, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 7.027777777777778, | |
| "grad_norm": 14.738517761230469, | |
| "learning_rate": 9.450000000000001e-06, | |
| "loss": 3.8684, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 7.055555555555555, | |
| "grad_norm": 16.8319034576416, | |
| "learning_rate": 9.361764705882354e-06, | |
| "loss": 3.8483, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 7.083333333333333, | |
| "grad_norm": 14.79024600982666, | |
| "learning_rate": 9.273529411764708e-06, | |
| "loss": 3.9072, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 7.111111111111111, | |
| "grad_norm": 12.333316802978516, | |
| "learning_rate": 9.18529411764706e-06, | |
| "loss": 3.9609, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 7.138888888888889, | |
| "grad_norm": 14.567424774169922, | |
| "learning_rate": 9.097058823529412e-06, | |
| "loss": 3.9129, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 7.166666666666667, | |
| "grad_norm": 18.647308349609375, | |
| "learning_rate": 9.008823529411766e-06, | |
| "loss": 3.9278, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 7.194444444444445, | |
| "grad_norm": 16.815906524658203, | |
| "learning_rate": 8.920588235294117e-06, | |
| "loss": 3.9316, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 7.222222222222222, | |
| "grad_norm": 19.90522575378418, | |
| "learning_rate": 8.83235294117647e-06, | |
| "loss": 3.9444, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 7.25, | |
| "grad_norm": 15.773906707763672, | |
| "learning_rate": 8.744117647058822e-06, | |
| "loss": 4.037, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 7.277777777777778, | |
| "grad_norm": 14.414016723632812, | |
| "learning_rate": 8.655882352941176e-06, | |
| "loss": 3.9834, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 7.305555555555555, | |
| "grad_norm": 18.313098907470703, | |
| "learning_rate": 8.567647058823529e-06, | |
| "loss": 4.0229, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 7.333333333333333, | |
| "grad_norm": 15.021878242492676, | |
| "learning_rate": 8.479411764705882e-06, | |
| "loss": 3.9891, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 7.361111111111111, | |
| "grad_norm": 16.228864669799805, | |
| "learning_rate": 8.391176470588235e-06, | |
| "loss": 3.9871, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 7.388888888888889, | |
| "grad_norm": 11.939830780029297, | |
| "learning_rate": 8.302941176470589e-06, | |
| "loss": 3.8708, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 7.416666666666667, | |
| "grad_norm": 13.20190143585205, | |
| "learning_rate": 8.21470588235294e-06, | |
| "loss": 3.8982, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 7.444444444444445, | |
| "grad_norm": 15.111628532409668, | |
| "learning_rate": 8.126470588235294e-06, | |
| "loss": 3.9871, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 7.472222222222222, | |
| "grad_norm": 15.908904075622559, | |
| "learning_rate": 8.038235294117647e-06, | |
| "loss": 3.976, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 7.5, | |
| "grad_norm": 18.098617553710938, | |
| "learning_rate": 7.95e-06, | |
| "loss": 4.0415, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 7.527777777777778, | |
| "grad_norm": 16.921981811523438, | |
| "learning_rate": 7.861764705882353e-06, | |
| "loss": 4.0462, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 7.555555555555555, | |
| "grad_norm": 14.457706451416016, | |
| "learning_rate": 7.773529411764705e-06, | |
| "loss": 3.9405, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 7.583333333333333, | |
| "grad_norm": 17.50982093811035, | |
| "learning_rate": 7.685294117647058e-06, | |
| "loss": 3.9974, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 7.611111111111111, | |
| "grad_norm": 15.285282135009766, | |
| "learning_rate": 7.5970588235294116e-06, | |
| "loss": 3.9969, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 7.638888888888889, | |
| "grad_norm": 14.417180061340332, | |
| "learning_rate": 7.508823529411765e-06, | |
| "loss": 3.97, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 14.781904220581055, | |
| "learning_rate": 7.420588235294117e-06, | |
| "loss": 3.9797, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 7.694444444444445, | |
| "grad_norm": 16.317949295043945, | |
| "learning_rate": 7.332352941176471e-06, | |
| "loss": 3.9236, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 7.722222222222222, | |
| "grad_norm": 16.340526580810547, | |
| "learning_rate": 7.244117647058824e-06, | |
| "loss": 4.0323, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 13.924703598022461, | |
| "learning_rate": 7.155882352941176e-06, | |
| "loss": 3.8719, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 19.019325256347656, | |
| "learning_rate": 7.06764705882353e-06, | |
| "loss": 3.9529, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 7.805555555555555, | |
| "grad_norm": 17.188499450683594, | |
| "learning_rate": 6.979411764705882e-06, | |
| "loss": 4.0392, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 13.893120765686035, | |
| "learning_rate": 6.891176470588235e-06, | |
| "loss": 3.9727, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 7.861111111111111, | |
| "grad_norm": 13.257885932922363, | |
| "learning_rate": 6.802941176470589e-06, | |
| "loss": 4.0164, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 15.29055118560791, | |
| "learning_rate": 6.714705882352941e-06, | |
| "loss": 4.0283, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 7.916666666666667, | |
| "grad_norm": 15.208207130432129, | |
| "learning_rate": 6.626470588235294e-06, | |
| "loss": 3.9515, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 7.944444444444445, | |
| "grad_norm": 15.335555076599121, | |
| "learning_rate": 6.538235294117647e-06, | |
| "loss": 4.0672, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 7.972222222222222, | |
| "grad_norm": 14.554072380065918, | |
| "learning_rate": 6.45e-06, | |
| "loss": 4.0757, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 17.360929489135742, | |
| "learning_rate": 6.361764705882353e-06, | |
| "loss": 3.9486, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 8.027777777777779, | |
| "grad_norm": 11.244579315185547, | |
| "learning_rate": 6.273529411764706e-06, | |
| "loss": 3.8162, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 8.055555555555555, | |
| "grad_norm": 16.21691131591797, | |
| "learning_rate": 6.185294117647059e-06, | |
| "loss": 3.8226, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 8.083333333333334, | |
| "grad_norm": 16.201114654541016, | |
| "learning_rate": 6.097058823529412e-06, | |
| "loss": 3.7959, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 8.11111111111111, | |
| "grad_norm": 17.415403366088867, | |
| "learning_rate": 6.008823529411765e-06, | |
| "loss": 3.7922, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 8.13888888888889, | |
| "grad_norm": 10.362262725830078, | |
| "learning_rate": 5.920588235294118e-06, | |
| "loss": 3.7602, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 12.301746368408203, | |
| "learning_rate": 5.8323529411764706e-06, | |
| "loss": 3.8236, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 8.194444444444445, | |
| "grad_norm": 15.563980102539062, | |
| "learning_rate": 5.744117647058824e-06, | |
| "loss": 3.7988, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 14.712089538574219, | |
| "learning_rate": 5.655882352941177e-06, | |
| "loss": 3.8601, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 16.73733901977539, | |
| "learning_rate": 5.5676470588235296e-06, | |
| "loss": 3.7956, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 8.277777777777779, | |
| "grad_norm": 15.459855079650879, | |
| "learning_rate": 5.479411764705883e-06, | |
| "loss": 3.7525, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 8.305555555555555, | |
| "grad_norm": 16.97991943359375, | |
| "learning_rate": 5.391176470588236e-06, | |
| "loss": 3.7536, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 13.196775436401367, | |
| "learning_rate": 5.302941176470589e-06, | |
| "loss": 3.88, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 8.36111111111111, | |
| "grad_norm": 15.121062278747559, | |
| "learning_rate": 5.214705882352941e-06, | |
| "loss": 3.8269, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 8.38888888888889, | |
| "grad_norm": 17.251754760742188, | |
| "learning_rate": 5.1264705882352935e-06, | |
| "loss": 3.8579, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 8.416666666666666, | |
| "grad_norm": 14.106273651123047, | |
| "learning_rate": 5.038235294117647e-06, | |
| "loss": 3.8255, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 16.304101943969727, | |
| "learning_rate": 4.95e-06, | |
| "loss": 3.8521, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 8.472222222222221, | |
| "grad_norm": 25.067447662353516, | |
| "learning_rate": 4.8617647058823525e-06, | |
| "loss": 3.8901, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 15.186288833618164, | |
| "learning_rate": 4.773529411764706e-06, | |
| "loss": 3.8275, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 8.527777777777779, | |
| "grad_norm": 14.004067420959473, | |
| "learning_rate": 4.685294117647059e-06, | |
| "loss": 3.9064, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 8.555555555555555, | |
| "grad_norm": 14.653326988220215, | |
| "learning_rate": 4.5970588235294115e-06, | |
| "loss": 3.7708, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 8.583333333333334, | |
| "grad_norm": 15.567551612854004, | |
| "learning_rate": 4.508823529411765e-06, | |
| "loss": 3.8827, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 8.61111111111111, | |
| "grad_norm": 15.521193504333496, | |
| "learning_rate": 4.420588235294117e-06, | |
| "loss": 3.7874, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 8.63888888888889, | |
| "grad_norm": 13.801298141479492, | |
| "learning_rate": 4.3323529411764705e-06, | |
| "loss": 3.8132, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 15.118513107299805, | |
| "learning_rate": 4.244117647058824e-06, | |
| "loss": 3.7819, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 8.694444444444445, | |
| "grad_norm": 14.111237525939941, | |
| "learning_rate": 4.155882352941176e-06, | |
| "loss": 3.8452, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 8.722222222222221, | |
| "grad_norm": 13.785816192626953, | |
| "learning_rate": 4.0676470588235295e-06, | |
| "loss": 3.8541, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 13.524584770202637, | |
| "learning_rate": 3.979411764705883e-06, | |
| "loss": 3.8521, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 8.777777777777779, | |
| "grad_norm": 15.52719497680664, | |
| "learning_rate": 3.891176470588235e-06, | |
| "loss": 3.8354, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 8.805555555555555, | |
| "grad_norm": 17.874053955078125, | |
| "learning_rate": 3.8029411764705885e-06, | |
| "loss": 3.794, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 15.086000442504883, | |
| "learning_rate": 3.7147058823529414e-06, | |
| "loss": 3.7661, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 8.86111111111111, | |
| "grad_norm": 18.586685180664062, | |
| "learning_rate": 3.6264705882352943e-06, | |
| "loss": 3.8781, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 16.09383201599121, | |
| "learning_rate": 3.538235294117647e-06, | |
| "loss": 3.8242, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 8.916666666666666, | |
| "grad_norm": 13.737457275390625, | |
| "learning_rate": 3.4500000000000004e-06, | |
| "loss": 3.8439, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 8.944444444444445, | |
| "grad_norm": 14.633013725280762, | |
| "learning_rate": 3.361764705882353e-06, | |
| "loss": 3.8111, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 8.972222222222221, | |
| "grad_norm": 12.145355224609375, | |
| "learning_rate": 3.2735294117647057e-06, | |
| "loss": 3.8591, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 15.101285934448242, | |
| "learning_rate": 3.1852941176470586e-06, | |
| "loss": 3.8327, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 9.027777777777779, | |
| "grad_norm": 12.12878131866455, | |
| "learning_rate": 3.097058823529412e-06, | |
| "loss": 3.7079, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 9.055555555555555, | |
| "grad_norm": 11.098604202270508, | |
| "learning_rate": 3.0088235294117647e-06, | |
| "loss": 3.6859, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 9.083333333333334, | |
| "grad_norm": 11.60837459564209, | |
| "learning_rate": 2.9205882352941176e-06, | |
| "loss": 3.6906, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 9.11111111111111, | |
| "grad_norm": 11.516129493713379, | |
| "learning_rate": 2.8323529411764705e-06, | |
| "loss": 3.7629, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 9.13888888888889, | |
| "grad_norm": 11.439606666564941, | |
| "learning_rate": 2.7441176470588238e-06, | |
| "loss": 3.7087, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 12.872289657592773, | |
| "learning_rate": 2.6558823529411766e-06, | |
| "loss": 3.754, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 9.194444444444445, | |
| "grad_norm": 13.252586364746094, | |
| "learning_rate": 2.5676470588235295e-06, | |
| "loss": 3.7477, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 14.191280364990234, | |
| "learning_rate": 2.4794117647058824e-06, | |
| "loss": 3.7703, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 17.2321720123291, | |
| "learning_rate": 2.3911764705882356e-06, | |
| "loss": 3.7569, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 9.277777777777779, | |
| "grad_norm": 15.821660995483398, | |
| "learning_rate": 2.3029411764705885e-06, | |
| "loss": 3.7321, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 9.305555555555555, | |
| "grad_norm": 10.092562675476074, | |
| "learning_rate": 2.2147058823529414e-06, | |
| "loss": 3.7033, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 19.321290969848633, | |
| "learning_rate": 2.1264705882352942e-06, | |
| "loss": 3.7683, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 9.36111111111111, | |
| "grad_norm": 15.781285285949707, | |
| "learning_rate": 2.038235294117647e-06, | |
| "loss": 3.7304, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 9.38888888888889, | |
| "grad_norm": 13.418021202087402, | |
| "learning_rate": 1.95e-06, | |
| "loss": 3.7117, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 9.416666666666666, | |
| "grad_norm": 14.020906448364258, | |
| "learning_rate": 1.861764705882353e-06, | |
| "loss": 3.7131, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 13.270752906799316, | |
| "learning_rate": 1.773529411764706e-06, | |
| "loss": 3.6744, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 9.472222222222221, | |
| "grad_norm": 12.589795112609863, | |
| "learning_rate": 1.6852941176470588e-06, | |
| "loss": 3.7544, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 13.454065322875977, | |
| "learning_rate": 1.5970588235294118e-06, | |
| "loss": 3.7782, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 9.527777777777779, | |
| "grad_norm": 14.777534484863281, | |
| "learning_rate": 1.5088235294117647e-06, | |
| "loss": 3.7203, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 9.555555555555555, | |
| "grad_norm": 14.142292022705078, | |
| "learning_rate": 1.4205882352941178e-06, | |
| "loss": 3.7331, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 9.583333333333334, | |
| "grad_norm": 17.070104598999023, | |
| "learning_rate": 1.3323529411764706e-06, | |
| "loss": 3.7318, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 9.61111111111111, | |
| "grad_norm": 13.819690704345703, | |
| "learning_rate": 1.2441176470588237e-06, | |
| "loss": 3.7379, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 9.63888888888889, | |
| "grad_norm": 13.859333038330078, | |
| "learning_rate": 1.1558823529411764e-06, | |
| "loss": 3.6641, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 12.934626579284668, | |
| "learning_rate": 1.0676470588235295e-06, | |
| "loss": 3.7164, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 9.694444444444445, | |
| "grad_norm": 13.10987377166748, | |
| "learning_rate": 9.794117647058823e-07, | |
| "loss": 3.7427, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 13.990032196044922, | |
| "learning_rate": 8.911764705882353e-07, | |
| "loss": 3.7195, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 9.75, | |
| "grad_norm": 13.015247344970703, | |
| "learning_rate": 8.029411764705883e-07, | |
| "loss": 3.7378, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 9.777777777777779, | |
| "grad_norm": 13.674323081970215, | |
| "learning_rate": 7.147058823529411e-07, | |
| "loss": 3.7118, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 9.805555555555555, | |
| "grad_norm": 11.697880744934082, | |
| "learning_rate": 6.264705882352941e-07, | |
| "loss": 3.7387, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 9.833333333333334, | |
| "grad_norm": 13.14771842956543, | |
| "learning_rate": 5.382352941176471e-07, | |
| "loss": 3.6967, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 9.86111111111111, | |
| "grad_norm": 12.617823600769043, | |
| "learning_rate": 4.5e-07, | |
| "loss": 3.7122, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 14.002346992492676, | |
| "learning_rate": 3.6176470588235295e-07, | |
| "loss": 3.7429, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 9.916666666666666, | |
| "grad_norm": 14.497004508972168, | |
| "learning_rate": 2.7352941176470587e-07, | |
| "loss": 3.7298, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 9.944444444444445, | |
| "grad_norm": 12.17514419555664, | |
| "learning_rate": 1.852941176470588e-07, | |
| "loss": 3.7254, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 9.972222222222221, | |
| "grad_norm": 17.909778594970703, | |
| "learning_rate": 9.705882352941177e-08, | |
| "loss": 3.7097, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 13.537755012512207, | |
| "learning_rate": 8.823529411764706e-09, | |
| "loss": 3.7612, | |
| "step": 3600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3600, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4403097501696000.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |