| { | |
| "best_global_step": 6250, | |
| "best_metric": 0.98, | |
| "best_model_checkpoint": "dinov2-Base-finetuned-food101/checkpoint-6250", | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 6250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 89.06837463378906, | |
| "learning_rate": 2.8800000000000004e-07, | |
| "loss": 5.2089, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 106.91532135009766, | |
| "learning_rate": 6.08e-07, | |
| "loss": 5.2135, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 99.38673400878906, | |
| "learning_rate": 9.28e-07, | |
| "loss": 4.6396, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 85.49656677246094, | |
| "learning_rate": 1.248e-06, | |
| "loss": 4.2156, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 89.49871826171875, | |
| "learning_rate": 1.568e-06, | |
| "loss": 3.3982, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 89.63510131835938, | |
| "learning_rate": 1.8880000000000002e-06, | |
| "loss": 2.6911, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 83.21440887451172, | |
| "learning_rate": 2.2080000000000003e-06, | |
| "loss": 2.1527, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 83.103515625, | |
| "learning_rate": 2.5280000000000006e-06, | |
| "loss": 1.5765, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 87.09320831298828, | |
| "learning_rate": 2.848e-06, | |
| "loss": 1.255, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 88.66500091552734, | |
| "learning_rate": 3.1680000000000004e-06, | |
| "loss": 1.0181, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 112.97905731201172, | |
| "learning_rate": 3.4880000000000003e-06, | |
| "loss": 0.8366, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 64.27356719970703, | |
| "learning_rate": 3.8080000000000006e-06, | |
| "loss": 0.7367, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 84.11141967773438, | |
| "learning_rate": 4.128e-06, | |
| "loss": 0.6918, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 58.329368591308594, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 0.6234, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 81.27049255371094, | |
| "learning_rate": 4.768000000000001e-06, | |
| "loss": 0.4478, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 55.579349517822266, | |
| "learning_rate": 5.088000000000001e-06, | |
| "loss": 0.5044, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 59.40501403808594, | |
| "learning_rate": 5.408e-06, | |
| "loss": 0.4686, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 133.0911407470703, | |
| "learning_rate": 5.728e-06, | |
| "loss": 0.3406, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 51.79353332519531, | |
| "learning_rate": 6.048e-06, | |
| "loss": 0.5976, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 80.84319305419922, | |
| "learning_rate": 6.368000000000001e-06, | |
| "loss": 0.5719, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 46.885093688964844, | |
| "learning_rate": 6.688e-06, | |
| "loss": 0.3955, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 74.30220794677734, | |
| "learning_rate": 7.0080000000000005e-06, | |
| "loss": 0.4505, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 85.305908203125, | |
| "learning_rate": 7.328000000000001e-06, | |
| "loss": 0.3513, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 57.64317321777344, | |
| "learning_rate": 7.648e-06, | |
| "loss": 0.4213, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 32.956275939941406, | |
| "learning_rate": 7.968e-06, | |
| "loss": 0.3075, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 68.65741729736328, | |
| "learning_rate": 8.288000000000001e-06, | |
| "loss": 0.3231, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 61.947906494140625, | |
| "learning_rate": 8.608000000000001e-06, | |
| "loss": 0.3555, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 49.08671951293945, | |
| "learning_rate": 8.928000000000002e-06, | |
| "loss": 0.529, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 88.26673889160156, | |
| "learning_rate": 9.248e-06, | |
| "loss": 0.4929, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 82.20326232910156, | |
| "learning_rate": 9.568e-06, | |
| "loss": 0.6569, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 22.57343292236328, | |
| "learning_rate": 9.888000000000001e-06, | |
| "loss": 0.3489, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 61.01542282104492, | |
| "learning_rate": 1.0208e-05, | |
| "loss": 0.6376, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.528, | |
| "grad_norm": 11.429601669311523, | |
| "learning_rate": 1.0528e-05, | |
| "loss": 0.4121, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.544, | |
| "grad_norm": 43.226837158203125, | |
| "learning_rate": 1.0848e-05, | |
| "loss": 0.4455, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 101.21278381347656, | |
| "learning_rate": 1.1168e-05, | |
| "loss": 0.4205, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.576, | |
| "grad_norm": 45.80269241333008, | |
| "learning_rate": 1.1488e-05, | |
| "loss": 0.4058, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.592, | |
| "grad_norm": 88.07817077636719, | |
| "learning_rate": 1.1808000000000001e-05, | |
| "loss": 0.516, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.608, | |
| "grad_norm": 117.33232116699219, | |
| "learning_rate": 1.2128000000000001e-05, | |
| "loss": 0.4378, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.624, | |
| "grad_norm": 193.94338989257812, | |
| "learning_rate": 1.2448e-05, | |
| "loss": 0.5074, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 48.7374267578125, | |
| "learning_rate": 1.2768e-05, | |
| "loss": 0.3549, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.656, | |
| "grad_norm": 11.20517635345459, | |
| "learning_rate": 1.3088e-05, | |
| "loss": 0.3929, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.672, | |
| "grad_norm": 43.28117370605469, | |
| "learning_rate": 1.3408000000000001e-05, | |
| "loss": 0.2347, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.688, | |
| "grad_norm": 54.67144012451172, | |
| "learning_rate": 1.3728000000000001e-05, | |
| "loss": 0.4916, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.704, | |
| "grad_norm": 177.31204223632812, | |
| "learning_rate": 1.4048000000000002e-05, | |
| "loss": 0.5148, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 57.29036331176758, | |
| "learning_rate": 1.4368000000000002e-05, | |
| "loss": 0.4149, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.736, | |
| "grad_norm": 71.80987548828125, | |
| "learning_rate": 1.4688000000000002e-05, | |
| "loss": 0.3869, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.752, | |
| "grad_norm": 35.79969024658203, | |
| "learning_rate": 1.5008000000000001e-05, | |
| "loss": 0.4098, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 76.79283905029297, | |
| "learning_rate": 1.5328e-05, | |
| "loss": 0.3789, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.784, | |
| "grad_norm": 4.4785943031311035, | |
| "learning_rate": 1.5648e-05, | |
| "loss": 0.3875, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 222.21080017089844, | |
| "learning_rate": 1.5968e-05, | |
| "loss": 0.5796, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.816, | |
| "grad_norm": 85.26457214355469, | |
| "learning_rate": 1.6288e-05, | |
| "loss": 0.4111, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.832, | |
| "grad_norm": 19.47867202758789, | |
| "learning_rate": 1.6608e-05, | |
| "loss": 0.4095, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.848, | |
| "grad_norm": 91.59925842285156, | |
| "learning_rate": 1.6928e-05, | |
| "loss": 0.5244, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.864, | |
| "grad_norm": 20.435232162475586, | |
| "learning_rate": 1.7248e-05, | |
| "loss": 0.4291, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 104.42509460449219, | |
| "learning_rate": 1.7568000000000002e-05, | |
| "loss": 0.5047, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.896, | |
| "grad_norm": 142.441162109375, | |
| "learning_rate": 1.7888000000000002e-05, | |
| "loss": 0.5301, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.912, | |
| "grad_norm": 95.12364196777344, | |
| "learning_rate": 1.8208000000000003e-05, | |
| "loss": 0.513, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.928, | |
| "grad_norm": 98.76264953613281, | |
| "learning_rate": 1.8528000000000003e-05, | |
| "loss": 0.6661, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.944, | |
| "grad_norm": 65.93214416503906, | |
| "learning_rate": 1.8848000000000003e-05, | |
| "loss": 0.6454, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 178.0699462890625, | |
| "learning_rate": 1.9168000000000004e-05, | |
| "loss": 0.375, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.976, | |
| "grad_norm": 82.4294204711914, | |
| "learning_rate": 1.9488000000000004e-05, | |
| "loss": 0.578, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.992, | |
| "grad_norm": 68.52827453613281, | |
| "learning_rate": 1.9808e-05, | |
| "loss": 0.675, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_accuracy": 0.938, | |
| "eval_f1": 0.9398621531246216, | |
| "eval_loss": 0.17777465283870697, | |
| "eval_runtime": 25.103, | |
| "eval_samples_per_second": 59.754, | |
| "eval_steps_per_second": 14.938, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "grad_norm": 90.21050262451172, | |
| "learning_rate": 1.998577777777778e-05, | |
| "loss": 0.5093, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.024, | |
| "grad_norm": 68.47109985351562, | |
| "learning_rate": 1.9950222222222225e-05, | |
| "loss": 0.4518, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 65.01594543457031, | |
| "learning_rate": 1.9914666666666668e-05, | |
| "loss": 0.5236, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.056, | |
| "grad_norm": 91.11334991455078, | |
| "learning_rate": 1.9879111111111113e-05, | |
| "loss": 0.5741, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.072, | |
| "grad_norm": 69.707763671875, | |
| "learning_rate": 1.984355555555556e-05, | |
| "loss": 0.5296, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.088, | |
| "grad_norm": 82.90894317626953, | |
| "learning_rate": 1.9808e-05, | |
| "loss": 0.6058, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.104, | |
| "grad_norm": 139.13919067382812, | |
| "learning_rate": 1.9772444444444446e-05, | |
| "loss": 0.6164, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 30.786418914794922, | |
| "learning_rate": 1.973688888888889e-05, | |
| "loss": 0.39, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 73.54574584960938, | |
| "learning_rate": 1.9701333333333334e-05, | |
| "loss": 0.4312, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.152, | |
| "grad_norm": 43.80199432373047, | |
| "learning_rate": 1.966577777777778e-05, | |
| "loss": 0.375, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.168, | |
| "grad_norm": 152.24276733398438, | |
| "learning_rate": 1.9630222222222225e-05, | |
| "loss": 0.2291, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.184, | |
| "grad_norm": 73.12702941894531, | |
| "learning_rate": 1.9594666666666667e-05, | |
| "loss": 0.3965, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 61.434120178222656, | |
| "learning_rate": 1.9559111111111113e-05, | |
| "loss": 0.4949, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.216, | |
| "grad_norm": 57.76673126220703, | |
| "learning_rate": 1.9523555555555558e-05, | |
| "loss": 0.537, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.232, | |
| "grad_norm": 63.07472229003906, | |
| "learning_rate": 1.9488000000000004e-05, | |
| "loss": 0.3069, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.248, | |
| "grad_norm": 82.42784118652344, | |
| "learning_rate": 1.9452444444444446e-05, | |
| "loss": 0.4934, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.264, | |
| "grad_norm": 109.37876892089844, | |
| "learning_rate": 1.9416888888888888e-05, | |
| "loss": 0.4084, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 38.176002502441406, | |
| "learning_rate": 1.9381333333333334e-05, | |
| "loss": 0.4815, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.296, | |
| "grad_norm": 83.43049621582031, | |
| "learning_rate": 1.934577777777778e-05, | |
| "loss": 0.5391, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.312, | |
| "grad_norm": 1.9186406135559082, | |
| "learning_rate": 1.9310222222222225e-05, | |
| "loss": 0.3134, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.328, | |
| "grad_norm": 81.61128234863281, | |
| "learning_rate": 1.9274666666666667e-05, | |
| "loss": 0.4807, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 37.480411529541016, | |
| "learning_rate": 1.9239111111111112e-05, | |
| "loss": 0.448, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 80.07466125488281, | |
| "learning_rate": 1.9203555555555558e-05, | |
| "loss": 0.3658, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.376, | |
| "grad_norm": 55.322166442871094, | |
| "learning_rate": 1.9168000000000004e-05, | |
| "loss": 0.3584, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.392, | |
| "grad_norm": 2.566350221633911, | |
| "learning_rate": 1.9132444444444446e-05, | |
| "loss": 0.2511, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.408, | |
| "grad_norm": 25.400840759277344, | |
| "learning_rate": 1.909688888888889e-05, | |
| "loss": 0.2569, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.424, | |
| "grad_norm": 21.915630340576172, | |
| "learning_rate": 1.9061333333333333e-05, | |
| "loss": 0.3917, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 80.36392211914062, | |
| "learning_rate": 1.902577777777778e-05, | |
| "loss": 0.307, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.456, | |
| "grad_norm": 167.50462341308594, | |
| "learning_rate": 1.8990222222222224e-05, | |
| "loss": 0.4593, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.472, | |
| "grad_norm": 108.12252807617188, | |
| "learning_rate": 1.8954666666666667e-05, | |
| "loss": 0.5507, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.488, | |
| "grad_norm": 43.201087951660156, | |
| "learning_rate": 1.8919111111111112e-05, | |
| "loss": 0.4131, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.504, | |
| "grad_norm": 66.8707046508789, | |
| "learning_rate": 1.8883555555555558e-05, | |
| "loss": 0.3105, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 23.931177139282227, | |
| "learning_rate": 1.8848000000000003e-05, | |
| "loss": 0.374, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.536, | |
| "grad_norm": 112.31582641601562, | |
| "learning_rate": 1.8812444444444445e-05, | |
| "loss": 0.4161, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.552, | |
| "grad_norm": 88.35121154785156, | |
| "learning_rate": 1.877688888888889e-05, | |
| "loss": 0.6474, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.568, | |
| "grad_norm": 76.46733093261719, | |
| "learning_rate": 1.8741333333333336e-05, | |
| "loss": 0.3997, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.584, | |
| "grad_norm": 110.73181915283203, | |
| "learning_rate": 1.870577777777778e-05, | |
| "loss": 0.6053, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 35.111751556396484, | |
| "learning_rate": 1.8670222222222224e-05, | |
| "loss": 0.2589, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.616, | |
| "grad_norm": 77.90402221679688, | |
| "learning_rate": 1.8634666666666666e-05, | |
| "loss": 0.485, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 5.89962100982666, | |
| "learning_rate": 1.8599111111111112e-05, | |
| "loss": 0.4286, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 50.08778762817383, | |
| "learning_rate": 1.8563555555555557e-05, | |
| "loss": 0.3526, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 36.45596694946289, | |
| "learning_rate": 1.8528000000000003e-05, | |
| "loss": 0.38, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 33.204345703125, | |
| "learning_rate": 1.8492444444444445e-05, | |
| "loss": 0.4526, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.696, | |
| "grad_norm": 49.752349853515625, | |
| "learning_rate": 1.845688888888889e-05, | |
| "loss": 0.3143, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.712, | |
| "grad_norm": 69.24263763427734, | |
| "learning_rate": 1.8421333333333336e-05, | |
| "loss": 0.3269, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.728, | |
| "grad_norm": 191.12741088867188, | |
| "learning_rate": 1.838577777777778e-05, | |
| "loss": 0.4755, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.744, | |
| "grad_norm": 86.41985321044922, | |
| "learning_rate": 1.8350222222222224e-05, | |
| "loss": 0.6529, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 102.5144271850586, | |
| "learning_rate": 1.8314666666666666e-05, | |
| "loss": 0.4386, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.776, | |
| "grad_norm": 124.9957275390625, | |
| "learning_rate": 1.827911111111111e-05, | |
| "loss": 0.5639, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.792, | |
| "grad_norm": 21.024431228637695, | |
| "learning_rate": 1.8243555555555557e-05, | |
| "loss": 0.3691, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.808, | |
| "grad_norm": 68.7905502319336, | |
| "learning_rate": 1.8208000000000003e-05, | |
| "loss": 0.4668, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 58.76439666748047, | |
| "learning_rate": 1.8172444444444445e-05, | |
| "loss": 0.5254, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 52.31269073486328, | |
| "learning_rate": 1.813688888888889e-05, | |
| "loss": 0.3713, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 47.486427307128906, | |
| "learning_rate": 1.8101333333333336e-05, | |
| "loss": 0.4245, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 68.47742462158203, | |
| "learning_rate": 1.806577777777778e-05, | |
| "loss": 0.6182, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.888, | |
| "grad_norm": 65.69060516357422, | |
| "learning_rate": 1.8030222222222223e-05, | |
| "loss": 0.499, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.904, | |
| "grad_norm": 76.0249252319336, | |
| "learning_rate": 1.799466666666667e-05, | |
| "loss": 0.3416, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 46.81635284423828, | |
| "learning_rate": 1.795911111111111e-05, | |
| "loss": 0.413, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.936, | |
| "grad_norm": 45.73046112060547, | |
| "learning_rate": 1.7923555555555557e-05, | |
| "loss": 0.4074, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.952, | |
| "grad_norm": 35.92128372192383, | |
| "learning_rate": 1.7888000000000002e-05, | |
| "loss": 0.428, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.968, | |
| "grad_norm": 54.154273986816406, | |
| "learning_rate": 1.7852444444444444e-05, | |
| "loss": 0.3654, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.984, | |
| "grad_norm": 117.35140228271484, | |
| "learning_rate": 1.781688888888889e-05, | |
| "loss": 0.5069, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 69.95264434814453, | |
| "learning_rate": 1.7781333333333335e-05, | |
| "loss": 0.5284, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_accuracy": 0.9613333333333334, | |
| "eval_f1": 0.9614742314125441, | |
| "eval_loss": 0.12478982657194138, | |
| "eval_runtime": 24.9506, | |
| "eval_samples_per_second": 60.119, | |
| "eval_steps_per_second": 15.03, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "grad_norm": 90.76451873779297, | |
| "learning_rate": 1.774577777777778e-05, | |
| "loss": 0.4187, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.032, | |
| "grad_norm": 22.007369995117188, | |
| "learning_rate": 1.7710222222222223e-05, | |
| "loss": 0.2309, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.048, | |
| "grad_norm": 56.51552963256836, | |
| "learning_rate": 1.767466666666667e-05, | |
| "loss": 0.3627, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 52.548004150390625, | |
| "learning_rate": 1.7639111111111114e-05, | |
| "loss": 0.4389, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 54.10152053833008, | |
| "learning_rate": 1.7603555555555556e-05, | |
| "loss": 0.4167, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.096, | |
| "grad_norm": 55.00556564331055, | |
| "learning_rate": 1.7568000000000002e-05, | |
| "loss": 0.3528, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.112, | |
| "grad_norm": 12.216611862182617, | |
| "learning_rate": 1.7532444444444444e-05, | |
| "loss": 0.1805, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.128, | |
| "grad_norm": 114.3968505859375, | |
| "learning_rate": 1.749688888888889e-05, | |
| "loss": 0.3243, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 5.575631141662598, | |
| "learning_rate": 1.7461333333333335e-05, | |
| "loss": 0.4095, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 97.60520935058594, | |
| "learning_rate": 1.742577777777778e-05, | |
| "loss": 0.4591, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.176, | |
| "grad_norm": 35.92754364013672, | |
| "learning_rate": 1.7390222222222223e-05, | |
| "loss": 0.3305, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.192, | |
| "grad_norm": 46.582794189453125, | |
| "learning_rate": 1.735466666666667e-05, | |
| "loss": 0.4082, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.208, | |
| "grad_norm": 73.6832504272461, | |
| "learning_rate": 1.7319111111111114e-05, | |
| "loss": 0.3443, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 110.241455078125, | |
| "learning_rate": 1.728355555555556e-05, | |
| "loss": 0.2277, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 30.731782913208008, | |
| "learning_rate": 1.7248e-05, | |
| "loss": 0.2708, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.2560000000000002, | |
| "grad_norm": 15.310711860656738, | |
| "learning_rate": 1.7212444444444444e-05, | |
| "loss": 0.3453, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.2720000000000002, | |
| "grad_norm": 50.28248977661133, | |
| "learning_rate": 1.717688888888889e-05, | |
| "loss": 0.2364, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.288, | |
| "grad_norm": 57.56525802612305, | |
| "learning_rate": 1.7141333333333335e-05, | |
| "loss": 0.2759, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 63.90391540527344, | |
| "learning_rate": 1.710577777777778e-05, | |
| "loss": 0.3155, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 28.447397232055664, | |
| "learning_rate": 1.7070222222222222e-05, | |
| "loss": 0.4112, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.336, | |
| "grad_norm": 51.297603607177734, | |
| "learning_rate": 1.7034666666666668e-05, | |
| "loss": 0.29, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.352, | |
| "grad_norm": 29.22579002380371, | |
| "learning_rate": 1.6999111111111114e-05, | |
| "loss": 0.3201, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.368, | |
| "grad_norm": 49.331199645996094, | |
| "learning_rate": 1.696355555555556e-05, | |
| "loss": 0.4024, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 45.29281234741211, | |
| "learning_rate": 1.6928e-05, | |
| "loss": 0.2184, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 63.356895446777344, | |
| "learning_rate": 1.6892444444444447e-05, | |
| "loss": 0.2487, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.416, | |
| "grad_norm": 3.2891721725463867, | |
| "learning_rate": 1.685688888888889e-05, | |
| "loss": 0.2616, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.432, | |
| "grad_norm": 66.31077575683594, | |
| "learning_rate": 1.6821333333333334e-05, | |
| "loss": 0.3052, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.448, | |
| "grad_norm": 105.19303894042969, | |
| "learning_rate": 1.678577777777778e-05, | |
| "loss": 0.4421, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 31.327606201171875, | |
| "learning_rate": 1.6750222222222222e-05, | |
| "loss": 0.3093, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 78.41735076904297, | |
| "learning_rate": 1.6714666666666668e-05, | |
| "loss": 0.5758, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.496, | |
| "grad_norm": 45.926918029785156, | |
| "learning_rate": 1.6679111111111113e-05, | |
| "loss": 0.2822, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.512, | |
| "grad_norm": 90.95999908447266, | |
| "learning_rate": 1.664355555555556e-05, | |
| "loss": 0.3204, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.528, | |
| "grad_norm": 16.62053871154785, | |
| "learning_rate": 1.6608e-05, | |
| "loss": 0.3205, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 163.37112426757812, | |
| "learning_rate": 1.6572444444444446e-05, | |
| "loss": 0.2595, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 5.807415008544922, | |
| "learning_rate": 1.6536888888888892e-05, | |
| "loss": 0.2315, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.576, | |
| "grad_norm": 32.90766525268555, | |
| "learning_rate": 1.6501333333333334e-05, | |
| "loss": 0.3351, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.592, | |
| "grad_norm": 53.188724517822266, | |
| "learning_rate": 1.646577777777778e-05, | |
| "loss": 0.2578, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.608, | |
| "grad_norm": 29.000858306884766, | |
| "learning_rate": 1.6430222222222222e-05, | |
| "loss": 0.4, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 7.635106563568115, | |
| "learning_rate": 1.6394666666666667e-05, | |
| "loss": 0.315, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 2.9866526126861572, | |
| "learning_rate": 1.6359111111111113e-05, | |
| "loss": 0.2032, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.656, | |
| "grad_norm": 68.7740707397461, | |
| "learning_rate": 1.632355555555556e-05, | |
| "loss": 0.508, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.672, | |
| "grad_norm": 58.381858825683594, | |
| "learning_rate": 1.6288e-05, | |
| "loss": 0.4532, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.6879999999999997, | |
| "grad_norm": 83.0326156616211, | |
| "learning_rate": 1.6252444444444446e-05, | |
| "loss": 0.3074, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 48.644752502441406, | |
| "learning_rate": 1.621688888888889e-05, | |
| "loss": 0.2919, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 43.27705001831055, | |
| "learning_rate": 1.6181333333333337e-05, | |
| "loss": 0.4442, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.7359999999999998, | |
| "grad_norm": 62.09077072143555, | |
| "learning_rate": 1.614577777777778e-05, | |
| "loss": 0.3772, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.752, | |
| "grad_norm": 97.17964935302734, | |
| "learning_rate": 1.611022222222222e-05, | |
| "loss": 0.2832, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.768, | |
| "grad_norm": 113.52243041992188, | |
| "learning_rate": 1.6074666666666667e-05, | |
| "loss": 0.3146, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 46.1737060546875, | |
| "learning_rate": 1.6039111111111113e-05, | |
| "loss": 0.6467, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 53.177696228027344, | |
| "learning_rate": 1.6003555555555558e-05, | |
| "loss": 0.3414, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.816, | |
| "grad_norm": 72.3310775756836, | |
| "learning_rate": 1.5968e-05, | |
| "loss": 0.3393, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.832, | |
| "grad_norm": 24.155517578125, | |
| "learning_rate": 1.5932444444444446e-05, | |
| "loss": 0.2632, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.848, | |
| "grad_norm": 71.61637878417969, | |
| "learning_rate": 1.589688888888889e-05, | |
| "loss": 0.2974, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 47.56396484375, | |
| "learning_rate": 1.5861333333333337e-05, | |
| "loss": 0.3304, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 52.59245300292969, | |
| "learning_rate": 1.582577777777778e-05, | |
| "loss": 0.2938, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.896, | |
| "grad_norm": 68.7037582397461, | |
| "learning_rate": 1.5790222222222225e-05, | |
| "loss": 0.2522, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.912, | |
| "grad_norm": 62.64757537841797, | |
| "learning_rate": 1.5754666666666667e-05, | |
| "loss": 0.2989, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.928, | |
| "grad_norm": 58.7310791015625, | |
| "learning_rate": 1.5719111111111112e-05, | |
| "loss": 0.175, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.8208155035972595, | |
| "learning_rate": 1.5683555555555558e-05, | |
| "loss": 0.454, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 53.65021514892578, | |
| "learning_rate": 1.5648e-05, | |
| "loss": 0.2396, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.976, | |
| "grad_norm": 48.882816314697266, | |
| "learning_rate": 1.5612444444444445e-05, | |
| "loss": 0.412, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.992, | |
| "grad_norm": 75.47936248779297, | |
| "learning_rate": 1.557688888888889e-05, | |
| "loss": 0.3455, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_accuracy": 0.9513333333333334, | |
| "eval_f1": 0.9522551210889584, | |
| "eval_loss": 0.1548186093568802, | |
| "eval_runtime": 25.3019, | |
| "eval_samples_per_second": 59.284, | |
| "eval_steps_per_second": 14.821, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 3.008, | |
| "grad_norm": 14.862210273742676, | |
| "learning_rate": 1.5541333333333337e-05, | |
| "loss": 0.3599, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "grad_norm": 39.82673263549805, | |
| "learning_rate": 1.550577777777778e-05, | |
| "loss": 0.4111, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 64.49237823486328, | |
| "learning_rate": 1.5470222222222224e-05, | |
| "loss": 0.1988, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.056, | |
| "grad_norm": 53.11403274536133, | |
| "learning_rate": 1.543466666666667e-05, | |
| "loss": 0.292, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.072, | |
| "grad_norm": 61.9869499206543, | |
| "learning_rate": 1.5399111111111112e-05, | |
| "loss": 0.1491, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.088, | |
| "grad_norm": 24.670124053955078, | |
| "learning_rate": 1.5363555555555557e-05, | |
| "loss": 0.3061, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.104, | |
| "grad_norm": 5.021854877471924, | |
| "learning_rate": 1.5328e-05, | |
| "loss": 0.3416, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 53.80380630493164, | |
| "learning_rate": 1.5292444444444445e-05, | |
| "loss": 0.2582, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.136, | |
| "grad_norm": 43.22938537597656, | |
| "learning_rate": 1.525688888888889e-05, | |
| "loss": 0.131, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.152, | |
| "grad_norm": 51.80830001831055, | |
| "learning_rate": 1.5221333333333335e-05, | |
| "loss": 0.4225, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.168, | |
| "grad_norm": 51.97641372680664, | |
| "learning_rate": 1.518577777777778e-05, | |
| "loss": 0.2496, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.184, | |
| "grad_norm": 78.88411712646484, | |
| "learning_rate": 1.5150222222222224e-05, | |
| "loss": 0.2843, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 80.55473327636719, | |
| "learning_rate": 1.5114666666666668e-05, | |
| "loss": 0.2579, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.216, | |
| "grad_norm": 10.610305786132812, | |
| "learning_rate": 1.5079111111111113e-05, | |
| "loss": 0.251, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.232, | |
| "grad_norm": 18.754613876342773, | |
| "learning_rate": 1.5043555555555555e-05, | |
| "loss": 0.2215, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.248, | |
| "grad_norm": 91.42521667480469, | |
| "learning_rate": 1.5008000000000001e-05, | |
| "loss": 0.4424, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.2640000000000002, | |
| "grad_norm": 72.18260955810547, | |
| "learning_rate": 1.4972444444444445e-05, | |
| "loss": 0.2405, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 25.848726272583008, | |
| "learning_rate": 1.493688888888889e-05, | |
| "loss": 0.1684, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.296, | |
| "grad_norm": 42.30290222167969, | |
| "learning_rate": 1.4901333333333334e-05, | |
| "loss": 0.1324, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.312, | |
| "grad_norm": 121.94938659667969, | |
| "learning_rate": 1.486577777777778e-05, | |
| "loss": 0.3352, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.328, | |
| "grad_norm": 26.369827270507812, | |
| "learning_rate": 1.4830222222222224e-05, | |
| "loss": 0.264, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.344, | |
| "grad_norm": 77.36036682128906, | |
| "learning_rate": 1.4794666666666669e-05, | |
| "loss": 0.4048, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 68.10829162597656, | |
| "learning_rate": 1.4759111111111113e-05, | |
| "loss": 0.255, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.376, | |
| "grad_norm": 50.70305633544922, | |
| "learning_rate": 1.4723555555555557e-05, | |
| "loss": 0.2573, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.392, | |
| "grad_norm": 17.606157302856445, | |
| "learning_rate": 1.4688000000000002e-05, | |
| "loss": 0.1742, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.408, | |
| "grad_norm": 47.515872955322266, | |
| "learning_rate": 1.4652444444444445e-05, | |
| "loss": 0.3382, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.424, | |
| "grad_norm": 35.3089599609375, | |
| "learning_rate": 1.461688888888889e-05, | |
| "loss": 0.2477, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 53.55549240112305, | |
| "learning_rate": 1.4581333333333334e-05, | |
| "loss": 0.1245, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.456, | |
| "grad_norm": 52.0760498046875, | |
| "learning_rate": 1.454577777777778e-05, | |
| "loss": 0.2309, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.472, | |
| "grad_norm": 12.876019477844238, | |
| "learning_rate": 1.4510222222222223e-05, | |
| "loss": 0.2314, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.488, | |
| "grad_norm": 126.93236541748047, | |
| "learning_rate": 1.4474666666666669e-05, | |
| "loss": 0.363, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.504, | |
| "grad_norm": 33.92259979248047, | |
| "learning_rate": 1.4439111111111113e-05, | |
| "loss": 0.1426, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 39.19934844970703, | |
| "learning_rate": 1.4403555555555556e-05, | |
| "loss": 0.4135, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.536, | |
| "grad_norm": 95.46045684814453, | |
| "learning_rate": 1.4368000000000002e-05, | |
| "loss": 0.2766, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.552, | |
| "grad_norm": 70.87342834472656, | |
| "learning_rate": 1.4332444444444446e-05, | |
| "loss": 0.2622, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.568, | |
| "grad_norm": 80.17041778564453, | |
| "learning_rate": 1.429688888888889e-05, | |
| "loss": 0.2669, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.584, | |
| "grad_norm": 51.33781433105469, | |
| "learning_rate": 1.4261333333333334e-05, | |
| "loss": 0.1214, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 8.450504302978516, | |
| "learning_rate": 1.4225777777777779e-05, | |
| "loss": 0.1397, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 3.616, | |
| "grad_norm": 71.05221557617188, | |
| "learning_rate": 1.4190222222222223e-05, | |
| "loss": 0.3126, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 3.632, | |
| "grad_norm": 107.07088470458984, | |
| "learning_rate": 1.4154666666666668e-05, | |
| "loss": 0.2445, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 3.648, | |
| "grad_norm": 129.06072998046875, | |
| "learning_rate": 1.4119111111111112e-05, | |
| "loss": 0.343, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 3.664, | |
| "grad_norm": 114.03457641601562, | |
| "learning_rate": 1.4083555555555556e-05, | |
| "loss": 0.4042, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 98.84168243408203, | |
| "learning_rate": 1.4048000000000002e-05, | |
| "loss": 0.4231, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 3.6959999999999997, | |
| "grad_norm": 87.2681655883789, | |
| "learning_rate": 1.4012444444444446e-05, | |
| "loss": 0.3805, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 3.7119999999999997, | |
| "grad_norm": 8.487804412841797, | |
| "learning_rate": 1.3976888888888891e-05, | |
| "loss": 0.1977, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 3.7279999999999998, | |
| "grad_norm": 57.447540283203125, | |
| "learning_rate": 1.3941333333333333e-05, | |
| "loss": 0.3665, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 3.7439999999999998, | |
| "grad_norm": 21.477903366088867, | |
| "learning_rate": 1.3905777777777779e-05, | |
| "loss": 0.1807, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 3.5772314071655273, | |
| "learning_rate": 1.3870222222222223e-05, | |
| "loss": 0.2618, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 3.776, | |
| "grad_norm": 71.77428436279297, | |
| "learning_rate": 1.3834666666666668e-05, | |
| "loss": 0.1927, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 3.792, | |
| "grad_norm": 115.86641693115234, | |
| "learning_rate": 1.3799111111111112e-05, | |
| "loss": 0.2278, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 3.808, | |
| "grad_norm": 32.912540435791016, | |
| "learning_rate": 1.3763555555555556e-05, | |
| "loss": 0.3169, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 3.824, | |
| "grad_norm": 90.283447265625, | |
| "learning_rate": 1.3728000000000001e-05, | |
| "loss": 0.4014, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 48.976383209228516, | |
| "learning_rate": 1.3692444444444445e-05, | |
| "loss": 0.3318, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.856, | |
| "grad_norm": 8.107906341552734, | |
| "learning_rate": 1.365688888888889e-05, | |
| "loss": 0.0932, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 3.872, | |
| "grad_norm": 77.96946716308594, | |
| "learning_rate": 1.3621333333333335e-05, | |
| "loss": 0.1904, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 3.888, | |
| "grad_norm": 31.673736572265625, | |
| "learning_rate": 1.358577777777778e-05, | |
| "loss": 0.3834, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 3.904, | |
| "grad_norm": 132.92593383789062, | |
| "learning_rate": 1.3550222222222222e-05, | |
| "loss": 0.3153, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 48.49889373779297, | |
| "learning_rate": 1.3514666666666668e-05, | |
| "loss": 0.2576, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 3.936, | |
| "grad_norm": 34.92246627807617, | |
| "learning_rate": 1.3479111111111112e-05, | |
| "loss": 0.16, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 3.952, | |
| "grad_norm": 21.563865661621094, | |
| "learning_rate": 1.3443555555555556e-05, | |
| "loss": 0.1484, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 3.968, | |
| "grad_norm": 78.57711029052734, | |
| "learning_rate": 1.3408000000000001e-05, | |
| "loss": 0.2304, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 3.984, | |
| "grad_norm": 18.119483947753906, | |
| "learning_rate": 1.3372444444444445e-05, | |
| "loss": 0.2055, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 121.16084289550781, | |
| "learning_rate": 1.333688888888889e-05, | |
| "loss": 0.1915, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_accuracy": 0.9566666666666667, | |
| "eval_f1": 0.9586420284715019, | |
| "eval_loss": 0.13208819925785065, | |
| "eval_runtime": 25.6321, | |
| "eval_samples_per_second": 58.52, | |
| "eval_steps_per_second": 14.63, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.016, | |
| "grad_norm": 67.88136291503906, | |
| "learning_rate": 1.3301333333333334e-05, | |
| "loss": 0.1068, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.032, | |
| "grad_norm": 74.76669311523438, | |
| "learning_rate": 1.326577777777778e-05, | |
| "loss": 0.2202, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.048, | |
| "grad_norm": 78.5439453125, | |
| "learning_rate": 1.3230222222222224e-05, | |
| "loss": 0.257, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.064, | |
| "grad_norm": 59.01443099975586, | |
| "learning_rate": 1.3194666666666668e-05, | |
| "loss": 0.1632, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 42.725563049316406, | |
| "learning_rate": 1.3159111111111111e-05, | |
| "loss": 0.1885, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.096, | |
| "grad_norm": 60.14830017089844, | |
| "learning_rate": 1.3123555555555557e-05, | |
| "loss": 0.2077, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 4.112, | |
| "grad_norm": 74.96038055419922, | |
| "learning_rate": 1.3088e-05, | |
| "loss": 0.261, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 4.128, | |
| "grad_norm": 97.1154556274414, | |
| "learning_rate": 1.3052444444444445e-05, | |
| "loss": 0.3009, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 4.144, | |
| "grad_norm": 11.91926097869873, | |
| "learning_rate": 1.301688888888889e-05, | |
| "loss": 0.185, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 29.406475067138672, | |
| "learning_rate": 1.2981333333333334e-05, | |
| "loss": 0.2454, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.176, | |
| "grad_norm": 4.042813777923584, | |
| "learning_rate": 1.294577777777778e-05, | |
| "loss": 0.1601, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 4.192, | |
| "grad_norm": 99.75011444091797, | |
| "learning_rate": 1.2910222222222223e-05, | |
| "loss": 0.2583, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 4.208, | |
| "grad_norm": 72.59245300292969, | |
| "learning_rate": 1.2874666666666669e-05, | |
| "loss": 0.3869, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 4.224, | |
| "grad_norm": 79.0741195678711, | |
| "learning_rate": 1.2839111111111111e-05, | |
| "loss": 0.2198, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 31.89919662475586, | |
| "learning_rate": 1.2803555555555557e-05, | |
| "loss": 0.1197, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.256, | |
| "grad_norm": 94.28645324707031, | |
| "learning_rate": 1.2768e-05, | |
| "loss": 0.2221, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.272, | |
| "grad_norm": 47.83198547363281, | |
| "learning_rate": 1.2732444444444444e-05, | |
| "loss": 0.2703, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.288, | |
| "grad_norm": 26.954627990722656, | |
| "learning_rate": 1.269688888888889e-05, | |
| "loss": 0.1873, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.304, | |
| "grad_norm": 19.696392059326172, | |
| "learning_rate": 1.2661333333333334e-05, | |
| "loss": 0.1055, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 85.9021224975586, | |
| "learning_rate": 1.262577777777778e-05, | |
| "loss": 0.2359, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.336, | |
| "grad_norm": 69.51016235351562, | |
| "learning_rate": 1.2590222222222223e-05, | |
| "loss": 0.3632, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.352, | |
| "grad_norm": 44.004737854003906, | |
| "learning_rate": 1.2554666666666669e-05, | |
| "loss": 0.2787, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.368, | |
| "grad_norm": 14.063050270080566, | |
| "learning_rate": 1.2519111111111112e-05, | |
| "loss": 0.2244, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.384, | |
| "grad_norm": 34.73302459716797, | |
| "learning_rate": 1.2483555555555558e-05, | |
| "loss": 0.2162, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 3.56476092338562, | |
| "learning_rate": 1.2448e-05, | |
| "loss": 0.1797, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.416, | |
| "grad_norm": 57.786460876464844, | |
| "learning_rate": 1.2412444444444444e-05, | |
| "loss": 0.2107, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.432, | |
| "grad_norm": 70.37076568603516, | |
| "learning_rate": 1.237688888888889e-05, | |
| "loss": 0.2161, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.448, | |
| "grad_norm": 29.15947151184082, | |
| "learning_rate": 1.2341333333333333e-05, | |
| "loss": 0.1679, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.464, | |
| "grad_norm": 46.475250244140625, | |
| "learning_rate": 1.2305777777777779e-05, | |
| "loss": 0.1782, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 31.443790435791016, | |
| "learning_rate": 1.2270222222222223e-05, | |
| "loss": 0.1657, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 4.496, | |
| "grad_norm": 51.556819915771484, | |
| "learning_rate": 1.2234666666666668e-05, | |
| "loss": 0.3409, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 4.5120000000000005, | |
| "grad_norm": 3.3252880573272705, | |
| "learning_rate": 1.2199111111111112e-05, | |
| "loss": 0.1991, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 4.5280000000000005, | |
| "grad_norm": 5.360611438751221, | |
| "learning_rate": 1.2163555555555558e-05, | |
| "loss": 0.2764, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 4.5440000000000005, | |
| "grad_norm": 7.163545608520508, | |
| "learning_rate": 1.2128000000000001e-05, | |
| "loss": 0.1315, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 4.5600000000000005, | |
| "grad_norm": 52.50614547729492, | |
| "learning_rate": 1.2092444444444444e-05, | |
| "loss": 0.2633, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 4.576, | |
| "grad_norm": 6.198153972625732, | |
| "learning_rate": 1.2056888888888889e-05, | |
| "loss": 0.2934, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 4.592, | |
| "grad_norm": 1.0431843996047974, | |
| "learning_rate": 1.2021333333333333e-05, | |
| "loss": 0.1791, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 4.608, | |
| "grad_norm": 85.78390502929688, | |
| "learning_rate": 1.1985777777777779e-05, | |
| "loss": 0.3817, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 4.624, | |
| "grad_norm": 59.080360412597656, | |
| "learning_rate": 1.1950222222222222e-05, | |
| "loss": 0.4065, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 20.528331756591797, | |
| "learning_rate": 1.1914666666666668e-05, | |
| "loss": 0.1737, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 4.656, | |
| "grad_norm": 41.44429016113281, | |
| "learning_rate": 1.1879111111111112e-05, | |
| "loss": 0.2387, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 4.672, | |
| "grad_norm": 0.18879224359989166, | |
| "learning_rate": 1.1843555555555557e-05, | |
| "loss": 0.2133, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 4.688, | |
| "grad_norm": 45.2426872253418, | |
| "learning_rate": 1.1808000000000001e-05, | |
| "loss": 0.285, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 4.704, | |
| "grad_norm": 61.843971252441406, | |
| "learning_rate": 1.1772444444444447e-05, | |
| "loss": 0.2561, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.8159428238868713, | |
| "learning_rate": 1.1736888888888889e-05, | |
| "loss": 0.185, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 4.736, | |
| "grad_norm": 70.10686492919922, | |
| "learning_rate": 1.1701333333333333e-05, | |
| "loss": 0.1155, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 4.752, | |
| "grad_norm": 22.079153060913086, | |
| "learning_rate": 1.1665777777777778e-05, | |
| "loss": 0.197, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 4.768, | |
| "grad_norm": 38.955101013183594, | |
| "learning_rate": 1.1630222222222222e-05, | |
| "loss": 0.0747, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 4.784, | |
| "grad_norm": 55.808860778808594, | |
| "learning_rate": 1.1594666666666668e-05, | |
| "loss": 0.2218, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 223.2913360595703, | |
| "learning_rate": 1.1559111111111111e-05, | |
| "loss": 0.3266, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 4.816, | |
| "grad_norm": 46.87549591064453, | |
| "learning_rate": 1.1523555555555557e-05, | |
| "loss": 0.2688, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 4.832, | |
| "grad_norm": 73.7055892944336, | |
| "learning_rate": 1.1488e-05, | |
| "loss": 0.2473, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 4.848, | |
| "grad_norm": 3.0956780910491943, | |
| "learning_rate": 1.1452444444444446e-05, | |
| "loss": 0.2409, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 4.864, | |
| "grad_norm": 6.121743679046631, | |
| "learning_rate": 1.141688888888889e-05, | |
| "loss": 0.2398, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 1.6634913682937622, | |
| "learning_rate": 1.1381333333333336e-05, | |
| "loss": 0.1823, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 4.896, | |
| "grad_norm": 3.9630935192108154, | |
| "learning_rate": 1.1345777777777778e-05, | |
| "loss": 0.1687, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 4.912, | |
| "grad_norm": 41.131324768066406, | |
| "learning_rate": 1.1310222222222222e-05, | |
| "loss": 0.1912, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 4.928, | |
| "grad_norm": 121.9698715209961, | |
| "learning_rate": 1.1274666666666667e-05, | |
| "loss": 0.5025, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 4.944, | |
| "grad_norm": 58.44524002075195, | |
| "learning_rate": 1.1239111111111111e-05, | |
| "loss": 0.286, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 11.90481948852539, | |
| "learning_rate": 1.1203555555555557e-05, | |
| "loss": 0.2558, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 4.976, | |
| "grad_norm": 18.4560546875, | |
| "learning_rate": 1.1168e-05, | |
| "loss": 0.2252, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 4.992, | |
| "grad_norm": 28.277318954467773, | |
| "learning_rate": 1.1132444444444446e-05, | |
| "loss": 0.1953, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_accuracy": 0.978, | |
| "eval_f1": 0.9784349951065628, | |
| "eval_loss": 0.07715080678462982, | |
| "eval_runtime": 25.1098, | |
| "eval_samples_per_second": 59.738, | |
| "eval_steps_per_second": 14.934, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 5.008, | |
| "grad_norm": 38.60945510864258, | |
| "learning_rate": 1.109688888888889e-05, | |
| "loss": 0.0791, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 5.024, | |
| "grad_norm": 0.11121569573879242, | |
| "learning_rate": 1.1061333333333335e-05, | |
| "loss": 0.2513, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "grad_norm": 7.318007946014404, | |
| "learning_rate": 1.102577777777778e-05, | |
| "loss": 0.2184, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 5.056, | |
| "grad_norm": 73.5037612915039, | |
| "learning_rate": 1.0990222222222221e-05, | |
| "loss": 0.2048, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 5.072, | |
| "grad_norm": 6.801938533782959, | |
| "learning_rate": 1.0954666666666667e-05, | |
| "loss": 0.2091, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 5.088, | |
| "grad_norm": 1.4947863817214966, | |
| "learning_rate": 1.091911111111111e-05, | |
| "loss": 0.1544, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 5.104, | |
| "grad_norm": 2.0187385082244873, | |
| "learning_rate": 1.0883555555555556e-05, | |
| "loss": 0.1092, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 49.457881927490234, | |
| "learning_rate": 1.0848e-05, | |
| "loss": 0.2173, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 5.136, | |
| "grad_norm": 4.548677444458008, | |
| "learning_rate": 1.0812444444444446e-05, | |
| "loss": 0.3133, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 5.152, | |
| "grad_norm": 23.157283782958984, | |
| "learning_rate": 1.077688888888889e-05, | |
| "loss": 0.1977, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 5.168, | |
| "grad_norm": 92.80079650878906, | |
| "learning_rate": 1.0741333333333335e-05, | |
| "loss": 0.197, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 5.184, | |
| "grad_norm": 23.261850357055664, | |
| "learning_rate": 1.0705777777777779e-05, | |
| "loss": 0.1754, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 70.62091827392578, | |
| "learning_rate": 1.0670222222222224e-05, | |
| "loss": 0.1377, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 5.216, | |
| "grad_norm": 1.145323395729065, | |
| "learning_rate": 1.0634666666666667e-05, | |
| "loss": 0.2681, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 5.232, | |
| "grad_norm": 82.71636199951172, | |
| "learning_rate": 1.059911111111111e-05, | |
| "loss": 0.1608, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 5.248, | |
| "grad_norm": 121.01233673095703, | |
| "learning_rate": 1.0563555555555556e-05, | |
| "loss": 0.3524, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 5.264, | |
| "grad_norm": 67.27264404296875, | |
| "learning_rate": 1.0528e-05, | |
| "loss": 0.1628, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 107.12232208251953, | |
| "learning_rate": 1.0492444444444445e-05, | |
| "loss": 0.2571, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 5.296, | |
| "grad_norm": 51.44169998168945, | |
| "learning_rate": 1.045688888888889e-05, | |
| "loss": 0.2379, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 5.312, | |
| "grad_norm": 92.96754455566406, | |
| "learning_rate": 1.0421333333333335e-05, | |
| "loss": 0.1719, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 5.328, | |
| "grad_norm": 0.3315001130104065, | |
| "learning_rate": 1.0385777777777779e-05, | |
| "loss": 0.1194, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 5.344, | |
| "grad_norm": 0.11354901641607285, | |
| "learning_rate": 1.0350222222222224e-05, | |
| "loss": 0.2085, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "grad_norm": 43.333709716796875, | |
| "learning_rate": 1.0314666666666668e-05, | |
| "loss": 0.1196, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 5.376, | |
| "grad_norm": 75.45565032958984, | |
| "learning_rate": 1.0279111111111114e-05, | |
| "loss": 0.1169, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 5.392, | |
| "grad_norm": 26.565641403198242, | |
| "learning_rate": 1.0243555555555556e-05, | |
| "loss": 0.1446, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 5.408, | |
| "grad_norm": 43.27265930175781, | |
| "learning_rate": 1.0208e-05, | |
| "loss": 0.2626, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 5.424, | |
| "grad_norm": 120.37715911865234, | |
| "learning_rate": 1.0172444444444445e-05, | |
| "loss": 0.1848, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 36.994632720947266, | |
| "learning_rate": 1.0136888888888889e-05, | |
| "loss": 0.1891, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 5.456, | |
| "grad_norm": 48.42155456542969, | |
| "learning_rate": 1.0101333333333334e-05, | |
| "loss": 0.2197, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 5.4719999999999995, | |
| "grad_norm": 3.1949923038482666, | |
| "learning_rate": 1.0065777777777778e-05, | |
| "loss": 0.2084, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 5.4879999999999995, | |
| "grad_norm": 125.98743438720703, | |
| "learning_rate": 1.0030222222222224e-05, | |
| "loss": 0.1868, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 5.504, | |
| "grad_norm": 96.58470916748047, | |
| "learning_rate": 9.994666666666668e-06, | |
| "loss": 0.2648, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 46.52883529663086, | |
| "learning_rate": 9.959111111111111e-06, | |
| "loss": 0.2097, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 5.536, | |
| "grad_norm": 50.802574157714844, | |
| "learning_rate": 9.923555555555557e-06, | |
| "loss": 0.3501, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 5.552, | |
| "grad_norm": 56.12847900390625, | |
| "learning_rate": 9.888000000000001e-06, | |
| "loss": 0.2818, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 5.568, | |
| "grad_norm": 84.0240249633789, | |
| "learning_rate": 9.852444444444446e-06, | |
| "loss": 0.2938, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 5.584, | |
| "grad_norm": 18.963293075561523, | |
| "learning_rate": 9.81688888888889e-06, | |
| "loss": 0.2539, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 20.54253578186035, | |
| "learning_rate": 9.781333333333334e-06, | |
| "loss": 0.1566, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 5.616, | |
| "grad_norm": 41.77975082397461, | |
| "learning_rate": 9.745777777777778e-06, | |
| "loss": 0.1616, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 5.632, | |
| "grad_norm": 101.94864654541016, | |
| "learning_rate": 9.710222222222223e-06, | |
| "loss": 0.2014, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 5.648, | |
| "grad_norm": 66.22476196289062, | |
| "learning_rate": 9.674666666666667e-06, | |
| "loss": 0.2032, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 5.664, | |
| "grad_norm": 35.40584182739258, | |
| "learning_rate": 9.639111111111113e-06, | |
| "loss": 0.2035, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 18.508302688598633, | |
| "learning_rate": 9.603555555555557e-06, | |
| "loss": 0.2169, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 5.696, | |
| "grad_norm": 2.2750229835510254, | |
| "learning_rate": 9.568e-06, | |
| "loss": 0.1242, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 5.712, | |
| "grad_norm": 7.904684543609619, | |
| "learning_rate": 9.532444444444446e-06, | |
| "loss": 0.1488, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 5.728, | |
| "grad_norm": 66.34574890136719, | |
| "learning_rate": 9.49688888888889e-06, | |
| "loss": 0.3348, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 5.744, | |
| "grad_norm": 2.1702802181243896, | |
| "learning_rate": 9.461333333333334e-06, | |
| "loss": 0.169, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 43.21394348144531, | |
| "learning_rate": 9.425777777777778e-06, | |
| "loss": 0.1906, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 5.776, | |
| "grad_norm": 96.57420349121094, | |
| "learning_rate": 9.390222222222223e-06, | |
| "loss": 0.2049, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 5.792, | |
| "grad_norm": 60.05061721801758, | |
| "learning_rate": 9.354666666666667e-06, | |
| "loss": 0.2975, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 5.808, | |
| "grad_norm": 7.084517478942871, | |
| "learning_rate": 9.319111111111113e-06, | |
| "loss": 0.1722, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 5.824, | |
| "grad_norm": 4.0784759521484375, | |
| "learning_rate": 9.283555555555556e-06, | |
| "loss": 0.1725, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 60.38849639892578, | |
| "learning_rate": 9.248e-06, | |
| "loss": 0.2305, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 5.856, | |
| "grad_norm": 42.00960159301758, | |
| "learning_rate": 9.212444444444446e-06, | |
| "loss": 0.2262, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 5.872, | |
| "grad_norm": 6.289929389953613, | |
| "learning_rate": 9.17688888888889e-06, | |
| "loss": 0.1101, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 5.888, | |
| "grad_norm": 46.45234298706055, | |
| "learning_rate": 9.141333333333333e-06, | |
| "loss": 0.2282, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 5.904, | |
| "grad_norm": 29.963151931762695, | |
| "learning_rate": 9.105777777777779e-06, | |
| "loss": 0.2787, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 37.11601257324219, | |
| "learning_rate": 9.070222222222223e-06, | |
| "loss": 0.2421, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 5.936, | |
| "grad_norm": 21.949438095092773, | |
| "learning_rate": 9.034666666666667e-06, | |
| "loss": 0.1956, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 5.952, | |
| "grad_norm": 1.3940507173538208, | |
| "learning_rate": 8.999111111111112e-06, | |
| "loss": 0.2005, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 5.968, | |
| "grad_norm": 166.17929077148438, | |
| "learning_rate": 8.963555555555556e-06, | |
| "loss": 0.1114, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 5.984, | |
| "grad_norm": 68.92640686035156, | |
| "learning_rate": 8.928000000000002e-06, | |
| "loss": 0.2453, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 40.45570755004883, | |
| "learning_rate": 8.892444444444445e-06, | |
| "loss": 0.183, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_accuracy": 0.9773333333333334, | |
| "eval_f1": 0.9781269999063931, | |
| "eval_loss": 0.08286113291978836, | |
| "eval_runtime": 25.561, | |
| "eval_samples_per_second": 58.683, | |
| "eval_steps_per_second": 14.671, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 6.016, | |
| "grad_norm": 55.92890167236328, | |
| "learning_rate": 8.85688888888889e-06, | |
| "loss": 0.1875, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 6.032, | |
| "grad_norm": 40.81064987182617, | |
| "learning_rate": 8.821333333333333e-06, | |
| "loss": 0.1211, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 6.048, | |
| "grad_norm": 37.10798645019531, | |
| "learning_rate": 8.785777777777779e-06, | |
| "loss": 0.1888, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 6.064, | |
| "grad_norm": 80.93401336669922, | |
| "learning_rate": 8.750222222222223e-06, | |
| "loss": 0.1778, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 3.9104630947113037, | |
| "learning_rate": 8.714666666666666e-06, | |
| "loss": 0.0544, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 6.096, | |
| "grad_norm": 75.7132568359375, | |
| "learning_rate": 8.679111111111112e-06, | |
| "loss": 0.2315, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 6.112, | |
| "grad_norm": 3.832777738571167, | |
| "learning_rate": 8.643555555555556e-06, | |
| "loss": 0.1239, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 6.128, | |
| "grad_norm": 64.49993896484375, | |
| "learning_rate": 8.608000000000001e-06, | |
| "loss": 0.2204, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 6.144, | |
| "grad_norm": 6.4475202560424805, | |
| "learning_rate": 8.572444444444445e-06, | |
| "loss": 0.1066, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 31.276575088500977, | |
| "learning_rate": 8.53688888888889e-06, | |
| "loss": 0.1227, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 6.176, | |
| "grad_norm": 24.951356887817383, | |
| "learning_rate": 8.501333333333334e-06, | |
| "loss": 0.1531, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 6.192, | |
| "grad_norm": 19.674461364746094, | |
| "learning_rate": 8.465777777777778e-06, | |
| "loss": 0.1219, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 6.208, | |
| "grad_norm": 50.07936477661133, | |
| "learning_rate": 8.430222222222222e-06, | |
| "loss": 0.1664, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 6.224, | |
| "grad_norm": 60.344425201416016, | |
| "learning_rate": 8.394666666666668e-06, | |
| "loss": 0.1654, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 0.577318012714386, | |
| "learning_rate": 8.359111111111112e-06, | |
| "loss": 0.1509, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 6.256, | |
| "grad_norm": 119.1912612915039, | |
| "learning_rate": 8.323555555555555e-06, | |
| "loss": 0.2452, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 6.272, | |
| "grad_norm": 92.77301025390625, | |
| "learning_rate": 8.288000000000001e-06, | |
| "loss": 0.1129, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 6.288, | |
| "grad_norm": 0.2689651548862457, | |
| "learning_rate": 8.252444444444445e-06, | |
| "loss": 0.1511, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 6.304, | |
| "grad_norm": 4.910072326660156, | |
| "learning_rate": 8.21688888888889e-06, | |
| "loss": 0.1867, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 147.96607971191406, | |
| "learning_rate": 8.181333333333334e-06, | |
| "loss": 0.1756, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 6.336, | |
| "grad_norm": 3.9301342964172363, | |
| "learning_rate": 8.145777777777778e-06, | |
| "loss": 0.0956, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 6.352, | |
| "grad_norm": 4.562084197998047, | |
| "learning_rate": 8.110222222222222e-06, | |
| "loss": 0.0985, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 6.368, | |
| "grad_norm": 163.62957763671875, | |
| "learning_rate": 8.074666666666667e-06, | |
| "loss": 0.2904, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 6.384, | |
| "grad_norm": 168.3592529296875, | |
| "learning_rate": 8.039111111111111e-06, | |
| "loss": 0.2561, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 70.33308410644531, | |
| "learning_rate": 8.003555555555557e-06, | |
| "loss": 0.1576, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 6.416, | |
| "grad_norm": 86.92400360107422, | |
| "learning_rate": 7.968e-06, | |
| "loss": 0.2956, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 6.432, | |
| "grad_norm": 100.0543212890625, | |
| "learning_rate": 7.932444444444444e-06, | |
| "loss": 0.2009, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 6.448, | |
| "grad_norm": 0.26956602931022644, | |
| "learning_rate": 7.89688888888889e-06, | |
| "loss": 0.1287, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 6.464, | |
| "grad_norm": 19.65234375, | |
| "learning_rate": 7.861333333333334e-06, | |
| "loss": 0.0629, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 11.0438871383667, | |
| "learning_rate": 7.82577777777778e-06, | |
| "loss": 0.0386, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 6.496, | |
| "grad_norm": 26.837541580200195, | |
| "learning_rate": 7.790222222222222e-06, | |
| "loss": 0.0877, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 6.5120000000000005, | |
| "grad_norm": 2.23330020904541, | |
| "learning_rate": 7.754666666666667e-06, | |
| "loss": 0.1536, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 6.5280000000000005, | |
| "grad_norm": 103.52494049072266, | |
| "learning_rate": 7.719111111111111e-06, | |
| "loss": 0.1594, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 6.5440000000000005, | |
| "grad_norm": 90.99219512939453, | |
| "learning_rate": 7.683555555555556e-06, | |
| "loss": 0.1525, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 6.5600000000000005, | |
| "grad_norm": 3.9606099128723145, | |
| "learning_rate": 7.648e-06, | |
| "loss": 0.1651, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 6.576, | |
| "grad_norm": 0.9935932159423828, | |
| "learning_rate": 7.612444444444444e-06, | |
| "loss": 0.1217, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 6.592, | |
| "grad_norm": 0.7840667366981506, | |
| "learning_rate": 7.576888888888889e-06, | |
| "loss": 0.0967, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 6.608, | |
| "grad_norm": 0.795747697353363, | |
| "learning_rate": 7.5413333333333335e-06, | |
| "loss": 0.132, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 6.624, | |
| "grad_norm": 5.798128604888916, | |
| "learning_rate": 7.505777777777778e-06, | |
| "loss": 0.1085, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 6.64, | |
| "grad_norm": 57.652103424072266, | |
| "learning_rate": 7.470222222222223e-06, | |
| "loss": 0.1493, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 6.656, | |
| "grad_norm": 0.1584286391735077, | |
| "learning_rate": 7.434666666666668e-06, | |
| "loss": 0.1348, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 6.672, | |
| "grad_norm": 35.631591796875, | |
| "learning_rate": 7.3991111111111114e-06, | |
| "loss": 0.1659, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 6.688, | |
| "grad_norm": 36.18688201904297, | |
| "learning_rate": 7.363555555555556e-06, | |
| "loss": 0.2248, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 6.704, | |
| "grad_norm": 63.91709899902344, | |
| "learning_rate": 7.328000000000001e-06, | |
| "loss": 0.1932, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 3.0715153217315674, | |
| "learning_rate": 7.2924444444444455e-06, | |
| "loss": 0.1612, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 6.736, | |
| "grad_norm": 24.774658203125, | |
| "learning_rate": 7.25688888888889e-06, | |
| "loss": 0.1223, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 6.752, | |
| "grad_norm": 0.2880302369594574, | |
| "learning_rate": 7.221333333333333e-06, | |
| "loss": 0.0766, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 6.768, | |
| "grad_norm": 193.8282470703125, | |
| "learning_rate": 7.185777777777778e-06, | |
| "loss": 0.1071, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 6.784, | |
| "grad_norm": 3.09204363822937, | |
| "learning_rate": 7.150222222222223e-06, | |
| "loss": 0.2013, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 3.475525379180908, | |
| "learning_rate": 7.114666666666667e-06, | |
| "loss": 0.1071, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 6.816, | |
| "grad_norm": 105.51129150390625, | |
| "learning_rate": 7.079111111111112e-06, | |
| "loss": 0.0891, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 6.832, | |
| "grad_norm": 12.143157958984375, | |
| "learning_rate": 7.043555555555556e-06, | |
| "loss": 0.1286, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 6.848, | |
| "grad_norm": 0.539508581161499, | |
| "learning_rate": 7.0080000000000005e-06, | |
| "loss": 0.11, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 6.864, | |
| "grad_norm": 16.954879760742188, | |
| "learning_rate": 6.972444444444445e-06, | |
| "loss": 0.0895, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 136.84512329101562, | |
| "learning_rate": 6.93688888888889e-06, | |
| "loss": 0.2736, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 6.896, | |
| "grad_norm": 0.7156215906143188, | |
| "learning_rate": 6.9013333333333346e-06, | |
| "loss": 0.1564, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 6.912, | |
| "grad_norm": 3.3197035789489746, | |
| "learning_rate": 6.8657777777777776e-06, | |
| "loss": 0.0335, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 6.928, | |
| "grad_norm": 22.07130241394043, | |
| "learning_rate": 6.830222222222222e-06, | |
| "loss": 0.1674, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 6.944, | |
| "grad_norm": 13.202752113342285, | |
| "learning_rate": 6.794666666666667e-06, | |
| "loss": 0.0608, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 49.284000396728516, | |
| "learning_rate": 6.759111111111112e-06, | |
| "loss": 0.1577, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 6.976, | |
| "grad_norm": 7.672852993011475, | |
| "learning_rate": 6.723555555555556e-06, | |
| "loss": 0.1058, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 6.992, | |
| "grad_norm": 0.3509444296360016, | |
| "learning_rate": 6.688e-06, | |
| "loss": 0.1513, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_accuracy": 0.972, | |
| "eval_f1": 0.9735106051850267, | |
| "eval_loss": 0.11335264146327972, | |
| "eval_runtime": 25.4218, | |
| "eval_samples_per_second": 59.004, | |
| "eval_steps_per_second": 14.751, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 7.008, | |
| "grad_norm": 6.444321155548096, | |
| "learning_rate": 6.652444444444445e-06, | |
| "loss": 0.1258, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 7.024, | |
| "grad_norm": 9.443979263305664, | |
| "learning_rate": 6.6168888888888896e-06, | |
| "loss": 0.0816, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 73.74608612060547, | |
| "learning_rate": 6.581333333333334e-06, | |
| "loss": 0.1251, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 7.056, | |
| "grad_norm": 1.7877308130264282, | |
| "learning_rate": 6.545777777777779e-06, | |
| "loss": 0.0425, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 7.072, | |
| "grad_norm": 52.212074279785156, | |
| "learning_rate": 6.510222222222222e-06, | |
| "loss": 0.1424, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 7.088, | |
| "grad_norm": 108.42538452148438, | |
| "learning_rate": 6.474666666666667e-06, | |
| "loss": 0.1876, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 7.104, | |
| "grad_norm": 0.7036087512969971, | |
| "learning_rate": 6.439111111111111e-06, | |
| "loss": 0.0646, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 7.12, | |
| "grad_norm": 30.391590118408203, | |
| "learning_rate": 6.403555555555556e-06, | |
| "loss": 0.1532, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 7.136, | |
| "grad_norm": 30.540075302124023, | |
| "learning_rate": 6.368000000000001e-06, | |
| "loss": 0.115, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 7.152, | |
| "grad_norm": 0.48061010241508484, | |
| "learning_rate": 6.332444444444445e-06, | |
| "loss": 0.0793, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 7.168, | |
| "grad_norm": 28.764617919921875, | |
| "learning_rate": 6.296888888888889e-06, | |
| "loss": 0.071, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 7.184, | |
| "grad_norm": 16.58357810974121, | |
| "learning_rate": 6.261333333333334e-06, | |
| "loss": 0.0931, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 88.44710540771484, | |
| "learning_rate": 6.225777777777779e-06, | |
| "loss": 0.2221, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 7.216, | |
| "grad_norm": 16.722288131713867, | |
| "learning_rate": 6.190222222222223e-06, | |
| "loss": 0.0601, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 7.232, | |
| "grad_norm": 36.703765869140625, | |
| "learning_rate": 6.154666666666668e-06, | |
| "loss": 0.0834, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 7.248, | |
| "grad_norm": 6.038788795471191, | |
| "learning_rate": 6.119111111111111e-06, | |
| "loss": 0.107, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 7.264, | |
| "grad_norm": 82.84561920166016, | |
| "learning_rate": 6.083555555555556e-06, | |
| "loss": 0.0518, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 4.832958221435547, | |
| "learning_rate": 6.048e-06, | |
| "loss": 0.0873, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 7.296, | |
| "grad_norm": 0.04455806314945221, | |
| "learning_rate": 6.012444444444445e-06, | |
| "loss": 0.0628, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 7.312, | |
| "grad_norm": 74.5232162475586, | |
| "learning_rate": 5.97688888888889e-06, | |
| "loss": 0.2425, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 7.328, | |
| "grad_norm": 77.59324645996094, | |
| "learning_rate": 5.941333333333334e-06, | |
| "loss": 0.1329, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 7.344, | |
| "grad_norm": 14.539669036865234, | |
| "learning_rate": 5.905777777777778e-06, | |
| "loss": 0.0915, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "grad_norm": 22.89209747314453, | |
| "learning_rate": 5.870222222222223e-06, | |
| "loss": 0.1302, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 7.376, | |
| "grad_norm": 20.032310485839844, | |
| "learning_rate": 5.834666666666668e-06, | |
| "loss": 0.1079, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 7.392, | |
| "grad_norm": 92.58062744140625, | |
| "learning_rate": 5.799111111111112e-06, | |
| "loss": 0.0798, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 7.408, | |
| "grad_norm": 0.06242356449365616, | |
| "learning_rate": 5.763555555555555e-06, | |
| "loss": 0.0412, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 7.424, | |
| "grad_norm": 30.884904861450195, | |
| "learning_rate": 5.728e-06, | |
| "loss": 0.1122, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 64.5287857055664, | |
| "learning_rate": 5.692444444444445e-06, | |
| "loss": 0.2161, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 7.456, | |
| "grad_norm": 53.6827392578125, | |
| "learning_rate": 5.6568888888888894e-06, | |
| "loss": 0.3298, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 7.4719999999999995, | |
| "grad_norm": 109.19721221923828, | |
| "learning_rate": 5.621333333333334e-06, | |
| "loss": 0.1787, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 7.4879999999999995, | |
| "grad_norm": 0.5951263904571533, | |
| "learning_rate": 5.585777777777778e-06, | |
| "loss": 0.0459, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 7.504, | |
| "grad_norm": 64.03005981445312, | |
| "learning_rate": 5.550222222222223e-06, | |
| "loss": 0.1391, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 8.298081398010254, | |
| "learning_rate": 5.514666666666667e-06, | |
| "loss": 0.0991, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 7.536, | |
| "grad_norm": 142.68936157226562, | |
| "learning_rate": 5.479111111111112e-06, | |
| "loss": 0.1358, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 7.552, | |
| "grad_norm": 21.265317916870117, | |
| "learning_rate": 5.443555555555557e-06, | |
| "loss": 0.1401, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 7.568, | |
| "grad_norm": 30.693742752075195, | |
| "learning_rate": 5.408e-06, | |
| "loss": 0.0769, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 7.584, | |
| "grad_norm": 64.67269134521484, | |
| "learning_rate": 5.372444444444444e-06, | |
| "loss": 0.1787, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 0.0010892553254961967, | |
| "learning_rate": 5.336888888888889e-06, | |
| "loss": 0.1196, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 7.616, | |
| "grad_norm": 81.94013977050781, | |
| "learning_rate": 5.301333333333334e-06, | |
| "loss": 0.0926, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 7.632, | |
| "grad_norm": 12.81042766571045, | |
| "learning_rate": 5.2657777777777785e-06, | |
| "loss": 0.0762, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 7.648, | |
| "grad_norm": 20.876262664794922, | |
| "learning_rate": 5.230222222222223e-06, | |
| "loss": 0.1249, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 7.664, | |
| "grad_norm": 3.2425315380096436, | |
| "learning_rate": 5.194666666666667e-06, | |
| "loss": 0.0505, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 54.530662536621094, | |
| "learning_rate": 5.159111111111112e-06, | |
| "loss": 0.0492, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 7.696, | |
| "grad_norm": 1.59047269821167, | |
| "learning_rate": 5.123555555555556e-06, | |
| "loss": 0.0707, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 7.712, | |
| "grad_norm": 0.6524139046669006, | |
| "learning_rate": 5.088000000000001e-06, | |
| "loss": 0.0962, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 7.728, | |
| "grad_norm": 34.15525436401367, | |
| "learning_rate": 5.052444444444446e-06, | |
| "loss": 0.0608, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 7.744, | |
| "grad_norm": 25.281831741333008, | |
| "learning_rate": 5.016888888888889e-06, | |
| "loss": 0.0812, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 25.739002227783203, | |
| "learning_rate": 4.9813333333333335e-06, | |
| "loss": 0.1723, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 7.776, | |
| "grad_norm": 0.08939272165298462, | |
| "learning_rate": 4.945777777777778e-06, | |
| "loss": 0.0563, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 7.792, | |
| "grad_norm": 17.250640869140625, | |
| "learning_rate": 4.910222222222223e-06, | |
| "loss": 0.1413, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 7.808, | |
| "grad_norm": 1.4842759370803833, | |
| "learning_rate": 4.874666666666667e-06, | |
| "loss": 0.1468, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 7.824, | |
| "grad_norm": 45.6221809387207, | |
| "learning_rate": 4.839111111111111e-06, | |
| "loss": 0.1271, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 75.50129699707031, | |
| "learning_rate": 4.803555555555556e-06, | |
| "loss": 0.0428, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 7.856, | |
| "grad_norm": 36.81220626831055, | |
| "learning_rate": 4.768000000000001e-06, | |
| "loss": 0.1584, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 7.872, | |
| "grad_norm": 2.676856517791748, | |
| "learning_rate": 4.7324444444444455e-06, | |
| "loss": 0.1083, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 7.888, | |
| "grad_norm": 0.38402771949768066, | |
| "learning_rate": 4.696888888888889e-06, | |
| "loss": 0.0717, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 7.904, | |
| "grad_norm": 64.51920318603516, | |
| "learning_rate": 4.661333333333334e-06, | |
| "loss": 0.1976, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 7.92, | |
| "grad_norm": 0.05417017638683319, | |
| "learning_rate": 4.625777777777778e-06, | |
| "loss": 0.0584, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 7.936, | |
| "grad_norm": 39.375465393066406, | |
| "learning_rate": 4.5902222222222225e-06, | |
| "loss": 0.0555, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 7.952, | |
| "grad_norm": 0.13842245936393738, | |
| "learning_rate": 4.554666666666667e-06, | |
| "loss": 0.1941, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 7.968, | |
| "grad_norm": 0.08617054671049118, | |
| "learning_rate": 4.519111111111111e-06, | |
| "loss": 0.1379, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 7.984, | |
| "grad_norm": 1.8630857467651367, | |
| "learning_rate": 4.483555555555556e-06, | |
| "loss": 0.1613, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.00913298036903143, | |
| "learning_rate": 4.4480000000000004e-06, | |
| "loss": 0.1222, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_accuracy": 0.972, | |
| "eval_f1": 0.9728645176079822, | |
| "eval_loss": 0.11636786162853241, | |
| "eval_runtime": 25.6737, | |
| "eval_samples_per_second": 58.426, | |
| "eval_steps_per_second": 14.606, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 8.016, | |
| "grad_norm": 0.013823838904500008, | |
| "learning_rate": 4.412444444444445e-06, | |
| "loss": 0.0628, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 8.032, | |
| "grad_norm": 0.8125199675559998, | |
| "learning_rate": 4.37688888888889e-06, | |
| "loss": 0.0465, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 8.048, | |
| "grad_norm": 20.5372371673584, | |
| "learning_rate": 4.341333333333334e-06, | |
| "loss": 0.1877, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 8.064, | |
| "grad_norm": 104.10809326171875, | |
| "learning_rate": 4.305777777777778e-06, | |
| "loss": 0.0914, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 8.08, | |
| "grad_norm": 8.148750305175781, | |
| "learning_rate": 4.270222222222223e-06, | |
| "loss": 0.1, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 8.096, | |
| "grad_norm": 20.803829193115234, | |
| "learning_rate": 4.234666666666667e-06, | |
| "loss": 0.2355, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 8.112, | |
| "grad_norm": 5.129018306732178, | |
| "learning_rate": 4.199111111111112e-06, | |
| "loss": 0.0997, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 8.128, | |
| "grad_norm": 226.55096435546875, | |
| "learning_rate": 4.1635555555555554e-06, | |
| "loss": 0.2214, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 8.144, | |
| "grad_norm": 7.318321704864502, | |
| "learning_rate": 4.128e-06, | |
| "loss": 0.0553, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "grad_norm": 21.40049171447754, | |
| "learning_rate": 4.092444444444445e-06, | |
| "loss": 0.0479, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 8.176, | |
| "grad_norm": 0.28443557024002075, | |
| "learning_rate": 4.0568888888888895e-06, | |
| "loss": 0.0686, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 8.192, | |
| "grad_norm": 0.7155297994613647, | |
| "learning_rate": 4.021333333333333e-06, | |
| "loss": 0.0494, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 8.208, | |
| "grad_norm": 3.961090087890625, | |
| "learning_rate": 3.985777777777778e-06, | |
| "loss": 0.1256, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 8.224, | |
| "grad_norm": 47.36995315551758, | |
| "learning_rate": 3.950222222222223e-06, | |
| "loss": 0.1989, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 8.24, | |
| "grad_norm": 2.0493502616882324, | |
| "learning_rate": 3.914666666666667e-06, | |
| "loss": 0.0808, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 8.256, | |
| "grad_norm": 83.30001831054688, | |
| "learning_rate": 3.879111111111111e-06, | |
| "loss": 0.0907, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 8.272, | |
| "grad_norm": 0.30890294909477234, | |
| "learning_rate": 3.843555555555556e-06, | |
| "loss": 0.0017, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 8.288, | |
| "grad_norm": 10.248932838439941, | |
| "learning_rate": 3.8080000000000006e-06, | |
| "loss": 0.0589, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 8.304, | |
| "grad_norm": 49.623043060302734, | |
| "learning_rate": 3.7724444444444445e-06, | |
| "loss": 0.1135, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 0.22713905572891235, | |
| "learning_rate": 3.736888888888889e-06, | |
| "loss": 0.0997, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 8.336, | |
| "grad_norm": 0.3724857568740845, | |
| "learning_rate": 3.7013333333333334e-06, | |
| "loss": 0.0764, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 8.352, | |
| "grad_norm": 0.576058030128479, | |
| "learning_rate": 3.665777777777778e-06, | |
| "loss": 0.0866, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 8.368, | |
| "grad_norm": 67.60527038574219, | |
| "learning_rate": 3.630222222222223e-06, | |
| "loss": 0.0999, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 8.384, | |
| "grad_norm": 32.816749572753906, | |
| "learning_rate": 3.5946666666666667e-06, | |
| "loss": 0.1252, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 53.60542678833008, | |
| "learning_rate": 3.5591111111111114e-06, | |
| "loss": 0.1626, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 8.416, | |
| "grad_norm": 25.89781951904297, | |
| "learning_rate": 3.5235555555555556e-06, | |
| "loss": 0.1803, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 8.432, | |
| "grad_norm": 7.168365001678467, | |
| "learning_rate": 3.4880000000000003e-06, | |
| "loss": 0.0965, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 8.448, | |
| "grad_norm": 84.50868225097656, | |
| "learning_rate": 3.452444444444445e-06, | |
| "loss": 0.0859, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 8.464, | |
| "grad_norm": 33.135231018066406, | |
| "learning_rate": 3.416888888888889e-06, | |
| "loss": 0.1881, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 27.80241584777832, | |
| "learning_rate": 3.3813333333333335e-06, | |
| "loss": 0.0535, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 8.496, | |
| "grad_norm": 5.0428361892700195, | |
| "learning_rate": 3.345777777777778e-06, | |
| "loss": 0.1666, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 8.512, | |
| "grad_norm": 0.08426607400178909, | |
| "learning_rate": 3.3102222222222225e-06, | |
| "loss": 0.1447, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 8.528, | |
| "grad_norm": 13.224571228027344, | |
| "learning_rate": 3.274666666666667e-06, | |
| "loss": 0.2522, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 8.544, | |
| "grad_norm": 36.19147491455078, | |
| "learning_rate": 3.239111111111111e-06, | |
| "loss": 0.1451, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 8.56, | |
| "grad_norm": 1.1866250038146973, | |
| "learning_rate": 3.2035555555555557e-06, | |
| "loss": 0.1019, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 8.576, | |
| "grad_norm": 61.716758728027344, | |
| "learning_rate": 3.1680000000000004e-06, | |
| "loss": 0.1035, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 8.592, | |
| "grad_norm": 7.928038120269775, | |
| "learning_rate": 3.1324444444444447e-06, | |
| "loss": 0.1035, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 8.608, | |
| "grad_norm": 1.8415522575378418, | |
| "learning_rate": 3.0968888888888894e-06, | |
| "loss": 0.0851, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 8.624, | |
| "grad_norm": 6.056102275848389, | |
| "learning_rate": 3.0613333333333332e-06, | |
| "loss": 0.1557, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 113.13326263427734, | |
| "learning_rate": 3.025777777777778e-06, | |
| "loss": 0.1385, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 8.656, | |
| "grad_norm": 52.255775451660156, | |
| "learning_rate": 2.9902222222222226e-06, | |
| "loss": 0.0754, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 8.672, | |
| "grad_norm": 0.006745174061506987, | |
| "learning_rate": 2.954666666666667e-06, | |
| "loss": 0.0764, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 8.688, | |
| "grad_norm": 2.841777801513672, | |
| "learning_rate": 2.9191111111111116e-06, | |
| "loss": 0.1011, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 8.704, | |
| "grad_norm": 2.33146071434021, | |
| "learning_rate": 2.8835555555555554e-06, | |
| "loss": 0.1613, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 8.72, | |
| "grad_norm": 36.63135528564453, | |
| "learning_rate": 2.848e-06, | |
| "loss": 0.0473, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 8.736, | |
| "grad_norm": 9.347567558288574, | |
| "learning_rate": 2.8124444444444448e-06, | |
| "loss": 0.0489, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 8.752, | |
| "grad_norm": 35.201026916503906, | |
| "learning_rate": 2.776888888888889e-06, | |
| "loss": 0.0275, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 8.768, | |
| "grad_norm": 0.5583624243736267, | |
| "learning_rate": 2.7413333333333337e-06, | |
| "loss": 0.0659, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 8.784, | |
| "grad_norm": 33.14691162109375, | |
| "learning_rate": 2.7057777777777776e-06, | |
| "loss": 0.0206, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 58.47487258911133, | |
| "learning_rate": 2.6702222222222223e-06, | |
| "loss": 0.0899, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 8.816, | |
| "grad_norm": 0.43246692419052124, | |
| "learning_rate": 2.634666666666667e-06, | |
| "loss": 0.1579, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 8.832, | |
| "grad_norm": 0.14556622505187988, | |
| "learning_rate": 2.5991111111111112e-06, | |
| "loss": 0.1158, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 8.848, | |
| "grad_norm": 81.55155944824219, | |
| "learning_rate": 2.563555555555556e-06, | |
| "loss": 0.1548, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 8.864, | |
| "grad_norm": 0.013808293268084526, | |
| "learning_rate": 2.5280000000000006e-06, | |
| "loss": 0.1388, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 1.4838815927505493, | |
| "learning_rate": 2.4924444444444445e-06, | |
| "loss": 0.0083, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 8.896, | |
| "grad_norm": 98.49444580078125, | |
| "learning_rate": 2.456888888888889e-06, | |
| "loss": 0.0779, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 8.912, | |
| "grad_norm": 6.951257705688477, | |
| "learning_rate": 2.4213333333333334e-06, | |
| "loss": 0.0272, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 8.928, | |
| "grad_norm": 0.1495244801044464, | |
| "learning_rate": 2.385777777777778e-06, | |
| "loss": 0.0335, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 8.943999999999999, | |
| "grad_norm": 0.030582094565033913, | |
| "learning_rate": 2.3502222222222224e-06, | |
| "loss": 0.072, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 0.10239086300134659, | |
| "learning_rate": 2.3146666666666666e-06, | |
| "loss": 0.0554, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 8.975999999999999, | |
| "grad_norm": 6.669166564941406, | |
| "learning_rate": 2.2791111111111113e-06, | |
| "loss": 0.0266, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 8.992, | |
| "grad_norm": 115.25152587890625, | |
| "learning_rate": 2.2435555555555556e-06, | |
| "loss": 0.2001, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_accuracy": 0.9773333333333334, | |
| "eval_f1": 0.9782696872674377, | |
| "eval_loss": 0.08652861416339874, | |
| "eval_runtime": 25.2033, | |
| "eval_samples_per_second": 59.516, | |
| "eval_steps_per_second": 14.879, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 9.008, | |
| "grad_norm": 6.198025703430176, | |
| "learning_rate": 2.2080000000000003e-06, | |
| "loss": 0.0688, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 9.024, | |
| "grad_norm": 0.08385764062404633, | |
| "learning_rate": 2.1724444444444446e-06, | |
| "loss": 0.0546, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "grad_norm": 37.93516540527344, | |
| "learning_rate": 2.1368888888888892e-06, | |
| "loss": 0.0769, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 9.056, | |
| "grad_norm": 0.6508764028549194, | |
| "learning_rate": 2.1013333333333335e-06, | |
| "loss": 0.063, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 9.072, | |
| "grad_norm": 0.00627525057643652, | |
| "learning_rate": 2.0657777777777778e-06, | |
| "loss": 0.0713, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 9.088, | |
| "grad_norm": 6.918279647827148, | |
| "learning_rate": 2.0302222222222225e-06, | |
| "loss": 0.1266, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 9.104, | |
| "grad_norm": 5.829410552978516, | |
| "learning_rate": 1.9946666666666667e-06, | |
| "loss": 0.0075, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 9.12, | |
| "grad_norm": 0.38724929094314575, | |
| "learning_rate": 1.9591111111111114e-06, | |
| "loss": 0.0384, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 9.136, | |
| "grad_norm": 21.070451736450195, | |
| "learning_rate": 1.9235555555555557e-06, | |
| "loss": 0.0747, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 9.152, | |
| "grad_norm": 36.819156646728516, | |
| "learning_rate": 1.8880000000000002e-06, | |
| "loss": 0.0138, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 9.168, | |
| "grad_norm": 0.718610405921936, | |
| "learning_rate": 1.8524444444444444e-06, | |
| "loss": 0.035, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 9.184, | |
| "grad_norm": 124.62521362304688, | |
| "learning_rate": 1.8168888888888891e-06, | |
| "loss": 0.0578, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 24.688091278076172, | |
| "learning_rate": 1.7813333333333336e-06, | |
| "loss": 0.1136, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 9.216, | |
| "grad_norm": 2.244947910308838, | |
| "learning_rate": 1.7457777777777779e-06, | |
| "loss": 0.2043, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 9.232, | |
| "grad_norm": 41.393150329589844, | |
| "learning_rate": 1.7102222222222224e-06, | |
| "loss": 0.0379, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 9.248, | |
| "grad_norm": 0.04206838831305504, | |
| "learning_rate": 1.6746666666666668e-06, | |
| "loss": 0.0528, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 9.264, | |
| "grad_norm": 0.4184645116329193, | |
| "learning_rate": 1.6391111111111113e-06, | |
| "loss": 0.0387, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "grad_norm": 0.005343704950064421, | |
| "learning_rate": 1.6035555555555558e-06, | |
| "loss": 0.0581, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 9.296, | |
| "grad_norm": 4.760532379150391, | |
| "learning_rate": 1.568e-06, | |
| "loss": 0.0244, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 9.312, | |
| "grad_norm": 168.53097534179688, | |
| "learning_rate": 1.5324444444444445e-06, | |
| "loss": 0.0925, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 9.328, | |
| "grad_norm": 0.011192296631634235, | |
| "learning_rate": 1.496888888888889e-06, | |
| "loss": 0.0392, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 9.344, | |
| "grad_norm": 76.10388946533203, | |
| "learning_rate": 1.4613333333333335e-06, | |
| "loss": 0.0382, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 9.36, | |
| "grad_norm": 1.840432047843933, | |
| "learning_rate": 1.4257777777777778e-06, | |
| "loss": 0.0388, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 9.376, | |
| "grad_norm": 12.041976928710938, | |
| "learning_rate": 1.3902222222222222e-06, | |
| "loss": 0.0301, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 9.392, | |
| "grad_norm": 0.9841470122337341, | |
| "learning_rate": 1.354666666666667e-06, | |
| "loss": 0.1642, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 9.408, | |
| "grad_norm": 123.9576644897461, | |
| "learning_rate": 1.3191111111111112e-06, | |
| "loss": 0.0573, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 9.424, | |
| "grad_norm": 0.32323941588401794, | |
| "learning_rate": 1.2835555555555557e-06, | |
| "loss": 0.0091, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "grad_norm": 17.422338485717773, | |
| "learning_rate": 1.248e-06, | |
| "loss": 0.0385, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 9.456, | |
| "grad_norm": 9.723565101623535, | |
| "learning_rate": 1.2124444444444446e-06, | |
| "loss": 0.1448, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 9.472, | |
| "grad_norm": 68.42117309570312, | |
| "learning_rate": 1.176888888888889e-06, | |
| "loss": 0.1427, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 9.488, | |
| "grad_norm": 1.3120919466018677, | |
| "learning_rate": 1.1413333333333334e-06, | |
| "loss": 0.0665, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 9.504, | |
| "grad_norm": 20.033823013305664, | |
| "learning_rate": 1.1057777777777779e-06, | |
| "loss": 0.1087, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 9.52, | |
| "grad_norm": 104.10298156738281, | |
| "learning_rate": 1.0702222222222223e-06, | |
| "loss": 0.1248, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 9.536, | |
| "grad_norm": 94.86436462402344, | |
| "learning_rate": 1.0346666666666668e-06, | |
| "loss": 0.0534, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 9.552, | |
| "grad_norm": 52.450225830078125, | |
| "learning_rate": 9.991111111111113e-07, | |
| "loss": 0.0851, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 9.568, | |
| "grad_norm": 140.80067443847656, | |
| "learning_rate": 9.635555555555556e-07, | |
| "loss": 0.0533, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 9.584, | |
| "grad_norm": 0.12079375237226486, | |
| "learning_rate": 9.28e-07, | |
| "loss": 0.0779, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 2.7201499938964844, | |
| "learning_rate": 8.924444444444445e-07, | |
| "loss": 0.0224, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 9.616, | |
| "grad_norm": 0.0656299740076065, | |
| "learning_rate": 8.568888888888889e-07, | |
| "loss": 0.0276, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 9.632, | |
| "grad_norm": 0.43925511837005615, | |
| "learning_rate": 8.213333333333334e-07, | |
| "loss": 0.0124, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 9.648, | |
| "grad_norm": 0.43047311902046204, | |
| "learning_rate": 7.857777777777778e-07, | |
| "loss": 0.0309, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 9.664, | |
| "grad_norm": 0.023367440328001976, | |
| "learning_rate": 7.502222222222223e-07, | |
| "loss": 0.0081, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 9.68, | |
| "grad_norm": 0.7051363587379456, | |
| "learning_rate": 7.146666666666667e-07, | |
| "loss": 0.0099, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 9.696, | |
| "grad_norm": 1.881511926651001, | |
| "learning_rate": 6.791111111111112e-07, | |
| "loss": 0.046, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 9.712, | |
| "grad_norm": 13.757627487182617, | |
| "learning_rate": 6.435555555555556e-07, | |
| "loss": 0.1202, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 9.728, | |
| "grad_norm": 59.86408996582031, | |
| "learning_rate": 6.08e-07, | |
| "loss": 0.1712, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 9.744, | |
| "grad_norm": 18.352689743041992, | |
| "learning_rate": 5.724444444444445e-07, | |
| "loss": 0.0606, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 9.76, | |
| "grad_norm": 0.10592364519834518, | |
| "learning_rate": 5.368888888888889e-07, | |
| "loss": 0.063, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 9.776, | |
| "grad_norm": 0.009613972157239914, | |
| "learning_rate": 5.013333333333334e-07, | |
| "loss": 0.0522, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 9.792, | |
| "grad_norm": 32.441123962402344, | |
| "learning_rate": 4.6577777777777785e-07, | |
| "loss": 0.0699, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 9.808, | |
| "grad_norm": 3.635063648223877, | |
| "learning_rate": 4.302222222222223e-07, | |
| "loss": 0.0856, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 9.824, | |
| "grad_norm": 59.86997985839844, | |
| "learning_rate": 3.9466666666666665e-07, | |
| "loss": 0.0339, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 9.84, | |
| "grad_norm": 85.35664367675781, | |
| "learning_rate": 3.5911111111111113e-07, | |
| "loss": 0.0553, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 9.856, | |
| "grad_norm": 5.168771743774414, | |
| "learning_rate": 3.2355555555555556e-07, | |
| "loss": 0.0423, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 9.872, | |
| "grad_norm": 72.00662231445312, | |
| "learning_rate": 2.8800000000000004e-07, | |
| "loss": 0.0767, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 9.888, | |
| "grad_norm": 1.257222056388855, | |
| "learning_rate": 2.5244444444444446e-07, | |
| "loss": 0.021, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 9.904, | |
| "grad_norm": 0.03861124441027641, | |
| "learning_rate": 2.1688888888888892e-07, | |
| "loss": 0.101, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 9.92, | |
| "grad_norm": 60.662452697753906, | |
| "learning_rate": 1.8133333333333337e-07, | |
| "loss": 0.0256, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 9.936, | |
| "grad_norm": 57.533695220947266, | |
| "learning_rate": 1.457777777777778e-07, | |
| "loss": 0.0846, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 9.952, | |
| "grad_norm": 0.019413741305470467, | |
| "learning_rate": 1.1022222222222222e-07, | |
| "loss": 0.0633, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 9.968, | |
| "grad_norm": 0.07436740398406982, | |
| "learning_rate": 7.466666666666667e-08, | |
| "loss": 0.0487, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 9.984, | |
| "grad_norm": 0.010338145308196545, | |
| "learning_rate": 3.911111111111111e-08, | |
| "loss": 0.0035, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 47.08232498168945, | |
| "learning_rate": 3.555555555555556e-09, | |
| "loss": 0.0618, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_accuracy": 0.98, | |
| "eval_f1": 0.9809426117559896, | |
| "eval_loss": 0.09235712885856628, | |
| "eval_runtime": 25.2279, | |
| "eval_samples_per_second": 59.458, | |
| "eval_steps_per_second": 14.864, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 6250, | |
| "total_flos": 1.02317615087616e+19, | |
| "train_loss": 0.2805785644757748, | |
| "train_runtime": 4473.2535, | |
| "train_samples_per_second": 22.355, | |
| "train_steps_per_second": 1.397 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.02317615087616e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
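
The state above is the raw `trainer_state.json` written by a Hugging Face `Trainer` run. A minimal sketch of how one might load it and pull out the headline numbers follows; the file path is a hypothetical placeholder, and only the Python standard library is used.

```python
# Minimal sketch: parse a trainer_state.json like the one above and
# summarize the run. STATE_PATH is an assumed placeholder path.
import json

STATE_PATH = "trainer_state.json"  # hypothetical; point this at your file

with open(STATE_PATH) as f:
    state = json.load(f)

# log_history mixes per-step training records (which carry a "loss" key)
# with per-epoch eval records (which carry "eval_loss"); the final summary
# record carries neither, so it is excluded from both lists.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"{len(train_log)} training records, {len(eval_log)} eval records")

# Report the eval checkpoint with the highest accuracy
# (0.98 at epoch 10, step 6250, in this run).
best = max(eval_log, key=lambda e: e["eval_accuracy"])
print(f"best eval_accuracy={best['eval_accuracy']:.4f} "
      f"(eval_f1={best['eval_f1']:.4f}) "
      f"at epoch {best['epoch']:g}, step {best['step']}")
```

With a plotting library on hand, the same two lists can be turned into loss and accuracy curves directly, since every record already carries its `step` and `epoch`.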