| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.3034410844629822, | |
| "eval_steps": 100, | |
| "global_step": 10000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0013034410844629822, | |
| "grad_norm": 752.3058471679688, | |
| "learning_rate": 3.6000000000000005e-08, | |
| "loss": 36.8414, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0026068821689259644, | |
| "grad_norm": 842.267822265625, | |
| "learning_rate": 7.6e-08, | |
| "loss": 38.6302, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.003910323253388947, | |
| "grad_norm": 743.934326171875, | |
| "learning_rate": 1.16e-07, | |
| "loss": 36.3021, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.005213764337851929, | |
| "grad_norm": 934.6981201171875, | |
| "learning_rate": 1.56e-07, | |
| "loss": 36.9985, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.006517205422314911, | |
| "grad_norm": 649.5113525390625, | |
| "learning_rate": 1.96e-07, | |
| "loss": 33.0931, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.007820646506777894, | |
| "grad_norm": 552.53662109375, | |
| "learning_rate": 2.3600000000000002e-07, | |
| "loss": 32.0638, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.009124087591240875, | |
| "grad_norm": 582.6400146484375, | |
| "learning_rate": 2.7600000000000004e-07, | |
| "loss": 30.6222, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.010427528675703858, | |
| "grad_norm": 445.86651611328125, | |
| "learning_rate": 3.160000000000001e-07, | |
| "loss": 28.4039, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.01173096976016684, | |
| "grad_norm": 409.77642822265625, | |
| "learning_rate": 3.56e-07, | |
| "loss": 27.1438, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.013034410844629822, | |
| "grad_norm": 361.9573059082031, | |
| "learning_rate": 3.9600000000000005e-07, | |
| "loss": 24.495, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.013034410844629822, | |
| "eval/acc": 4.651162624359131, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.013034410844629822, | |
| "eval_loss": 18.547975540161133, | |
| "eval_runtime": 0.9127, | |
| "eval_samples_per_second": 47.114, | |
| "eval_steps_per_second": 1.096, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.014337851929092805, | |
| "grad_norm": 313.210693359375, | |
| "learning_rate": 4.3600000000000004e-07, | |
| "loss": 21.7606, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.01564129301355579, | |
| "grad_norm": 329.11328125, | |
| "learning_rate": 4.760000000000001e-07, | |
| "loss": 19.5126, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.01694473409801877, | |
| "grad_norm": 208.9153289794922, | |
| "learning_rate": 5.16e-07, | |
| "loss": 18.0711, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.01824817518248175, | |
| "grad_norm": 220.79066467285156, | |
| "learning_rate": 5.560000000000001e-07, | |
| "loss": 16.9045, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.019551616266944735, | |
| "grad_norm": 167.1522216796875, | |
| "learning_rate": 5.960000000000001e-07, | |
| "loss": 14.9046, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.020855057351407715, | |
| "grad_norm": 154.0718994140625, | |
| "learning_rate": 6.360000000000001e-07, | |
| "loss": 14.7601, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0221584984358707, | |
| "grad_norm": 146.39012145996094, | |
| "learning_rate": 6.76e-07, | |
| "loss": 12.5387, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.02346193952033368, | |
| "grad_norm": 153.8921356201172, | |
| "learning_rate": 7.16e-07, | |
| "loss": 11.6882, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.024765380604796664, | |
| "grad_norm": 187.7710723876953, | |
| "learning_rate": 7.56e-07, | |
| "loss": 11.9919, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.026068821689259645, | |
| "grad_norm": 163.95228576660156, | |
| "learning_rate": 7.960000000000001e-07, | |
| "loss": 10.8187, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.026068821689259645, | |
| "eval/acc": 11.627906799316406, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.026068821689259645, | |
| "eval_loss": 8.683622360229492, | |
| "eval_runtime": 0.5536, | |
| "eval_samples_per_second": 77.674, | |
| "eval_steps_per_second": 1.806, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02737226277372263, | |
| "grad_norm": 121.16007232666016, | |
| "learning_rate": 8.36e-07, | |
| "loss": 9.9573, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.02867570385818561, | |
| "grad_norm": 123.3974609375, | |
| "learning_rate": 8.760000000000001e-07, | |
| "loss": 9.3645, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.029979144942648594, | |
| "grad_norm": 149.9007110595703, | |
| "learning_rate": 9.160000000000001e-07, | |
| "loss": 9.1913, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.03128258602711158, | |
| "grad_norm": 142.5546875, | |
| "learning_rate": 9.56e-07, | |
| "loss": 8.1642, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.03258602711157456, | |
| "grad_norm": 111.52351379394531, | |
| "learning_rate": 9.96e-07, | |
| "loss": 8.1291, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.03388946819603754, | |
| "grad_norm": 112.73123931884766, | |
| "learning_rate": 1.0360000000000001e-06, | |
| "loss": 7.6783, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.03519290928050052, | |
| "grad_norm": 94.62492370605469, | |
| "learning_rate": 1.0760000000000002e-06, | |
| "loss": 7.333, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0364963503649635, | |
| "grad_norm": 105.54913330078125, | |
| "learning_rate": 1.1160000000000002e-06, | |
| "loss": 6.6041, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.03779979144942649, | |
| "grad_norm": 93.97553253173828, | |
| "learning_rate": 1.156e-06, | |
| "loss": 6.769, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.03910323253388947, | |
| "grad_norm": 78.89429473876953, | |
| "learning_rate": 1.196e-06, | |
| "loss": 6.3188, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03910323253388947, | |
| "eval/acc": 18.604650497436523, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03910323253388947, | |
| "eval_loss": 6.040415287017822, | |
| "eval_runtime": 0.5521, | |
| "eval_samples_per_second": 77.881, | |
| "eval_steps_per_second": 1.811, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04040667361835245, | |
| "grad_norm": 88.73673248291016, | |
| "learning_rate": 1.2360000000000001e-06, | |
| "loss": 5.643, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.04171011470281543, | |
| "grad_norm": 83.74315643310547, | |
| "learning_rate": 1.276e-06, | |
| "loss": 5.0575, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.04301355578727842, | |
| "grad_norm": 83.0094223022461, | |
| "learning_rate": 1.316e-06, | |
| "loss": 4.8596, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.0443169968717414, | |
| "grad_norm": 66.99898529052734, | |
| "learning_rate": 1.356e-06, | |
| "loss": 4.444, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.04562043795620438, | |
| "grad_norm": 61.338409423828125, | |
| "learning_rate": 1.396e-06, | |
| "loss": 4.1019, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.04692387904066736, | |
| "grad_norm": 49.183837890625, | |
| "learning_rate": 1.436e-06, | |
| "loss": 3.7076, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.04822732012513034, | |
| "grad_norm": 43.407833099365234, | |
| "learning_rate": 1.4760000000000001e-06, | |
| "loss": 3.4065, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.04953076120959333, | |
| "grad_norm": 36.92807388305664, | |
| "learning_rate": 1.5160000000000002e-06, | |
| "loss": 3.2919, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.05083420229405631, | |
| "grad_norm": 31.856853485107422, | |
| "learning_rate": 1.556e-06, | |
| "loss": 2.8133, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.05213764337851929, | |
| "grad_norm": 25.495525360107422, | |
| "learning_rate": 1.596e-06, | |
| "loss": 2.7088, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05213764337851929, | |
| "eval/acc": 16.279069900512695, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05213764337851929, | |
| "eval_loss": 3.6577463150024414, | |
| "eval_runtime": 0.5513, | |
| "eval_samples_per_second": 77.993, | |
| "eval_steps_per_second": 1.814, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.05344108446298227, | |
| "grad_norm": 24.631906509399414, | |
| "learning_rate": 1.636e-06, | |
| "loss": 2.4304, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.05474452554744526, | |
| "grad_norm": 21.909217834472656, | |
| "learning_rate": 1.6760000000000001e-06, | |
| "loss": 2.534, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.05604796663190824, | |
| "grad_norm": 22.260988235473633, | |
| "learning_rate": 1.7160000000000002e-06, | |
| "loss": 2.2622, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.05735140771637122, | |
| "grad_norm": 20.961124420166016, | |
| "learning_rate": 1.7560000000000002e-06, | |
| "loss": 2.203, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.0586548488008342, | |
| "grad_norm": 17.357723236083984, | |
| "learning_rate": 1.7960000000000003e-06, | |
| "loss": 2.0872, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.05995828988529719, | |
| "grad_norm": 23.459308624267578, | |
| "learning_rate": 1.8360000000000003e-06, | |
| "loss": 2.1486, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.06126173096976017, | |
| "grad_norm": 16.572664260864258, | |
| "learning_rate": 1.8760000000000001e-06, | |
| "loss": 1.8763, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.06256517205422316, | |
| "grad_norm": 16.683259963989258, | |
| "learning_rate": 1.916e-06, | |
| "loss": 1.9834, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.06386861313868614, | |
| "grad_norm": 17.584997177124023, | |
| "learning_rate": 1.956e-06, | |
| "loss": 1.9754, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.06517205422314912, | |
| "grad_norm": 16.946754455566406, | |
| "learning_rate": 1.996e-06, | |
| "loss": 1.84, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06517205422314912, | |
| "eval/acc": 30.23255729675293, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06517205422314912, | |
| "eval_loss": 3.3365631103515625, | |
| "eval_runtime": 0.5506, | |
| "eval_samples_per_second": 78.1, | |
| "eval_steps_per_second": 1.816, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.0664754953076121, | |
| "grad_norm": 16.998144149780273, | |
| "learning_rate": 2.036e-06, | |
| "loss": 1.951, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.06777893639207508, | |
| "grad_norm": 15.914703369140625, | |
| "learning_rate": 2.076e-06, | |
| "loss": 1.6538, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.06908237747653806, | |
| "grad_norm": 16.67144775390625, | |
| "learning_rate": 2.116e-06, | |
| "loss": 1.8369, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.07038581856100104, | |
| "grad_norm": 17.469003677368164, | |
| "learning_rate": 2.156e-06, | |
| "loss": 1.7525, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.07168925964546402, | |
| "grad_norm": 17.825891494750977, | |
| "learning_rate": 2.1960000000000002e-06, | |
| "loss": 1.741, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.072992700729927, | |
| "grad_norm": 16.591110229492188, | |
| "learning_rate": 2.2360000000000003e-06, | |
| "loss": 1.7004, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.07429614181439, | |
| "grad_norm": 17.972606658935547, | |
| "learning_rate": 2.2760000000000003e-06, | |
| "loss": 1.5859, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.07559958289885298, | |
| "grad_norm": 16.083576202392578, | |
| "learning_rate": 2.3160000000000004e-06, | |
| "loss": 1.6437, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.07690302398331596, | |
| "grad_norm": 18.013198852539062, | |
| "learning_rate": 2.3560000000000004e-06, | |
| "loss": 1.6496, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.07820646506777894, | |
| "grad_norm": 17.562707901000977, | |
| "learning_rate": 2.3960000000000004e-06, | |
| "loss": 1.5905, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07820646506777894, | |
| "eval/acc": 25.581396102905273, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07820646506777894, | |
| "eval_loss": 3.272217035293579, | |
| "eval_runtime": 0.5506, | |
| "eval_samples_per_second": 78.096, | |
| "eval_steps_per_second": 1.816, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.07950990615224192, | |
| "grad_norm": 16.752840042114258, | |
| "learning_rate": 2.4360000000000005e-06, | |
| "loss": 1.6781, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.0808133472367049, | |
| "grad_norm": 15.609387397766113, | |
| "learning_rate": 2.476e-06, | |
| "loss": 1.6322, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.08211678832116788, | |
| "grad_norm": 18.39044952392578, | |
| "learning_rate": 2.516e-06, | |
| "loss": 1.6085, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.08342022940563086, | |
| "grad_norm": 15.455676078796387, | |
| "learning_rate": 2.556e-06, | |
| "loss": 1.5879, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.08472367049009384, | |
| "grad_norm": 17.240724563598633, | |
| "learning_rate": 2.5960000000000002e-06, | |
| "loss": 1.6769, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.08602711157455684, | |
| "grad_norm": 15.329434394836426, | |
| "learning_rate": 2.6360000000000003e-06, | |
| "loss": 1.623, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.08733055265901982, | |
| "grad_norm": 15.390430450439453, | |
| "learning_rate": 2.6760000000000003e-06, | |
| "loss": 1.5269, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.0886339937434828, | |
| "grad_norm": 13.903982162475586, | |
| "learning_rate": 2.7160000000000003e-06, | |
| "loss": 1.5213, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.08993743482794578, | |
| "grad_norm": 15.723600387573242, | |
| "learning_rate": 2.7560000000000004e-06, | |
| "loss": 1.544, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.09124087591240876, | |
| "grad_norm": 16.601280212402344, | |
| "learning_rate": 2.7960000000000004e-06, | |
| "loss": 1.5074, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.09124087591240876, | |
| "eval/acc": 23.255813598632812, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.09124087591240876, | |
| "eval_loss": 3.2280378341674805, | |
| "eval_runtime": 0.5537, | |
| "eval_samples_per_second": 77.661, | |
| "eval_steps_per_second": 1.806, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.09254431699687174, | |
| "grad_norm": 15.303380966186523, | |
| "learning_rate": 2.8360000000000005e-06, | |
| "loss": 1.6147, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.09384775808133472, | |
| "grad_norm": 15.977986335754395, | |
| "learning_rate": 2.8760000000000005e-06, | |
| "loss": 1.4851, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.0951511991657977, | |
| "grad_norm": 15.908977508544922, | |
| "learning_rate": 2.9160000000000005e-06, | |
| "loss": 1.517, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.09645464025026068, | |
| "grad_norm": 14.383811950683594, | |
| "learning_rate": 2.956e-06, | |
| "loss": 1.5444, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.09775808133472368, | |
| "grad_norm": 12.663350105285645, | |
| "learning_rate": 2.996e-06, | |
| "loss": 1.5018, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.09906152241918666, | |
| "grad_norm": 15.087569236755371, | |
| "learning_rate": 3.0360000000000002e-06, | |
| "loss": 1.4602, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.10036496350364964, | |
| "grad_norm": 13.563980102539062, | |
| "learning_rate": 3.0760000000000003e-06, | |
| "loss": 1.3855, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.10166840458811262, | |
| "grad_norm": 13.872782707214355, | |
| "learning_rate": 3.1160000000000003e-06, | |
| "loss": 1.508, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.1029718456725756, | |
| "grad_norm": 12.23460865020752, | |
| "learning_rate": 3.1560000000000004e-06, | |
| "loss": 1.3865, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.10427528675703858, | |
| "grad_norm": 15.744820594787598, | |
| "learning_rate": 3.1960000000000004e-06, | |
| "loss": 1.5776, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10427528675703858, | |
| "eval/acc": 23.255813598632812, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10427528675703858, | |
| "eval_loss": 3.1086668968200684, | |
| "eval_runtime": 0.5521, | |
| "eval_samples_per_second": 77.884, | |
| "eval_steps_per_second": 1.811, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.10557872784150156, | |
| "grad_norm": 12.964938163757324, | |
| "learning_rate": 3.2360000000000004e-06, | |
| "loss": 1.4783, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.10688216892596454, | |
| "grad_norm": 16.409147262573242, | |
| "learning_rate": 3.2760000000000005e-06, | |
| "loss": 1.3763, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.10818561001042754, | |
| "grad_norm": 13.46617317199707, | |
| "learning_rate": 3.3160000000000005e-06, | |
| "loss": 1.4161, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.10948905109489052, | |
| "grad_norm": 14.7039213180542, | |
| "learning_rate": 3.3560000000000006e-06, | |
| "loss": 1.5434, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.1107924921793535, | |
| "grad_norm": 14.37901782989502, | |
| "learning_rate": 3.3960000000000006e-06, | |
| "loss": 1.4212, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.11209593326381648, | |
| "grad_norm": 13.210816383361816, | |
| "learning_rate": 3.4360000000000006e-06, | |
| "loss": 1.4053, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.11339937434827946, | |
| "grad_norm": 13.743114471435547, | |
| "learning_rate": 3.4760000000000007e-06, | |
| "loss": 1.4231, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.11470281543274244, | |
| "grad_norm": 12.634490013122559, | |
| "learning_rate": 3.5160000000000007e-06, | |
| "loss": 1.3584, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.11600625651720542, | |
| "grad_norm": 15.65221881866455, | |
| "learning_rate": 3.5560000000000008e-06, | |
| "loss": 1.399, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.1173096976016684, | |
| "grad_norm": 14.89765453338623, | |
| "learning_rate": 3.596e-06, | |
| "loss": 1.3935, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1173096976016684, | |
| "eval/acc": 23.255813598632812, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1173096976016684, | |
| "eval_loss": 3.096344232559204, | |
| "eval_runtime": 0.5513, | |
| "eval_samples_per_second": 77.992, | |
| "eval_steps_per_second": 1.814, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.11861313868613138, | |
| "grad_norm": 14.929734230041504, | |
| "learning_rate": 3.636e-06, | |
| "loss": 1.4005, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.11991657977059438, | |
| "grad_norm": 12.793665885925293, | |
| "learning_rate": 3.676e-06, | |
| "loss": 1.4152, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.12122002085505736, | |
| "grad_norm": 13.772797584533691, | |
| "learning_rate": 3.716e-06, | |
| "loss": 1.3823, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.12252346193952034, | |
| "grad_norm": 11.430520057678223, | |
| "learning_rate": 3.756e-06, | |
| "loss": 1.3623, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.12382690302398332, | |
| "grad_norm": 13.903288841247559, | |
| "learning_rate": 3.796e-06, | |
| "loss": 1.3491, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.1251303441084463, | |
| "grad_norm": 14.225196838378906, | |
| "learning_rate": 3.836e-06, | |
| "loss": 1.3605, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.1264337851929093, | |
| "grad_norm": 13.653999328613281, | |
| "learning_rate": 3.876000000000001e-06, | |
| "loss": 1.4258, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.12773722627737227, | |
| "grad_norm": 12.619461059570312, | |
| "learning_rate": 3.916e-06, | |
| "loss": 1.2765, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.12904066736183525, | |
| "grad_norm": 12.887979507446289, | |
| "learning_rate": 3.956000000000001e-06, | |
| "loss": 1.3446, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.13034410844629823, | |
| "grad_norm": 13.362163543701172, | |
| "learning_rate": 3.996e-06, | |
| "loss": 1.4322, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13034410844629823, | |
| "eval/acc": 23.255813598632812, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13034410844629823, | |
| "eval_loss": 3.0436527729034424, | |
| "eval_runtime": 0.5774, | |
| "eval_samples_per_second": 74.467, | |
| "eval_steps_per_second": 1.732, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13164754953076122, | |
| "grad_norm": 13.34825611114502, | |
| "learning_rate": 4.036000000000001e-06, | |
| "loss": 1.3434, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.1329509906152242, | |
| "grad_norm": 12.807318687438965, | |
| "learning_rate": 4.0760000000000004e-06, | |
| "loss": 1.3971, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.13425443169968718, | |
| "grad_norm": 10.88805103302002, | |
| "learning_rate": 4.116000000000001e-06, | |
| "loss": 1.3324, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.13555787278415016, | |
| "grad_norm": 12.1721830368042, | |
| "learning_rate": 4.1560000000000005e-06, | |
| "loss": 1.3454, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.13686131386861314, | |
| "grad_norm": 16.927200317382812, | |
| "learning_rate": 4.196e-06, | |
| "loss": 1.3912, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.13816475495307612, | |
| "grad_norm": 11.07986068725586, | |
| "learning_rate": 4.236e-06, | |
| "loss": 1.2858, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.1394681960375391, | |
| "grad_norm": 13.776060104370117, | |
| "learning_rate": 4.276e-06, | |
| "loss": 1.4019, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.14077163712200208, | |
| "grad_norm": 13.49791145324707, | |
| "learning_rate": 4.316e-06, | |
| "loss": 1.4221, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.14207507820646506, | |
| "grad_norm": 11.622773170471191, | |
| "learning_rate": 4.356e-06, | |
| "loss": 1.3731, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.14337851929092804, | |
| "grad_norm": 13.743051528930664, | |
| "learning_rate": 4.396e-06, | |
| "loss": 1.3511, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.14337851929092804, | |
| "eval/acc": 25.581396102905273, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.14337851929092804, | |
| "eval_loss": 3.016920804977417, | |
| "eval_runtime": 0.5524, | |
| "eval_samples_per_second": 77.847, | |
| "eval_steps_per_second": 1.81, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.14468196037539102, | |
| "grad_norm": 14.456645011901855, | |
| "learning_rate": 4.436e-06, | |
| "loss": 1.3994, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.145985401459854, | |
| "grad_norm": 12.78945255279541, | |
| "learning_rate": 4.476e-06, | |
| "loss": 1.3486, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.14728884254431698, | |
| "grad_norm": 12.899959564208984, | |
| "learning_rate": 4.5160000000000005e-06, | |
| "loss": 1.3342, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.14859228362878, | |
| "grad_norm": 12.025766372680664, | |
| "learning_rate": 4.556e-06, | |
| "loss": 1.3066, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.14989572471324297, | |
| "grad_norm": 11.712949752807617, | |
| "learning_rate": 4.5960000000000006e-06, | |
| "loss": 1.4095, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.15119916579770595, | |
| "grad_norm": 14.212655067443848, | |
| "learning_rate": 4.636e-06, | |
| "loss": 1.2781, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.15250260688216893, | |
| "grad_norm": 13.639365196228027, | |
| "learning_rate": 4.676000000000001e-06, | |
| "loss": 1.3783, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.15380604796663191, | |
| "grad_norm": 11.413806915283203, | |
| "learning_rate": 4.716e-06, | |
| "loss": 1.2901, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.1551094890510949, | |
| "grad_norm": 11.520100593566895, | |
| "learning_rate": 4.756000000000001e-06, | |
| "loss": 1.3142, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.15641293013555788, | |
| "grad_norm": 13.1220064163208, | |
| "learning_rate": 4.796e-06, | |
| "loss": 1.3254, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15641293013555788, | |
| "eval/acc": 27.9069766998291, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15641293013555788, | |
| "eval_loss": 2.8769519329071045, | |
| "eval_runtime": 0.5522, | |
| "eval_samples_per_second": 77.864, | |
| "eval_steps_per_second": 1.811, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.15771637122002086, | |
| "grad_norm": 11.807994842529297, | |
| "learning_rate": 4.836e-06, | |
| "loss": 1.339, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.15901981230448384, | |
| "grad_norm": 11.208297729492188, | |
| "learning_rate": 4.876e-06, | |
| "loss": 1.1896, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.16032325338894682, | |
| "grad_norm": 13.063114166259766, | |
| "learning_rate": 4.916e-06, | |
| "loss": 1.2701, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.1616266944734098, | |
| "grad_norm": 11.611763000488281, | |
| "learning_rate": 4.9560000000000005e-06, | |
| "loss": 1.3212, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.16293013555787278, | |
| "grad_norm": 10.884580612182617, | |
| "learning_rate": 4.996e-06, | |
| "loss": 1.227, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.16423357664233576, | |
| "grad_norm": 11.97398567199707, | |
| "learning_rate": 5.0360000000000006e-06, | |
| "loss": 1.3075, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.16553701772679874, | |
| "grad_norm": 13.973258972167969, | |
| "learning_rate": 5.076000000000001e-06, | |
| "loss": 1.2388, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.16684045881126172, | |
| "grad_norm": 13.00340461730957, | |
| "learning_rate": 5.116000000000001e-06, | |
| "loss": 1.3462, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.1681438998957247, | |
| "grad_norm": 11.750258445739746, | |
| "learning_rate": 5.156e-06, | |
| "loss": 1.2093, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.16944734098018768, | |
| "grad_norm": 12.117288589477539, | |
| "learning_rate": 5.196e-06, | |
| "loss": 1.223, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.16944734098018768, | |
| "eval/acc": 32.55813980102539, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.16944734098018768, | |
| "eval_loss": 2.936992645263672, | |
| "eval_runtime": 0.5526, | |
| "eval_samples_per_second": 77.814, | |
| "eval_steps_per_second": 1.81, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.1707507820646507, | |
| "grad_norm": 12.747390747070312, | |
| "learning_rate": 5.236e-06, | |
| "loss": 1.2956, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.17205422314911367, | |
| "grad_norm": 10.593498229980469, | |
| "learning_rate": 5.276e-06, | |
| "loss": 1.1996, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.17335766423357665, | |
| "grad_norm": 11.945181846618652, | |
| "learning_rate": 5.3160000000000004e-06, | |
| "loss": 1.33, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.17466110531803963, | |
| "grad_norm": 12.65109634399414, | |
| "learning_rate": 5.356e-06, | |
| "loss": 1.2295, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.17596454640250261, | |
| "grad_norm": 11.467466354370117, | |
| "learning_rate": 5.3960000000000005e-06, | |
| "loss": 1.2227, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.1772679874869656, | |
| "grad_norm": 12.898762702941895, | |
| "learning_rate": 5.436e-06, | |
| "loss": 1.2573, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.17857142857142858, | |
| "grad_norm": 11.188071250915527, | |
| "learning_rate": 5.476000000000001e-06, | |
| "loss": 1.3103, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.17987486965589156, | |
| "grad_norm": 12.179079055786133, | |
| "learning_rate": 5.516e-06, | |
| "loss": 1.259, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.18117831074035454, | |
| "grad_norm": 12.672003746032715, | |
| "learning_rate": 5.556000000000001e-06, | |
| "loss": 1.1756, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.18248175182481752, | |
| "grad_norm": 11.671830177307129, | |
| "learning_rate": 5.596e-06, | |
| "loss": 1.2905, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.18248175182481752, | |
| "eval/acc": 27.9069766998291, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.18248175182481752, | |
| "eval_loss": 2.8736231327056885, | |
| "eval_runtime": 0.5533, | |
| "eval_samples_per_second": 77.719, | |
| "eval_steps_per_second": 1.807, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.1837851929092805, | |
| "grad_norm": 12.279439926147461, | |
| "learning_rate": 5.636000000000001e-06, | |
| "loss": 1.2422, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.18508863399374348, | |
| "grad_norm": 12.59632396697998, | |
| "learning_rate": 5.676e-06, | |
| "loss": 1.1998, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.18639207507820646, | |
| "grad_norm": 10.290858268737793, | |
| "learning_rate": 5.716000000000001e-06, | |
| "loss": 1.3073, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.18769551616266944, | |
| "grad_norm": 11.450456619262695, | |
| "learning_rate": 5.7560000000000005e-06, | |
| "loss": 1.2917, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.18899895724713242, | |
| "grad_norm": 10.898682594299316, | |
| "learning_rate": 5.796000000000001e-06, | |
| "loss": 1.1988, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1903023983315954, | |
| "grad_norm": 11.755196571350098, | |
| "learning_rate": 5.8360000000000005e-06, | |
| "loss": 1.4151, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.19160583941605838, | |
| "grad_norm": 10.369739532470703, | |
| "learning_rate": 5.876000000000001e-06, | |
| "loss": 1.2748, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.19290928050052136, | |
| "grad_norm": 10.368874549865723, | |
| "learning_rate": 5.916000000000001e-06, | |
| "loss": 1.2456, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.19421272158498437, | |
| "grad_norm": 10.07337474822998, | |
| "learning_rate": 5.956000000000001e-06, | |
| "loss": 1.1918, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.19551616266944735, | |
| "grad_norm": 12.127270698547363, | |
| "learning_rate": 5.996000000000001e-06, | |
| "loss": 1.1726, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.19551616266944735, | |
| "eval/acc": 32.55813980102539, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.19551616266944735, | |
| "eval_loss": 2.9003522396087646, | |
| "eval_runtime": 0.5544, | |
| "eval_samples_per_second": 77.556, | |
| "eval_steps_per_second": 1.804, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.19681960375391033, | |
| "grad_norm": 12.454967498779297, | |
| "learning_rate": 6.036000000000001e-06, | |
| "loss": 1.3509, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.1981230448383733, | |
| "grad_norm": 11.402469635009766, | |
| "learning_rate": 6.076000000000001e-06, | |
| "loss": 1.2365, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.1994264859228363, | |
| "grad_norm": 12.890278816223145, | |
| "learning_rate": 6.116000000000001e-06, | |
| "loss": 1.2295, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.20072992700729927, | |
| "grad_norm": 12.542150497436523, | |
| "learning_rate": 6.156000000000001e-06, | |
| "loss": 1.2789, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.20203336809176226, | |
| "grad_norm": 10.868870735168457, | |
| "learning_rate": 6.196000000000001e-06, | |
| "loss": 1.232, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.20333680917622524, | |
| "grad_norm": 12.972379684448242, | |
| "learning_rate": 6.236000000000001e-06, | |
| "loss": 1.2111, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.20464025026068822, | |
| "grad_norm": 14.00292682647705, | |
| "learning_rate": 6.2760000000000006e-06, | |
| "loss": 1.1551, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.2059436913451512, | |
| "grad_norm": 11.713733673095703, | |
| "learning_rate": 6.316000000000001e-06, | |
| "loss": 1.2256, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.20724713242961418, | |
| "grad_norm": 11.81581974029541, | |
| "learning_rate": 6.356000000000001e-06, | |
| "loss": 1.2375, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.20855057351407716, | |
| "grad_norm": 9.595722198486328, | |
| "learning_rate": 6.396e-06, | |
| "loss": 1.2307, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20855057351407716, | |
| "eval/acc": 34.88372039794922, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20855057351407716, | |
| "eval_loss": 2.89196515083313, | |
| "eval_runtime": 0.5526, | |
| "eval_samples_per_second": 77.814, | |
| "eval_steps_per_second": 1.81, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.20985401459854014, | |
| "grad_norm": 14.193363189697266, | |
| "learning_rate": 6.436e-06, | |
| "loss": 1.2503, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.21115745568300312, | |
| "grad_norm": 10.671473503112793, | |
| "learning_rate": 6.476e-06, | |
| "loss": 1.1697, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.2124608967674661, | |
| "grad_norm": 12.921130180358887, | |
| "learning_rate": 6.516e-06, | |
| "loss": 1.1124, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.21376433785192908, | |
| "grad_norm": 12.321484565734863, | |
| "learning_rate": 6.556e-06, | |
| "loss": 1.2272, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.21506777893639206, | |
| "grad_norm": 13.49770450592041, | |
| "learning_rate": 6.596e-06, | |
| "loss": 1.2014, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.21637122002085507, | |
| "grad_norm": 10.752897262573242, | |
| "learning_rate": 6.6360000000000005e-06, | |
| "loss": 1.174, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.21767466110531805, | |
| "grad_norm": 12.024086952209473, | |
| "learning_rate": 6.676e-06, | |
| "loss": 1.1625, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.21897810218978103, | |
| "grad_norm": 12.498847961425781, | |
| "learning_rate": 6.716000000000001e-06, | |
| "loss": 1.1683, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.220281543274244, | |
| "grad_norm": 11.955095291137695, | |
| "learning_rate": 6.756e-06, | |
| "loss": 1.2441, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.221584984358707, | |
| "grad_norm": 10.969300270080566, | |
| "learning_rate": 6.796000000000001e-06, | |
| "loss": 1.2547, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.221584984358707, | |
| "eval/acc": 32.55813980102539, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.221584984358707, | |
| "eval_loss": 2.735595464706421, | |
| "eval_runtime": 0.5549, | |
| "eval_samples_per_second": 77.488, | |
| "eval_steps_per_second": 1.802, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.22288842544316997, | |
| "grad_norm": 10.412980079650879, | |
| "learning_rate": 6.836e-06, | |
| "loss": 1.2473, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.22419186652763295, | |
| "grad_norm": 12.962031364440918, | |
| "learning_rate": 6.876000000000001e-06, | |
| "loss": 1.1759, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.22549530761209594, | |
| "grad_norm": 10.370616912841797, | |
| "learning_rate": 6.916e-06, | |
| "loss": 1.1444, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.22679874869655892, | |
| "grad_norm": 12.069488525390625, | |
| "learning_rate": 6.956000000000001e-06, | |
| "loss": 1.2177, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.2281021897810219, | |
| "grad_norm": 11.831305503845215, | |
| "learning_rate": 6.9960000000000004e-06, | |
| "loss": 1.2315, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.22940563086548488, | |
| "grad_norm": 10.261811256408691, | |
| "learning_rate": 7.036000000000001e-06, | |
| "loss": 1.1478, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.23070907194994786, | |
| "grad_norm": 10.814574241638184, | |
| "learning_rate": 7.0760000000000005e-06, | |
| "loss": 1.221, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.23201251303441084, | |
| "grad_norm": 11.184773445129395, | |
| "learning_rate": 7.116000000000001e-06, | |
| "loss": 1.2984, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.23331595411887382, | |
| "grad_norm": 11.853842735290527, | |
| "learning_rate": 7.156000000000001e-06, | |
| "loss": 1.2325, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.2346193952033368, | |
| "grad_norm": 10.178322792053223, | |
| "learning_rate": 7.196000000000001e-06, | |
| "loss": 1.1664, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2346193952033368, | |
| "eval/acc": 34.88372039794922, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.2346193952033368, | |
| "eval_loss": 2.893901824951172, | |
| "eval_runtime": 0.5541, | |
| "eval_samples_per_second": 77.605, | |
| "eval_steps_per_second": 1.805, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.23592283628779978, | |
| "grad_norm": 11.30508804321289, | |
| "learning_rate": 7.236000000000001e-06, | |
| "loss": 1.2602, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.23722627737226276, | |
| "grad_norm": 11.195526123046875, | |
| "learning_rate": 7.276000000000001e-06, | |
| "loss": 1.1529, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.23852971845672574, | |
| "grad_norm": 11.082310676574707, | |
| "learning_rate": 7.316000000000001e-06, | |
| "loss": 1.2024, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.23983315954118875, | |
| "grad_norm": 10.974154472351074, | |
| "learning_rate": 7.356000000000001e-06, | |
| "loss": 1.1329, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.24113660062565173, | |
| "grad_norm": 10.155501365661621, | |
| "learning_rate": 7.396000000000001e-06, | |
| "loss": 1.1555, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2424400417101147, | |
| "grad_norm": 10.690115928649902, | |
| "learning_rate": 7.436000000000001e-06, | |
| "loss": 1.1916, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.2437434827945777, | |
| "grad_norm": 11.77647876739502, | |
| "learning_rate": 7.476000000000001e-06, | |
| "loss": 1.0674, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.24504692387904067, | |
| "grad_norm": 13.536336898803711, | |
| "learning_rate": 7.516000000000001e-06, | |
| "loss": 1.2325, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.24635036496350365, | |
| "grad_norm": 10.988912582397461, | |
| "learning_rate": 7.556000000000001e-06, | |
| "loss": 1.1597, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.24765380604796663, | |
| "grad_norm": 11.346904754638672, | |
| "learning_rate": 7.5960000000000015e-06, | |
| "loss": 1.1883, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.24765380604796663, | |
| "eval/acc": 34.88372039794922, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.24765380604796663, | |
| "eval_loss": 2.8580784797668457, | |
| "eval_runtime": 0.5538, | |
| "eval_samples_per_second": 77.645, | |
| "eval_steps_per_second": 1.806, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.24895724713242962, | |
| "grad_norm": 10.486469268798828, | |
| "learning_rate": 7.636e-06, | |
| "loss": 1.1989, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.2502606882168926, | |
| "grad_norm": 11.191844940185547, | |
| "learning_rate": 7.676e-06, | |
| "loss": 1.1934, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.2515641293013556, | |
| "grad_norm": 12.818986892700195, | |
| "learning_rate": 7.716e-06, | |
| "loss": 1.1856, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.2528675703858186, | |
| "grad_norm": 9.980338096618652, | |
| "learning_rate": 7.756e-06, | |
| "loss": 1.1685, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.25417101147028154, | |
| "grad_norm": 10.59505558013916, | |
| "learning_rate": 7.796e-06, | |
| "loss": 1.0932, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.25547445255474455, | |
| "grad_norm": 10.21989631652832, | |
| "learning_rate": 7.836000000000001e-06, | |
| "loss": 1.2254, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.2567778936392075, | |
| "grad_norm": 9.082103729248047, | |
| "learning_rate": 7.876e-06, | |
| "loss": 1.1439, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.2580813347236705, | |
| "grad_norm": 10.54208755493164, | |
| "learning_rate": 7.916e-06, | |
| "loss": 1.2031, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.25938477580813346, | |
| "grad_norm": 11.807458877563477, | |
| "learning_rate": 7.956e-06, | |
| "loss": 1.1575, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.26068821689259647, | |
| "grad_norm": 11.20957088470459, | |
| "learning_rate": 7.996000000000001e-06, | |
| "loss": 1.152, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.26068821689259647, | |
| "eval/acc": 34.88372039794922, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.26068821689259647, | |
| "eval_loss": 2.9252498149871826, | |
| "eval_runtime": 0.5546, | |
| "eval_samples_per_second": 77.535, | |
| "eval_steps_per_second": 1.803, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2619916579770594, | |
| "grad_norm": 9.236865997314453, | |
| "learning_rate": 8.036e-06, | |
| "loss": 1.1518, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.26329509906152243, | |
| "grad_norm": 10.173084259033203, | |
| "learning_rate": 8.076e-06, | |
| "loss": 1.1738, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.2645985401459854, | |
| "grad_norm": 11.158531188964844, | |
| "learning_rate": 8.116e-06, | |
| "loss": 1.1942, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.2659019812304484, | |
| "grad_norm": 10.654205322265625, | |
| "learning_rate": 8.156000000000001e-06, | |
| "loss": 1.1965, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.26720542231491134, | |
| "grad_norm": 10.954093933105469, | |
| "learning_rate": 8.196e-06, | |
| "loss": 1.0949, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.26850886339937435, | |
| "grad_norm": 10.480634689331055, | |
| "learning_rate": 8.236e-06, | |
| "loss": 1.2128, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.2698123044838373, | |
| "grad_norm": 9.64358901977539, | |
| "learning_rate": 8.276e-06, | |
| "loss": 1.1713, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.2711157455683003, | |
| "grad_norm": 9.68060302734375, | |
| "learning_rate": 8.316000000000001e-06, | |
| "loss": 1.1275, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.27241918665276327, | |
| "grad_norm": 10.211024284362793, | |
| "learning_rate": 8.356000000000001e-06, | |
| "loss": 1.2368, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.2737226277372263, | |
| "grad_norm": 10.19279670715332, | |
| "learning_rate": 8.396e-06, | |
| "loss": 1.1649, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.2737226277372263, | |
| "eval/acc": 37.20930099487305, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.2737226277372263, | |
| "eval_loss": 2.894489288330078, | |
| "eval_runtime": 0.5551, | |
| "eval_samples_per_second": 77.469, | |
| "eval_steps_per_second": 1.802, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.2750260688216893, | |
| "grad_norm": 11.496298789978027, | |
| "learning_rate": 8.436e-06, | |
| "loss": 1.1936, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.27632950990615224, | |
| "grad_norm": 10.342120170593262, | |
| "learning_rate": 8.476000000000002e-06, | |
| "loss": 1.2169, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.27763295099061525, | |
| "grad_norm": 10.583955764770508, | |
| "learning_rate": 8.516000000000001e-06, | |
| "loss": 1.1169, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.2789363920750782, | |
| "grad_norm": 10.484763145446777, | |
| "learning_rate": 8.556e-06, | |
| "loss": 1.1492, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.2802398331595412, | |
| "grad_norm": 10.46810245513916, | |
| "learning_rate": 8.596e-06, | |
| "loss": 1.2559, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.28154327424400416, | |
| "grad_norm": 10.129209518432617, | |
| "learning_rate": 8.636000000000002e-06, | |
| "loss": 1.0982, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.28284671532846717, | |
| "grad_norm": 9.844231605529785, | |
| "learning_rate": 8.676000000000001e-06, | |
| "loss": 1.1378, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.2841501564129301, | |
| "grad_norm": 11.35154914855957, | |
| "learning_rate": 8.716000000000001e-06, | |
| "loss": 1.2192, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.28545359749739313, | |
| "grad_norm": 11.104358673095703, | |
| "learning_rate": 8.756e-06, | |
| "loss": 1.1804, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.2867570385818561, | |
| "grad_norm": 8.416515350341797, | |
| "learning_rate": 8.796000000000002e-06, | |
| "loss": 1.135, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.2867570385818561, | |
| "eval/acc": 32.55813980102539, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.2867570385818561, | |
| "eval_loss": 2.850806713104248, | |
| "eval_runtime": 0.5533, | |
| "eval_samples_per_second": 77.714, | |
| "eval_steps_per_second": 1.807, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.2880604796663191, | |
| "grad_norm": 11.20003890991211, | |
| "learning_rate": 8.836000000000001e-06, | |
| "loss": 1.1998, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.28936392075078204, | |
| "grad_norm": 12.205933570861816, | |
| "learning_rate": 8.876e-06, | |
| "loss": 1.1331, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.29066736183524505, | |
| "grad_norm": 9.875853538513184, | |
| "learning_rate": 8.916e-06, | |
| "loss": 1.0744, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.291970802919708, | |
| "grad_norm": 11.795681953430176, | |
| "learning_rate": 8.956e-06, | |
| "loss": 1.1378, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.293274244004171, | |
| "grad_norm": 9.370049476623535, | |
| "learning_rate": 8.996e-06, | |
| "loss": 1.0586, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.29457768508863397, | |
| "grad_norm": 10.6432466506958, | |
| "learning_rate": 9.036e-06, | |
| "loss": 1.1936, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.295881126173097, | |
| "grad_norm": 10.588776588439941, | |
| "learning_rate": 9.076000000000001e-06, | |
| "loss": 1.0813, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.29718456725756, | |
| "grad_norm": 10.122645378112793, | |
| "learning_rate": 9.116e-06, | |
| "loss": 1.2067, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.29848800834202294, | |
| "grad_norm": 9.388029098510742, | |
| "learning_rate": 9.156e-06, | |
| "loss": 1.1546, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.29979144942648595, | |
| "grad_norm": 9.928315162658691, | |
| "learning_rate": 9.196e-06, | |
| "loss": 1.1134, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.29979144942648595, | |
| "eval/acc": 41.86046600341797, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.29979144942648595, | |
| "eval_loss": 2.8716952800750732, | |
| "eval_runtime": 0.554, | |
| "eval_samples_per_second": 77.619, | |
| "eval_steps_per_second": 1.805, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.3010948905109489, | |
| "grad_norm": 10.03122615814209, | |
| "learning_rate": 9.236000000000001e-06, | |
| "loss": 1.3212, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.3023983315954119, | |
| "grad_norm": 10.512228012084961, | |
| "learning_rate": 9.276e-06, | |
| "loss": 1.1369, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.30370177267987486, | |
| "grad_norm": 10.605701446533203, | |
| "learning_rate": 9.316e-06, | |
| "loss": 1.2389, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.30500521376433787, | |
| "grad_norm": 11.414910316467285, | |
| "learning_rate": 9.356e-06, | |
| "loss": 1.1003, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.3063086548488008, | |
| "grad_norm": 9.643972396850586, | |
| "learning_rate": 9.396000000000001e-06, | |
| "loss": 1.1028, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.30761209593326383, | |
| "grad_norm": 11.462910652160645, | |
| "learning_rate": 9.436e-06, | |
| "loss": 1.1437, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.3089155370177268, | |
| "grad_norm": 10.556984901428223, | |
| "learning_rate": 9.476e-06, | |
| "loss": 1.1802, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.3102189781021898, | |
| "grad_norm": 11.555737495422363, | |
| "learning_rate": 9.516e-06, | |
| "loss": 1.1376, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.31152241918665274, | |
| "grad_norm": 9.358216285705566, | |
| "learning_rate": 9.556000000000001e-06, | |
| "loss": 1.0511, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.31282586027111575, | |
| "grad_norm": 9.375101089477539, | |
| "learning_rate": 9.596000000000001e-06, | |
| "loss": 1.0519, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.31282586027111575, | |
| "eval/acc": 32.55813980102539, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.31282586027111575, | |
| "eval_loss": 2.992863655090332, | |
| "eval_runtime": 0.5533, | |
| "eval_samples_per_second": 77.716, | |
| "eval_steps_per_second": 1.807, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3141293013555787, | |
| "grad_norm": 10.487874984741211, | |
| "learning_rate": 9.636e-06, | |
| "loss": 1.0795, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.3154327424400417, | |
| "grad_norm": 14.056046485900879, | |
| "learning_rate": 9.676e-06, | |
| "loss": 1.1586, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.31673618352450467, | |
| "grad_norm": 10.049337387084961, | |
| "learning_rate": 9.716000000000002e-06, | |
| "loss": 1.1348, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.3180396246089677, | |
| "grad_norm": 11.084318161010742, | |
| "learning_rate": 9.756000000000001e-06, | |
| "loss": 1.056, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.3193430656934307, | |
| "grad_norm": 10.96147346496582, | |
| "learning_rate": 9.796e-06, | |
| "loss": 1.1561, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.32064650677789364, | |
| "grad_norm": 9.765122413635254, | |
| "learning_rate": 9.836e-06, | |
| "loss": 1.1844, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.32194994786235664, | |
| "grad_norm": 12.490370750427246, | |
| "learning_rate": 9.876000000000002e-06, | |
| "loss": 1.1342, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.3232533889468196, | |
| "grad_norm": 9.971538543701172, | |
| "learning_rate": 9.916000000000001e-06, | |
| "loss": 1.0907, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.3245568300312826, | |
| "grad_norm": 11.306795120239258, | |
| "learning_rate": 9.956000000000001e-06, | |
| "loss": 1.1521, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.32586027111574556, | |
| "grad_norm": 10.270991325378418, | |
| "learning_rate": 9.996e-06, | |
| "loss": 1.1473, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.32586027111574556, | |
| "eval/acc": 37.20930099487305, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.32586027111574556, | |
| "eval_loss": 2.875680446624756, | |
| "eval_runtime": 0.554, | |
| "eval_samples_per_second": 77.622, | |
| "eval_steps_per_second": 1.805, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.32716371220020857, | |
| "grad_norm": 10.23509407043457, | |
| "learning_rate": 9.996e-06, | |
| "loss": 1.216, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.3284671532846715, | |
| "grad_norm": 12.120686531066895, | |
| "learning_rate": 9.991555555555557e-06, | |
| "loss": 1.0669, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.32977059436913453, | |
| "grad_norm": 11.20948314666748, | |
| "learning_rate": 9.987111111111112e-06, | |
| "loss": 1.0889, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.3310740354535975, | |
| "grad_norm": 11.085042953491211, | |
| "learning_rate": 9.982666666666667e-06, | |
| "loss": 1.212, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.3323774765380605, | |
| "grad_norm": 11.783760070800781, | |
| "learning_rate": 9.978222222222223e-06, | |
| "loss": 1.2059, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.33368091762252344, | |
| "grad_norm": 11.339371681213379, | |
| "learning_rate": 9.973777777777778e-06, | |
| "loss": 1.1027, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.33498435870698645, | |
| "grad_norm": 10.946623802185059, | |
| "learning_rate": 9.969333333333335e-06, | |
| "loss": 1.1663, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.3362877997914494, | |
| "grad_norm": 9.797304153442383, | |
| "learning_rate": 9.96488888888889e-06, | |
| "loss": 1.147, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.3375912408759124, | |
| "grad_norm": 10.305734634399414, | |
| "learning_rate": 9.960444444444444e-06, | |
| "loss": 1.2113, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.33889468196037537, | |
| "grad_norm": 9.742680549621582, | |
| "learning_rate": 9.956000000000001e-06, | |
| "loss": 1.1096, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.33889468196037537, | |
| "eval/acc": 32.55813980102539, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.33889468196037537, | |
| "eval_loss": 2.838628053665161, | |
| "eval_runtime": 0.5541, | |
| "eval_samples_per_second": 77.604, | |
| "eval_steps_per_second": 1.805, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.3401981230448384, | |
| "grad_norm": 11.681222915649414, | |
| "learning_rate": 9.951555555555556e-06, | |
| "loss": 1.1573, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.3415015641293014, | |
| "grad_norm": 10.580199241638184, | |
| "learning_rate": 9.947111111111112e-06, | |
| "loss": 1.1942, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.34280500521376434, | |
| "grad_norm": 9.525206565856934, | |
| "learning_rate": 9.942666666666667e-06, | |
| "loss": 1.139, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.34410844629822734, | |
| "grad_norm": 11.521892547607422, | |
| "learning_rate": 9.938222222222224e-06, | |
| "loss": 1.2106, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.3454118873826903, | |
| "grad_norm": 10.282144546508789, | |
| "learning_rate": 9.933777777777779e-06, | |
| "loss": 1.068, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3467153284671533, | |
| "grad_norm": 10.942089080810547, | |
| "learning_rate": 9.929333333333333e-06, | |
| "loss": 1.0709, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.34801876955161626, | |
| "grad_norm": 12.269514083862305, | |
| "learning_rate": 9.92488888888889e-06, | |
| "loss": 1.073, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.34932221063607927, | |
| "grad_norm": 10.467517852783203, | |
| "learning_rate": 9.920444444444445e-06, | |
| "loss": 1.1178, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.3506256517205422, | |
| "grad_norm": 11.05263900756836, | |
| "learning_rate": 9.916000000000001e-06, | |
| "loss": 1.1126, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.35192909280500523, | |
| "grad_norm": 10.848026275634766, | |
| "learning_rate": 9.911555555555556e-06, | |
| "loss": 1.1425, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.35192909280500523, | |
| "eval/acc": 32.55813980102539, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.35192909280500523, | |
| "eval_loss": 2.8443257808685303, | |
| "eval_runtime": 0.5552, | |
| "eval_samples_per_second": 77.455, | |
| "eval_steps_per_second": 1.801, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.3532325338894682, | |
| "grad_norm": 9.627706527709961, | |
| "learning_rate": 9.907111111111111e-06, | |
| "loss": 1.0892, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.3545359749739312, | |
| "grad_norm": 9.48183822631836, | |
| "learning_rate": 9.902666666666668e-06, | |
| "loss": 1.1611, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.35583941605839414, | |
| "grad_norm": 10.31680965423584, | |
| "learning_rate": 9.898222222222224e-06, | |
| "loss": 1.1474, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 9.613831520080566, | |
| "learning_rate": 9.893777777777779e-06, | |
| "loss": 1.0592, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.3584462982273201, | |
| "grad_norm": 14.002620697021484, | |
| "learning_rate": 9.889333333333334e-06, | |
| "loss": 1.1399, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.3597497393117831, | |
| "grad_norm": 9.574627876281738, | |
| "learning_rate": 9.884888888888889e-06, | |
| "loss": 1.0702, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.36105318039624607, | |
| "grad_norm": 11.370795249938965, | |
| "learning_rate": 9.880444444444445e-06, | |
| "loss": 1.1089, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.3623566214807091, | |
| "grad_norm": 11.44530963897705, | |
| "learning_rate": 9.876000000000002e-06, | |
| "loss": 1.0896, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.3636600625651721, | |
| "grad_norm": 10.26310920715332, | |
| "learning_rate": 9.871555555555557e-06, | |
| "loss": 1.2288, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.36496350364963503, | |
| "grad_norm": 10.72587776184082, | |
| "learning_rate": 9.867111111111111e-06, | |
| "loss": 1.1154, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.36496350364963503, | |
| "eval/acc": 39.53488540649414, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.36496350364963503, | |
| "eval_loss": 2.9019012451171875, | |
| "eval_runtime": 0.553, | |
| "eval_samples_per_second": 77.752, | |
| "eval_steps_per_second": 1.808, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.36626694473409804, | |
| "grad_norm": 10.30538272857666, | |
| "learning_rate": 9.862666666666668e-06, | |
| "loss": 1.086, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.367570385818561, | |
| "grad_norm": 9.331382751464844, | |
| "learning_rate": 9.858222222222223e-06, | |
| "loss": 1.2179, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.368873826903024, | |
| "grad_norm": 9.834467887878418, | |
| "learning_rate": 9.85377777777778e-06, | |
| "loss": 1.1286, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.37017726798748696, | |
| "grad_norm": 11.874444961547852, | |
| "learning_rate": 9.849333333333334e-06, | |
| "loss": 1.1325, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.37148070907194997, | |
| "grad_norm": 10.40954875946045, | |
| "learning_rate": 9.844888888888889e-06, | |
| "loss": 1.1669, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.3727841501564129, | |
| "grad_norm": 10.013657569885254, | |
| "learning_rate": 9.840444444444446e-06, | |
| "loss": 1.0895, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.3740875912408759, | |
| "grad_norm": 10.641711235046387, | |
| "learning_rate": 9.836e-06, | |
| "loss": 1.1342, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.3753910323253389, | |
| "grad_norm": 9.41917896270752, | |
| "learning_rate": 9.831555555555557e-06, | |
| "loss": 1.0698, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.3766944734098019, | |
| "grad_norm": 10.998407363891602, | |
| "learning_rate": 9.827111111111112e-06, | |
| "loss": 1.0777, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.37799791449426484, | |
| "grad_norm": 10.565347671508789, | |
| "learning_rate": 9.822666666666667e-06, | |
| "loss": 1.1446, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.37799791449426484, | |
| "eval/acc": 34.88372039794922, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.37799791449426484, | |
| "eval_loss": 2.8377606868743896, | |
| "eval_runtime": 0.5537, | |
| "eval_samples_per_second": 77.653, | |
| "eval_steps_per_second": 1.806, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.37930135557872785, | |
| "grad_norm": 10.49682903289795, | |
| "learning_rate": 9.818222222222223e-06, | |
| "loss": 1.0774, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.3806047966631908, | |
| "grad_norm": 10.447504997253418, | |
| "learning_rate": 9.813777777777778e-06, | |
| "loss": 1.0116, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.3819082377476538, | |
| "grad_norm": 9.127096176147461, | |
| "learning_rate": 9.809333333333335e-06, | |
| "loss": 1.0786, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.38321167883211676, | |
| "grad_norm": 11.165003776550293, | |
| "learning_rate": 9.80488888888889e-06, | |
| "loss": 1.0451, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.3845151199165798, | |
| "grad_norm": 11.500470161437988, | |
| "learning_rate": 9.800444444444446e-06, | |
| "loss": 1.1676, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.3858185610010427, | |
| "grad_norm": 9.945548057556152, | |
| "learning_rate": 9.796e-06, | |
| "loss": 1.0829, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.38712200208550573, | |
| "grad_norm": 9.096894264221191, | |
| "learning_rate": 9.791555555555556e-06, | |
| "loss": 1.1571, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.38842544316996874, | |
| "grad_norm": 9.676164627075195, | |
| "learning_rate": 9.787111111111112e-06, | |
| "loss": 1.1088, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.3897288842544317, | |
| "grad_norm": 9.788176536560059, | |
| "learning_rate": 9.782666666666667e-06, | |
| "loss": 1.1283, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.3910323253388947, | |
| "grad_norm": 9.841941833496094, | |
| "learning_rate": 9.778222222222224e-06, | |
| "loss": 1.1106, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3910323253388947, | |
| "eval/acc": 34.88372039794922, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3910323253388947, | |
| "eval_loss": 2.733646869659424, | |
| "eval_runtime": 0.5496, | |
| "eval_samples_per_second": 78.234, | |
| "eval_steps_per_second": 1.819, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.39233576642335766, | |
| "grad_norm": 11.72174072265625, | |
| "learning_rate": 9.773777777777778e-06, | |
| "loss": 1.0701, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.39363920750782067, | |
| "grad_norm": 9.394064903259277, | |
| "learning_rate": 9.769333333333333e-06, | |
| "loss": 1.0966, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.3949426485922836, | |
| "grad_norm": 10.733139991760254, | |
| "learning_rate": 9.76488888888889e-06, | |
| "loss": 1.1911, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.3962460896767466, | |
| "grad_norm": 10.33555793762207, | |
| "learning_rate": 9.760444444444446e-06, | |
| "loss": 1.0843, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.3975495307612096, | |
| "grad_norm": 10.826848983764648, | |
| "learning_rate": 9.756000000000001e-06, | |
| "loss": 1.2552, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.3988529718456726, | |
| "grad_norm": 14.324176788330078, | |
| "learning_rate": 9.751555555555556e-06, | |
| "loss": 1.0654, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.40015641293013554, | |
| "grad_norm": 9.932692527770996, | |
| "learning_rate": 9.74711111111111e-06, | |
| "loss": 1.1099, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.40145985401459855, | |
| "grad_norm": 8.769567489624023, | |
| "learning_rate": 9.742666666666667e-06, | |
| "loss": 1.126, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.4027632950990615, | |
| "grad_norm": 9.914202690124512, | |
| "learning_rate": 9.738222222222224e-06, | |
| "loss": 1.0349, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.4040667361835245, | |
| "grad_norm": 8.979110717773438, | |
| "learning_rate": 9.733777777777779e-06, | |
| "loss": 1.1147, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4040667361835245, | |
| "eval/acc": 34.88372039794922, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.4040667361835245, | |
| "eval_loss": 2.9223451614379883, | |
| "eval_runtime": 0.5703, | |
| "eval_samples_per_second": 75.404, | |
| "eval_steps_per_second": 1.754, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.40537017726798746, | |
| "grad_norm": 10.178040504455566, | |
| "learning_rate": 9.729333333333334e-06, | |
| "loss": 1.0661, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.40667361835245047, | |
| "grad_norm": 9.03530216217041, | |
| "learning_rate": 9.724888888888888e-06, | |
| "loss": 1.1282, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.4079770594369134, | |
| "grad_norm": 9.545401573181152, | |
| "learning_rate": 9.720444444444445e-06, | |
| "loss": 1.0933, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.40928050052137643, | |
| "grad_norm": 9.34640121459961, | |
| "learning_rate": 9.716000000000002e-06, | |
| "loss": 1.1273, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.41058394160583944, | |
| "grad_norm": 9.456986427307129, | |
| "learning_rate": 9.711555555555556e-06, | |
| "loss": 1.1527, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4118873826903024, | |
| "grad_norm": 10.370234489440918, | |
| "learning_rate": 9.707111111111111e-06, | |
| "loss": 1.0639, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.4131908237747654, | |
| "grad_norm": 9.064216613769531, | |
| "learning_rate": 9.702666666666668e-06, | |
| "loss": 1.0942, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.41449426485922836, | |
| "grad_norm": 10.234908103942871, | |
| "learning_rate": 9.698222222222223e-06, | |
| "loss": 1.167, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.41579770594369136, | |
| "grad_norm": 10.56281566619873, | |
| "learning_rate": 9.693777777777779e-06, | |
| "loss": 1.0659, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.4171011470281543, | |
| "grad_norm": 10.342962265014648, | |
| "learning_rate": 9.689333333333334e-06, | |
| "loss": 1.0829, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4171011470281543, | |
| "eval/acc": 37.20930099487305, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4171011470281543, | |
| "eval_loss": 2.8262035846710205, | |
| "eval_runtime": 0.5536, | |
| "eval_samples_per_second": 77.675, | |
| "eval_steps_per_second": 1.806, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4184045881126173, | |
| "grad_norm": 10.684874534606934, | |
| "learning_rate": 9.684888888888889e-06, | |
| "loss": 1.1349, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.4197080291970803, | |
| "grad_norm": 10.516358375549316, | |
| "learning_rate": 9.680444444444445e-06, | |
| "loss": 1.1277, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.4210114702815433, | |
| "grad_norm": 10.139744758605957, | |
| "learning_rate": 9.676e-06, | |
| "loss": 1.0163, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.42231491136600624, | |
| "grad_norm": 8.341273307800293, | |
| "learning_rate": 9.671555555555557e-06, | |
| "loss": 1.0264, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.42361835245046925, | |
| "grad_norm": 10.223051071166992, | |
| "learning_rate": 9.667111111111112e-06, | |
| "loss": 1.135, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.4249217935349322, | |
| "grad_norm": 10.643424987792969, | |
| "learning_rate": 9.662666666666668e-06, | |
| "loss": 1.1079, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.4262252346193952, | |
| "grad_norm": 10.129752159118652, | |
| "learning_rate": 9.658222222222223e-06, | |
| "loss": 1.1081, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.42752867570385816, | |
| "grad_norm": 9.503373146057129, | |
| "learning_rate": 9.653777777777778e-06, | |
| "loss": 1.1133, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.42883211678832117, | |
| "grad_norm": 10.085280418395996, | |
| "learning_rate": 9.649333333333334e-06, | |
| "loss": 1.0587, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.4301355578727841, | |
| "grad_norm": 8.572416305541992, | |
| "learning_rate": 9.64488888888889e-06, | |
| "loss": 1.0355, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.4301355578727841, | |
| "eval/acc": 39.53488540649414, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.4301355578727841, | |
| "eval_loss": 2.875706911087036, | |
| "eval_runtime": 0.8759, | |
| "eval_samples_per_second": 49.094, | |
| "eval_steps_per_second": 1.142, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.43143899895724713, | |
| "grad_norm": 11.9055814743042, | |
| "learning_rate": 9.640444444444446e-06, | |
| "loss": 1.1596, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.43274244004171014, | |
| "grad_norm": 7.891529083251953, | |
| "learning_rate": 9.636e-06, | |
| "loss": 1.0478, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.4340458811261731, | |
| "grad_norm": 10.109358787536621, | |
| "learning_rate": 9.631555555555555e-06, | |
| "loss": 1.037, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.4353493222106361, | |
| "grad_norm": 9.673956871032715, | |
| "learning_rate": 9.627111111111112e-06, | |
| "loss": 1.0827, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.43665276329509906, | |
| "grad_norm": 9.339848518371582, | |
| "learning_rate": 9.622666666666668e-06, | |
| "loss": 1.0662, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.43795620437956206, | |
| "grad_norm": 9.403885841369629, | |
| "learning_rate": 9.618222222222223e-06, | |
| "loss": 1.051, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.439259645464025, | |
| "grad_norm": 10.163128852844238, | |
| "learning_rate": 9.613777777777778e-06, | |
| "loss": 1.082, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.440563086548488, | |
| "grad_norm": 8.699789047241211, | |
| "learning_rate": 9.609333333333333e-06, | |
| "loss": 1.1382, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.441866527632951, | |
| "grad_norm": 10.108110427856445, | |
| "learning_rate": 9.60488888888889e-06, | |
| "loss": 1.1227, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.443169968717414, | |
| "grad_norm": 11.483874320983887, | |
| "learning_rate": 9.600444444444446e-06, | |
| "loss": 1.0851, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.443169968717414, | |
| "eval/acc": 37.20930099487305, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.443169968717414, | |
| "eval_loss": 2.804619312286377, | |
| "eval_runtime": 0.5503, | |
| "eval_samples_per_second": 78.14, | |
| "eval_steps_per_second": 1.817, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.44447340980187694, | |
| "grad_norm": 9.97952938079834, | |
| "learning_rate": 9.596000000000001e-06, | |
| "loss": 1.02, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.44577685088633995, | |
| "grad_norm": 9.445475578308105, | |
| "learning_rate": 9.591555555555556e-06, | |
| "loss": 1.1166, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.4470802919708029, | |
| "grad_norm": 10.408344268798828, | |
| "learning_rate": 9.58711111111111e-06, | |
| "loss": 1.1305, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.4483837330552659, | |
| "grad_norm": 10.129816055297852, | |
| "learning_rate": 9.582666666666667e-06, | |
| "loss": 1.0882, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.44968717413972886, | |
| "grad_norm": 11.24634838104248, | |
| "learning_rate": 9.578222222222224e-06, | |
| "loss": 1.1458, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.45099061522419187, | |
| "grad_norm": 9.015290260314941, | |
| "learning_rate": 9.573777777777779e-06, | |
| "loss": 1.1692, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.4522940563086548, | |
| "grad_norm": 11.587824821472168, | |
| "learning_rate": 9.569333333333333e-06, | |
| "loss": 1.1005, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.45359749739311783, | |
| "grad_norm": 11.352563858032227, | |
| "learning_rate": 9.56488888888889e-06, | |
| "loss": 1.0646, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.45490093847758084, | |
| "grad_norm": 9.49247932434082, | |
| "learning_rate": 9.560444444444445e-06, | |
| "loss": 1.0682, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.4562043795620438, | |
| "grad_norm": 11.200020790100098, | |
| "learning_rate": 9.556000000000001e-06, | |
| "loss": 1.0354, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.4562043795620438, | |
| "eval/acc": 32.55813980102539, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.4562043795620438, | |
| "eval_loss": 2.850745916366577, | |
| "eval_runtime": 0.5511, | |
| "eval_samples_per_second": 78.029, | |
| "eval_steps_per_second": 1.815, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.4575078206465068, | |
| "grad_norm": 9.414541244506836, | |
| "learning_rate": 9.551555555555556e-06, | |
| "loss": 1.0458, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.45881126173096975, | |
| "grad_norm": 11.558536529541016, | |
| "learning_rate": 9.547111111111111e-06, | |
| "loss": 1.1236, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.46011470281543276, | |
| "grad_norm": 9.692702293395996, | |
| "learning_rate": 9.542666666666668e-06, | |
| "loss": 1.1055, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.4614181438998957, | |
| "grad_norm": 11.946589469909668, | |
| "learning_rate": 9.538222222222222e-06, | |
| "loss": 1.0746, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.4627215849843587, | |
| "grad_norm": 9.754605293273926, | |
| "learning_rate": 9.533777777777779e-06, | |
| "loss": 1.1951, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.4640250260688217, | |
| "grad_norm": 9.67271614074707, | |
| "learning_rate": 9.529333333333334e-06, | |
| "loss": 1.0727, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.4653284671532847, | |
| "grad_norm": 9.403274536132812, | |
| "learning_rate": 9.52488888888889e-06, | |
| "loss": 1.0781, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.46663190823774764, | |
| "grad_norm": 9.350010871887207, | |
| "learning_rate": 9.520444444444445e-06, | |
| "loss": 1.141, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.46793534932221065, | |
| "grad_norm": 11.810049057006836, | |
| "learning_rate": 9.516e-06, | |
| "loss": 1.0642, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.4692387904066736, | |
| "grad_norm": 9.524765014648438, | |
| "learning_rate": 9.511555555555557e-06, | |
| "loss": 1.0196, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.4692387904066736, | |
| "eval/acc": 34.88372039794922, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.4692387904066736, | |
| "eval_loss": 2.8087544441223145, | |
| "eval_runtime": 0.5519, | |
| "eval_samples_per_second": 77.911, | |
| "eval_steps_per_second": 1.812, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.4705422314911366, | |
| "grad_norm": 11.370705604553223, | |
| "learning_rate": 9.507111111111111e-06, | |
| "loss": 1.0253, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.47184567257559956, | |
| "grad_norm": 8.690403938293457, | |
| "learning_rate": 9.502666666666668e-06, | |
| "loss": 1.071, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.47314911366006257, | |
| "grad_norm": 9.085663795471191, | |
| "learning_rate": 9.498222222222223e-06, | |
| "loss": 1.1507, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.4744525547445255, | |
| "grad_norm": 9.252151489257812, | |
| "learning_rate": 9.493777777777778e-06, | |
| "loss": 1.1038, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.47575599582898853, | |
| "grad_norm": 11.12983226776123, | |
| "learning_rate": 9.489333333333334e-06, | |
| "loss": 1.0457, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.4770594369134515, | |
| "grad_norm": 9.117828369140625, | |
| "learning_rate": 9.48488888888889e-06, | |
| "loss": 1.0704, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.4783628779979145, | |
| "grad_norm": 10.71731185913086, | |
| "learning_rate": 9.480444444444446e-06, | |
| "loss": 1.0461, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.4796663190823775, | |
| "grad_norm": 8.495375633239746, | |
| "learning_rate": 9.476e-06, | |
| "loss": 1.1173, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.48096976016684045, | |
| "grad_norm": 10.223701477050781, | |
| "learning_rate": 9.471555555555555e-06, | |
| "loss": 1.1301, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.48227320125130346, | |
| "grad_norm": 10.180765151977539, | |
| "learning_rate": 9.467111111111112e-06, | |
| "loss": 1.1262, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.48227320125130346, | |
| "eval/acc": 37.20930099487305, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.48227320125130346, | |
| "eval_loss": 2.7609646320343018, | |
| "eval_runtime": 0.5504, | |
| "eval_samples_per_second": 78.122, | |
| "eval_steps_per_second": 1.817, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.4835766423357664, | |
| "grad_norm": 10.506937026977539, | |
| "learning_rate": 9.462666666666668e-06, | |
| "loss": 1.0366, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.4848800834202294, | |
| "grad_norm": 11.131478309631348, | |
| "learning_rate": 9.458222222222223e-06, | |
| "loss": 0.9718, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.4861835245046924, | |
| "grad_norm": 11.127948760986328, | |
| "learning_rate": 9.453777777777778e-06, | |
| "loss": 1.1568, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.4874869655891554, | |
| "grad_norm": 10.03661060333252, | |
| "learning_rate": 9.449333333333333e-06, | |
| "loss": 1.0066, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.48879040667361834, | |
| "grad_norm": 11.38666820526123, | |
| "learning_rate": 9.44488888888889e-06, | |
| "loss": 1.0457, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.49009384775808135, | |
| "grad_norm": 9.510127067565918, | |
| "learning_rate": 9.440444444444446e-06, | |
| "loss": 1.1167, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.4913972888425443, | |
| "grad_norm": 10.810651779174805, | |
| "learning_rate": 9.436e-06, | |
| "loss": 1.1126, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.4927007299270073, | |
| "grad_norm": 9.202433586120605, | |
| "learning_rate": 9.431555555555556e-06, | |
| "loss": 1.0681, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.49400417101147026, | |
| "grad_norm": 8.647710800170898, | |
| "learning_rate": 9.427111111111112e-06, | |
| "loss": 1.0295, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 0.49530761209593327, | |
| "grad_norm": 11.453765869140625, | |
| "learning_rate": 9.422666666666667e-06, | |
| "loss": 1.015, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.49530761209593327, | |
| "eval/acc": 37.20930099487305, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.49530761209593327, | |
| "eval_loss": 2.7812387943267822, | |
| "eval_runtime": 0.5494, | |
| "eval_samples_per_second": 78.268, | |
| "eval_steps_per_second": 1.82, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.4966110531803962, | |
| "grad_norm": 10.551323890686035, | |
| "learning_rate": 9.418222222222224e-06, | |
| "loss": 1.1188, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 0.49791449426485923, | |
| "grad_norm": 10.307533264160156, | |
| "learning_rate": 9.413777777777778e-06, | |
| "loss": 1.0767, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 0.4992179353493222, | |
| "grad_norm": 12.098529815673828, | |
| "learning_rate": 9.409333333333333e-06, | |
| "loss": 1.0597, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 0.5005213764337852, | |
| "grad_norm": 10.920623779296875, | |
| "learning_rate": 9.40488888888889e-06, | |
| "loss": 0.9847, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 0.5018248175182481, | |
| "grad_norm": 10.035759925842285, | |
| "learning_rate": 9.400444444444445e-06, | |
| "loss": 1.07, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5031282586027112, | |
| "grad_norm": 10.293031692504883, | |
| "learning_rate": 9.396000000000001e-06, | |
| "loss": 1.0453, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 0.5044316996871742, | |
| "grad_norm": 9.7219877243042, | |
| "learning_rate": 9.391555555555556e-06, | |
| "loss": 1.0783, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 0.5057351407716372, | |
| "grad_norm": 9.780116081237793, | |
| "learning_rate": 9.387111111111113e-06, | |
| "loss": 1.021, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 0.5070385818561001, | |
| "grad_norm": 10.145584106445312, | |
| "learning_rate": 9.382666666666667e-06, | |
| "loss": 1.0744, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 0.5083420229405631, | |
| "grad_norm": 9.737056732177734, | |
| "learning_rate": 9.378222222222222e-06, | |
| "loss": 1.1837, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5083420229405631, | |
| "eval/acc": 34.88372039794922, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5083420229405631, | |
| "eval_loss": 2.6774258613586426, | |
| "eval_runtime": 0.5548, | |
| "eval_samples_per_second": 77.509, | |
| "eval_steps_per_second": 1.803, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5096454640250261, | |
| "grad_norm": 9.52910041809082, | |
| "learning_rate": 9.373777777777779e-06, | |
| "loss": 1.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 0.5109489051094891, | |
| "grad_norm": 11.480224609375, | |
| "learning_rate": 9.369333333333334e-06, | |
| "loss": 1.029, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 0.512252346193952, | |
| "grad_norm": 8.294060707092285, | |
| "learning_rate": 9.36488888888889e-06, | |
| "loss": 1.0584, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 0.513555787278415, | |
| "grad_norm": 8.96554946899414, | |
| "learning_rate": 9.360444444444445e-06, | |
| "loss": 1.0415, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 0.514859228362878, | |
| "grad_norm": 10.146249771118164, | |
| "learning_rate": 9.356e-06, | |
| "loss": 0.9763, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.516162669447341, | |
| "grad_norm": 9.620243072509766, | |
| "learning_rate": 9.351555555555556e-06, | |
| "loss": 1.0677, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 0.5174661105318039, | |
| "grad_norm": 8.995674133300781, | |
| "learning_rate": 9.347111111111113e-06, | |
| "loss": 1.0893, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 0.5187695516162669, | |
| "grad_norm": 10.30301284790039, | |
| "learning_rate": 9.342666666666668e-06, | |
| "loss": 1.0958, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 0.5200729927007299, | |
| "grad_norm": 9.020184516906738, | |
| "learning_rate": 9.338222222222223e-06, | |
| "loss": 1.0115, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 0.5213764337851929, | |
| "grad_norm": 11.706809997558594, | |
| "learning_rate": 9.333777777777777e-06, | |
| "loss": 1.1306, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5213764337851929, | |
| "eval/acc": 34.88372039794922, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5213764337851929, | |
| "eval_loss": 2.719060182571411, | |
| "eval_runtime": 0.5502, | |
| "eval_samples_per_second": 78.155, | |
| "eval_steps_per_second": 1.818, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5226798748696558, | |
| "grad_norm": 10.49409294128418, | |
| "learning_rate": 9.329333333333334e-06, | |
| "loss": 1.0554, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 0.5239833159541188, | |
| "grad_norm": 7.883603572845459, | |
| "learning_rate": 9.32488888888889e-06, | |
| "loss": 0.9968, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 0.5252867570385819, | |
| "grad_norm": 11.045550346374512, | |
| "learning_rate": 9.320444444444445e-06, | |
| "loss": 1.1689, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 0.5265901981230449, | |
| "grad_norm": 9.245767593383789, | |
| "learning_rate": 9.316e-06, | |
| "loss": 1.0647, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 0.5278936392075079, | |
| "grad_norm": 8.662199974060059, | |
| "learning_rate": 9.311555555555555e-06, | |
| "loss": 0.9952, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.5291970802919708, | |
| "grad_norm": 8.584678649902344, | |
| "learning_rate": 9.307111111111112e-06, | |
| "loss": 1.0025, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 0.5305005213764338, | |
| "grad_norm": 8.951703071594238, | |
| "learning_rate": 9.302666666666668e-06, | |
| "loss": 1.0182, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 0.5318039624608968, | |
| "grad_norm": 11.469212532043457, | |
| "learning_rate": 9.298222222222223e-06, | |
| "loss": 1.086, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 0.5331074035453598, | |
| "grad_norm": 10.124979972839355, | |
| "learning_rate": 9.293777777777778e-06, | |
| "loss": 1.0614, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 0.5344108446298227, | |
| "grad_norm": 9.715713500976562, | |
| "learning_rate": 9.289333333333334e-06, | |
| "loss": 1.029, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5344108446298227, | |
| "eval/acc": 39.53488540649414, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5344108446298227, | |
| "eval_loss": 2.6891214847564697, | |
| "eval_runtime": 0.5531, | |
| "eval_samples_per_second": 77.75, | |
| "eval_steps_per_second": 1.808, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5357142857142857, | |
| "grad_norm": 10.887805938720703, | |
| "learning_rate": 9.28488888888889e-06, | |
| "loss": 1.1104, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 0.5370177267987487, | |
| "grad_norm": 8.276104927062988, | |
| "learning_rate": 9.280444444444446e-06, | |
| "loss": 1.0563, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 0.5383211678832117, | |
| "grad_norm": 9.104747772216797, | |
| "learning_rate": 9.276e-06, | |
| "loss": 1.0732, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 0.5396246089676746, | |
| "grad_norm": 10.727592468261719, | |
| "learning_rate": 9.271555555555555e-06, | |
| "loss": 1.0253, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 0.5409280500521376, | |
| "grad_norm": 10.487238883972168, | |
| "learning_rate": 9.267111111111112e-06, | |
| "loss": 1.0849, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.5422314911366006, | |
| "grad_norm": 9.830368995666504, | |
| "learning_rate": 9.262666666666667e-06, | |
| "loss": 1.0699, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 0.5435349322210636, | |
| "grad_norm": 9.725363731384277, | |
| "learning_rate": 9.258222222222223e-06, | |
| "loss": 1.0149, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 0.5448383733055265, | |
| "grad_norm": 10.23435115814209, | |
| "learning_rate": 9.253777777777778e-06, | |
| "loss": 0.9648, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 0.5461418143899895, | |
| "grad_norm": 8.573326110839844, | |
| "learning_rate": 9.249333333333335e-06, | |
| "loss": 1.0607, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 0.5474452554744526, | |
| "grad_norm": 9.514001846313477, | |
| "learning_rate": 9.24488888888889e-06, | |
| "loss": 1.0196, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.5474452554744526, | |
| "eval/acc": 32.55813980102539, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.5474452554744526, | |
| "eval_loss": 2.5407004356384277, | |
| "eval_runtime": 0.5515, | |
| "eval_samples_per_second": 77.969, | |
| "eval_steps_per_second": 1.813, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.5487486965589156, | |
| "grad_norm": 9.46273136138916, | |
| "learning_rate": 9.240444444444444e-06, | |
| "loss": 1.0557, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 0.5500521376433786, | |
| "grad_norm": 12.82573127746582, | |
| "learning_rate": 9.236000000000001e-06, | |
| "loss": 1.051, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 0.5513555787278415, | |
| "grad_norm": 10.965460777282715, | |
| "learning_rate": 9.231555555555556e-06, | |
| "loss": 0.9239, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 0.5526590198123045, | |
| "grad_norm": 9.015987396240234, | |
| "learning_rate": 9.227111111111112e-06, | |
| "loss": 1.0477, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 0.5539624608967675, | |
| "grad_norm": 8.61673355102539, | |
| "learning_rate": 9.222666666666667e-06, | |
| "loss": 1.0693, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.5552659019812305, | |
| "grad_norm": 9.152997016906738, | |
| "learning_rate": 9.218222222222222e-06, | |
| "loss": 1.0104, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 0.5565693430656934, | |
| "grad_norm": 8.82421588897705, | |
| "learning_rate": 9.213777777777779e-06, | |
| "loss": 1.0304, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 0.5578727841501564, | |
| "grad_norm": 9.665721893310547, | |
| "learning_rate": 9.209333333333335e-06, | |
| "loss": 1.0608, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 0.5591762252346194, | |
| "grad_norm": 10.174515724182129, | |
| "learning_rate": 9.20488888888889e-06, | |
| "loss": 1.099, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 0.5604796663190824, | |
| "grad_norm": 9.723739624023438, | |
| "learning_rate": 9.200444444444445e-06, | |
| "loss": 1.0327, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.5604796663190824, | |
| "eval/acc": 34.88372039794922, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.5604796663190824, | |
| "eval_loss": 2.560245990753174, | |
| "eval_runtime": 0.5506, | |
| "eval_samples_per_second": 78.095, | |
| "eval_steps_per_second": 1.816, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.5617831074035453, | |
| "grad_norm": 9.028182983398438, | |
| "learning_rate": 9.196e-06, | |
| "loss": 1.0447, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 0.5630865484880083, | |
| "grad_norm": 9.231035232543945, | |
| "learning_rate": 9.191555555555556e-06, | |
| "loss": 1.0014, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 0.5643899895724713, | |
| "grad_norm": 9.409144401550293, | |
| "learning_rate": 9.187111111111113e-06, | |
| "loss": 1.0805, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 0.5656934306569343, | |
| "grad_norm": 9.330337524414062, | |
| "learning_rate": 9.182666666666668e-06, | |
| "loss": 0.9831, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 0.5669968717413972, | |
| "grad_norm": 9.44364070892334, | |
| "learning_rate": 9.178222222222222e-06, | |
| "loss": 1.1507, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.5683003128258602, | |
| "grad_norm": 8.195267677307129, | |
| "learning_rate": 9.173777777777777e-06, | |
| "loss": 1.0538, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 0.5696037539103233, | |
| "grad_norm": 10.082292556762695, | |
| "learning_rate": 9.169333333333334e-06, | |
| "loss": 1.1772, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 0.5709071949947863, | |
| "grad_norm": 7.957224369049072, | |
| "learning_rate": 9.16488888888889e-06, | |
| "loss": 0.9671, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 0.5722106360792493, | |
| "grad_norm": 9.066376686096191, | |
| "learning_rate": 9.160444444444445e-06, | |
| "loss": 0.9564, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 0.5735140771637122, | |
| "grad_norm": 9.167228698730469, | |
| "learning_rate": 9.156e-06, | |
| "loss": 1.0625, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.5735140771637122, | |
| "eval/acc": 37.20930099487305, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.5735140771637122, | |
| "eval_loss": 2.62846040725708, | |
| "eval_runtime": 0.5519, | |
| "eval_samples_per_second": 77.91, | |
| "eval_steps_per_second": 1.812, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.5748175182481752, | |
| "grad_norm": 11.493626594543457, | |
| "learning_rate": 9.151555555555557e-06, | |
| "loss": 1.0258, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 0.5761209593326382, | |
| "grad_norm": 11.340927124023438, | |
| "learning_rate": 9.147111111111111e-06, | |
| "loss": 1.0167, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 0.5774244004171012, | |
| "grad_norm": 9.083796501159668, | |
| "learning_rate": 9.142666666666668e-06, | |
| "loss": 1.0704, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 0.5787278415015641, | |
| "grad_norm": 10.585103988647461, | |
| "learning_rate": 9.138222222222223e-06, | |
| "loss": 1.1001, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 0.5800312825860271, | |
| "grad_norm": 10.192399024963379, | |
| "learning_rate": 9.133777777777778e-06, | |
| "loss": 1.0589, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.5813347236704901, | |
| "grad_norm": 9.637321472167969, | |
| "learning_rate": 9.129333333333334e-06, | |
| "loss": 1.0217, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 0.5826381647549531, | |
| "grad_norm": 11.652050018310547, | |
| "learning_rate": 9.124888888888889e-06, | |
| "loss": 1.1136, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 0.583941605839416, | |
| "grad_norm": 9.2413969039917, | |
| "learning_rate": 9.120444444444446e-06, | |
| "loss": 1.0009, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 0.585245046923879, | |
| "grad_norm": 9.579240798950195, | |
| "learning_rate": 9.116e-06, | |
| "loss": 1.0948, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 0.586548488008342, | |
| "grad_norm": 10.748444557189941, | |
| "learning_rate": 9.111555555555557e-06, | |
| "loss": 1.0969, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.586548488008342, | |
| "eval/acc": 34.88372039794922, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.586548488008342, | |
| "eval_loss": 2.528625726699829, | |
| "eval_runtime": 0.5514, | |
| "eval_samples_per_second": 77.977, | |
| "eval_steps_per_second": 1.813, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.587851929092805, | |
| "grad_norm": 9.715644836425781, | |
| "learning_rate": 9.107111111111112e-06, | |
| "loss": 1.1046, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 0.5891553701772679, | |
| "grad_norm": 9.33938980102539, | |
| "learning_rate": 9.102666666666667e-06, | |
| "loss": 0.9897, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 0.5904588112617309, | |
| "grad_norm": 8.88958740234375, | |
| "learning_rate": 9.098222222222223e-06, | |
| "loss": 1.1344, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 0.591762252346194, | |
| "grad_norm": 10.64390754699707, | |
| "learning_rate": 9.093777777777778e-06, | |
| "loss": 1.0106, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 0.593065693430657, | |
| "grad_norm": 9.564251899719238, | |
| "learning_rate": 9.089333333333335e-06, | |
| "loss": 1.0985, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.59436913451512, | |
| "grad_norm": 9.475229263305664, | |
| "learning_rate": 9.08488888888889e-06, | |
| "loss": 1.0643, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 0.5956725755995829, | |
| "grad_norm": 8.694733619689941, | |
| "learning_rate": 9.080444444444444e-06, | |
| "loss": 1.0899, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 0.5969760166840459, | |
| "grad_norm": 9.67250919342041, | |
| "learning_rate": 9.076000000000001e-06, | |
| "loss": 1.0518, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 0.5982794577685089, | |
| "grad_norm": 9.918119430541992, | |
| "learning_rate": 9.071555555555557e-06, | |
| "loss": 1.139, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 0.5995828988529719, | |
| "grad_norm": 11.0655517578125, | |
| "learning_rate": 9.067111111111112e-06, | |
| "loss": 1.0195, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.5995828988529719, | |
| "eval/acc": 37.20930099487305, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.5995828988529719, | |
| "eval_loss": 2.581063747406006, | |
| "eval_runtime": 0.5974, | |
| "eval_samples_per_second": 71.985, | |
| "eval_steps_per_second": 1.674, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6008863399374348, | |
| "grad_norm": 9.673873901367188, | |
| "learning_rate": 9.062666666666667e-06, | |
| "loss": 1.0139, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 0.6021897810218978, | |
| "grad_norm": 9.95392894744873, | |
| "learning_rate": 9.058222222222222e-06, | |
| "loss": 1.0544, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 0.6034932221063608, | |
| "grad_norm": 11.47777271270752, | |
| "learning_rate": 9.053777777777778e-06, | |
| "loss": 1.0851, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 0.6047966631908238, | |
| "grad_norm": 8.379030227661133, | |
| "learning_rate": 9.049333333333335e-06, | |
| "loss": 1.0244, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 0.6061001042752867, | |
| "grad_norm": 8.413164138793945, | |
| "learning_rate": 9.04488888888889e-06, | |
| "loss": 1.0619, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6074035453597497, | |
| "grad_norm": 10.171146392822266, | |
| "learning_rate": 9.040444444444445e-06, | |
| "loss": 1.0794, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 0.6087069864442127, | |
| "grad_norm": 10.772948265075684, | |
| "learning_rate": 9.036e-06, | |
| "loss": 1.0795, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 0.6100104275286757, | |
| "grad_norm": 9.68770980834961, | |
| "learning_rate": 9.031555555555556e-06, | |
| "loss": 1.0261, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 0.6113138686131386, | |
| "grad_norm": 9.47791862487793, | |
| "learning_rate": 9.027111111111113e-06, | |
| "loss": 1.1494, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 0.6126173096976016, | |
| "grad_norm": 9.655404090881348, | |
| "learning_rate": 9.022666666666667e-06, | |
| "loss": 1.1113, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6126173096976016, | |
| "eval/acc": 41.86046600341797, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6126173096976016, | |
| "eval_loss": 2.4572794437408447, | |
| "eval_runtime": 0.551, | |
| "eval_samples_per_second": 78.043, | |
| "eval_steps_per_second": 1.815, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6139207507820647, | |
| "grad_norm": 11.384035110473633, | |
| "learning_rate": 9.018222222222222e-06, | |
| "loss": 0.9002, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 0.6152241918665277, | |
| "grad_norm": 9.862360000610352, | |
| "learning_rate": 9.013777777777779e-06, | |
| "loss": 0.9838, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 0.6165276329509907, | |
| "grad_norm": 8.860601425170898, | |
| "learning_rate": 9.009333333333334e-06, | |
| "loss": 1.0471, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 0.6178310740354536, | |
| "grad_norm": 9.085923194885254, | |
| "learning_rate": 9.00488888888889e-06, | |
| "loss": 1.1413, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 0.6191345151199166, | |
| "grad_norm": 7.881019115447998, | |
| "learning_rate": 9.000444444444445e-06, | |
| "loss": 1.0241, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.6204379562043796, | |
| "grad_norm": 9.55480670928955, | |
| "learning_rate": 8.996e-06, | |
| "loss": 1.0634, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 0.6217413972888426, | |
| "grad_norm": 8.191434860229492, | |
| "learning_rate": 8.991555555555556e-06, | |
| "loss": 1.0264, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 0.6230448383733055, | |
| "grad_norm": 11.498793601989746, | |
| "learning_rate": 8.987111111111111e-06, | |
| "loss": 1.0466, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 0.6243482794577685, | |
| "grad_norm": 8.848291397094727, | |
| "learning_rate": 8.982666666666668e-06, | |
| "loss": 0.9813, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 0.6256517205422315, | |
| "grad_norm": 8.858402252197266, | |
| "learning_rate": 8.978222222222223e-06, | |
| "loss": 1.0143, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.6256517205422315, | |
| "eval/acc": 34.88372039794922, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.6256517205422315, | |
| "eval_loss": 2.6291277408599854, | |
| "eval_runtime": 0.557, | |
| "eval_samples_per_second": 77.206, | |
| "eval_steps_per_second": 1.795, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.6269551616266945, | |
| "grad_norm": 9.703082084655762, | |
| "learning_rate": 8.97377777777778e-06, | |
| "loss": 1.0354, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 0.6282586027111574, | |
| "grad_norm": 8.061450004577637, | |
| "learning_rate": 8.969333333333334e-06, | |
| "loss": 1.0989, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 0.6295620437956204, | |
| "grad_norm": 8.38237476348877, | |
| "learning_rate": 8.964888888888889e-06, | |
| "loss": 1.0303, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 0.6308654848800834, | |
| "grad_norm": 9.098999977111816, | |
| "learning_rate": 8.960444444444445e-06, | |
| "loss": 1.1374, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 0.6321689259645464, | |
| "grad_norm": 8.959243774414062, | |
| "learning_rate": 8.956e-06, | |
| "loss": 1.0242, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.6334723670490093, | |
| "grad_norm": 10.157614707946777, | |
| "learning_rate": 8.951555555555557e-06, | |
| "loss": 1.134, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 0.6347758081334723, | |
| "grad_norm": 10.983575820922852, | |
| "learning_rate": 8.947111111111112e-06, | |
| "loss": 0.9518, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 0.6360792492179353, | |
| "grad_norm": 11.162731170654297, | |
| "learning_rate": 8.942666666666667e-06, | |
| "loss": 1.0702, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 0.6373826903023984, | |
| "grad_norm": 9.005561828613281, | |
| "learning_rate": 8.938222222222223e-06, | |
| "loss": 1.0228, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 0.6386861313868614, | |
| "grad_norm": 9.825065612792969, | |
| "learning_rate": 8.93377777777778e-06, | |
| "loss": 1.0373, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.6386861313868614, | |
| "eval/acc": 32.55813980102539, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.6386861313868614, | |
| "eval_loss": 2.755546808242798, | |
| "eval_runtime": 0.5529, | |
| "eval_samples_per_second": 77.765, | |
| "eval_steps_per_second": 1.808, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.6399895724713243, | |
| "grad_norm": 8.52741813659668, | |
| "learning_rate": 8.929333333333334e-06, | |
| "loss": 1.066, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 0.6412930135557873, | |
| "grad_norm": 9.974360466003418, | |
| "learning_rate": 8.92488888888889e-06, | |
| "loss": 1.0397, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 0.6425964546402503, | |
| "grad_norm": 8.10251235961914, | |
| "learning_rate": 8.920444444444444e-06, | |
| "loss": 1.0832, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 0.6438998957247133, | |
| "grad_norm": 10.143448829650879, | |
| "learning_rate": 8.916e-06, | |
| "loss": 1.0112, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 0.6452033368091762, | |
| "grad_norm": 10.25130844116211, | |
| "learning_rate": 8.911555555555557e-06, | |
| "loss": 1.0808, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.6465067778936392, | |
| "grad_norm": 11.107799530029297, | |
| "learning_rate": 8.907111111111112e-06, | |
| "loss": 1.0547, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 0.6478102189781022, | |
| "grad_norm": 10.128641128540039, | |
| "learning_rate": 8.902666666666667e-06, | |
| "loss": 1.0721, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 0.6491136600625652, | |
| "grad_norm": 10.3110933303833, | |
| "learning_rate": 8.898222222222222e-06, | |
| "loss": 0.9976, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 0.6504171011470281, | |
| "grad_norm": 8.941389083862305, | |
| "learning_rate": 8.893777777777778e-06, | |
| "loss": 1.0196, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 0.6517205422314911, | |
| "grad_norm": 10.89724063873291, | |
| "learning_rate": 8.889333333333335e-06, | |
| "loss": 1.0551, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6517205422314911, | |
| "eval/acc": 34.88372039794922, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6517205422314911, | |
| "eval_loss": 2.655290126800537, | |
| "eval_runtime": 0.5523, | |
| "eval_samples_per_second": 77.86, | |
| "eval_steps_per_second": 1.811, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6530239833159541, | |
| "grad_norm": 8.989163398742676, | |
| "learning_rate": 8.88488888888889e-06, | |
| "loss": 0.9939, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 0.6543274244004171, | |
| "grad_norm": 10.607318878173828, | |
| "learning_rate": 8.880444444444445e-06, | |
| "loss": 0.9934, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 0.65563086548488, | |
| "grad_norm": 9.469202995300293, | |
| "learning_rate": 8.876e-06, | |
| "loss": 0.9683, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 0.656934306569343, | |
| "grad_norm": 8.721243858337402, | |
| "learning_rate": 8.871555555555556e-06, | |
| "loss": 1.0208, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 0.658237747653806, | |
| "grad_norm": 10.423583984375, | |
| "learning_rate": 8.867111111111112e-06, | |
| "loss": 1.0284, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.6595411887382691, | |
| "grad_norm": 9.084837913513184, | |
| "learning_rate": 8.862666666666667e-06, | |
| "loss": 1.0282, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 0.6608446298227321, | |
| "grad_norm": 9.282791137695312, | |
| "learning_rate": 8.858222222222222e-06, | |
| "loss": 1.0565, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 0.662148070907195, | |
| "grad_norm": 9.868699073791504, | |
| "learning_rate": 8.853777777777779e-06, | |
| "loss": 1.0749, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 0.663451511991658, | |
| "grad_norm": 9.887734413146973, | |
| "learning_rate": 8.849333333333334e-06, | |
| "loss": 1.1138, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 0.664754953076121, | |
| "grad_norm": 9.476777076721191, | |
| "learning_rate": 8.84488888888889e-06, | |
| "loss": 0.987, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.664754953076121, | |
| "eval/acc": 34.88372039794922, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.664754953076121, | |
| "eval_loss": 2.7898762226104736, | |
| "eval_runtime": 0.555, | |
| "eval_samples_per_second": 77.483, | |
| "eval_steps_per_second": 1.802, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.666058394160584, | |
| "grad_norm": 8.887063026428223, | |
| "learning_rate": 8.840444444444445e-06, | |
| "loss": 1.0133, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 0.6673618352450469, | |
| "grad_norm": 8.670061111450195, | |
| "learning_rate": 8.836000000000001e-06, | |
| "loss": 1.0152, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 0.6686652763295099, | |
| "grad_norm": 10.9150390625, | |
| "learning_rate": 8.831555555555556e-06, | |
| "loss": 1.0866, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 0.6699687174139729, | |
| "grad_norm": 9.245491027832031, | |
| "learning_rate": 8.827111111111111e-06, | |
| "loss": 1.0282, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 0.6712721584984359, | |
| "grad_norm": 8.521747589111328, | |
| "learning_rate": 8.822666666666668e-06, | |
| "loss": 0.9966, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.6725755995828988, | |
| "grad_norm": 10.43109130859375, | |
| "learning_rate": 8.818222222222223e-06, | |
| "loss": 1.0034, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 0.6738790406673618, | |
| "grad_norm": 8.546734809875488, | |
| "learning_rate": 8.813777777777779e-06, | |
| "loss": 1.034, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 0.6751824817518248, | |
| "grad_norm": 9.694477081298828, | |
| "learning_rate": 8.809333333333334e-06, | |
| "loss": 1.0633, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 0.6764859228362878, | |
| "grad_norm": 9.622178077697754, | |
| "learning_rate": 8.804888888888889e-06, | |
| "loss": 1.0111, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 0.6777893639207507, | |
| "grad_norm": 8.308663368225098, | |
| "learning_rate": 8.800444444444445e-06, | |
| "loss": 1.0342, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.6777893639207507, | |
| "eval/acc": 39.53488540649414, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.6777893639207507, | |
| "eval_loss": 2.570695161819458, | |
| "eval_runtime": 0.9101, | |
| "eval_samples_per_second": 47.246, | |
| "eval_steps_per_second": 1.099, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.6790928050052137, | |
| "grad_norm": 8.973245620727539, | |
| "learning_rate": 8.796000000000002e-06, | |
| "loss": 1.0253, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 0.6803962460896767, | |
| "grad_norm": 10.03006649017334, | |
| "learning_rate": 8.791555555555557e-06, | |
| "loss": 1.0575, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 0.6816996871741398, | |
| "grad_norm": 9.414715766906738, | |
| "learning_rate": 8.787111111111112e-06, | |
| "loss": 1.0044, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 0.6830031282586028, | |
| "grad_norm": 10.497982025146484, | |
| "learning_rate": 8.782666666666666e-06, | |
| "loss": 1.0556, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 0.6843065693430657, | |
| "grad_norm": 9.626456260681152, | |
| "learning_rate": 8.778222222222223e-06, | |
| "loss": 1.0294, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.6856100104275287, | |
| "grad_norm": 8.467864036560059, | |
| "learning_rate": 8.77377777777778e-06, | |
| "loss": 1.0754, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 0.6869134515119917, | |
| "grad_norm": 10.255539894104004, | |
| "learning_rate": 8.769333333333334e-06, | |
| "loss": 1.013, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 0.6882168925964547, | |
| "grad_norm": 9.694276809692383, | |
| "learning_rate": 8.764888888888889e-06, | |
| "loss": 1.0573, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 0.6895203336809176, | |
| "grad_norm": 9.310009002685547, | |
| "learning_rate": 8.760444444444444e-06, | |
| "loss": 0.9638, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 0.6908237747653806, | |
| "grad_norm": 8.105212211608887, | |
| "learning_rate": 8.756e-06, | |
| "loss": 0.9974, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.6908237747653806, | |
| "eval/acc": 34.88372039794922, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.6908237747653806, | |
| "eval_loss": 2.696380376815796, | |
| "eval_runtime": 0.5519, | |
| "eval_samples_per_second": 77.91, | |
| "eval_steps_per_second": 1.812, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.6921272158498436, | |
| "grad_norm": 9.850837707519531, | |
| "learning_rate": 8.751555555555557e-06, | |
| "loss": 1.0513, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 0.6934306569343066, | |
| "grad_norm": 9.398372650146484, | |
| "learning_rate": 8.747111111111112e-06, | |
| "loss": 0.9589, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 0.6947340980187695, | |
| "grad_norm": 9.533944129943848, | |
| "learning_rate": 8.742666666666667e-06, | |
| "loss": 1.0183, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 0.6960375391032325, | |
| "grad_norm": 9.51920223236084, | |
| "learning_rate": 8.738222222222222e-06, | |
| "loss": 0.9897, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 0.6973409801876955, | |
| "grad_norm": 8.380166053771973, | |
| "learning_rate": 8.733777777777778e-06, | |
| "loss": 0.9377, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.6986444212721585, | |
| "grad_norm": 9.34756088256836, | |
| "learning_rate": 8.729333333333335e-06, | |
| "loss": 0.9827, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 0.6999478623566214, | |
| "grad_norm": 8.969053268432617, | |
| "learning_rate": 8.72488888888889e-06, | |
| "loss": 1.0291, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 0.7012513034410844, | |
| "grad_norm": 11.20063304901123, | |
| "learning_rate": 8.720444444444444e-06, | |
| "loss": 1.0111, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 0.7025547445255474, | |
| "grad_norm": 9.322565078735352, | |
| "learning_rate": 8.716000000000001e-06, | |
| "loss": 0.9815, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 0.7038581856100105, | |
| "grad_norm": 9.362802505493164, | |
| "learning_rate": 8.711555555555556e-06, | |
| "loss": 1.0608, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7038581856100105, | |
| "eval/acc": 32.55813980102539, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7038581856100105, | |
| "eval_loss": 2.7143032550811768, | |
| "eval_runtime": 0.5523, | |
| "eval_samples_per_second": 77.854, | |
| "eval_steps_per_second": 1.811, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7051616266944735, | |
| "grad_norm": 7.582075119018555, | |
| "learning_rate": 8.707111111111112e-06, | |
| "loss": 0.983, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 0.7064650677789364, | |
| "grad_norm": 9.668004989624023, | |
| "learning_rate": 8.702666666666667e-06, | |
| "loss": 0.9948, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 0.7077685088633994, | |
| "grad_norm": 9.572304725646973, | |
| "learning_rate": 8.698222222222224e-06, | |
| "loss": 0.989, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 0.7090719499478624, | |
| "grad_norm": 9.887052536010742, | |
| "learning_rate": 8.693777777777779e-06, | |
| "loss": 1.0045, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 0.7103753910323254, | |
| "grad_norm": 10.95411205291748, | |
| "learning_rate": 8.689333333333333e-06, | |
| "loss": 1.1597, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.7116788321167883, | |
| "grad_norm": 8.732405662536621, | |
| "learning_rate": 8.68488888888889e-06, | |
| "loss": 0.9946, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 0.7129822732012513, | |
| "grad_norm": 10.515278816223145, | |
| "learning_rate": 8.680444444444445e-06, | |
| "loss": 1.0129, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 11.1256685256958, | |
| "learning_rate": 8.676000000000001e-06, | |
| "loss": 1.0248, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 0.7155891553701773, | |
| "grad_norm": 8.381686210632324, | |
| "learning_rate": 8.671555555555556e-06, | |
| "loss": 1.1366, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 0.7168925964546402, | |
| "grad_norm": 9.804370880126953, | |
| "learning_rate": 8.667111111111111e-06, | |
| "loss": 1.0714, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7168925964546402, | |
| "eval/acc": 32.55813980102539, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7168925964546402, | |
| "eval_loss": 2.6976399421691895, | |
| "eval_runtime": 0.5537, | |
| "eval_samples_per_second": 77.656, | |
| "eval_steps_per_second": 1.806, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7181960375391032, | |
| "grad_norm": 8.139718055725098, | |
| "learning_rate": 8.662666666666668e-06, | |
| "loss": 1.0368, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 0.7194994786235662, | |
| "grad_norm": 10.484297752380371, | |
| "learning_rate": 8.658222222222224e-06, | |
| "loss": 1.0055, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 0.7208029197080292, | |
| "grad_norm": 8.326539993286133, | |
| "learning_rate": 8.653777777777779e-06, | |
| "loss": 1.0275, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 0.7221063607924921, | |
| "grad_norm": 9.043944358825684, | |
| "learning_rate": 8.649333333333334e-06, | |
| "loss": 1.0032, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 0.7234098018769551, | |
| "grad_norm": 8.754300117492676, | |
| "learning_rate": 8.644888888888889e-06, | |
| "loss": 0.9642, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.7247132429614181, | |
| "grad_norm": 8.33322525024414, | |
| "learning_rate": 8.640444444444445e-06, | |
| "loss": 1.014, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 0.7260166840458812, | |
| "grad_norm": 8.533761024475098, | |
| "learning_rate": 8.636000000000002e-06, | |
| "loss": 1.0439, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 0.7273201251303442, | |
| "grad_norm": 10.058277130126953, | |
| "learning_rate": 8.631555555555557e-06, | |
| "loss": 0.9821, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 0.7286235662148071, | |
| "grad_norm": 7.9940009117126465, | |
| "learning_rate": 8.627111111111111e-06, | |
| "loss": 1.0351, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 0.7299270072992701, | |
| "grad_norm": 9.121907234191895, | |
| "learning_rate": 8.622666666666666e-06, | |
| "loss": 0.9859, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.7299270072992701, | |
| "eval/acc": 39.53488540649414, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.7299270072992701, | |
| "eval_loss": 2.6650607585906982, | |
| "eval_runtime": 0.552, | |
| "eval_samples_per_second": 77.897, | |
| "eval_steps_per_second": 1.812, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.7312304483837331, | |
| "grad_norm": 9.132323265075684, | |
| "learning_rate": 8.618222222222223e-06, | |
| "loss": 1.0289, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 0.7325338894681961, | |
| "grad_norm": 8.751036643981934, | |
| "learning_rate": 8.61377777777778e-06, | |
| "loss": 1.0203, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 0.733837330552659, | |
| "grad_norm": 8.791621208190918, | |
| "learning_rate": 8.609333333333334e-06, | |
| "loss": 0.9776, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 0.735140771637122, | |
| "grad_norm": 7.251979827880859, | |
| "learning_rate": 8.604888888888889e-06, | |
| "loss": 0.9381, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 0.736444212721585, | |
| "grad_norm": 9.67597484588623, | |
| "learning_rate": 8.600444444444444e-06, | |
| "loss": 1.1627, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.737747653806048, | |
| "grad_norm": 9.147563934326172, | |
| "learning_rate": 8.596e-06, | |
| "loss": 1.0479, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 0.7390510948905109, | |
| "grad_norm": 11.876523971557617, | |
| "learning_rate": 8.591555555555557e-06, | |
| "loss": 1.0011, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 0.7403545359749739, | |
| "grad_norm": 9.589251518249512, | |
| "learning_rate": 8.587111111111112e-06, | |
| "loss": 1.0448, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 0.7416579770594369, | |
| "grad_norm": 10.050942420959473, | |
| "learning_rate": 8.582666666666667e-06, | |
| "loss": 1.0001, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 0.7429614181438999, | |
| "grad_norm": 7.628367900848389, | |
| "learning_rate": 8.578222222222223e-06, | |
| "loss": 1.0111, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.7429614181438999, | |
| "eval/acc": 37.20930099487305, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.7429614181438999, | |
| "eval_loss": 2.59824800491333, | |
| "eval_runtime": 0.5524, | |
| "eval_samples_per_second": 77.842, | |
| "eval_steps_per_second": 1.81, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.7442648592283628, | |
| "grad_norm": 9.08203125, | |
| "learning_rate": 8.573777777777778e-06, | |
| "loss": 1.0043, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 0.7455683003128258, | |
| "grad_norm": 10.87893009185791, | |
| "learning_rate": 8.569333333333335e-06, | |
| "loss": 0.981, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 0.7468717413972888, | |
| "grad_norm": 9.921890258789062, | |
| "learning_rate": 8.56488888888889e-06, | |
| "loss": 1.0664, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 0.7481751824817519, | |
| "grad_norm": 8.49359130859375, | |
| "learning_rate": 8.560444444444446e-06, | |
| "loss": 1.1034, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 0.7494786235662148, | |
| "grad_norm": 8.523398399353027, | |
| "learning_rate": 8.556e-06, | |
| "loss": 0.9143, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.7507820646506778, | |
| "grad_norm": 9.348311424255371, | |
| "learning_rate": 8.551555555555556e-06, | |
| "loss": 1.011, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 0.7520855057351408, | |
| "grad_norm": 10.186081886291504, | |
| "learning_rate": 8.547111111111112e-06, | |
| "loss": 1.11, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 0.7533889468196038, | |
| "grad_norm": 8.896495819091797, | |
| "learning_rate": 8.542666666666667e-06, | |
| "loss": 1.0081, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 0.7546923879040668, | |
| "grad_norm": 8.774282455444336, | |
| "learning_rate": 8.538222222222224e-06, | |
| "loss": 1.0392, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 0.7559958289885297, | |
| "grad_norm": 10.161205291748047, | |
| "learning_rate": 8.533777777777778e-06, | |
| "loss": 1.0442, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.7559958289885297, | |
| "eval/acc": 37.20930099487305, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.7559958289885297, | |
| "eval_loss": 2.6203322410583496, | |
| "eval_runtime": 0.5612, | |
| "eval_samples_per_second": 76.627, | |
| "eval_steps_per_second": 1.782, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.7572992700729927, | |
| "grad_norm": 11.288007736206055, | |
| "learning_rate": 8.529333333333333e-06, | |
| "loss": 1.0112, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 0.7586027111574557, | |
| "grad_norm": 8.31664752960205, | |
| "learning_rate": 8.52488888888889e-06, | |
| "loss": 0.9818, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 0.7599061522419187, | |
| "grad_norm": 9.366721153259277, | |
| "learning_rate": 8.520444444444446e-06, | |
| "loss": 1.0025, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 0.7612095933263816, | |
| "grad_norm": 9.547416687011719, | |
| "learning_rate": 8.516000000000001e-06, | |
| "loss": 1.0241, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 0.7625130344108446, | |
| "grad_norm": 9.147334098815918, | |
| "learning_rate": 8.511555555555556e-06, | |
| "loss": 1.0055, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.7638164754953076, | |
| "grad_norm": 9.335505485534668, | |
| "learning_rate": 8.50711111111111e-06, | |
| "loss": 0.9671, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 0.7651199165797706, | |
| "grad_norm": 9.130939483642578, | |
| "learning_rate": 8.502666666666667e-06, | |
| "loss": 0.9679, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 0.7664233576642335, | |
| "grad_norm": 9.89399528503418, | |
| "learning_rate": 8.498222222222224e-06, | |
| "loss": 1.0736, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 0.7677267987486965, | |
| "grad_norm": 9.254303932189941, | |
| "learning_rate": 8.493777777777779e-06, | |
| "loss": 1.0552, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 0.7690302398331595, | |
| "grad_norm": 8.202300071716309, | |
| "learning_rate": 8.489333333333334e-06, | |
| "loss": 0.9509, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.7690302398331595, | |
| "eval/acc": 32.55813980102539, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.7690302398331595, | |
| "eval_loss": 2.7594034671783447, | |
| "eval_runtime": 0.5559, | |
| "eval_samples_per_second": 77.347, | |
| "eval_steps_per_second": 1.799, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.7703336809176226, | |
| "grad_norm": 7.404378890991211, | |
| "learning_rate": 8.484888888888888e-06, | |
| "loss": 1.0923, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 0.7716371220020855, | |
| "grad_norm": 9.642816543579102, | |
| "learning_rate": 8.480444444444445e-06, | |
| "loss": 0.967, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 0.7729405630865485, | |
| "grad_norm": 10.284126281738281, | |
| "learning_rate": 8.476000000000002e-06, | |
| "loss": 0.9209, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 0.7742440041710115, | |
| "grad_norm": 9.019957542419434, | |
| "learning_rate": 8.471555555555556e-06, | |
| "loss": 1.0556, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 0.7755474452554745, | |
| "grad_norm": 10.67587661743164, | |
| "learning_rate": 8.467111111111111e-06, | |
| "loss": 1.0267, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.7768508863399375, | |
| "grad_norm": 9.622302055358887, | |
| "learning_rate": 8.462666666666666e-06, | |
| "loss": 1.0583, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 0.7781543274244004, | |
| "grad_norm": 8.635871887207031, | |
| "learning_rate": 8.458222222222223e-06, | |
| "loss": 0.9977, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 0.7794577685088634, | |
| "grad_norm": 9.332335472106934, | |
| "learning_rate": 8.453777777777779e-06, | |
| "loss": 0.9918, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 0.7807612095933264, | |
| "grad_norm": 8.821697235107422, | |
| "learning_rate": 8.449333333333334e-06, | |
| "loss": 1.0279, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 0.7820646506777894, | |
| "grad_norm": 7.9288763999938965, | |
| "learning_rate": 8.444888888888889e-06, | |
| "loss": 0.9941, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7820646506777894, | |
| "eval/acc": 41.86046600341797, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7820646506777894, | |
| "eval_loss": 2.6315321922302246, | |
| "eval_runtime": 0.5518, | |
| "eval_samples_per_second": 77.927, | |
| "eval_steps_per_second": 1.812, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7833680917622523, | |
| "grad_norm": 8.8925199508667, | |
| "learning_rate": 8.440444444444445e-06, | |
| "loss": 0.9833, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 0.7846715328467153, | |
| "grad_norm": 10.178521156311035, | |
| "learning_rate": 8.436e-06, | |
| "loss": 1.0427, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 0.7859749739311783, | |
| "grad_norm": 8.328222274780273, | |
| "learning_rate": 8.431555555555557e-06, | |
| "loss": 0.9223, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 0.7872784150156413, | |
| "grad_norm": 7.066805839538574, | |
| "learning_rate": 8.427111111111112e-06, | |
| "loss": 1.0382, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 0.7885818561001042, | |
| "grad_norm": 10.042593955993652, | |
| "learning_rate": 8.422666666666668e-06, | |
| "loss": 1.0121, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.7898852971845672, | |
| "grad_norm": 10.048341751098633, | |
| "learning_rate": 8.418222222222223e-06, | |
| "loss": 1.0191, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 0.7911887382690302, | |
| "grad_norm": 9.812093734741211, | |
| "learning_rate": 8.413777777777778e-06, | |
| "loss": 1.0178, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 0.7924921793534933, | |
| "grad_norm": 8.843996047973633, | |
| "learning_rate": 8.409333333333334e-06, | |
| "loss": 1.0332, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 0.7937956204379562, | |
| "grad_norm": 9.375243186950684, | |
| "learning_rate": 8.40488888888889e-06, | |
| "loss": 0.97, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 0.7950990615224192, | |
| "grad_norm": 9.695100784301758, | |
| "learning_rate": 8.400444444444446e-06, | |
| "loss": 0.9974, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.7950990615224192, | |
| "eval/acc": 34.88372039794922, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.7950990615224192, | |
| "eval_loss": 2.6288933753967285, | |
| "eval_runtime": 0.5581, | |
| "eval_samples_per_second": 77.053, | |
| "eval_steps_per_second": 1.792, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.7964025026068822, | |
| "grad_norm": 10.059000015258789, | |
| "learning_rate": 8.396e-06, | |
| "loss": 0.9742, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 0.7977059436913452, | |
| "grad_norm": 9.15655517578125, | |
| "learning_rate": 8.391555555555555e-06, | |
| "loss": 1.0031, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 0.7990093847758082, | |
| "grad_norm": 10.105188369750977, | |
| "learning_rate": 8.387111111111112e-06, | |
| "loss": 0.9663, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 0.8003128258602711, | |
| "grad_norm": 8.860940933227539, | |
| "learning_rate": 8.382666666666669e-06, | |
| "loss": 0.9867, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 0.8016162669447341, | |
| "grad_norm": 9.332911491394043, | |
| "learning_rate": 8.378222222222223e-06, | |
| "loss": 0.9407, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.8029197080291971, | |
| "grad_norm": 9.756667137145996, | |
| "learning_rate": 8.373777777777778e-06, | |
| "loss": 1.0216, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 0.8042231491136601, | |
| "grad_norm": 9.770795822143555, | |
| "learning_rate": 8.369333333333333e-06, | |
| "loss": 1.0218, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 0.805526590198123, | |
| "grad_norm": 9.790980339050293, | |
| "learning_rate": 8.36488888888889e-06, | |
| "loss": 1.0977, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 0.806830031282586, | |
| "grad_norm": 9.125996589660645, | |
| "learning_rate": 8.360444444444446e-06, | |
| "loss": 1.0185, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 0.808133472367049, | |
| "grad_norm": 8.784990310668945, | |
| "learning_rate": 8.356000000000001e-06, | |
| "loss": 0.9859, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.808133472367049, | |
| "eval/acc": 37.20930099487305, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.808133472367049, | |
| "eval_loss": 2.6269452571868896, | |
| "eval_runtime": 0.5551, | |
| "eval_samples_per_second": 77.468, | |
| "eval_steps_per_second": 1.802, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.809436913451512, | |
| "grad_norm": 10.93716049194336, | |
| "learning_rate": 8.351555555555556e-06, | |
| "loss": 1.0452, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 0.8107403545359749, | |
| "grad_norm": 8.550542831420898, | |
| "learning_rate": 8.34711111111111e-06, | |
| "loss": 1.0022, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 0.8120437956204379, | |
| "grad_norm": 9.354156494140625, | |
| "learning_rate": 8.342666666666667e-06, | |
| "loss": 0.9901, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 0.8133472367049009, | |
| "grad_norm": 11.288535118103027, | |
| "learning_rate": 8.338222222222224e-06, | |
| "loss": 1.0463, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 0.814650677789364, | |
| "grad_norm": 8.247468948364258, | |
| "learning_rate": 8.333777777777779e-06, | |
| "loss": 1.0336, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.8159541188738269, | |
| "grad_norm": 9.26513385772705, | |
| "learning_rate": 8.329333333333333e-06, | |
| "loss": 1.0107, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 0.8172575599582899, | |
| "grad_norm": 8.089369773864746, | |
| "learning_rate": 8.324888888888888e-06, | |
| "loss": 0.9727, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 0.8185610010427529, | |
| "grad_norm": 8.736642837524414, | |
| "learning_rate": 8.320444444444445e-06, | |
| "loss": 1.0667, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 0.8198644421272159, | |
| "grad_norm": 8.858651161193848, | |
| "learning_rate": 8.316000000000001e-06, | |
| "loss": 1.081, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 0.8211678832116789, | |
| "grad_norm": 8.538246154785156, | |
| "learning_rate": 8.311555555555556e-06, | |
| "loss": 0.9886, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.8211678832116789, | |
| "eval/acc": 39.53488540649414, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.8211678832116789, | |
| "eval_loss": 2.624267101287842, | |
| "eval_runtime": 0.5516, | |
| "eval_samples_per_second": 77.959, | |
| "eval_steps_per_second": 1.813, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.8224713242961418, | |
| "grad_norm": 9.205613136291504, | |
| "learning_rate": 8.307111111111111e-06, | |
| "loss": 0.9988, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 0.8237747653806048, | |
| "grad_norm": 8.61228084564209, | |
| "learning_rate": 8.302666666666668e-06, | |
| "loss": 1.0361, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 0.8250782064650678, | |
| "grad_norm": 9.030414581298828, | |
| "learning_rate": 8.298222222222222e-06, | |
| "loss": 1.0017, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 0.8263816475495308, | |
| "grad_norm": 7.929698944091797, | |
| "learning_rate": 8.293777777777779e-06, | |
| "loss": 1.0242, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 0.8276850886339937, | |
| "grad_norm": 10.961642265319824, | |
| "learning_rate": 8.289333333333334e-06, | |
| "loss": 1.033, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.8289885297184567, | |
| "grad_norm": 9.38997745513916, | |
| "learning_rate": 8.28488888888889e-06, | |
| "loss": 1.0846, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 0.8302919708029197, | |
| "grad_norm": 8.70460033416748, | |
| "learning_rate": 8.280444444444445e-06, | |
| "loss": 0.9333, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 0.8315954118873827, | |
| "grad_norm": 9.825383186340332, | |
| "learning_rate": 8.276e-06, | |
| "loss": 1.0019, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 0.8328988529718456, | |
| "grad_norm": 8.25622272491455, | |
| "learning_rate": 8.271555555555557e-06, | |
| "loss": 0.977, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 0.8342022940563086, | |
| "grad_norm": 10.422846794128418, | |
| "learning_rate": 8.267111111111111e-06, | |
| "loss": 1.027, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.8342022940563086, | |
| "eval/acc": 37.20930099487305, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.8342022940563086, | |
| "eval_loss": 2.673701763153076, | |
| "eval_runtime": 0.5557, | |
| "eval_samples_per_second": 77.383, | |
| "eval_steps_per_second": 1.8, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.8355057351407716, | |
| "grad_norm": 9.53593921661377, | |
| "learning_rate": 8.262666666666668e-06, | |
| "loss": 1.0361, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 0.8368091762252347, | |
| "grad_norm": 9.09162425994873, | |
| "learning_rate": 8.258222222222223e-06, | |
| "loss": 1.0367, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 0.8381126173096975, | |
| "grad_norm": 9.458831787109375, | |
| "learning_rate": 8.253777777777778e-06, | |
| "loss": 1.0165, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 0.8394160583941606, | |
| "grad_norm": 9.845352172851562, | |
| "learning_rate": 8.249333333333334e-06, | |
| "loss": 0.9662, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 0.8407194994786236, | |
| "grad_norm": 7.90129280090332, | |
| "learning_rate": 8.24488888888889e-06, | |
| "loss": 0.9909, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.8420229405630866, | |
| "grad_norm": 8.902530670166016, | |
| "learning_rate": 8.240444444444446e-06, | |
| "loss": 1.0187, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 0.8433263816475496, | |
| "grad_norm": 8.841060638427734, | |
| "learning_rate": 8.236e-06, | |
| "loss": 0.9962, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 0.8446298227320125, | |
| "grad_norm": 8.994577407836914, | |
| "learning_rate": 8.231555555555555e-06, | |
| "loss": 1.0147, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 0.8459332638164755, | |
| "grad_norm": 8.313756942749023, | |
| "learning_rate": 8.227111111111112e-06, | |
| "loss": 0.8592, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 0.8472367049009385, | |
| "grad_norm": 9.097774505615234, | |
| "learning_rate": 8.222666666666668e-06, | |
| "loss": 1.071, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8472367049009385, | |
| "eval/acc": 37.20930099487305, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8472367049009385, | |
| "eval_loss": 2.67804217338562, | |
| "eval_runtime": 0.5526, | |
| "eval_samples_per_second": 77.809, | |
| "eval_steps_per_second": 1.81, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8485401459854015, | |
| "grad_norm": 8.86130428314209, | |
| "learning_rate": 8.218222222222223e-06, | |
| "loss": 1.0751, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 0.8498435870698644, | |
| "grad_norm": 10.352911949157715, | |
| "learning_rate": 8.213777777777778e-06, | |
| "loss": 1.0069, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 0.8511470281543274, | |
| "grad_norm": 9.668673515319824, | |
| "learning_rate": 8.209333333333333e-06, | |
| "loss": 0.9834, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 0.8524504692387904, | |
| "grad_norm": 10.304662704467773, | |
| "learning_rate": 8.20488888888889e-06, | |
| "loss": 1.0477, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 0.8537539103232534, | |
| "grad_norm": 8.507194519042969, | |
| "learning_rate": 8.200444444444446e-06, | |
| "loss": 1.0732, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.8550573514077163, | |
| "grad_norm": 8.823519706726074, | |
| "learning_rate": 8.196e-06, | |
| "loss": 1.1194, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 0.8563607924921793, | |
| "grad_norm": 11.177069664001465, | |
| "learning_rate": 8.191555555555556e-06, | |
| "loss": 0.9341, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 0.8576642335766423, | |
| "grad_norm": 8.849434852600098, | |
| "learning_rate": 8.18711111111111e-06, | |
| "loss": 1.0493, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 0.8589676746611054, | |
| "grad_norm": 8.759775161743164, | |
| "learning_rate": 8.182666666666667e-06, | |
| "loss": 0.9479, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 0.8602711157455682, | |
| "grad_norm": 9.2578706741333, | |
| "learning_rate": 8.178222222222224e-06, | |
| "loss": 1.0334, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.8602711157455682, | |
| "eval/acc": 34.88372039794922, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.8602711157455682, | |
| "eval_loss": 2.6864330768585205, | |
| "eval_runtime": 0.5529, | |
| "eval_samples_per_second": 77.777, | |
| "eval_steps_per_second": 1.809, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.8615745568300313, | |
| "grad_norm": 8.463264465332031, | |
| "learning_rate": 8.173777777777778e-06, | |
| "loss": 0.9916, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 0.8628779979144943, | |
| "grad_norm": 9.883336067199707, | |
| "learning_rate": 8.169333333333333e-06, | |
| "loss": 0.9955, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 0.8641814389989573, | |
| "grad_norm": 8.82459545135498, | |
| "learning_rate": 8.16488888888889e-06, | |
| "loss": 1.0315, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 0.8654848800834203, | |
| "grad_norm": 10.618040084838867, | |
| "learning_rate": 8.160444444444445e-06, | |
| "loss": 0.9399, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 0.8667883211678832, | |
| "grad_norm": 11.280345916748047, | |
| "learning_rate": 8.156000000000001e-06, | |
| "loss": 1.1035, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.8680917622523462, | |
| "grad_norm": 8.534235000610352, | |
| "learning_rate": 8.151555555555556e-06, | |
| "loss": 0.9105, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 0.8693952033368092, | |
| "grad_norm": 9.337313652038574, | |
| "learning_rate": 8.147111111111113e-06, | |
| "loss": 1.0439, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 0.8706986444212722, | |
| "grad_norm": 8.120159149169922, | |
| "learning_rate": 8.142666666666667e-06, | |
| "loss": 0.8715, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 0.8720020855057351, | |
| "grad_norm": 9.389538764953613, | |
| "learning_rate": 8.138222222222222e-06, | |
| "loss": 1.0232, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 0.8733055265901981, | |
| "grad_norm": 9.953109741210938, | |
| "learning_rate": 8.133777777777779e-06, | |
| "loss": 1.0433, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.8733055265901981, | |
| "eval/acc": 32.55813980102539, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.8733055265901981, | |
| "eval_loss": 2.6176934242248535, | |
| "eval_runtime": 0.5514, | |
| "eval_samples_per_second": 77.99, | |
| "eval_steps_per_second": 1.814, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.8746089676746611, | |
| "grad_norm": 8.203495979309082, | |
| "learning_rate": 8.129333333333334e-06, | |
| "loss": 1.0338, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 0.8759124087591241, | |
| "grad_norm": 9.573360443115234, | |
| "learning_rate": 8.12488888888889e-06, | |
| "loss": 0.9788, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 0.877215849843587, | |
| "grad_norm": 9.354838371276855, | |
| "learning_rate": 8.120444444444445e-06, | |
| "loss": 0.9955, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 0.87851929092805, | |
| "grad_norm": 9.718021392822266, | |
| "learning_rate": 8.116e-06, | |
| "loss": 1.1349, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 0.879822732012513, | |
| "grad_norm": 8.48845386505127, | |
| "learning_rate": 8.111555555555556e-06, | |
| "loss": 1.0507, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.881126173096976, | |
| "grad_norm": 9.579551696777344, | |
| "learning_rate": 8.107111111111113e-06, | |
| "loss": 0.9856, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 0.882429614181439, | |
| "grad_norm": 9.837549209594727, | |
| "learning_rate": 8.102666666666668e-06, | |
| "loss": 1.0175, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 0.883733055265902, | |
| "grad_norm": 8.538361549377441, | |
| "learning_rate": 8.098222222222223e-06, | |
| "loss": 1.0427, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 0.885036496350365, | |
| "grad_norm": 10.268227577209473, | |
| "learning_rate": 8.093777777777777e-06, | |
| "loss": 0.9472, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 0.886339937434828, | |
| "grad_norm": 10.689719200134277, | |
| "learning_rate": 8.089333333333334e-06, | |
| "loss": 1.0209, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.886339937434828, | |
| "eval/acc": 34.88372039794922, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.886339937434828, | |
| "eval_loss": 2.6165432929992676, | |
| "eval_runtime": 0.5568, | |
| "eval_samples_per_second": 77.234, | |
| "eval_steps_per_second": 1.796, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.887643378519291, | |
| "grad_norm": 9.130402565002441, | |
| "learning_rate": 8.08488888888889e-06, | |
| "loss": 1.01, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 0.8889468196037539, | |
| "grad_norm": 8.528085708618164, | |
| "learning_rate": 8.080444444444445e-06, | |
| "loss": 0.9943, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 0.8902502606882169, | |
| "grad_norm": 10.689129829406738, | |
| "learning_rate": 8.076e-06, | |
| "loss": 1.0258, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 0.8915537017726799, | |
| "grad_norm": 9.432913780212402, | |
| "learning_rate": 8.071555555555555e-06, | |
| "loss": 1.0207, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 0.8928571428571429, | |
| "grad_norm": 8.51905632019043, | |
| "learning_rate": 8.067111111111112e-06, | |
| "loss": 1.0492, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.8941605839416058, | |
| "grad_norm": 9.575328826904297, | |
| "learning_rate": 8.062666666666668e-06, | |
| "loss": 0.919, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 0.8954640250260688, | |
| "grad_norm": 9.271153450012207, | |
| "learning_rate": 8.058222222222223e-06, | |
| "loss": 0.9525, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 0.8967674661105318, | |
| "grad_norm": 9.966239929199219, | |
| "learning_rate": 8.053777777777778e-06, | |
| "loss": 0.9551, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 0.8980709071949948, | |
| "grad_norm": 8.789650917053223, | |
| "learning_rate": 8.049333333333333e-06, | |
| "loss": 0.9712, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 0.8993743482794577, | |
| "grad_norm": 10.292884826660156, | |
| "learning_rate": 8.04488888888889e-06, | |
| "loss": 0.9979, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.8993743482794577, | |
| "eval/acc": 32.55813980102539, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.8993743482794577, | |
| "eval_loss": 2.6249427795410156, | |
| "eval_runtime": 0.5525, | |
| "eval_samples_per_second": 77.821, | |
| "eval_steps_per_second": 1.81, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.9006777893639207, | |
| "grad_norm": 9.126242637634277, | |
| "learning_rate": 8.040444444444446e-06, | |
| "loss": 1.0284, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 0.9019812304483837, | |
| "grad_norm": 8.590139389038086, | |
| "learning_rate": 8.036e-06, | |
| "loss": 0.9972, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 0.9032846715328468, | |
| "grad_norm": 8.998208999633789, | |
| "learning_rate": 8.031555555555555e-06, | |
| "loss": 0.9327, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 0.9045881126173096, | |
| "grad_norm": 9.858304977416992, | |
| "learning_rate": 8.027111111111112e-06, | |
| "loss": 0.9598, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 0.9058915537017727, | |
| "grad_norm": 9.873553276062012, | |
| "learning_rate": 8.022666666666667e-06, | |
| "loss": 1.013, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.9071949947862357, | |
| "grad_norm": 7.501348972320557, | |
| "learning_rate": 8.018222222222223e-06, | |
| "loss": 1.0439, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 0.9084984358706987, | |
| "grad_norm": 9.33034610748291, | |
| "learning_rate": 8.013777777777778e-06, | |
| "loss": 1.0219, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 0.9098018769551617, | |
| "grad_norm": 8.672886848449707, | |
| "learning_rate": 8.009333333333335e-06, | |
| "loss": 0.9912, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 0.9111053180396246, | |
| "grad_norm": 9.21350383758545, | |
| "learning_rate": 8.00488888888889e-06, | |
| "loss": 0.9569, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 0.9124087591240876, | |
| "grad_norm": 8.252593994140625, | |
| "learning_rate": 8.000444444444444e-06, | |
| "loss": 0.9535, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9124087591240876, | |
| "eval/acc": 30.23255729675293, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9124087591240876, | |
| "eval_loss": 2.6694142818450928, | |
| "eval_runtime": 0.5555, | |
| "eval_samples_per_second": 77.407, | |
| "eval_steps_per_second": 1.8, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9137122002085506, | |
| "grad_norm": 9.754995346069336, | |
| "learning_rate": 7.996000000000001e-06, | |
| "loss": 0.9894, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 0.9150156412930136, | |
| "grad_norm": 9.45035171508789, | |
| "learning_rate": 7.991555555555556e-06, | |
| "loss": 0.968, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 0.9163190823774765, | |
| "grad_norm": 10.195062637329102, | |
| "learning_rate": 7.987111111111112e-06, | |
| "loss": 1.0784, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 0.9176225234619395, | |
| "grad_norm": 9.188143730163574, | |
| "learning_rate": 7.982666666666667e-06, | |
| "loss": 0.9223, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 0.9189259645464025, | |
| "grad_norm": 10.677811622619629, | |
| "learning_rate": 7.978222222222222e-06, | |
| "loss": 0.9489, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.9202294056308655, | |
| "grad_norm": 8.982565879821777, | |
| "learning_rate": 7.973777777777779e-06, | |
| "loss": 0.9646, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 0.9215328467153284, | |
| "grad_norm": 9.741477012634277, | |
| "learning_rate": 7.969333333333335e-06, | |
| "loss": 0.9839, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 0.9228362877997914, | |
| "grad_norm": 9.745488166809082, | |
| "learning_rate": 7.96488888888889e-06, | |
| "loss": 1.0026, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 0.9241397288842544, | |
| "grad_norm": 9.319021224975586, | |
| "learning_rate": 7.960444444444445e-06, | |
| "loss": 0.9635, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 0.9254431699687174, | |
| "grad_norm": 9.34664249420166, | |
| "learning_rate": 7.956e-06, | |
| "loss": 0.9563, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.9254431699687174, | |
| "eval/acc": 34.88372039794922, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.9254431699687174, | |
| "eval_loss": 2.6430439949035645, | |
| "eval_runtime": 0.5521, | |
| "eval_samples_per_second": 77.886, | |
| "eval_steps_per_second": 1.811, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.9267466110531803, | |
| "grad_norm": 10.529402732849121, | |
| "learning_rate": 7.951555555555556e-06, | |
| "loss": 0.9977, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 0.9280500521376434, | |
| "grad_norm": 9.836724281311035, | |
| "learning_rate": 7.947111111111113e-06, | |
| "loss": 0.9703, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 0.9293534932221064, | |
| "grad_norm": 10.40678596496582, | |
| "learning_rate": 7.942666666666668e-06, | |
| "loss": 0.9726, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 0.9306569343065694, | |
| "grad_norm": 8.559309959411621, | |
| "learning_rate": 7.938222222222222e-06, | |
| "loss": 0.9991, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 0.9319603753910324, | |
| "grad_norm": 8.88638973236084, | |
| "learning_rate": 7.933777777777777e-06, | |
| "loss": 1.0273, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.9332638164754953, | |
| "grad_norm": 8.549494743347168, | |
| "learning_rate": 7.929333333333334e-06, | |
| "loss": 0.9444, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 0.9345672575599583, | |
| "grad_norm": 8.719738960266113, | |
| "learning_rate": 7.92488888888889e-06, | |
| "loss": 0.9725, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 0.9358706986444213, | |
| "grad_norm": 8.908008575439453, | |
| "learning_rate": 7.920444444444445e-06, | |
| "loss": 0.9216, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 0.9371741397288843, | |
| "grad_norm": 8.950148582458496, | |
| "learning_rate": 7.916e-06, | |
| "loss": 1.0316, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 0.9384775808133472, | |
| "grad_norm": 8.901386260986328, | |
| "learning_rate": 7.911555555555555e-06, | |
| "loss": 1.0231, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.9384775808133472, | |
| "eval/acc": 34.88372039794922, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.9384775808133472, | |
| "eval_loss": 2.628709316253662, | |
| "eval_runtime": 0.5528, | |
| "eval_samples_per_second": 77.786, | |
| "eval_steps_per_second": 1.809, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.9397810218978102, | |
| "grad_norm": 8.983177185058594, | |
| "learning_rate": 7.907111111111111e-06, | |
| "loss": 1.0309, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 0.9410844629822732, | |
| "grad_norm": 10.097892761230469, | |
| "learning_rate": 7.902666666666668e-06, | |
| "loss": 1.0109, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 0.9423879040667362, | |
| "grad_norm": 8.999421119689941, | |
| "learning_rate": 7.898222222222223e-06, | |
| "loss": 0.9416, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 0.9436913451511991, | |
| "grad_norm": 9.044880867004395, | |
| "learning_rate": 7.893777777777778e-06, | |
| "loss": 0.9678, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 0.9449947862356621, | |
| "grad_norm": 9.498455047607422, | |
| "learning_rate": 7.889333333333334e-06, | |
| "loss": 1.0159, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.9462982273201251, | |
| "grad_norm": 8.960881233215332, | |
| "learning_rate": 7.884888888888889e-06, | |
| "loss": 1.0648, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 0.9476016684045881, | |
| "grad_norm": 8.187444686889648, | |
| "learning_rate": 7.880444444444446e-06, | |
| "loss": 1.0935, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 0.948905109489051, | |
| "grad_norm": 8.623115539550781, | |
| "learning_rate": 7.876e-06, | |
| "loss": 1.0459, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 0.950208550573514, | |
| "grad_norm": 8.242715835571289, | |
| "learning_rate": 7.871555555555557e-06, | |
| "loss": 0.9429, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 0.9515119916579771, | |
| "grad_norm": 8.49636459350586, | |
| "learning_rate": 7.867111111111112e-06, | |
| "loss": 0.8846, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.9515119916579771, | |
| "eval/acc": 37.20930099487305, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.9515119916579771, | |
| "eval_loss": 2.692638397216797, | |
| "eval_runtime": 0.5526, | |
| "eval_samples_per_second": 77.807, | |
| "eval_steps_per_second": 1.809, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.9528154327424401, | |
| "grad_norm": 8.402484893798828, | |
| "learning_rate": 7.862666666666667e-06, | |
| "loss": 0.9219, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 0.954118873826903, | |
| "grad_norm": 9.127314567565918, | |
| "learning_rate": 7.858222222222223e-06, | |
| "loss": 0.9732, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 0.955422314911366, | |
| "grad_norm": 8.822267532348633, | |
| "learning_rate": 7.853777777777778e-06, | |
| "loss": 1.0115, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 0.956725755995829, | |
| "grad_norm": 9.45130729675293, | |
| "learning_rate": 7.849333333333335e-06, | |
| "loss": 0.9701, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 0.958029197080292, | |
| "grad_norm": 9.352774620056152, | |
| "learning_rate": 7.84488888888889e-06, | |
| "loss": 1.0008, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.959332638164755, | |
| "grad_norm": 8.283885955810547, | |
| "learning_rate": 7.840444444444444e-06, | |
| "loss": 1.0424, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 0.9606360792492179, | |
| "grad_norm": 8.821931838989258, | |
| "learning_rate": 7.836000000000001e-06, | |
| "loss": 0.9506, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 0.9619395203336809, | |
| "grad_norm": 10.744322776794434, | |
| "learning_rate": 7.831555555555557e-06, | |
| "loss": 1.0553, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 0.9632429614181439, | |
| "grad_norm": 8.165914535522461, | |
| "learning_rate": 7.827111111111112e-06, | |
| "loss": 0.9444, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 0.9645464025026069, | |
| "grad_norm": 9.222362518310547, | |
| "learning_rate": 7.822666666666667e-06, | |
| "loss": 1.0401, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.9645464025026069, | |
| "eval/acc": 30.23255729675293, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.9645464025026069, | |
| "eval_loss": 2.5550081729888916, | |
| "eval_runtime": 0.5536, | |
| "eval_samples_per_second": 77.675, | |
| "eval_steps_per_second": 1.806, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.9658498435870698, | |
| "grad_norm": 7.525315284729004, | |
| "learning_rate": 7.818222222222222e-06, | |
| "loss": 1.0211, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 0.9671532846715328, | |
| "grad_norm": 10.365772247314453, | |
| "learning_rate": 7.813777777777778e-06, | |
| "loss": 1.0113, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 0.9684567257559958, | |
| "grad_norm": 9.825669288635254, | |
| "learning_rate": 7.809333333333335e-06, | |
| "loss": 0.9951, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 0.9697601668404588, | |
| "grad_norm": 9.201279640197754, | |
| "learning_rate": 7.80488888888889e-06, | |
| "loss": 0.9878, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 0.9710636079249217, | |
| "grad_norm": 9.135232925415039, | |
| "learning_rate": 7.800444444444445e-06, | |
| "loss": 0.9692, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.9723670490093848, | |
| "grad_norm": 10.133102416992188, | |
| "learning_rate": 7.796e-06, | |
| "loss": 0.9488, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 0.9736704900938478, | |
| "grad_norm": 8.100567817687988, | |
| "learning_rate": 7.791555555555556e-06, | |
| "loss": 0.9558, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 0.9749739311783108, | |
| "grad_norm": 8.011831283569336, | |
| "learning_rate": 7.787111111111113e-06, | |
| "loss": 0.9695, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 0.9762773722627737, | |
| "grad_norm": 8.650604248046875, | |
| "learning_rate": 7.782666666666667e-06, | |
| "loss": 1.0838, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 0.9775808133472367, | |
| "grad_norm": 9.186234474182129, | |
| "learning_rate": 7.778222222222222e-06, | |
| "loss": 0.9779, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.9775808133472367, | |
| "eval/acc": 34.88372039794922, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.9775808133472367, | |
| "eval_loss": 2.6376609802246094, | |
| "eval_runtime": 0.5509, | |
| "eval_samples_per_second": 78.048, | |
| "eval_steps_per_second": 1.815, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.9788842544316997, | |
| "grad_norm": 9.186497688293457, | |
| "learning_rate": 7.773777777777777e-06, | |
| "loss": 0.9524, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 0.9801876955161627, | |
| "grad_norm": 8.840336799621582, | |
| "learning_rate": 7.769333333333334e-06, | |
| "loss": 0.9805, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 0.9814911366006257, | |
| "grad_norm": 9.51953411102295, | |
| "learning_rate": 7.76488888888889e-06, | |
| "loss": 0.9898, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 0.9827945776850886, | |
| "grad_norm": 9.390219688415527, | |
| "learning_rate": 7.760444444444445e-06, | |
| "loss": 1.0244, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 0.9840980187695516, | |
| "grad_norm": 8.21574592590332, | |
| "learning_rate": 7.756e-06, | |
| "loss": 0.9465, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.9854014598540146, | |
| "grad_norm": 9.998575210571289, | |
| "learning_rate": 7.751555555555556e-06, | |
| "loss": 1.0289, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 0.9867049009384776, | |
| "grad_norm": 9.237029075622559, | |
| "learning_rate": 7.747111111111111e-06, | |
| "loss": 0.9701, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 0.9880083420229405, | |
| "grad_norm": 11.330434799194336, | |
| "learning_rate": 7.742666666666668e-06, | |
| "loss": 1.0181, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 0.9893117831074035, | |
| "grad_norm": 7.958991050720215, | |
| "learning_rate": 7.738222222222223e-06, | |
| "loss": 1.0152, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 0.9906152241918665, | |
| "grad_norm": 8.143290519714355, | |
| "learning_rate": 7.73377777777778e-06, | |
| "loss": 1.0054, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.9906152241918665, | |
| "eval/acc": 34.88372039794922, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.9906152241918665, | |
| "eval_loss": 2.6791605949401855, | |
| "eval_runtime": 0.553, | |
| "eval_samples_per_second": 77.752, | |
| "eval_steps_per_second": 1.808, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.9919186652763295, | |
| "grad_norm": 9.640328407287598, | |
| "learning_rate": 7.729333333333334e-06, | |
| "loss": 1.0106, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 0.9932221063607924, | |
| "grad_norm": 7.355710029602051, | |
| "learning_rate": 7.724888888888889e-06, | |
| "loss": 0.9055, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 0.9945255474452555, | |
| "grad_norm": 9.306617736816406, | |
| "learning_rate": 7.720444444444445e-06, | |
| "loss": 1.0173, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 0.9958289885297185, | |
| "grad_norm": 9.533848762512207, | |
| "learning_rate": 7.716e-06, | |
| "loss": 0.9594, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 0.9971324296141815, | |
| "grad_norm": 8.485058784484863, | |
| "learning_rate": 7.711555555555557e-06, | |
| "loss": 0.9638, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.9984358706986444, | |
| "grad_norm": 9.320188522338867, | |
| "learning_rate": 7.707111111111112e-06, | |
| "loss": 1.0431, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 0.9997393117831074, | |
| "grad_norm": 9.840314865112305, | |
| "learning_rate": 7.702666666666667e-06, | |
| "loss": 0.9663, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 1.0010427528675705, | |
| "grad_norm": 10.212252616882324, | |
| "learning_rate": 7.698222222222223e-06, | |
| "loss": 0.8872, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 1.0023461939520333, | |
| "grad_norm": 10.817450523376465, | |
| "learning_rate": 7.69377777777778e-06, | |
| "loss": 0.8731, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 1.0036496350364963, | |
| "grad_norm": 9.630830764770508, | |
| "learning_rate": 7.689333333333334e-06, | |
| "loss": 0.9175, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.0036496350364963, | |
| "eval/acc": 41.86046600341797, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.0036496350364963, | |
| "eval_loss": 2.9220545291900635, | |
| "eval_runtime": 0.5716, | |
| "eval_samples_per_second": 75.233, | |
| "eval_steps_per_second": 1.75, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.0049530761209593, | |
| "grad_norm": 8.57374095916748, | |
| "learning_rate": 7.68488888888889e-06, | |
| "loss": 0.9758, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 1.0062565172054223, | |
| "grad_norm": 9.157445907592773, | |
| "learning_rate": 7.680444444444444e-06, | |
| "loss": 0.8355, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 1.0075599582898853, | |
| "grad_norm": 9.53898811340332, | |
| "learning_rate": 7.676e-06, | |
| "loss": 0.8846, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 1.0088633993743483, | |
| "grad_norm": 11.01609992980957, | |
| "learning_rate": 7.671555555555557e-06, | |
| "loss": 0.9107, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 1.0101668404588113, | |
| "grad_norm": 8.74721622467041, | |
| "learning_rate": 7.667111111111112e-06, | |
| "loss": 0.8947, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.0114702815432743, | |
| "grad_norm": 10.709732055664062, | |
| "learning_rate": 7.662666666666667e-06, | |
| "loss": 0.96, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 1.0127737226277371, | |
| "grad_norm": 9.86235523223877, | |
| "learning_rate": 7.658222222222222e-06, | |
| "loss": 0.9517, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 1.0140771637122001, | |
| "grad_norm": 9.738570213317871, | |
| "learning_rate": 7.653777777777778e-06, | |
| "loss": 0.9245, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 1.0153806047966631, | |
| "grad_norm": 8.564919471740723, | |
| "learning_rate": 7.649333333333335e-06, | |
| "loss": 0.8769, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 1.0166840458811262, | |
| "grad_norm": 10.096901893615723, | |
| "learning_rate": 7.64488888888889e-06, | |
| "loss": 0.9472, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.0166840458811262, | |
| "eval/acc": 39.53488540649414, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.0166840458811262, | |
| "eval_loss": 2.8397738933563232, | |
| "eval_runtime": 0.5698, | |
| "eval_samples_per_second": 75.459, | |
| "eval_steps_per_second": 1.755, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.0179874869655892, | |
| "grad_norm": 7.725857257843018, | |
| "learning_rate": 7.640444444444445e-06, | |
| "loss": 0.9419, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 1.0192909280500522, | |
| "grad_norm": 8.992986679077148, | |
| "learning_rate": 7.636e-06, | |
| "loss": 0.8854, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 1.0205943691345152, | |
| "grad_norm": 9.632916450500488, | |
| "learning_rate": 7.631555555555556e-06, | |
| "loss": 0.9982, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 1.0218978102189782, | |
| "grad_norm": 9.003362655639648, | |
| "learning_rate": 7.627111111111112e-06, | |
| "loss": 0.8817, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 1.023201251303441, | |
| "grad_norm": 8.941239356994629, | |
| "learning_rate": 7.622666666666667e-06, | |
| "loss": 0.9168, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.024504692387904, | |
| "grad_norm": 9.012649536132812, | |
| "learning_rate": 7.618222222222222e-06, | |
| "loss": 0.9765, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 1.025808133472367, | |
| "grad_norm": 8.48647689819336, | |
| "learning_rate": 7.613777777777779e-06, | |
| "loss": 0.9321, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 1.02711157455683, | |
| "grad_norm": 9.642101287841797, | |
| "learning_rate": 7.609333333333334e-06, | |
| "loss": 0.9204, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 1.028415015641293, | |
| "grad_norm": 7.952564239501953, | |
| "learning_rate": 7.604888888888889e-06, | |
| "loss": 0.9259, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 1.029718456725756, | |
| "grad_norm": 10.145586013793945, | |
| "learning_rate": 7.600444444444445e-06, | |
| "loss": 0.963, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.029718456725756, | |
| "eval/acc": 41.86046600341797, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.029718456725756, | |
| "eval_loss": 2.715751886367798, | |
| "eval_runtime": 0.5523, | |
| "eval_samples_per_second": 77.86, | |
| "eval_steps_per_second": 1.811, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.031021897810219, | |
| "grad_norm": 9.017576217651367, | |
| "learning_rate": 7.5960000000000015e-06, | |
| "loss": 0.8985, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 1.032325338894682, | |
| "grad_norm": 8.437040328979492, | |
| "learning_rate": 7.591555555555556e-06, | |
| "loss": 0.9481, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 1.033628779979145, | |
| "grad_norm": 8.106425285339355, | |
| "learning_rate": 7.587111111111112e-06, | |
| "loss": 0.8974, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 1.0349322210636078, | |
| "grad_norm": 7.865415573120117, | |
| "learning_rate": 7.582666666666667e-06, | |
| "loss": 0.8884, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 1.0362356621480708, | |
| "grad_norm": 11.911471366882324, | |
| "learning_rate": 7.5782222222222225e-06, | |
| "loss": 0.8795, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.0375391032325338, | |
| "grad_norm": 8.606977462768555, | |
| "learning_rate": 7.573777777777779e-06, | |
| "loss": 0.8699, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 1.0388425443169969, | |
| "grad_norm": 10.2034330368042, | |
| "learning_rate": 7.569333333333334e-06, | |
| "loss": 0.8918, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 1.0401459854014599, | |
| "grad_norm": 11.00263786315918, | |
| "learning_rate": 7.56488888888889e-06, | |
| "loss": 0.889, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 1.0414494264859229, | |
| "grad_norm": 8.472570419311523, | |
| "learning_rate": 7.5604444444444445e-06, | |
| "loss": 0.9292, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 1.0427528675703859, | |
| "grad_norm": 12.965922355651855, | |
| "learning_rate": 7.556000000000001e-06, | |
| "loss": 0.9053, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.0427528675703859, | |
| "eval/acc": 39.53488540649414, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.0427528675703859, | |
| "eval_loss": 2.7604176998138428, | |
| "eval_runtime": 0.5527, | |
| "eval_samples_per_second": 77.805, | |
| "eval_steps_per_second": 1.809, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.0440563086548489, | |
| "grad_norm": 9.576687812805176, | |
| "learning_rate": 7.551555555555557e-06, | |
| "loss": 0.952, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 1.045359749739312, | |
| "grad_norm": 8.876384735107422, | |
| "learning_rate": 7.5471111111111115e-06, | |
| "loss": 0.8395, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 1.0466631908237747, | |
| "grad_norm": 8.862028121948242, | |
| "learning_rate": 7.542666666666667e-06, | |
| "loss": 0.9816, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 1.0479666319082377, | |
| "grad_norm": 7.471101760864258, | |
| "learning_rate": 7.538222222222222e-06, | |
| "loss": 0.8824, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 1.0492700729927007, | |
| "grad_norm": 10.809747695922852, | |
| "learning_rate": 7.533777777777779e-06, | |
| "loss": 0.9463, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.0505735140771637, | |
| "grad_norm": 8.711124420166016, | |
| "learning_rate": 7.529333333333334e-06, | |
| "loss": 0.9188, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 1.0518769551616267, | |
| "grad_norm": 9.918045997619629, | |
| "learning_rate": 7.524888888888889e-06, | |
| "loss": 0.9275, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 1.0531803962460897, | |
| "grad_norm": 7.851119518280029, | |
| "learning_rate": 7.520444444444445e-06, | |
| "loss": 1.0079, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 1.0544838373305527, | |
| "grad_norm": 9.710519790649414, | |
| "learning_rate": 7.516000000000001e-06, | |
| "loss": 0.9625, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 1.0557872784150157, | |
| "grad_norm": 9.297921180725098, | |
| "learning_rate": 7.511555555555556e-06, | |
| "loss": 0.9715, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.0557872784150157, | |
| "eval/acc": 39.53488540649414, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.0557872784150157, | |
| "eval_loss": 2.6021246910095215, | |
| "eval_runtime": 0.5527, | |
| "eval_samples_per_second": 77.804, | |
| "eval_steps_per_second": 1.809, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.0570907194994785, | |
| "grad_norm": 7.866173267364502, | |
| "learning_rate": 7.507111111111112e-06, | |
| "loss": 0.9433, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 1.0583941605839415, | |
| "grad_norm": 9.581183433532715, | |
| "learning_rate": 7.502666666666667e-06, | |
| "loss": 1.0003, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 1.0596976016684045, | |
| "grad_norm": 9.06647777557373, | |
| "learning_rate": 7.4982222222222225e-06, | |
| "loss": 0.8665, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 1.0610010427528676, | |
| "grad_norm": 10.333742141723633, | |
| "learning_rate": 7.493777777777779e-06, | |
| "loss": 0.9426, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 1.0623044838373306, | |
| "grad_norm": 7.566351413726807, | |
| "learning_rate": 7.489333333333334e-06, | |
| "loss": 0.9917, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.0636079249217936, | |
| "grad_norm": 8.821928977966309, | |
| "learning_rate": 7.4848888888888895e-06, | |
| "loss": 0.8495, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 1.0649113660062566, | |
| "grad_norm": 9.295007705688477, | |
| "learning_rate": 7.480444444444444e-06, | |
| "loss": 0.8889, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 1.0662148070907196, | |
| "grad_norm": 8.136956214904785, | |
| "learning_rate": 7.476000000000001e-06, | |
| "loss": 1.0155, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 1.0675182481751824, | |
| "grad_norm": 8.968605995178223, | |
| "learning_rate": 7.471555555555557e-06, | |
| "loss": 0.8767, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 1.0688216892596454, | |
| "grad_norm": 9.742268562316895, | |
| "learning_rate": 7.4671111111111115e-06, | |
| "loss": 0.9174, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.0688216892596454, | |
| "eval/acc": 39.53488540649414, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.0688216892596454, | |
| "eval_loss": 2.713195562362671, | |
| "eval_runtime": 0.5585, | |
| "eval_samples_per_second": 76.986, | |
| "eval_steps_per_second": 1.79, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.0701251303441084, | |
| "grad_norm": 10.187495231628418, | |
| "learning_rate": 7.462666666666667e-06, | |
| "loss": 0.9243, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 8.872064590454102, | |
| "learning_rate": 7.458222222222224e-06, | |
| "loss": 0.9854, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 1.0727320125130344, | |
| "grad_norm": 8.593219757080078, | |
| "learning_rate": 7.4537777777777785e-06, | |
| "loss": 0.8947, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 1.0740354535974974, | |
| "grad_norm": 8.671141624450684, | |
| "learning_rate": 7.449333333333334e-06, | |
| "loss": 0.8982, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 1.0753388946819604, | |
| "grad_norm": 8.44241714477539, | |
| "learning_rate": 7.444888888888889e-06, | |
| "loss": 0.8571, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.0766423357664234, | |
| "grad_norm": 8.147622108459473, | |
| "learning_rate": 7.440444444444445e-06, | |
| "loss": 1.0003, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 1.0779457768508864, | |
| "grad_norm": 7.976443767547607, | |
| "learning_rate": 7.436000000000001e-06, | |
| "loss": 0.9462, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 1.0792492179353492, | |
| "grad_norm": 11.551979064941406, | |
| "learning_rate": 7.431555555555556e-06, | |
| "loss": 0.822, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 1.0805526590198122, | |
| "grad_norm": 9.746484756469727, | |
| "learning_rate": 7.427111111111112e-06, | |
| "loss": 0.9055, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 1.0818561001042752, | |
| "grad_norm": 8.56600570678711, | |
| "learning_rate": 7.422666666666667e-06, | |
| "loss": 0.9878, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.0818561001042752, | |
| "eval/acc": 41.86046600341797, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.0818561001042752, | |
| "eval_loss": 2.6222879886627197, | |
| "eval_runtime": 0.5525, | |
| "eval_samples_per_second": 77.832, | |
| "eval_steps_per_second": 1.81, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.0831595411887383, | |
| "grad_norm": 9.583529472351074, | |
| "learning_rate": 7.418222222222223e-06, | |
| "loss": 0.88, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 1.0844629822732013, | |
| "grad_norm": 8.403987884521484, | |
| "learning_rate": 7.413777777777779e-06, | |
| "loss": 0.9225, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 1.0857664233576643, | |
| "grad_norm": 8.449149131774902, | |
| "learning_rate": 7.409333333333334e-06, | |
| "loss": 0.8888, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 1.0870698644421273, | |
| "grad_norm": 9.25545883178711, | |
| "learning_rate": 7.4048888888888895e-06, | |
| "loss": 0.9197, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 1.0883733055265903, | |
| "grad_norm": 9.97526741027832, | |
| "learning_rate": 7.400444444444444e-06, | |
| "loss": 0.9448, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.0896767466110533, | |
| "grad_norm": 8.82689380645752, | |
| "learning_rate": 7.396000000000001e-06, | |
| "loss": 0.9921, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 1.090980187695516, | |
| "grad_norm": 9.498625755310059, | |
| "learning_rate": 7.3915555555555565e-06, | |
| "loss": 0.9156, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 1.092283628779979, | |
| "grad_norm": 9.056727409362793, | |
| "learning_rate": 7.387111111111111e-06, | |
| "loss": 0.9225, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 1.093587069864442, | |
| "grad_norm": 10.002091407775879, | |
| "learning_rate": 7.382666666666667e-06, | |
| "loss": 0.8715, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 1.094890510948905, | |
| "grad_norm": 10.686137199401855, | |
| "learning_rate": 7.378222222222224e-06, | |
| "loss": 0.9742, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.094890510948905, | |
| "eval/acc": 41.86046600341797, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.094890510948905, | |
| "eval_loss": 2.6428284645080566, | |
| "eval_runtime": 0.5508, | |
| "eval_samples_per_second": 78.075, | |
| "eval_steps_per_second": 1.816, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.0961939520333681, | |
| "grad_norm": 9.688252449035645, | |
| "learning_rate": 7.3737777777777785e-06, | |
| "loss": 0.8819, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 1.0974973931178311, | |
| "grad_norm": 8.46037769317627, | |
| "learning_rate": 7.369333333333334e-06, | |
| "loss": 0.8957, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 1.0988008342022941, | |
| "grad_norm": 12.241666793823242, | |
| "learning_rate": 7.364888888888889e-06, | |
| "loss": 0.9476, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 1.1001042752867571, | |
| "grad_norm": 9.35562801361084, | |
| "learning_rate": 7.360444444444445e-06, | |
| "loss": 0.8638, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 1.10140771637122, | |
| "grad_norm": 9.53718090057373, | |
| "learning_rate": 7.356000000000001e-06, | |
| "loss": 0.8484, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.102711157455683, | |
| "grad_norm": 8.596136093139648, | |
| "learning_rate": 7.351555555555556e-06, | |
| "loss": 0.8997, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 1.104014598540146, | |
| "grad_norm": 8.995074272155762, | |
| "learning_rate": 7.347111111111112e-06, | |
| "loss": 0.9136, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 1.105318039624609, | |
| "grad_norm": 9.429573059082031, | |
| "learning_rate": 7.342666666666667e-06, | |
| "loss": 0.8752, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 1.106621480709072, | |
| "grad_norm": 8.272865295410156, | |
| "learning_rate": 7.338222222222223e-06, | |
| "loss": 0.9411, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 1.107924921793535, | |
| "grad_norm": 9.662363052368164, | |
| "learning_rate": 7.333777777777779e-06, | |
| "loss": 0.9048, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.107924921793535, | |
| "eval/acc": 41.86046600341797, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.107924921793535, | |
| "eval_loss": 2.7484524250030518, | |
| "eval_runtime": 0.5518, | |
| "eval_samples_per_second": 77.922, | |
| "eval_steps_per_second": 1.812, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.109228362877998, | |
| "grad_norm": 10.379301071166992, | |
| "learning_rate": 7.329333333333334e-06, | |
| "loss": 0.9296, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 1.110531803962461, | |
| "grad_norm": 8.337886810302734, | |
| "learning_rate": 7.324888888888889e-06, | |
| "loss": 0.9018, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 1.1118352450469238, | |
| "grad_norm": 9.126009941101074, | |
| "learning_rate": 7.320444444444446e-06, | |
| "loss": 0.9447, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 1.1131386861313868, | |
| "grad_norm": 9.507220268249512, | |
| "learning_rate": 7.316000000000001e-06, | |
| "loss": 0.922, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 1.1144421272158498, | |
| "grad_norm": 9.640961647033691, | |
| "learning_rate": 7.3115555555555565e-06, | |
| "loss": 0.9118, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.1157455683003128, | |
| "grad_norm": 9.205474853515625, | |
| "learning_rate": 7.307111111111111e-06, | |
| "loss": 0.888, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 1.1170490093847758, | |
| "grad_norm": 8.839990615844727, | |
| "learning_rate": 7.302666666666667e-06, | |
| "loss": 0.9424, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 1.1183524504692388, | |
| "grad_norm": 9.424860000610352, | |
| "learning_rate": 7.2982222222222235e-06, | |
| "loss": 0.9087, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 1.1196558915537018, | |
| "grad_norm": 10.8524169921875, | |
| "learning_rate": 7.293777777777778e-06, | |
| "loss": 0.8574, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 1.1209593326381648, | |
| "grad_norm": 9.900219917297363, | |
| "learning_rate": 7.289333333333334e-06, | |
| "loss": 0.9162, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.1209593326381648, | |
| "eval/acc": 39.53488540649414, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.1209593326381648, | |
| "eval_loss": 2.7909610271453857, | |
| "eval_runtime": 0.5532, | |
| "eval_samples_per_second": 77.735, | |
| "eval_steps_per_second": 1.808, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.1222627737226278, | |
| "grad_norm": 8.885669708251953, | |
| "learning_rate": 7.284888888888889e-06, | |
| "loss": 0.8483, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 1.1235662148070906, | |
| "grad_norm": 10.051748275756836, | |
| "learning_rate": 7.2804444444444455e-06, | |
| "loss": 0.9287, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 1.1248696558915536, | |
| "grad_norm": 10.11687183380127, | |
| "learning_rate": 7.276000000000001e-06, | |
| "loss": 0.9642, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 1.1261730969760166, | |
| "grad_norm": 8.279516220092773, | |
| "learning_rate": 7.271555555555556e-06, | |
| "loss": 0.8276, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 1.1274765380604796, | |
| "grad_norm": 9.638724327087402, | |
| "learning_rate": 7.267111111111112e-06, | |
| "loss": 0.9309, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.1287799791449427, | |
| "grad_norm": 9.2564115524292, | |
| "learning_rate": 7.2626666666666665e-06, | |
| "loss": 0.8981, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 1.1300834202294057, | |
| "grad_norm": 11.3716402053833, | |
| "learning_rate": 7.258222222222223e-06, | |
| "loss": 0.9597, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 1.1313868613138687, | |
| "grad_norm": 8.568989753723145, | |
| "learning_rate": 7.253777777777779e-06, | |
| "loss": 0.8171, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 1.1326903023983317, | |
| "grad_norm": 9.600732803344727, | |
| "learning_rate": 7.249333333333334e-06, | |
| "loss": 0.9596, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 1.1339937434827947, | |
| "grad_norm": 8.98791790008545, | |
| "learning_rate": 7.244888888888889e-06, | |
| "loss": 0.8949, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.1339937434827947, | |
| "eval/acc": 37.20930099487305, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.1339937434827947, | |
| "eval_loss": 2.729647397994995, | |
| "eval_runtime": 0.5521, | |
| "eval_samples_per_second": 77.885, | |
| "eval_steps_per_second": 1.811, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.1352971845672575, | |
| "grad_norm": 7.747639179229736, | |
| "learning_rate": 7.240444444444446e-06, | |
| "loss": 0.9241, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 1.1366006256517205, | |
| "grad_norm": 8.257080078125, | |
| "learning_rate": 7.236000000000001e-06, | |
| "loss": 0.9498, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 1.1379040667361835, | |
| "grad_norm": 8.281147003173828, | |
| "learning_rate": 7.231555555555556e-06, | |
| "loss": 0.9207, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 1.1392075078206465, | |
| "grad_norm": 10.297804832458496, | |
| "learning_rate": 7.227111111111111e-06, | |
| "loss": 0.9211, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 1.1405109489051095, | |
| "grad_norm": 8.747051239013672, | |
| "learning_rate": 7.222666666666667e-06, | |
| "loss": 0.8808, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.1418143899895725, | |
| "grad_norm": 8.297418594360352, | |
| "learning_rate": 7.2182222222222235e-06, | |
| "loss": 0.9545, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 1.1431178310740355, | |
| "grad_norm": 8.677216529846191, | |
| "learning_rate": 7.213777777777778e-06, | |
| "loss": 0.8664, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 1.1444212721584983, | |
| "grad_norm": 9.342988014221191, | |
| "learning_rate": 7.209333333333334e-06, | |
| "loss": 0.8342, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 1.1457247132429613, | |
| "grad_norm": 9.079336166381836, | |
| "learning_rate": 7.204888888888889e-06, | |
| "loss": 0.9371, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 1.1470281543274243, | |
| "grad_norm": 9.262984275817871, | |
| "learning_rate": 7.200444444444445e-06, | |
| "loss": 0.8873, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.1470281543274243, | |
| "eval/acc": 41.86046600341797, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.1470281543274243, | |
| "eval_loss": 2.822016477584839, | |
| "eval_runtime": 0.5525, | |
| "eval_samples_per_second": 77.829, | |
| "eval_steps_per_second": 1.81, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.1483315954118873, | |
| "grad_norm": 10.144986152648926, | |
| "learning_rate": 7.196000000000001e-06, | |
| "loss": 0.938, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 1.1496350364963503, | |
| "grad_norm": 8.611014366149902, | |
| "learning_rate": 7.191555555555556e-06, | |
| "loss": 0.9392, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 1.1509384775808134, | |
| "grad_norm": 6.575359344482422, | |
| "learning_rate": 7.187111111111112e-06, | |
| "loss": 0.8865, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 1.1522419186652764, | |
| "grad_norm": 8.885351181030273, | |
| "learning_rate": 7.182666666666668e-06, | |
| "loss": 0.908, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 1.1535453597497394, | |
| "grad_norm": 10.43062973022461, | |
| "learning_rate": 7.178222222222223e-06, | |
| "loss": 0.9037, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.1548488008342024, | |
| "grad_norm": 9.297091484069824, | |
| "learning_rate": 7.173777777777779e-06, | |
| "loss": 0.9363, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 1.1561522419186652, | |
| "grad_norm": 9.116903305053711, | |
| "learning_rate": 7.1693333333333335e-06, | |
| "loss": 0.9396, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 1.1574556830031282, | |
| "grad_norm": 9.323028564453125, | |
| "learning_rate": 7.164888888888889e-06, | |
| "loss": 0.8927, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 1.1587591240875912, | |
| "grad_norm": 9.512086868286133, | |
| "learning_rate": 7.160444444444446e-06, | |
| "loss": 0.9617, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 1.1600625651720542, | |
| "grad_norm": 8.222494125366211, | |
| "learning_rate": 7.156000000000001e-06, | |
| "loss": 0.8694, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.1600625651720542, | |
| "eval/acc": 41.86046600341797, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.1600625651720542, | |
| "eval_loss": 2.7194950580596924, | |
| "eval_runtime": 0.5505, | |
| "eval_samples_per_second": 78.114, | |
| "eval_steps_per_second": 1.817, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.1613660062565172, | |
| "grad_norm": 10.078699111938477, | |
| "learning_rate": 7.151555555555556e-06, | |
| "loss": 0.9702, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 1.1626694473409802, | |
| "grad_norm": 7.739863395690918, | |
| "learning_rate": 7.147111111111111e-06, | |
| "loss": 0.9196, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 1.1639728884254432, | |
| "grad_norm": 9.16687297821045, | |
| "learning_rate": 7.142666666666668e-06, | |
| "loss": 0.8814, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 1.1652763295099062, | |
| "grad_norm": 9.084263801574707, | |
| "learning_rate": 7.138222222222223e-06, | |
| "loss": 0.8838, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 1.1665797705943692, | |
| "grad_norm": 9.483837127685547, | |
| "learning_rate": 7.133777777777778e-06, | |
| "loss": 0.8867, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.167883211678832, | |
| "grad_norm": 8.17795181274414, | |
| "learning_rate": 7.129333333333334e-06, | |
| "loss": 0.8548, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 1.169186652763295, | |
| "grad_norm": 10.508554458618164, | |
| "learning_rate": 7.124888888888889e-06, | |
| "loss": 0.894, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 1.170490093847758, | |
| "grad_norm": 9.039753913879395, | |
| "learning_rate": 7.120444444444445e-06, | |
| "loss": 0.9424, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 1.171793534932221, | |
| "grad_norm": 10.616847038269043, | |
| "learning_rate": 7.116000000000001e-06, | |
| "loss": 0.9333, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 1.173096976016684, | |
| "grad_norm": 11.284839630126953, | |
| "learning_rate": 7.111555555555556e-06, | |
| "loss": 0.879, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.173096976016684, | |
| "eval/acc": 39.53488540649414, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.173096976016684, | |
| "eval_loss": 2.869804620742798, | |
| "eval_runtime": 0.552, | |
| "eval_samples_per_second": 77.901, | |
| "eval_steps_per_second": 1.812, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.174400417101147, | |
| "grad_norm": 8.72288703918457, | |
| "learning_rate": 7.1071111111111115e-06, | |
| "loss": 0.8851, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 1.17570385818561, | |
| "grad_norm": 10.025834083557129, | |
| "learning_rate": 7.102666666666668e-06, | |
| "loss": 0.9131, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 1.177007299270073, | |
| "grad_norm": 9.2742338180542, | |
| "learning_rate": 7.098222222222223e-06, | |
| "loss": 0.9457, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 1.178310740354536, | |
| "grad_norm": 8.592848777770996, | |
| "learning_rate": 7.093777777777779e-06, | |
| "loss": 0.8704, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 1.1796141814389989, | |
| "grad_norm": 9.556818962097168, | |
| "learning_rate": 7.0893333333333334e-06, | |
| "loss": 0.9106, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.1809176225234619, | |
| "grad_norm": 9.446195602416992, | |
| "learning_rate": 7.084888888888889e-06, | |
| "loss": 0.8518, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 1.182221063607925, | |
| "grad_norm": 8.797647476196289, | |
| "learning_rate": 7.080444444444446e-06, | |
| "loss": 0.927, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 1.183524504692388, | |
| "grad_norm": 7.664761066436768, | |
| "learning_rate": 7.0760000000000005e-06, | |
| "loss": 0.8785, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 1.184827945776851, | |
| "grad_norm": 8.273712158203125, | |
| "learning_rate": 7.071555555555556e-06, | |
| "loss": 0.9844, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 1.186131386861314, | |
| "grad_norm": 8.067178726196289, | |
| "learning_rate": 7.067111111111111e-06, | |
| "loss": 0.9642, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.186131386861314, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.186131386861314, | |
| "eval_loss": 2.6579670906066895, | |
| "eval_runtime": 0.5546, | |
| "eval_samples_per_second": 77.529, | |
| "eval_steps_per_second": 1.803, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.187434827945777, | |
| "grad_norm": 9.999368667602539, | |
| "learning_rate": 7.062666666666668e-06, | |
| "loss": 0.9165, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 1.1887382690302397, | |
| "grad_norm": 11.097794532775879, | |
| "learning_rate": 7.058222222222223e-06, | |
| "loss": 0.8975, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 1.1900417101147027, | |
| "grad_norm": 9.338957786560059, | |
| "learning_rate": 7.053777777777778e-06, | |
| "loss": 0.9487, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 1.1913451511991657, | |
| "grad_norm": 8.180441856384277, | |
| "learning_rate": 7.049333333333334e-06, | |
| "loss": 0.8133, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 1.1926485922836287, | |
| "grad_norm": 9.26723575592041, | |
| "learning_rate": 7.04488888888889e-06, | |
| "loss": 0.8796, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.1939520333680917, | |
| "grad_norm": 10.798274993896484, | |
| "learning_rate": 7.040444444444445e-06, | |
| "loss": 0.9258, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 1.1952554744525548, | |
| "grad_norm": 9.296907424926758, | |
| "learning_rate": 7.036000000000001e-06, | |
| "loss": 0.9351, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 1.1965589155370178, | |
| "grad_norm": 9.426993370056152, | |
| "learning_rate": 7.031555555555556e-06, | |
| "loss": 0.9557, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 1.1978623566214808, | |
| "grad_norm": 8.745172500610352, | |
| "learning_rate": 7.0271111111111114e-06, | |
| "loss": 0.9959, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 1.1991657977059438, | |
| "grad_norm": 8.444724082946777, | |
| "learning_rate": 7.022666666666668e-06, | |
| "loss": 0.8816, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.1991657977059438, | |
| "eval/acc": 39.53488540649414, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.1991657977059438, | |
| "eval_loss": 2.6600775718688965, | |
| "eval_runtime": 0.5567, | |
| "eval_samples_per_second": 77.247, | |
| "eval_steps_per_second": 1.796, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.2004692387904066, | |
| "grad_norm": 9.101587295532227, | |
| "learning_rate": 7.018222222222223e-06, | |
| "loss": 0.9375, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 1.2017726798748696, | |
| "grad_norm": 9.597663879394531, | |
| "learning_rate": 7.0137777777777785e-06, | |
| "loss": 0.9135, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 1.2030761209593326, | |
| "grad_norm": 9.677689552307129, | |
| "learning_rate": 7.009333333333333e-06, | |
| "loss": 0.9415, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 1.2043795620437956, | |
| "grad_norm": 8.487961769104004, | |
| "learning_rate": 7.00488888888889e-06, | |
| "loss": 0.9985, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 1.2056830031282586, | |
| "grad_norm": 8.874425888061523, | |
| "learning_rate": 7.000444444444446e-06, | |
| "loss": 0.8465, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.2069864442127216, | |
| "grad_norm": 10.664003372192383, | |
| "learning_rate": 6.9960000000000004e-06, | |
| "loss": 1.0205, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 1.2082898852971846, | |
| "grad_norm": 9.072754859924316, | |
| "learning_rate": 6.991555555555556e-06, | |
| "loss": 0.9317, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 1.2095933263816476, | |
| "grad_norm": 10.553153038024902, | |
| "learning_rate": 6.987111111111111e-06, | |
| "loss": 0.9212, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 1.2108967674661106, | |
| "grad_norm": 7.488556385040283, | |
| "learning_rate": 6.9826666666666675e-06, | |
| "loss": 0.8059, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 1.2122002085505734, | |
| "grad_norm": 10.70551586151123, | |
| "learning_rate": 6.978222222222223e-06, | |
| "loss": 0.949, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.2122002085505734, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.2122002085505734, | |
| "eval_loss": 2.724426031112671, | |
| "eval_runtime": 0.5754, | |
| "eval_samples_per_second": 74.733, | |
| "eval_steps_per_second": 1.738, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.2135036496350364, | |
| "grad_norm": 8.544173240661621, | |
| "learning_rate": 6.973777777777778e-06, | |
| "loss": 0.8677, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 1.2148070907194994, | |
| "grad_norm": 9.747323036193848, | |
| "learning_rate": 6.969333333333334e-06, | |
| "loss": 0.9325, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 1.2161105318039624, | |
| "grad_norm": 10.305643081665039, | |
| "learning_rate": 6.96488888888889e-06, | |
| "loss": 0.8978, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 1.2174139728884255, | |
| "grad_norm": 9.521451950073242, | |
| "learning_rate": 6.960444444444445e-06, | |
| "loss": 0.9385, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 1.2187174139728885, | |
| "grad_norm": 9.554557800292969, | |
| "learning_rate": 6.956000000000001e-06, | |
| "loss": 0.806, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.2200208550573515, | |
| "grad_norm": 9.167433738708496, | |
| "learning_rate": 6.951555555555556e-06, | |
| "loss": 0.8892, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 1.2213242961418145, | |
| "grad_norm": 9.763519287109375, | |
| "learning_rate": 6.947111111111111e-06, | |
| "loss": 0.956, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 1.2226277372262775, | |
| "grad_norm": 10.049515724182129, | |
| "learning_rate": 6.942666666666668e-06, | |
| "loss": 1.0004, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 1.2239311783107403, | |
| "grad_norm": 9.015792846679688, | |
| "learning_rate": 6.938222222222223e-06, | |
| "loss": 0.8952, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 1.2252346193952033, | |
| "grad_norm": 8.805821418762207, | |
| "learning_rate": 6.9337777777777784e-06, | |
| "loss": 0.9088, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.2252346193952033, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.2252346193952033, | |
| "eval_loss": 2.7320239543914795, | |
| "eval_runtime": 0.5515, | |
| "eval_samples_per_second": 77.971, | |
| "eval_steps_per_second": 1.813, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.2265380604796663, | |
| "grad_norm": 9.7424955368042, | |
| "learning_rate": 6.929333333333333e-06, | |
| "loss": 0.8769, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 1.2278415015641293, | |
| "grad_norm": 9.860488891601562, | |
| "learning_rate": 6.92488888888889e-06, | |
| "loss": 0.9535, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 1.2291449426485923, | |
| "grad_norm": 9.606118202209473, | |
| "learning_rate": 6.9204444444444455e-06, | |
| "loss": 0.8852, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 1.2304483837330553, | |
| "grad_norm": 8.621418952941895, | |
| "learning_rate": 6.916e-06, | |
| "loss": 0.8938, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 1.2317518248175183, | |
| "grad_norm": 9.280407905578613, | |
| "learning_rate": 6.911555555555556e-06, | |
| "loss": 0.8569, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.2330552659019811, | |
| "grad_norm": 9.812846183776855, | |
| "learning_rate": 6.907111111111113e-06, | |
| "loss": 0.9469, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 1.2343587069864441, | |
| "grad_norm": 10.344027519226074, | |
| "learning_rate": 6.9026666666666674e-06, | |
| "loss": 0.9226, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 1.2356621480709071, | |
| "grad_norm": 9.68948745727539, | |
| "learning_rate": 6.898222222222223e-06, | |
| "loss": 0.9407, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 1.2369655891553701, | |
| "grad_norm": 8.801238059997559, | |
| "learning_rate": 6.893777777777778e-06, | |
| "loss": 0.918, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 1.2382690302398331, | |
| "grad_norm": 10.129354476928711, | |
| "learning_rate": 6.889333333333334e-06, | |
| "loss": 0.8354, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.2382690302398331, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.2382690302398331, | |
| "eval_loss": 2.6192688941955566, | |
| "eval_runtime": 0.5967, | |
| "eval_samples_per_second": 72.062, | |
| "eval_steps_per_second": 1.676, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.2395724713242962, | |
| "grad_norm": 8.593018531799316, | |
| "learning_rate": 6.88488888888889e-06, | |
| "loss": 0.8486, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 1.2408759124087592, | |
| "grad_norm": 10.093417167663574, | |
| "learning_rate": 6.880444444444445e-06, | |
| "loss": 0.8747, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 1.2421793534932222, | |
| "grad_norm": 10.845832824707031, | |
| "learning_rate": 6.876000000000001e-06, | |
| "loss": 0.9131, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 1.2434827945776852, | |
| "grad_norm": 8.653305053710938, | |
| "learning_rate": 6.871555555555556e-06, | |
| "loss": 0.8911, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 1.244786235662148, | |
| "grad_norm": 9.522208213806152, | |
| "learning_rate": 6.867111111111112e-06, | |
| "loss": 0.9512, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.246089676746611, | |
| "grad_norm": 10.715890884399414, | |
| "learning_rate": 6.862666666666668e-06, | |
| "loss": 0.7942, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 1.247393117831074, | |
| "grad_norm": 9.762306213378906, | |
| "learning_rate": 6.858222222222223e-06, | |
| "loss": 0.8735, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 1.248696558915537, | |
| "grad_norm": 8.764912605285645, | |
| "learning_rate": 6.853777777777778e-06, | |
| "loss": 0.9222, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 8.265697479248047, | |
| "learning_rate": 6.849333333333333e-06, | |
| "loss": 0.9455, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 1.251303441084463, | |
| "grad_norm": 8.233742713928223, | |
| "learning_rate": 6.84488888888889e-06, | |
| "loss": 0.8745, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.251303441084463, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.251303441084463, | |
| "eval_loss": 2.7820498943328857, | |
| "eval_runtime": 0.5501, | |
| "eval_samples_per_second": 78.165, | |
| "eval_steps_per_second": 1.818, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.252606882168926, | |
| "grad_norm": 10.830070495605469, | |
| "learning_rate": 6.8404444444444454e-06, | |
| "loss": 0.9719, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 1.253910323253389, | |
| "grad_norm": 11.779263496398926, | |
| "learning_rate": 6.836e-06, | |
| "loss": 0.9731, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 1.255213764337852, | |
| "grad_norm": 7.983153820037842, | |
| "learning_rate": 6.831555555555556e-06, | |
| "loss": 0.8534, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 1.2565172054223148, | |
| "grad_norm": 11.44545841217041, | |
| "learning_rate": 6.8271111111111125e-06, | |
| "loss": 0.9109, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 1.2578206465067778, | |
| "grad_norm": 8.1010103225708, | |
| "learning_rate": 6.822666666666667e-06, | |
| "loss": 0.919, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.2591240875912408, | |
| "grad_norm": 8.409565925598145, | |
| "learning_rate": 6.818222222222223e-06, | |
| "loss": 0.9451, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 1.2604275286757038, | |
| "grad_norm": 9.691168785095215, | |
| "learning_rate": 6.813777777777778e-06, | |
| "loss": 0.892, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 1.2617309697601669, | |
| "grad_norm": 10.412053108215332, | |
| "learning_rate": 6.809333333333334e-06, | |
| "loss": 0.8998, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 1.2630344108446299, | |
| "grad_norm": 10.049180030822754, | |
| "learning_rate": 6.80488888888889e-06, | |
| "loss": 0.8622, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 1.2643378519290929, | |
| "grad_norm": 9.016233444213867, | |
| "learning_rate": 6.800444444444445e-06, | |
| "loss": 0.9278, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.2643378519290929, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.2643378519290929, | |
| "eval_loss": 2.7881505489349365, | |
| "eval_runtime": 0.5523, | |
| "eval_samples_per_second": 77.86, | |
| "eval_steps_per_second": 1.811, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.2656412930135557, | |
| "grad_norm": 9.524649620056152, | |
| "learning_rate": 6.796000000000001e-06, | |
| "loss": 0.8966, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 1.2669447340980189, | |
| "grad_norm": 10.176855087280273, | |
| "learning_rate": 6.7915555555555555e-06, | |
| "loss": 0.9484, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 1.2682481751824817, | |
| "grad_norm": 8.774044036865234, | |
| "learning_rate": 6.787111111111112e-06, | |
| "loss": 0.8667, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 1.2695516162669447, | |
| "grad_norm": 8.964950561523438, | |
| "learning_rate": 6.782666666666668e-06, | |
| "loss": 0.8335, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 1.2708550573514077, | |
| "grad_norm": 9.385854721069336, | |
| "learning_rate": 6.778222222222223e-06, | |
| "loss": 0.8379, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.2721584984358707, | |
| "grad_norm": 10.424607276916504, | |
| "learning_rate": 6.773777777777778e-06, | |
| "loss": 0.9662, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 1.2734619395203337, | |
| "grad_norm": 7.88417387008667, | |
| "learning_rate": 6.769333333333335e-06, | |
| "loss": 0.9382, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 1.2747653806047967, | |
| "grad_norm": 10.213610649108887, | |
| "learning_rate": 6.76488888888889e-06, | |
| "loss": 0.8691, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 1.2760688216892597, | |
| "grad_norm": 9.095560073852539, | |
| "learning_rate": 6.760444444444445e-06, | |
| "loss": 0.9166, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 1.2773722627737225, | |
| "grad_norm": 8.787574768066406, | |
| "learning_rate": 6.756e-06, | |
| "loss": 0.8944, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.2773722627737225, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.2773722627737225, | |
| "eval_loss": 2.719513416290283, | |
| "eval_runtime": 0.5524, | |
| "eval_samples_per_second": 77.849, | |
| "eval_steps_per_second": 1.81, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.2786757038581857, | |
| "grad_norm": 8.333978652954102, | |
| "learning_rate": 6.751555555555556e-06, | |
| "loss": 0.8888, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 1.2799791449426485, | |
| "grad_norm": 10.098021507263184, | |
| "learning_rate": 6.7471111111111124e-06, | |
| "loss": 0.9634, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 1.2812825860271115, | |
| "grad_norm": 9.07473373413086, | |
| "learning_rate": 6.742666666666667e-06, | |
| "loss": 0.9637, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 1.2825860271115745, | |
| "grad_norm": 7.207655429840088, | |
| "learning_rate": 6.738222222222223e-06, | |
| "loss": 0.9221, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 1.2838894681960376, | |
| "grad_norm": 8.756492614746094, | |
| "learning_rate": 6.733777777777778e-06, | |
| "loss": 0.8939, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.2851929092805006, | |
| "grad_norm": 8.18359375, | |
| "learning_rate": 6.729333333333334e-06, | |
| "loss": 0.9374, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 1.2864963503649636, | |
| "grad_norm": 8.941536903381348, | |
| "learning_rate": 6.72488888888889e-06, | |
| "loss": 0.9355, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 1.2877997914494266, | |
| "grad_norm": 10.77589225769043, | |
| "learning_rate": 6.720444444444445e-06, | |
| "loss": 0.9452, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 1.2891032325338894, | |
| "grad_norm": 9.430168151855469, | |
| "learning_rate": 6.716000000000001e-06, | |
| "loss": 0.9284, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 1.2904066736183524, | |
| "grad_norm": 8.396258354187012, | |
| "learning_rate": 6.7115555555555554e-06, | |
| "loss": 0.9645, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.2904066736183524, | |
| "eval/acc": 41.86046600341797, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.2904066736183524, | |
| "eval_loss": 2.688626527786255, | |
| "eval_runtime": 0.5512, | |
| "eval_samples_per_second": 78.016, | |
| "eval_steps_per_second": 1.814, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.2917101147028154, | |
| "grad_norm": 9.193182945251465, | |
| "learning_rate": 6.707111111111112e-06, | |
| "loss": 0.9774, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 1.2930135557872784, | |
| "grad_norm": 8.28522777557373, | |
| "learning_rate": 6.702666666666668e-06, | |
| "loss": 0.9131, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 1.2943169968717414, | |
| "grad_norm": 8.093572616577148, | |
| "learning_rate": 6.6982222222222225e-06, | |
| "loss": 0.8372, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 1.2956204379562044, | |
| "grad_norm": 9.774358749389648, | |
| "learning_rate": 6.693777777777778e-06, | |
| "loss": 0.9098, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 1.2969238790406674, | |
| "grad_norm": 9.629504203796387, | |
| "learning_rate": 6.689333333333335e-06, | |
| "loss": 0.8035, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.2982273201251304, | |
| "grad_norm": 8.22866439819336, | |
| "learning_rate": 6.68488888888889e-06, | |
| "loss": 0.8679, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 1.2995307612095934, | |
| "grad_norm": 9.282711029052734, | |
| "learning_rate": 6.680444444444445e-06, | |
| "loss": 0.8877, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 1.3008342022940562, | |
| "grad_norm": 9.920709609985352, | |
| "learning_rate": 6.676e-06, | |
| "loss": 0.8942, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 1.3021376433785192, | |
| "grad_norm": 8.841803550720215, | |
| "learning_rate": 6.671555555555556e-06, | |
| "loss": 0.8912, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 1.3034410844629822, | |
| "grad_norm": 7.006541728973389, | |
| "learning_rate": 6.667111111111112e-06, | |
| "loss": 0.8737, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.3034410844629822, | |
| "eval/acc": 41.86046600341797, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.3034410844629822, | |
| "eval_loss": 2.7103235721588135, | |
| "eval_runtime": 0.5532, | |
| "eval_samples_per_second": 77.73, | |
| "eval_steps_per_second": 1.808, | |
| "step": 10000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 25000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |